MaCh3  2.4.2
Reference Guide
Classes | Functions
gpuSplineUtils.cuh File Reference

MaCh3 event-by-event cross-section spline code. More...

#include "Manager/gpuUtils.cuh"
#include "Splines/SplineCommon.h"
Include dependency graph for gpuSplineUtils.cuh:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

class  SMonolithGPU
 Class responsible for calculating spline weight on GPU. More...
 

Functions

__host__ void SynchroniseSplines ()
 Make sure all Cuda threads finished execution. More...
 
__global__ void EvalOnGPU_Splines (const short int *__restrict__ gpu_paramNo_arr, const unsigned int *__restrict__ gpu_nKnots_arr, const float *__restrict__ gpu_coeff_many, float *__restrict__ gpu_weights, const cudaTextureObject_t __restrict__ text_coeff_x)
 Evaluate the spline on the GPU Using one {y,b,c,d} array and one {x} array Should be most efficient at cache hitting and memory coalescence But using spline segments rather than the parameter value: avoids doing binary search on GPU. More...
 
__global__ void EvalOnGPU_TF1 (const float *__restrict__ gpu_coeffs_tf1, const short int *__restrict__ gpu_paramNo_arr_tf1, float *__restrict__ gpu_weights_tf1)
 Evaluate the TF1 on the GPU Using 5th order polynomial. More...
 
__global__ void EvalOnGPU_TotWeight (const float *__restrict__ gpu_weights, const float *__restrict__ gpu_weights_tf1, float *__restrict__ gpu_total_weights, const cudaTextureObject_t __restrict__ text_nParamPerEvent, const cudaTextureObject_t __restrict__ text_nParamPerEvent_TF1)
 KS: Evaluate the total spline event weight on the GPU, as in most cases GPU is faster, even more this significant reduce memory transfer from GPU to CPU. More...
 

Detailed Description

MaCh3 event-by-event cross-section spline code.

Author
Richard Calland
Asher Kaboth
Clarence Wret
Kamil Skwarczynski

Contains code to run on CUDA GPUs. Essentially we load up stripped TSpline3 objects to the GPU and do the equivalent of TSpline3->Eval(double) for all events Now also supports TF1 evals Called from Samples/samplePDFND.cpp -> Splines/SplineMonolith.cpp -> Splines/gpuSplineUtils.cu

Definition in file gpuSplineUtils.cuh.

Function Documentation

◆ EvalOnGPU_Splines()

__global__ void EvalOnGPU_Splines ( const short int *__restrict__  gpu_paramNo_arr,
const unsigned int *__restrict__  gpu_nKnots_arr,
const float *__restrict__  gpu_coeff_many,
float *__restrict__  gpu_weights,
const cudaTextureObject_t __restrict__  text_coeff_x 
)

Evaluate the spline on the GPU Using one {y,b,c,d} array and one {x} array Should be most efficient at cache hitting and memory coalescence But using spline segments rather than the parameter value: avoids doing binary search on GPU.

Parameters
gpu_paramNo_arrhas length = spln_counter (keeps track of which parameter we're using on this thread)
gpu_nKnots_arrhas length = spln_counter (keeps track where current spline starts)
gpu_coeff_manyhas length = nKnots * 4, stores all coefficients for all splines and knots
gpu_weightshas length = spln_counter * spline_size
text_coeff_xarray storing info about X coeff, uses texture memory. Has length = n_params * spline_size,

Definition at line 347 of file gpuSplineUtils.cu.

352  {
353 //*********************************************************
354  // points per spline is the offset to skip in the index to move between splines
355  const unsigned int splineNum = (blockIdx.x * blockDim.x + threadIdx.x);
356 
357  // this is the stopping condition!
358  if (splineNum < d_n_splines) {
359  // This is the segment we want for this parameter variation
360  // for this particular splineNum; 0 = MACCQE, 1 = pFC, 2 = EBC, etc
361 
362  //CW: Which Parameter we are accessing
363  const short int Param = gpu_paramNo_arr[splineNum];
364 
365  //CW: Avoids doing costly binary search on GPU
366  const short int segment = segment_gpu[Param];
367 
368  //KS: Segment for coeff_x is simply parameter*max knots + segment as each parmeters has the same spacing
369  const short int segment_X = Param*d_spline_size+segment;
370 
371  //KS: Find knot position in out monolitical structure
372  const unsigned int CurrentKnotPos = gpu_nKnots_arr[splineNum]*_nCoeff_+segment*_nCoeff_;
373 
374  // We've read the segment straight from CPU and is saved in segment_gpu
375  // polynomial parameters from the monolithic splineMonolith
376  const float fY = gpu_coeff_many[CurrentKnotPos];
377  const float fB = gpu_coeff_many[CurrentKnotPos + 1];
378  const float fC = gpu_coeff_many[CurrentKnotPos + 2];
379  const float fD = gpu_coeff_many[CurrentKnotPos + 3];
380  // The is the variation itself (needed to evaluate variation - stored spline point = dx)
381  const float dx = val_gpu[Param] - tex1Dfetch<float>(text_coeff_x, segment_X);
382 
383  //CW: Wooow, let's use some fancy intrinsics and pull down the processing time by <1% from normal multiplication! HURRAY
384  gpu_weights[splineNum] = fmaf(dx, fmaf(dx, fmaf(dx, fD, fC), fB), fY);
385  // Or for the more "easy to read" version:
386  //gpu_weights[splineNum] = (fY+dx*(fB+dx*(fC+dx*fD)));
387 
388  //#ifdef DEBUG
389  //printf("splineNum = %i/%i, paramNo = %i, variation = %f, segment = %i, fX = %f, fX+1 = %f, dx = %f, d_n_splines = %i, d_spline_size = %i, weight = %f \n", splineNum, d_n_splines, gpu_paramNo_arr[splineNum], val_gpu[Param], segment, tex1Dfetch<float>(text_coeff_x, segment_X), tex1Dfetch<float>(text_coeff_x, segment_X+1), dx, d_n_splines, d_spline_size, gpu_weights[splineNum]);
390  //#endif
391  }
392 }
constexpr int _nCoeff_
KS: We store coefficients {y,b,c,d} in one array one by one, this is only to define it once rather th...
Definition: SplineCommon.h:13
__device__ __constant__ unsigned int d_n_splines
Number of splines living on GPU.
__device__ __constant__ short int segment_gpu[NSplines_GPU]
__device__ __constant__ float val_gpu[NSplines_GPU]
CW: Constant memory needs to be hard-coded on compile time. Could make this texture memory instead,...
__device__ __constant__ short int d_spline_size
Size of splines living on GPU.

◆ EvalOnGPU_TF1()

__global__ void EvalOnGPU_TF1 ( const float *__restrict__  gpu_coeffs_tf1,
const short int *__restrict__  gpu_paramNo_arr_tf1,
float *__restrict__  gpu_weights_tf1 
)

Evaluate the TF1 on the GPU Using 5th order polynomial.

Parameters
gpu_coeffs_tf1coefficients of TF1, has length = tf1 coeef counter
gpu_paramNo_arr_tf1has length = spln_counter (keeps track of which parameter we're using on this thread)
gpu_weights_tf1has length = spln_counter * spline_size

Definition at line 396 of file gpuSplineUtils.cu.

399  {
400 //*********************************************************
401  // points per spline is the offset to skip in the index to move between splines
402  const unsigned int tf1Num = (blockIdx.x * blockDim.x + threadIdx.x);
403 
404  if (tf1Num < d_n_TF1) {
405  // The is the variation itself (needed to evaluate variation - stored spline point = dx)
406  const float x = val_gpu[gpu_paramNo_arr_tf1[tf1Num]];
407 
408  // Read the coefficients
409  const unsigned int TF1_Index = tf1Num * _nTF1Coeff_;
410  const float a = gpu_coeffs_tf1[TF1_Index];
411  const float b = gpu_coeffs_tf1[TF1_Index+1];
412 
413  gpu_weights_tf1[tf1Num] = fmaf(a, x, b);
414 
415  // gpu_weights_tf1[tf1Num] = a*x + b;
416  //gpu_weights_tf1[tf1Num] = 1 + a*x + b*x*x + c*x*x*x + d*x*x*x*x + e*x*x*x*x*x;
417  }
418 }
constexpr int _nTF1Coeff_
KS: For TF1 we store at most 5 coefficients, we could make it more flexible but for now define it her...
Definition: SplineCommon.h:15
__device__ __constant__ unsigned int d_n_TF1
Number of tf1 living on GPU.

◆ EvalOnGPU_TotWeight()

__global__ void EvalOnGPU_TotWeight ( const float *__restrict__  gpu_weights,
const float *__restrict__  gpu_weights_tf1,
float *__restrict__  gpu_total_weights,
const cudaTextureObject_t __restrict__  text_nParamPerEvent,
const cudaTextureObject_t __restrict__  text_nParamPerEvent_TF1 
)

KS: Evaluate the total spline event weight on the GPU, as in most cases GPU is faster, even more this significant reduce memory transfer from GPU to CPU.

Parameters
gpu_weightsWeight for each spline object
gpu_weights_tf1Weight for each TF1 object
gpu_total_weightsTotal weight for each event
text_nParamPerEventmap keeping track how many parameters applies to each event, we keep two numbers here {number of splines per event, index where splines start for a given event}
text_nParamPerEvent_TF1map keeping track how many parameters applies to each event, we keep two numbers here {number of splines per event, index where splines start for a given event}

Definition at line 423 of file gpuSplineUtils.cu.

430  {
431 //*********************************************************
432  const unsigned int EventNum = (blockIdx.x * blockDim.x + threadIdx.x);
433 
434  if(EventNum < d_n_events) //stopping condition
435  {
436  float local_total_weight = 1.f;
437 
438  const unsigned int EventOffset = 2 * EventNum;
439 
440  for (unsigned int id = 0; id < tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset); ++id) {
441  local_total_weight *= gpu_weights[tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset+1) + id];
442  }
443 
444  for (unsigned int id = 0; id < tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset); ++id) {
445  local_total_weight *= gpu_weights_tf1[tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset+1) + id];
446  }
447  gpu_total_weights[EventNum] = local_total_weight;
448  }
449 }
__device__ __constant__ int d_n_events
Number of events living on GPU.

◆ SynchroniseSplines()

__host__ void SynchroniseSplines ( )

Make sure all Cuda threads finished execution.

Definition at line 73 of file gpuSplineUtils.cu.

73  {
74  cudaDeviceSynchronize();
75 }