MaCh3  2.5.0
Reference Guide
Functions
gpuSplineUtils.cu File Reference
#include "Splines/gpuSplineUtils.cuh"
Include dependency graph for gpuSplineUtils.cu:

Go to the source code of this file.

Functions

__host__ void SynchroniseSplines ()
 Make sure all Cuda threads finished execution. More...
 
__global__ void EvalOnGPU_Splines (const unsigned int gpu_n_splines, const short int gpu_spline_size, const short int *__restrict__ gpu_paramNo_arr, const unsigned int *__restrict__ gpu_nKnots_arr, const float *__restrict__ gpu_coeff_many, const float *__restrict__ gpu_par_val, const short int *__restrict__ gpu_spline_segment, float *__restrict__ gpu_weights, const cudaTextureObject_t __restrict__ text_coeff_x)
 Evaluate the spline on the GPU Using one {y,b,c,d} array and one {x} array Should be most efficient at cache hitting and memory coalescence But using spline segments rather than the parameter value: avoids doing binary search on GPU. More...
 
__global__ void EvalOnGPU_TF1 (const unsigned int gpu_n_TF1, const float *__restrict__ gpu_coeffs_tf1, const short int *__restrict__ gpu_paramNo_arr_tf1, const float *__restrict__ gpu_par_val, float *__restrict__ gpu_weights_tf1)
 Evaluate the TF1 on the GPU Using first order polynomial Polynomial form: w(x) = a0 + a1·x. More...
 
__global__ void EvalOnGPU_TotWeight (const int gpu_n_events, const float *__restrict__ gpu_weights, const float *__restrict__ gpu_weights_tf1, M3::float_t *__restrict__ gpu_total_weights, const cudaTextureObject_t __restrict__ text_nParamPerEvent, const cudaTextureObject_t __restrict__ text_nParamPerEvent_TF1)
 KS: Evaluate the total spline event weight on the GPU, as in most cases GPU is faster, even more this significant reduce memory transfer from GPU to CPU. More...
 

Function Documentation

◆ EvalOnGPU_Splines()

__global__ void EvalOnGPU_Splines ( const unsigned int  gpu_n_splines,
const short int  gpu_spline_size,
const short int *__restrict__  gpu_paramNo_arr,
const unsigned int *__restrict__  gpu_nKnots_arr,
const float *__restrict__  gpu_coeff_many,
const float *__restrict__  gpu_par_val,
const short int *__restrict__  gpu_spline_segment,
float *__restrict__  gpu_weights,
const cudaTextureObject_t __restrict__  text_coeff_x 
)

Evaluate the spline on the GPU Using one {y,b,c,d} array and one {x} array Should be most efficient at cache hitting and memory coalescence But using spline segments rather than the parameter value: avoids doing binary search on GPU.

Parameters
gpu_n_splinesTotal number of splines to evaluate (one thread per spline)
gpu_spline_sizeMax number of knots per spline (shared across all splines)
gpu_paramNo_arrhas length = spln_counter (keeps track of which parameter we're using on this thread)
gpu_nKnots_arrhas length = spln_counter (keeps track where current spline starts)
gpu_coeff_manyhas length = nKnots * 4, stores all coefficients for all splines and knots
gpu_weightshas length = spln_counter * spline_size
text_coeff_xarray storing info about X coeff, uses texture memory. Has length = n_params * spline_size,

Definition at line 330 of file gpuSplineUtils.cu.

339  {
340 //*********************************************************
341  // points per spline is the offset to skip in the index to move between splines
342  const unsigned int splineNum = (blockIdx.x * blockDim.x + threadIdx.x);
343 
344  // this is the stopping condition!
345  if (splineNum < gpu_n_splines) {
346  // This is the segment we want for this parameter variation
347  // for this particular splineNum; 0 = MACCQE, 1 = pFC, 2 = EBC, etc
348 
349  //CW: Which Parameter we are accessing
350  const short int Param = gpu_paramNo_arr[splineNum];
351 
352  //CW: Avoids doing costly binary search on GPU
353  const short int segment = gpu_spline_segment[Param];
354 
355  //KS: Segment for coeff_x is simply parameter*max knots + segment as each parmeters has the same spacing
356  const short int segment_X = Param*gpu_spline_size+segment;
357 
358  //KS: Find knot position in out monolitical structure
359  const unsigned int CurrentKnotPos = gpu_nKnots_arr[splineNum]*_nCoeff_+segment*_nCoeff_;
360 
361  // We've read the segment straight from CPU and is saved in gpu_spline_segment
362  // polynomial parameters from the monolithic splineMonolith
363  const float fY = gpu_coeff_many[CurrentKnotPos];
364  const float fB = gpu_coeff_many[CurrentKnotPos + 1];
365  const float fC = gpu_coeff_many[CurrentKnotPos + 2];
366  const float fD = gpu_coeff_many[CurrentKnotPos + 3];
367  // The is the variation itself (needed to evaluate variation - stored spline point = dx)
368  const float dx = gpu_par_val[Param] - tex1Dfetch<float>(text_coeff_x, segment_X);
369 
370  //CW: Wooow, let's use some fancy intrinsics and pull down the processing time by <1% from normal multiplication! HURRAY
371  gpu_weights[splineNum] = fmaf(dx, fmaf(dx, fmaf(dx, fD, fC), fB), fY);
372  // Or for the more "easy to read" version:
373  //gpu_weights[splineNum] = (fY+dx*(fB+dx*(fC+dx*fD)));
374 
375  //#ifdef MACH3_DEBUG
376  //printf("splineNum = %i/%i, paramNo = %i, variation = %f, segment = %i, fX = %f, fX+1 = %f, dx = %f, gpu_n_splines = %i, gpu_spline_size = %i, weight = %f \n", splineNum, gpu_n_splines, gpu_paramNo_arr[splineNum], gpu_par_val[Param], segment, tex1Dfetch<float>(text_coeff_x, segment_X), tex1Dfetch<float>(text_coeff_x, segment_X+1), dx, gpu_n_splines, gpu_spline_size, gpu_weights[splineNum]);
377  //#endif
378  }
379 }
constexpr int _nCoeff_
KS: We store coefficients {y,b,c,d} in one array one by one, this is only to define it once rather th...
Definition: SplineCommon.h:18

◆ EvalOnGPU_TF1()

__global__ void EvalOnGPU_TF1 ( const unsigned int  gpu_n_TF1,
const float *__restrict__  gpu_coeffs_tf1,
const short int *__restrict__  gpu_paramNo_arr_tf1,
const float *__restrict__  gpu_par_val,
float *__restrict__  gpu_weights_tf1 
)

Evaluate the TF1 on the GPU Using first order polynomial Polynomial form: w(x) = a0 + a1·x.

Parameters
gpu_n_TF1Total number of TF1 functions to evaluate (one thread per TF1)
gpu_coeffs_tf1coefficients of TF1, has length = tf1 coeef counter
gpu_paramNo_arr_tf1has length = spln_counter (keeps track of which parameter we're using on this thread)
gpu_weights_tf1has length = spln_counter * spline_size

Definition at line 383 of file gpuSplineUtils.cu.

388  {
389 //*********************************************************
390  // points per spline is the offset to skip in the index to move between splines
391  const unsigned int tf1Num = (blockIdx.x * blockDim.x + threadIdx.x);
392 
393  if (tf1Num < gpu_n_TF1) {
394  // The is the variation itself (needed to evaluate variation - stored spline point = dx)
395  const float x = gpu_par_val[gpu_paramNo_arr_tf1[tf1Num]];
396 
397  // Read the coefficients
398  const unsigned int TF1_Index = tf1Num * _nTF1Coeff_;
399  const float a = gpu_coeffs_tf1[TF1_Index];
400  const float b = gpu_coeffs_tf1[TF1_Index+1];
401 
402  gpu_weights_tf1[tf1Num] = fmaf(a, x, b);
403  // gpu_weights_tf1[tf1Num] = a*x + b;
404  // gpu_weights_tf1[tf1Num] = 1 + a*x + b*x*x + c*x*x*x + d*x*x*x*x + e*x*x*x*x*x;
405  }
406 }
constexpr int _nTF1Coeff_
KS: For TF1 we store at most 5 coefficients, we could make it more flexible but for now define it her...
Definition: SplineCommon.h:20

◆ EvalOnGPU_TotWeight()

__global__ void EvalOnGPU_TotWeight ( const int  gpu_n_events,
const float *__restrict__  gpu_weights,
const float *__restrict__  gpu_weights_tf1,
M3::float_t *__restrict__  gpu_total_weights,
const cudaTextureObject_t __restrict__  text_nParamPerEvent,
const cudaTextureObject_t __restrict__  text_nParamPerEvent_TF1 
)

KS: Evaluate the total spline event weight on the GPU, as in most cases GPU is faster, even more this significant reduce memory transfer from GPU to CPU.

Parameters
gpu_n_eventsTotal number of events to process (one thread per event)
gpu_weightsWeight for each spline object
gpu_weights_tf1Weight for each TF1 object
gpu_total_weightsTotal weight for each event
text_nParamPerEventmap keeping track how many parameters applies to each event, we keep two numbers here {number of splines per event, index where splines start for a given event}
text_nParamPerEvent_TF1map keeping track how many parameters applies to each event, we keep two numbers here {number of splines per event, index where splines start for a given event}

Definition at line 410 of file gpuSplineUtils.cu.

418  {
419 //*********************************************************
420  const unsigned int EventNum = (blockIdx.x * blockDim.x + threadIdx.x);
421 
422  if(EventNum < gpu_n_events) //stopping condition
423  {
424  float local_total_weight = 1.f;
425 
426  const unsigned int EventOffset = 2 * EventNum;
427 
428  for (unsigned int id = 0; id < tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset); ++id) {
429  local_total_weight *= gpu_weights[tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset+1) + id];
430  }
431 
432  for (unsigned int id = 0; id < tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset); ++id) {
433  local_total_weight *= gpu_weights_tf1[tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset+1) + id];
434  }
435  gpu_total_weights[EventNum] = static_cast<M3::float_t>(local_total_weight);
436  }
437 }
double float_t
Definition: Core.h:37

◆ SynchroniseSplines()

__host__ void SynchroniseSplines ( )

Make sure all Cuda threads finished execution.

Definition at line 50 of file gpuSplineUtils.cu.

50  {
51  cudaDeviceSynchronize();
52 }