MaCh3  2.5.0
Reference Guide
Classes | Functions
gpuSplineUtils.cuh File Reference

MaCh3 event-by-event cross-section spline code. More...

#include "Manager/gpuUtils.cuh"
#include "Splines/SplineCommon.h"
Include dependency graph for gpuSplineUtils.cuh:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

class  SplineMonolithGPU
 Class responsible for calculating spline weight on GPU. More...
 

Functions

__host__ void SynchroniseSplines ()
 Make sure all Cuda threads finished execution. More...
 
__global__ void EvalOnGPU_Splines (const unsigned int gpu_n_splines, const short int gpu_spline_size, const short int *__restrict__ gpu_paramNo_arr, const unsigned int *__restrict__ gpu_nKnots_arr, const float *__restrict__ gpu_coeff_many, const float *__restrict__ gpu_par_val, const short int *__restrict__ gpu_spline_segment, float *__restrict__ gpu_weights, const cudaTextureObject_t __restrict__ text_coeff_x)
 Evaluate the spline on the GPU Using one {y,b,c,d} array and one {x} array Should be most efficient at cache hitting and memory coalescence But using spline segments rather than the parameter value: avoids doing binary search on GPU. More...
 
__global__ void EvalOnGPU_TF1 (const unsigned int gpu_n_TF1, const float *__restrict__ gpu_coeffs_tf1, const short int *__restrict__ gpu_paramNo_arr_tf1, const float *__restrict__ gpu_par_val, float *__restrict__ gpu_weights_tf1)
 Evaluate the TF1 on the GPU Using first order polynomial Polynomial form: w(x) = a0 + a1·x. More...
 
__global__ void EvalOnGPU_TotWeight (const int gpu_n_events, const float *__restrict__ gpu_weights, const float *__restrict__ gpu_weights_tf1, M3::float_t *__restrict__ gpu_total_weights, const cudaTextureObject_t __restrict__ text_nParamPerEvent, const cudaTextureObject_t __restrict__ text_nParamPerEvent_TF1)
 KS: Evaluate the total spline event weight on the GPU, as in most cases GPU is faster, even more this significant reduce memory transfer from GPU to CPU. More...
 

Detailed Description

MaCh3 event-by-event cross-section spline code.

Author
Richard Calland
Asher Kaboth
Clarence Wret
Kamil Skwarczynski

Contains code to run on CUDA GPUs. Essentially we load up stripped TSpline3 objects to the GPU and do the equivalent of TSpline3->Eval(double) for all events Now also supports TF1 evals Called from Samples/SampleHandler.cpp -> Splines/SplineMonolith.cpp -> Splines/gpuSplineUtils.cu

Definition in file gpuSplineUtils.cuh.

Function Documentation

◆ EvalOnGPU_Splines()

__global__ void EvalOnGPU_Splines ( const unsigned int  gpu_n_splines,
const short int  gpu_spline_size,
const short int *__restrict__  gpu_paramNo_arr,
const unsigned int *__restrict__  gpu_nKnots_arr,
const float *__restrict__  gpu_coeff_many,
const float *__restrict__  gpu_par_val,
const short int *__restrict__  gpu_spline_segment,
float *__restrict__  gpu_weights,
const cudaTextureObject_t __restrict__  text_coeff_x 
)

Evaluate the spline on the GPU Using one {y,b,c,d} array and one {x} array Should be most efficient at cache hitting and memory coalescence But using spline segments rather than the parameter value: avoids doing binary search on GPU.

Parameters
gpu_n_splinesTotal number of splines to evaluate (one thread per spline)
gpu_spline_sizeMax number of knots per spline (shared across all splines)
gpu_paramNo_arrhas length = spln_counter (keeps track of which parameter we're using on this thread)
gpu_nKnots_arrhas length = spln_counter (keeps track where current spline starts)
gpu_coeff_manyhas length = nKnots * 4, stores all coefficients for all splines and knots
gpu_weightshas length = spln_counter * spline_size
text_coeff_xarray storing info about X coeff, uses texture memory. Has length = n_params * spline_size,

Definition at line 330 of file gpuSplineUtils.cu.

339  {
340 //*********************************************************
341  // points per spline is the offset to skip in the index to move between splines
342  const unsigned int splineNum = (blockIdx.x * blockDim.x + threadIdx.x);
343 
344  // this is the stopping condition!
345  if (splineNum < gpu_n_splines) {
346  // This is the segment we want for this parameter variation
347  // for this particular splineNum; 0 = MACCQE, 1 = pFC, 2 = EBC, etc
348 
349  //CW: Which Parameter we are accessing
350  const short int Param = gpu_paramNo_arr[splineNum];
351 
352  //CW: Avoids doing costly binary search on GPU
353  const short int segment = gpu_spline_segment[Param];
354 
355  //KS: Segment for coeff_x is simply parameter*max knots + segment as each parmeters has the same spacing
356  const short int segment_X = Param*gpu_spline_size+segment;
357 
358  //KS: Find knot position in out monolitical structure
359  const unsigned int CurrentKnotPos = gpu_nKnots_arr[splineNum]*_nCoeff_+segment*_nCoeff_;
360 
361  // We've read the segment straight from CPU and is saved in gpu_spline_segment
362  // polynomial parameters from the monolithic splineMonolith
363  const float fY = gpu_coeff_many[CurrentKnotPos];
364  const float fB = gpu_coeff_many[CurrentKnotPos + 1];
365  const float fC = gpu_coeff_many[CurrentKnotPos + 2];
366  const float fD = gpu_coeff_many[CurrentKnotPos + 3];
367  // The is the variation itself (needed to evaluate variation - stored spline point = dx)
368  const float dx = gpu_par_val[Param] - tex1Dfetch<float>(text_coeff_x, segment_X);
369 
370  //CW: Wooow, let's use some fancy intrinsics and pull down the processing time by <1% from normal multiplication! HURRAY
371  gpu_weights[splineNum] = fmaf(dx, fmaf(dx, fmaf(dx, fD, fC), fB), fY);
372  // Or for the more "easy to read" version:
373  //gpu_weights[splineNum] = (fY+dx*(fB+dx*(fC+dx*fD)));
374 
375  //#ifdef MACH3_DEBUG
376  //printf("splineNum = %i/%i, paramNo = %i, variation = %f, segment = %i, fX = %f, fX+1 = %f, dx = %f, gpu_n_splines = %i, gpu_spline_size = %i, weight = %f \n", splineNum, gpu_n_splines, gpu_paramNo_arr[splineNum], gpu_par_val[Param], segment, tex1Dfetch<float>(text_coeff_x, segment_X), tex1Dfetch<float>(text_coeff_x, segment_X+1), dx, gpu_n_splines, gpu_spline_size, gpu_weights[splineNum]);
377  //#endif
378  }
379 }
constexpr int _nCoeff_
KS: We store coefficients {y,b,c,d} in one array one by one, this is only to define it once rather th...
Definition: SplineCommon.h:18

◆ EvalOnGPU_TF1()

__global__ void EvalOnGPU_TF1 ( const unsigned int  gpu_n_TF1,
const float *__restrict__  gpu_coeffs_tf1,
const short int *__restrict__  gpu_paramNo_arr_tf1,
const float *__restrict__  gpu_par_val,
float *__restrict__  gpu_weights_tf1 
)

Evaluate the TF1 on the GPU Using first order polynomial Polynomial form: w(x) = a0 + a1·x.

Parameters
gpu_n_TF1Total number of TF1 functions to evaluate (one thread per TF1)
gpu_coeffs_tf1coefficients of TF1, has length = tf1 coeef counter
gpu_paramNo_arr_tf1has length = spln_counter (keeps track of which parameter we're using on this thread)
gpu_weights_tf1has length = spln_counter * spline_size

Definition at line 383 of file gpuSplineUtils.cu.

388  {
389 //*********************************************************
390  // points per spline is the offset to skip in the index to move between splines
391  const unsigned int tf1Num = (blockIdx.x * blockDim.x + threadIdx.x);
392 
393  if (tf1Num < gpu_n_TF1) {
394  // The is the variation itself (needed to evaluate variation - stored spline point = dx)
395  const float x = gpu_par_val[gpu_paramNo_arr_tf1[tf1Num]];
396 
397  // Read the coefficients
398  const unsigned int TF1_Index = tf1Num * _nTF1Coeff_;
399  const float a = gpu_coeffs_tf1[TF1_Index];
400  const float b = gpu_coeffs_tf1[TF1_Index+1];
401 
402  gpu_weights_tf1[tf1Num] = fmaf(a, x, b);
403  // gpu_weights_tf1[tf1Num] = a*x + b;
404  // gpu_weights_tf1[tf1Num] = 1 + a*x + b*x*x + c*x*x*x + d*x*x*x*x + e*x*x*x*x*x;
405  }
406 }
constexpr int _nTF1Coeff_
KS: For TF1 we store at most 5 coefficients, we could make it more flexible but for now define it her...
Definition: SplineCommon.h:20

◆ EvalOnGPU_TotWeight()

__global__ void EvalOnGPU_TotWeight ( const int  gpu_n_events,
const float *__restrict__  gpu_weights,
const float *__restrict__  gpu_weights_tf1,
M3::float_t *__restrict__  gpu_total_weights,
const cudaTextureObject_t __restrict__  text_nParamPerEvent,
const cudaTextureObject_t __restrict__  text_nParamPerEvent_TF1 
)

KS: Evaluate the total spline event weight on the GPU, as in most cases GPU is faster, even more this significant reduce memory transfer from GPU to CPU.

Parameters
gpu_n_eventsTotal number of events to process (one thread per event)
gpu_weightsWeight for each spline object
gpu_weights_tf1Weight for each TF1 object
gpu_total_weightsTotal weight for each event
text_nParamPerEventmap keeping track how many parameters applies to each event, we keep two numbers here {number of splines per event, index where splines start for a given event}
text_nParamPerEvent_TF1map keeping track how many parameters applies to each event, we keep two numbers here {number of splines per event, index where splines start for a given event}

Definition at line 410 of file gpuSplineUtils.cu.

418  {
419 //*********************************************************
420  const unsigned int EventNum = (blockIdx.x * blockDim.x + threadIdx.x);
421 
422  if(EventNum < gpu_n_events) //stopping condition
423  {
424  float local_total_weight = 1.f;
425 
426  const unsigned int EventOffset = 2 * EventNum;
427 
428  for (unsigned int id = 0; id < tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset); ++id) {
429  local_total_weight *= gpu_weights[tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset+1) + id];
430  }
431 
432  for (unsigned int id = 0; id < tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset); ++id) {
433  local_total_weight *= gpu_weights_tf1[tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset+1) + id];
434  }
435  gpu_total_weights[EventNum] = static_cast<M3::float_t>(local_total_weight);
436  }
437 }
double float_t
Definition: Core.h:37

◆ SynchroniseSplines()

__host__ void SynchroniseSplines ( )

Make sure all Cuda threads finished execution.

Definition at line 50 of file gpuSplineUtils.cu.

50  {
51  cudaDeviceSynchronize();
52 }