MaCh3 2.2.1
Reference Guide
Loading...
Searching...
No Matches
Classes | Functions
gpuSplineUtils.cuh File Reference

MaCh3 event-by-event cross-section spline code. More...

#include "Manager/gpuUtils.cuh"
#include "Splines/SplineCommon.h"
Include dependency graph for gpuSplineUtils.cuh:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

class  SMonolithGPU
 Class responsible for calculating spline weight on GPU. More...
 

Functions

__host__ void SynchroniseSplines ()
 Make sure all Cuda threads finished execution.
 
__global__ void EvalOnGPU_Splines (const short int *__restrict__ gpu_paramNo_arr, const unsigned int *__restrict__ gpu_nKnots_arr, const float *__restrict__ gpu_coeff_many, float *__restrict__ gpu_weights, const cudaTextureObject_t __restrict__ text_coeff_x)
 Evaluate the spline on the GPU Using one {y,b,c,d} array and one {x} array Should be most efficient at cache hitting and memory coalescence But using spline segments rather than the parameter value: avoids doing binary search on GPU.
 
__global__ void EvalOnGPU_TF1 (const float *__restrict__ gpu_coeffs_tf1, const short int *__restrict__ gpu_paramNo_arr_tf1, float *__restrict__ gpu_weights_tf1)
 Evaluate the TF1 on the GPU Using 5th order polynomial.
 
__global__ void EvalOnGPU_TotWeight (const float *__restrict__ gpu_weights, const float *__restrict__ gpu_weights_tf1, float *__restrict__ gpu_total_weights, const cudaTextureObject_t __restrict__ text_nParamPerEvent, const cudaTextureObject_t __restrict__ text_nParamPerEvent_TF1)
 KS: Evaluate the total spline event weight on the GPU, as in most cases GPU is faster, even more this significant reduce memory transfer from GPU to CPU.
 

Detailed Description

MaCh3 event-by-event cross-section spline code.

Author
Richard Calland
Asher Kaboth
Clarence Wret
Kamil Skwarczynski

Contains code to run on CUDA GPUs. Essentially we load up stripped TSpline3 objects to the GPU and do the equivalent of TSpline3->Eval(double) for all events Now also supports TF1 evals Called from Samples/samplePDFND.cpp -> Splines/SplineMonolith.cpp -> Splines/gpuSplineUtils.cu

Definition in file gpuSplineUtils.cuh.

Function Documentation

◆ EvalOnGPU_Splines()

__global__ void EvalOnGPU_Splines ( const short int *__restrict__  gpu_paramNo_arr,
const unsigned int *__restrict__  gpu_nKnots_arr,
const float *__restrict__  gpu_coeff_many,
float *__restrict__  gpu_weights,
const cudaTextureObject_t __restrict__  text_coeff_x 
)

Evaluate the spline on the GPU Using one {y,b,c,d} array and one {x} array Should be most efficient at cache hitting and memory coalescence But using spline segments rather than the parameter value: avoids doing binary search on GPU.

Parameters
gpu_paramNo_arrhas length = spln_counter (keeps track of which parameter we're using on this thread)
gpu_nKnots_arrhas length = spln_counter (keeps track where current spline starts)
gpu_coeff_manyhas length = nKnots * 4, stores all coefficients for all splines and knots
gpu_weightshas length = spln_counter * spline_size
text_coeff_xarray storing info about X coeff, uses texture memory. Has length = n_params * spline_size,

Definition at line 348 of file gpuSplineUtils.cu.

353 {
354//*********************************************************
355 // points per spline is the offset to skip in the index to move between splines
356 const unsigned int splineNum = (blockIdx.x * blockDim.x + threadIdx.x);
357
358 // this is the stopping condition!
359 if (splineNum < d_n_splines) {
360 // This is the segment we want for this parameter variation
361 // for this particular splineNum; 0 = MACCQE, 1 = pFC, 2 = EBC, etc
362
363 //CW: Which Parameter we are accessing
364 const short int Param = gpu_paramNo_arr[splineNum];
365
366 //CW: Avoids doing costly binary search on GPU
367 const short int segment = segment_gpu[Param];
368
369 //KS: Segment for coeff_x is simply parameter*max knots + segment as each parmeters has the same spacing
370 const short int segment_X = Param*d_spline_size+segment;
371
372 //KS: Find knot position in out monolitical structure
373 const unsigned int CurrentKnotPos = gpu_nKnots_arr[splineNum]*_nCoeff_+segment*_nCoeff_;
374
375 // We've read the segment straight from CPU and is saved in segment_gpu
376 // polynomial parameters from the monolithic splineMonolith
377 const float fY = gpu_coeff_many[CurrentKnotPos];
378 const float fB = gpu_coeff_many[CurrentKnotPos + 1];
379 const float fC = gpu_coeff_many[CurrentKnotPos + 2];
380 const float fD = gpu_coeff_many[CurrentKnotPos + 3];
381 // The is the variation itself (needed to evaluate variation - stored spline point = dx)
382 const float dx = val_gpu[Param] - tex1Dfetch<float>(text_coeff_x, segment_X);
383
384 //CW: Wooow, let's use some fancy intrinsics and pull down the processing time by <1% from normal multiplication! HURRAY
385 gpu_weights[splineNum] = fmaf(dx, fmaf(dx, fmaf(dx, fD, fC), fB), fY);
386 // Or for the more "easy to read" version:
387 //gpu_weights[splineNum] = (fY+dx*(fB+dx*(fC+dx*fD)));
388
389 //#ifdef DEBUG
390 //printf("splineNum = %i/%i, paramNo = %i, variation = %f, segment = %i, fX = %f, fX+1 = %f, dx = %f, d_n_splines = %i, d_spline_size = %i, weight = %f \n", splineNum, d_n_splines, gpu_paramNo_arr[splineNum], val_gpu[Param], segment, tex1Dfetch<float>(text_coeff_x, segment_X), tex1Dfetch<float>(text_coeff_x, segment_X+1), dx, d_n_splines, d_spline_size, gpu_weights[splineNum]);
391 //#endif
392 }
393}
#define _nCoeff_
KS: We store coefficients {y,b,c,d} in one array one by one, this is only to define it once rather th...
Definition: SplineCommon.h:13
__device__ __constant__ unsigned int d_n_splines
Number of splines living on GPU.
__device__ __constant__ short int segment_gpu[NSplines_GPU]
__device__ __constant__ float val_gpu[NSplines_GPU]
CW: Constant memory needs to be hard-coded on compile time. Could make this texture memory instead,...
__device__ __constant__ short int d_spline_size
Size of splines living on GPU.

◆ EvalOnGPU_TF1()

__global__ void EvalOnGPU_TF1 ( const float *__restrict__  gpu_coeffs_tf1,
const short int *__restrict__  gpu_paramNo_arr_tf1,
float *__restrict__  gpu_weights_tf1 
)

Evaluate the TF1 on the GPU Using 5th order polynomial.

Parameters
gpu_coeffs_tf1coefficients of TF1, has length = tf1 coeef counter
gpu_paramNo_arr_tf1has length = spln_counter (keeps track of which parameter we're using on this thread)
gpu_weights_tf1has length = spln_counter * spline_size

Definition at line 397 of file gpuSplineUtils.cu.

400 {
401//*********************************************************
402 // points per spline is the offset to skip in the index to move between splines
403 const unsigned int tf1Num = (blockIdx.x * blockDim.x + threadIdx.x);
404
405 if (tf1Num < d_n_TF1) {
406 // The is the variation itself (needed to evaluate variation - stored spline point = dx)
407 const float x = val_gpu[gpu_paramNo_arr_tf1[tf1Num]];
408
409 // Read the coefficients
410 const unsigned int TF1_Index = tf1Num * _nTF1Coeff_;
411 const float a = gpu_coeffs_tf1[TF1_Index];
412 const float b = gpu_coeffs_tf1[TF1_Index+1];
413
414 gpu_weights_tf1[tf1Num] = fmaf(a, x, b);
415
416 // gpu_weights_tf1[tf1Num] = a*x + b;
417 //gpu_weights_tf1[tf1Num] = 1 + a*x + b*x*x + c*x*x*x + d*x*x*x*x + e*x*x*x*x*x;
418 }
419}
#define _nTF1Coeff_
KS: For TF1 we store at most 5 coefficients, we could make it more flexible but for now define it her...
Definition: SplineCommon.h:15
__device__ __constant__ unsigned int d_n_TF1
Number of tf1 living on GPU.

◆ EvalOnGPU_TotWeight()

__global__ void EvalOnGPU_TotWeight ( const float *__restrict__  gpu_weights,
const float *__restrict__  gpu_weights_tf1,
float *__restrict__  gpu_total_weights,
const cudaTextureObject_t __restrict__  text_nParamPerEvent,
const cudaTextureObject_t __restrict__  text_nParamPerEvent_TF1 
)

KS: Evaluate the total spline event weight on the GPU, as in most cases GPU is faster, even more this significant reduce memory transfer from GPU to CPU.

Parameters
gpu_weightsWeight for each spline object
gpu_weights_tf1Weight for each TF1 object
gpu_total_weightsTotal weight for each event
text_nParamPerEventmap keeping track how many parameters applies to each event, we keep two numbers here {number of splines per event, index where splines start for a given event}
text_nParamPerEvent_TF1map keeping track how many parameters applies to each event, we keep two numbers here {number of splines per event, index where splines start for a given event}

Definition at line 424 of file gpuSplineUtils.cu.

431 {
432//*********************************************************
433 const unsigned int EventNum = (blockIdx.x * blockDim.x + threadIdx.x);
434
435 //KS: Accessing shared memory is much much faster than global memory hence we use shared memory for calculation and then write to global memory
436 __shared__ float shared_total_weights[_BlockSize_];
437 if(EventNum < d_n_events) //stopping condition
438 {
439 shared_total_weights[threadIdx.x] = 1.f;
440
441 const unsigned int EventOffset = 2 * EventNum;
442
443 for (unsigned int id = 0; id < tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset); ++id) {
444 shared_total_weights[threadIdx.x] *= gpu_weights[tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset+1) + id];
445 }
446
447 for (unsigned int id = 0; id < tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset); ++id) {
448 shared_total_weights[threadIdx.x] *= gpu_weights_tf1[tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset+1) + id];
449 }
450 gpu_total_weights[EventNum] = shared_total_weights[threadIdx.x];
451 }
452}
__device__ __constant__ int d_n_events
Number of events living on GPU.
#define _BlockSize_
KS: Need it for shared memory, there is way to use dynamic shared memory but I am lazy right now.
Definition: gpuUtils.cuh:25

◆ SynchroniseSplines()

__host__ void SynchroniseSplines ( )

Make sure all Cuda threads finished execution.

Definition at line 73 of file gpuSplineUtils.cu.

73 {
74 cudaDeviceSynchronize();
75}