MaCh3 2.2.1
Reference Guide
Loading...
Searching...
No Matches
Macros | Functions | Variables
gpuSplineUtils.cu File Reference
#include "Splines/gpuSplineUtils.cuh"
Include dependency graph for gpuSplineUtils.cu:

Go to the source code of this file.

Macros

#define _N_SPLINES_   NSplines_GPU
 

Functions

__host__ void SynchroniseSplines ()
 Make sure all Cuda threads finished execution.
 
__global__ void EvalOnGPU_Splines (const short int *__restrict__ gpu_paramNo_arr, const unsigned int *__restrict__ gpu_nKnots_arr, const float *__restrict__ gpu_coeff_many, float *__restrict__ gpu_weights, const cudaTextureObject_t __restrict__ text_coeff_x)
 Evaluate the spline on the GPU Using one {y,b,c,d} array and one {x} array Should be most efficient at cache hitting and memory coalescence But using spline segments rather than the parameter value: avoids doing binary search on GPU.
 
__global__ void EvalOnGPU_TF1 (const float *__restrict__ gpu_coeffs_tf1, const short int *__restrict__ gpu_paramNo_arr_tf1, float *__restrict__ gpu_weights_tf1)
 Evaluate the TF1 on the GPU Using 5th order polynomial.
 
__global__ void EvalOnGPU_TotWeight (const float *__restrict__ gpu_weights, const float *__restrict__ gpu_weights_tf1, float *__restrict__ gpu_total_weights, const cudaTextureObject_t __restrict__ text_nParamPerEvent, const cudaTextureObject_t __restrict__ text_nParamPerEvent_TF1)
 KS: Evaluate the total spline event weight on the GPU, as in most cases GPU is faster, even more this significant reduce memory transfer from GPU to CPU.
 

Variables

__device__ __constant__ unsigned int d_n_splines
 Number of splines living on GPU.
 
__device__ __constant__ unsigned int d_n_TF1
 Number of tf1 living on GPU.
 
__device__ __constant__ short int d_spline_size
 Size of splines living on GPU.
 
__device__ __constant__ int d_n_events
 Number of events living on GPU.
 
__device__ __constant__ float val_gpu [NSplines_GPU]
 CW: Constant memory needs to be hard-coded on compile time. Could make this texture memory instead, but don't care enough right now...
 
__device__ __constant__ short int segment_gpu [NSplines_GPU]
 

Macro Definition Documentation

◆ _N_SPLINES_

#define _N_SPLINES_   NSplines_GPU

Hard code the number of splines Not entirely necessary: only used for val_gpu and segment_gpu being device constants. Could move them to not being device constants

Definition at line 6 of file gpuSplineUtils.cu.

Function Documentation

◆ EvalOnGPU_Splines()

__global__ void EvalOnGPU_Splines ( const short int *__restrict__  gpu_paramNo_arr,
const unsigned int *__restrict__  gpu_nKnots_arr,
const float *__restrict__  gpu_coeff_many,
float *__restrict__  gpu_weights,
const cudaTextureObject_t __restrict__  text_coeff_x 
)

Evaluate the spline on the GPU Using one {y,b,c,d} array and one {x} array Should be most efficient at cache hitting and memory coalescence But using spline segments rather than the parameter value: avoids doing binary search on GPU.

Parameters
gpu_paramNo_arrhas length = spln_counter (keeps track of which parameter we're using on this thread)
gpu_nKnots_arrhas length = spln_counter (keeps track where current spline starts)
gpu_coeff_manyhas length = nKnots * 4, stores all coefficients for all splines and knots
gpu_weightshas length = spln_counter * spline_size
text_coeff_xarray storing info about X coeff, uses texture memory. Has length = n_params * spline_size,

Definition at line 348 of file gpuSplineUtils.cu.

353 {
354//*********************************************************
355 // points per spline is the offset to skip in the index to move between splines
356 const unsigned int splineNum = (blockIdx.x * blockDim.x + threadIdx.x);
357
358 // this is the stopping condition!
359 if (splineNum < d_n_splines) {
360 // This is the segment we want for this parameter variation
361 // for this particular splineNum; 0 = MACCQE, 1 = pFC, 2 = EBC, etc
362
363 //CW: Which Parameter we are accessing
364 const short int Param = gpu_paramNo_arr[splineNum];
365
366 //CW: Avoids doing costly binary search on GPU
367 const short int segment = segment_gpu[Param];
368
369 //KS: Segment for coeff_x is simply parameter*max knots + segment as each parmeters has the same spacing
370 const short int segment_X = Param*d_spline_size+segment;
371
372 //KS: Find knot position in out monolitical structure
373 const unsigned int CurrentKnotPos = gpu_nKnots_arr[splineNum]*_nCoeff_+segment*_nCoeff_;
374
375 // We've read the segment straight from CPU and is saved in segment_gpu
376 // polynomial parameters from the monolithic splineMonolith
377 const float fY = gpu_coeff_many[CurrentKnotPos];
378 const float fB = gpu_coeff_many[CurrentKnotPos + 1];
379 const float fC = gpu_coeff_many[CurrentKnotPos + 2];
380 const float fD = gpu_coeff_many[CurrentKnotPos + 3];
381 // The is the variation itself (needed to evaluate variation - stored spline point = dx)
382 const float dx = val_gpu[Param] - tex1Dfetch<float>(text_coeff_x, segment_X);
383
384 //CW: Wooow, let's use some fancy intrinsics and pull down the processing time by <1% from normal multiplication! HURRAY
385 gpu_weights[splineNum] = fmaf(dx, fmaf(dx, fmaf(dx, fD, fC), fB), fY);
386 // Or for the more "easy to read" version:
387 //gpu_weights[splineNum] = (fY+dx*(fB+dx*(fC+dx*fD)));
388
389 //#ifdef DEBUG
390 //printf("splineNum = %i/%i, paramNo = %i, variation = %f, segment = %i, fX = %f, fX+1 = %f, dx = %f, d_n_splines = %i, d_spline_size = %i, weight = %f \n", splineNum, d_n_splines, gpu_paramNo_arr[splineNum], val_gpu[Param], segment, tex1Dfetch<float>(text_coeff_x, segment_X), tex1Dfetch<float>(text_coeff_x, segment_X+1), dx, d_n_splines, d_spline_size, gpu_weights[splineNum]);
391 //#endif
392 }
393}
#define _nCoeff_
KS: We store coefficients {y,b,c,d} in one array one by one, this is only to define it once rather th...
Definition: SplineCommon.h:13
__device__ __constant__ unsigned int d_n_splines
Number of splines living on GPU.
__device__ __constant__ short int segment_gpu[NSplines_GPU]
__device__ __constant__ float val_gpu[NSplines_GPU]
CW: Constant memory needs to be hard-coded on compile time. Could make this texture memory instead,...
__device__ __constant__ short int d_spline_size
Size of splines living on GPU.

◆ EvalOnGPU_TF1()

__global__ void EvalOnGPU_TF1 ( const float *__restrict__  gpu_coeffs_tf1,
const short int *__restrict__  gpu_paramNo_arr_tf1,
float *__restrict__  gpu_weights_tf1 
)

Evaluate the TF1 on the GPU Using 5th order polynomial.

Parameters
gpu_coeffs_tf1coefficients of TF1, has length = tf1 coeef counter
gpu_paramNo_arr_tf1has length = spln_counter (keeps track of which parameter we're using on this thread)
gpu_weights_tf1has length = spln_counter * spline_size

Definition at line 397 of file gpuSplineUtils.cu.

400 {
401//*********************************************************
402 // points per spline is the offset to skip in the index to move between splines
403 const unsigned int tf1Num = (blockIdx.x * blockDim.x + threadIdx.x);
404
405 if (tf1Num < d_n_TF1) {
406 // The is the variation itself (needed to evaluate variation - stored spline point = dx)
407 const float x = val_gpu[gpu_paramNo_arr_tf1[tf1Num]];
408
409 // Read the coefficients
410 const unsigned int TF1_Index = tf1Num * _nTF1Coeff_;
411 const float a = gpu_coeffs_tf1[TF1_Index];
412 const float b = gpu_coeffs_tf1[TF1_Index+1];
413
414 gpu_weights_tf1[tf1Num] = fmaf(a, x, b);
415
416 // gpu_weights_tf1[tf1Num] = a*x + b;
417 //gpu_weights_tf1[tf1Num] = 1 + a*x + b*x*x + c*x*x*x + d*x*x*x*x + e*x*x*x*x*x;
418 }
419}
#define _nTF1Coeff_
KS: For TF1 we store at most 5 coefficients, we could make it more flexible but for now define it her...
Definition: SplineCommon.h:15
__device__ __constant__ unsigned int d_n_TF1
Number of tf1 living on GPU.

◆ EvalOnGPU_TotWeight()

__global__ void EvalOnGPU_TotWeight ( const float *__restrict__  gpu_weights,
const float *__restrict__  gpu_weights_tf1,
float *__restrict__  gpu_total_weights,
const cudaTextureObject_t __restrict__  text_nParamPerEvent,
const cudaTextureObject_t __restrict__  text_nParamPerEvent_TF1 
)

KS: Evaluate the total spline event weight on the GPU, as in most cases GPU is faster, even more this significant reduce memory transfer from GPU to CPU.

Parameters
gpu_weightsWeight for each spline object
gpu_weights_tf1Weight for each TF1 object
gpu_total_weightsTotal weight for each event
text_nParamPerEventmap keeping track how many parameters applies to each event, we keep two numbers here {number of splines per event, index where splines start for a given event}
text_nParamPerEvent_TF1map keeping track how many parameters applies to each event, we keep two numbers here {number of splines per event, index where splines start for a given event}

Definition at line 424 of file gpuSplineUtils.cu.

431 {
432//*********************************************************
433 const unsigned int EventNum = (blockIdx.x * blockDim.x + threadIdx.x);
434
435 //KS: Accessing shared memory is much much faster than global memory hence we use shared memory for calculation and then write to global memory
436 __shared__ float shared_total_weights[_BlockSize_];
437 if(EventNum < d_n_events) //stopping condition
438 {
439 shared_total_weights[threadIdx.x] = 1.f;
440
441 const unsigned int EventOffset = 2 * EventNum;
442
443 for (unsigned int id = 0; id < tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset); ++id) {
444 shared_total_weights[threadIdx.x] *= gpu_weights[tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset+1) + id];
445 }
446
447 for (unsigned int id = 0; id < tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset); ++id) {
448 shared_total_weights[threadIdx.x] *= gpu_weights_tf1[tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset+1) + id];
449 }
450 gpu_total_weights[EventNum] = shared_total_weights[threadIdx.x];
451 }
452}
__device__ __constant__ int d_n_events
Number of events living on GPU.
#define _BlockSize_
KS: Need it for shared memory, there is way to use dynamic shared memory but I am lazy right now.
Definition: gpuUtils.cuh:25

◆ SynchroniseSplines()

__host__ void SynchroniseSplines ( )

Make sure all Cuda threads finished execution.

Definition at line 73 of file gpuSplineUtils.cu.

73 {
74 cudaDeviceSynchronize();
75}

Variable Documentation

◆ d_n_events

__device__ __constant__ int d_n_events

Number of events living on GPU.

Definition at line 64 of file gpuSplineUtils.cu.

◆ d_n_splines

__device__ __constant__ unsigned int d_n_splines

Number of splines living on GPU.

Definition at line 57 of file gpuSplineUtils.cu.

◆ d_n_TF1

__device__ __constant__ unsigned int d_n_TF1

Number of tf1 living on GPU.

Definition at line 59 of file gpuSplineUtils.cu.

◆ d_spline_size

__device__ __constant__ short int d_spline_size

Size of splines living on GPU.

Definition at line 61 of file gpuSplineUtils.cu.

◆ segment_gpu

__device__ __constant__ short int segment_gpu[NSplines_GPU]

Definition at line 68 of file gpuSplineUtils.cu.

◆ val_gpu

__device__ __constant__ float val_gpu[NSplines_GPU]

CW: Constant memory needs to be hard-coded on compile time. Could make this texture memory instead, but don't care enough right now...

Definition at line 67 of file gpuSplineUtils.cu.