MaCh3 event-by-event cross-section spline code. More...

#include "Manager/gpuUtils.cuh"
#include "Splines/SplineCommon.h"

Include dependency graph for gpuSplineUtils.cuh:

This graph shows which files directly or indirectly include this file:

Classes
class	SMonolithGPU
	Class responsible for calculating spline weight on GPU. More...

Functions
__host__ void	SynchroniseSplines ()
	Make sure all Cuda threads finished execution.

__global__ void	EvalOnGPU_Splines (const short int *__restrict__ gpu_paramNo_arr, const unsigned int *__restrict__ gpu_nKnots_arr, const float *__restrict__ gpu_coeff_many, float *__restrict__ gpu_weights, const cudaTextureObject_t __restrict__ text_coeff_x)
	Evaluate the spline on the GPU using one {y,b,c,d} array and one {x} array. Should be most efficient at cache hitting and memory coalescence. Uses precomputed spline segments rather than the parameter value, which avoids doing a binary search on the GPU.

__global__ void	EvalOnGPU_TF1 (const float *__restrict__ gpu_coeffs_tf1, const short int *__restrict__ gpu_paramNo_arr_tf1, float *__restrict__ gpu_weights_tf1)
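	Evaluate the TF1 on the GPU. Described as using a 5th order polynomial, but the kernel body shown in the function documentation currently evaluates only the linear form a*x + b (the 5th order expression is commented out).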

__global__ void	EvalOnGPU_TotWeight (const float *__restrict__ gpu_weights, const float *__restrict__ gpu_weights_tf1, float *__restrict__ gpu_total_weights, const cudaTextureObject_t __restrict__ text_nParamPerEvent, const cudaTextureObject_t __restrict__ text_nParamPerEvent_TF1)
	KS: Evaluate the total spline event weight on the GPU, as in most cases the GPU is faster; moreover, this significantly reduces memory transfer from GPU to CPU.

Detailed Description

MaCh3 event-by-event cross-section spline code.

Author: Richard Calland; Asher Kaboth; Clarence Wret; Kamil Skwarczynski

Contains code to run on CUDA GPUs. Essentially we load up stripped TSpline3 objects to the GPU and do the equivalent of TSpline3->Eval(double) for all events. Now also supports TF1 evaluation. Called from Samples/samplePDFND.cpp -> Splines/SplineMonolith.cpp -> Splines/gpuSplineUtils.cu.

Definition in file gpuSplineUtils.cuh.

Function Documentation

◆ EvalOnGPU_Splines()

__global__ void EvalOnGPU_Splines	(	const short int *__restrict__	gpu_paramNo_arr,
		const unsigned int *__restrict__	gpu_nKnots_arr,
		const float *__restrict__	gpu_coeff_many,
		float *__restrict__	gpu_weights,
		const cudaTextureObject_t __restrict__	text_coeff_x
	)

Evaluate the spline on the GPU using one {y,b,c,d} array and one {x} array. Should be most efficient at cache hitting and memory coalescence. Uses precomputed spline segments rather than the parameter value, which avoids doing a binary search on the GPU.

Parameters

gpu_paramNo_arr	has length = spln_counter (keeps track of which parameter we're using on this thread)
gpu_nKnots_arr	has length = spln_counter (keeps track where current spline starts)
gpu_coeff_many	has length = nKnots * 4, stores all coefficients for all splines and knots
gpu_weights	has length = spln_counter * spline_size
text_coeff_x	array storing info about X coeff, uses texture memory. Has length = n_params * spline_size.

Definition at line 348 of file gpuSplineUtils.cu.

                                                       {
//*********************************************************
  // Each thread handles one spline; its flat 1D global thread index is the spline index
  const unsigned int splineNum = (blockIdx.x * blockDim.x + threadIdx.x);

  // this is the stopping condition!
  if (splineNum < d_n_splines) {
    // This is the segment we want for this parameter variation
    // for this particular splineNum; 0 = MACCQE, 1 = pFC, 2 = EBC, etc

    //CW: Which parameter we are accessing
    const short int Param = gpu_paramNo_arr[splineNum];

    //CW: Avoids doing costly binary search on GPU
    const short int segment = segment_gpu[Param];

    //KS: Segment for coeff_x is simply parameter*max knots + segment as each parameter has the same spacing
    // Stored as int rather than short int: arithmetic on a short is carried out at int width or wider,
    // so narrowing the result back to short would silently truncate the texture index for large Param*d_spline_size
    const int segment_X = Param*d_spline_size+segment;

    //KS: Find knot position in our monolithic structure
    const unsigned int CurrentKnotPos = gpu_nKnots_arr[splineNum]*_nCoeff_+segment*_nCoeff_;

    // We've read the segment straight from CPU and it is saved in segment_gpu
    // polynomial parameters from the monolithic splineMonolith
    const float fY = gpu_coeff_many[CurrentKnotPos];
    const float fB = gpu_coeff_many[CurrentKnotPos + 1];
    const float fC = gpu_coeff_many[CurrentKnotPos + 2];
    const float fD = gpu_coeff_many[CurrentKnotPos + 3];
    // This is the variation itself (needed to evaluate variation - stored spline point = dx)
    const float dx = val_gpu[Param] - tex1Dfetch<float>(text_coeff_x, segment_X);

    //CW: Wooow, let's use some fancy intrinsics and pull down the processing time by <1% from normal multiplication! HURRAY
    // Nested fused multiply-adds evaluating fY + dx*(fB + dx*(fC + dx*fD))
    gpu_weights[splineNum] = fmaf(dx, fmaf(dx, fmaf(dx, fD, fC), fB), fY);
    // Or for the more "easy to read" version:
    //gpu_weights[splineNum] = (fY+dx*(fB+dx*(fC+dx*fD)));

    //#ifdef DEBUG
    //printf("splineNum = %i/%i, paramNo = %i, variation = %f, segment = %i, fX = %f, fX+1 = %f, dx = %f, d_n_splines = %i, d_spline_size = %i, weight = %f \n", splineNum, d_n_splines, gpu_paramNo_arr[splineNum], val_gpu[Param], segment, tex1Dfetch<float>(text_coeff_x, segment_X), tex1Dfetch<float>(text_coeff_x, segment_X+1), dx, d_n_splines, d_spline_size, gpu_weights[splineNum]);
    //#endif
  }
}

◆ EvalOnGPU_TF1()

__global__ void EvalOnGPU_TF1	(	const float *__restrict__	gpu_coeffs_tf1,
		const short int *__restrict__	gpu_paramNo_arr_tf1,
		float *__restrict__	gpu_weights_tf1
	)

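Evaluate the TF1 on the GPU. Described as using a 5th order polynomial, but the kernel body shown below currently evaluates only the linear form a*x + b (the 5th order expression is commented out).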

Parameters

gpu_coeffs_tf1	coefficients of TF1, has length = tf1 coeff counter
gpu_paramNo_arr_tf1	has length = spln_counter (keeps track of which parameter we're using on this thread)
gpu_weights_tf1	has length = spln_counter * spline_size

Definition at line 397 of file gpuSplineUtils.cu.

                                         {
//*********************************************************
  // Flat 1D global thread index; each thread evaluates a single TF1
  const unsigned int tf1Num = (blockIdx.x * blockDim.x + threadIdx.x);

  // Threads past the last TF1 have nothing to do
  if (tf1Num >= d_n_TF1) return;

  // Coefficients of this TF1 start at tf1Num * _nTF1Coeff_; only the first two are read here
  const unsigned int firstCoeff = tf1Num * _nTF1Coeff_;
  const float slope = gpu_coeffs_tf1[firstCoeff];
  const float intercept = gpu_coeffs_tf1[firstCoeff + 1];

  // Current value of the parameter this TF1 depends on
  const short int paramIndex = gpu_paramNo_arr_tf1[tf1Num];
  const float paramValue = val_gpu[paramIndex];

  // Linear response slope*paramValue + intercept, computed as one fused multiply-add
  gpu_weights_tf1[tf1Num] = fmaf(slope, paramValue, intercept);
}

◆ EvalOnGPU_TotWeight()

__global__ void EvalOnGPU_TotWeight	(	const float *__restrict__	gpu_weights,
		const float *__restrict__	gpu_weights_tf1,
		float *__restrict__	gpu_total_weights,
		const cudaTextureObject_t __restrict__	text_nParamPerEvent,
		const cudaTextureObject_t __restrict__	text_nParamPerEvent_TF1
	)

KS: Evaluate the total spline event weight on the GPU, as in most cases the GPU is faster; moreover, this significantly reduces memory transfer from GPU to CPU.

Parameters

gpu_weights	Weight for each spline object
gpu_weights_tf1	Weight for each TF1 object
gpu_total_weights	Total weight for each event
text_nParamPerEvent	map keeping track of how many parameters apply to each event; we keep two numbers here: {number of splines per event, index where splines start for a given event}
text_nParamPerEvent_TF1	map keeping track of how many TF1 parameters apply to each event; we keep two numbers here: {number of TF1 objects per event, index where the TF1 objects start for a given event}

Definition at line 424 of file gpuSplineUtils.cu.

                                                                  {
//*********************************************************
  // Each thread handles one event; its flat 1D global thread index is the event index
  const unsigned int EventNum = (blockIdx.x * blockDim.x + threadIdx.x);

  if(EventNum < d_n_events) //stopping condition
  {
    //KS: Accumulate in a thread-local variable and write to global memory once at the end.
    // No other thread ever reads this running product, so a __shared__ staging array is not needed,
    // and dropping it removes the requirement that the launch block size does not exceed _BlockSize_
    float total_weight = 1.f;

    // Two entries are stored per event: {count, index of first weight}
    const unsigned int EventOffset = 2 * EventNum;

    // Fetch count and start index once, instead of re-fetching them from the texture on every loop iteration
    const unsigned int nSplines = tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset);
    const unsigned int firstSpline = tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset+1);
    for (unsigned int id = 0; id < nSplines; ++id) {
      total_weight *= gpu_weights[firstSpline + id];
    }

    // Same layout for the TF1 weights
    const unsigned int nTF1 = tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset);
    const unsigned int firstTF1 = tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset+1);
    for (unsigned int id = 0; id < nTF1; ++id) {
      total_weight *= gpu_weights_tf1[firstTF1 + id];
    }

    // Product of all spline weights followed by all TF1 weights for this event
    gpu_total_weights[EventNum] = total_weight;
  }
}

◆ SynchroniseSplines()

__host__ void SynchroniseSplines ( )

Make sure all Cuda threads finished execution.

Definition at line 73 of file gpuSplineUtils.cu.

                                   {
  // Wait on the host until all Cuda threads have finished execution.
  // NOTE(review): the status returned by cudaDeviceSynchronize() is discarded here — confirm errors are checked elsewhere
  static_cast<void>(cudaDeviceSynchronize());
}

Classes

Functions

Detailed Description

Function Documentation

◆ EvalOnGPU_Splines()

◆ EvalOnGPU_TF1()

◆ EvalOnGPU_TotWeight()

◆ SynchroniseSplines()