6#define _N_SPLINES_ NSplines_GPU
9#if defined(__CUDA_ARCH__)
10 #if __CUDA_ARCH__ >= 1200
11 #pragma message("Compiling with CUDA Architecture: 12.x")
12 #elif __CUDA_ARCH__ >= 1100
13 #pragma message("Compiling with CUDA Architecture: 11.x")
14 #elif __CUDA_ARCH__ >= 1000
15 #pragma message("Compiling with CUDA Architecture: 10.x")
16 #elif __CUDA_ARCH__ >= 900
17 #pragma message("Compiling with CUDA Architecture: 9.x")
18 #elif __CUDA_ARCH__ >= 800
19 #pragma message("Compiling with CUDA Architecture: 8.x")
20 #elif __CUDA_ARCH__ >= 750
21 #pragma message("Compiling with CUDA Architecture: 7.5")
22 #elif __CUDA_ARCH__ >= 730
23 #pragma message("Compiling with CUDA Architecture: 7.3")
24 #elif __CUDA_ARCH__ >= 720
25 #pragma message("Compiling with CUDA Architecture: 7.2")
26 #elif __CUDA_ARCH__ >= 710
27 #pragma message("Compiling with CUDA Architecture: 7.1")
28 #elif __CUDA_ARCH__ >= 700
29 #pragma message("Compiling with CUDA Architecture: 7.x")
30 #elif __CUDA_ARCH__ >= 650
31 #pragma message("Compiling with CUDA Architecture: 6.5")
32 #elif __CUDA_ARCH__ >= 600
33 #pragma message("Compiling with CUDA Architecture: 6.x")
34 #elif __CUDA_ARCH__ >= 530
35 #pragma message("Compiling with CUDA Architecture: 5.3")
36 #elif __CUDA_ARCH__ >= 520
37 #pragma message("Compiling with CUDA Architecture: 5.2")
38 #elif __CUDA_ARCH__ >= 510
39 #pragma message("Compiling with CUDA Architecture: 5.1")
40 #elif __CUDA_ARCH__ >= 500
41 #pragma message("Compiling with CUDA Architecture: 5.x")
42 #elif __CUDA_ARCH__ >= 400
43 #pragma message("Compiling with CUDA Architecture: 4.x")
44 #elif __CUDA_ARCH__ >= 300
45 #pragma message("Compiling with CUDA Architecture: 3.x")
47 #pragma message("Compiling with CUDA Architecture: < 3.x")
59__device__ __constant__
unsigned int d_n_TF1;
62#ifndef Weight_On_SplineBySpline_Basis
74 cudaDeviceSynchronize();
107 #ifndef Weight_On_SplineBySpline_Basis
108 float **cpu_total_weights,
111 unsigned int total_nknots,
112 unsigned int n_splines,
120 cudaMalloc((
void **) &
gpu_nKnots_arr, n_splines*
sizeof(
unsigned int));
123 cudaMalloc((
void **) &
gpu_coeff_x, Eve_size*
sizeof(
float));
130 cudaMalloc((
void **) &
gpu_weights, n_splines*
sizeof(
float));
143#ifndef Weight_On_SplineBySpline_Basis
145 cudaMallocHost((
void **) cpu_total_weights, n_events*
sizeof(
float));
162 printf(
"Allocated %i entries for paramNo and nKnots arrays, size = %f MB\n",
163 n_splines,
static_cast<double>(
sizeof(
short int) * n_splines +
sizeof(
unsigned int) * n_splines) / 1.0e6);
164 printf(
"Allocated %i entries for x coeff arrays, size = %f MB\n",
165 Eve_size,
static_cast<double>(
sizeof(
float) * Eve_size) / 1.0e6);
166 printf(
"Allocated %i entries for {ybcd} coeff arrays, size = %f MB\n",
167 _nCoeff_ * total_nknots,
static_cast<double>(
sizeof(
float) *
_nCoeff_ * total_nknots) / 1.0e6);
168 printf(
"Allocated %i entries for TF1 coefficient arrays, size = %f MB\n",
181 cudaMallocHost((
void **) segment,
_N_SPLINES_*
sizeof(
short int));
190 cudaMallocHost((
void **) vals,
_N_SPLINES_*
sizeof(
float));
205 std::vector<float> cpu_many_array_TF1,
206 std::vector<short int> cpu_paramNo_arr_TF1,
207 #ifndef Weight_On_SplineBySpline_Basis
209 std::vector<unsigned int> cpu_nParamPerEvent,
211 std::vector<unsigned int> cpu_nParamPerEvent_TF1,
214 unsigned int n_splines,
215 short int spline_size,
216 unsigned int total_nknots,
217 unsigned int n_tf1) {
220 printf(
"Number of splines not equal to %i, GPU code for event-by-event splines will fail\n",
_N_SPLINES_);
221 printf(
"n_params = %i\n", n_params);
222 printf(
"%s : %i\n", __FILE__, __LINE__);
228#ifndef Weight_On_SplineBySpline_Basis
233 cudaMemcpyToSymbol(
d_n_splines, &n_splines,
sizeof(n_splines));
237 cudaMemcpyToSymbol(
d_n_TF1, &n_tf1,
sizeof(n_tf1));
241 cudaMemcpyToSymbol(
d_spline_size, &spline_size,
sizeof(spline_size));
243#ifndef Weight_On_SplineBySpline_Basis
252 cudaMemcpy(
gpu_coeff_x, cpu_spline_handler->
coeff_x.data(),
sizeof(
float)*spline_size*n_params, cudaMemcpyHostToDevice);
257 struct cudaResourceDesc resDesc_coeff_x;
258 memset(&resDesc_coeff_x, 0,
sizeof(resDesc_coeff_x));
259 resDesc_coeff_x.resType = cudaResourceTypeLinear;
261 resDesc_coeff_x.res.linear.desc = cudaCreateChannelDesc<float>();
262 resDesc_coeff_x.res.linear.sizeInBytes =
sizeof(float)*spline_size*n_params;
265 struct cudaTextureDesc texDesc_coeff_x;
266 memset(&texDesc_coeff_x, 0,
sizeof(texDesc_coeff_x));
267 texDesc_coeff_x.readMode = cudaReadModeElementType;
270 cudaCreateTextureObject(&
text_coeff_x, &resDesc_coeff_x, &texDesc_coeff_x,
nullptr);
278 cudaMemcpy(
gpu_nKnots_arr, cpu_spline_handler->
nKnots_arr.data(), n_splines*
sizeof(
unsigned int), cudaMemcpyHostToDevice);
287 cudaMemcpy(
gpu_paramNo_TF1_arr, cpu_paramNo_arr_TF1.data(), n_tf1*
sizeof(
short int), cudaMemcpyHostToDevice);
290 #ifndef Weight_On_SplineBySpline_Basis
292 cudaMemcpy(
gpu_nParamPerEvent, cpu_nParamPerEvent.data(), 2*n_events*
sizeof(
unsigned int), cudaMemcpyHostToDevice);
297 struct cudaResourceDesc resDesc_nParamPerEvent;
298 memset(&resDesc_nParamPerEvent, 0,
sizeof(resDesc_nParamPerEvent));
299 resDesc_nParamPerEvent.resType = cudaResourceTypeLinear;
301 resDesc_nParamPerEvent.res.linear.desc = cudaCreateChannelDesc<unsigned int>();
302 resDesc_nParamPerEvent.res.linear.sizeInBytes = 2*n_events*
sizeof(
unsigned int);
305 struct cudaTextureDesc texDesc_nParamPerEvent;
306 memset(&texDesc_nParamPerEvent, 0,
sizeof(texDesc_nParamPerEvent));
307 texDesc_nParamPerEvent.readMode = cudaReadModeElementType;
310 cudaCreateTextureObject(&
text_nParamPerEvent, &resDesc_nParamPerEvent, &texDesc_nParamPerEvent,
nullptr);
314 cudaMemcpy(
gpu_nParamPerEvent_TF1, cpu_nParamPerEvent_TF1.data(), 2*n_events*
sizeof(
unsigned int), cudaMemcpyHostToDevice);
319 struct cudaResourceDesc resDesc_nParamPerEvent_tf1;
320 memset(&resDesc_nParamPerEvent_tf1, 0,
sizeof(resDesc_nParamPerEvent_tf1));
321 resDesc_nParamPerEvent_tf1.resType = cudaResourceTypeLinear;
323 resDesc_nParamPerEvent_tf1.res.linear.desc = cudaCreateChannelDesc<unsigned int>();
324 resDesc_nParamPerEvent_tf1.res.linear.sizeInBytes = 2*n_events*
sizeof(
unsigned int);
327 struct cudaTextureDesc texDesc_nParamPerEvent_tf1;
328 memset(&texDesc_nParamPerEvent_tf1, 0,
sizeof(texDesc_nParamPerEvent_tf1));
329 texDesc_nParamPerEvent_tf1.readMode = cudaReadModeElementType;
332 cudaCreateTextureObject(&
text_nParamPerEvent_TF1, &resDesc_nParamPerEvent_tf1, &texDesc_nParamPerEvent_tf1,
nullptr);
349 const short int* __restrict__ gpu_paramNo_arr,
350 const unsigned int* __restrict__ gpu_nKnots_arr,
351 const float* __restrict__ gpu_coeff_many,
352 float* __restrict__ gpu_weights,
353 const cudaTextureObject_t __restrict__ text_coeff_x) {
356 const unsigned int splineNum = (blockIdx.x * blockDim.x + threadIdx.x);
364 const short int Param = gpu_paramNo_arr[splineNum];
373 const unsigned int CurrentKnotPos = gpu_nKnots_arr[splineNum]*
_nCoeff_+segment*
_nCoeff_;
377 const float fY = gpu_coeff_many[CurrentKnotPos];
378 const float fB = gpu_coeff_many[CurrentKnotPos + 1];
379 const float fC = gpu_coeff_many[CurrentKnotPos + 2];
380 const float fD = gpu_coeff_many[CurrentKnotPos + 3];
382 const float dx =
val_gpu[Param] - tex1Dfetch<float>(text_coeff_x, segment_X);
385 gpu_weights[splineNum] = fmaf(dx, fmaf(dx, fmaf(dx, fD, fC), fB), fY);
398 const float* __restrict__ gpu_coeffs_tf1,
399 const short int* __restrict__ gpu_paramNo_arr_tf1,
400 float* __restrict__ gpu_weights_tf1) {
403 const unsigned int tf1Num = (blockIdx.x * blockDim.x + threadIdx.x);
407 const float x =
val_gpu[gpu_paramNo_arr_tf1[tf1Num]];
410 const unsigned int TF1_Index = tf1Num *
_nTF1Coeff_;
411 const float a = gpu_coeffs_tf1[TF1_Index];
412 const float b = gpu_coeffs_tf1[TF1_Index+1];
414 gpu_weights_tf1[tf1Num] = fmaf(a, x, b);
421#ifndef Weight_On_SplineBySpline_Basis
425 const float* __restrict__ gpu_weights,
426 const float* __restrict__ gpu_weights_tf1,
428 float* __restrict__ gpu_total_weights,
430 const cudaTextureObject_t __restrict__ text_nParamPerEvent,
431 const cudaTextureObject_t __restrict__ text_nParamPerEvent_TF1) {
433 const unsigned int EventNum = (blockIdx.x * blockDim.x + threadIdx.x);
436 __shared__
float shared_total_weights[
_BlockSize_];
439 shared_total_weights[threadIdx.x] = 1.f;
441 const unsigned int EventOffset = 2 * EventNum;
443 for (
unsigned int id = 0; id < tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset); ++id) {
444 shared_total_weights[threadIdx.x] *= gpu_weights[tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset+1) + id];
447 for (
unsigned int id = 0; id < tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset); ++id) {
448 shared_total_weights[threadIdx.x] *= gpu_weights_tf1[tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset+1) + id];
450 gpu_total_weights[EventNum] = shared_total_weights[threadIdx.x];
460#ifdef Weight_On_SplineBySpline_Basis
462 float* cpu_weights_tf1,
464 float* cpu_total_weights,
470 const unsigned int h_n_splines,
471 const unsigned int h_n_tf1) {
477 grid_size.x = (h_n_splines / block_size.x) + 1;
490 EvalOnGPU_Splines<<<grid_size, block_size>>>(
501 grid_size.x = (h_n_tf1 / block_size.x) + 1;
502 EvalOnGPU_TF1<<<grid_size, block_size>>>(
511#ifdef Weight_On_SplineBySpline_Basis
513 cudaMemcpy(cpu_weights,
gpu_weights, h_n_splines*
sizeof(
float), cudaMemcpyDeviceToHost);
516 cudaMemcpy(cpu_weights_tf1,
gpu_weights_tf1, h_n_tf1*
sizeof(
float), cudaMemcpyDeviceToHost);
521 grid_size.x = (
h_n_events / block_size.x) + 1;
523 EvalOnGPU_TotWeight<<<grid_size, block_size>>>(
541 printf(
"Copied GPU total weights to CPU with SUCCESS (drink more tea)\n");
542 printf(
"Released calculated response from GPU with SUCCESS (drink most tea)\n");
553 #ifndef Weight_On_SplineBySpline_Basis
554 float *cpu_total_weights
572#ifndef Weight_On_SplineBySpline_Basis
580 cudaFreeHost(cpu_total_weights);
581 cpu_total_weights =
nullptr;
589 cudaFreeHost(segment);
#define _nCoeff_
KS: We store coefficients {y,b,c,d} in one array one by one, this is only to define it once rather th...
#define _nTF1Coeff_
KS: For TF1 we store at most 5 coefficients, we could make it more flexible but for now define it her...
cudaTextureObject_t text_nParamPerEvent_TF1
KS: Map keeping track how many parameters applies to each event, we keep two numbers here {number of ...
unsigned int * gpu_nParamPerEvent
KS: GPU map keeping track how many parameters applies to each event, we keep two numbers here {number...
__host__ void CleanupGPU_SplineMonolith(float *cpu_total_weights)
This function deallocates the resources allocated for the separate {x} and {ybcd} arrays in the and T...
cudaTextureObject_t text_coeff_x
KS: Textures are L1 cache variables which are well optimised for fetching. Make texture only for vari...
short int * gpu_paramNo_TF1_arr
CW: GPU array with the number of points per TF1 object.
__host__ void InitGPU_Vals(float **vals)
Allocate memory for spline segments.
int h_n_events
Number of events living on CPU.
float * gpu_weights
GPU arrays to hold weight for each spline.
float * gpu_coeff_many
GPU arrays to hold other coefficients.
unsigned int * gpu_nParamPerEvent_TF1
KS: GPU map keeping track how many parameters applies to each event, we keep two numbers here {number...
cudaTextureObject_t text_nParamPerEvent
KS: Map keeping track how many parameters applies to each event, we keep two numbers here {number of ...
__host__ void InitGPU_SplineMonolith(float **cpu_total_weights, int n_events, unsigned int total_nknots, unsigned int n_splines, unsigned int n_tf1, int Eve_size)
Allocate memory on gpu for spline monolith.
virtual ~SMonolithGPU()
destructor
__host__ void RunGPU_SplineMonolith(float *cpu_total_weights, float *vals, short int *segment, const unsigned int h_n_splines, const unsigned int h_n_tf1)
Run the GPU code for the separate many arrays. As in separate {x}, {y,b,c,d} arrays Pass the segment ...
float * gpu_coeff_x
KS: GPU arrays to hold X coefficient.
SMonolithGPU()
constructor
short int * gpu_nPoints_arr
GPU arrays to hold number of points.
float * gpu_weights_tf1
GPU arrays to hold weight for each TF1.
__host__ void InitGPU_Segments(short int **segment)
Allocate memory for spline segments.
unsigned int * gpu_nKnots_arr
KS: GPU Number of knots per spline.
int h_n_params
Number of params living on CPU.
float * gpu_coeff_TF1_many
GPU arrays to hold TF1 coefficients.
short int * gpu_paramNo_arr
CW: GPU array with the number of points per spline (not per spline point!)
__host__ void CopyToGPU_SplineMonolith(SplineMonoStruct *cpu_spline_handler, std::vector< float > cpu_many_array_TF1, std::vector< short int > cpu_paramNo_arr_TF1, int n_events, std::vector< unsigned int > cpu_nParamPerEvent, std::vector< unsigned int > cpu_nParamPerEvent_TF1, int n_params, unsigned int n_splines, short int spline_size, unsigned int total_nknots, unsigned int n_tf1)
Copies data from CPU to GPU for the spline monolith.
float * gpu_total_weights
GPU arrays to hold weight for event.
__host__ void CleanupGPU_Segments(short int *segment, float *vals)
Clean up pinned variables at CPU.
__global__ void EvalOnGPU_TotWeight(const float *__restrict__ gpu_weights, const float *__restrict__ gpu_weights_tf1, float *__restrict__ gpu_total_weights, const cudaTextureObject_t __restrict__ text_nParamPerEvent, const cudaTextureObject_t __restrict__ text_nParamPerEvent_TF1)
KS: Evaluate the total spline event weight on the GPU, as in most cases GPU is faster,...
__global__ void EvalOnGPU_Splines(const short int *__restrict__ gpu_paramNo_arr, const unsigned int *__restrict__ gpu_nKnots_arr, const float *__restrict__ gpu_coeff_many, float *__restrict__ gpu_weights, const cudaTextureObject_t __restrict__ text_coeff_x)
Evaluate the spline on the GPU Using one {y,b,c,d} array and one {x} array Should be most efficient a...
__device__ __constant__ unsigned int d_n_TF1
Number of tf1 living on GPU.
__device__ __constant__ unsigned int d_n_splines
Number of splines living on GPU.
__host__ void SynchroniseSplines()
Make sure all Cuda threads finished execution.
__global__ void EvalOnGPU_TF1(const float *__restrict__ gpu_coeffs_tf1, const short int *__restrict__ gpu_paramNo_arr_tf1, float *__restrict__ gpu_weights_tf1)
Evaluate the TF1 on the GPU Using 5th order polynomial.
__device__ __constant__ short int segment_gpu[NSplines_GPU]
__device__ __constant__ float val_gpu[NSplines_GPU]
CW: Constant memory needs to be hard-coded on compile time. Could make this texture memory instead,...
__device__ __constant__ int d_n_events
Number of events living on GPU.
__device__ __constant__ short int d_spline_size
Size of splines living on GPU.
MaCh3 event-by-event cross-section spline code.
void checkGpuMem()
KS: Get some fancy info about VRAM usage.
void PrintNdevices()
KS: Get some fancy info about GPU.
#define _BlockSize_
KS: Need it for shared memory, there is way to use dynamic shared memory but I am lazy right now.
KS: Struct storing information for spline monolith.
std::vector< unsigned int > nKnots_arr
KS: CPU Number of knots per spline.
std::vector< float > coeff_x
KS: CPU arrays to hold X coefficient.
std::vector< float > coeff_many
CPU arrays to hold other coefficients.
std::vector< short int > paramNo_arr
CW: CPU array with the number of points per spline (not per spline point!)