MaCh3  2.5.0
Reference Guide
gpuSplineUtils.cu
Go to the documentation of this file.
1 //MaCh3 included
3 
4 // KS: Forgive me father, for I have sinned.
5 #if defined(__CUDA_ARCH__)
6  #if __CUDA_ARCH__ >= 1200
7  #pragma message("Compiling with CUDA Architecture: 12.x")
8  #elif __CUDA_ARCH__ >= 1100
9  #pragma message("Compiling with CUDA Architecture: 11.x")
10  #elif __CUDA_ARCH__ >= 1000
11  #pragma message("Compiling with CUDA Architecture: 10.x")
12  #elif __CUDA_ARCH__ >= 900
13  #pragma message("Compiling with CUDA Architecture: 9.x")
14  #elif __CUDA_ARCH__ >= 800
15  #pragma message("Compiling with CUDA Architecture: 8.x")
16  #elif __CUDA_ARCH__ >= 750
17  #pragma message("Compiling with CUDA Architecture: 7.5")
18  #elif __CUDA_ARCH__ >= 730
19  #pragma message("Compiling with CUDA Architecture: 7.3")
20  #elif __CUDA_ARCH__ >= 720
21  #pragma message("Compiling with CUDA Architecture: 7.2")
22  #elif __CUDA_ARCH__ >= 710
23  #pragma message("Compiling with CUDA Architecture: 7.1")
24  #elif __CUDA_ARCH__ >= 700
25  #pragma message("Compiling with CUDA Architecture: 7.x")
26  #elif __CUDA_ARCH__ >= 650
27  #pragma message("Compiling with CUDA Architecture: 6.5")
28  #elif __CUDA_ARCH__ >= 600
29  #pragma message("Compiling with CUDA Architecture: 6.x")
30  #elif __CUDA_ARCH__ >= 530
31  #pragma message("Compiling with CUDA Architecture: 5.3")
32  #elif __CUDA_ARCH__ >= 520
33  #pragma message("Compiling with CUDA Architecture: 5.2")
34  #elif __CUDA_ARCH__ >= 510
35  #pragma message("Compiling with CUDA Architecture: 5.1")
36  #elif __CUDA_ARCH__ >= 500
37  #pragma message("Compiling with CUDA Architecture: 5.x")
38  #elif __CUDA_ARCH__ >= 400
39  #pragma message("Compiling with CUDA Architecture: 4.x")
40  #elif __CUDA_ARCH__ >= 300
41  #pragma message("Compiling with CUDA Architecture: 3.x")
42  #else
43  #pragma message("Compiling with CUDA Architecture: < 3.x")
44  #endif
45 #endif
46 
47 
48 // *****************************************
49 // Make sure all Cuda threads finished execution
__host__ void SynchroniseSplines() {
  // Block the host until all queued GPU work (kernels and async copies) has finished
  cudaDeviceSynchronize();
  // Surface any asynchronous kernel/copy error that completed with this sync;
  // every other CUDA call in this file is followed by the same check.
  CudaCheckError();
}
53 
54 // *******************************************
55 // INITIALISE GPU
56 // *******************************************
57 
 // Constructor body (signature line is garbled in this extract): reset all
 // host-side bookkeeping counters and null every device pointer so that a
 // later destructor call is safe even if InitGPU_SplineMonolith never ran.
 cpu_n_params = -1;
 cpu_n_events = -1;
 // Nothing loaded yet: zero TF1s, zero splines, zero knots per spline
 cpu_n_TF1 = 0;
 cpu_n_splines = 0;
 cpu_spline_size = 0;

 // Device pointers are only allocated later in InitGPU_SplineMonolith
 gpu_weights = nullptr;
 gpu_total_weights = nullptr;
 gpu_nParamPerEvent = nullptr;
 gpu_nPoints_arr = nullptr;
 gpu_paramNo_arr = nullptr;
 gpu_nKnots_arr = nullptr;
 gpu_coeff_x = nullptr;
 gpu_coeff_many = nullptr;
 // TF1-specific device arrays
 gpu_coeff_TF1_many = nullptr;
 gpu_paramNo_TF1_arr = nullptr;
 gpu_nParamPerEvent_TF1 = nullptr;
 gpu_weights_tf1 = nullptr;
}
79 
80 // *******************************************
82 // *******************************************
 // Destructor body (signature line is garbled in this extract): free every
 // device allocation made in InitGPU_SplineMonolith / CopyToGPU_SplineMonolith.
 cudaFree(gpu_paramNo_arr);
 cudaFree(gpu_nKnots_arr);

 // free the coefficient arrays
 // (destroy the texture view before freeing its backing device memory)
 cudaDestroyTextureObject(text_coeff_x);
 cudaFree(gpu_coeff_x);
 cudaFree(gpu_coeff_many);

 // per-step inputs: parameter values and pre-computed spline segments
 cudaFree(gpu_par_val);
 cudaFree(gpu_spline_segment);

 // TF1 coefficients and their parameter-number map
 cudaFree(gpu_coeff_TF1_many);
 cudaFree(gpu_paramNo_TF1_arr);
 // free weights on the gpu
 cudaFree(gpu_weights);
 cudaFree(gpu_weights_tf1);
 cudaFree(gpu_total_weights);
 //KS: Before removing variable let's destroy texture
 cudaDestroyTextureObject(text_nParamPerEvent);
 cudaDestroyTextureObject(text_nParamPerEvent_TF1);

 cudaFree(gpu_nParamPerEvent);
 cudaFree(gpu_nParamPerEvent_TF1);
}
107 
108 // *******************************************
109 // Initialiser when using the x array and combined y,b,c,d array
 M3::float_t **cpu_total_weights,
 int n_events,
 unsigned int total_nknots,
 unsigned int n_splines,
 unsigned int n_tf1,
 int Eve_size) {
// *******************************************
 // Allocate chunks of memory to GPU
 // One parameter-number entry per spline
 cudaMalloc((void **) &gpu_paramNo_arr, n_splines*sizeof(short int));
 CudaCheckError();

 // One knot-offset entry per spline
 cudaMalloc((void **) &gpu_nKnots_arr, n_splines*sizeof(unsigned int));
 CudaCheckError();

 // X knot positions; Eve_size is presumably spline_size * n_params — TODO confirm against caller
 cudaMalloc((void **) &gpu_coeff_x, Eve_size*sizeof(float));
 CudaCheckError();

 // {y,b,c,d} coefficients: _nCoeff_ floats per knot
 cudaMalloc((void **) &gpu_coeff_many, _nCoeff_*total_nknots*sizeof(float));
 CudaCheckError();

 // Allocate memory for the array of weights to be returned to CPU
 cudaMalloc((void **) &gpu_weights, n_splines*sizeof(float));
 CudaCheckError();

 // Now TF1 specific
 cudaMalloc((void **) &gpu_coeff_TF1_many, _nTF1Coeff_*n_tf1*sizeof(float));
 CudaCheckError();

 // Per-TF1 weight output
 cudaMalloc((void **) &gpu_weights_tf1, n_tf1*sizeof(float));
 CudaCheckError();

 // Which parameter each TF1 responds to
 cudaMalloc((void **) &gpu_paramNo_TF1_arr, n_tf1*sizeof(short int));
 CudaCheckError();

 //KS: Rather than allocate memory in standard way this fancy cuda tool allows to pin host memory which make memory transfer faster
 cudaMallocHost((void **) cpu_total_weights, n_events*sizeof(M3::float_t));
 CudaCheckError();

 //KS: Allocate memory for the array of total weights to be returned to CPU
 cudaMalloc((void **) &gpu_total_weights, n_events*sizeof(M3::float_t));
 CudaCheckError();

 //KS: Allocate memory for the map keeping track how many splines each parameter has
 // (two unsigned ints per event — see EvalOnGPU_TotWeight for how they are read)
 cudaMalloc((void **) &gpu_nParamPerEvent, 2*n_events*sizeof(unsigned int));
 CudaCheckError();

 //KS: Allocate memory for the map keeping track how many TF1 each parameter has
 cudaMalloc((void **) &gpu_nParamPerEvent_TF1, 2*n_events*sizeof(unsigned int));
 CudaCheckError();

 // Print allocation info to user
 // NOTE(review): %i with unsigned arguments relies on the counts fitting in a signed int
 printf("Allocated %i entries for paramNo and nKnots arrays, size = %f MB\n",
 n_splines, static_cast<double>(sizeof(short int) * n_splines + sizeof(unsigned int) * n_splines) / 1.0e6);
 printf("Allocated %i entries for x coeff arrays, size = %f MB\n",
 Eve_size, static_cast<double>(sizeof(float) * Eve_size) / 1.0e6);
 printf("Allocated %i entries for {ybcd} coeff arrays, size = %f MB\n",
 _nCoeff_ * total_nknots, static_cast<double>(sizeof(float) * _nCoeff_ * total_nknots) / 1.0e6);
 printf("Allocated %i entries for TF1 coefficient arrays, size = %f MB\n",
 _nTF1Coeff_ * n_tf1, static_cast<double>(sizeof(float) * _nTF1Coeff_ * n_tf1) / 1.0e6);

 //KS: Ask CUDA about memory usage
 checkGpuMem();
 PrintNdevices();
}
175 
176 // *******************************************
177 // Allocate memory for spline segments
__host__ void SplineMonolithGPU::InitGPU_Segments(short int **segment) {
// *******************************************
  //KS: Pin (page-lock) the host-side segment buffer via cudaMallocHost so that
  // the per-step host-to-device transfers of the segments run faster.
  const size_t nBytes = sizeof(short int) * cpu_n_params;
  cudaMallocHost((void **) segment, nBytes);
  CudaCheckError();
}
184 
185 // *******************************************
186 // Allocate memory for spline segments
__host__ void SplineMonolithGPU::InitGPU_Vals(float **vals) {
// *******************************************
  //KS: Pin (page-lock) the host-side parameter-value buffer via cudaMallocHost
  // so that the per-step host-to-device transfers of the values run faster.
  const size_t nBytes = sizeof(float) * cpu_n_params;
  cudaMallocHost((void **) vals, nBytes);
  CudaCheckError();
}
193 
194 // ******************************************************
195 // START COPY TO GPU
196 // ******************************************************
197 
198 // ******************************************************
199 // Copy to GPU for x array and separate ybcd array
 const SplineMonoStruct* cpu_spline_handler,

 // TFI related now
 const std::vector<float>& cpu_many_array_TF1,
 const std::vector<short int>& cpu_paramNo_arr_TF1,
 const int n_events,
 const std::vector<unsigned int>& cpu_nParamPerEvent,
 // TFI related now
 const std::vector<unsigned int>& cpu_nParamPerEvent_TF1,

 const int n_params,
 const unsigned int n_splines,
 const short int spline_size,
 const unsigned int total_nknots,
 const unsigned int n_tf1) {
// ******************************************************
 // Write to the global statics (h_* denotes host stored variable)
 cpu_n_params = n_params;
 // Number of events
 cpu_n_events = n_events;
 // Total number of valid TF1 for all loaded events
 cpu_n_TF1 = n_tf1;
 // Total number of valid splines for all loaded events
 cpu_n_splines = n_splines;
 // Max number of knots per spline
 cpu_spline_size = spline_size;

 //CW: Allocate memory for the frequently copied objects
 // NOTE(review): unlike every other CUDA call in this file, these two mallocs
 // are not followed by CudaCheckError() — consider adding it.
 cudaMalloc(&gpu_par_val, n_params * sizeof(float));
 cudaMalloc(&gpu_spline_segment, n_params * sizeof(short int));

 // Copy the coefficient arrays to the GPU; this only happens once per entire Markov Chain so is OK to do multiple extensive memory copies
 cudaMemcpy(gpu_coeff_many, cpu_spline_handler->coeff_many.data(), sizeof(float)*total_nknots*_nCoeff_, cudaMemcpyHostToDevice);
 CudaCheckError();

 // X knot positions: spline_size entries per parameter
 cudaMemcpy(gpu_coeff_x, cpu_spline_handler->coeff_x.data(), sizeof(float)*spline_size*n_params, cudaMemcpyHostToDevice);
 CudaCheckError();

 //KS: Bind our texture with the GPU variable
 //KS: Tried also moving gpu_many_array to texture memory it only worked with restricted number of MC runs, most likely hit texture memory limit :(
 struct cudaResourceDesc resDesc_coeff_x;
 memset(&resDesc_coeff_x, 0, sizeof(resDesc_coeff_x));
 resDesc_coeff_x.resType = cudaResourceTypeLinear;
 resDesc_coeff_x.res.linear.devPtr = gpu_coeff_x;
 resDesc_coeff_x.res.linear.desc = cudaCreateChannelDesc<float>();
 resDesc_coeff_x.res.linear.sizeInBytes = sizeof(float)*spline_size*n_params;

 // Specify texture object parameters
 struct cudaTextureDesc texDesc_coeff_x;
 memset(&texDesc_coeff_x, 0, sizeof(texDesc_coeff_x));
 texDesc_coeff_x.readMode = cudaReadModeElementType;

 // Create texture object
 cudaCreateTextureObject(&text_coeff_x, &resDesc_coeff_x, &texDesc_coeff_x, nullptr);
 CudaCheckError();

 // Also copy the parameter number for each spline onto the GPU; i.e. what spline parameter are we calculating right now
 cudaMemcpy(gpu_paramNo_arr, cpu_spline_handler->paramNo_arr.data(), n_splines*sizeof(short int), cudaMemcpyHostToDevice);
 CudaCheckError();

 // Also copy the knot map for each spline onto the GPU;
 cudaMemcpy(gpu_nKnots_arr, cpu_spline_handler->nKnots_arr.data(), n_splines*sizeof(unsigned int), cudaMemcpyHostToDevice);
 CudaCheckError();

 //Now TF1
 // Copy the coefficient arrays to the GPU; this only happens once per entire Markov Chain so is OK to do multiple extensive memory copies
 cudaMemcpy(gpu_coeff_TF1_many, cpu_many_array_TF1.data(), sizeof(float)*n_tf1*_nTF1Coeff_, cudaMemcpyHostToDevice);
 CudaCheckError();

 // Also copy the parameter number for each TF1 onto the GPU; i.e. what TF1 parameter are we calculating right now
 cudaMemcpy(gpu_paramNo_TF1_arr, cpu_paramNo_arr_TF1.data(), n_tf1*sizeof(short int), cudaMemcpyHostToDevice);
 CudaCheckError();

 //KS: Keep track how much splines each event has
 cudaMemcpy(gpu_nParamPerEvent, cpu_nParamPerEvent.data(), 2*n_events*sizeof(unsigned int), cudaMemcpyHostToDevice);
 CudaCheckError();

 //KS: Bind our texture with the GPU variable
 // create a resource descriptor based on device pointers
 struct cudaResourceDesc resDesc_nParamPerEvent;
 memset(&resDesc_nParamPerEvent, 0, sizeof(resDesc_nParamPerEvent));
 resDesc_nParamPerEvent.resType = cudaResourceTypeLinear;
 resDesc_nParamPerEvent.res.linear.devPtr = gpu_nParamPerEvent;
 resDesc_nParamPerEvent.res.linear.desc = cudaCreateChannelDesc<unsigned int>();
 resDesc_nParamPerEvent.res.linear.sizeInBytes = 2*n_events*sizeof(unsigned int);

 // Specify texture object parameters
 struct cudaTextureDesc texDesc_nParamPerEvent;
 memset(&texDesc_nParamPerEvent, 0, sizeof(texDesc_nParamPerEvent));
 texDesc_nParamPerEvent.readMode = cudaReadModeElementType;

 //Finally create texture object
 cudaCreateTextureObject(&text_nParamPerEvent, &resDesc_nParamPerEvent, &texDesc_nParamPerEvent, nullptr);
 CudaCheckError();

 // Now TF1: same per-event {count, offset} map, but for TF1 objects
 cudaMemcpy(gpu_nParamPerEvent_TF1, cpu_nParamPerEvent_TF1.data(), 2*n_events*sizeof(unsigned int), cudaMemcpyHostToDevice);
 CudaCheckError();

 //KS: Bind our texture with the GPU variable
 // create a resource descriptor based on device pointers
 struct cudaResourceDesc resDesc_nParamPerEvent_tf1;
 memset(&resDesc_nParamPerEvent_tf1, 0, sizeof(resDesc_nParamPerEvent_tf1));
 resDesc_nParamPerEvent_tf1.resType = cudaResourceTypeLinear;
 resDesc_nParamPerEvent_tf1.res.linear.devPtr = gpu_nParamPerEvent_TF1;
 resDesc_nParamPerEvent_tf1.res.linear.desc = cudaCreateChannelDesc<unsigned int>();
 resDesc_nParamPerEvent_tf1.res.linear.sizeInBytes = 2*n_events*sizeof(unsigned int);

 // Specify texture object parameters
 struct cudaTextureDesc texDesc_nParamPerEvent_tf1;
 memset(&texDesc_nParamPerEvent_tf1, 0, sizeof(texDesc_nParamPerEvent_tf1));
 texDesc_nParamPerEvent_tf1.readMode = cudaReadModeElementType;

 //Finally create texture object
 cudaCreateTextureObject(&text_nParamPerEvent_TF1, &resDesc_nParamPerEvent_tf1, &texDesc_nParamPerEvent_tf1, nullptr);
 CudaCheckError();
}
318 
319 // ********************************************************
320 // START GPU KERNELS
321 //*********************************************************
322 // All the GPU kernels have similar tasks but different implementations
323 // Essentially they perform a binary search to find which TSpline3 point is nearest to our parameter variation
324 // Once it knows this, we simply extract the pre-computed coefficients for that spline point and multiply together to get a weight
325 
326 //*********************************************************
327 // Evaluate the spline on the GPU Using one {y,b,c,d} array and one {x} array
328 // Should be most efficient at cache hitting and memory coalescence
329 // But using spline segments rather than the parameter value: avoids doing binary search on GPU
__global__ void EvalOnGPU_Splines(
 const unsigned int gpu_n_splines,
 const short int gpu_spline_size,
 const short int* __restrict__ gpu_paramNo_arr,
 const unsigned int* __restrict__ gpu_nKnots_arr,
 const float* __restrict__ gpu_coeff_many,
 const float* __restrict__ gpu_par_val,
 const short int* __restrict__ gpu_spline_segment,
 float* __restrict__ gpu_weights,
 const cudaTextureObject_t __restrict__ text_coeff_x) {
//*********************************************************
 // One thread per spline: flat 1D global index
 const unsigned int splineNum = (blockIdx.x * blockDim.x + threadIdx.x);

 // this is the stopping condition! (grid rarely divides the spline count evenly)
 if (splineNum < gpu_n_splines) {
  //CW: Which Parameter we are accessing
  const short int Param = gpu_paramNo_arr[splineNum];

  //CW: Segment found on the CPU — avoids doing costly binary search on GPU
  const short int segment = gpu_spline_segment[Param];

  //KS: Segment for coeff_x is simply parameter*max knots + segment as each parameter has the same spacing.
  // BUGFIX: keep this in int — the old short int truncated the product once
  // n_params * spline_size exceeded 32767 (coeff_x holds spline_size*n_params
  // entries, which can easily grow past that), yielding garbage knot positions.
  const int segment_X = Param*gpu_spline_size+segment;

  //KS: Find knot position in our monolithic structure ({y,b,c,d} per knot)
  const unsigned int CurrentKnotPos = gpu_nKnots_arr[splineNum]*_nCoeff_+segment*_nCoeff_;

  // We've read the segment straight from CPU and is saved in gpu_spline_segment
  // polynomial parameters from the monolithic splineMonolith
  const float fY = gpu_coeff_many[CurrentKnotPos];
  const float fB = gpu_coeff_many[CurrentKnotPos + 1];
  const float fC = gpu_coeff_many[CurrentKnotPos + 2];
  const float fD = gpu_coeff_many[CurrentKnotPos + 3];
  // The is the variation itself (needed to evaluate variation - stored spline point = dx)
  const float dx = gpu_par_val[Param] - tex1Dfetch<float>(text_coeff_x, segment_X);

  //CW: Horner evaluation with fused multiply-adds:
  // gpu_weights[splineNum] = fY + dx*(fB + dx*(fC + dx*fD))
  gpu_weights[splineNum] = fmaf(dx, fmaf(dx, fmaf(dx, fD, fC), fB), fY);
 }
}
380 
381 //*********************************************************
382 // Evaluate the TF1 on the GPU Using 5th order polynomial
383 __global__ void EvalOnGPU_TF1(
384  const unsigned int gpu_n_TF1,
385  const float* __restrict__ gpu_coeffs_tf1,
386  const short int* __restrict__ gpu_paramNo_arr_tf1,
387  const float* __restrict__ gpu_par_val,
388  float* __restrict__ gpu_weights_tf1) {
389 //*********************************************************
390  // points per spline is the offset to skip in the index to move between splines
391  const unsigned int tf1Num = (blockIdx.x * blockDim.x + threadIdx.x);
392 
393  if (tf1Num < gpu_n_TF1) {
394  // The is the variation itself (needed to evaluate variation - stored spline point = dx)
395  const float x = gpu_par_val[gpu_paramNo_arr_tf1[tf1Num]];
396 
397  // Read the coefficients
398  const unsigned int TF1_Index = tf1Num * _nTF1Coeff_;
399  const float a = gpu_coeffs_tf1[TF1_Index];
400  const float b = gpu_coeffs_tf1[TF1_Index+1];
401 
402  gpu_weights_tf1[tf1Num] = fmaf(a, x, b);
403  // gpu_weights_tf1[tf1Num] = a*x + b;
404  // gpu_weights_tf1[tf1Num] = 1 + a*x + b*x*x + c*x*x*x + d*x*x*x*x + e*x*x*x*x*x;
405  }
406 }
407 
408 //*********************************************************
409 // KS: Evaluate the total spline event weight on the GPU, as in most cases GPU is faster, even more this significant reduce memory transfer from GPU to CPU
__global__ void EvalOnGPU_TotWeight(
 const int gpu_n_events,
 const float* __restrict__ gpu_weights,
 const float* __restrict__ gpu_weights_tf1,

 M3::float_t* __restrict__ gpu_total_weights,

 const cudaTextureObject_t __restrict__ text_nParamPerEvent,
 const cudaTextureObject_t __restrict__ text_nParamPerEvent_TF1) {
//*********************************************************
 // One thread per event: flat 1D global index
 const unsigned int EventNum = (blockIdx.x * blockDim.x + threadIdx.x);

 if(EventNum < gpu_n_events) //stopping condition
 {
  float local_total_weight = 1.f;

  // Each event stores two entries in the map texture: {count, offset of first weight}
  const unsigned int EventOffset = 2 * EventNum;

  // Hoist the loop-invariant texture reads: the old code re-fetched the loop
  // bound from the texture on every iteration (and the offset inside the body).
  const unsigned int nSplines = tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset);
  const unsigned int SplineOffset = tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset+1);
  for (unsigned int id = 0; id < nSplines; ++id) {
   local_total_weight *= gpu_weights[SplineOffset + id];
  }

  // Same for the TF1 weights belonging to this event
  const unsigned int nTF1 = tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset);
  const unsigned int TF1Offset = tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset+1);
  for (unsigned int id = 0; id < nTF1; ++id) {
   local_total_weight *= gpu_weights_tf1[TF1Offset + id];
  }
  // Store the product (M3::float_t may be double, hence the cast)
  gpu_total_weights[EventNum] = static_cast<M3::float_t>(local_total_weight);
 }
}
438 
439 // *****************************************
440 // Run the GPU code for the separate many arrays. As in separate {x}, {y,b,c,d} arrays
441 // Pass the segment and the parameter values
442 // (binary search already performed in SplineBase::FindSplineSegment()
 M3::float_t* cpu_total_weights,
 // Holds the changes in parameters
 float *vals,
 // Holds the segments for parameters
 short int *segment) {
// *****************************************
 dim3 block_size;
 dim3 grid_size;

 block_size.x = _BlockSize_;
 // Ceil-style grid sizing (note: launches one extra block when evenly divisible)
 grid_size.x = (cpu_n_splines / block_size.x) + 1;

 // Copy the segment values to the GPU (segment_gpu), which is cpu_n_params long
 cudaMemcpy(gpu_spline_segment, segment, cpu_n_params * sizeof(short int), cudaMemcpyHostToDevice);
 CudaCheckError();

 // Copy the parameter values values to the GPU (vals_gpu), which is cpu_n_params long
 cudaMemcpy(gpu_par_val, vals, cpu_n_params * sizeof(float), cudaMemcpyHostToDevice);
 CudaCheckError();

 // KS: Consider asynchronous kernel call, this might help EvalOnGPU_Splines and EvalOnGPU_TF1 are independent
 // Set the cache config to prefer L1 for the kernel
 //cudaFuncSetCacheConfig(EvalOnGPU_Splines, cudaFuncCachePreferL1);
 // NOTE(review): the launch argument lists below appear truncated in this
 // extract — compare against the upstream gpuSplineUtils.cu before editing.
 EvalOnGPU_Splines<<<grid_size, block_size>>>(
 gpu_par_val,

 gpu_weights,
 );
 CudaCheckError();

 // Re-size the grid for the (typically smaller) TF1 workload
 grid_size.x = (cpu_n_TF1 / block_size.x) + 1;
 EvalOnGPU_TF1<<<grid_size, block_size>>>(
 cpu_n_TF1,
 gpu_par_val,
 );
 CudaCheckError();

 // Finally one thread per event to combine spline and TF1 weights
 grid_size.x = (cpu_n_events / block_size.x) + 1;

 EvalOnGPU_TotWeight<<<grid_size, block_size>>>(
 cpu_n_events,
 gpu_weights,

 );
 CudaCheckError();

 //KS: Here we have to make a somewhat large GPU->CPU transfer because it is proportional to number of events
 //KS: Normally code wait for memory transfer to finish before moving further cudaMemcpyAsync means we will continue to execute code and in a meantime keep copying stuff.
 cudaMemcpyAsync(cpu_total_weights, gpu_total_weights, cpu_n_events * sizeof(M3::float_t), cudaMemcpyDeviceToHost, 0);
 CudaCheckError();

 #ifdef MACH3_DEBUG
 printf("Copied GPU total weights to CPU with SUCCESS (drink more tea)\n");
 printf("Released calculated response from GPU with SUCCESS (drink most tea)\n");
 #endif
}
515 
516 // *******************************************
__host__ void SplineMonolithGPU::CleanupPinnedMemory(M3::float_t *cpu_total_weights,
                                                     short int *segment, float *vals) {
// *******************************************
  // Release the pinned (page-locked) host buffers that were allocated with
  // cudaMallocHost in InitGPU_SplineMonolith / InitGPU_Segments / InitGPU_Vals.
  cudaFreeHost(cpu_total_weights);
  cudaFreeHost(segment);
  cudaFreeHost(vals);
  CudaCheckError();
  // Note: the pointers are passed by value, so the old `= nullptr` assignments
  // here only cleared local copies (dead stores) and have been removed.
  // Callers must not reuse these pointers after this call.
}
constexpr int _nCoeff_
KS: We store coefficients {y,b,c,d} in one array one by one, this is only to define it once rather th...
Definition: SplineCommon.h:18
constexpr int _nTF1Coeff_
KS: For TF1 we store at most 5 coefficients, we could make it more flexible but for now define it her...
Definition: SplineCommon.h:20
float * gpu_weights
GPU arrays to hold weight for each spline.
unsigned int * gpu_nParamPerEvent
KS: GPU map keeping track how many parameters applies to each event, we keep two numbers here {number...
__host__ void RunGPU_SplineMonolith(M3::float_t *cpu_total_weights, float *vals, short int *segment)
Run the GPU code for the separate many arrays. As in separate {x}, {y,b,c,d} arrays Pass the segment ...
unsigned int cpu_n_TF1
Number of tf1 living on CPU.
virtual ~SplineMonolithGPU()
This function deallocates the resources allocated for the separate {x} and {ybcd} arrays in the and T...
unsigned int * gpu_nParamPerEvent_TF1
KS: GPU map keeping track how many parameters applies to each event, we keep two numbers here {number...
float * gpu_weights_tf1
GPU arrays to hold weight for each TF1.
unsigned int * gpu_nKnots_arr
KS: GPU Number of knots per spline.
float * gpu_par_val
CW: parameter value on GPU.
float * gpu_coeff_TF1_many
GPU arrays to hold TF1 coefficients.
__host__ void InitGPU_Vals(float **vals)
Allocate memory for spline segments.
cudaTextureObject_t text_nParamPerEvent_TF1
KS: Map keeping track how many parameters applies to each event, we keep two numbers here {number of ...
float * gpu_coeff_many
GPU arrays to hold other coefficients.
__host__ void CleanupPinnedMemory(M3::float_t *cpu_total_weights, short int *segment, float *vals)
Clean up pinned variables at CPU.
short int * gpu_paramNo_TF1_arr
CW: GPU array with the number of points per TF1 object.
__host__ void InitGPU_SplineMonolith(M3::float_t **cpu_total_weights, int n_events, unsigned int total_nknots, unsigned int n_splines, unsigned int n_tf1, int Eve_size)
Allocate memory on gpu for spline monolith.
cudaTextureObject_t text_coeff_x
KS: Textures are L1 cache variables which are well optimised for fetching. Make texture only for vari...
int cpu_n_params
Number of params living on CPU.
__host__ void CopyToGPU_SplineMonolith(const SplineMonoStruct *cpu_spline_handler, const std::vector< float > &cpu_many_array_TF1, const std::vector< short int > &cpu_paramNo_arr_TF1, const int n_events, const std::vector< unsigned int > &cpu_nParamPerEvent, const std::vector< unsigned int > &cpu_nParamPerEvent_TF1, const int n_params, const unsigned int n_splines, const short int spline_size, const unsigned int total_nknots, const unsigned int n_tf1)
Copies data from CPU to GPU for the spline monolith.
__host__ void InitGPU_Segments(short int **segment)
Allocate memory for spline segments.
int cpu_n_events
Number of events living on CPU.
short int * gpu_nPoints_arr
GPU arrays to hold number of points.
unsigned int cpu_n_splines
Number of splines living on CPU.
SplineMonolithGPU()
constructor
short int * gpu_paramNo_arr
CW: GPU array with the number of points per spline (not per spline point!)
float * gpu_coeff_x
KS: GPU arrays to hold X coefficient.
cudaTextureObject_t text_nParamPerEvent
KS: Map keeping track how many parameters applies to each event, we keep two numbers here {number of ...
short int cpu_spline_size
Size of splines living on CPU.
M3::float_t * gpu_total_weights
GPU arrays to hold weight for event.
short int * gpu_spline_segment
CW: Spline segment on GPU.
__global__ void EvalOnGPU_TF1(const unsigned int gpu_n_TF1, const float *__restrict__ gpu_coeffs_tf1, const short int *__restrict__ gpu_paramNo_arr_tf1, const float *__restrict__ gpu_par_val, float *__restrict__ gpu_weights_tf1)
Evaluate the TF1 on the GPU Using first order polynomial Polynomial form: w(x) = a0 + a1·x.
__host__ void SynchroniseSplines()
Make sure all Cuda threads finished execution.
__global__ void EvalOnGPU_TotWeight(const int gpu_n_events, const float *__restrict__ gpu_weights, const float *__restrict__ gpu_weights_tf1, M3::float_t *__restrict__ gpu_total_weights, const cudaTextureObject_t __restrict__ text_nParamPerEvent, const cudaTextureObject_t __restrict__ text_nParamPerEvent_TF1)
KS: Evaluate the total spline event weight on the GPU, as in most cases GPU is faster,...
__global__ void EvalOnGPU_Splines(const unsigned int gpu_n_splines, const short int gpu_spline_size, const short int *__restrict__ gpu_paramNo_arr, const unsigned int *__restrict__ gpu_nKnots_arr, const float *__restrict__ gpu_coeff_many, const float *__restrict__ gpu_par_val, const short int *__restrict__ gpu_spline_segment, float *__restrict__ gpu_weights, const cudaTextureObject_t __restrict__ text_coeff_x)
Evaluate the spline on the GPU Using one {y,b,c,d} array and one {x} array Should be most efficient a...
MaCh3 event-by-event cross-section spline code.
void checkGpuMem()
KS: Get some fancy info about VRAM usage.
Definition: gpuUtils.cu:43
void PrintNdevices()
KS: Get some fancy info about GPU.
Definition: gpuUtils.cu:58
#define CudaCheckError()
Definition: gpuUtils.cuh:21
#define _BlockSize_
KS: Need it for shared memory, there is way to use dynamic shared memory but I am lazy right now.
Definition: gpuUtils.cuh:24
double float_t
Definition: Core.h:37
KS: Struct storing information for spline monolith.
Definition: SplineCommon.h:35
std::vector< unsigned int > nKnots_arr
KS: CPU Number of knots per spline.
Definition: SplineCommon.h:47
std::vector< float > coeff_x
KS: CPU arrays to hold X coefficient.
Definition: SplineCommon.h:41
std::vector< float > coeff_many
CPU arrays to hold other coefficients.
Definition: SplineCommon.h:44
std::vector< short int > paramNo_arr
CW: CPU array with the number of points per spline (not per spline point!)
Definition: SplineCommon.h:50