MaCh3  2.4.2
Reference Guide
gpuSplineUtils.cu
// MaCh3 includes

#define _N_SPLINES_ NSplines_GPU

// KS: Forgive me father, for I have sinned.
#if defined(__CUDA_ARCH__)
  #if __CUDA_ARCH__ >= 1200
    #pragma message("Compiling with CUDA Architecture: 12.x")
  #elif __CUDA_ARCH__ >= 1100
    #pragma message("Compiling with CUDA Architecture: 11.x")
  #elif __CUDA_ARCH__ >= 1000
    #pragma message("Compiling with CUDA Architecture: 10.x")
  #elif __CUDA_ARCH__ >= 900
    #pragma message("Compiling with CUDA Architecture: 9.x")
  #elif __CUDA_ARCH__ >= 800
    #pragma message("Compiling with CUDA Architecture: 8.x")
  #elif __CUDA_ARCH__ >= 750
    #pragma message("Compiling with CUDA Architecture: 7.5")
  #elif __CUDA_ARCH__ >= 730
    #pragma message("Compiling with CUDA Architecture: 7.3")
  #elif __CUDA_ARCH__ >= 720
    #pragma message("Compiling with CUDA Architecture: 7.2")
  #elif __CUDA_ARCH__ >= 710
    #pragma message("Compiling with CUDA Architecture: 7.1")
  #elif __CUDA_ARCH__ >= 700
    #pragma message("Compiling with CUDA Architecture: 7.x")
  #elif __CUDA_ARCH__ >= 650
    #pragma message("Compiling with CUDA Architecture: 6.5")
  #elif __CUDA_ARCH__ >= 600
    #pragma message("Compiling with CUDA Architecture: 6.x")
  #elif __CUDA_ARCH__ >= 530
    #pragma message("Compiling with CUDA Architecture: 5.3")
  #elif __CUDA_ARCH__ >= 520
    #pragma message("Compiling with CUDA Architecture: 5.2")
  #elif __CUDA_ARCH__ >= 510
    #pragma message("Compiling with CUDA Architecture: 5.1")
  #elif __CUDA_ARCH__ >= 500
    #pragma message("Compiling with CUDA Architecture: 5.x")
  #elif __CUDA_ARCH__ >= 400
    #pragma message("Compiling with CUDA Architecture: 4.x")
  #elif __CUDA_ARCH__ >= 300
    #pragma message("Compiling with CUDA Architecture: 3.x")
  #else
    #pragma message("Compiling with CUDA Architecture: < 3.x")
  #endif
#endif

// ******************************************
// CONSTANTS
// ******************************************

// d_NAME declares DEVICE constants (live on GPU)
/// Number of splines living on GPU
__device__ __constant__ unsigned int d_n_splines;
/// Number of TF1s living on GPU
__device__ __constant__ unsigned int d_n_TF1;
/// Size of splines living on GPU
__device__ __constant__ short int d_spline_size;
#ifndef Weight_On_SplineBySpline_Basis
/// Number of events living on GPU
__device__ __constant__ int d_n_events;
#endif
/// CW: Constant memory needs to be hard-coded at compile time. Could make this texture memory instead
__device__ __constant__ float val_gpu[_N_SPLINES_];
__device__ __constant__ short int segment_gpu[_N_SPLINES_];


// *****************************************
// Make sure all CUDA threads finished execution
__host__ void SynchroniseSplines() {
  cudaDeviceSynchronize();
}

// *******************************************
// INITIALISE GPU
// *******************************************

// Constructor: initialise counters and set all GPU pointers to nullptr
SMonolithGPU::SMonolithGPU() {
  h_n_params = -1;
  h_n_events = -1;

  gpu_weights = nullptr;
  gpu_total_weights = nullptr;
  gpu_nParamPerEvent = nullptr;
  gpu_nPoints_arr = nullptr;
  gpu_paramNo_arr = nullptr;
  gpu_nKnots_arr = nullptr;
  gpu_coeff_x = nullptr;
  gpu_coeff_many = nullptr;
  gpu_coeff_TF1_many = nullptr;
  gpu_paramNo_TF1_arr = nullptr;
  gpu_nParamPerEvent_TF1 = nullptr;
  gpu_weights_tf1 = nullptr;
}

// Destructor
SMonolithGPU::~SMonolithGPU() {
}

// *******************************************
// Initialiser when using the x array and combined y,b,c,d array
__host__ void SMonolithGPU::InitGPU_SplineMonolith(
  #ifndef Weight_On_SplineBySpline_Basis
    float **cpu_total_weights,
    int n_events,
  #endif
    unsigned int total_nknots,
    unsigned int n_splines,
    unsigned int n_tf1,
    int Eve_size) {
// *******************************************
  // Allocate chunks of memory on the GPU
  cudaMalloc((void **) &gpu_paramNo_arr, n_splines*sizeof(short int));
  CudaCheckError();

  cudaMalloc((void **) &gpu_nKnots_arr, n_splines*sizeof(unsigned int));
  CudaCheckError();

  cudaMalloc((void **) &gpu_coeff_x, Eve_size*sizeof(float));
  CudaCheckError();

  cudaMalloc((void **) &gpu_coeff_many, _nCoeff_*total_nknots*sizeof(float));
  CudaCheckError();

  // Allocate memory for the array of weights to be returned to the CPU
  cudaMalloc((void **) &gpu_weights, n_splines*sizeof(float));
  CudaCheckError();

  // Now the TF1-specific arrays
  cudaMalloc((void **) &gpu_coeff_TF1_many, _nTF1Coeff_*n_tf1*sizeof(float));
  CudaCheckError();

  cudaMalloc((void **) &gpu_weights_tf1, n_tf1*sizeof(float));
  CudaCheckError();

  cudaMalloc((void **) &gpu_paramNo_TF1_arr, n_tf1*sizeof(short int));
  CudaCheckError();

#ifndef Weight_On_SplineBySpline_Basis
  //KS: Rather than allocating memory in the standard way, this handy CUDA call pins the host memory, which makes host<->device transfers faster
  cudaMallocHost((void **) cpu_total_weights, n_events*sizeof(float));
  CudaCheckError();

  //KS: Allocate memory for the array of total weights to be returned to the CPU
  cudaMalloc((void **) &gpu_total_weights, n_events*sizeof(float));
  CudaCheckError();

  //KS: Allocate memory for the map keeping track of how many splines each event has
  cudaMalloc((void **) &gpu_nParamPerEvent, 2*n_events*sizeof(unsigned int));
  CudaCheckError();

  //KS: Allocate memory for the map keeping track of how many TF1s each event has
  cudaMalloc((void **) &gpu_nParamPerEvent_TF1, 2*n_events*sizeof(unsigned int));
  CudaCheckError();
#endif

  // Print allocation info for the user
  printf("Allocated %u entries for paramNo and nKnots arrays, size = %f MB\n",
         n_splines, static_cast<double>(sizeof(short int) * n_splines + sizeof(unsigned int) * n_splines) / 1.0e6);
  printf("Allocated %i entries for x coeff arrays, size = %f MB\n",
         Eve_size, static_cast<double>(sizeof(float) * Eve_size) / 1.0e6);
  printf("Allocated %u entries for {ybcd} coeff arrays, size = %f MB\n",
         _nCoeff_ * total_nknots, static_cast<double>(sizeof(float) * _nCoeff_ * total_nknots) / 1.0e6);
  printf("Allocated %u entries for TF1 coefficient arrays, size = %f MB\n",
         _nTF1Coeff_ * n_tf1, static_cast<double>(sizeof(float) * _nTF1Coeff_ * n_tf1) / 1.0e6);

  //KS: Ask CUDA about memory usage
  checkGpuMem();
  PrintNdevices();
}

// *******************************************
// Allocate memory for spline segments
__host__ void SMonolithGPU::InitGPU_Segments(short int **segment) {
// *******************************************
  //KS: Rather than allocating memory in the standard way, this handy CUDA call pins the host memory, which makes host<->device transfers faster
  cudaMallocHost((void **) segment, _N_SPLINES_*sizeof(short int));
  CudaCheckError();
}

// *******************************************
// Allocate memory for parameter values
__host__ void SMonolithGPU::InitGPU_Vals(float **vals) {
// *******************************************
  //KS: Rather than allocating memory in the standard way, this handy CUDA call pins the host memory, which makes host<->device transfers faster
  cudaMallocHost((void **) vals, _N_SPLINES_*sizeof(float));
  CudaCheckError();
}


// ******************************************************
// START COPY TO GPU
// ******************************************************

// ******************************************************
// Copy to GPU for x array and separate ybcd array
__host__ void SMonolithGPU::CopyToGPU_SplineMonolith(
  SplineMonoStruct* cpu_spline_handler,

  // TF1 related now
  std::vector<float> cpu_many_array_TF1,
  std::vector<short int> cpu_paramNo_arr_TF1,
  #ifndef Weight_On_SplineBySpline_Basis
  int n_events,
  std::vector<unsigned int> cpu_nParamPerEvent,
  // TF1 related now
  std::vector<unsigned int> cpu_nParamPerEvent_TF1,
  #endif
  int n_params,
  unsigned int n_splines,
  short int spline_size,
  unsigned int total_nknots,
  unsigned int n_tf1) {
// ******************************************************
  if (n_params != _N_SPLINES_) {
    printf("Number of spline parameters not equal to %i, GPU code for event-by-event splines will fail\n", _N_SPLINES_);
    printf("n_params = %i\n", n_params);
    printf("%s : %i\n", __FILE__, __LINE__);
    throw;
  }

  // Write to the global statics (h_* denotes a host-stored variable)
  h_n_params = n_params;
#ifndef Weight_On_SplineBySpline_Basis
  h_n_events = n_events;
#endif
  // Copy the constants
  // Total number of valid splines for all loaded events
  cudaMemcpyToSymbol(d_n_splines, &n_splines, sizeof(n_splines));
  CudaCheckError();

  // Total number of valid TF1s for all loaded events
  cudaMemcpyToSymbol(d_n_TF1, &n_tf1, sizeof(n_tf1));
  CudaCheckError();

  // Total spline size per spline, i.e. just the number of points or knots in the spline
  cudaMemcpyToSymbol(d_spline_size, &spline_size, sizeof(spline_size));
  CudaCheckError();
#ifndef Weight_On_SplineBySpline_Basis
  // Number of events
  cudaMemcpyToSymbol(d_n_events, &h_n_events, sizeof(h_n_events));
  CudaCheckError();
#endif
  // Copy the coefficient arrays to the GPU; this only happens once per entire Markov Chain so it is OK to do several extensive memory copies
  cudaMemcpy(gpu_coeff_many, cpu_spline_handler->coeff_many.data(), sizeof(float)*total_nknots*_nCoeff_, cudaMemcpyHostToDevice);
  CudaCheckError();

  cudaMemcpy(gpu_coeff_x, cpu_spline_handler->coeff_x.data(), sizeof(float)*spline_size*n_params, cudaMemcpyHostToDevice);
  CudaCheckError();

  //KS: Bind our texture to the GPU variable
  //KS: Also tried moving gpu_many_array to texture memory; it only worked with a restricted number of MC runs, most likely hit the texture memory limit :(
  struct cudaResourceDesc resDesc_coeff_x;
  memset(&resDesc_coeff_x, 0, sizeof(resDesc_coeff_x));
  resDesc_coeff_x.resType = cudaResourceTypeLinear;
  resDesc_coeff_x.res.linear.devPtr = gpu_coeff_x;
  resDesc_coeff_x.res.linear.desc = cudaCreateChannelDesc<float>();
  resDesc_coeff_x.res.linear.sizeInBytes = sizeof(float)*spline_size*n_params;

  // Specify texture object parameters
  struct cudaTextureDesc texDesc_coeff_x;
  memset(&texDesc_coeff_x, 0, sizeof(texDesc_coeff_x));
  texDesc_coeff_x.readMode = cudaReadModeElementType;

  // Create texture object
  cudaCreateTextureObject(&text_coeff_x, &resDesc_coeff_x, &texDesc_coeff_x, nullptr);
  CudaCheckError();

  // Also copy the parameter number for each spline onto the GPU; i.e. which spline parameter we are calculating right now
  cudaMemcpy(gpu_paramNo_arr, cpu_spline_handler->paramNo_arr.data(), n_splines*sizeof(short int), cudaMemcpyHostToDevice);
  CudaCheckError();

  // Also copy the knot map for each spline onto the GPU
  cudaMemcpy(gpu_nKnots_arr, cpu_spline_handler->nKnots_arr.data(), n_splines*sizeof(unsigned int), cudaMemcpyHostToDevice);
  CudaCheckError();

  // Now TF1
  // Copy the coefficient arrays to the GPU; this only happens once per entire Markov Chain so it is OK to do several extensive memory copies
  cudaMemcpy(gpu_coeff_TF1_many, cpu_many_array_TF1.data(), sizeof(float)*n_tf1*_nTF1Coeff_, cudaMemcpyHostToDevice);
  CudaCheckError();

  // Also copy the parameter number for each TF1 onto the GPU; i.e. which TF1 parameter we are calculating right now
  cudaMemcpy(gpu_paramNo_TF1_arr, cpu_paramNo_arr_TF1.data(), n_tf1*sizeof(short int), cudaMemcpyHostToDevice);
  CudaCheckError();

  #ifndef Weight_On_SplineBySpline_Basis
  //KS: Keep track of how many splines each event has
  cudaMemcpy(gpu_nParamPerEvent, cpu_nParamPerEvent.data(), 2*n_events*sizeof(unsigned int), cudaMemcpyHostToDevice);
  CudaCheckError();

  //KS: Bind our texture to the GPU variable
  // Create a resource descriptor based on the device pointer
  struct cudaResourceDesc resDesc_nParamPerEvent;
  memset(&resDesc_nParamPerEvent, 0, sizeof(resDesc_nParamPerEvent));
  resDesc_nParamPerEvent.resType = cudaResourceTypeLinear;
  resDesc_nParamPerEvent.res.linear.devPtr = gpu_nParamPerEvent;
  resDesc_nParamPerEvent.res.linear.desc = cudaCreateChannelDesc<unsigned int>();
  resDesc_nParamPerEvent.res.linear.sizeInBytes = 2*n_events*sizeof(unsigned int);

  // Specify texture object parameters
  struct cudaTextureDesc texDesc_nParamPerEvent;
  memset(&texDesc_nParamPerEvent, 0, sizeof(texDesc_nParamPerEvent));
  texDesc_nParamPerEvent.readMode = cudaReadModeElementType;

  // Finally create the texture object
  cudaCreateTextureObject(&text_nParamPerEvent, &resDesc_nParamPerEvent, &texDesc_nParamPerEvent, nullptr);
  CudaCheckError();

  // Now TF1
  cudaMemcpy(gpu_nParamPerEvent_TF1, cpu_nParamPerEvent_TF1.data(), 2*n_events*sizeof(unsigned int), cudaMemcpyHostToDevice);
  CudaCheckError();

  //KS: Bind our texture to the GPU variable
  // Create a resource descriptor based on the device pointer
  struct cudaResourceDesc resDesc_nParamPerEvent_tf1;
  memset(&resDesc_nParamPerEvent_tf1, 0, sizeof(resDesc_nParamPerEvent_tf1));
  resDesc_nParamPerEvent_tf1.resType = cudaResourceTypeLinear;
  resDesc_nParamPerEvent_tf1.res.linear.devPtr = gpu_nParamPerEvent_TF1;
  resDesc_nParamPerEvent_tf1.res.linear.desc = cudaCreateChannelDesc<unsigned int>();
  resDesc_nParamPerEvent_tf1.res.linear.sizeInBytes = 2*n_events*sizeof(unsigned int);

  // Specify texture object parameters
  struct cudaTextureDesc texDesc_nParamPerEvent_tf1;
  memset(&texDesc_nParamPerEvent_tf1, 0, sizeof(texDesc_nParamPerEvent_tf1));
  texDesc_nParamPerEvent_tf1.readMode = cudaReadModeElementType;

  // Finally create the texture object
  cudaCreateTextureObject(&text_nParamPerEvent_TF1, &resDesc_nParamPerEvent_tf1, &texDesc_nParamPerEvent_tf1, nullptr);
  CudaCheckError();
  #endif
}

// ********************************************************
// START GPU KERNELS
//*********************************************************
// All the GPU kernels have similar tasks but different implementations
// Essentially they use the TSpline3 segment that contains the current parameter variation (found by a binary search on the CPU)
// Once we know the segment, we simply pull out the pre-computed coefficients for that spline point and multiply them together to get a weight

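// An illustrative sketch of that CPU-side segment search (an assumption added for clarity, not
// the MaCh3 implementation; the real search lives in the CPU spline code, e.g. FindSplineSegment).
// It returns the index of the last knot whose x value does not exceed the proposed parameter value,
// which is exactly the kind of index that ends up in segment_gpu[Param].
inline short int FindSegmentSketch(const float val, const float* __restrict__ xKnots, const short int nKnots) {
  // Segment i covers [xKnots[i], xKnots[i+1]); clamp the answer to the valid segment range
  short int lo = 0;
  short int hi = static_cast<short int>(nKnots - 2);
  while (lo < hi) {
    const short int mid = static_cast<short int>((lo + hi + 1) / 2);
    if (val >= xKnots[mid]) lo = mid;            // knot at mid is still at or below the variation
    else hi = static_cast<short int>(mid - 1);
  }
  return lo;
}
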
//*********************************************************
// Evaluate the spline on the GPU using one {y,b,c,d} array and one {x} array
// Should be most efficient at cache hitting and memory coalescence
// But using spline segments rather than the parameter value: avoids doing the binary search on the GPU
__global__ void EvalOnGPU_Splines(
  const short int* __restrict__ gpu_paramNo_arr,
  const unsigned int* __restrict__ gpu_nKnots_arr,
  const float* __restrict__ gpu_coeff_many,
  float* __restrict__ gpu_weights,
  const cudaTextureObject_t __restrict__ text_coeff_x) {
//*********************************************************
  // One thread per spline; splineNum is the global thread index
  const unsigned int splineNum = (blockIdx.x * blockDim.x + threadIdx.x);

  // This is the stopping condition!
  if (splineNum < d_n_splines) {
    // This is the segment we want for this parameter variation
    // for this particular splineNum; 0 = MACCQE, 1 = pFC, 2 = EBC, etc

    //CW: Which parameter we are accessing
    const short int Param = gpu_paramNo_arr[splineNum];

    //CW: Avoids doing a costly binary search on the GPU
    const short int segment = segment_gpu[Param];

    //KS: The segment for coeff_x is simply parameter*max knots + segment, as each parameter has the same spacing
    const short int segment_X = Param*d_spline_size+segment;

    //KS: Find the knot position in our monolithic structure
    const unsigned int CurrentKnotPos = gpu_nKnots_arr[splineNum]*_nCoeff_+segment*_nCoeff_;

    // We've read the segment straight from the CPU and it is saved in segment_gpu
    // Polynomial parameters from the monolithic splineMonolith
    const float fY = gpu_coeff_many[CurrentKnotPos];
    const float fB = gpu_coeff_many[CurrentKnotPos + 1];
    const float fC = gpu_coeff_many[CurrentKnotPos + 2];
    const float fD = gpu_coeff_many[CurrentKnotPos + 3];
    // This is the variation itself (needed to evaluate variation - stored spline point = dx)
    const float dx = val_gpu[Param] - tex1Dfetch<float>(text_coeff_x, segment_X);

    //CW: Wooow, let's use some fancy intrinsics and pull down the processing time by <1% from normal multiplication! HURRAY
    gpu_weights[splineNum] = fmaf(dx, fmaf(dx, fmaf(dx, fD, fC), fB), fY);
    // Or for the more "easy to read" version:
    //gpu_weights[splineNum] = (fY+dx*(fB+dx*(fC+dx*fD)));

    //#ifdef DEBUG
    //printf("splineNum = %i/%i, paramNo = %i, variation = %f, segment = %i, fX = %f, fX+1 = %f, dx = %f, d_n_splines = %i, d_spline_size = %i, weight = %f \n", splineNum, d_n_splines, gpu_paramNo_arr[splineNum], val_gpu[Param], segment, tex1Dfetch<float>(text_coeff_x, segment_X), tex1Dfetch<float>(text_coeff_x, segment_X+1), dx, d_n_splines, d_spline_size, gpu_weights[splineNum]);
    //#endif
  }
}

//*********************************************************
// Evaluate the TF1 on the GPU using up to a 5th order polynomial
// (currently only the linear form a*x + b is evaluated; the higher orders are commented out)
__global__ void EvalOnGPU_TF1(
  const float* __restrict__ gpu_coeffs_tf1,
  const short int* __restrict__ gpu_paramNo_arr_tf1,
  float* __restrict__ gpu_weights_tf1) {
//*********************************************************
  // One thread per TF1; tf1Num is the global thread index
  const unsigned int tf1Num = (blockIdx.x * blockDim.x + threadIdx.x);

  if (tf1Num < d_n_TF1) {
    // This is the variation itself (needed to evaluate variation - stored spline point = dx)
    const float x = val_gpu[gpu_paramNo_arr_tf1[tf1Num]];

    // Read the coefficients
    const unsigned int TF1_Index = tf1Num * _nTF1Coeff_;
    const float a = gpu_coeffs_tf1[TF1_Index];
    const float b = gpu_coeffs_tf1[TF1_Index+1];

    gpu_weights_tf1[tf1Num] = fmaf(a, x, b);

    // gpu_weights_tf1[tf1Num] = a*x + b;
    //gpu_weights_tf1[tf1Num] = 1 + a*x + b*x*x + c*x*x*x + d*x*x*x*x + e*x*x*x*x*x;
  }
}
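
// If all _nTF1Coeff_ coefficients were in play, the same fused multiply-add trick generalises
// to the full polynomial via Horner's scheme. This is only an illustrative sketch (it is not
// called anywhere in MaCh3) and it assumes the coefficients are stored from the constant term
// upwards, which may differ from the actual coefficient layout.
__device__ inline float EvalPolyHornerSketch(const float* __restrict__ coeffs, const int nCoeff, const float x) {
  // Fold from the highest order downwards: w = (((c_{n-1}*x + c_{n-2})*x + ...)*x + c_0)
  float w = coeffs[nCoeff - 1];
  for (int i = nCoeff - 2; i >= 0; --i) {
    w = fmaf(w, x, coeffs[i]);
  }
  return w;
}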

#ifndef Weight_On_SplineBySpline_Basis
//*********************************************************
// KS: Evaluate the total spline event weight on the GPU; in most cases the GPU is faster, and on top of that it significantly reduces the memory transfer from GPU to CPU
__global__ void EvalOnGPU_TotWeight(
  const float* __restrict__ gpu_weights,
  const float* __restrict__ gpu_weights_tf1,

  float* __restrict__ gpu_total_weights,

  const cudaTextureObject_t __restrict__ text_nParamPerEvent,
  const cudaTextureObject_t __restrict__ text_nParamPerEvent_TF1) {
//*********************************************************
  const unsigned int EventNum = (blockIdx.x * blockDim.x + threadIdx.x);

  if(EventNum < d_n_events) //stopping condition
  {
    float local_total_weight = 1.f;

    // Each event stores two entries: {number of parameters, offset of its first weight}
    const unsigned int EventOffset = 2 * EventNum;

    for (unsigned int id = 0; id < tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset); ++id) {
      local_total_weight *= gpu_weights[tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset+1) + id];
    }

    for (unsigned int id = 0; id < tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset); ++id) {
      local_total_weight *= gpu_weights_tf1[tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset+1) + id];
    }
    gpu_total_weights[EventNum] = local_total_weight;
  }
}
#endif

// *****************************************
// Run the GPU code for the separate arrays, i.e. the separate {x} and {y,b,c,d} arrays
// Pass the segments and the parameter values
// (the binary search has already been performed in samplePDFND::FindSplineSegment())
__host__ void SMonolithGPU::RunGPU_SplineMonolith(
#ifdef Weight_On_SplineBySpline_Basis
  float* cpu_weights,
  float* cpu_weights_tf1,
#else
  float* cpu_total_weights,
#endif
  // Holds the changes in parameters
  float *vals,
  // Holds the segments for parameters
  short int *segment,
  const unsigned int h_n_splines,
  const unsigned int h_n_tf1) {
// *****************************************
  dim3 block_size;
  dim3 grid_size;

  block_size.x = _BlockSize_;
  grid_size.x = (h_n_splines / block_size.x) + 1;

  // Copy the segment values to the GPU (segment_gpu), which is h_n_params long
  cudaMemcpyToSymbol(segment_gpu, segment, h_n_params*sizeof(short int));
  CudaCheckError();

  // Copy the parameter values to the GPU (val_gpu), which is h_n_params long
  cudaMemcpyToSymbol(val_gpu, vals, h_n_params*sizeof(float));
  CudaCheckError();

  // KS: Consider an asynchronous kernel launch; it might help since EvalOnGPU_Splines and EvalOnGPU_TF1 are independent
  // Set the cache config to prefer L1 for the kernel
  //cudaFuncSetCacheConfig(EvalOnGPU_Splines, cudaFuncCachePreferL1);
  EvalOnGPU_Splines<<<grid_size, block_size>>>(
    gpu_paramNo_arr,
    gpu_nKnots_arr,
    gpu_coeff_many,
    gpu_weights,
    text_coeff_x
  );
  CudaCheckError();

  grid_size.x = (h_n_tf1 / block_size.x) + 1;
  EvalOnGPU_TF1<<<grid_size, block_size>>>(
    gpu_coeff_TF1_many,
    gpu_paramNo_TF1_arr,
    gpu_weights_tf1
  );
  CudaCheckError();

//KS: We can either copy gpu_weights back and calculate the total weight in the reweighting loop, or skip the copy and calculate the total weight on the GPU, which means less memory transfer
#ifdef Weight_On_SplineBySpline_Basis
  // Here we have to make a somewhat large GPU->CPU transfer because it's all the splines' response
  cudaMemcpy(cpu_weights, gpu_weights, h_n_splines*sizeof(float), cudaMemcpyDeviceToHost);
  CudaCheckError();

  cudaMemcpy(cpu_weights_tf1, gpu_weights_tf1, h_n_tf1*sizeof(float), cudaMemcpyDeviceToHost);
  CudaCheckError();

//KS: Else calculate the total weight
#else
  grid_size.x = (h_n_events / block_size.x) + 1;

  EvalOnGPU_TotWeight<<<grid_size, block_size>>>(
    gpu_weights,
    gpu_weights_tf1,

    gpu_total_weights,

    text_nParamPerEvent,
    text_nParamPerEvent_TF1
  );
  CudaCheckError();

  //KS: Here we have to make a somewhat large GPU->CPU transfer because it is proportional to the number of events
  //KS: Normally the code would wait for the memory transfer to finish before moving on; cudaMemcpyAsync means we continue executing code while the copy is still in flight
  cudaMemcpyAsync(cpu_total_weights, gpu_total_weights, h_n_events * sizeof(float), cudaMemcpyDeviceToHost, 0);
  CudaCheckError();
#endif

  #ifdef DEBUG
  printf("Copied GPU total weights to CPU with SUCCESS (drink more tea)\n");
  printf("Released calculated response from GPU with SUCCESS (drink most tea)\n");
  #endif
}
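
// Illustrative usage note (an assumption about the calling code, not part of this file):
// because the total-weight copy above is a cudaMemcpyAsync on the default stream, the caller
// has to synchronise before reading cpu_total_weights, e.g.
//
//   smonolith.RunGPU_SplineMonolith(cpu_total_weights, vals, segment, n_splines, n_tf1);
//   SynchroniseSplines();                  // cudaDeviceSynchronize(): kernels and the async copy are done
//   const float w0 = cpu_total_weights[0]; // only now safe to read on the host
//
// Object and variable names here (smonolith, n_splines, n_tf1, ...) are hypothetical.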

// *********************************
// CLEANING
// *********************************

// *********************************
// Clean up the {x},{ybcd} arrays
__host__ void SMonolithGPU::CleanupGPU_SplineMonolith(
  #ifndef Weight_On_SplineBySpline_Basis
  float *cpu_total_weights
  #endif
){
// *********************************
  cudaFree(gpu_paramNo_arr);
  cudaFree(gpu_nKnots_arr);

  // Free the coefficient arrays
  cudaDestroyTextureObject(text_coeff_x);

  cudaFree(gpu_coeff_x);
  cudaFree(gpu_coeff_many);

  cudaFree(gpu_coeff_TF1_many);
  cudaFree(gpu_paramNo_TF1_arr);
  // Free the weights on the GPU
  cudaFree(gpu_weights);
  cudaFree(gpu_weights_tf1);
#ifndef Weight_On_SplineBySpline_Basis
  cudaFree(gpu_total_weights);
  //KS: Before removing the variables, destroy the textures
  cudaDestroyTextureObject(text_nParamPerEvent);
  cudaDestroyTextureObject(text_nParamPerEvent_TF1);

  cudaFree(gpu_nParamPerEvent);
  cudaFree(gpu_nParamPerEvent_TF1);
  cudaFreeHost(cpu_total_weights);
  cpu_total_weights = nullptr;
#endif
}

// *******************************************
// Clean up the pinned variables on the CPU
__host__ void SMonolithGPU::CleanupGPU_Segments(short int *segment, float *vals) {
// *******************************************
  cudaFreeHost(segment);
  cudaFreeHost(vals);

  segment = nullptr;
  vals = nullptr;
}
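
// Illustrative end-to-end usage sketch (an assumption about the calling code, not part of this
// file), for the event-weight build (Weight_On_SplineBySpline_Basis not defined): allocate once,
// copy the monolith once, then re-run the kernels every time the proposed parameter values change.
// All sizes, the SplineMonoStruct contents and the variable names below are hypothetical.
//
//   SMonolithGPU gpu;
//   float* cpu_total_weights = nullptr;
//   float* vals = nullptr;
//   short int* segment = nullptr;
//
//   gpu.InitGPU_Segments(&segment);
//   gpu.InitGPU_Vals(&vals);
//   gpu.InitGPU_SplineMonolith(&cpu_total_weights, n_events, total_nknots, n_splines, n_tf1, eve_size);
//   gpu.CopyToGPU_SplineMonolith(cpu_spline_handler, tf1_coeffs, tf1_param_numbers,
//                                n_events, n_param_per_event, n_param_per_event_tf1,
//                                n_params, n_splines, spline_size, total_nknots, n_tf1);
//
//   for (int step = 0; step < n_steps; ++step) {
//     // fill vals[] and segment[] for the proposed step (segment via the CPU-side binary search)
//     gpu.RunGPU_SplineMonolith(cpu_total_weights, vals, segment, n_splines, n_tf1);
//     SynchroniseSplines();  // wait for the kernels and the async total-weight copy
//     // ... use cpu_total_weights[event] in the likelihood ...
//   }
//
//   gpu.CleanupGPU_SplineMonolith(cpu_total_weights);
//   gpu.CleanupGPU_Segments(segment, vals);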