MaCh3 2.2.1
Reference Guide
gpuSplineUtils.cu
// MaCh3 included
#include "gpuSplineUtils.cuh" // MaCh3 event-by-event cross-section spline code

#define _N_SPLINES_ NSplines_GPU

// KS: Forgive me father, for I have sinned.
#if defined(__CUDA_ARCH__)
  #if __CUDA_ARCH__ >= 1200
    #pragma message("Compiling with CUDA Architecture: 12.x")
  #elif __CUDA_ARCH__ >= 1100
    #pragma message("Compiling with CUDA Architecture: 11.x")
  #elif __CUDA_ARCH__ >= 1000
    #pragma message("Compiling with CUDA Architecture: 10.x")
  #elif __CUDA_ARCH__ >= 900
    #pragma message("Compiling with CUDA Architecture: 9.x")
  #elif __CUDA_ARCH__ >= 800
    #pragma message("Compiling with CUDA Architecture: 8.x")
  #elif __CUDA_ARCH__ >= 750
    #pragma message("Compiling with CUDA Architecture: 7.5")
  #elif __CUDA_ARCH__ >= 730
    #pragma message("Compiling with CUDA Architecture: 7.3")
  #elif __CUDA_ARCH__ >= 720
    #pragma message("Compiling with CUDA Architecture: 7.2")
  #elif __CUDA_ARCH__ >= 710
    #pragma message("Compiling with CUDA Architecture: 7.1")
  #elif __CUDA_ARCH__ >= 700
    #pragma message("Compiling with CUDA Architecture: 7.x")
  #elif __CUDA_ARCH__ >= 650
    #pragma message("Compiling with CUDA Architecture: 6.5")
  #elif __CUDA_ARCH__ >= 600
    #pragma message("Compiling with CUDA Architecture: 6.x")
  #elif __CUDA_ARCH__ >= 530
    #pragma message("Compiling with CUDA Architecture: 5.3")
  #elif __CUDA_ARCH__ >= 520
    #pragma message("Compiling with CUDA Architecture: 5.2")
  #elif __CUDA_ARCH__ >= 510
    #pragma message("Compiling with CUDA Architecture: 5.1")
  #elif __CUDA_ARCH__ >= 500
    #pragma message("Compiling with CUDA Architecture: 5.x")
  #elif __CUDA_ARCH__ >= 400
    #pragma message("Compiling with CUDA Architecture: 4.x")
  #elif __CUDA_ARCH__ >= 300
    #pragma message("Compiling with CUDA Architecture: 3.x")
  #else
    #pragma message("Compiling with CUDA Architecture: < 3.x")
  #endif
#endif
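
// The #pragma message block above reports only the compile-time __CUDA_ARCH__.
// A minimal sketch (illustrative, not part of the original file; the function
// name is hypothetical) of querying the compute capability of the device
// actually present at runtime, using the standard CUDA runtime API:
static void PrintRuntimeComputeCapability() {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0); // query device 0
  printf("Running on compute capability %i.%i\n", prop.major, prop.minor);
}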

// ******************************************
// CONSTANTS
// ******************************************

// d_NAME declares DEVICE constants (live on GPU)
/// Number of splines living on GPU
__device__ __constant__ unsigned int d_n_splines;
/// Number of TF1s living on GPU
__device__ __constant__ unsigned int d_n_TF1;
/// Size of splines living on GPU
__device__ __constant__ short int d_spline_size;
#ifndef Weight_On_SplineBySpline_Basis
/// Number of events living on GPU
__device__ __constant__ int d_n_events;
#endif
/// CW: Constant memory needs to be hard-coded at compile time; could make this texture memory instead
__device__ __constant__ float val_gpu[_N_SPLINES_];
__device__ __constant__ short int segment_gpu[_N_SPLINES_];

// *****************************************
// Make sure all Cuda threads finished execution
__host__ void SynchroniseSplines() {
  cudaDeviceSynchronize();
}

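// A minimal usage sketch (illustrative, not part of the original file): since
// RunGPU_SplineMonolith further down copies the total weights back with
// cudaMemcpyAsync, the host should call SynchroniseSplines() before reading them:
//
//   monolith.RunGPU_SplineMonolith(cpu_total_weights, vals, segment, n_splines, n_tf1);
//   SynchroniseSplines();                  // wait for kernels and the async copy
//   const float w0 = cpu_total_weights[0]; // now safe to read on the host
//
// (monolith, cpu_total_weights, vals, segment, n_splines and n_tf1 are hypothetical host-side names.)
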
// *******************************************
// INITIALISE GPU
// *******************************************

SMonolithGPU::SMonolithGPU() {
  h_n_params = -1;
  h_n_events = -1;

  gpu_weights = nullptr;
  gpu_total_weights = nullptr;
  gpu_nParamPerEvent = nullptr;
  gpu_nPoints_arr = nullptr;
  gpu_paramNo_arr = nullptr;
  gpu_nKnots_arr = nullptr;
  gpu_coeff_x = nullptr;
  gpu_coeff_many = nullptr;
  gpu_coeff_TF1_many = nullptr;
  gpu_paramNo_TF1_arr = nullptr;
  gpu_nParamPerEvent_TF1 = nullptr;
  gpu_weights_tf1 = nullptr;
}

SMonolithGPU::~SMonolithGPU() {

}

// *******************************************
// Initialiser when using the x array and combined y,b,c,d array
__host__ void SMonolithGPU::InitGPU_SplineMonolith(
  #ifndef Weight_On_SplineBySpline_Basis
    float **cpu_total_weights,
    int n_events,
  #endif
    unsigned int total_nknots,
    unsigned int n_splines,
    unsigned int n_tf1,
    int Eve_size) {
// *******************************************
  // Allocate chunks of memory to GPU
  cudaMalloc((void **) &gpu_paramNo_arr, n_splines*sizeof(short int));
  CudaCheckError();

  cudaMalloc((void **) &gpu_nKnots_arr, n_splines*sizeof(unsigned int));
  CudaCheckError();

  cudaMalloc((void **) &gpu_coeff_x, Eve_size*sizeof(float));
  CudaCheckError();

  cudaMalloc((void **) &gpu_coeff_many, _nCoeff_*total_nknots*sizeof(float));
  CudaCheckError();

  // Allocate memory for the array of weights to be returned to CPU
  cudaMalloc((void **) &gpu_weights, n_splines*sizeof(float));
  CudaCheckError();

  // Now TF1 specific
  cudaMalloc((void **) &gpu_coeff_TF1_many, _nTF1Coeff_*n_tf1*sizeof(float));
  CudaCheckError();

  cudaMalloc((void **) &gpu_weights_tf1, n_tf1*sizeof(float));
  CudaCheckError();

  cudaMalloc((void **) &gpu_paramNo_TF1_arr, n_tf1*sizeof(short int));
  CudaCheckError();

#ifndef Weight_On_SplineBySpline_Basis
  //KS: Rather than allocating memory in the standard way, this fancy CUDA call pins host memory, which makes memory transfers faster
  cudaMallocHost((void **) cpu_total_weights, n_events*sizeof(float));
  CudaCheckError();

  //KS: Allocate memory for the array of total weights to be returned to CPU
  cudaMalloc((void **) &gpu_total_weights, n_events*sizeof(float));
  CudaCheckError();

  //KS: Allocate memory for the map keeping track of how many splines each event has
  cudaMalloc((void **) &gpu_nParamPerEvent, 2*n_events*sizeof(unsigned int));
  CudaCheckError();

  //KS: Allocate memory for the map keeping track of how many TF1s each event has
  cudaMalloc((void **) &gpu_nParamPerEvent_TF1, 2*n_events*sizeof(unsigned int));
  CudaCheckError();
#endif

  // Print allocation info to user
  printf("Allocated %u entries for paramNo and nKnots arrays, size = %f MB\n",
         n_splines, static_cast<double>(sizeof(short int) * n_splines + sizeof(unsigned int) * n_splines) / 1.0e6);
  printf("Allocated %i entries for x coeff arrays, size = %f MB\n",
         Eve_size, static_cast<double>(sizeof(float) * Eve_size) / 1.0e6);
  printf("Allocated %u entries for {ybcd} coeff arrays, size = %f MB\n",
         _nCoeff_ * total_nknots, static_cast<double>(sizeof(float) * _nCoeff_ * total_nknots) / 1.0e6);
  printf("Allocated %u entries for TF1 coefficient arrays, size = %f MB\n",
         _nTF1Coeff_ * n_tf1, static_cast<double>(sizeof(float) * _nTF1Coeff_ * n_tf1) / 1.0e6);

  //KS: Ask CUDA about memory usage
  checkGpuMem();
  PrintNdevices();
}

// *******************************************
// Allocate memory for spline segments
__host__ void SMonolithGPU::InitGPU_Segments(short int **segment) {
// *******************************************
  //KS: Rather than allocating memory in the standard way, this fancy CUDA call pins host memory, which makes memory transfers faster
  cudaMallocHost((void **) segment, _N_SPLINES_*sizeof(short int));
  CudaCheckError();
}

// *******************************************
// Allocate memory for parameter values
__host__ void SMonolithGPU::InitGPU_Vals(float **vals) {
// *******************************************
  //KS: Rather than allocating memory in the standard way, this fancy CUDA call pins host memory, which makes memory transfers faster
  cudaMallocHost((void **) vals, _N_SPLINES_*sizeof(float));
  CudaCheckError();
}
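
// Why pinned memory: a cudaMemcpy from pageable host memory is staged through an
// internal page-locked buffer by the driver, whereas cudaMallocHost returns
// page-locked memory that the GPU can DMA to and from directly, and which can
// overlap with kernel execution via cudaMemcpyAsync. A minimal sketch of the
// allocate/use/free cycle (illustrative, not part of the original file; the
// function name is hypothetical):
static void PinnedBufferSketch(const int n) {
  float* host_buf = nullptr;
  cudaMallocHost((void **) &host_buf, n*sizeof(float)); // page-locked allocation
  // ... fill host_buf, issue cudaMemcpyAsync(...), launch kernels ...
  cudaFreeHost(host_buf); // pinned memory must be released with cudaFreeHost
}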


// ******************************************************
// START COPY TO GPU
// ******************************************************

// ******************************************************
// Copy to GPU for x array and separate ybcd array
__host__ void SMonolithGPU::CopyToGPU_SplineMonolith(
    SplineMonoStruct* cpu_spline_handler,

    // TF1 related now
    std::vector<float> cpu_many_array_TF1,
    std::vector<short int> cpu_paramNo_arr_TF1,
    #ifndef Weight_On_SplineBySpline_Basis
    int n_events,
    std::vector<unsigned int> cpu_nParamPerEvent,
    // TF1 related now
    std::vector<unsigned int> cpu_nParamPerEvent_TF1,
    #endif
    int n_params,
    unsigned int n_splines,
    short int spline_size,
    unsigned int total_nknots,
    unsigned int n_tf1) {
// ******************************************************
  if (n_params != _N_SPLINES_) {
    printf("Number of parameters not equal to %i, GPU code for event-by-event splines will fail\n", _N_SPLINES_);
    printf("n_params = %i\n", n_params);
    printf("%s : %i\n", __FILE__, __LINE__);
    throw;
  }

  // Write to the global statics (h_* denotes host stored variable)
  h_n_params = n_params;
#ifndef Weight_On_SplineBySpline_Basis
  h_n_events = n_events;
#endif
  // Copy the constants
  // Total number of valid splines for all loaded events
  cudaMemcpyToSymbol(d_n_splines, &n_splines, sizeof(n_splines));
  CudaCheckError();

  // Total number of valid TF1s for all loaded events
  cudaMemcpyToSymbol(d_n_TF1, &n_tf1, sizeof(n_tf1));
  CudaCheckError();

  // Total spline size per spline; i.e. just the number of points or knots in the spline
  cudaMemcpyToSymbol(d_spline_size, &spline_size, sizeof(spline_size));
  CudaCheckError();
#ifndef Weight_On_SplineBySpline_Basis
  // Number of events
  cudaMemcpyToSymbol(d_n_events, &h_n_events, sizeof(h_n_events));
  CudaCheckError();
#endif
  // Copy the coefficient arrays to the GPU; this only happens once per entire Markov Chain so it is OK to do multiple extensive memory copies
  cudaMemcpy(gpu_coeff_many, cpu_spline_handler->coeff_many.data(), sizeof(float)*total_nknots*_nCoeff_, cudaMemcpyHostToDevice);
  CudaCheckError();

  cudaMemcpy(gpu_coeff_x, cpu_spline_handler->coeff_x.data(), sizeof(float)*spline_size*n_params, cudaMemcpyHostToDevice);
  CudaCheckError();

  //KS: Bind our texture with the GPU variable
  //KS: Tried also moving gpu_coeff_many to texture memory; it only worked with a restricted number of MC runs, most likely hit the texture memory limit :(
  struct cudaResourceDesc resDesc_coeff_x;
  memset(&resDesc_coeff_x, 0, sizeof(resDesc_coeff_x));
  resDesc_coeff_x.resType = cudaResourceTypeLinear;
  resDesc_coeff_x.res.linear.devPtr = gpu_coeff_x;
  resDesc_coeff_x.res.linear.desc = cudaCreateChannelDesc<float>();
  resDesc_coeff_x.res.linear.sizeInBytes = sizeof(float)*spline_size*n_params;

  // Specify texture object parameters
  struct cudaTextureDesc texDesc_coeff_x;
  memset(&texDesc_coeff_x, 0, sizeof(texDesc_coeff_x));
  texDesc_coeff_x.readMode = cudaReadModeElementType;

  // Create texture object
  cudaCreateTextureObject(&text_coeff_x, &resDesc_coeff_x, &texDesc_coeff_x, nullptr);
  CudaCheckError();

  // Also copy the parameter number for each spline onto the GPU; i.e. which spline parameter we are calculating right now
  cudaMemcpy(gpu_paramNo_arr, cpu_spline_handler->paramNo_arr.data(), n_splines*sizeof(short int), cudaMemcpyHostToDevice);
  CudaCheckError();

  // Also copy the knot map for each spline onto the GPU
  cudaMemcpy(gpu_nKnots_arr, cpu_spline_handler->nKnots_arr.data(), n_splines*sizeof(unsigned int), cudaMemcpyHostToDevice);
  CudaCheckError();

  // Now TF1
  // Copy the coefficient arrays to the GPU; this only happens once per entire Markov Chain so it is OK to do multiple extensive memory copies
  cudaMemcpy(gpu_coeff_TF1_many, cpu_many_array_TF1.data(), sizeof(float)*n_tf1*_nTF1Coeff_, cudaMemcpyHostToDevice);
  CudaCheckError();

  // Also copy the parameter number for each TF1 onto the GPU; i.e. which TF1 parameter we are calculating right now
  cudaMemcpy(gpu_paramNo_TF1_arr, cpu_paramNo_arr_TF1.data(), n_tf1*sizeof(short int), cudaMemcpyHostToDevice);
  CudaCheckError();

  #ifndef Weight_On_SplineBySpline_Basis
  //KS: Keep track of how many splines each event has
  cudaMemcpy(gpu_nParamPerEvent, cpu_nParamPerEvent.data(), 2*n_events*sizeof(unsigned int), cudaMemcpyHostToDevice);
  CudaCheckError();

  //KS: Bind our texture with the GPU variable
  // Create a resource descriptor based on device pointers
  struct cudaResourceDesc resDesc_nParamPerEvent;
  memset(&resDesc_nParamPerEvent, 0, sizeof(resDesc_nParamPerEvent));
  resDesc_nParamPerEvent.resType = cudaResourceTypeLinear;
  resDesc_nParamPerEvent.res.linear.devPtr = gpu_nParamPerEvent;
  resDesc_nParamPerEvent.res.linear.desc = cudaCreateChannelDesc<unsigned int>();
  resDesc_nParamPerEvent.res.linear.sizeInBytes = 2*n_events*sizeof(unsigned int);

  // Specify texture object parameters
  struct cudaTextureDesc texDesc_nParamPerEvent;
  memset(&texDesc_nParamPerEvent, 0, sizeof(texDesc_nParamPerEvent));
  texDesc_nParamPerEvent.readMode = cudaReadModeElementType;

  // Finally create texture object
  cudaCreateTextureObject(&text_nParamPerEvent, &resDesc_nParamPerEvent, &texDesc_nParamPerEvent, nullptr);
  CudaCheckError();

  // Now TF1
  cudaMemcpy(gpu_nParamPerEvent_TF1, cpu_nParamPerEvent_TF1.data(), 2*n_events*sizeof(unsigned int), cudaMemcpyHostToDevice);
  CudaCheckError();

  //KS: Bind our texture with the GPU variable
  // Create a resource descriptor based on device pointers
  struct cudaResourceDesc resDesc_nParamPerEvent_tf1;
  memset(&resDesc_nParamPerEvent_tf1, 0, sizeof(resDesc_nParamPerEvent_tf1));
  resDesc_nParamPerEvent_tf1.resType = cudaResourceTypeLinear;
  resDesc_nParamPerEvent_tf1.res.linear.devPtr = gpu_nParamPerEvent_TF1;
  resDesc_nParamPerEvent_tf1.res.linear.desc = cudaCreateChannelDesc<unsigned int>();
  resDesc_nParamPerEvent_tf1.res.linear.sizeInBytes = 2*n_events*sizeof(unsigned int);

  // Specify texture object parameters
  struct cudaTextureDesc texDesc_nParamPerEvent_tf1;
  memset(&texDesc_nParamPerEvent_tf1, 0, sizeof(texDesc_nParamPerEvent_tf1));
  texDesc_nParamPerEvent_tf1.readMode = cudaReadModeElementType;

  // Finally create texture object
  cudaCreateTextureObject(&text_nParamPerEvent_TF1, &resDesc_nParamPerEvent_tf1, &texDesc_nParamPerEvent_tf1, nullptr);
  CudaCheckError();
  #endif
}
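
// The texture-binding pattern above is repeated three times in this file; as a
// minimal sketch (illustrative, not part of the original file; the function
// name is hypothetical), wrapping a linear device buffer in a texture object
// reduces to:
static cudaTextureObject_t MakeLinearTextureSketch(float* dev_ptr, const size_t n_elements) {
  struct cudaResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));
  resDesc.resType = cudaResourceTypeLinear;   // plain 1D device buffer
  resDesc.res.linear.devPtr = dev_ptr;
  resDesc.res.linear.desc = cudaCreateChannelDesc<float>();
  resDesc.res.linear.sizeInBytes = n_elements*sizeof(float);

  struct cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.readMode = cudaReadModeElementType; // return raw values, no filtering

  cudaTextureObject_t tex = 0;
  cudaCreateTextureObject(&tex, &resDesc, &texDesc, nullptr);
  return tex; // read in kernels with tex1Dfetch<float>(tex, index)
}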

// ********************************************************
// START GPU KERNELS
// ********************************************************
// All the GPU kernels have similar tasks but different implementations
// Essentially they perform a binary search to find which TSpline3 point is nearest to our parameter variation
// Once we know this, we simply extract the pre-computed coefficients for that spline point and multiply them together to get a weight

// *********************************************************
// Evaluate the spline on the GPU, using one {y,b,c,d} array and one {x} array
// Should be most efficient at cache hitting and memory coalescence
// Using spline segments rather than the parameter value avoids doing a binary search on the GPU
__global__ void EvalOnGPU_Splines(
  const short int* __restrict__ gpu_paramNo_arr,
  const unsigned int* __restrict__ gpu_nKnots_arr,
  const float* __restrict__ gpu_coeff_many,
  float* __restrict__ gpu_weights,
  const cudaTextureObject_t __restrict__ text_coeff_x) {
// *********************************************************
  // points per spline is the offset to skip in the index to move between splines
  const unsigned int splineNum = (blockIdx.x * blockDim.x + threadIdx.x);

  // this is the stopping condition!
  if (splineNum < d_n_splines) {
    // This is the segment we want for this parameter variation
    // for this particular splineNum; 0 = MACCQE, 1 = pFC, 2 = EBC, etc

    //CW: Which parameter we are accessing
    const short int Param = gpu_paramNo_arr[splineNum];

    //CW: Avoids doing costly binary search on GPU
    const short int segment = segment_gpu[Param];

    //KS: Segment for coeff_x is simply parameter*max knots + segment, as each parameter has the same spacing
    const short int segment_X = Param*d_spline_size+segment;

    //KS: Find the knot position in our monolithic structure
    const unsigned int CurrentKnotPos = gpu_nKnots_arr[splineNum]*_nCoeff_+segment*_nCoeff_;

    // We've read the segment straight from the CPU and it is saved in segment_gpu
    // polynomial parameters from the monolithic splineMonolith
    const float fY = gpu_coeff_many[CurrentKnotPos];
    const float fB = gpu_coeff_many[CurrentKnotPos + 1];
    const float fC = gpu_coeff_many[CurrentKnotPos + 2];
    const float fD = gpu_coeff_many[CurrentKnotPos + 3];
    // This is the variation itself (variation - stored spline point = dx)
    const float dx = val_gpu[Param] - tex1Dfetch<float>(text_coeff_x, segment_X);

    //CW: Wooow, let's use some fancy intrinsics and pull down the processing time by <1% from normal multiplication! HURRAY
    gpu_weights[splineNum] = fmaf(dx, fmaf(dx, fmaf(dx, fD, fC), fB), fY);
    // Or for the more "easy to read" version:
    //gpu_weights[splineNum] = (fY+dx*(fB+dx*(fC+dx*fD)));

    //#ifdef DEBUG
    //printf("splineNum = %i/%i, paramNo = %i, variation = %f, segment = %i, fX = %f, fX+1 = %f, dx = %f, d_n_splines = %i, d_spline_size = %i, weight = %f \n", splineNum, d_n_splines, gpu_paramNo_arr[splineNum], val_gpu[Param], segment, tex1Dfetch<float>(text_coeff_x, segment_X), tex1Dfetch<float>(text_coeff_x, segment_X+1), dx, d_n_splines, d_spline_size, gpu_weights[splineNum]);
    //#endif
  }
}
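
// The fmaf chain above is Horner's rule for the cubic segment polynomial,
// w = fY + dx*(fB + dx*(fC + dx*fD)), with every multiply-add fused into a
// single rounding step. A minimal sketch spelling out the equivalence
// (illustrative, not part of the original file; the function name is hypothetical):
static __host__ __device__ float EvalCubicSegmentSketch(const float fY, const float fB,
                                                        const float fC, const float fD,
                                                        const float dx) {
  // fmaf(a, b, c) computes a*b + c; nesting it three times evaluates the cubic
  return fmaf(dx, fmaf(dx, fmaf(dx, fD, fC), fB), fY); // == fY + dx*(fB + dx*(fC + dx*fD))
}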

// *********************************************************
// Evaluate the TF1 on the GPU: currently a linear function; the full 5th-order polynomial is commented out below
__global__ void EvalOnGPU_TF1(
  const float* __restrict__ gpu_coeffs_tf1,
  const short int* __restrict__ gpu_paramNo_arr_tf1,
  float* __restrict__ gpu_weights_tf1) {
// *********************************************************
  // points per TF1 is the offset to skip in the index to move between TF1s
  const unsigned int tf1Num = (blockIdx.x * blockDim.x + threadIdx.x);

  if (tf1Num < d_n_TF1) {
    // This is the variation itself
    const float x = val_gpu[gpu_paramNo_arr_tf1[tf1Num]];

    // Read the coefficients
    const unsigned int TF1_Index = tf1Num * _nTF1Coeff_;
    const float a = gpu_coeffs_tf1[TF1_Index];
    const float b = gpu_coeffs_tf1[TF1_Index+1];

    gpu_weights_tf1[tf1Num] = fmaf(a, x, b);

    // gpu_weights_tf1[tf1Num] = a*x + b;
    //gpu_weights_tf1[tf1Num] = 1 + a*x + b*x*x + c*x*x*x + d*x*x*x*x + e*x*x*x*x*x;
  }
}
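
// If all _nTF1Coeff_ coefficients were used, the commented-out 5th-order form
// could be evaluated with the same fused multiply-add chain. A minimal sketch
// (illustrative, not part of the original file; the function name is
// hypothetical), matching the 1 + a*x + b*x^2 + c*x^3 + d*x^4 + e*x^5
// convention above:
static __host__ __device__ float EvalTF1Order5Sketch(const float a, const float b, const float c,
                                                     const float d, const float e, const float x) {
  // Horner form: 1 + x*(a + x*(b + x*(c + x*(d + x*e))))
  return fmaf(x, fmaf(x, fmaf(x, fmaf(x, fmaf(x, e, d), c), b), a), 1.0f);
}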

#ifndef Weight_On_SplineBySpline_Basis
// *********************************************************
// KS: Evaluate the total spline event weight on the GPU, as in most cases the GPU is faster; moreover this significantly reduces memory transfer from GPU to CPU
__global__ void EvalOnGPU_TotWeight(
  const float* __restrict__ gpu_weights,
  const float* __restrict__ gpu_weights_tf1,

  float* __restrict__ gpu_total_weights,

  const cudaTextureObject_t __restrict__ text_nParamPerEvent,
  const cudaTextureObject_t __restrict__ text_nParamPerEvent_TF1) {
// *********************************************************
  const unsigned int EventNum = (blockIdx.x * blockDim.x + threadIdx.x);

  //KS: Accessing shared memory is much faster than global memory, hence we accumulate in shared memory and then write once to global memory
  __shared__ float shared_total_weights[_BlockSize_];
  if(EventNum < d_n_events) //stopping condition
  {
    shared_total_weights[threadIdx.x] = 1.f;

    const unsigned int EventOffset = 2 * EventNum;

    for (unsigned int id = 0; id < tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset); ++id) {
      shared_total_weights[threadIdx.x] *= gpu_weights[tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset+1) + id];
    }

    for (unsigned int id = 0; id < tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset); ++id) {
      shared_total_weights[threadIdx.x] *= gpu_weights_tf1[tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset+1) + id];
    }
    gpu_total_weights[EventNum] = shared_total_weights[threadIdx.x];
  }
}
#endif
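
// Launch-configuration note: the grid sizes below are computed as
// (N / block) + 1, which launches one extra (fully idle) block whenever N is an
// exact multiple of the block size; the `if (index < N)` guard inside each
// kernel makes this harmless. A minimal sketch of the equivalent
// ceiling-division idiom (illustrative, not part of the original file):
//
//   grid_size.x = (N + _BlockSize_ - 1) / _BlockSize_; // == ceil(N / _BlockSize_)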

// *****************************************
// Run the GPU code for the separate many arrays, as in separate {x} and {y,b,c,d} arrays
// Pass the segment and the parameter values
// (binary search already performed in samplePDFND::FindSplineSegment())
__host__ void SMonolithGPU::RunGPU_SplineMonolith(
#ifdef Weight_On_SplineBySpline_Basis
  float* cpu_weights,
  float* cpu_weights_tf1,
#else
  float* cpu_total_weights,
#endif
  // Holds the changes in parameters
  float *vals,
  // Holds the segments for parameters
  short int *segment,
  const unsigned int h_n_splines,
  const unsigned int h_n_tf1) {
// *****************************************
  dim3 block_size;
  dim3 grid_size;

  block_size.x = _BlockSize_;
  grid_size.x = (h_n_splines / block_size.x) + 1;

  // Copy the segment values to the GPU (segment_gpu), which is h_n_params long
  cudaMemcpyToSymbol(segment_gpu, segment, h_n_params*sizeof(short int));
  CudaCheckError();

  // Copy the parameter values to the GPU (val_gpu), which is h_n_params long
  cudaMemcpyToSymbol(val_gpu, vals, h_n_params*sizeof(float));
  CudaCheckError();

  // KS: Consider an asynchronous kernel launch; this might help since EvalOnGPU_Splines and EvalOnGPU_TF1 are independent
  // Set the cache config to prefer L1 for the kernel
  //cudaFuncSetCacheConfig(EvalOnGPU_Splines, cudaFuncCachePreferL1);
  EvalOnGPU_Splines<<<grid_size, block_size>>>(
    gpu_paramNo_arr,
    gpu_nKnots_arr,
    gpu_coeff_many,
    gpu_weights,
    text_coeff_x
  );
  CudaCheckError();

  grid_size.x = (h_n_tf1 / block_size.x) + 1;
  EvalOnGPU_TF1<<<grid_size, block_size>>>(
    gpu_coeff_TF1_many,
    gpu_paramNo_TF1_arr,
    gpu_weights_tf1
  );
  CudaCheckError();

//KS: We can either copy gpu_weights back and calculate the total weight in the reweighting loop, or skip the copy and calculate the total weight on the GPU, which means less memory transfer
#ifdef Weight_On_SplineBySpline_Basis
  // Here we have to make a somewhat large GPU->CPU transfer because it's all the splines' response
  cudaMemcpy(cpu_weights, gpu_weights, h_n_splines*sizeof(float), cudaMemcpyDeviceToHost);
  CudaCheckError();

  cudaMemcpy(cpu_weights_tf1, gpu_weights_tf1, h_n_tf1*sizeof(float), cudaMemcpyDeviceToHost);
  CudaCheckError();

//KS: Else calculate the total weight
#else
  grid_size.x = (h_n_events / block_size.x) + 1;

  EvalOnGPU_TotWeight<<<grid_size, block_size>>>(
    gpu_weights,
    gpu_weights_tf1,
    gpu_total_weights,
    text_nParamPerEvent,
    text_nParamPerEvent_TF1
  );
  CudaCheckError();

  //KS: Here we have to make a somewhat large GPU->CPU transfer because it is proportional to the number of events
  //KS: Normally the code waits for a memory transfer to finish before moving on; cudaMemcpyAsync means we continue executing code while the copy proceeds in the background
  cudaMemcpyAsync(cpu_total_weights, gpu_total_weights, h_n_events * sizeof(float), cudaMemcpyDeviceToHost, 0);
  CudaCheckError();
#endif

  #ifdef DEBUG
  printf("Copied GPU total weights to CPU with SUCCESS (drink more tea)\n");
  printf("Released calculated response from GPU with SUCCESS (drink most tea)\n");
  #endif
}
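
// A minimal end-to-end sketch of the host-side call sequence when
// Weight_On_SplineBySpline_Basis is not defined (illustrative, not part of the
// original file; all variable names here are hypothetical):
//
//   SMonolithGPU gpu;
//   gpu.InitGPU_SplineMonolith(&cpu_total_weights, n_events, total_nknots, n_splines, n_tf1, eve_size);
//   gpu.InitGPU_Segments(&segment);
//   gpu.InitGPU_Vals(&vals);
//   gpu.CopyToGPU_SplineMonolith(cpu_spline_handler, tf1_coeffs, tf1_param_numbers,
//                                n_events, n_param_per_event, n_param_per_event_tf1,
//                                n_params, n_splines, spline_size, total_nknots, n_tf1);
//
//   // Every MCMC step: update vals and segment on the host, then
//   gpu.RunGPU_SplineMonolith(cpu_total_weights, vals, segment, n_splines, n_tf1);
//   SynchroniseSplines(); // wait for the asynchronous weight copy
//
//   // Once finished:
//   gpu.CleanupGPU_SplineMonolith(cpu_total_weights);
//   gpu.CleanupGPU_Segments(segment, vals);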

// *********************************
// CLEANING
// *********************************

// *********************************
// Clean up the {x},{ybcd} arrays
__host__ void SMonolithGPU::CleanupGPU_SplineMonolith(
  #ifndef Weight_On_SplineBySpline_Basis
  float *cpu_total_weights
  #endif
){
// *********************************
  cudaFree(gpu_paramNo_arr);
  cudaFree(gpu_nKnots_arr);

  // free the coefficient arrays
  cudaDestroyTextureObject(text_coeff_x);

  cudaFree(gpu_coeff_x);
  cudaFree(gpu_coeff_many);

  cudaFree(gpu_coeff_TF1_many);
  cudaFree(gpu_paramNo_TF1_arr);
  // free weights on the gpu
  cudaFree(gpu_weights);
  cudaFree(gpu_weights_tf1);
#ifndef Weight_On_SplineBySpline_Basis
  cudaFree(gpu_total_weights);
  //KS: Before freeing the variables, destroy the textures
  cudaDestroyTextureObject(text_nParamPerEvent);
  cudaDestroyTextureObject(text_nParamPerEvent_TF1);

  cudaFree(gpu_nParamPerEvent);
  cudaFree(gpu_nParamPerEvent_TF1);
  cudaFreeHost(cpu_total_weights);
  cpu_total_weights = nullptr;
#endif
}

// *******************************************
// Clean up pinned variables at CPU
__host__ void SMonolithGPU::CleanupGPU_Segments(short int *segment, float *vals) {
// *******************************************
  cudaFreeHost(segment);
  cudaFreeHost(vals);

  segment = nullptr;
  vals = nullptr;
}
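
// Note that `segment = nullptr;` and `vals = nullptr;` above only null the local
// copies of the pointers; the caller's pointers are left dangling (the pinned
// memory itself is freed correctly). A minimal sketch of a signature that would
// also null the caller's pointers (illustrative, not part of the original file):
//
//   __host__ void CleanupGPU_Segments(short int** segment, float** vals) {
//     cudaFreeHost(*segment); *segment = nullptr;
//     cudaFreeHost(*vals);    *vals    = nullptr;
//   }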