MaCh3  2.5.0
Reference Guide
UnbinnedSplineHandler.cpp
Go to the documentation of this file.
2 
3 #ifdef MaCh3_CUDA
5 #endif
6 
7 #pragma GCC diagnostic ignored "-Wuseless-cast"
8 #pragma GCC diagnostic ignored "-Wfloat-conversion"
9 
10 // *****************************************
11 //Set everything to NULL or 0
13 // *****************************************
14 #ifdef MaCh3_CUDA
15  MACH3LOG_INFO("Using GPU version event by event monolith");
16  gpu_spline_handler = nullptr;
17 #endif
18 
20 
21  nKnots = 0;
22  nTF1coeff = 0;
23  NEvents = 0;
24  _max_knots = 0;
25 
26  NSplines_valid = 0;
27  NTF1_valid = 0;
28 
29  cpu_weights_spline_var = nullptr;
30  cpu_weights_tf1_var = nullptr;
31 
32  cpu_total_weights = nullptr;
33 }
34 
35 // *****************************************
36 UnbinnedSplineHandler::UnbinnedSplineHandler(std::vector<std::vector<TResponseFunction_red*> > &MasterSpline,
37  const std::vector<RespFuncType> &SplineType,
38  const bool SaveFlatTree,
39  const std::string& _FastSplineName) : SplineBase() {
40 // *****************************************
41  //KS: If true it will save spline monolith into huge ROOT file
42  SaveSplineFile = SaveFlatTree;
43  FastSplineName = _FastSplineName;
44  Initialise();
45  MACH3LOG_INFO("-- GPUING WITH arrays and master spline containing TResponseFunction_red");
46 
47  // Convert the TSpline3 pointers to the reduced form and call the reduced constructor
48  PrepareForGPU(MasterSpline, SplineType);
49 }
50 
51 // *****************************************
52 // The shared initialiser from constructors of TSpline3 and TSpline3_red
53 void UnbinnedSplineHandler::PrepareForGPU(std::vector<std::vector<TResponseFunction_red*> > &MasterSpline, const std::vector<RespFuncType> &SplineType) {
54 // *****************************************
55  // Scan for the max number of knots, the number of events (number of splines), and number of parameters
56  int maxnSplines = 0;
57  ScanMasterSpline(MasterSpline,
58  NEvents,
59  _max_knots,
60  nParams,
61  maxnSplines,
63  nKnots,
64  NTF1_valid,
65  nTF1coeff,
66  SplineType);
67 
68  MACH3LOG_INFO("Found {} events", NEvents);
69  MACH3LOG_INFO("Found {} knots at max", _max_knots);
70  MACH3LOG_INFO("Found {} parameters", nParams);
71  MACH3LOG_INFO("Found {} maximum number of splines in an event", maxnSplines);
72  MACH3LOG_INFO("Found total {} knots in all splines", nKnots);
73  MACH3LOG_INFO("Number of splines = {}", NSplines_valid);
74  MACH3LOG_INFO("Found total {} coeffs in all TF1", nTF1coeff);
75  MACH3LOG_INFO("Number of TF1 = {}", NTF1_valid);
76 
77  unsigned int event_size_max = _max_knots * nParams;
78  // Declare the {x}, {y,b,c,d} arrays for all possible splines which the event has
79  // We'll filter off the flat and "disabled" (e.g. CCQE event should not have MARES spline) ones in the next for loop, but need to declare these beasts here
80 
81  // Declare the {y,b,c,d} for each knot
82  // float because GPU precision (could change to double, but will incur significant speed reduction on GPU unless you're very rich!)
83  cpu_spline_handler->coeff_many.resize(nKnots*_nCoeff_); // *4 because we store y,b,c,d parameters in this array
84  //KS: For x coeff we assume that for given dial (MAQE) spacing is identical,
85  // here we are sloppy and assume each dial has the same number of knots, not a big problem
86  cpu_spline_handler->coeff_x.resize(event_size_max, -999);
87 
88  //CW: With TF1 we only save the coefficients and the order of the polynomial
89  // Makes most sense to have one large monolithic array, but then it becomes impossible to tell apart a coefficient from a "number of points". So have two arrays: one of coefficients and one of number of points
90  // Let's first assume all are of _max_knots size
91  // Now declare the arrays for each point in the valid splines which the event actually has (i.e. include the splines that the event undergoes)
92  // Also make array with the number of points per spline (not per spline point!)
93  // float because GPU precision (could change to double, but will incur significant speed reduction on GPU unless you're very rich!)
95  cpu_coeff_TF1_many.resize(nTF1coeff); // *5 because this array holds a,b,c,d,e parameters
96 
97  //KS: Map keeping track how many parameters applies to each event, we keep two numbers here {number of splines per event, index where splines start for a given event}
98  cpu_nParamPerEvent.resize(2 * NEvents, -1);
99  cpu_nParamPerEvent_tf1.resize(2 * NEvents, -1);
100 
101  // Make array with the number of points per spline (not per spline point!)
103  //KS: And array which tells where each spline stars in a big monolith array, sort of knot map
106 
107  // Temporary arrays to hold the coefficients for each spline
108  // We get one x, one y, one b,... for each point, so only need to be _max_knots big
109  //KS: Some params has less splines but this is all right main array will get proper number while this temp will be deleted
110  float *x_tmp = new float[_max_knots]();
111  float *many_tmp = new float[_max_knots*_nCoeff_]();
112  float *temp_coeffs = new float[_nTF1Coeff_]();
113 
114  // Count the number of events
115  unsigned int KnotCounter = 0;
116  unsigned int TF1PointsCounter = 0;
117  unsigned int NSplinesCounter = 0;
118  unsigned int TF1sCounter = 0;
119  int ParamCounter = 0;
120  int ParamCounterGlobal = 0;
121  int ParamCounter_TF1 = 0;
122  int ParamCounterGlobalTF1 = 0;
123  // Loop over events and extract the spline coefficients
124  for(unsigned int EventCounter = 0; EventCounter < MasterSpline.size(); ++EventCounter) {
125  // Structure of MasterSpline is std::vector<std::vector<TSpline3*>>
126  // A conventional iterator to count which parameter a given spline should be applied to
127  for(unsigned int ParamNumber = 0; ParamNumber < MasterSpline[EventCounter].size(); ++ParamNumber) {
128  // If NULL we don't have this spline for the event, so move to next spline
129  if (MasterSpline[EventCounter][ParamNumber] == NULL) continue;
130 
131  if(SplineType[ParamNumber] == kTSpline3_red)
132  {
133  //KS: how much knots each spline has
134  int nPoints_tmp = 0;
135  // Get a pointer to the current spline for this event
136  TResponseFunction_red* TespFunc = MasterSpline[EventCounter][ParamNumber];
137  TSpline3_red* CurrSpline = static_cast<TSpline3_red*>(TespFunc);
138 
139  // If the number of knots are greater than 2 the spline is not a dummy and we should extract coefficients to load onto the GPU
140  GetSplineCoeff_SepMany(CurrSpline, nPoints_tmp, x_tmp, many_tmp);
141 
142  //KS: One knot means flat spline so ignore
143  if (nPoints_tmp == 1) continue;
144  for (int j = 0; j < _max_knots; ++j) {
145  cpu_spline_handler->coeff_x[ParamNumber*_max_knots + j] = x_tmp[j];
146  }
147  //KS: Contrary to X coeff we keep for other coeff only filled knots, there is no much gain for doing so for x coeff
148  for (int j = 0; j < nPoints_tmp; ++j) {
149  for (int k = 0; k < _nCoeff_; k++) {
150  cpu_spline_handler->coeff_many[KnotCounter*_nCoeff_ + j*_nCoeff_ + k] = many_tmp[j*_nCoeff_+k];
151  }
152  }
153  // Set the parameter number for this spline
154  cpu_spline_handler->paramNo_arr[NSplinesCounter] = short(ParamNumber);
155  //KS: Fill map when each spline starts
156  cpu_spline_handler->nKnots_arr[NSplinesCounter] = KnotCounter;
157  KnotCounter += nPoints_tmp;
158 
159  ++ParamCounter;
160  // Increment the counter for the number of good splines we have
161  ++NSplinesCounter;
162  }
163  else if (SplineType[ParamNumber] == kTF1_red)
164  {
165  // Don't actually use this ever -- we give each spline the maximum number of points found in all splines
166  int nPoints_tmp = 0;
167  // Get a pointer to the current spline for this event
168  TF1_red* CurrSpline = dynamic_cast<TF1_red*>(MasterSpline[EventCounter][ParamNumber]);
169 
170  // If the number of knots are greater than 2 the spline is not a dummy and we should extract coefficients to load onto the GPU
171  GetTF1Coeff(CurrSpline, nPoints_tmp, temp_coeffs);
172  for (int j = 0; j < _nTF1Coeff_; ++j) {
173  cpu_coeff_TF1_many[TF1PointsCounter+j] = temp_coeffs[j];
174  }
175  // Save the number of points for this spline
176  cpu_nPoints_arr[TF1sCounter] = short(nPoints_tmp);
177 
178  TF1PointsCounter += nPoints_tmp;
179  // Set the parameter number for this spline
180  cpu_paramNo_TF1_arr[TF1sCounter] = short(ParamNumber);
181  ++ParamCounter_TF1;
182  // Increment the counter for the number of good splines we have
183  ++TF1sCounter;
184  }
185  //KS: Don't delete in debug
186  #ifndef MACH3_DEBUG
187  delete MasterSpline[EventCounter][ParamNumber];
188  MasterSpline[EventCounter][ParamNumber] = nullptr;
189  #endif
190  } // End the loop over the parameters in the MasterSpline
191  cpu_nParamPerEvent[2*EventCounter] = ParamCounter;
192  cpu_nParamPerEvent[2*EventCounter+1] = ParamCounterGlobal;
193  ParamCounterGlobal += ParamCounter;
194 
195  cpu_nParamPerEvent_tf1[2*EventCounter] = ParamCounter_TF1;
196  cpu_nParamPerEvent_tf1[2*EventCounter+1] = ParamCounterGlobalTF1;
197  ParamCounterGlobalTF1 += ParamCounter_TF1;
198 
199  ParamCounter = 0;
200  ParamCounter_TF1 = 0;
201  } // End the loop over the number of events
202  delete[] many_tmp;
203  delete[] x_tmp;
204  delete[] temp_coeffs;
205 
206  int BadXCounter = 0;
207  for (unsigned int j = 0; j < event_size_max; j++) {
208  if (cpu_spline_handler->coeff_x[j] == -999) BadXCounter++;
209  // Perform checks that all entries have been modified from initial values
210  if (cpu_spline_handler->coeff_x[j] == -999 && BadXCounter < 5) {
211  MACH3LOG_WARN("***** BAD X !! *****");
212  MACH3LOG_WARN("Indicates some parameter doesn't have a single spline");
213  MACH3LOG_WARN("j = {}", j);
214  //throw MaCh3Exception(__FILE__ , __LINE__ );
215  }
216  if(BadXCounter == 5) MACH3LOG_WARN("There is more unutilised knots although I will stop spamming");
217  }
218 
219  MACH3LOG_WARN("Found in total {} BAD X", BadXCounter);
220  //KS: This is tricky as this variable use both by CPU and GPU, however if use CUDA we use cudaMallocHost
221  #ifndef MaCh3_CUDA
223  cpu_weights_spline_var = new float[NSplines_valid]();
224  cpu_weights_tf1_var = new float[NTF1_valid]();
225  #endif
226 
227  // Print some info; could probably make this to a separate function
230 
231  MoveToGPU();
232 
233  // Can pass the spline segments to the GPU instead of the values
234  // Make these here and only refill them for each loop, avoiding unnecessary new/delete on each reconfigure
235  SetupSegments();
236 }
237 
238 // *****************************************
239 // The shared initialiser from constructors of TSpline3 and TSpline3_red
241 // *****************************************
242  #ifdef MaCh3_CUDA
243  unsigned int event_size_max = _max_knots * nParams;
244  MACH3LOG_INFO("Total size = {:.2f} MB memory on CPU to move to GPU",
245  (double(sizeof(float) * nKnots * _nCoeff_) + double(sizeof(float) * event_size_max) / 1.E6 +
246  double(sizeof(short int) * NSplines_valid)) / 1.E6);
247  MACH3LOG_INFO("Total TF1 size = {:.2f} MB memory on CPU to move to GPU",
248  double(sizeof(float) * NTF1_valid * _nTF1Coeff_) / 1.E6);
249  MACH3LOG_INFO("GPU weight array (GPU->CPU every step) = {:.2f} MB", static_cast<double>(sizeof(float)) * (NSplines_valid + NTF1_valid) / 1.0e6);
250  MACH3LOG_INFO("Since you are running Total event weight mode then GPU weight array (GPU->CPU every step) = {:.2f} MB",
251  double(sizeof(float) * NEvents) / 1.E6);
252  MACH3LOG_INFO("Parameter value array (CPU->GPU every step) = {:.4f} MB", double(sizeof(float) * nParams) / 1.E6);
253  //CW: With the new set-up we have: 1 coefficient array of size coeff_array_size, all same size
254  // 1 coefficient array of size coeff_array_size*4, holding y,b,c,d in order (y11,b11,c11,d11; y12,b12,c12,d12;...) where ynm is n = spline number, m = spline point. Should really make array so that order is (y11,b11,c11,d11; y21,b21,c21,d21;...) because it will optimise cache hits I think; try this if you have time
255  // return gpu_weights
256 
258 
259  // The gpu_XY arrays don't actually need initialising, since they are only placeholders for what we'll move onto the GPU. As long as we cudaMalloc the size of the arrays correctly there shouldn't be any problems
260  // Can probably make this a bit prettier but will do for now
261  // Could be a lot smaller of a function...
264  NEvents,
265  nKnots, // How many entries in coefficient array (*4 for the "many" array)
266  NSplines_valid, // What's the number of splines we have (also number of entries in gpu_nPoints_arr)
267  NTF1_valid,
268  event_size_max //Knots times event number of unique splines
269  );
270 
271  // Move number of splines and spline size to constant GPU memory; every thread does not need a copy...
272  // The implementation lives in splines/gpuSplineUtils.cu
273  // The GPU splines don't actually need declaring but is good for demonstration, kind of
274  // fixed by passing const reference
277 
278  // TFI related now
281  NEvents,
284  nParams,
286  _max_knots,
287  nKnots,
288  NTF1_valid);
289 
290  // Delete all the coefficient arrays from the CPU once they are on the GPU
295  delete cpu_spline_handler;
296  cpu_spline_handler = nullptr;
297  MACH3LOG_INFO("Good GPU loading");
298  #endif
299 }
300 
301 // Need to specify template functions in header
302 // *****************************************
303 // Scan the master spline to get the maximum number of knots in any of the TSpline3*
304 void UnbinnedSplineHandler::ScanMasterSpline(std::vector<std::vector<TResponseFunction_red*> > & MasterSpline,
305  unsigned int &nEvents,
306  short int &MaxPoints,
307  short int &numParams,
308  int &nSplines,
309  unsigned int &NSplinesValid,
310  unsigned int &numKnots,
311  unsigned int &nTF1Valid,
312  unsigned int &nTF1_coeff,
313  const std::vector<RespFuncType> &SplineType) {
314 // *****************************************
315  // Need to extract: the total number of events
316  // number of parameters
317  // maximum number of knots
318  MaxPoints = 0;
319  nEvents = 0;
320  numParams = 0;
321  nSplines = 0;
322  numKnots = 0;
323  NSplinesValid = 0;
324  nTF1Valid = 0;
325  nTF1_coeff = 0;
326 
327  // Check the number of events
328  nEvents = int(MasterSpline.size());
329 
330  // Maximum number of splines one event can have (scan through and find this number)
331  int nMaxSplines_PerEvent = 0;
332 
333  //KS: We later check that each event has the same number of splines so this is fine
334  numParams = short(MasterSpline[0].size());
335  // Initialise
336  SplineInfoArray.resize(numParams);
337 
338  // Loop over each parameter
339  for(unsigned int EventCounter = 0; EventCounter < MasterSpline.size(); ++EventCounter) {
340  // Check that each event has each spline saved
341  if (numParams > 0) {
342  int TempSize = int(MasterSpline[EventCounter].size());
343  if (TempSize != numParams) {
344  MACH3LOG_ERROR("Found {} parameters for event {}", TempSize, EventCounter);
345  MACH3LOG_ERROR("but was expecting {} since that's what I found for the previous event", numParams);
346  MACH3LOG_ERROR("Somehow this event has a different number of spline parameters... Please study further!");
347  throw MaCh3Exception(__FILE__ , __LINE__ );
348  }
349  }
350  numParams = short(MasterSpline[EventCounter].size());
351 
352  int nSplines_SingleEvent = 0;
353  int nPoints = 0;
354  // Loop over each pointer
355  for(unsigned int ParamNumber = 0; ParamNumber < MasterSpline[EventCounter].size(); ++ParamNumber) {
356  if (MasterSpline[EventCounter][ParamNumber]) {
357  if(SplineType[ParamNumber] == kTSpline3_red)
358  {
359  TResponseFunction_red* TespFunc = MasterSpline[EventCounter][ParamNumber];
360  TSpline3_red* CurrSpline = dynamic_cast<TSpline3_red*>(TespFunc);
361  if(CurrSpline){
362  nPoints = CurrSpline->GetNp();
363  }
364 
365  if (nPoints > MaxPoints) {
366  MaxPoints = static_cast<short int>(nPoints);
367  }
368  numKnots += nPoints;
369  nSplines_SingleEvent++;
370 
371  // Fill the SplineInfoArray entries with information on each splinified parameter
372  if (SplineInfoArray[ParamNumber].xPts.size() == 0)
373  {
374  // Fill the number of points
375  SplineInfoArray[ParamNumber].nPts = CurrSpline->GetNp();
376 
377  // Fill the x points
378  SplineInfoArray[ParamNumber].xPts.resize(SplineInfoArray[ParamNumber].nPts);
379  for (M3::int_t k = 0; k < SplineInfoArray[ParamNumber].nPts; ++k)
380  {
381  M3::float_t xtemp = M3::float_t(-999.99);
382  M3::float_t ytemp = M3::float_t(-999.99);
383  CurrSpline->GetKnot(k, xtemp, ytemp);
384  SplineInfoArray[ParamNumber].xPts[k] = xtemp;
385  }
386  }
387  NSplinesValid++;
388  }
389  else if (SplineType[ParamNumber] == kTF1_red)
390  {
391  TResponseFunction_red* TespFunc = MasterSpline[EventCounter][ParamNumber];
392  TF1_red* CurrSpline = dynamic_cast<TF1_red*>(TespFunc);
393  nPoints = CurrSpline->GetSize();
394  nTF1_coeff += nPoints;
395  nTF1Valid++;
396  }
397  } else {
398  // If NULL we don't have this spline for the event, so move to next spline
399  continue;
400  }
401  }
402  if (nSplines_SingleEvent > nMaxSplines_PerEvent) nMaxSplines_PerEvent = nSplines_SingleEvent;
403  }
404  nSplines = nMaxSplines_PerEvent;
405 
406  int Counter = 0;
407  //KS: Sanity check that everything was set correctly
408  for (M3::int_t i = 0; i < numParams; ++i)
409  {
410  // KS: We don't find segment for TF1, so ignore this
411  if (SplineType[i] == kTF1_red) continue;
412 
413  const M3::int_t nPoints = SplineInfoArray[i].nPts;
414  const std::vector<M3::float_t>& xArray = SplineInfoArray[i].xPts;
415  if (nPoints == -999 || xArray.size() == 0) {
416  Counter++;
417  if(Counter < 5) {
418  MACH3LOG_WARN("SplineInfoArray[{}] isn't set yet", i);
419  }
420  continue;
421  //throw MaCh3Exception(__FILE__ , __LINE__ );
422  }
423  }
424  MACH3LOG_WARN("In total SplineInfoArray for {} hasn't been initialised", Counter);
425 }
426 
427 // *****************************************
428 // Load SplineFile
430  : SplineBase() {
431 // *****************************************
432  Initialise();
433  MACH3LOG_INFO("-- GPUING WITH {X} and {Y,B,C,D} arrays and master spline containing TSpline3_red");
434  // Convert the TSpline3 pointers to the reduced form and call the reduced constructor
435  LoadSplineFile(FileName);
436 }
437 
438 // *****************************************
439 // Load SplineMonolith from ROOT file
440 void UnbinnedSplineHandler::LoadSplineFile(std::string FileName) {
441 // *****************************************
442  M3::AddPath(FileName);
443  auto SplineFile = std::make_unique<TFile>(FileName.c_str(), "OPEN");
444  TTree *Settings = SplineFile->Get<TTree>("Settings");
445  TTree *Monolith_TF1 = SplineFile->Get<TTree>("Monolith_TF1");
446  TTree *EventInfo = SplineFile->Get<TTree>("EventInfo");
447  TTree *SplineTree = SplineFile->Get<TTree>("SplineTree");
448 
449  unsigned int NEvents_temp;
450  short int nParams_temp;
451  int _max_knots_temp;
452  unsigned int nKnots_temp;
453  unsigned int NSplines_valid_temp;
454  unsigned int nTF1Valid_temp;
455  unsigned int nTF1coeff_temp;
456 
457  Settings->SetBranchAddress("NEvents", &NEvents_temp);
458  Settings->SetBranchAddress("nParams", &nParams_temp);
459  Settings->SetBranchAddress("_max_knots", &_max_knots_temp);
460  Settings->SetBranchAddress("nKnots", &nKnots_temp);
461  Settings->SetBranchAddress("NSplines_valid", &NSplines_valid_temp);
462  Settings->SetBranchAddress("NTF1_valid", &nTF1Valid_temp);
463  Settings->SetBranchAddress("nTF1coeff", &nTF1coeff_temp);
464 
465  Settings->GetEntry(0);
466 
467  NEvents = NEvents_temp;
468  nParams = nParams_temp;
469  _max_knots = static_cast<short int>(_max_knots_temp);
470  nKnots = nKnots_temp;
471  NSplines_valid = NSplines_valid_temp;
472  NTF1_valid = nTF1Valid_temp;
473  nTF1coeff = nTF1coeff_temp;
474 
475  cpu_nParamPerEvent.resize(2*NEvents);
478 
479  //KS: This is tricky as this variable use both by CPU and GPU, however if use CUDA we use cudaMallocHost
480 #ifndef MaCh3_CUDA
482  cpu_weights_spline_var = new float[NSplines_valid]();
483  cpu_weights_tf1_var = new float[NTF1_valid]();
484 #endif
485 
486  SplineTree->SetBranchAddress("SplineObject", &cpu_spline_handler);
487  SplineTree->GetEntry(0);
488 
489  float coeff_tf1 = 0.;
490  Monolith_TF1->SetBranchAddress("cpu_coeff_TF1_many", &coeff_tf1);
491  for(unsigned int i = 0; i < nTF1coeff; i++)
492  {
493  Monolith_TF1->GetEntry(i);
494  cpu_coeff_TF1_many[i] = coeff_tf1;
495  }
496 
497  unsigned int nParamPerEvent = 0;
498  unsigned int nParamPerEvent_tf1 = 0;
499 
500  EventInfo->SetBranchAddress("cpu_nParamPerEvent", &nParamPerEvent);
501  EventInfo->SetBranchAddress("cpu_nParamPerEvent_tf1", &nParamPerEvent_tf1);
502  for(unsigned int i = 0; i < 2*NEvents; i++)
503  {
504  EventInfo->GetEntry(i);
505  cpu_nParamPerEvent[i] = nParamPerEvent;
506  cpu_nParamPerEvent_tf1[i] = nParamPerEvent_tf1;
507  }
508 
509  LoadFastSplineInfoDir(SplineFile);
510 
511  SplineFile->Close();
512 
513  // Print some info; could probably make this to a separate function
515 
516  MoveToGPU();
517 
518  SetupSegments();
519 }
520 
521 // *****************************************
523 // *****************************************
524  //KS: Since we are going to copy it each step use fancy CUDA memory allocation
525  #ifdef MaCh3_CUDA
528  #else
529  SplineSegments = new short int[nParams]();
530  ParamValues = new float[nParams]();
531  #endif
532  for (M3::int_t j = 0; j < nParams; j++)
533  {
534  SplineSegments[j] = 0;
535  ParamValues[j] = -999;
536  }
537 }
538 
539 // *****************************************
540 // Save SplineMonolith into ROOT file
541 void UnbinnedSplineHandler::PrepareSplineFile(std::string FileName) {
542 // *****************************************
543  M3::AddPath(FileName);
544 
545  auto SplineFile = std::make_unique<TFile>(FileName.c_str(), "recreate");
546  TTree *Settings = new TTree("Settings", "Settings");
547  TTree *Monolith_TF1 = new TTree("Monolith_TF1", "Monolith_TF1");
548  TTree *XKnots = new TTree("XKnots", "XKnots");
549  TTree *EventInfo = new TTree("EventInfo", "EventInfo");
550 
551  unsigned int NEvents_temp = NEvents;
552  short int nParams_temp = nParams;
553  int _max_knots_temp = _max_knots;
554  unsigned int nKnots_temp = nKnots;
555  unsigned int NSplines_valid_temp = NSplines_valid;
556  unsigned int nTF1Valid_temp = NTF1_valid;
557  unsigned int nTF1coeff_temp = nTF1coeff;
558 
559  Settings->Branch("NEvents", &NEvents_temp, "NEvents/i");
560  Settings->Branch("nParams", &nParams_temp, "nParams/S");
561  Settings->Branch("_max_knots", &_max_knots_temp, "_max_knots/I");
562  Settings->Branch("nKnots", &nKnots_temp, "nKnots/i");
563  Settings->Branch("NSplines_valid", &NSplines_valid_temp, "NSplines_valid/i");
564  Settings->Branch("NTF1_valid", &nTF1Valid_temp, "NTF1_valid/i");
565  Settings->Branch("nTF1coeff", &nTF1coeff_temp, "nTF1coeff/i");
566 
567  Settings->Fill();
568 
569  SplineFile->cd();
570  Settings->Write();
571 
572  TTree *SplineTree = new TTree("SplineTree", "SplineTree");
573  // Create a branch for the SplineMonoStruct object
574  SplineTree->Branch("SplineObject", &cpu_spline_handler);
575  SplineTree->Fill();
576  SplineTree->Write();
577  delete SplineTree;
578 
579  float coeff_tf1 = 0.;
580  Monolith_TF1->Branch("cpu_coeff_TF1_many", &coeff_tf1, "cpu_coeff_TF1_many/F");
581  for(unsigned int i = 0; i < nTF1coeff; i++)
582  {
583  coeff_tf1 = cpu_coeff_TF1_many[i];
584  Monolith_TF1->Fill();
585  }
586  SplineFile->cd();
587  Monolith_TF1->Write();
588 
589  unsigned int nParamPerEvent = 0;
590  unsigned int nParamPerEvent_tf1 = 0;
591 
592  EventInfo->Branch("cpu_nParamPerEvent", &nParamPerEvent, "cpu_nParamPerEvent/i");
593  EventInfo->Branch("cpu_nParamPerEvent_tf1", &nParamPerEvent_tf1, "cpu_nParamPerEvent_tf1/i");
594 
595  for(unsigned int i = 0; i < 2*NEvents; i++)
596  {
597  nParamPerEvent = cpu_nParamPerEvent[i];
598  nParamPerEvent_tf1 = cpu_nParamPerEvent_tf1[i];
599  EventInfo->Fill();
600  }
601  SplineFile->cd();
602  EventInfo->Write();
603 
604  PrepareFastSplineInfoDir(SplineFile);
605 
606  delete Settings;
607  delete Monolith_TF1;
608  delete XKnots;
609  delete EventInfo;
610  SplineFile->Close();
611 }
612 
613 // *****************************************
614 // Destructor
615 // Cleans up the allocated GPU memory
617 // *****************************************
618  #ifdef MaCh3_CUDA
619  //KS: Since we declared them using CUDA alloc we have to free memory using also cuda functions
621  delete gpu_spline_handler;
622  #else
623  if(SplineSegments != nullptr) delete[] SplineSegments;
624  if(ParamValues != nullptr) delete[] ParamValues;
625  if(cpu_total_weights != nullptr) delete[] cpu_total_weights;
626  #endif
627 
628  if(cpu_weights_spline_var != nullptr) delete[] cpu_weights_spline_var;
629  if(cpu_weights_tf1_var != nullptr) delete[] cpu_weights_tf1_var;
630 
631  if(cpu_spline_handler != nullptr) delete cpu_spline_handler;
632 }
633 
634 // *****************************************
635 // Get the spline coefficients from the TSpline3 so that we can load ONLY these onto the GPU, not the whole TSpline3 object
636 // This loads up coefficients into two arrays: one x array and one yabcd array
637 // This should maximize our cache hits!
638 void UnbinnedSplineHandler::GetSplineCoeff_SepMany(TSpline3_red* &spl, int &nPoints, float *& xArray, float *& manyArray) const {
639 // *****************************************
640  // Initialise all arrays to 1.0
641  for (int i = 0; i < _max_knots; ++i) {
642  xArray[i] = 1.0;
643  for (int j = 0; j < _nCoeff_; j++) {
644  manyArray[i*_nCoeff_+j] = 1.0;
645  }
646  }
647  // Get number of points in spline
648  int Np = spl->GetNp();
649  // If spline is flat, set number of knots to 1.0,
650  // This is used later to expedite the calculations for flat splines
651  // tmpArray[0] is number of knots
652  nPoints = Np;
653  if (Np > _max_knots) {
654  MACH3LOG_ERROR("Error, number of points is greater than saved {}", _max_knots);
655  MACH3LOG_ERROR("This _WILL_ cause problems with GPU splines and _SHOULD_ be fixed!");
656  MACH3LOG_ERROR("nPoints = {}, _max_knots = {}", nPoints, _max_knots);
657  throw MaCh3Exception(__FILE__ , __LINE__ );
658  }
659 
660  // The coefficients we're writing to
661  M3::float_t x, y, b, c, d;
662  // TSpline3 can only take doubles, not floats
663  // But our GPU is slow with doubles, so need to cast to float
664  for(int i = 0; i < Np; i++) {
665  // Get the coefficients from the TSpline3 object
666  spl->GetCoeff(i, x, y, b, c, d);
667  // Write the arrays
668  xArray[i] = float(x);
669  manyArray[i*_nCoeff_] = float(y); // 4 because manyArray stores y,b,c,d
670  manyArray[i*_nCoeff_+1] = float(b);
671  manyArray[i*_nCoeff_+2] = float(c);
672  manyArray[i*_nCoeff_+3] = float(d);
673  if((xArray[i] == -999) || (manyArray[i*_nCoeff_] == -999) || (manyArray[i*_nCoeff_ +1] == -999) || (manyArray[i*_nCoeff_+2] == -999) || (manyArray[i*_nCoeff_+3] == -999)){
674  MACH3LOG_ERROR("*********** Bad params in {} ************", __func__);
675  MACH3LOG_ERROR("pre cast to float (x, y, b, c, d) = {:.2f}, {:.2f}, {:.2f}, {:.2f}, {:.2f}", x, y, b, c, d);
676  MACH3LOG_ERROR("pre cast to float (x, y, b, c, d) = {:.2f}, {:.2f}, {:.2f}, {:.2f}, {:.2f}", xArray[i], manyArray[i*_nCoeff_], manyArray[i*_nCoeff_+1], manyArray[i*_nCoeff_+2], manyArray[i*_nCoeff_+3]);
677  MACH3LOG_ERROR("This will cause problems when preparing for GPU");
678  MACH3LOG_ERROR("***************************************************************");
679  }
680  }
681 }
682 
683 #ifdef MaCh3_CUDA
684 // *****************************************
685 // Tell the GPU to evaluate the weights
686 // Load up the two x,{y,b,c,d} arrays into memory and have GPU read them with more coalescence instead of one monolithic array
687 // This should be used when we're using separate x,y,a,b,c,d arrays
688 // Also pass the segments for the parameter along with their parameter values
689 // This avoids doing lots of binary searches on the GPU
691 // *****************************************
692  // There's a parameter mapping that goes from spline parameter to a global parameter index
693  // Find the spline segments
695 
696  // The main call to the GPU
699  ParamValues,
701 }
702 #else
703 //If CUDA is not enabled do the same on CPU
704 // *****************************************
706 // *****************************************
707  // There's a parameter mapping that goes from spline parameter to a global parameter index
708  // Find the spline segments
710 
711  //KS: Huge MP loop over all valid splines
713 
714  //KS: Huge MP loop over all events calculating total weight per event
716 }
717 #endif
718 
719 //*********************************************************
721 //*********************************************************
722  #ifdef MULTITHREAD
723  //KS: Open parallel region
724  #pragma omp parallel
725  {
726  #endif
727  //KS: First we calculate
728  #ifdef MULTITHREAD
729  #pragma omp for simd nowait
730  #endif
731  for (unsigned int splineNum = 0; splineNum < NSplines_valid; ++splineNum)
732  {
733  //CW: Which Parameter we are accessing
734  const short int Param = cpu_spline_handler->paramNo_arr[splineNum];
735 
736  //CW: Avoids doing costly binary search on GPU
737  const short int segment = SplineSegments[Param];
738 
739  //KS: Segment for coeff_x is simply parameter*max knots + segment as each parameters has the same spacing
740  const short int segment_X = short(Param*_max_knots+segment);
741 
742  //KS: Find knot position in out monolithical structure
743  const unsigned int CurrentKnotPos = cpu_spline_handler->nKnots_arr[splineNum]*_nCoeff_+segment*_nCoeff_;
744 
745  // We've read the segment straight from CPU and is saved in segment_gpu
746  // polynomial parameters from the monolithic splineMonolith
747  const float fY = cpu_spline_handler->coeff_many[CurrentKnotPos];
748  const float fB = cpu_spline_handler->coeff_many[CurrentKnotPos + 1];
749  const float fC = cpu_spline_handler->coeff_many[CurrentKnotPos + 2];
750  const float fD = cpu_spline_handler->coeff_many[CurrentKnotPos + 3];
751  // The is the variation itself (needed to evaluate variation - stored spline point = dx)
752  const float dx = ParamValues[Param] - cpu_spline_handler->coeff_x[segment_X];
753 
754  //CW: Wooow, let's use some fancy intrinsic and pull down the processing time by <1% from normal multiplication! HURRAY
755  cpu_weights_spline_var[splineNum] = fmaf(dx, fmaf(dx, fmaf(dx, fD, fC), fB), fY);
756  // Or for the more "easy to read" version:
757  //cpu_weights_spline_var[splineNum] = (fY+dx*(fB+dx*(fC+dx*fD)));
758  }
759 
760  #ifdef MULTITHREAD
761  #pragma omp for simd
762  #endif
763  for (unsigned int tf1Num = 0; tf1Num < NTF1_valid; ++tf1Num)
764  {
765  // The is the variation itself (needed to evaluate variation - stored spline point = dx)
766  const float x = ParamValues[cpu_paramNo_TF1_arr[tf1Num]];
767 
768  // Read the coefficients
769  const unsigned int TF1_Index = tf1Num * _nTF1Coeff_;
770  const float a = cpu_coeff_TF1_many[TF1_Index];
771  const float b = cpu_coeff_TF1_many[TF1_Index + 1];
772 
773  cpu_weights_tf1_var[tf1Num] = fmaf(a, x, b);
774  // cpu_weights_tf1_var[tf1Num] = a*x + b;
775  //cpu_weights_tf1_var[splineNum] = 1 + a*x + b*x*x + c*x*x*x + d*x*x*x*x + e*x*x*x*x*x;
776  }
777  #ifdef MULTITHREAD
778  //KS: End parallel region
779  }
780  #endif
781 }
782 
783 //*********************************************************
784 //KS: Calc total event weight on CPU
786 //*********************************************************
787  #ifdef MULTITHREAD
788  #pragma omp parallel for
789  #endif
790  for (unsigned int EventNum = 0; EventNum < NEvents; ++EventNum)
791  {
792  float totalWeight = 1.0f; // Initialize total weight for each event
793 
794  const unsigned int Offset = 2 * EventNum;
795 
796  // Extract the parameters for the current event
797  const unsigned int startIndex = cpu_nParamPerEvent[Offset + 1];
798  const unsigned int numParams = cpu_nParamPerEvent[Offset];
799 
800  // Compute total weight for the current event
801  #ifdef MULTITHREAD
802  #pragma omp simd reduction(*:totalWeight)
803  #endif
804  for (unsigned int id = 0; id < numParams; ++id) {
805  totalWeight *= cpu_weights_spline_var[startIndex + id];
806  }
807  //Now TF1
808  // Extract the parameters for the current event
809  const unsigned int startIndex_tf1 = cpu_nParamPerEvent_tf1[Offset + 1];
810  const unsigned int numParams_tf1 = cpu_nParamPerEvent_tf1[Offset];
811 
812  // Compute total weight for the current event
813  #ifdef MULTITHREAD
814  #pragma omp simd reduction(*:totalWeight)
815  #endif
816  for (unsigned int id = 0; id < numParams_tf1; ++id) {
817  totalWeight *= cpu_weights_tf1_var[startIndex_tf1 + id];
818  }
819 
820  // Store the total weight for the current event
821  cpu_total_weights[EventNum] = static_cast<M3::float_t>(totalWeight);
822  }
823 }
824 
825 //*********************************************************
826 //KS: Print info about how much knots etc has been initialised
828 //*********************************************************
829  unsigned int event_size_max = _max_knots * nParams;
830 
831  MACH3LOG_INFO("--- INITIALISED Spline Monolith ---");
832  MACH3LOG_INFO("{} events with {} splines", NEvents, NSplines_valid);
833  MACH3LOG_INFO("On average {:.2f} splines per event ({}/{})", float(NSplines_valid)/float(NEvents), NSplines_valid, NEvents);
834  MACH3LOG_INFO("Size of x array = {:.4f} MB", double(sizeof(float)*event_size_max)/1.E6);
835  MACH3LOG_INFO("Size of coefficient (y,b,c,d) array = {:.2f} MB", double(sizeof(float)*nKnots*_nCoeff_)/1.E6);
836  MACH3LOG_INFO("Size of parameter # array = {:.2f} MB", double(sizeof(short int)*NSplines_valid)/1.E6);
837 
838  MACH3LOG_INFO("On average {:.2f} TF1 per event ({}/{})", float(NTF1_valid)/float(NEvents), NTF1_valid, NEvents);
839  MACH3LOG_INFO("Size of TF1 coefficient (a,b,c,d,e) array = {:.2f} MB", double(sizeof(float)*NTF1_valid*_nTF1Coeff_)/1.E6);
840 }
841 
842 //*********************************************************
843 //KS: After calculations are done on GPU we copy memory to CPU. This operation is asynchronous meaning while memory is being copied some operations are being carried. Memory must be copied before actual reweight. This function make sure all has been copied.
845 //*********************************************************
846  #ifdef MaCh3_CUDA
848  CudaCheckError();
849  #endif
850 }
#define MACH3LOG_ERROR
Definition: MaCh3Logger.h:37
#define MACH3LOG_INFO
Definition: MaCh3Logger.h:35
#define MACH3LOG_WARN
Definition: MaCh3Logger.h:36
@ kTF1_red
Uses TF1_red for interpolation.
@ kTSpline3_red
Uses TSpline3_red for interpolation.
void CleanVector(T &)
Base case: do nothing for non-vector types.
constexpr int _nCoeff_
KS: We store coefficients {y,b,c,d} in one array one by one, this is only to define it once rather th...
Definition: SplineCommon.h:18
constexpr int _nTF1Coeff_
KS: For TF1 we store at most 5 coefficients, we could make it more flexible but for now define it her...
Definition: SplineCommon.h:20
Custom exception class used throughout MaCh3.
Base class for calculating weight from spline.
Definition: SplineBase.h:27
void GetTF1Coeff(TF1_red *&spl, int &nPoints, float *&coeffs) const
CW: Gets the polynomial coefficients for TF1.
Definition: SplineBase.cpp:115
short int nParams
Number of parameters that have splines.
Definition: SplineBase.h:81
void FindSplineSegment()
CW:Code used in step by step reweighting, Find Spline Segment for each param.
Definition: SplineBase.cpp:44
short int * SplineSegments
Definition: SplineBase.h:77
std::vector< FastSplineInfo > SplineInfoArray
Definition: SplineBase.h:74
float * ParamValues
Store parameter values they are not in FastSplineInfo as in case of GPU we need to copy paste it to G...
Definition: SplineBase.h:79
void LoadFastSplineInfoDir(std::unique_ptr< TFile > &SplineFile)
KS: Load preprocessed FastSplineInfo.
Definition: SplineBase.cpp:167
void PrepareFastSplineInfoDir(std::unique_ptr< TFile > &SplineFile) const
KS: Prepare Fast Spline Info within SplineFile.
Definition: SplineBase.cpp:139
Class responsible for calculating spline weight on GPU.
__host__ void RunGPU_SplineMonolith(M3::float_t *cpu_total_weights, float *vals, short int *segment)
Run the GPU code for the separate many arrays. As in separate {x}, {y,b,c,d} arrays Pass the segment ...
__host__ void InitGPU_Vals(float **vals)
Allocate memory for spline segments.
__host__ void CleanupPinnedMemory(M3::float_t *cpu_total_weights, short int *segment, float *vals)
Clean up pinned variables at CPU.
__host__ void InitGPU_SplineMonolith(M3::float_t **cpu_total_weights, int n_events, unsigned int total_nknots, unsigned int n_splines, unsigned int n_tf1, int Eve_size)
Allocate memory on gpu for spline monolith.
__host__ void CopyToGPU_SplineMonolith(const SplineMonoStruct *cpu_spline_handler, const std::vector< float > &cpu_many_array_TF1, const std::vector< short int > &cpu_paramNo_arr_TF1, const int n_events, const std::vector< unsigned int > &cpu_nParamPerEvent, const std::vector< unsigned int > &cpu_nParamPerEvent_TF1, const int n_params, const unsigned int n_splines, const short int spline_size, const unsigned int total_nknots, const unsigned int n_tf1)
Copies data from CPU to GPU for the spline monolith.
__host__ void InitGPU_Segments(short int **segment)
Allocate memory for spline segments.
CW: A reduced TF1 class only. Only saves parameters for each TF1 and how many parameters each paramet...
int GetSize()
Get the size.
KS: A reduced ResponseFunction Generic function used for evaluating weight.
CW: Reduced TSpline3 class.
void GetKnot(int i, M3::float_t &xtmp, M3::float_t &ytmp)
M3::int_t GetNp() override
CW: Get the number of points.
void GetCoeff(int segment, M3::float_t &x, M3::float_t &y, M3::float_t &b, M3::float_t &c, M3::float_t &d)
CW: Get the coefficient of a given segment.
float * cpu_weights_tf1_var
CPU arrays to hold weight for each TF1.
float * cpu_weights_spline_var
CPU arrays to hold weight for each spline.
unsigned int nTF1coeff
Sum of all coefficients over all TF1.
unsigned int NTF1_valid
Number of valid TF1.
void ScanMasterSpline(std::vector< std::vector< TResponseFunction_red * > > &MasterSpline, unsigned int &nEvents, short int &MaxPoints, short int &numParams, int &nSplines, unsigned int &NSplinesValid, unsigned int &numKnots, unsigned int &nTF1Valid, unsigned int &nTF1_coeff, const std::vector< RespFuncType > &SplineType)
CW: Function to scan through the MasterSpline of TSpline3.
void PrepareSplineFile(std::string FileName) final
KS: Prepare spline file that can be used for fast loading.
std::vector< unsigned int > cpu_nParamPerEvent
KS: CPU map keeping track how many parameters applies to each event, we keep two numbers here {number...
void Evaluate() final
CW: This Eval should be used when using two separate x,{y,a,b,c,d} arrays to store the weights; proba...
void SynchroniseMemTransfer() const final
KS: After calculations are done on GPU we copy memory to CPU. This operation is asynchronous meaning ...
std::vector< short int > cpu_paramNo_TF1_arr
CW: CPU array with the number of points per spline (not per spline point!)
void PrintInitialsiation() const
KS: Print info about how much knots etc has been initialised.
std::vector< unsigned int > cpu_nParamPerEvent_tf1
KS: CPU map keeping track how many parameters applies to each event, we keep two numbers here {number...
SplineMonolithGPU * gpu_spline_handler
KS: Store info about Spline monolith, this allow to obtain better step time. As all necessary informa...
unsigned int NSplines_valid
Number of valid splines.
void MoveToGPU()
CW: The shared initialiser from constructors of TResponseFunction_red.
unsigned int nKnots
Sum of all knots over all splines.
void CalcSplineWeights() final
CPU based code which eval weight for each spline.
short int _max_knots
Max knots for production.
std::vector< short int > cpu_nPoints_arr
CPU arrays to hold number of points.
void PrepareForGPU(std::vector< std::vector< TResponseFunction_red * > > &MasterSpline, const std::vector< RespFuncType > &SplineType)
CW: Prepare the TSpline3_red objects for the GPU.
void Initialise()
KS: Set everything to null etc.
SplineMonoStruct * cpu_spline_handler
KS: Store info about Spline monolith, this allow to obtain better step time. As all necessary informa...
void LoadSplineFile(std::string FileName) final
KS: Load preprocessed spline file.
virtual ~UnbinnedSplineHandler()
Destructor for UnbinnedSplineHandler class.
std::vector< float > cpu_coeff_TF1_many
CPU arrays to hold TF1 coefficients.
void GetSplineCoeff_SepMany(TSpline3_red *&spl, int &nPoints, float *&xArray, float *&manyArray) const
CW: This loads up coefficients into two arrays: one x array and one yabcd array.
M3::float_t * cpu_total_weights
KS: This holds the total CPU weights that gets read in SampleHandler.
UnbinnedSplineHandler(std::vector< std::vector< TResponseFunction_red * > > &MasterSpline, const std::vector< RespFuncType > &SplineType, const bool SaveFlatTree=false, const std::string &_FastSplineName="SplineFile.root")
Constructor.
bool SaveSplineFile
Flag telling whether we are saving spline monolith into handy root file.
unsigned int NEvents
Number of events.
std::string FastSplineName
Name of Fast Spline to which will be saved.
void CalcTotalEventWeight()
Calc total event weight.
__host__ void SynchroniseSplines()
Make sure all Cuda threads finished execution.
MaCh3 event-by-event cross-section spline code.
#define CudaCheckError()
Definition: gpuUtils.cuh:21
double float_t
Definition: Core.h:37
void AddPath(std::string &FilePath)
Prepends the MACH3 environment path to FilePath if it is not already present.
Definition: Monitor.cpp:382
int int_t
Definition: Core.h:38
Stores info about each MC event used during reweighting routine.
KS: Struct storing information for spline monolith.
Definition: SplineCommon.h:35
std::vector< unsigned int > nKnots_arr
KS: CPU Number of knots per spline.
Definition: SplineCommon.h:47
std::vector< float > coeff_x
KS: CPU arrays to hold X coefficient.
Definition: SplineCommon.h:41
std::vector< float > coeff_many
CPU arrays to hold other coefficients.
Definition: SplineCommon.h:44
std::vector< short int > paramNo_arr
CW: CPU array with the number of points per spline (not per spline point!)
Definition: SplineCommon.h:50