MaCh3  2.4.2
Reference Guide
gpuMCMCProcessorUtils.cu
Go to the documentation of this file.
2 
3 // ******************************************
4 // CONSTANTS
5 // ******************************************
6 
7 // d_NAME declares DEVICE constants (live on GPU)
// These mirror the host-side h_* values; InitGPU_AutoCorr copies them over
// with cudaMemcpyToSymbol so kernels can read them from __constant__ memory.
// Number of lags evaluated per parameter (loop/index bound in the kernel)
8 __device__ __constant__ int d_nLag;
// Number of parameters (set from n_Pars in InitGPU_AutoCorr, despite the name)
9 __device__ __constant__ int d_nDraws;
// Number of stored MCMC entries (steps) per parameter
10 __device__ __constant__ int d_nEntries;
11 
12 // *******************************************
13 // INITIALISE GPU
14 // *******************************************
15 
// Constructor body (the opening signature line is outside this view):
// null out all GPU buffer pointers so CleanupGPU_AutoCorr's if() guards
// can tell whether anything was ever allocated.
17  ParStep_gpu = nullptr;
18  NumeratorSum_gpu = nullptr;
19  ParamSums_gpu = nullptr;
20  DenomSum_gpu = nullptr;
21 
// -1 is a sentinel meaning InitGPU_AutoCorr has not been called yet
22  h_nLag = -1;
23  h_nDraws = -1;
24  h_nEntries = -1;
25 }
26 
28 }
29 
30 // *******************************************
// KS: Initialiser — copy the size constants into device __constant__ memory
// and allocate the GPU buffers used by the autocorrelation calculation.
// (The opening signature line is missing from this rendered view; per the
// generated docs it is: __host__ void InitGPU_AutoCorr(int n_Entries,
// int n_Pars, const int n_Lags).)
33  int n_Entries,
34  int n_Pars,
35  const int n_Lags) {
36 // *******************************************
37  // Write to the global statics (h_* denotes host stored variable)
// NOTE: h_nDraws actually stores the number of PARAMETERS (n_Pars)
38  h_nDraws = n_Pars;
39  h_nLag = n_Lags;
40  h_nEntries = n_Entries;
41 
42  // Copy the constants
// Mirror the host sizes into the d_* __constant__ symbols read by the kernel
43  cudaMemcpyToSymbol(d_nLag, &h_nLag, sizeof(h_nLag));
45 
46  cudaMemcpyToSymbol(d_nDraws, &h_nDraws, sizeof(h_nDraws));
48 
49  cudaMemcpyToSymbol(d_nEntries, &h_nEntries, sizeof(h_nEntries));
51 
52  // Allocate chunks of memory to GPU
53  //Numerator which is directly used for calculating LagL
// One float per (lag, parameter) pair
54  cudaMalloc((void **) &NumeratorSum_gpu, h_nLag*h_nDraws*sizeof(float));
56 
57  //Denominator which is directly used for calculating LagL
58  cudaMalloc((void **) &DenomSum_gpu, h_nLag*h_nDraws*sizeof(float));
60 
61  //Mean value for a given parameter
62  cudaMalloc((void **) &ParamSums_gpu, h_nDraws*sizeof(float));
64 
65  //store value of parameter for each step
// NOTE(review): sizeof(float*) looks like a typo for sizeof(float) — it
// over-allocates (2x on 64-bit hosts), and CopyToGPU_AutoCorr copies into
// this buffer using sizeof(float). The printf tally below also assumes
// sizeof(float). Confirm and fix.
66  cudaMalloc((void **) &ParStep_gpu, h_nDraws*h_nEntries*sizeof(float*));
68 
// Report the intended total allocation (assumes float-sized elements)
69  printf(" Allocated in total %f MB for autocorrelations calculations on GPU\n",
70  static_cast<double>(sizeof(float) * (h_nLag * h_nDraws + h_nLag * h_nDraws + h_nDraws + h_nDraws * h_nEntries)) / 1.0e6);
71 }
72 
73 // ******************************************************
74 // START COPY TO GPU
75 // ******************************************************
76 
77 // ******************************************************
// KS: Copy the necessary host buffers to the GPU (host -> device).
// Sizes must match the allocations made in InitGPU_AutoCorr.
// (The opening signature line is missing from this rendered view; per the
// generated docs it takes ParStep_cpu, NumeratorSum_cpu, ParamSums_cpu,
// DenomSum_cpu.)
80  float *ParStep_cpu,
81  float *NumeratorSum_cpu,
82  float *ParamSums_cpu,
83  float *DenomSum_cpu) {
84 // ******************************************************
85  //store value of parameter for each step
// One float per (parameter, entry); laid out parameter-major, matching the
// Param*d_nEntries + i indexing in the kernel
86  cudaMemcpy(ParStep_gpu, ParStep_cpu, h_nDraws*h_nEntries*sizeof(float), cudaMemcpyHostToDevice);
88 
89  //Mean value for a given parameter
90  cudaMemcpy(ParamSums_gpu, ParamSums_cpu, h_nDraws*sizeof(float), cudaMemcpyHostToDevice);
92 
93  //Numerator which is directly used for calculating LagL
94  cudaMemcpy(NumeratorSum_gpu, NumeratorSum_cpu, h_nLag*h_nDraws*sizeof(float), cudaMemcpyHostToDevice);
96 
97  //Denominator which is directly used for calculating LagL
98  cudaMemcpy(DenomSum_gpu, DenomSum_cpu, h_nLag*h_nDraws*sizeof(float), cudaMemcpyHostToDevice);
100 }
101 
102 
103 // ********************************************************
104 // START GPU KERNELS
105 //*********************************************************
106 
//*********************************************************
// KS: Eval autocorrelations based on Box and Jenkins.
// One thread per (parameter, lag) pair: CurrentLagNum flattens the
// [d_nDraws x d_nLag] grid, so launch with at least d_nLag*d_nDraws threads
// in a 1D configuration; the bounds guard handles the partial last block.
// Writes one numerator and one denominator per thread to global memory.
__global__ void EvalOnGPU_AutoCorr(
    const float* __restrict__ ParStep_gpu,
    const float* __restrict__ ParamSums_gpu,
    float* NumeratorSum_gpu,
    float* DenomSum_gpu) {
//*********************************************************
  const unsigned int CurrentLagNum = blockIdx.x * blockDim.x + threadIdx.x;

  // Stopping condition: only threads mapped to a valid (parameter, lag) pair work
  if (CurrentLagNum < d_nLag*d_nDraws)
  {
    // KS: Accumulate in registers. Each thread owns its own partial sums and
    // never reads another thread's, so shared memory buys nothing here — and
    // the previous per-thread __shared__ slots required a __syncthreads()
    // inside this divergent branch, which is undefined behaviour when some
    // threads of the block fail the bounds guard.
    float NumeratorSum = 0.0f;
    float DenomSum = 0.0f;

    // Decompose the flat index into which parameter and which lag we handle
    const int Param = int(CurrentLagNum / d_nLag);
    const int nLag = int(CurrentLagNum) - Param * d_nLag;

    // Loop over the number of entries (MCMC steps) for this parameter
    for (int i = 0; i < d_nEntries; ++i)
    {
      // KS: Plain integer index math. The old fmaf(Param, d_nEntries, i)
      // rounded the index through float, which silently produces wrong
      // indices once Param*d_nEntries grows past float's 2^24 integer range.
      int CurrParStep = Param * d_nEntries + i;
      const float Diff = ParStep_gpu[CurrParStep] - ParamSums_gpu[Param];
      // Only sum the numerator up to i = N-k
      if (i < d_nEntries - nLag)
      {
        CurrParStep = Param * d_nEntries + (i + nLag);
        const float LagTerm = ParStep_gpu[CurrParStep] - ParamSums_gpu[Param];
        NumeratorSum += Diff * LagTerm;
      }
      // Square the difference to form the denominator
      DenomSum += Diff * Diff;
    }

    // Each thread writes only its own element, so no synchronisation is needed
    NumeratorSum_gpu[CurrentLagNum] = NumeratorSum;
    DenomSum_gpu[CurrentLagNum] = DenomSum;
  }
}
156 
// *****************************************
// KS: Launch the main kernel that computes the LagL numerator/denominator
// sums, then copy the results back to the CPU.
// Outputs: NumeratorSum_cpu and DenomSum_cpu must each hold h_nLag*h_nDraws
// floats.
__host__ void MCMCProcessorGPU::RunGPU_AutoCorr(float* NumeratorSum_cpu,
                                                float* DenomSum_cpu) {
// *****************************************
  dim3 block_size;
  dim3 grid_size;

  block_size.x = _BlockSize_;
  // Ceil-divide so every (parameter, lag) pair is covered without always
  // launching a spare, fully-idle block when the total divides evenly
  grid_size.x = (h_nLag*h_nDraws + block_size.x - 1) / block_size.x;

  // Argument order must match the kernel signature:
  // (ParStep, ParamSums, NumeratorSum, DenomSum)
  EvalOnGPU_AutoCorr<<<grid_size, block_size>>>(
    ParStep_gpu,
    ParamSums_gpu,
    NumeratorSum_gpu,
    DenomSum_gpu);
  CudaCheckError();

  printf(" Finished calculating now copying results back to CPU \n");

  // KS: Finally copy memory from GPU back to CPU. The blocking cudaMemcpy
  // waits for the kernel to finish, so no explicit synchronisation is needed.
  cudaMemcpy(NumeratorSum_cpu, NumeratorSum_gpu, h_nLag*h_nDraws*sizeof(float), cudaMemcpyDeviceToHost);
  CudaCheckError();

  cudaMemcpy(DenomSum_cpu, DenomSum_gpu, h_nLag*h_nDraws*sizeof(float), cudaMemcpyDeviceToHost);
  CudaCheckError();
}
184 
185 // *********************************
186 // CLEANING
187 // *********************************
188 
189 // *********************************
// KS: Free all GPU buffers allocated by InitGPU_AutoCorr. Guards allow this
// to be called safely even if allocation never happened (pointers were
// nulled in the constructor).
// NOTE(review): pointers are not reset to nullptr after cudaFree, so a
// second call would pass the if() guards and double-free — consider nulling
// them here; confirm against the callers.
// (The opening signature line is missing from this rendered view.)
192 // *********************************
193  if(ParStep_gpu) cudaFree(ParStep_gpu);
194  CudaCheckError();
195  if(NumeratorSum_gpu) cudaFree(NumeratorSum_gpu);
196  CudaCheckError();
197  if(ParamSums_gpu) cudaFree(ParamSums_gpu);
198  CudaCheckError();
199  if(DenomSum_gpu) cudaFree(DenomSum_gpu);
200  CudaCheckError();
201 
202  printf(" Cleared memory at GPU, I am free \n");
203 }
__host__ void InitGPU_AutoCorr(int n_Entries, int n_Pars, const int n_Lags)
KS: Initialiser, here we allocate memory for variables and copy constants.
virtual ~MCMCProcessorGPU()
destructor
__host__ void CleanupGPU_AutoCorr()
KS: free memory on gpu.
MCMCProcessorGPU()
constructor
__host__ void CopyToGPU_AutoCorr(float *ParStep_cpu, float *NumeratorSum_cpu, float *ParamSums_cpu, float *DenomSum_cpu)
KS: Copy necessary variables from CPU to GPU.
__host__ void RunGPU_AutoCorr(float *NumeratorSum_cpu, float *DenomSum_cpu)
KS: This calls the main kernel responsible for calculating LagL and later copies the results back to the CPU.
float * ParStep_gpu
Value of each param at GPU.
__device__ __constant__ int d_nEntries
__global__ void EvalOnGPU_AutoCorr(const float *__restrict__ ParStep_gpu, const float *__restrict__ ParamSums_gpu, float *NumeratorSum_gpu, float *DenomSum_gpu)
Eval autocorrelations based on Box and Jenkins.
__device__ __constant__ int d_nDraws
__device__ __constant__ int d_nLag
#define CudaCheckError()
Definition: gpuUtils.cuh:22
#define _BlockSize_
KS: Needed for shared memory; there is a way to use dynamic shared memory but I am lazy right now.
Definition: gpuUtils.cuh:25