MaCh3  2.2.3
Reference Guide
gpuMCMCProcessorUtils.cu
Go to the documentation of this file.
2 
3 // ******************************************
4 // CONSTANTS
5 // ******************************************
6 
// d_NAME declares DEVICE constants (live on GPU)
// Written once from the host via cudaMemcpyToSymbol in InitGPU_AutoCorr.
__device__ __constant__ int d_nLag;     // number of lags evaluated per parameter
__device__ __constant__ int d_nDraws;   // number of parameters (set from n_Pars)
__device__ __constant__ int d_nEntries; // number of MCMC entries (steps) per parameter

// h_NAME declares HOST constants (live on CPU)
// Host-side mirrors of the device constants; -1 marks "not yet initialised".
static int h_nLag = -1;
static int h_nDraws = -1;
static int h_nEntries = -1;
16 
17 // *******************************************
18 // INITIALISE GPU
19 // *******************************************
20 
// *******************************************
/// @brief KS: Initialiser, here we allocate memory for variables and copy constants.
/// @param ParStep_gpu      [out] device buffer, parameter value at each step (n_Pars * n_Entries floats)
/// @param NumeratorSum_gpu [out] device buffer, LagL numerator sums (n_Lags * n_Pars floats)
/// @param ParamSums_gpu    [out] device buffer, per-parameter mean values (n_Pars floats)
/// @param DenomSum_gpu     [out] device buffer, LagL denominator sums (n_Lags * n_Pars floats)
/// @param n_Entries number of MCMC entries (steps)
/// @param n_Pars   number of parameters
/// @param n_Lags   number of lags to evaluate
__host__ void InitGPU_AutoCorr(
    float **ParStep_gpu,
    float **NumeratorSum_gpu,
    float **ParamSums_gpu,
    float **DenomSum_gpu,

    int n_Entries,
    int n_Pars,
    const int n_Lags) {
// *******************************************
  // Write to the global statics (h_* denotes host stored variable)
  h_nDraws = n_Pars;
  h_nLag = n_Lags;
  h_nEntries = n_Entries;

  // Copy the constants into GPU constant memory, checking each transfer
  cudaMemcpyToSymbol(d_nLag, &h_nLag, sizeof(h_nLag));
  CudaCheckError();

  cudaMemcpyToSymbol(d_nDraws, &h_nDraws, sizeof(h_nDraws));
  CudaCheckError();

  cudaMemcpyToSymbol(d_nEntries, &h_nEntries, sizeof(h_nEntries));
  CudaCheckError();

  // Allocate chunks of memory to GPU
  // Numerator which is directly used for calculating LagL
  cudaMalloc((void **) NumeratorSum_gpu, h_nLag*h_nDraws*sizeof(float));
  CudaCheckError();

  // Denominator which is directly used for calculating LagL
  cudaMalloc((void **) DenomSum_gpu, h_nLag*h_nDraws*sizeof(float));
  CudaCheckError();

  // Mean value for a given parameter
  cudaMalloc((void **) ParamSums_gpu, h_nDraws*sizeof(float));
  CudaCheckError();

  // Store value of parameter for each step
  // BUGFIX: was sizeof(float*) — the buffer holds floats, not pointers; every
  // cudaMemcpy of this buffer uses sizeof(float), so the old size over-allocated
  cudaMalloc((void **) ParStep_gpu, h_nDraws*h_nEntries*sizeof(float));
  CudaCheckError();

  printf(" Allocated in total %f MB for autocorrelations calculations on GPU\n",
         static_cast<double>(sizeof(float) * (h_nLag * h_nDraws + h_nLag * h_nDraws + h_nDraws + h_nDraws * h_nEntries)) / 1.0e6);
}
68 
69 // ******************************************************
70 // START COPY TO GPU
71 // ******************************************************
72 
// ******************************************************
/// @brief KS: Copy necessary variables from CPU to GPU.
/// All four *_gpu buffers must already have been allocated by InitGPU_AutoCorr,
/// and the CPU arrays must match the sizes recorded there (h_nLag/h_nDraws/h_nEntries).
__host__ void CopyToGPU_AutoCorr(
    float *ParStep_cpu,
    float *NumeratorSum_cpu,
    float *ParamSums_cpu,
    float *DenomSum_cpu,

    float *ParStep_gpu,
    float *NumeratorSum_gpu,
    float *ParamSums_gpu,
    float *DenomSum_gpu) {
// ******************************************************
  // Transfer sizes in bytes, derived from the dimensions stored at init time
  const size_t StepBytes = h_nDraws * h_nEntries * sizeof(float);
  const size_t MeanBytes = h_nDraws * sizeof(float);
  const size_t LagBytes  = h_nLag * h_nDraws * sizeof(float);

  // Parameter value at every MCMC step
  cudaMemcpy(ParStep_gpu, ParStep_cpu, StepBytes, cudaMemcpyHostToDevice);

  // Per-parameter mean value
  cudaMemcpy(ParamSums_gpu, ParamSums_cpu, MeanBytes, cudaMemcpyHostToDevice);

  // Numerator accumulator, directly used for calculating LagL
  cudaMemcpy(NumeratorSum_gpu, NumeratorSum_cpu, LagBytes, cudaMemcpyHostToDevice);

  // Denominator accumulator, directly used for calculating LagL
  cudaMemcpy(DenomSum_gpu, DenomSum_cpu, LagBytes, cudaMemcpyHostToDevice);

  CudaCheckError();
}
102 
103 
104 // ********************************************************
105 // START GPU KERNELS
106 //*********************************************************
107 
//*********************************************************
/// @brief Eval autocorrelations based on Box and Jenkins.
/// One thread per (parameter, lag) pair; launched as a 1D grid with
/// _BlockSize_ threads per block, rounded up — tail threads exit early.
/// @param ParStep_gpu   [in]  parameter value at each step, laid out as Param*d_nEntries + step
/// @param ParamSums_gpu [in]  per-parameter mean value
/// @param NumeratorSum_gpu [out] LagL numerator per (parameter, lag)
/// @param DenomSum_gpu     [out] LagL denominator per (parameter, lag)
__global__ void EvalOnGPU_AutoCorr(
    const float* __restrict__ ParStep_gpu,
    const float* __restrict__ ParamSums_gpu,
    float* NumeratorSum_gpu,
    float* DenomSum_gpu) {
//*********************************************************
  const unsigned int CurrentLagNum = (blockIdx.x * blockDim.x + threadIdx.x);

  // this is the stopping condition!
  if (CurrentLagNum < static_cast<unsigned int>(d_nLag*d_nDraws))
  {
    // KS: Each thread accumulates into its own registers — no data is exchanged
    // between threads, so the old __shared__ arrays bought nothing.
    // BUGFIX: removing them also removes the __syncthreads() that sat inside this
    // divergent branch (tail-block threads skipped the barrier: undefined behavior).
    float NumeratorSum = 0.0f;
    float DenomSum = 0.0f;

    // Decompose the flat thread index into (parameter, lag)
    const int Param = static_cast<int>(CurrentLagNum) / d_nLag;
    const int nLag  = static_cast<int>(CurrentLagNum) - Param*d_nLag;

    // Base offset of this parameter's chain, hoisted out of the loop
    // BUGFIX: index math previously went through fmaf(), which is only exact while
    // the product fits in a float mantissa (~2^24); integer arithmetic is always exact.
    const int ParBase = Param * d_nEntries;
    const float Mean = ParamSums_gpu[Param];

    // Loop over the number of entries
    for (int i = 0; i < d_nEntries; ++i)
    {
      const float Diff = ParStep_gpu[ParBase + i] - Mean;
      // Only sum the numerator up to i = N-k
      if (i < d_nEntries - nLag)
      {
        const float LagTerm = ParStep_gpu[ParBase + i + nLag] - Mean;
        NumeratorSum += Diff * LagTerm;
      }
      // Square the difference to form the denominator
      DenomSum += Diff * Diff;
    }

    // Single coalesced write of the finished sums to global memory
    NumeratorSum_gpu[CurrentLagNum] = NumeratorSum;
    DenomSum_gpu[CurrentLagNum] = DenomSum;
  }
}
157 
// *****************************************
/// @brief KS: This calls the main kernel responsible for calculating LagL and later copies results back to CPU.
/// Requires InitGPU_AutoCorr and CopyToGPU_AutoCorr to have run first.
/// @param ParStep_gpu      [in]  parameter value at each step (device)
/// @param ParamSums_gpu    [in]  per-parameter mean value (device)
/// @param NumeratorSum_gpu [out] LagL numerators (device scratch)
/// @param DenomSum_gpu     [out] LagL denominators (device scratch)
/// @param NumeratorSum_cpu [out] host destination for the numerators
/// @param DenomSum_cpu     [out] host destination for the denominators
__host__ void RunGPU_AutoCorr(
    float* ParStep_gpu,
    float* ParamSums_gpu,
    float* NumeratorSum_gpu,
    float* DenomSum_gpu,
    float* NumeratorSum_cpu,
    float* DenomSum_cpu) {
// *****************************************
  dim3 block_size;
  dim3 grid_size;

  block_size.x = _BlockSize_;
  // BUGFIX: ceil-division; the old (n / block) + 1 launched one redundant empty
  // block whenever the work size was an exact multiple of the block size
  grid_size.x = (h_nLag*h_nDraws + block_size.x - 1) / block_size.x;

  // One thread per (parameter, lag) pair
  EvalOnGPU_AutoCorr<<<grid_size, block_size>>>(
    ParStep_gpu,
    ParamSums_gpu,
    NumeratorSum_gpu,
    DenomSum_gpu);
  CudaCheckError();

  printf(" Finished calculating now copying results back to CPU \n");

  // KS: Finally copy the results from GPU to CPU (blocking copy also syncs the kernel)
  cudaMemcpy(NumeratorSum_cpu, NumeratorSum_gpu, h_nLag*h_nDraws*sizeof(float), cudaMemcpyDeviceToHost);
  CudaCheckError();

  cudaMemcpy(DenomSum_cpu, DenomSum_gpu, h_nLag*h_nDraws*sizeof(float), cudaMemcpyDeviceToHost);
  CudaCheckError();
}
190 
191 // *********************************
192 // CLEANING
193 // *********************************
194 
// *********************************
/// @brief KS: free memory on gpu.
/// Releases the four device buffers allocated by InitGPU_AutoCorr.
__host__ void CleanupGPU_AutoCorr(
    float *ParStep_gpu,
    float *NumeratorSum_gpu,
    float *ParamSums_gpu,
    float *DenomSum_gpu) {
// *********************************
  // Free every device allocation in one pass
  float* DeviceBuffers[] = {ParStep_gpu, NumeratorSum_gpu, ParamSums_gpu, DenomSum_gpu};
  for (float* Buffer : DeviceBuffers) {
    cudaFree(Buffer);
  }

  printf(" Cleared memory at GPU, I am free \n");
}
__device__ __constant__ int d_nEntries
__global__ void EvalOnGPU_AutoCorr(const float *__restrict__ ParStep_gpu, const float *__restrict__ ParamSums_gpu, float *NumeratorSum_gpu, float *DenomSum_gpu)
Eval autocorrelations based on Box and Jenkins.
static int h_nLag
__host__ void InitGPU_AutoCorr(float **ParStep_gpu, float **NumeratorSum_gpu, float **ParamSums_gpu, float **DenomSum_gpu, int n_Entries, int n_Pars, const int n_Lags)
KS: Initialiser, here we allocate memory for variables and copy constants.
__host__ void CopyToGPU_AutoCorr(float *ParStep_cpu, float *NumeratorSum_cpu, float *ParamSums_cpu, float *DenomSum_cpu, float *ParStep_gpu, float *NumeratorSum_gpu, float *ParamSums_gpu, float *DenomSum_gpu)
KS: Copy necessary variables from CPU to GPU.
__host__ void RunGPU_AutoCorr(float *ParStep_gpu, float *ParamSums_gpu, float *NumeratorSum_gpu, float *DenomSum_gpu, float *NumeratorSum_cpu, float *DenomSum_cpu)
KS: This calls the main kernel responsible for calculating LagL and later copies the results back to the CPU.
__host__ void CleanupGPU_AutoCorr(float *ParStep_gpu, float *NumeratorSum_gpu, float *ParamSums_gpu, float *DenomSum_gpu)
KS: Free memory on the GPU.
__device__ __constant__ int d_nDraws
static int h_nEntries
__device__ __constant__ int d_nLag
static int h_nDraws
#define CudaCheckError()
Definition: gpuUtils.cuh:22
#define _BlockSize_
KS: Need it for shared memory, there is way to use dynamic shared memory but I am lazy right now.
Definition: gpuUtils.cuh:25