MaCh3 2.2.1
Reference Guide
gpuMCMCProcessorUtils.cu
Go to the documentation of this file.
// ******************************************
// CONSTANTS
// ******************************************

// d_NAME declares DEVICE constants (live on GPU)
__device__ __constant__ int d_nLag;
__device__ __constant__ int d_nDraws;
__device__ __constant__ int d_nEntries;

// h_NAME declares HOST constants (live on CPU)
static int h_nLag = -1;
static int h_nDraws = -1;
static int h_nEntries = -1;

// *******************************************
// INITIALISE GPU
// *******************************************

21// *******************************************
23__host__ void InitGPU_AutoCorr(
24 float **ParStep_gpu,
25 float **NumeratorSum_gpu,
26 float **ParamSums_gpu,
27 float **DenomSum_gpu,
28
29 int n_Entries,
30 int n_Pars,
31 const int n_Lags) {
32// *******************************************
33 // Write to the global statics (h_* denotes host stored variable)
34 h_nDraws = n_Pars;
35 h_nLag = n_Lags;
36 h_nEntries = n_Entries;
37
38 // Copy the constants
39 cudaMemcpyToSymbol(d_nLag, &h_nLag, sizeof(h_nLag));
41
42 cudaMemcpyToSymbol(d_nDraws, &h_nDraws, sizeof(h_nDraws));
44
45 cudaMemcpyToSymbol(d_nEntries, &h_nEntries, sizeof(h_nEntries));
47
48 // Allocate chunks of memory to GPU
49 //Numerator which is directly used for calculating LagL
50 cudaMalloc((void **) NumeratorSum_gpu, h_nLag*h_nDraws*sizeof(float));
52
53 //Denominator which is directly used for calculating LagL
54 cudaMalloc((void **) DenomSum_gpu, h_nLag*h_nDraws*sizeof(float));
56
57 //Mean value for a given parameter
58 cudaMalloc((void **) ParamSums_gpu, h_nDraws*sizeof(float));
60
61 //store value of paramter for each step
62 cudaMalloc((void **) ParStep_gpu, h_nDraws*h_nEntries*sizeof(float*));
64
65 printf(" Allocated in total %f MB for autocorrelations calculations on GPU\n",
66 static_cast<double>(sizeof(float) * (h_nLag * h_nDraws + h_nLag * h_nDraws + h_nDraws + h_nDraws * h_nEntries)) / 1.0e6);
67}
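To put the allocation printout just above into numbers, here is a worked example with purely illustrative values that nothing in this file fixes: 1000 lags, 100 parameters (draws) and 10000 chain entries, with 4-byte floats:

\[
4\,\big(2\,n_{\mathrm{Lag}}\,n_{\mathrm{Draws}} + n_{\mathrm{Draws}} + n_{\mathrm{Draws}}\,n_{\mathrm{Entries}}\big)
= 4\,(2 \cdot 1000 \cdot 100 + 100 + 100 \cdot 10000)
= 4\,800\,400\ \text{bytes} \approx 4.8\ \text{MB},
\]

which is the figure the printf above would report for those values.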

// ******************************************************
// START COPY TO GPU
// ******************************************************

// ******************************************************
// KS: Copy necessary variables from CPU to GPU
__host__ void CopyToGPU_AutoCorr(
    float *ParStep_cpu,
    float *NumeratorSum_cpu,
    float *ParamSums_cpu,
    float *DenomSum_cpu,

    float *ParStep_gpu,
    float *NumeratorSum_gpu,
    float *ParamSums_gpu,
    float *DenomSum_gpu) {
// ******************************************************
  // Store the value of each parameter at each step
  cudaMemcpy(ParStep_gpu, ParStep_cpu, h_nDraws*h_nEntries*sizeof(float), cudaMemcpyHostToDevice);
  CudaCheckError();

  // Mean value for a given parameter
  cudaMemcpy(ParamSums_gpu, ParamSums_cpu, h_nDraws*sizeof(float), cudaMemcpyHostToDevice);
  CudaCheckError();

  // Numerator, which is directly used for calculating LagL
  cudaMemcpy(NumeratorSum_gpu, NumeratorSum_cpu, h_nLag*h_nDraws*sizeof(float), cudaMemcpyHostToDevice);
  CudaCheckError();

  // Denominator, which is directly used for calculating LagL
  cudaMemcpy(DenomSum_gpu, DenomSum_cpu, h_nLag*h_nDraws*sizeof(float), cudaMemcpyHostToDevice);
  CudaCheckError();
}

// ********************************************************
// START GPU KERNELS
// ********************************************************

// ********************************************************
// Eval autocorrelations based on Box and Jenkins
__global__ void EvalOnGPU_AutoCorr(
    const float* __restrict__ ParStep_gpu,
    const float* __restrict__ ParamSums_gpu,
    float* NumeratorSum_gpu,
    float* DenomSum_gpu) {
// ********************************************************
  const unsigned int CurrentLagNum = (blockIdx.x * blockDim.x + threadIdx.x);

  //KS: Accessing shared memory is much faster than global memory, hence we accumulate in shared memory and only write to global memory at the end
  __shared__ float shared_NumeratorSum[_BlockSize_];
  __shared__ float shared_DenomSum[_BlockSize_];

  // this is the stopping condition!
  if (CurrentLagNum < d_nLag*d_nDraws)
  {
    shared_NumeratorSum[threadIdx.x] = 0;
    shared_DenomSum[threadIdx.x] = 0;

    //KS: Might consider caching this information, which MIGHT be faster; too lazy right now
    const int Param = int(CurrentLagNum/d_nLag);
    const int nLag = CurrentLagNum - Param*d_nLag;
    // Loop over the number of entries
    for (int i = 0; i < d_nEntries; ++i)
    {
      //KS: Use fmaf to make it a tiny bit faster; for something easier to read: Param*d_nEntries + i
      int CurrParStep = fmaf(Param, d_nEntries, i);
      const float Diff = ParStep_gpu[CurrParStep]-ParamSums_gpu[Param];
      // Only sum the numerator up to i = N-k
      if (i < d_nEntries-nLag)
      {
        //KS: Use fmaf to make it a tiny bit faster; for something easier to read: Param*d_nEntries + (i + nLag)
        CurrParStep = fmaf(Param, d_nEntries, i + nLag);
        const float LagTerm = ParStep_gpu[CurrParStep]-ParamSums_gpu[Param];
        const float Product = Diff*LagTerm;
        shared_NumeratorSum[threadIdx.x] += Product;
      }
      // Square the difference to form the denominator
      const float Denom = Diff*Diff;
      shared_DenomSum[threadIdx.x] += Denom;
    }

    //KS: Make sure threads are synchronised before moving to global memory
    __syncthreads();
    NumeratorSum_gpu[CurrentLagNum] = shared_NumeratorSum[threadIdx.x];
    DenomSum_gpu[CurrentLagNum] = shared_DenomSum[threadIdx.x];
  }
}
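For reference, each thread above owns one (parameter, lag k) pair and accumulates the two sums of the Box and Jenkins lag-k autocorrelation estimate. In the notation below (mine, not the code's), x_i is the parameter value at step i, x̄ its mean as held in ParamSums_gpu, and N the number of entries:

\[
\hat{\rho}_k \;=\; \frac{\sum_{i=0}^{N-k-1} (x_i - \bar{x})\,(x_{i+k} - \bar{x})}
                        {\sum_{i=0}^{N-1} (x_i - \bar{x})^2}
\]

NumeratorSum_gpu holds the numerator and DenomSum_gpu the denominator for every (parameter, lag) index; the ratio itself is presumably taken by the caller once RunGPU_AutoCorr below has copied both arrays back to the CPU.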

// *****************************************
// KS: This calls the main kernel responsible for calculating LagL and later copies the results back to the CPU
__host__ void RunGPU_AutoCorr(
    float* ParStep_gpu,
    float* ParamSums_gpu,
    float* NumeratorSum_gpu,
    float* DenomSum_gpu,
    float* NumeratorSum_cpu,
    float* DenomSum_cpu) {
// *****************************************
  dim3 block_size;
  dim3 grid_size;

  block_size.x = _BlockSize_;
  grid_size.x = (h_nLag*h_nDraws / block_size.x) + 1;

  EvalOnGPU_AutoCorr<<<grid_size, block_size>>>(
      ParStep_gpu,
      ParamSums_gpu,
      NumeratorSum_gpu,
      DenomSum_gpu);
  CudaCheckError();

  printf(" Finished calculating, now copying results back to CPU \n");

  //KS: Finally copy the memory from GPU back to CPU
  cudaMemcpy(NumeratorSum_cpu, NumeratorSum_gpu, h_nLag*h_nDraws*sizeof(float), cudaMemcpyDeviceToHost);
  CudaCheckError();

  cudaMemcpy(DenomSum_cpu, DenomSum_gpu, h_nLag*h_nDraws*sizeof(float), cudaMemcpyDeviceToHost);
  CudaCheckError();
}

// *********************************
// CLEANING
// *********************************

// *********************************
// KS: Free memory on the GPU
__host__ void CleanupGPU_AutoCorr(
    float *ParStep_gpu,
    float *NumeratorSum_gpu,
    float *ParamSums_gpu,
    float *DenomSum_gpu) {
// *********************************
  cudaFree(ParStep_gpu);
  cudaFree(NumeratorSum_gpu);
  cudaFree(ParamSums_gpu);
  cudaFree(DenomSum_gpu);

  printf(" Cleared memory at GPU, I am free \n");
}
Documented symbols in this file:

__device__ __constant__ int d_nLag, d_nDraws, d_nEntries
    Device constants (live on GPU).
static int h_nLag, h_nDraws, h_nEntries
    Host constants (live on CPU).
__host__ void InitGPU_AutoCorr(float **ParStep_gpu, float **NumeratorSum_gpu, float **ParamSums_gpu, float **DenomSum_gpu, int n_Entries, int n_Pars, const int n_Lags)
    KS: Initialiser, here we allocate memory for variables and copy constants.
__host__ void CopyToGPU_AutoCorr(float *ParStep_cpu, float *NumeratorSum_cpu, float *ParamSums_cpu, float *DenomSum_cpu, float *ParStep_gpu, float *NumeratorSum_gpu, float *ParamSums_gpu, float *DenomSum_gpu)
    KS: Copy necessary variables from CPU to GPU.
__global__ void EvalOnGPU_AutoCorr(const float *__restrict__ ParStep_gpu, const float *__restrict__ ParamSums_gpu, float *NumeratorSum_gpu, float *DenomSum_gpu)
    Eval autocorrelations based on Box and Jenkins.
__host__ void RunGPU_AutoCorr(float *ParStep_gpu, float *ParamSums_gpu, float *NumeratorSum_gpu, float *DenomSum_gpu, float *NumeratorSum_cpu, float *DenomSum_cpu)
    KS: Calls the main kernel responsible for calculating LagL, then copies the results back to the CPU.
__host__ void CleanupGPU_AutoCorr(float *ParStep_gpu, float *NumeratorSum_gpu, float *ParamSums_gpu, float *DenomSum_gpu)
    KS: Free memory on the GPU.
#define CudaCheckError()
    Definition: gpuUtils.cuh:22
#define _BlockSize_
    KS: Needed for shared memory; there is a way to use dynamic shared memory but I am lazy right now. Definition: gpuUtils.cuh:25
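Finally, a minimal sketch of how a caller might drive the four host functions in this file, in order. The prototypes are copied from above; the buffer layout, the nEntries/nDraws/nLags values, the AutoCorr post-processing and the AutoCorrOnGPU_Sketch wrapper itself are illustrative assumptions, not part of MaCh3.

#include <cstdio>
#include <vector>

// Prototypes as defined in this file (normally pulled in through the matching .cuh header)
__host__ void InitGPU_AutoCorr(float **ParStep_gpu, float **NumeratorSum_gpu,
                               float **ParamSums_gpu, float **DenomSum_gpu,
                               int n_Entries, int n_Pars, const int n_Lags);
__host__ void CopyToGPU_AutoCorr(float *ParStep_cpu, float *NumeratorSum_cpu,
                                 float *ParamSums_cpu, float *DenomSum_cpu,
                                 float *ParStep_gpu, float *NumeratorSum_gpu,
                                 float *ParamSums_gpu, float *DenomSum_gpu);
__host__ void RunGPU_AutoCorr(float *ParStep_gpu, float *ParamSums_gpu,
                              float *NumeratorSum_gpu, float *DenomSum_gpu,
                              float *NumeratorSum_cpu, float *DenomSum_cpu);
__host__ void CleanupGPU_AutoCorr(float *ParStep_gpu, float *NumeratorSum_gpu,
                                  float *ParamSums_gpu, float *DenomSum_gpu);

// Hypothetical driver: ParStep_cpu is the flattened chain (nDraws*nEntries values,
// parameter-major) and ParamSums_cpu holds the per-parameter means.
void AutoCorrOnGPU_Sketch(std::vector<float> &ParStep_cpu,
                          std::vector<float> &ParamSums_cpu,
                          const int nEntries, const int nDraws, const int nLags) {
  // Host-side output buffers for the two sums the kernel fills
  std::vector<float> NumeratorSum_cpu(nLags * nDraws, 0.0f);
  std::vector<float> DenomSum_cpu(nLags * nDraws, 0.0f);

  // Device pointers, allocated by InitGPU_AutoCorr
  float *ParStep_gpu = nullptr, *NumeratorSum_gpu = nullptr;
  float *ParamSums_gpu = nullptr, *DenomSum_gpu = nullptr;

  // 1) Allocate device memory and upload the d_* constants
  InitGPU_AutoCorr(&ParStep_gpu, &NumeratorSum_gpu, &ParamSums_gpu, &DenomSum_gpu,
                   nEntries, nDraws, nLags);

  // 2) Copy the chain, the means and the (zeroed) sums to the device
  CopyToGPU_AutoCorr(ParStep_cpu.data(), NumeratorSum_cpu.data(),
                     ParamSums_cpu.data(), DenomSum_cpu.data(),
                     ParStep_gpu, NumeratorSum_gpu, ParamSums_gpu, DenomSum_gpu);

  // 3) Launch EvalOnGPU_AutoCorr and copy the two sums back to the host
  RunGPU_AutoCorr(ParStep_gpu, ParamSums_gpu, NumeratorSum_gpu, DenomSum_gpu,
                  NumeratorSum_cpu.data(), DenomSum_cpu.data());

  // 4) Form the lag-k autocorrelation for each parameter on the CPU
  std::vector<float> AutoCorr(nLags * nDraws);
  for (int p = 0; p < nDraws; ++p)
    for (int k = 0; k < nLags; ++k)
      AutoCorr[p * nLags + k] = NumeratorSum_cpu[p * nLags + k] / DenomSum_cpu[p * nLags + k];
  printf("Lag-1 autocorrelation of parameter 0: %f\n", AutoCorr[1]);

  // 5) Release the device memory
  CleanupGPU_AutoCorr(ParStep_gpu, NumeratorSum_gpu, ParamSums_gpu, DenomSum_gpu);
}

Note that CopyToGPU_AutoCorr uploads the numerator and denominator buffers as well, which is why the sketch zero-initialises them on the host before the copy.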