MaCh3 2.2.1
Reference Guide
Loading...
Searching...
No Matches
Functions | Variables
gpuMCMCProcessorUtils.cu File Reference
#include "Fitters/gpuMCMCProcessorUtils.cuh"
Include dependency graph for gpuMCMCProcessorUtils.cu:

Go to the source code of this file.

Functions

__host__ void InitGPU_AutoCorr (float **ParStep_gpu, float **NumeratorSum_gpu, float **ParamSums_gpu, float **DenomSum_gpu, int n_Entries, int n_Pars, const int n_Lags)
 KS: Initialiser, here we allocate memory for variables and copy constants.
 
__host__ void CopyToGPU_AutoCorr (float *ParStep_cpu, float *NumeratorSum_cpu, float *ParamSums_cpu, float *DenomSum_cpu, float *ParStep_gpu, float *NumeratorSum_gpu, float *ParamSums_gpu, float *DenomSum_gpu)
 KS: Copy necessary variables from CPU to GPU.
 
__global__ void EvalOnGPU_AutoCorr (const float *__restrict__ ParStep_gpu, const float *__restrict__ ParamSums_gpu, float *NumeratorSum_gpu, float *DenomSum_gpu)
 Eval autocorrelations based on Box and Jenkins.
 
__host__ void RunGPU_AutoCorr (float *ParStep_gpu, float *ParamSums_gpu, float *NumeratorSum_gpu, float *DenomSum_gpu, float *NumeratorSum_cpu, float *DenomSum_cpu)
 KS: This calls the main kernel responsible for calculating LagL and later copies the results back to the CPU.
 
__host__ void CleanupGPU_AutoCorr (float *ParStep_gpu, float *NumeratorSum_gpu, float *ParamSums_gpu, float *DenomSum_gpu)
 KS: Free memory on the GPU.
 

Variables

__device__ __constant__ int d_nLag
 
__device__ __constant__ int d_nDraws
 
__device__ __constant__ int d_nEntries
 
static int h_nLag = -1
 
static int h_nDraws = -1
 
static int h_nEntries = -1
 

Function Documentation

◆ CleanupGPU_AutoCorr()

__host__ void CleanupGPU_AutoCorr ( float *  ParStep_gpu,
float *  NumeratorSum_gpu,
float *  ParamSums_gpu,
float *  DenomSum_gpu 
)

KS: Free memory on the GPU.

Parameters
ParStep_gpu — Parameter value at each step
NumeratorSum_gpu — Sum used for the numerator of the autocorrelation calculations
ParamSums_gpu — Overall sum for each parameter over all steps
DenomSum_gpu — Sum used for the denominator of the autocorrelation calculations

Definition at line 197 of file gpuMCMCProcessorUtils.cu.

201 {
202// *********************************
203 cudaFree(ParStep_gpu);
204 cudaFree(NumeratorSum_gpu);
205 cudaFree(ParamSums_gpu);
206 cudaFree(DenomSum_gpu);
207
208 printf(" Cleared memory at GPU, I am free \n");
209}

◆ CopyToGPU_AutoCorr()

__host__ void CopyToGPU_AutoCorr ( float *  ParStep_cpu,
float *  NumeratorSum_cpu,
float *  ParamSums_cpu,
float *  DenomSum_cpu,
float *  ParStep_gpu,
float *  NumeratorSum_gpu,
float *  ParamSums_gpu,
float *  DenomSum_gpu 
)

KS: Copy necessary variables from CPU to GPU.

Parameters
ParStep_cpu — Parameter value at each step
NumeratorSum_cpu — Sum used for the numerator of the autocorrelation calculations
ParamSums_cpu — Overall sum for each parameter over all steps
DenomSum_cpu — Sum used for the denominator of the autocorrelation calculations
ParStep_gpu — Parameter value at each step
NumeratorSum_gpu — Sum used for the numerator of the autocorrelation calculations
ParamSums_gpu — Overall sum for each parameter over all steps
DenomSum_gpu — Sum used for the denominator of the autocorrelation calculations

Definition at line 75 of file gpuMCMCProcessorUtils.cu.

84 {
85// ******************************************************
86 //store value of parameter for each step
87 cudaMemcpy(ParStep_gpu, ParStep_cpu, h_nDraws*h_nEntries*sizeof(float), cudaMemcpyHostToDevice);
89
90 //Mean value for a given parameter
91 cudaMemcpy(ParamSums_gpu, ParamSums_cpu, h_nDraws*sizeof(float), cudaMemcpyHostToDevice);
93
94 //Numerator which is directly used for calculating LagL
95 cudaMemcpy(NumeratorSum_gpu, NumeratorSum_cpu, h_nLag*h_nDraws*sizeof(float), cudaMemcpyHostToDevice);
97
98 //Denominator which is directly used for calculating LagL
99 cudaMemcpy(DenomSum_gpu, DenomSum_cpu, h_nLag*h_nDraws*sizeof(float), cudaMemcpyHostToDevice);
101}
static int h_nLag
static int h_nEntries
static int h_nDraws
#define CudaCheckError()
Definition: gpuUtils.cuh:22

◆ EvalOnGPU_AutoCorr()

__global__ void EvalOnGPU_AutoCorr ( const float *__restrict__  ParStep_gpu,
const float *__restrict__  ParamSums_gpu,
float *  NumeratorSum_gpu,
float *  DenomSum_gpu 
)

Eval autocorrelations based on Box and Jenkins.

Parameters
ParStep_gpu — Parameter value at each step
NumeratorSum_gpu — Sum used for the numerator of the autocorrelation calculations
ParamSums_gpu — Overall sum for each parameter over all steps
DenomSum_gpu — Sum used for the denominator of the autocorrelation calculations

Definition at line 110 of file gpuMCMCProcessorUtils.cu.

114 {
115//*********************************************************
116 const unsigned int CurrentLagNum = (blockIdx.x * blockDim.x + threadIdx.x);
117
118 //KS: Accessing shared memory is much much faster than global memory hence we use shared memory for calculation and then write to global memory
119 __shared__ float shared_NumeratorSum[_BlockSize_];
120 __shared__ float shared_DenomSum[_BlockSize_];
121
122 // this is the stopping condition!
123 if (CurrentLagNum < d_nLag*d_nDraws)
124 {
125 shared_NumeratorSum[threadIdx.x] = 0;
126 shared_DenomSum[threadIdx.x] = 0;
127
128 //KS: Might consider caching this information, which MIGHT be faster too lazy right now
129 const int Param = int(CurrentLagNum/d_nLag);
130 const int nLag = CurrentLagNum - Param*d_nLag;
131 // Loop over the number of entries
132 for (int i = 0; i < d_nEntries; ++i)
133 {
134 //KS: Use fmaf to have it tiny bit faster, for something easier to read: Param*d_nEntries + i
135 int CurrParStep = fmaf(Param, d_nEntries, i);
136 const float Diff = ParStep_gpu[CurrParStep]-ParamSums_gpu[Param];
137 // Only sum the numerator up to i = N-k
138 if (i < d_nEntries-nLag)
139 {
140 //KS: Use fmaf to have it tiny bit faster, for something easier to read: Param*d_nEntries + (i + nLag)
141 CurrParStep = fmaf(Param, d_nEntries, i + nLag);
142 const float LagTerm = ParStep_gpu[CurrParStep]-ParamSums_gpu[Param];
143 const float Product = Diff*LagTerm;
144 shared_NumeratorSum[threadIdx.x] += Product;
145 }
146 // Square the difference to form the denominator
147 const float Denom = Diff*Diff;
148 shared_DenomSum[threadIdx.x] += Denom;
149 }
150
151 //KS: Make sure threads are synchronised before moving to global memory
152 __syncthreads();
153 NumeratorSum_gpu[CurrentLagNum] = shared_NumeratorSum[threadIdx.x];
154 DenomSum_gpu[CurrentLagNum] = shared_DenomSum[threadIdx.x];
155 }
156}
__device__ __constant__ int d_nEntries
__device__ __constant__ int d_nDraws
__device__ __constant__ int d_nLag
#define _BlockSize_
KS: Need it for shared memory, there is way to use dynamic shared memory but I am lazy right now.
Definition: gpuUtils.cuh:25

◆ InitGPU_AutoCorr()

__host__ void InitGPU_AutoCorr ( float **  ParStep_gpu,
float **  NumeratorSum_gpu,
float **  ParamSums_gpu,
float **  DenomSum_gpu,
int  n_Entries,
int  n_Pars,
const int  n_Lags 
)

KS: Initialiser, here we allocate memory for variables and copy constants.

Parameters
ParStep_gpu — Parameter value at each step
NumeratorSum_gpu — Sum used for the numerator of the autocorrelation calculations
ParamSums_gpu — Overall sum for each parameter over all steps
DenomSum_gpu — Sum used for the denominator of the autocorrelation calculations
n_Entries — Total number of entries in the MCMC chain
n_Pars — Number of relevant parameters
n_Lags — Value of the lag in the autocorrelation calculation

Definition at line 23 of file gpuMCMCProcessorUtils.cu.

31 {
32// *******************************************
33 // Write to the global statics (h_* denotes host stored variable)
34 h_nDraws = n_Pars;
35 h_nLag = n_Lags;
36 h_nEntries = n_Entries;
37
38 // Copy the constants
39 cudaMemcpyToSymbol(d_nLag, &h_nLag, sizeof(h_nLag));
41
42 cudaMemcpyToSymbol(d_nDraws, &h_nDraws, sizeof(h_nDraws));
44
45 cudaMemcpyToSymbol(d_nEntries, &h_nEntries, sizeof(h_nEntries));
47
48 // Allocate chunks of memory to GPU
49 //Numerator which is directly used for calculating LagL
50 cudaMalloc((void **) NumeratorSum_gpu, h_nLag*h_nDraws*sizeof(float));
52
53 //Denominator which is directly used for calculating LagL
54 cudaMalloc((void **) DenomSum_gpu, h_nLag*h_nDraws*sizeof(float));
56
57 //Mean value for a given parameter
58 cudaMalloc((void **) ParamSums_gpu, h_nDraws*sizeof(float));
60
61 //store value of parameter for each step
62 cudaMalloc((void **) ParStep_gpu, h_nDraws*h_nEntries*sizeof(float*));
64
65 printf(" Allocated in total %f MB for autocorrelations calculations on GPU\n",
66 static_cast<double>(sizeof(float) * (h_nLag * h_nDraws + h_nLag * h_nDraws + h_nDraws + h_nDraws * h_nEntries)) / 1.0e6);
67}

◆ RunGPU_AutoCorr()

__host__ void RunGPU_AutoCorr ( float *  ParStep_gpu,
float *  ParamSums_gpu,
float *  NumeratorSum_gpu,
float *  DenomSum_gpu,
float *  NumeratorSum_cpu,
float *  DenomSum_cpu 
)

KS: This calls the main kernel responsible for calculating LagL and later copies the results back to the CPU.

Parameters
ParStep_gpu — Parameter value at each step
NumeratorSum_gpu — Sum used for the numerator of the autocorrelation calculations
ParamSums_gpu — Overall sum for each parameter over all steps
DenomSum_gpu — Sum used for the denominator of the autocorrelation calculations
NumeratorSum_cpu — Sum used for the numerator of the autocorrelation calculations
DenomSum_cpu — Sum used for the denominator of the autocorrelation calculations

Definition at line 160 of file gpuMCMCProcessorUtils.cu.

166 {
167// *****************************************
168 dim3 block_size;
169 dim3 grid_size;
170
171 block_size.x = _BlockSize_;
172 grid_size.x = (h_nLag*h_nDraws / block_size.x) + 1;
173
174 EvalOnGPU_AutoCorr<<<grid_size, block_size>>>(
175 ParStep_gpu,
176 ParamSums_gpu,
177 NumeratorSum_gpu,
178 DenomSum_gpu);
180
181 printf(" Finished calculating now copying results back to CPU \n");
182
183 //KS: Finally copy paste memory from GPU to CPU
184 cudaMemcpy(NumeratorSum_cpu, NumeratorSum_gpu, h_nLag*h_nDraws*sizeof(float), cudaMemcpyDeviceToHost);
186
187 cudaMemcpy(DenomSum_cpu, DenomSum_gpu, h_nLag*h_nDraws*sizeof(float), cudaMemcpyDeviceToHost);
189}

Variable Documentation

◆ d_nDraws

__device__ __constant__ int d_nDraws

Definition at line 9 of file gpuMCMCProcessorUtils.cu.

◆ d_nEntries

__device__ __constant__ int d_nEntries

Definition at line 10 of file gpuMCMCProcessorUtils.cu.

◆ d_nLag

__device__ __constant__ int d_nLag

Definition at line 8 of file gpuMCMCProcessorUtils.cu.

◆ h_nDraws

int h_nDraws = -1
static

Definition at line 14 of file gpuMCMCProcessorUtils.cu.

◆ h_nEntries

int h_nEntries = -1
static

Definition at line 15 of file gpuMCMCProcessorUtils.cu.

◆ h_nLag

int h_nLag = -1
static

Definition at line 13 of file gpuMCMCProcessorUtils.cu.