8 __device__ __constant__
int d_nLag;
69 printf(
" Allocated in total %f MB for autocorrelations calculations on GPU\n",
81 float *NumeratorSum_cpu,
83 float *DenomSum_cpu) {
110 const float* __restrict__ ParStep_gpu,
111 const float* __restrict__ ParamSums_gpu,
112 float* NumeratorSum_gpu,
113 float* DenomSum_gpu) {
115 const unsigned int CurrentLagNum = (blockIdx.x * blockDim.x + threadIdx.x);
124 shared_NumeratorSum[threadIdx.x] = 0;
125 shared_DenomSum[threadIdx.x] = 0;
128 const int Param = int(CurrentLagNum/
d_nLag);
129 const int nLag = CurrentLagNum - Param*
d_nLag;
135 const float Diff = ParStep_gpu[CurrParStep]-ParamSums_gpu[Param];
140 CurrParStep = fmaf(Param,
d_nEntries, i + nLag);
141 const float LagTerm = ParStep_gpu[CurrParStep]-ParamSums_gpu[Param];
142 const float Product = Diff*LagTerm;
143 shared_NumeratorSum[threadIdx.x] += Product;
146 const float Denom = Diff*Diff;
147 shared_DenomSum[threadIdx.x] += Denom;
152 NumeratorSum_gpu[CurrentLagNum] = shared_NumeratorSum[threadIdx.x];
153 DenomSum_gpu[CurrentLagNum] = shared_DenomSum[threadIdx.x];
160 float* DenomSum_cpu) {
168 EvalOnGPU_AutoCorr<<<grid_size, block_size>>>(
175 printf(
" Finished calculating now copying results back to CPU \n");
202 printf(
" Cleared memory at GPU, I am free \n");
__host__ void InitGPU_AutoCorr(int n_Entries, int n_Pars, const int n_Lags)
KS: Initialiser: here we allocate memory for variables and copy constants to the GPU.
virtual ~MCMCProcessorGPU()
destructor
__host__ void CleanupGPU_AutoCorr()
KS: Free the memory previously allocated on the GPU.
MCMCProcessorGPU()
constructor
__host__ void CopyToGPU_AutoCorr(float *ParStep_cpu, float *NumeratorSum_cpu, float *ParamSums_cpu, float *DenomSum_cpu)
KS: Copy necessary variables from CPU to GPU.
__host__ void RunGPU_AutoCorr(float *NumeratorSum_cpu, float *DenomSum_cpu)
KS: This calls the main kernel responsible for calculating the lag-L autocorrelation terms and later copies the results back to the CPU.
float * ParStep_gpu
Value of each parameter at every step, stored on the GPU.
__device__ __constant__ int d_nEntries
__global__ void EvalOnGPU_AutoCorr(const float *__restrict__ ParStep_gpu, const float *__restrict__ ParamSums_gpu, float *NumeratorSum_gpu, float *DenomSum_gpu)
Evaluate autocorrelations based on the Box–Jenkins method.
__device__ __constant__ int d_nDraws
__device__ __constant__ int d_nLag
#define _BlockSize_
KS: Needed for shared memory; there is a way to use dynamic shared memory instead, but I am lazy right now.