// Device-side constants for the autocorrelation kernel
__device__ __constant__ int d_nLag;
__device__ __constant__ int d_nDraws;
__device__ __constant__ int d_nEntries;

// Host-side mirrors of the above, filled in InitGPU_AutoCorr
int h_nLag;
int h_nDraws;
int h_nEntries;
// KS: Initialiser, here we allocate memory for variables and copy constants
__host__ void InitGPU_AutoCorr(
    float **ParStep_gpu,
    float **NumeratorSum_gpu,
    float **ParamSums_gpu,
    float **DenomSum_gpu,
    int n_Entries,
    int n_Pars,
    const int n_Lags) {
  // Keep host-side copies of the sizes and mirror them into the device constants
  h_nEntries = n_Entries;
  h_nDraws = n_Pars;
  h_nLag = n_Lags;
  cudaMemcpyToSymbol(d_nEntries, &h_nEntries, sizeof(h_nEntries));
  cudaMemcpyToSymbol(d_nDraws, &h_nDraws, sizeof(h_nDraws));
  cudaMemcpyToSymbol(d_nLag, &h_nLag, sizeof(h_nLag));

  // Every step of every parameter's chain
  cudaMalloc((void **) ParStep_gpu, h_nDraws*h_nEntries*sizeof(float));

  // One numerator and one denominator accumulator per (parameter, lag) pair
  cudaMalloc((void **) NumeratorSum_gpu, h_nLag*h_nDraws*sizeof(float));
  cudaMalloc((void **) DenomSum_gpu, h_nLag*h_nDraws*sizeof(float));

  // One mean per parameter
  cudaMalloc((void **) ParamSums_gpu, h_nDraws*sizeof(float));

  printf(" Allocated in total %f MB for autocorrelations calculations on GPU\n",
         double(sizeof(float)*(h_nDraws*h_nEntries + 2*h_nLag*h_nDraws + h_nDraws))/1.E6);
}
// KS: Copy necessary variables from CPU to GPU
__host__ void CopyToGPU_AutoCorr(
    float *ParStep_cpu,
    float *NumeratorSum_cpu,
    float *ParamSums_cpu,
    float *DenomSum_cpu,
    float *ParStep_gpu,
    float *NumeratorSum_gpu,
    float *ParamSums_gpu,
    float *DenomSum_gpu) {
  // The full chain: h_nDraws parameters times h_nEntries steps
  cudaMemcpy(ParStep_gpu, ParStep_cpu, h_nDraws*h_nEntries*sizeof(float), cudaMemcpyHostToDevice);

  // Per-parameter means, subtracted from every step in the kernel
  cudaMemcpy(ParamSums_gpu, ParamSums_cpu, h_nDraws*sizeof(float), cudaMemcpyHostToDevice);

  // Accumulators, one per (parameter, lag) pair
  cudaMemcpy(NumeratorSum_gpu, NumeratorSum_cpu, h_nLag*h_nDraws*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(DenomSum_gpu, DenomSum_cpu, h_nLag*h_nDraws*sizeof(float), cudaMemcpyHostToDevice);
}
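The copy sizes and the kernel's indexing imply a parameter-major layout: parameter p's chain occupies entries p*h_nEntries through p*h_nEntries + h_nEntries - 1 of ParStep_cpu. A minimal sketch of how a host caller might flatten a 2D chain into that buffer; the FlattenChain helper and its chain argument are illustrative, not part of this file:

// Hypothetical host-side flattening into the parameter-major layout
// the kernel expects: flat[p*nEntries + i] = step i of parameter p
#include <vector>

std::vector<float> FlattenChain(const std::vector<std::vector<float>>& chain) {
  const size_t nPars = chain.size();
  const size_t nEntries = nPars ? chain[0].size() : 0;
  std::vector<float> flat(nPars * nEntries);
  for (size_t p = 0; p < nPars; ++p)
    for (size_t i = 0; i < nEntries; ++i)
      flat[p*nEntries + i] = chain[p][i];
  return flat;
}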
// Eval autocorrelations based on Box and Jenkins
__global__ void EvalOnGPU_AutoCorr(
    const float* __restrict__ ParStep_gpu,
    const float* __restrict__ ParamSums_gpu,
    float* NumeratorSum_gpu,
    float* DenomSum_gpu) {
  // One thread per (parameter, lag) combination
  const unsigned int CurrentLagNum = (blockIdx.x * blockDim.x + threadIdx.x);

  // KS: static shared memory needs the compile-time _BlockSize_
  __shared__ float shared_NumeratorSum[_BlockSize_];
  __shared__ float shared_DenomSum[_BlockSize_];
  shared_NumeratorSum[threadIdx.x] = 0;
  shared_DenomSum[threadIdx.x] = 0;

  if (CurrentLagNum < d_nLag*d_nDraws) {
    // Decompose the flat thread index into a parameter and a lag
    const int Param = int(CurrentLagNum/d_nLag);
    const int nLag = CurrentLagNum - Param*d_nLag;

    // Loop over the chain; i + nLag must stay within this parameter's entries
    for (int i = 0; i < d_nEntries - nLag; ++i) {
      int CurrParStep = fmaf(Param, d_nEntries, i);
      const float Diff = ParStep_gpu[CurrParStep]-ParamSums_gpu[Param];

      // The same term shifted by the current lag
      CurrParStep = fmaf(Param, d_nEntries, i + nLag);
      const float LagTerm = ParStep_gpu[CurrParStep]-ParamSums_gpu[Param];
      const float Product = Diff*LagTerm;
      shared_NumeratorSum[threadIdx.x] += Product;

      // Denominator is the sum of squared differences
      const float Denom = Diff*Diff;
      shared_DenomSum[threadIdx.x] += Denom;
    }

    // Each thread owns one output slot, so no reduction is needed
    NumeratorSum_gpu[CurrentLagNum] = shared_NumeratorSum[threadIdx.x];
    DenomSum_gpu[CurrentLagNum] = shared_DenomSum[threadIdx.x];
  }
}
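Each thread therefore accumulates the two pieces of the Box and Jenkins lag-k autocorrelation estimator for its parameter. In the notation below, N is d_nEntries, k is nLag and \bar{x} is the per-parameter mean held in ParamSums_gpu; this is my reading of the loop above, not a formula stated in the source:

    \hat{\rho}_k = \frac{\sum_{i=0}^{N-k-1} (x_i - \bar{x})(x_{i+k} - \bar{x})}{\sum_{i=0}^{N-k-1} (x_i - \bar{x})^2}

Note that the kernel returns the numerator and denominator sums separately rather than their ratio, so the final division happens on the host.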
// KS: This calls the main kernel responsible for calculating LagL and later copies results back to the CPU
__host__ void RunGPU_AutoCorr(
    float* ParStep_gpu,
    float* ParamSums_gpu,
    float* NumeratorSum_gpu,
    float* DenomSum_gpu,
    float* NumeratorSum_cpu,
    float* DenomSum_cpu) {
  // Enough blocks of _BlockSize_ threads to cover every (parameter, lag) pair
  dim3 block_size(_BlockSize_);
  dim3 grid_size((h_nLag*h_nDraws + block_size.x - 1) / block_size.x);

  EvalOnGPU_AutoCorr<<<grid_size, block_size>>>(
      ParStep_gpu,
      ParamSums_gpu,
      NumeratorSum_gpu,
      DenomSum_gpu);

  printf(" Finished calculating now copying results back to CPU \n");

  // The implicit sync in cudaMemcpy guarantees the kernel has finished
  cudaMemcpy(NumeratorSum_cpu, NumeratorSum_gpu, h_nLag*h_nDraws*sizeof(float), cudaMemcpyDeviceToHost);
  cudaMemcpy(DenomSum_cpu, DenomSum_gpu, h_nLag*h_nDraws*sizeof(float), cudaMemcpyDeviceToHost);
}
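Forming the actual autocorrelations from the two sums is presumably left to the caller. A hedged sketch of that post-processing step; the FormAutoCorr helper and its AutoCorr output buffer are assumptions, not part of this file:

// Hypothetical post-processing: numerator/denominator ratio per (parameter, lag).
// Entry Param*nLags + k holds lag k of parameter Param, mirroring the kernel's indexing.
void FormAutoCorr(const float* NumeratorSum_cpu, const float* DenomSum_cpu,
                  float* AutoCorr, const int nPars, const int nLags) {
  for (int Param = 0; Param < nPars; ++Param) {
    for (int k = 0; k < nLags; ++k) {
      const int idx = Param*nLags + k;
      AutoCorr[idx] = NumeratorSum_cpu[idx] / DenomSum_cpu[idx];
    }
  }
}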
// KS: Free memory on the GPU
__host__ void CleanupGPU_AutoCorr(
    float *ParStep_gpu,
    float *NumeratorSum_gpu,
    float *ParamSums_gpu,
    float *DenomSum_gpu) {
  cudaFree(ParStep_gpu);
  cudaFree(NumeratorSum_gpu);
  cudaFree(ParamSums_gpu);
  cudaFree(DenomSum_gpu);

  printf(" Cleared memory at GPU, I am free \n");
}
For reference, the interface this unit exposes:

__device__ __constant__ int d_nLag
    Number of lags, copied to the GPU once at initialisation.

__device__ __constant__ int d_nDraws
    Number of parameters, copied to the GPU once at initialisation.

__device__ __constant__ int d_nEntries
    Number of entries (steps) in the chain, copied to the GPU once at initialisation.

#define _BlockSize_
    KS: Need it for shared memory; there is a way to use dynamic shared memory but I am lazy right now (a sketch of the dynamic pattern follows below).

__global__ void EvalOnGPU_AutoCorr(const float *__restrict__ ParStep_gpu, const float *__restrict__ ParamSums_gpu, float *NumeratorSum_gpu, float *DenomSum_gpu)
    Eval autocorrelations based on Box and Jenkins.

__host__ void InitGPU_AutoCorr(float **ParStep_gpu, float **NumeratorSum_gpu, float **ParamSums_gpu, float **DenomSum_gpu, int n_Entries, int n_Pars, const int n_Lags)
    KS: Initialiser; here we allocate memory for variables and copy constants.

__host__ void CopyToGPU_AutoCorr(float *ParStep_cpu, float *NumeratorSum_cpu, float *ParamSums_cpu, float *DenomSum_cpu, float *ParStep_gpu, float *NumeratorSum_gpu, float *ParamSums_gpu, float *DenomSum_gpu)
    KS: Copy necessary variables from CPU to GPU.

__host__ void RunGPU_AutoCorr(float *ParStep_gpu, float *ParamSums_gpu, float *NumeratorSum_gpu, float *DenomSum_gpu, float *NumeratorSum_cpu, float *DenomSum_cpu)
    KS: This calls the main kernel responsible for calculating LagL and later copies the results back to the CPU.

__host__ void CleanupGPU_AutoCorr(float *ParStep_gpu, float *NumeratorSum_gpu, float *ParamSums_gpu, float *DenomSum_gpu)
    KS: Free memory on the GPU.
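On the dynamic-shared-memory alternative mentioned for _BlockSize_: the standard CUDA pattern replaces the fixed-size arrays with an extern __shared__ buffer whose size is passed as the third launch parameter. A minimal sketch of that pattern in general, not of how this file would adopt it:

// Generic dynamic shared memory pattern: the buffer size is chosen at
// launch time instead of via a compile-time _BlockSize_ macro
__global__ void KernelWithDynamicShared(float* out) {
  extern __shared__ float shared_buf[];  // sized by the launch, not the code
  shared_buf[threadIdx.x] = 0.0f;
  // ... accumulate into shared_buf[threadIdx.x] ...
  out[blockIdx.x * blockDim.x + threadIdx.x] = shared_buf[threadIdx.x];
}

// Launch: the third <<<>>> argument is the shared memory size in bytes, e.g.
// KernelWithDynamicShared<<<grid, block, block.x * sizeof(float)>>>(out);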