MaCh3  2.4.2
Reference Guide
gpuMCMCProcessorUtils.cu File Reference
#include "Fitters/gpuMCMCProcessorUtils.cuh"


Functions

__global__ void EvalOnGPU_AutoCorr (const float *__restrict__ ParStep_gpu, const float *__restrict__ ParamSums_gpu, float *NumeratorSum_gpu, float *DenomSum_gpu)
 Evaluate autocorrelations based on Box and Jenkins.
 

Variables

__device__ __constant__ int d_nLag
 
__device__ __constant__ int d_nDraws
 
__device__ __constant__ int d_nEntries
 

Function Documentation

◆ EvalOnGPU_AutoCorr()

__global__ void EvalOnGPU_AutoCorr ( const float *__restrict__ ParStep_gpu,
                                     const float *__restrict__ ParamSums_gpu,
                                     float *NumeratorSum_gpu,
                                     float *DenomSum_gpu )

Evaluate autocorrelations based on Box and Jenkins.

Parameters
    ParStep_gpu       Parameter value at each step
    ParamSums_gpu     Overall sum for each parameter over all steps
    NumeratorSum_gpu  Sum used for the numerator of the autocorrelation calculation
    DenomSum_gpu      Sum used for the denominator of the autocorrelation calculation
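
The two output buffers hold, per (parameter, lag) pair, the numerator and denominator sums of the standard Box-Jenkins autocorrelation estimator. Writing x_i for a parameter's value at step i, \bar{x} for the per-parameter central value read from ParamSums_gpu, and N = d_nEntries, the quantity these sums feed into (the division itself is presumably formed downstream from NumeratorSum_gpu and DenomSum_gpu) is

    r_k = \frac{\sum_{i=1}^{N-k} (x_i - \bar{x})(x_{i+k} - \bar{x})}{\sum_{i=1}^{N} (x_i - \bar{x})^2}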

Definition at line 109 of file gpuMCMCProcessorUtils.cu.

__global__ void EvalOnGPU_AutoCorr(const float* __restrict__ ParStep_gpu,
                                   const float* __restrict__ ParamSums_gpu,
                                   float* NumeratorSum_gpu,
                                   float* DenomSum_gpu)
{
//*********************************************************
  const unsigned int CurrentLagNum = (blockIdx.x * blockDim.x + threadIdx.x);

  //KS: Accessing shared memory is much faster than global memory, hence we accumulate in shared memory and then write to global memory
  __shared__ float shared_NumeratorSum[_BlockSize_];
  __shared__ float shared_DenomSum[_BlockSize_];

  // this is the stopping condition!
  if (CurrentLagNum < d_nLag*d_nDraws)
  {
    shared_NumeratorSum[threadIdx.x] = 0;
    shared_DenomSum[threadIdx.x] = 0;

    //KS: Might consider caching this information, which MIGHT be faster; too lazy right now
    const int Param = int(CurrentLagNum/d_nLag);
    const int nLag = CurrentLagNum - Param*d_nLag;
    // Loop over the number of entries
    for (int i = 0; i < d_nEntries; ++i)
    {
      //KS: Use fmaf to make it a tiny bit faster; easier-to-read equivalent: Param*d_nEntries + i
      int CurrParStep = fmaf(Param, d_nEntries, i);
      const float Diff = ParStep_gpu[CurrParStep]-ParamSums_gpu[Param];
      // Only sum the numerator up to i = N-k
      if (i < d_nEntries-nLag)
      {
        //KS: Use fmaf to make it a tiny bit faster; easier-to-read equivalent: Param*d_nEntries + (i + nLag)
        CurrParStep = fmaf(Param, d_nEntries, i + nLag);
        const float LagTerm = ParStep_gpu[CurrParStep]-ParamSums_gpu[Param];
        const float Product = Diff*LagTerm;
        shared_NumeratorSum[threadIdx.x] += Product;
      }
      // Square the difference to form the denominator
      const float Denom = Diff*Diff;
      shared_DenomSum[threadIdx.x] += Denom;
    }

    //KS: Make sure threads are synchronised before moving to global memory
    __syncthreads();
    NumeratorSum_gpu[CurrentLagNum] = shared_NumeratorSum[threadIdx.x];
    DenomSum_gpu[CurrentLagNum] = shared_DenomSum[threadIdx.x];
  }
}
(_BlockSize_, used to size the shared-memory arrays above, is defined at gpuUtils.cuh:25. KS: "Need it for shared memory, there is a way to use dynamic shared memory but I am lazy right now.")
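
A minimal host-side sketch of how this kernel could be driven, for orientation only. The kernel, the three __constant__ symbols, and _BlockSize_ come from this file; the helper itself, the buffer and size names, and the final division are illustrative assumptions, and the sketch presumes the __constant__ symbols are visible from the calling translation unit (in practice the real driver may live in this file):

#include <vector>
#include <cuda_runtime.h>
#include "Fitters/gpuMCMCProcessorUtils.cuh"

// Hypothetical driver. nDraws plays the role of the parameter count
// (the kernel's Param index runs over d_nDraws), nEntries is the number
// of MCMC steps, nLag the number of lags to evaluate.
void RunAutoCorrSketch(const std::vector<float>& ParStep,  // size nDraws*nEntries, index Param*nEntries + step
                       const std::vector<float>& ParMeans, // per-parameter value fed to ParamSums_gpu
                       int nLag, int nDraws, int nEntries)
{
  // Upload the loop bounds into the kernel's __constant__ symbols
  cudaMemcpyToSymbol(d_nLag,     &nLag,     sizeof(int));
  cudaMemcpyToSymbol(d_nDraws,   &nDraws,   sizeof(int));
  cudaMemcpyToSymbol(d_nEntries, &nEntries, sizeof(int));

  const int nOut = nLag * nDraws; // one (parameter, lag) pair per thread
  float *ParStep_gpu, *ParamSums_gpu, *NumeratorSum_gpu, *DenomSum_gpu;
  cudaMalloc(&ParStep_gpu,      ParStep.size() * sizeof(float));
  cudaMalloc(&ParamSums_gpu,    nDraws * sizeof(float));
  cudaMalloc(&NumeratorSum_gpu, nOut * sizeof(float));
  cudaMalloc(&DenomSum_gpu,     nOut * sizeof(float));
  cudaMemcpy(ParStep_gpu,   ParStep.data(),  ParStep.size() * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(ParamSums_gpu, ParMeans.data(), nDraws * sizeof(float),         cudaMemcpyHostToDevice);

  // Round the grid up so every (parameter, lag) pair gets a thread
  const int nBlocks = (nOut + _BlockSize_ - 1) / _BlockSize_;
  EvalOnGPU_AutoCorr<<<nBlocks, _BlockSize_>>>(ParStep_gpu, ParamSums_gpu, NumeratorSum_gpu, DenomSum_gpu);

  std::vector<float> Num(nOut), Denom(nOut);
  cudaMemcpy(Num.data(),   NumeratorSum_gpu, nOut * sizeof(float), cudaMemcpyDeviceToHost);
  cudaMemcpy(Denom.data(), DenomSum_gpu,     nOut * sizeof(float), cudaMemcpyDeviceToHost);
  // The autocorrelation of parameter p at lag k would then be Num[p*nLag + k] / Denom[p*nLag + k]

  cudaFree(ParStep_gpu); cudaFree(ParamSums_gpu);
  cudaFree(NumeratorSum_gpu); cudaFree(DenomSum_gpu);
}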

Variable Documentation

◆ d_nDraws

__device__ __constant__ int d_nDraws

Definition at line 9 of file gpuMCMCProcessorUtils.cu.

◆ d_nEntries

__device__ __constant__ int d_nEntries

Definition at line 10 of file gpuMCMCProcessorUtils.cu.

◆ d_nLag

__device__ __constant__ int d_nLag

Definition at line 8 of file gpuMCMCProcessorUtils.cu.
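
As a reading aid for the kernel's index arithmetic (illustrative, not part of this file): each thread's global index flattens a (parameter, lag) pair as Param*d_nLag + nLag, which the kernel inverts, and ParStep_gpu is laid out parameter-major.

// Hypothetical helper decoding the kernel's thread index
struct ParLag { int Param, Lag; };
inline ParLag DecodeLagThread(unsigned int CurrentLagNum, int nLag) {
  const int Param = int(CurrentLagNum / nLag);       // which parameter
  const int Lag   = int(CurrentLagNum) - Param*nLag; // which lag, 0 <= Lag < nLag
  return {Param, Lag};
}
// ParStep_gpu is parameter-major: step i of parameter p lives at index
// p*nEntries + i, which is what the kernel's fmaf(Param, d_nEntries, i) computes.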