#include "arch/cuda/cuda_util.hpp"
#include "utils/PVAssert.hpp"
#include "utils/PVLog.hpp"
#include <cudakernels/CudaPoolingDeliverKernel.hpp>
#include <cmath>
#include <string>

CudaPoolingDeliverKernel::CudaPoolingDeliverKernel(CudaDevice *inDevice) : CudaKernel(inDevice) {
   kernelName = "CudaPoolingDeliverKernel";
}
CudaPoolingDeliverKernel::~CudaPoolingDeliverKernel() {
   cudnnDestroyPoolingDescriptor(mPoolingDescriptor);
   cudnnDestroyTensorDescriptor((cudnnTensorDescriptor_t)mDataStoreDescriptor);
   cudnnDestroyTensorDescriptor((cudnnTensorDescriptor_t)mGSynDescriptor);
}
void CudaPoolingDeliverKernel::setArgs(
      PVLayerLoc const *preLoc, PVLayerLoc const *postLoc,
      int nxpPost, int nypPost,
      cudnnPoolingMode_t poolingMode, int multiplier,
      CudaBuffer *dataStoreBuffer, CudaBuffer *gSynBuffer, int channel) {
   FatalIf(
         preLoc->nx < postLoc->nx,
         "Pooling is not defined for one-to-many connections (pre->nx=%d, post->nx=%d)\n",
         preLoc->nx, postLoc->nx);
   FatalIf(
         preLoc->ny < postLoc->ny,
         "Pooling is not defined for one-to-many connections (pre->ny=%d, post->ny=%d)\n",
         preLoc->ny, postLoc->ny);

   mPreLoc      = preLoc;
   mPostLoc     = postLoc;
   mPoolingMode = poolingMode;
   mMultiplier  = (float)multiplier;

   // Many-to-one pooling: the stride is the number of pre neurons per post neuron.
   int strideX = calcStride(preLoc->nx, postLoc->nx);
   int strideY = calcStride(preLoc->ny, postLoc->ny);

   cudnnStatus_t status = cudnnCreatePoolingDescriptor(&mPoolingDescriptor);
   cudnnHandleError(status, "Create pooling descriptor");
   status = cudnnSetPooling2dDescriptor(
         mPoolingDescriptor,
         poolingMode,
         CUDNN_NOT_PROPAGATE_NAN,
         nypPost /* window height */,
         nxpPost /* window width */,
         0 /* vertical padding */,
         0 /* horizontal padding */,
         strideY,
         strideX);
   cudnnHandleError(status, "Set pooling descriptor");
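
   // Illustrative arithmetic (hypothetical sizes, not taken from this file): a
   // 64x64 pre layer pooling onto a 32x32 post layer yields strideX = strideY = 2,
   // so with nxpPost = nypPost = 2 each post neuron pools a disjoint 2x2 block
   // of pre neurons.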
   // The input tensor covers the extended (halo-including) pre layer, minus any
   // border cells that no pooling window ever reaches.
   const PVHalo *preHalo = &preLoc->halo;
   mBorderExcessX        = calcBorderExcess(preLoc->nx, postLoc->nx, preHalo->lt, nxpPost);
   mBorderExcessY        = calcBorderExcess(preLoc->ny, postLoc->ny, preHalo->up, nypPost);
   status = cudnnCreateTensorDescriptor(&mDataStoreDescriptor);
   cudnnHandleError(status, "Create input tensor descriptor");
   status = cudnnSetTensor4dDescriptor(
         mDataStoreDescriptor,
         CUDNN_TENSOR_NCHW,
         CUDNN_DATA_FLOAT,
         preLoc->nbatch,
         preLoc->nf,
         preLoc->ny + preHalo->up + preHalo->dn - 2 * mBorderExcessY,
         preLoc->nx + preHalo->lt + preHalo->rt - 2 * mBorderExcessX);
   cudnnHandleError(status, "Set input tensor descriptor");
   mDataStore = (float *)dataStoreBuffer->getPointer();
   status = cudnnCreateTensorDescriptor(&mGSynDescriptor);
   cudnnHandleError(status, "Create output tensor descriptor");
   status = cudnnSetTensor4dDescriptor(
         mGSynDescriptor,
         CUDNN_TENSOR_NCHW,
         CUDNN_DATA_FLOAT,
         postLoc->nbatch,
         postLoc->nf,
         postLoc->ny,
         postLoc->nx);
   cudnnHandleError(status, "Set output tensor descriptor");

   std::string str(kernelName);
   mCudnnDataStore = device->createBuffer(dataStoreBuffer->getSize(), &str);
   int numGSynNeuronsAcrossBatch = postLoc->nx * postLoc->ny * postLoc->nf * postLoc->nbatch;
   float *gSynHead               = (float *)gSynBuffer->getPointer();
   mGSyn                         = &gSynHead[channel * numGSynNeuronsAcrossBatch];
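
   // GSyn is laid out channel-major, so channel c begins at float offset
   // c * numGSynNeuronsAcrossBatch. For example (hypothetical sizes), a
   // 32x32x8 post layer with nbatch = 4 puts channel 1 at float offset
   // 32 * 32 * 8 * 4 = 32768.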
   size_t gSynSize = gSynBuffer->getSize();
   // The requested channel must lie entirely within the GSyn buffer.
   pvAssert((size_t)(channel + 1) * numGSynNeuronsAcrossBatch * sizeof(float) <= gSynSize);
   mCudnnGSyn = device->createBuffer(numGSynNeuronsAcrossBatch * sizeof(float), &str);
}
int CudaPoolingDeliverKernel::calcBorderExcess(
      int preRestricted,
      int postRestricted,
      int border,
      int patchSizePostPerspective) {
   int preToPostScale = preRestricted / postRestricted;
   int borderNeeded   = (patchSizePostPerspective - preToPostScale) / 2;
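   // Worked example (hypothetical numbers): pre = 64 and post = 32 give
   // preToPostScale = 2; a post-perspective patch size of 4 then needs
   // borderNeeded = (4 - 2) / 2 = 1 pre cell per side, so a halo border of 2
   // leaves an excess of 1 cell that can be trimmed before calling cuDNN.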
   return border - borderNeeded;
}
int CudaPoolingDeliverKernel::calcManyScale(int preRestricted, int postRestricted) { return 1; }
int CudaPoolingDeliverKernel::calcStride(int preRestricted, int postRestricted) {
   // Many-to-one: preRestricted is assumed to be an exact multiple of postRestricted.
   return preRestricted / postRestricted;
}
int CudaPoolingDeliverKernel::do_run() {
   float scalingFactor = 1.0f;

   int const blockSize = device->get_max_threads();

   // Permute the PV-ordered DataStore into cuDNN's NCHW layout, trimming the
   // border excess computed in setArgs.
   PVHalo const *halo = &mPreLoc->halo;
   int const nxPreExt = mPreLoc->nx + halo->lt + halo->rt;
   int const nyPreExt = mPreLoc->ny + halo->dn + halo->up;
   int const nf       = mPreLoc->nf;
   int const nbatch   = mPreLoc->nbatch;

   int numNeurons = nbatch * nyPreExt * nxPreExt * nf;
   // Round the grid size up so that every neuron is covered by a thread.
   int const gridSizePre        = (int)std::ceil((float)numNeurons / (float)blockSize);
   float *cudnnDataStorePointer = (float *)mCudnnDataStore->getPointer();
   callPermuteDatastorePVToCudnnKernel(
         gridSizePre, blockSize, mDataStore, cudnnDataStorePointer,
         nbatch, nyPreExt, nxPreExt, nf, mBorderExcessX, mBorderExcessY);
   handleCallError("Permute DataStore PV to CUDNN");
   // Permute the PV-ordered GSyn channel into cuDNN's NCHW layout.
   int const nxPost = mPostLoc->nx;
   int const nyPost = mPostLoc->ny;
   int const nfPost = mPostLoc->nf;
   pvAssert(mPostLoc->nbatch == mPreLoc->nbatch);
   pvAssert(nfPost == nf); // Pooling preserves the feature count.

   numNeurons              = nbatch * nxPost * nyPost * nfPost;
   float *cudnnGSynPointer = (float *)mCudnnGSyn->getPointer();
   int const gridSizePost  = (int)std::ceil((float)numNeurons / (float)blockSize);
   callPermuteGSynPVToCudnnKernel(
         gridSizePost, blockSize, mGSyn, cudnnGSynPointer, nbatch, nyPost, nxPost, nfPost, 1, 1);
   handleCallError("Permute GSyn PV to CUDNN");
   // cuDNN v4 and v5 declare cudnnGetPooling2dDescriptor with different signatures.
   cudnnPoolingMode_t checkMode;
   int h, w, vPad, hPad, vStride, hStride;
#if CUDNN_MAJOR >= 5
   cudnnNanPropagation_t cudnnNanPropagation;
   cudnnGetPooling2dDescriptor(
         (cudnnPoolingDescriptor_t)mPoolingDescriptor,
         &checkMode, &cudnnNanPropagation,
         &h, &w, &vPad, &hPad, &vStride, &hStride);
#elif CUDNN_MAJOR == 4
   cudnnGetPooling2dDescriptor(
         (cudnnPoolingDescriptor_t)mPoolingDescriptor,
         &checkMode, &h, &w, &vPad, &hPad, &vStride, &hStride);
#else
#error The cuDNN version is required to be either v4 or v5.
#endif

   cudnnStatus_t status = cudnnPoolingForward(
         (cudnnHandle_t)device->getCudnnHandle(),
         mPoolingDescriptor,
         &mMultiplier, mDataStoreDescriptor, cudnnDataStorePointer,
         &scalingFactor, mGSynDescriptor, cudnnGSynPointer);
   cudnnHandleError(status, "Forward pooling run");
   // Permute the cuDNN-ordered GSyn result back to PV's layout.
   callPermuteGSynCudnnToPVKernel(
         gridSizePost, blockSize, mGSyn, cudnnGSynPointer, nbatch, nyPost, nxPost, nfPost, 1, 1);
   handleCallError("Permute GSyn CUDNN back to PV");
   return 0;
}