CudaPoolingDeliverKernel.cpp
/*
 * CudaPoolingDeliverKernel.cpp
 *
 *  Created on: Aug 2, 2016
 *      Author: pschultz
 */

#include "arch/cuda/cuda_util.hpp"
#include "utils/PVAssert.hpp"
#include <cmath>
#include <cudakernels/CudaPoolingDeliverKernel.hpp>
#include <cudnn.h>

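// CudaPoolingDeliverKernel wraps cuDNN's 2D pooling forward pass: it permutes the
// presynaptic data store and the postsynaptic GSyn channel from PetaVision's NHWC
// ordering into cuDNN-ordered (NCHW) buffers, runs cudnnPoolingForward (accumulating
// onto the existing GSyn values), and permutes the result back.
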
namespace PVCuda {

CudaPoolingDeliverKernel::CudaPoolingDeliverKernel(CudaDevice *inDevice) : CudaKernel(inDevice) {
   kernelName = "CudaPoolingDeliverKernel";
}

CudaPoolingDeliverKernel::~CudaPoolingDeliverKernel() {
   cudnnDestroyPoolingDescriptor(mPoolingDescriptor);
   cudnnDestroyTensorDescriptor((cudnnTensorDescriptor_t)mDataStoreDescriptor);
   cudnnDestroyTensorDescriptor((cudnnTensorDescriptor_t)mGSynDescriptor);
}

void CudaPoolingDeliverKernel::setArgs(
      PVLayerLoc const *preLoc,
      PVLayerLoc const *postLoc,
      int nxpPost,
      int nypPost,
      cudnnPoolingMode_t poolingMode,
      int multiplier,
      // TODO: instead of passing poolingMode and multiplier, I would prefer
      // to pass the PoolingConn poolingType, and have the setArgs method
      // determine the pooling mode and the multiplier from poolingType and
      // the patch size. However, this causes a circular dependency between
      // PoolingConn and CudaPoolingRecvPost. It could be moved into
      // pv_types.h, but it would be nice to define a pooling-specific enum
      // in a pooling-specific file.
      CudaBuffer *dataStoreBuffer,
      CudaBuffer *gSynBuffer,
      int channel) {

   FatalIf(
         preLoc->nx < postLoc->nx,
         "Pooling is not defined for one-to-many connections (pre->nx=%d, post->nx=%d)\n",
         preLoc->nx,
         postLoc->nx);
   FatalIf(
         preLoc->ny < postLoc->ny,
         "Pooling is not defined for one-to-many connections (pre->ny=%d, post->ny=%d)\n",
         preLoc->ny,
         postLoc->ny);

   mPreLoc      = preLoc;
   mPostLoc     = postLoc;
   mPoolingMode = poolingMode;
   // mMultiplier is applied as the alpha scaling factor when cudnnPoolingForward is
   // called in do_run().
   mMultiplier = (float)multiplier;

   int strideX = calcStride(preLoc->nx, postLoc->nx);
   int strideY = calcStride(preLoc->ny, postLoc->ny);

   cudnnStatus_t status;
   status = cudnnCreatePoolingDescriptor(&mPoolingDescriptor);
   cudnnHandleError(status, "Create pooling descriptor");
   status = cudnnSetPooling2dDescriptor(
         mPoolingDescriptor,
         poolingMode,
#if CUDNN_MAJOR >= 5
         CUDNN_NOT_PROPAGATE_NAN,
#endif
         nypPost /*window height*/,
         nxpPost /*window width*/,
         0 /*vertical padding*/,
         0 /*horizontal padding*/,
         strideY,
         strideX);
   cudnnHandleError(status, "Set pooling descriptor");

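   // The pooling window is the patch size seen from the postsynaptic perspective, and
   // the stride is the pre-to-post density ratio. Illustrative numbers only: with
   // preLoc->nx = 64, postLoc->nx = 32, and nxpPost = 2, the stride is 64/32 = 2 and
   // each post neuron pools a 2x2 window of pre neurons.
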
   const PVHalo *preHalo = &preLoc->halo;
   mBorderExcessX        = calcBorderExcess(preLoc->nx, postLoc->nx, preHalo->lt, nxpPost);
   mBorderExcessY        = calcBorderExcess(preLoc->ny, postLoc->ny, preHalo->up, nypPost);
   status = cudnnCreateTensorDescriptor(&mDataStoreDescriptor);
   cudnnHandleError(status, "Create input tensor descriptor");
   status = cudnnSetTensor4dDescriptor(
         mDataStoreDescriptor,
         CUDNN_TENSOR_NCHW, // PetaVision arrays are ordered NHWC; they will be permuted to NCHW
                            // inside do_run()
         CUDNN_DATA_FLOAT,
         preLoc->nbatch, // Number of images
         preLoc->nf, // Number of feature maps per image
         preLoc->ny + preHalo->up + preHalo->dn - 2 * mBorderExcessY, // Height of each feature map
         preLoc->nx + preHalo->lt + preHalo->rt - 2 * mBorderExcessX); // Width of each feature map
   cudnnHandleError(status, "Set input tensor descriptor");
   mDataStore = (float *)dataStoreBuffer->getPointer();

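   // The input descriptor's height and width are the extended pre-layer dimensions with
   // the excess border trimmed off, so that cuDNN's pooling output size,
   // (trimmedHeight - windowHeight)/strideY + 1, comes out to exactly postLoc->ny
   // (and likewise for the width), assuming symmetric halos. See calcBorderExcess below.
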
   status = cudnnCreateTensorDescriptor(&mGSynDescriptor);
   cudnnHandleError(status, "Create output tensor descriptor");
   status = cudnnSetTensor4dDescriptor(
         mGSynDescriptor,
         CUDNN_TENSOR_NCHW, // PetaVision arrays are ordered NHWC; they will be permuted to NCHW
                            // inside do_run()
         CUDNN_DATA_FLOAT,
         preLoc->nbatch, // Number of images
         postLoc->nf, // Number of feature maps per image
         postLoc->ny, // ny restricted
         postLoc->nx); // nx restricted
   cudnnHandleError(status, "Set output tensor descriptor");

   std::string str(kernelName);
   mCudnnDataStore = device->createBuffer(dataStoreBuffer->getSize(), &str);

   // Size of one GSyn channel across the batch, in neurons.
   int numGSynNeuronsAcrossBatch = postLoc->nx * postLoc->ny * postLoc->nf * postLoc->nbatch;
   float *gSynHead               = (float *)gSynBuffer->getPointer();
   mGSyn                         = &gSynHead[channel * numGSynNeuronsAcrossBatch];

   // The requested channel must fit inside the GSyn buffer.
   size_t gSynSize = gSynBuffer->getSize();
   pvAssert((size_t)(channel + 1) * numGSynNeuronsAcrossBatch * sizeof(float) <= gSynSize);
   mCudnnGSyn = device->createBuffer(numGSynNeuronsAcrossBatch * sizeof(float), &str);
}
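
// Typical usage (hypothetical sketch; the variable names below are illustrative, and the
// real call site is a pooling connection such as PoolingConn's CUDA code path, which is
// assumed here to dispatch through the base class's run() entry point into do_run()):
//
//   PVCuda::CudaPoolingDeliverKernel poolKernel(device);
//   poolKernel.setArgs(
//         preLoc, postLoc,
//         nxpPost, nypPost,
//         CUDNN_POOLING_MAX,
//         1 /*multiplier*/,
//         deviceDataStore, deviceGSyn,
//         CHANNEL_EXC);
//   poolKernel.run();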

int CudaPoolingDeliverKernel::calcBorderExcess(
      int preRestricted,
      int postRestricted,
      int border,
      int patchSizePostPerspective) {
   int preToPostScale = preRestricted / postRestricted;
   int borderNeeded   = (patchSizePostPerspective - preToPostScale) / 2;
   return border - borderNeeded;
}
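
// Illustrative example (numbers chosen for exposition only): with preRestricted = 128,
// postRestricted = 64, and patchSizePostPerspective = 2, preToPostScale = 2 and
// borderNeeded = 0, so the entire halo (border) is excess and is trimmed off before the
// data store is handed to cuDNN.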

int CudaPoolingDeliverKernel::calcManyScale(int preRestricted, int postRestricted) { return 1; }

int CudaPoolingDeliverKernel::calcStride(int preRestricted, int postRestricted) {
   return preRestricted / postRestricted;
}

int CudaPoolingDeliverKernel::do_run() {
   float scalingFactor = 1.0f;

   int const blockSize = device->get_max_threads();

   // Permute PV-organized DataStore to CUDNN organization.
   PVHalo const *halo = &mPreLoc->halo;
   int const nxPreExt = mPreLoc->nx + halo->lt + halo->rt;
   int const nyPreExt = mPreLoc->ny + halo->dn + halo->up;
   int const nf       = mPreLoc->nf;
   int const nbatch   = mPreLoc->nbatch;
   // Calculate grid and work size
   int numNeurons = nbatch * nyPreExt * nxPreExt * nf;
   // Ceil to get all neurons
   int const gridSizePre        = std::ceil((float)numNeurons / blockSize);
   float *cudnnDataStorePointer = (float *)mCudnnDataStore->getPointer();
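   // Layout reminder (standard NHWC/NCHW index conventions; the permutation itself is
   // done by the CUDA kernel behind callPermuteDatastorePVToCudnnKernel):
   //   PV (NHWC):    index = ((b * nyPreExt + y) * nxPreExt + x) * nf + f
   //   cuDNN (NCHW): index = ((b * nf + f) * height + y) * width + x,
   // where height and width are the extended sizes minus twice the border excess.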
   callPermuteDatastorePVToCudnnKernel(
         gridSizePre,
         blockSize,
         mDataStore,
         cudnnDataStorePointer,
         nbatch,
         nyPreExt,
         nxPreExt,
         nf,
         mBorderExcessX,
         mBorderExcessY);
   handleCallError("Permute DataStore PV to CUDNN");

   // Permute the PV-ordered GSyn channel to CUDNN ordering.
   int const nxPost = mPostLoc->nx;
   int const nyPost = mPostLoc->ny;
   int const nfPost = mPostLoc->nf;
   pvAssert(mPostLoc->nbatch == mPreLoc->nbatch);
   // Calculate grid and work size
   numNeurons              = nbatch * nxPost * nyPost * nfPost;
   float *cudnnGSynPointer = (float *)mCudnnGSyn->getPointer();
   // Ceil to get all neurons
   int const gridSizePost = std::ceil((float)numNeurons / (float)blockSize);
   callPermuteGSynPVToCudnnKernel(
         gridSizePost, blockSize, mGSyn, cudnnGSynPointer, nbatch, nyPost, nxPost, nfPost, 1, 1);
   handleCallError("Permute GSyn PV to CUDNN");

   // Read the pooling parameters back from the descriptor. The retrieved values are not
   // used below; cuDNN v4 and v5 expose different getter signatures.
   cudnnPoolingMode_t checkMode;
   int h, w, vPad, hPad, vStride, hStride;
#if CUDNN_MAJOR >= 5
   cudnnNanPropagation_t cudnnNanPropagation;
   cudnnGetPooling2dDescriptor(
         (cudnnPoolingDescriptor_t)mPoolingDescriptor,
         &checkMode,
         &cudnnNanPropagation,
         &h,
         &w,
         &vPad,
         &hPad,
         &vStride,
         &hStride);
#elif CUDNN_MAJOR == 4
   cudnnGetPooling2dDescriptor(
         (cudnnPoolingDescriptor_t)mPoolingDescriptor,
         &checkMode,
         &h,
         &w,
         &vPad,
         &hPad,
         &vStride,
         &hStride);
#else
#error The cuDNN version is required to be either v4 or v5.
#endif

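   // cudnnPoolingForward computes y = alpha * pool(x) + beta * y. Here alpha is
   // mMultiplier (set from the multiplier argument of setArgs) and beta is
   // scalingFactor = 1, so the pooled result is accumulated onto the GSyn values that
   // were just permuted into cudnnGSynPointer.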
   // Do the pooling
   cudnnStatus_t status = cudnnPoolingForward(
         (cudnnHandle_t)device->getCudnnHandle(),
         mPoolingDescriptor,
         &mMultiplier,
         mDataStoreDescriptor,
         cudnnDataStorePointer,
         &scalingFactor,
         mGSynDescriptor,
         cudnnGSynPointer);
   cudnnHandleError(status, "Forward pooling run");

   // Permute the CUDNN-ordered GSyn back to PV ordering.
   callPermuteGSynCudnnToPVKernel(
         gridSizePost, blockSize, mGSyn, cudnnGSynPointer, nbatch, nyPost, nxPost, nfPost, 1, 1);
   handleCallError("Permute GSyn CUDNN back to PV");
   return 0;
}

} /* namespace PVCuda */