CudaTransposePoolingDeliverKernel.cpp
/*
 * CudaTransposePoolingDeliverKernel.cpp
 *
 * Created on: Aug 16, 2016
 * Author: pschultz
 */

#include "cudakernels/CudaTransposePoolingDeliverKernel.hpp"
#include "arch/cuda/cuda_util.hpp"
#include "utils/PVAssert.hpp"
#include <cmath>
#include <stdexcept>

namespace PVCuda {

CudaTransposePoolingDeliverKernel::CudaTransposePoolingDeliverKernel(CudaDevice *inDevice)
      : CudaKernel(inDevice) {
   kernelName = "CudaTransposePoolingDeliverKernel";
}

CudaTransposePoolingDeliverKernel::~CudaTransposePoolingDeliverKernel() {}

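// Sets the member variables and cuDNN descriptors needed by do_run(). preLoc/postLoc
// describe this (transpose) connection's pre- and post-layer geometry, while
// origConnPreLoc/origConnPostLoc describe the original pooling connection being
// transposed. The method creates the pooling descriptor, the four tensor descriptors,
// and the device buffers that hold NCHW-ordered copies of the PV (NHWC-ordered) data.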
void CudaTransposePoolingDeliverKernel::setArgs(
      PVLayerLoc const *preLoc,
      PVLayerLoc const *postLoc,
      PVLayerLoc const *origConnPreLoc,
      PVLayerLoc const *origConnPostLoc,
      int nxpPost,
      int nypPost,
      cudnnPoolingMode_t poolingMode,
      int multiplier,
      CudaBuffer *dataStoreBuffer,
      CudaBuffer *gSynBuffer,
      CudaBuffer *origConnDataStoreBuffer,
      CudaBuffer *origConnGSynBuffer,
      int channel) {
   mPreLoc = preLoc;
   mPostLoc = postLoc;
   pvAssertMessage(
         preLoc->nx <= postLoc->nx && preLoc->ny <= postLoc->ny,
         "CudaTransposePoolingDeliverKernel: Transpose pooling requires pre-layer to have same or "
         "lower density as post-layer.\n");
   mPoolingMode = poolingMode;
   mMultiplier = (float)multiplier;

   int strideX = CudaPoolingDeliverKernel::calcStride(mPostLoc->nx, mPreLoc->nx);
   int strideY = CudaPoolingDeliverKernel::calcStride(mPostLoc->ny, mPreLoc->ny);
   int nxpPre = nxpPost * mPostLoc->nx / mPreLoc->nx;
   pvAssert(nxpPre * mPreLoc->nx == nxpPost * mPostLoc->nx);
   int nypPre = nypPost * mPostLoc->ny / mPreLoc->ny;
   pvAssert(nypPre * mPreLoc->ny == nypPost * mPostLoc->ny);

   cudnnStatus_t status;
   status = cudnnCreatePoolingDescriptor(&mPoolingDescriptor);
   cudnnHandleError(status, "Create pooling descriptor");
#if CUDNN_MAJOR >= 5
   status = cudnnSetPooling2dDescriptor(
         mPoolingDescriptor,
         poolingMode,
         CUDNN_NOT_PROPAGATE_NAN,
         nypPre,
         nxpPre,
         0 /*vertical padding*/,
         0 /*horizontal padding*/,
         strideY,
         strideX);
#elif CUDNN_MAJOR == 4
   status = cudnnSetPooling2dDescriptor(
         mPoolingDescriptor,
         poolingMode,
         nypPre,
         nxpPre,
         0 /*vertical padding*/,
         0 /*horizontal padding*/,
         strideY,
         strideX);
#else
#error The cuDNN version is required to be v4 or greater.
#endif
   cudnnHandleError(status, "Set pooling descriptor");

   const PVHalo *preHalo = &mPreLoc->halo;
   mBorderExcessX = calcBorderExcess(mPreLoc->nx, mPostLoc->nx, preHalo->lt, nxpPost);
   mBorderExcessY = calcBorderExcess(mPreLoc->ny, mPostLoc->ny, preHalo->up, nypPost);
   status = cudnnCreateTensorDescriptor(&mDataStoreDescriptor);
   cudnnHandleError(status, "Create input tensor descriptor");
   status = cudnnSetTensor4dDescriptor(
         mDataStoreDescriptor,
         CUDNN_TENSOR_NCHW, // PetaVision arrays are ordered NHWC; they will be permuted to NCHW
                            // inside do_run()
         CUDNN_DATA_FLOAT,
         mPreLoc->nbatch, // Number of images
         mPreLoc->nf, // Number of feature maps per image
         mPreLoc->ny + preHalo->up + preHalo->dn - 2 * mBorderExcessY, // Height of each feature map
         mPreLoc->nx + preHalo->lt + preHalo->rt - 2 * mBorderExcessX); // Width of each feature map
   cudnnHandleError(status, "Set input tensor descriptor");
   mDataStore = (float *)dataStoreBuffer->getPointer();
   std::string str(kernelName);
   mCudnnDataStore = device->createBuffer(dataStoreBuffer->getSize(), &str);

   status = cudnnCreateTensorDescriptor(&mGSynDescriptor);
   cudnnHandleError(status, "Create output tensor descriptor");
   status = cudnnSetTensor4dDescriptor(
         mGSynDescriptor,
         CUDNN_TENSOR_NCHW, // PetaVision arrays are ordered NHWC; they will be permuted to NCHW
                            // inside do_run()
         CUDNN_DATA_FLOAT,
         mPreLoc->nbatch, // Number of images
         mPostLoc->nf, // Number of feature maps per image
         mPostLoc->ny, // ny restricted
         mPostLoc->nx); // nx restricted
   cudnnHandleError(status, "Set output tensor descriptor");
   int numGSynNeuronsAcrossBatch = mPostLoc->nx * mPostLoc->ny * mPostLoc->nf * mPostLoc->nbatch;
   float *gSynHead = (float *)gSynBuffer->getPointer();
   mGSyn = &gSynHead[channel * numGSynNeuronsAcrossBatch];
   mCudnnGSyn = device->createBuffer(numGSynNeuronsAcrossBatch * sizeof(float), &str);

   mOrigConnPreLoc = origConnPreLoc;
   mOrigConnPostLoc = origConnPostLoc;

   const PVHalo *origConnPreHalo = &mOrigConnPreLoc->halo;
   mOrigConnBorderExcessX =
         calcBorderExcess(mOrigConnPreLoc->nx, mOrigConnPostLoc->nx, origConnPreHalo->lt, nxpPost);
   mOrigConnBorderExcessY =
         calcBorderExcess(mOrigConnPreLoc->ny, mOrigConnPostLoc->ny, origConnPreHalo->up, nypPost);
   status = cudnnCreateTensorDescriptor(&mOrigConnDataStoreDescriptor);
   cudnnHandleError(status, "Create original conn pre datastore tensor descriptor");
   status = cudnnSetTensor4dDescriptor(
         mOrigConnDataStoreDescriptor,
         CUDNN_TENSOR_NCHW, // PetaVision arrays are ordered NHWC; they will be permuted to NCHW
                            // inside do_run()
         CUDNN_DATA_FLOAT,
         mOrigConnPreLoc->nbatch, // Number of images
         mOrigConnPreLoc->nf, // Number of feature maps per image
         mOrigConnPreLoc->ny + origConnPreHalo->up + origConnPreHalo->dn
               - 2 * mOrigConnBorderExcessY, // Height of each feature map
         mOrigConnPreLoc->nx + origConnPreHalo->lt + origConnPreHalo->rt
               - 2 * mOrigConnBorderExcessX); // Width of each feature map
   cudnnHandleError(status, "Set original conn pre datastore tensor descriptor");
   mOrigConnDataStore = (float *)origConnDataStoreBuffer->getPointer();
   mCudnnOrigConnDataStore = device->createBuffer(origConnDataStoreBuffer->getSize(), &str);

   status = cudnnCreateTensorDescriptor(&mOrigConnGSynDescriptor);
   cudnnHandleError(status, "Create original conn post gsyn tensor descriptor");
   status = cudnnSetTensor4dDescriptor(
         mOrigConnGSynDescriptor,
         CUDNN_TENSOR_NCHW, // PetaVision arrays are ordered NHWC; they will be permuted to NCHW
                            // inside do_run()
         CUDNN_DATA_FLOAT,
         mOrigConnPostLoc->nbatch, // Number of images
         mOrigConnPostLoc->nf, // Number of feature maps per image
         mOrigConnPostLoc->ny, // ny restricted
         mOrigConnPostLoc->nx); // nx restricted
   cudnnHandleError(status, "Set original conn post gsyn tensor descriptor");
   int numOrigConnGSynNeuronsAcrossBatch = mOrigConnPostLoc->nx * mOrigConnPostLoc->ny
         * mOrigConnPostLoc->nf * mOrigConnPostLoc->nbatch;
   float *origConnGSynHead = (float *)origConnGSynBuffer->getPointer();
   mOrigConnGSyn = &origConnGSynHead[channel * numOrigConnGSynNeuronsAcrossBatch];
   mCudnnOrigConnGSyn =
         device->createBuffer(numOrigConnGSynNeuronsAcrossBatch * sizeof(float), &str);
}

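// Returns the amount by which the pre-layer halo on one side exceeds the border
// needed by a patch of the given size, i.e. border - (patchSizePostPerspective - 1) / 2.
// The preRestricted and postRestricted arguments are currently unused.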
int CudaTransposePoolingDeliverKernel::calcBorderExcess(
      int preRestricted,
      int postRestricted,
      int border,
      int patchSizePostPerspective) {
   int borderNeeded = (patchSizePostPerspective - 1) / 2;
   return border - borderNeeded;
}

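// Returns postRestricted / preRestricted, which is required to be an exact integer;
// otherwise an exception is thrown.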
int CudaTransposePoolingDeliverKernel::calcManyScale(int preRestricted, int postRestricted) {
   int manyScale = postRestricted / preRestricted;
   if (manyScale * preRestricted != postRestricted) {
      throw std::runtime_error(
            "CudaTransposePoolingDeliverKernel: postRestricted is not a multiple of preRestricted");
   }
   return manyScale;
}

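// The transpose pooling kernel always uses a cuDNN stride of 1; the arguments are unused.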
int CudaTransposePoolingDeliverKernel::calcStride(int preRestricted, int postRestricted) {
   return 1;
}

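// Runs the transpose-pooling delivery on the GPU: the PV-ordered (NHWC) buffers set in
// setArgs() are permuted into the NCHW-ordered cuDNN buffers, cudnnPoolingBackward() is
// called to apply the transpose of the original pooling operation, and the resulting
// GSyn is permuted back into PV ordering.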
int CudaTransposePoolingDeliverKernel::do_run() {
   float scalingFactor = 1.0f;

   int const blockSize = device->get_max_threads();

   // Permute PV-organized DataStore to CUDNN organization.
   PVHalo const *halo = &mPreLoc->halo;
   int const nxPreExt = mPreLoc->nx + halo->lt + halo->rt;
   int const nyPreExt = mPreLoc->ny + halo->dn + halo->up;
   int const nf = mPreLoc->nf;
   int const nbatch = mPreLoc->nbatch;
   // Calculate grid and work size
   int numNeurons = nbatch * nyPreExt * nxPreExt * nf;
   // Ceil to get all neurons
   int const gridSizePre = std::ceil((float)numNeurons / blockSize);
   float *cudnnDataStorePointer = (float *)mCudnnDataStore->getPointer();
   callPermuteDatastorePVToCudnnKernel(
         gridSizePre,
         blockSize,
         mDataStore,
         cudnnDataStorePointer,
         nbatch,
         nyPreExt,
         nxPreExt,
         nf,
         mBorderExcessX,
         mBorderExcessY);
   handleCallError("CudaTransposeConn: permute DataStore PV to CUDNN");

   // Permute the PV-ordered GSyn channel to CUDNN ordering.
   int const nxPost = mPostLoc->nx;
   int const nyPost = mPostLoc->ny;
   pvAssert(nf == mPostLoc->nf);
   pvAssert(mPostLoc->nbatch == mPreLoc->nbatch);
   // Calculate grid and work size
   numNeurons = nbatch * nxPost * nyPost * nf;
   float *cudnnGSynPointer = (float *)mCudnnGSyn->getPointer();
   // Ceil to get all neurons
   int const gridSizePost = std::ceil((float)numNeurons / (float)blockSize);
   callPermuteGSynPVToCudnnKernel(
         gridSizePost, blockSize, mGSyn, cudnnGSynPointer, nbatch, nyPost, nxPost, nf, 1, 1);
   handleCallError("CudaTransposeConn: permute GSyn PV to CUDNN");

   // Permute PV-organized original conn's DataStore to CUDNN organization.
   PVHalo const *origConnHalo = &mOrigConnPreLoc->halo;
   int const origConnNxPreExt = mOrigConnPreLoc->nx + origConnHalo->lt + origConnHalo->rt;
   int const origConnNyPreExt = mOrigConnPreLoc->ny + origConnHalo->dn + origConnHalo->up;
   pvAssert(nf == mOrigConnPreLoc->nf);
   pvAssert(nbatch == mOrigConnPreLoc->nbatch);
   // Calculate grid and work size
   numNeurons = nbatch * origConnNyPreExt * origConnNxPreExt * nf;
   // Ceil to get all neurons
   int const gridSizeOrigConnPre = std::ceil((float)numNeurons / blockSize);
   float *cudnnOrigConnDataStorePointer = (float *)mCudnnOrigConnDataStore->getPointer();
   callPermuteDatastorePVToCudnnKernel(
         gridSizeOrigConnPre,
         blockSize,
         mOrigConnDataStore,
         cudnnOrigConnDataStorePointer,
         nbatch,
         origConnNyPreExt,
         origConnNxPreExt,
         nf,
         mOrigConnBorderExcessX,
         mOrigConnBorderExcessY);
   handleCallError("CudaTransposeConn: permute original conn's DataStore PV to CUDNN");

   // Permute the PV-ordered original conn's GSyn channel to CUDNN ordering.
   int const origConnNxPost = mOrigConnPostLoc->nx;
   int const origConnNyPost = mOrigConnPostLoc->ny;
   pvAssert(nf == mOrigConnPostLoc->nf);
   pvAssert(mOrigConnPostLoc->nbatch == nbatch);
   // Calculate grid and work size
   numNeurons = nbatch * origConnNxPost * origConnNyPost * nf;
   float *cudnnOrigConnGSynPointer = (float *)mCudnnOrigConnGSyn->getPointer();
   // Ceil to get all neurons
   int const gridSizeOrigConnPost = std::ceil((float)numNeurons / (float)blockSize);
   callPermuteGSynPVToCudnnKernel(
         gridSizeOrigConnPost,
         blockSize,
         mOrigConnGSyn,
         cudnnOrigConnGSynPointer,
         nbatch,
         origConnNyPost,
         origConnNxPost,
         nf,
         1,
         1);
   handleCallError("CudaTransposeConn: permute original conn's GSyn PV to CUDNN");

   // Do the pooling
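   // In cuDNN's (y, dy, x, dx) terms: the original conn's GSyn is the forward pooling
   // output y, this conn's pre-layer DataStore is the gradient dy, the original conn's
   // pre-layer DataStore is the forward input x, and this conn's GSyn receives the
   // result dx; mMultiplier is the alpha scale and scalingFactor is beta.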
   cudnnStatus_t status = cudnnPoolingBackward(
         (cudnnHandle_t)device->getCudnnHandle(),
         mPoolingDescriptor,
         &mMultiplier,
         mOrigConnGSynDescriptor,
         cudnnOrigConnGSynPointer,
         mDataStoreDescriptor,
         cudnnDataStorePointer,
         mOrigConnDataStoreDescriptor,
         cudnnOrigConnDataStorePointer,
         &scalingFactor,
         mGSynDescriptor,
         cudnnGSynPointer);
   cudnnHandleError(status, "CudaTransposeConn: backward pooling run");

   device->syncDevice();

   // Permute the CUDNN-ordered GSyn back to PV ordering
   callPermuteGSynCudnnToPVKernel(
         gridSizePost, blockSize, mGSyn, cudnnGSynPointer, nbatch, nyPost, nxPost, nf, 1, 1);
   handleCallError("CudaTransposeConn: permute GSyn CUDNN back to PV");
   return 0;
}

} /* namespace PVCuda */