CudaRecvPost.cpp
#include "CudaRecvPost.hpp"
#include "arch/cuda/cuda_util.hpp"
#include "conversions.hcu"
#include "utils/PVAssert.hpp"
#include "utils/PVLog.hpp"
#include <cmath>
#include <sstream>

#ifdef PV_USE_CUDNN
#include <cudnn.h>
#endif // PV_USE_CUDNN

namespace PVCuda {

CudaRecvPost::CudaRecvPost(CudaDevice *inDevice) : CudaKernel(inDevice) {
   kernelName = "CudaRecvPost";
}

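// The cuDNN descriptors are stored in params as void pointers so that the class
// header does not need to include cudnn.h; they are cast back to their cuDNN
// types here before being destroyed.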
CudaRecvPost::~CudaRecvPost() {
#ifdef PV_USE_CUDNN
   if (params.v_inputDescriptor) {
      cudnnTensorDescriptor_t inputDescriptor = (cudnnTensorDescriptor_t)params.v_inputDescriptor;
      cudnnDestroyTensorDescriptor(inputDescriptor);
   }
   if (params.v_filterDescriptor) {
      cudnnFilterDescriptor_t filterDescriptor = (cudnnFilterDescriptor_t)params.v_filterDescriptor;
      cudnnDestroyFilterDescriptor(filterDescriptor);
   }
   if (params.v_outputDescriptor) {
      cudnnTensorDescriptor_t outputDescriptor = (cudnnTensorDescriptor_t)params.v_outputDescriptor;
      cudnnDestroyTensorDescriptor(outputDescriptor);
   }
   if (params.v_convDescriptor) {
      cudnnConvolutionDescriptor_t convDescriptor =
            (cudnnConvolutionDescriptor_t)params.v_convDescriptor;
      cudnnDestroyConvolutionDescriptor(convDescriptor);
   }
   if (params.v_convAlgo) {
      cudnnConvolutionFwdAlgo_t *convAlgo = (cudnnConvolutionFwdAlgo_t *)params.v_convAlgo;
      delete convAlgo;
   }
   if (params.cudnn_workspace) {
      handleError(cudaFree(params.cudnn_workspace), "Freeing workspace pointer");
   }
   if (params.workspaceSize) {
      delete params.workspaceSize;
   }
#endif // PV_USE_CUDNN
}

void CudaRecvPost::setArgs(
      const int nbatch,
      const int nxRes, // num post neurons
      const int nyRes,
      const int nf,

      const int nblt, // Border of orig
      const int nbrt, // Border of orig
      const int nbdn, // Border of orig
      const int nbup, // Border of orig

      const int preNx,
      const int preNy,
      const int preNf,
      const int preNblt,
      const int preNbrt,
      const int preNbup,
      const int preNbdn,

      const int nxp,
      const int nyp,
      const int nfp,

      const float preToPostScaleX,
      const float preToPostScaleY,

      const int sy,
      const int syp,
      const int numPerStride,
      const float dt_factor,
      const int sharedWeights,

      /* long* */ CudaBuffer *startSourceExtBuf,
      /* float* */ CudaBuffer *preData,
      /* float* */ CudaBuffer *weights,
      /* float* */ CudaBuffer *postGsyn,
#ifdef PV_USE_CUDNN
      /* float* */ CudaBuffer *cudnn_preData,
      /* float* */ CudaBuffer *cudnn_weights,
      /* float* */ CudaBuffer *cudnn_gSyn,
#endif // PV_USE_CUDNN
      /* int* */ CudaBuffer *patch2datalookuptable) {
   params.nbatch = nbatch;
   params.nxRes = nxRes;
   params.nyRes = nyRes;
   params.nf = nf;

   params.nblt = nblt;
   params.nbrt = nbrt;
   params.nbdn = nbdn;
   params.nbup = nbup;

   params.preNx = preNx;
   params.preNy = preNy;
   params.preNf = preNf;
   params.preNblt = preNblt;
   params.preNbrt = preNbrt;
   params.preNbup = preNbup;
   params.preNbdn = preNbdn;

   params.nxp = nxp;
   params.nyp = nyp;
   params.nfp = nfp;

   params.preToPostScaleX = preToPostScaleX;
   params.preToPostScaleY = preToPostScaleY;

   params.sy = sy;
   params.syp = syp;
   params.numPerStride = numPerStride;
   params.dt_factor = dt_factor;
   params.sharedWeights = sharedWeights;

   params.startSourceExtBuf = (long *)startSourceExtBuf->getPointer();
   params.preData = (float *)preData->getPointer();
   params.weights = (float *)weights->getPointer();
   params.postGsyn = (float *)postGsyn->getPointer();
#ifdef PV_USE_CUDNN
   params.cudnn_weights = (float *)cudnn_weights->getPointer();
   params.cudnn_preData = (float *)cudnn_preData->getPointer();
   params.cudnn_gSyn = (float *)cudnn_gSyn->getPointer();
#endif // PV_USE_CUDNN
   params.patch2datalookuptable = (int *)patch2datalookuptable->getPointer();

   params.warpSize = device->get_warp_size();

#ifdef PV_USE_CUDNN
   // Estimate how much GPU memory is available for the cuDNN workspace. Do not
   // add to the device's count, since more than one kernel may need workspace memory.
   size_t workspaceMem = device->getMemory() / device->getNumConvKernels();

   int strideX, strideY;
   int actualXBorder, actualYBorder;
   pvAssert(params.preNblt == params.preNbrt);
   pvAssert(params.preNbup == params.preNbdn);
   // One-to-many case
   if (preToPostScaleX < 1) {
      float fmanyScale = (float)1 / params.preToPostScaleX;
      // Make sure manyScale is an actual integer
      pvAssert(std::ceil(fmanyScale) == fmanyScale);
      params.manyScaleX = fmanyScale;
      fmanyScale = (float)1 / params.preToPostScaleY;
      pvAssert(std::ceil(fmanyScale) == fmanyScale);
      params.manyScaleY = fmanyScale;
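      // For example, preToPostScaleX == 0.25 gives manyScaleX == 4: each pre
      // neuron drives a 4x4 block of post neurons (with the same scale in y),
      // and the many factor shows up below as extra output feature maps in the
      // filter descriptor.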
      strideX = 1;
      strideY = 1;

      // Patch sizes must be an odd multiple of many
      if (nxp % 2 == 0 || nyp % 2 == 0) {
         ErrorLog().printf(
               "cuDNN: A one-to-many connection with cuDNN requires the patch size (%d, "
               "%d) to be an odd multiple of many (%d, %d)\n",
               nxp * params.manyScaleX,
               nyp * params.manyScaleY,
               params.manyScaleX,
               params.manyScaleY);
      }

      // The pre-layer border may have been enlarged by other connections. Compute
      // the border this connection actually needs, i.e. what the border would be
      // if this were a one-to-one connection, so the difference from the current
      // receive border can be cropped off.
      actualXBorder = params.nxp / 2;
      actualYBorder = params.nyp / 2;
   }
   // Many-to-one or one-to-one case
   else {
      params.manyScaleX = 1;
      params.manyScaleY = 1;
      pvAssert(std::ceil(preToPostScaleX) == preToPostScaleX);
      pvAssert(std::ceil(preToPostScaleY) == preToPostScaleY);
      strideX = preToPostScaleX;
      strideY = preToPostScaleY;

      // As above, compute the border this connection actually needs so that any
      // extra border added by other connections can be cropped off.
      actualXBorder = (params.nxp - params.preToPostScaleX) / 2;
      actualYBorder = (params.nyp - params.preToPostScaleY) / 2;
   }

   // diffX and diffY are the (positive) number of border cells cropped on each side
   params.diffX = params.preNblt - actualXBorder;
   params.diffY = params.preNbup - actualYBorder;
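   // These offsets shrink the cuDNN input dimensions below (e.g. width becomes
   // preNx + preNblt + preNbrt - 2 * diffX) and tell the permute kernel how many
   // border cells to skip when copying the pre-layer data.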

   // Set up pre descriptor
   cudnnTensorDescriptor_t inputDescriptor;
   cudnnStatus_t status = cudnnCreateTensorDescriptor(&inputDescriptor);
   cudnnHandleError(status, "Create input tensor descriptor");

   status = cudnnSetTensor4dDescriptor(
         inputDescriptor,
         CUDNN_TENSOR_NCHW,
         CUDNN_DATA_FLOAT,
         nbatch, // Number of images
         params.preNf, // Number of feature maps per image
         params.preNy + params.preNbup + params.preNbdn
               - 2 * params.diffY, // Height of each feature map
         params.preNx + params.preNblt + params.preNbrt
               - 2 * params.diffX); // Width of each feature map
   if (status != CUDNN_STATUS_SUCCESS) {
      switch (status) {
         case CUDNN_STATUS_BAD_PARAM: Fatal().printf("cuDNN bad parameter\n"); break;
         default: Fatal().printf("cuDNN unknown error code %d\n", status);
      }
      pvAssert(0);
   }
   cudnnHandleError(status, "Set input tensor descriptor");
   params.v_inputDescriptor = (void *)inputDescriptor;

   // Set up filter descriptor
   cudnnFilterDescriptor_t filterDescriptor;
   status = cudnnCreateFilterDescriptor(&filterDescriptor);
   cudnnHandleError(status, "Create filter tensor descriptor");
#if CUDNN_MAJOR >= 5
   status = cudnnSetFilter4dDescriptor(
         filterDescriptor,
         CUDNN_DATA_FLOAT,
         CUDNN_TENSOR_NCHW,
         params.nf * params.manyScaleX * params.manyScaleY, // Number of output feature maps.
         // For one-to-many, output feature maps are repeated for each kernel
         params.nfp, // Number of input feature maps
         params.nyp, // Height of each filter
         params.nxp); // Width of each filter
#elif CUDNN_MAJOR == 4
   status = cudnnSetFilter4dDescriptor(
         filterDescriptor,
         CUDNN_DATA_FLOAT,
         params.nf * params.manyScaleX * params.manyScaleY, // Number of output feature maps.
         // For one-to-many, output feature maps are repeated for each kernel
         params.nfp, // Number of input feature maps
         params.nyp, // Height of each filter
         params.nxp); // Width of each filter
#else
#error cuDNN version 4 or greater is required.
#endif
   cudnnHandleError(status, "Set filter tensor descriptor");
   params.v_filterDescriptor = (void *)filterDescriptor;

   // Set convolution descriptor
   cudnnConvolutionDescriptor_t convDescriptor;
   status = cudnnCreateConvolutionDescriptor(&convDescriptor);
   cudnnHandleError(status, "Create convolution tensor descriptor");
   status = cudnnSetConvolution2dDescriptor(
         convDescriptor,
         0,
         0, // zero-padding height and width
         strideY, // Vertical filter stride
         strideX, // Horizontal filter stride
         1,
         1, // upscale the input in x/y direction
         CUDNN_CONVOLUTION
#if CUDNN_MAJOR >= 6
         ,
         CUDNN_DATA_FLOAT
#endif
         );
   cudnnHandleError(status, "Set convolution tensor descriptor");
   params.v_convDescriptor = (void *)convDescriptor;
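   // Note: CUDNN_CONVOLUTION applies the mathematically flipped kernel, as
   // opposed to CUDNN_CROSS_CORRELATION, which would apply it unflipped. No
   // zero-padding is used because the cropped pre-layer borders already supply
   // the halo region the filters need.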

   // Query the output layout and check it against the PV layout
   int out_n, out_c, out_h, out_w;
   status = cudnnGetConvolution2dForwardOutputDim(
         convDescriptor,
         inputDescriptor,
         filterDescriptor,
         &out_n, // num images
         &out_c, // num output features
         &out_h, // output height
         &out_w); // output width
   cudnnHandleError(status, "Get output tensor descriptor");

   // Make sure dimensions match up with the PV layer
   if (out_n != nbatch || out_h != nyRes / params.manyScaleY || out_w != nxRes / params.manyScaleX
       || out_c != nf * params.manyScaleX * params.manyScaleY) {
      std::stringstream errmsg("");
      errmsg << "cuDNN: Dimensions don't match:\n";
      errmsg << "Dimensions of output tensor (n, y, x, f): " << out_n << ", " << out_h << ", "
             << out_w << ", " << out_c << "\n";
      errmsg << "Scaled dimensions of output PV layer (n, y, x, f): " << nbatch << ", "
             << nyRes / params.manyScaleY << ", " << nxRes / params.manyScaleX << ", "
             << nf * params.manyScaleX * params.manyScaleY << "\n";
      errmsg << "Actual dimensions of output PV layer (n, y, x, f): " << nbatch << ", " << nyRes
             << ", " << nxRes << ", " << nf << "\n";
      Fatal() << errmsg.str() << std::endl;
   }

   // Set up output descriptor
   cudnnTensorDescriptor_t outputDescriptor;
   status = cudnnCreateTensorDescriptor(&outputDescriptor);
   cudnnHandleError(status, "Create output tensor descriptor");
   status = cudnnSetTensor4dDescriptor(
         outputDescriptor,
         CUDNN_TENSOR_NCHW,
         CUDNN_DATA_FLOAT,
         nbatch, // Number of images
         nf * params.manyScaleX * params.manyScaleY, // Number of feature maps per image
         nyRes / params.manyScaleY, // ny restricted
         nxRes / params.manyScaleX); // nx restricted
   cudnnHandleError(status, "Set output tensor descriptor");
   params.v_outputDescriptor = (void *)outputDescriptor;

   // Calculate and set the best forward convolution algorithm to use
   cudnnHandle_t handle = (cudnnHandle_t)device->getCudnnHandle();
   cudnnConvolutionFwdAlgo_t *convAlgo = new cudnnConvolutionFwdAlgo_t();

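   // CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT asks cuDNN for the fastest
   // algorithm whose workspace fits within the workspaceMem budget computed above.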
   status = cudnnGetConvolutionForwardAlgorithm(
         handle,
         inputDescriptor,
         filterDescriptor,
         convDescriptor,
         outputDescriptor,
         CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
         workspaceMem,
         convAlgo);
   cudnnHandleError(status, "Get convolution forward algorithm");
   params.v_convAlgo = (void *)convAlgo;

   // Based on the chosen algorithm, query the required workspace size
   size_t *temp = new size_t();
   status = cudnnGetConvolutionForwardWorkspaceSize(
         handle,
         inputDescriptor,
         filterDescriptor,
         convDescriptor,
         outputDescriptor,
         *convAlgo,
         temp);
   params.workspaceSize = temp;
   cudnnHandleError(status, "Get convolution forward workspace size");

   // Allocate workspace based on size
   handleError(
         cudaMalloc(&params.cudnn_workspace, *params.workspaceSize), "Cudnn workspace cudaMalloc");

#endif // PV_USE_CUDNN

   setArgsFlag();
}

int CudaRecvPost::do_run() {

#ifdef PV_USE_CUDNN
   cudnnHandle_t handle = (cudnnHandle_t)device->getCudnnHandle();
   cudnnTensorDescriptor_t inputDescriptor = (cudnnTensorDescriptor_t)params.v_inputDescriptor;
   cudnnFilterDescriptor_t filterDescriptor = (cudnnFilterDescriptor_t)params.v_filterDescriptor;
   cudnnTensorDescriptor_t outputDescriptor = (cudnnTensorDescriptor_t)params.v_outputDescriptor;
   cudnnConvolutionDescriptor_t convDescriptor =
         (cudnnConvolutionDescriptor_t)params.v_convDescriptor;
   cudnnConvolutionFwdAlgo_t *convAlgo = (cudnnConvolutionFwdAlgo_t *)params.v_convAlgo;

   float scalingFactor = 1;

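   // cudnnConvolutionForward computes y = alpha * conv(x, w) + beta * y. Passing
   // scalingFactor (1) for both alpha and beta accumulates the convolution into
   // the existing gSyn values instead of overwriting them.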
   cudnnStatus_t status = cudnnConvolutionForward(
         handle,
         &(scalingFactor),
         inputDescriptor,
         params.cudnn_preData,
         filterDescriptor,
         params.cudnn_weights,
         convDescriptor,
         *convAlgo,
         params.cudnn_workspace,
         *params.workspaceSize,
         &(scalingFactor),
         outputDescriptor,
         params.cudnn_gSyn);

   cudnnHandleError(status, "Convolution run");
#endif // PV_USE_CUDNN

   return 0;
}

#ifdef PV_USE_CUDNN
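// PetaVision stores activity with features varying fastest (batch, y, x, f),
// while the cuDNN descriptors above use NCHW (batch, f, y, x). This kernel
// reorders the extended pre-layer datastore into cudnn_preData, dropping
// diffX/diffY border cells on each side.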
void CudaRecvPost::permuteDatastorePVToCudnn() {
   // Extended pre activity
   int ny = params.preNy + params.preNbup + params.preNbdn;
   int nx = params.preNx + params.preNblt + params.preNbrt;
   int nf = params.preNf;
   int nbatch = params.nbatch;

   // Calculate grid and work size
   int numNeurons = nbatch * ny * nx * nf;
   int blockSize = device->get_max_threads();
   // Round up so every neuron is covered
   int gridSize = std::ceil((float)numNeurons / blockSize);

   device->syncDevice();

   callPermuteDatastorePVToCudnnKernel(
         gridSize,
         blockSize,
         params.preData,
         params.cudnn_preData,
         nbatch,
         ny,
         nx,
         nf,
         params.diffX,
         params.diffY);
   handleCallError("Permute PV to CUDNN");
}

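// Copies one channel of the post-layer GSyn buffer into cuDNN's NCHW layout.
// postGsyn holds nbatch * ny * nx * nf values per channel, so the channel index
// selects the offset of the patch head.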
void CudaRecvPost::permuteGSynPVToCudnn(int channel) {
   // Restricted post activity
   int ny = params.nyRes;
   int nx = params.nxRes;
   int nf = params.nf;
   int nbatch = params.nbatch;

   // Calculate grid and work size
   int numNeurons = nbatch * ny * nx * nf;
   float *gSynPatchHead = &(params.postGsyn[numNeurons * channel]);

   int blockSize = device->get_max_threads();
   // Round up so every neuron is covered
   int gridSize = std::ceil((float)numNeurons / (float)blockSize);
   callPermuteGSynPVToCudnnKernel(
         gridSize,
         blockSize,
         gSynPatchHead,
         params.cudnn_gSyn,
         nbatch,
         ny,
         nx,
         nf,
         params.manyScaleX,
         params.manyScaleY);
   handleCallError("Permute GSyn PV to CUDNN");
}

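// Inverse of permuteGSynPVToCudnn: writes the accumulated cuDNN result back
// into the PV-ordered GSyn channel.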
void CudaRecvPost::permuteGSynCudnnToPV(int channel) {
   // Restricted post activity
   int ny = params.nyRes;
   int nx = params.nxRes;
   int nf = params.nf;
   int nbatch = params.nbatch;

   // Calculate grid and work size
   int numNeurons = nbatch * ny * nx * nf;
   float *gSynPatchHead = &(params.postGsyn[numNeurons * channel]);

   int blockSize = device->get_max_threads();
   // Round up so every neuron is covered
   int gridSize = std::ceil((float)numNeurons / blockSize);
   callPermuteGSynCudnnToPVKernel(
         gridSize,
         blockSize,
         gSynPatchHead,
         params.cudnn_gSyn,
         nbatch,
         ny,
         nx,
         nf,
         params.manyScaleX,
         params.manyScaleY);
   handleCallError("Permute GSyn CUDNN to PV");
}

#endif // PV_USE_CUDNN

} // namespace PVCuda