1 #include "CudaRecvPost.hpp" 2 #include "arch/cuda/cuda_util.hpp" 3 #include "conversions.hcu" 4 #include "utils/PVAssert.hpp" 5 #include "utils/PVLog.hpp" 14 #endif // PV_USE_CUDNN 16 CudaRecvPost::CudaRecvPost(CudaDevice *inDevice) : CudaKernel(inDevice) {
   kernelName = "CudaRecvPost";
}
CudaRecvPost::~CudaRecvPost() {
#ifdef PV_USE_CUDNN
   if (params.v_inputDescriptor) {
      cudnnTensorDescriptor_t inputDescriptor = (cudnnTensorDescriptor_t)params.v_inputDescriptor;
      cudnnDestroyTensorDescriptor(inputDescriptor);
   }
   if (params.v_filterDescriptor) {
      cudnnFilterDescriptor_t filterDescriptor = (cudnnFilterDescriptor_t)params.v_filterDescriptor;
      cudnnDestroyFilterDescriptor(filterDescriptor);
   }
   if (params.v_outputDescriptor) {
      cudnnTensorDescriptor_t outputDescriptor = (cudnnTensorDescriptor_t)params.v_outputDescriptor;
      cudnnDestroyTensorDescriptor(outputDescriptor);
   }
   if (params.v_convDescriptor) {
      cudnnConvolutionDescriptor_t convDescriptor =
            (cudnnConvolutionDescriptor_t)params.v_convDescriptor;
      cudnnDestroyConvolutionDescriptor(convDescriptor);
   }
   if (params.v_convAlgo) {
      cudnnConvolutionFwdAlgo_t *convAlgo = (cudnnConvolutionFwdAlgo_t *)params.v_convAlgo;
      delete convAlgo;
   }
   if (params.cudnn_workspace) {
      handleError(cudaFree(params.cudnn_workspace), "Freeing workspace pointer");
   }
   if (params.workspaceSize) {
      delete params.workspaceSize;
   }
#endif // PV_USE_CUDNN
}

void CudaRecvPost::setArgs(
      const int nbatch,
      const int nxRes, // num post neurons
      const int nyRes,
      const int nf,

      const int nblt, // post border
      const int nbrt,
      const int nbdn,
      const int nbup,

      const int preNx, // num pre neurons
      const int preNy,
      const int preNf,
      const int preNblt, // pre border
      const int preNbrt,
      const int preNbup,
      const int preNbdn,

      const int nxp, // patch size
      const int nyp,
      const int nfp,

      const float preToPostScaleX,
      const float preToPostScaleY,

      const int sy, // stride in restricted
      const int syp, // stride in patch
      const int numPerStride,
      const float dt_factor,
      const int sharedWeights,

      /* long* */ CudaBuffer *startSourceExtBuf,
      /* float* */ CudaBuffer *preData,
      /* float* */ CudaBuffer *weights,
      /* float* */ CudaBuffer *postGsyn,
#ifdef PV_USE_CUDNN
      /* float* */ CudaBuffer *cudnn_preData,
      /* float* */ CudaBuffer *cudnn_weights,
      /* float* */ CudaBuffer *cudnn_gSyn,
#endif // PV_USE_CUDNN
      /* int* */ CudaBuffer *patch2datalookuptable) {
   params.nbatch = nbatch;
   params.nxRes  = nxRes;
   params.nyRes  = nyRes;
   params.nf     = nf;

   params.nblt = nblt;
   params.nbrt = nbrt;
   params.nbdn = nbdn;
   params.nbup = nbup;

   params.preNx   = preNx;
   params.preNy   = preNy;
   params.preNf   = preNf;
   params.preNblt = preNblt;
   params.preNbrt = preNbrt;
   params.preNbup = preNbup;
   params.preNbdn = preNbdn;

   params.nxp = nxp;
   params.nyp = nyp;
   params.nfp = nfp;

   params.preToPostScaleX = preToPostScaleX;
   params.preToPostScaleY = preToPostScaleY;

   params.sy            = sy;
   params.syp           = syp;
   params.numPerStride  = numPerStride;
   params.dt_factor     = dt_factor;
   params.sharedWeights = sharedWeights;
   params.startSourceExtBuf = (long *)startSourceExtBuf->getPointer();
   params.preData           = (float *)preData->getPointer();
   params.weights           = (float *)weights->getPointer();
   params.postGsyn          = (float *)postGsyn->getPointer();
#ifdef PV_USE_CUDNN
   params.cudnn_weights = (float *)cudnn_weights->getPointer();
   params.cudnn_preData = (float *)cudnn_preData->getPointer();
   params.cudnn_gSyn    = (float *)cudnn_gSyn->getPointer();
#endif // PV_USE_CUDNN
   params.patch2datalookuptable = (int *)patch2datalookuptable->getPointer();

   params.warpSize = device->get_warp_size();
#ifdef PV_USE_CUDNN
   size_t workspaceMem = device->getMemory() / device->getNumConvKernels();
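   // The rest of setArgs() builds the cuDNN descriptors, selects a forward
   // algorithm, and allocates its workspace once, up front, so that do_run()
   // only has to launch the convolution. Splitting device memory evenly across
   // the convolution kernels is a budgeting heuristic for the workspace limit,
   // not a measured per-kernel requirement.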
   int strideX, strideY;
   int actualXBorder, actualYBorder;
   pvAssert(params.preNblt == params.preNbrt);
   pvAssert(params.preNbup == params.preNbdn);
   // One-to-many case
   if (preToPostScaleX < 1) {
      float fmanyScale = (float)1 / params.preToPostScaleX;
      // Make sure the many scale is an integer
      pvAssert(std::ceil(fmanyScale) == fmanyScale);
      params.manyScaleX = fmanyScale;
      fmanyScale        = (float)1 / params.preToPostScaleY;
      pvAssert(std::ceil(fmanyScale) == fmanyScale);
      params.manyScaleY = fmanyScale;
      strideX           = 1;
      strideY           = 1;
      // Patch sizes must be an odd multiple of the many scale
      if (nxp % 2 == 0 || nyp % 2 == 0) {
         Fatal().printf(
               "cuDNN: A one-to-many connection run with cuDNN requires the scaled patch size "
               "(%d, %d) to be an odd multiple of the many scale (%d, %d)\n",
               nxp * params.manyScaleX,
               nyp * params.manyScaleY,
               params.manyScaleX,
               params.manyScaleY);
      }
      // Compute the border this connection actually needs (as if one-to-one)
      actualXBorder = params.nxp / 2;
      actualYBorder = params.nyp / 2;
   }
   // Many-to-one or one-to-one case
   else {
      params.manyScaleX = 1;
      params.manyScaleY = 1;
      pvAssert(std::ceil(preToPostScaleX) == preToPostScaleX);
      pvAssert(std::ceil(preToPostScaleY) == preToPostScaleY);
      strideX = preToPostScaleX;
      strideY = preToPostScaleY;
      actualXBorder = (params.nxp - params.preToPostScaleX) / 2;
      actualYBorder = (params.nyp - params.preToPostScaleY) / 2;
   }
   params.diffX = params.preNblt - actualXBorder;
   params.diffY = params.preNbup - actualYBorder;
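   // Other connections may have grown the presynaptic halo beyond what this
   // connection needs, so diffX/diffY record the surplus border on each side;
   // the input tensor descriptor below shrinks each extended dimension by
   // 2 * diff so that cuDNN sees only the cropped layer.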
   // Set up the input (presynaptic) tensor descriptor
   cudnnTensorDescriptor_t inputDescriptor;
   cudnnStatus_t status = cudnnCreateTensorDescriptor(&inputDescriptor);
   cudnnHandleError(status, "Create input tensor descriptor");
   status = cudnnSetTensor4dDescriptor(
         inputDescriptor,
         CUDNN_TENSOR_NCHW,
         CUDNN_DATA_FLOAT,
         params.nbatch, // number of images
         params.preNf, // features per image
         params.preNy + params.preNbup + params.preNbdn
               - 2 * params.diffY, // height of each feature map
         params.preNx + params.preNblt + params.preNbrt
               - 2 * params.diffX); // width of each feature map
   if (status != CUDNN_STATUS_SUCCESS) {
      switch (status) {
         case CUDNN_STATUS_BAD_PARAM: Fatal().printf("cuDNN bad parameter\n"); break;
         default: Fatal().printf("cuDNN unknown error code %d\n", status);
      }
   }
   cudnnHandleError(status, "Set input tensor descriptor");
   params.v_inputDescriptor = (void *)inputDescriptor;
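   // The descriptors are stashed in params as void pointers (the v_ prefix),
   // presumably so the params struct can be declared without cuDNN types when
   // PV_USE_CUDNN is off; they are cast back in do_run() and the destructor.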
   // Set up the filter descriptor
   cudnnFilterDescriptor_t filterDescriptor;
   status = cudnnCreateFilterDescriptor(&filterDescriptor);
   cudnnHandleError(status, "Create filter tensor descriptor");
#if CUDNN_MAJOR >= 5
   status = cudnnSetFilter4dDescriptor(
         filterDescriptor,
         CUDNN_DATA_FLOAT,
         CUDNN_TENSOR_NCHW,
         params.nf * params.manyScaleX * params.manyScaleY, // output feature maps
         params.nfp, // input feature maps
         params.nyp, // filter height
         params.nxp); // filter width
#elif CUDNN_MAJOR == 4
   status = cudnnSetFilter4dDescriptor(
         filterDescriptor,
         CUDNN_DATA_FLOAT,
         params.nf * params.manyScaleX * params.manyScaleY, // output feature maps
         params.nfp, // input feature maps
         params.nyp, // filter height
         params.nxp); // filter width
#else
#error The cuDNN version is required to be either v4 or greater.
#endif
   cudnnHandleError(status, "Set filter tensor descriptor");
   params.v_filterDescriptor = (void *)filterDescriptor;
   // Set up the convolution descriptor
   cudnnConvolutionDescriptor_t convDescriptor;
   status = cudnnCreateConvolutionDescriptor(&convDescriptor);
   cudnnHandleError(status, "Create convolution tensor descriptor");
   status = cudnnSetConvolution2dDescriptor(
         convDescriptor,
         0,
         0, // zero-padding height and width
         strideY, // vertical filter stride
         strideX, // horizontal filter stride
         1,
         1, // dilation in y and x
#if CUDNN_MAJOR >= 6
         CUDNN_CONVOLUTION,
         CUDNN_DATA_FLOAT);
#else
         CUDNN_CONVOLUTION);
#endif
   cudnnHandleError(status, "Set convolution tensor descriptor");
   params.v_convDescriptor = (void *)convDescriptor;
   // Query the output dimensions cuDNN will produce and compare with PV's
   int out_n, out_c, out_h, out_w;
   status = cudnnGetConvolution2dForwardOutputDim(
         convDescriptor,
         inputDescriptor,
         filterDescriptor,
         &out_n, // batch size
         &out_c, // output features
         &out_h, // output height
         &out_w); // output width
   cudnnHandleError(status, "Get output tensor descriptor");
   if (out_n != nbatch || out_h != nyRes / params.manyScaleY || out_w != nxRes / params.manyScaleX
       || out_c != nf * params.manyScaleX * params.manyScaleY) {
      std::stringstream errmsg("");
      errmsg << "CUDNN:: Dimensions don't match: \n";
      errmsg << "Dimensions of output tensor (n, y, x, f): " << out_n << ", " << out_h << ", "
             << out_w << ", " << out_c << "\n";
      errmsg << "Scaled dimensions of output PV layer (n, y, x, f): " << nbatch << ", "
             << nyRes / params.manyScaleY << ", " << nxRes / params.manyScaleX << ", "
             << nf * params.manyScaleX * params.manyScaleY << "\n";
      errmsg << "Actual dimensions of output PV layer (n, y, x, f): " << nbatch << ", " << nyRes
             << ", " << nxRes << ", " << nf << "\n";
      Fatal() << errmsg.str() << std::endl;
   }
   // Set up the output (postsynaptic GSyn) tensor descriptor
   cudnnTensorDescriptor_t outputDescriptor;
   status = cudnnCreateTensorDescriptor(&outputDescriptor);
   cudnnHandleError(status, "Create output tensor descriptor");
   status = cudnnSetTensor4dDescriptor(
         outputDescriptor,
         CUDNN_TENSOR_NCHW,
         CUDNN_DATA_FLOAT,
         nbatch, // number of images
         nf * params.manyScaleX * params.manyScaleY, // features per image
         nyRes / params.manyScaleY, // restricted ny
         nxRes / params.manyScaleX); // restricted nx
   cudnnHandleError(status, "Set output tensor descriptor");
   params.v_outputDescriptor = (void *)outputDescriptor;
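   // With CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, cuDNN returns the
   // fastest forward algorithm whose workspace fits within the supplied limit
   // (workspaceMem, computed above).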
   // Select the forward convolution algorithm
   cudnnHandle_t handle                = (cudnnHandle_t)device->getCudnnHandle();
   cudnnConvolutionFwdAlgo_t *convAlgo = new cudnnConvolutionFwdAlgo_t();

   status = cudnnGetConvolutionForwardAlgorithm(
         handle,
         inputDescriptor,
         filterDescriptor,
         convDescriptor,
         outputDescriptor,
         CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
         workspaceMem,
         convAlgo);
   cudnnHandleError(status, "Get convolution forward algorithm");
   params.v_convAlgo = (void *)convAlgo;
   // Allocate the workspace the chosen algorithm needs
   size_t *temp = new size_t();
   status       = cudnnGetConvolutionForwardWorkspaceSize(
         handle,
         inputDescriptor,
         filterDescriptor,
         convDescriptor,
         outputDescriptor,
         *convAlgo,
         temp);
   params.workspaceSize = temp;
   cudnnHandleError(status, "Get convolution forward workspace size");

   handleError(
         cudaMalloc(&params.cudnn_workspace, *params.workspaceSize),
         "Cudnn workspace cudaMalloc");
#endif // PV_USE_CUDNN
}

int CudaRecvPost::do_run() {
#ifdef PV_USE_CUDNN
   cudnnHandle_t handle                     = (cudnnHandle_t)device->getCudnnHandle();
   cudnnTensorDescriptor_t inputDescriptor  = (cudnnTensorDescriptor_t)params.v_inputDescriptor;
   cudnnFilterDescriptor_t filterDescriptor = (cudnnFilterDescriptor_t)params.v_filterDescriptor;
   cudnnTensorDescriptor_t outputDescriptor = (cudnnTensorDescriptor_t)params.v_outputDescriptor;
   cudnnConvolutionDescriptor_t convDescriptor =
         (cudnnConvolutionDescriptor_t)params.v_convDescriptor;
   cudnnConvolutionFwdAlgo_t *convAlgo = (cudnnConvolutionFwdAlgo_t *)params.v_convAlgo;
   float scalingFactor = 1;
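   // cudnnConvolutionForward computes y = alpha * conv(x, w) + beta * y. The
   // same scalingFactor (1) is passed for both alpha and beta below, so the
   // convolution result accumulates into the existing contents of cudnn_gSyn
   // rather than overwriting them.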
   cudnnStatus_t status = cudnnConvolutionForward(
         handle,
         &scalingFactor, // alpha
         inputDescriptor,
         params.cudnn_preData,
         filterDescriptor,
         params.cudnn_weights,
         convDescriptor,
         *convAlgo,
         params.cudnn_workspace,
         *params.workspaceSize,
         &scalingFactor, // beta
         outputDescriptor,
         params.cudnn_gSyn);
   cudnnHandleError(status, "Convolution run");
#endif // PV_USE_CUDNN
   return 0;
}

#ifdef PV_USE_CUDNN
void CudaRecvPost::permuteDatastorePVToCudnn() {
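   // PetaVision buffers are laid out with the feature index varying fastest,
   // i.e. (batch, y, x, f), while the cuDNN descriptors above use NCHW
   // (batch, f, y, x). The permute kernels below repack buffers between the
   // two layouts; this one also applies the diffX/diffY crop to the extended
   // presynaptic datastore.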
   // Extended presynaptic activity dimensions
   int ny     = params.preNy + params.preNbup + params.preNbdn;
   int nx     = params.preNx + params.preNblt + params.preNbrt;
   int nf     = params.preNf;
   int nbatch = params.nbatch;
   // Round the grid up so every neuron is covered
   int numNeurons = nbatch * ny * nx * nf;
   int blockSize  = device->get_max_threads();
   int gridSize   = std::ceil((float)numNeurons / (float)blockSize);
   device->syncDevice();
   callPermuteDatastorePVToCudnnKernel(
         gridSize,
         blockSize,
         params.preData,
         params.cudnn_preData,
         nbatch,
         ny,
         nx,
         nf,
         params.diffX,
         params.diffY);
   handleCallError("Permute PV to CUDNN");
}

void CudaRecvPost::permuteGSynPVToCudnn(int channel) {
   // Restricted postsynaptic dimensions
   int ny     = params.nyRes;
   int nx     = params.nxRes;
   int nf     = params.nf;
   int nbatch = params.nbatch;

   int numNeurons = nbatch * ny * nx * nf;
   float *gSynPatchHead = &(params.postGsyn[numNeurons * channel]);
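   // postGsyn holds one (nbatch * ny * nx * nf)-element block per channel, so
   // offsetting by numNeurons * channel selects this channel's block.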
   int blockSize = device->get_max_threads();
   int gridSize  = std::ceil((float)numNeurons / (float)blockSize);
   callPermuteGSynPVToCudnnKernel(
         gridSize,
         blockSize,
         gSynPatchHead,
         params.cudnn_gSyn,
         nbatch,
         ny,
         nx,
         nf,
         params.manyScaleX,
         params.manyScaleY);
   handleCallError("Permute GSyn PV to CUDNN");
}

void CudaRecvPost::permuteGSynCudnnToPV(int channel) {
   int ny     = params.nyRes;
   int nx     = params.nxRes;
   int nf     = params.nf;
   int nbatch = params.nbatch;

   int numNeurons = nbatch * ny * nx * nf;
   float *gSynPatchHead = &(params.postGsyn[numNeurons * channel]);
   int blockSize = device->get_max_threads();
   int gridSize  = std::ceil((float)numNeurons / (float)blockSize);
   callPermuteGSynCudnnToPVKernel(
         gridSize,
         blockSize,
         gSynPatchHead,
         params.cudnn_gSyn,
         nbatch,
         ny,
         nx,
         nf,
         params.manyScaleX,
         params.manyScaleY);
   handleCallError("Permute GSyn CUDNN to PV");
}

#endif // PV_USE_CUDNN