1 #include "CudaRecvPost.hpp"     2 #include "arch/cuda/cuda_util.hpp"     3 #include "conversions.hcu"     4 #include "utils/PVAssert.hpp"     5 #include "utils/PVLog.hpp"    14 #endif // PV_USE_CUDNN    16 CudaRecvPost::CudaRecvPost(CudaDevice *inDevice) : CudaKernel(inDevice) {
    17    kernelName = 
"CudaRecvPost";
    20 CudaRecvPost::~CudaRecvPost() {
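   // Tear down the cuDNN state created in setArgs(); each descriptor was stashed
   // in params as a void pointer, so cast back to its cuDNN type before destroying it.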
   if (params.v_inputDescriptor) {
      cudnnTensorDescriptor_t inputDescriptor = (cudnnTensorDescriptor_t)params.v_inputDescriptor;
      cudnnDestroyTensorDescriptor(inputDescriptor);
   }
   if (params.v_filterDescriptor) {
      cudnnFilterDescriptor_t filterDescriptor = (cudnnFilterDescriptor_t)params.v_filterDescriptor;
      cudnnDestroyFilterDescriptor(filterDescriptor);
   }
   if (params.v_outputDescriptor) {
      cudnnTensorDescriptor_t outputDescriptor = (cudnnTensorDescriptor_t)params.v_outputDescriptor;
      cudnnDestroyTensorDescriptor(outputDescriptor);
   }
   if (params.v_convDescriptor) {
      cudnnConvolutionDescriptor_t convDescriptor =
            (cudnnConvolutionDescriptor_t)params.v_convDescriptor;
      cudnnDestroyConvolutionDescriptor(convDescriptor);
   }
   if (params.v_convAlgo) {
      cudnnConvolutionFwdAlgo_t *convAlgo = (cudnnConvolutionFwdAlgo_t *)params.v_convAlgo;
      delete convAlgo;
   }
   if (params.cudnn_workspace) {
      handleError(cudaFree(params.cudnn_workspace), "Freeing workspace pointer");
   }
   if (params.workspaceSize) {
      delete params.workspaceSize;
   }
#endif // PV_USE_CUDNN
}

void CudaRecvPost::setArgs(
      const int nbatch,
      const int nxRes, // num post neurons
      const int nyRes,
      const int nf,

      const int nblt, // post border sizes
      const int nbrt,
      const int nbdn,
      const int nbup,

      const int preNx,
      const int preNy,
      const int preNf,
      const int preNblt,
      const int preNbrt,
      const int preNbup,
      const int preNbdn,

      const int nxp,
      const int nyp,
      const int nfp,

      const float preToPostScaleX,
      const float preToPostScaleY,

      const int sy,
      const int syp,
      const int numPerStride,
      const float dt_factor,
      const int sharedWeights,

      /* long* */ CudaBuffer *startSourceExtBuf,
      /* float* */ CudaBuffer *preData,
      /* float* */ CudaBuffer *weights,
      /* float* */ CudaBuffer *postGsyn,
#ifdef PV_USE_CUDNN
      /* float* */ CudaBuffer *cudnn_preData,
      /* float* */ CudaBuffer *cudnn_weights,
      /* float* */ CudaBuffer *cudnn_gSyn,
#endif // PV_USE_CUDNN
      /* int* */ CudaBuffer *patch2datalookuptable) {
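   // Copy the geometry arguments into the params struct that the CUDA kernels read.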
   params.nbatch = nbatch;
   params.nxRes  = nxRes;
   params.nyRes  = nyRes;
   params.nf     = nf;

   params.nblt = nblt;
   params.nbrt = nbrt;
   params.nbdn = nbdn;
   params.nbup = nbup;

   params.preNx   = preNx;
   params.preNy   = preNy;
   params.preNf   = preNf;
   params.preNblt = preNblt;
   params.preNbrt = preNbrt;
   params.preNbup = preNbup;
   params.preNbdn = preNbdn;

   params.nxp = nxp;
   params.nyp = nyp;
   params.nfp = nfp;

   params.preToPostScaleX = preToPostScaleX;
   params.preToPostScaleY = preToPostScaleY;

   params.sy            = sy;
   params.syp           = syp;
   params.numPerStride  = numPerStride;
   params.dt_factor     = dt_factor;
   params.sharedWeights = sharedWeights;
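
   // Grab the raw device pointers out of the CudaBuffer wrappers; the buffers
   // themselves remain owned by the caller.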
   params.startSourceExtBuf = (long *)startSourceExtBuf->getPointer();
   params.preData           = (float *)preData->getPointer();
   params.weights           = (float *)weights->getPointer();
   params.postGsyn          = (float *)postGsyn->getPointer();
#ifdef PV_USE_CUDNN
   params.cudnn_weights = (float *)cudnn_weights->getPointer();
   params.cudnn_preData = (float *)cudnn_preData->getPointer();
   params.cudnn_gSyn    = (float *)cudnn_gSyn->getPointer();
#endif // PV_USE_CUDNN
   params.patch2datalookuptable = (int *)patch2datalookuptable->getPointer();
   params.warpSize = device->get_warp_size();

#ifdef PV_USE_CUDNN
   // Estimate how much device memory is available for this kernel's cuDNN
   // workspace; split the total among all convolution kernels on the device.
   size_t workspaceMem = device->getMemory() / device->getNumConvKernels();

   int strideX, strideY;
   int actualXBorder, actualYBorder;
   pvAssert(params.preNblt == params.preNbrt);
   pvAssert(params.preNbup == params.preNbdn);
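   // The cuDNN path requires the pre-layer border to be symmetric in each
   // direction, which the asserts above enforce.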
   if (preToPostScaleX < 1) {
      // One-to-many connection: each pre neuron drives several post neurons.
      float fmanyScale = (float)1 / params.preToPostScaleX;
      // Make sure manyScale is an actual integer
      pvAssert(std::ceil(fmanyScale) == fmanyScale);
      params.manyScaleX = fmanyScale;
      fmanyScale        = (float)1 / params.preToPostScaleY;
      pvAssert(std::ceil(fmanyScale) == fmanyScale);
      params.manyScaleY = fmanyScale;
      strideX           = 1;
      strideY           = 1;

      // Patch sizes must be an odd multiple of the many-scale factor.
      if (nxp % 2 == 0 || nyp % 2 == 0) {
         Fatal().printf(
               "cuDNN: Running on a one to many connection with CUDNN must have patch size (%d, "
               "%d) be an odd multiple of many (%d, %d)\n",
               nxp * params.manyScaleX,
               nyp * params.manyScaleY,
               params.manyScaleX,
               params.manyScaleY);
      }
      // The pre border may have been grown by other connections; compute the
      // border this connection actually needs so the offset can be found later.
      actualXBorder = params.nxp / 2;
      actualYBorder = params.nyp / 2;
   }
   else {
      // Many-to-one or one-to-one connection: stride through the pre layer.
      params.manyScaleX = 1;
      params.manyScaleY = 1;
      pvAssert(std::ceil(preToPostScaleX) == preToPostScaleX);
      pvAssert(std::ceil(preToPostScaleY) == preToPostScaleY);
      strideX = preToPostScaleX;
      strideY = preToPostScaleY;
      actualXBorder = (params.nxp - params.preToPostScaleX) / 2;
      actualYBorder = (params.nyp - params.preToPostScaleY) / 2;
   }

   // diffX/diffY: physical border size minus the border this connection needs.
   params.diffX = params.preNblt - actualXBorder;
   params.diffY = params.preNbup - actualYBorder;

   // Set up the input (pre-layer) tensor descriptor.
   cudnnTensorDescriptor_t inputDescriptor;
   cudnnStatus_t status = cudnnCreateTensorDescriptor(&inputDescriptor);
   cudnnHandleError(status, "Create input tensor descriptor");
   status = cudnnSetTensor4dDescriptor(
         inputDescriptor,
         CUDNN_TENSOR_NCHW,
         CUDNN_DATA_FLOAT,
         nbatch, // number of images
         params.preNf, // feature maps per image
         params.preNy + params.preNbup + params.preNbdn, // height of each feature map
         params.preNx + params.preNblt + params.preNbrt); // width of each feature map
   if (status != CUDNN_STATUS_SUCCESS) {
      switch (status) {
         case CUDNN_STATUS_BAD_PARAM: Fatal().printf("cuDNN bad parameter\n"); break;
         default: Fatal().printf("cuDNN unknown error code %d\n", status);
      }
   }
   cudnnHandleError(status, "Set input tensor descriptor");
   params.v_inputDescriptor = (void *)inputDescriptor;
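   // The descriptor is stashed as a void* so the params struct can be declared
   // without pulling cuDNN types into non-cuDNN builds.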

   // Set up the filter (weights) descriptor.
   cudnnFilterDescriptor_t filterDescriptor;
   status = cudnnCreateFilterDescriptor(&filterDescriptor);
   cudnnHandleError(status, "Create filter tensor descriptor");
#if CUDNN_MAJOR >= 5
   status = cudnnSetFilter4dDescriptor(
         filterDescriptor,
         CUDNN_DATA_FLOAT,
         CUDNN_TENSOR_NCHW,
         params.nf * params.manyScaleX * params.manyScaleY, // number of output feature maps
         params.preNf, // number of input feature maps
         params.nyp, // height of each filter
         params.nxp); // width of each filter
#elif CUDNN_MAJOR == 4
   status = cudnnSetFilter4dDescriptor(
         filterDescriptor,
         CUDNN_DATA_FLOAT,
         params.nf * params.manyScaleX * params.manyScaleY, // number of output feature maps
         params.preNf, // number of input feature maps
         params.nyp, // height of each filter
         params.nxp); // width of each filter
#else
#error The cuDNN version is required to be either v4 or greater.
#endif
"Set filter tensor descriptor");
   253    params.v_filterDescriptor = (
void *)filterDescriptor;

   // Set up the convolution descriptor.
   cudnnConvolutionDescriptor_t convDescriptor;
   status = cudnnCreateConvolutionDescriptor(&convDescriptor);
   cudnnHandleError(status, "Create convolution tensor descriptor");
   status = cudnnSetConvolution2dDescriptor(
         convDescriptor,
         0,
         0, // zero-padding height and width
         strideY, // vertical filter stride
         strideX, // horizontal filter stride
         1,
         1, // upscale the input in x/y direction
         CUDNN_CONVOLUTION
#if CUDNN_MAJOR >= 6
         ,
         CUDNN_DATA_FLOAT
#endif
         );
   cudnnHandleError(status, "Set convolution tensor descriptor");
   params.v_convDescriptor = (void *)convDescriptor;

   // Query the output dimensions cuDNN will produce for this configuration.
   int out_n, out_c, out_h, out_w;
   status = cudnnGetConvolution2dForwardOutputDim(
         convDescriptor,
         inputDescriptor,
         filterDescriptor,
         &out_n, // number of images
         &out_c, // number of output features
         &out_h, // output height
         &out_w); // output width
   cudnnHandleError(status, "Get output tensor descriptor");

   // Make sure the cuDNN output dimensions match the PV post layer.
   if (out_n != nbatch || out_h != nyRes / params.manyScaleY || out_w != nxRes / params.manyScaleX
       || out_c != nf * params.manyScaleX * params.manyScaleY) {
      std::stringstream errmsg("");
      errmsg << "CUDNN:: Dimensions don't match: \n";
      errmsg << "Dimensions of output tensor (n, y, x, f): " << out_n << ", " << out_h << ", "
             << out_w << ", " << out_c << "\n";
      errmsg << "Scaled dimensions of output PV layer (n, y, x, f): " << nbatch << ", "
             << nyRes / params.manyScaleY << ", " << nxRes / params.manyScaleX << ", "
             << nf * params.manyScaleX * params.manyScaleY << "\n";
      errmsg << "Actual dimensions of output PV layer (n, y, x, f): " << nbatch << ", " << nyRes
             << ", " << nxRes << ", " << nf << "\n";
      Fatal() << errmsg.str() << std::endl;
   }

   // Set up the output (post gSyn) tensor descriptor.
   cudnnTensorDescriptor_t outputDescriptor;
   status = cudnnCreateTensorDescriptor(&outputDescriptor);
   cudnnHandleError(status, "Create output tensor descriptor");
   status = cudnnSetTensor4dDescriptor(
         outputDescriptor,
         CUDNN_TENSOR_NCHW,
         CUDNN_DATA_FLOAT,
         nbatch, // number of images
         nf * params.manyScaleX * params.manyScaleY, // feature maps per image
         nyRes / params.manyScaleY, // height of each feature map
         nxRes / params.manyScaleX); // width of each feature map
   cudnnHandleError(status, "Set output tensor descriptor");
   params.v_outputDescriptor = (void *)outputDescriptor;

   // Pick a forward convolution algorithm.
   cudnnHandle_t handle                = (cudnnHandle_t)device->getCudnnHandle();
   cudnnConvolutionFwdAlgo_t *convAlgo = new cudnnConvolutionFwdAlgo_t();
   status = cudnnGetConvolutionForwardAlgorithm(
         handle,
         inputDescriptor,
         filterDescriptor,
         convDescriptor,
         outputDescriptor,
         CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
         workspaceMem,
         convAlgo);
   cudnnHandleError(status, "Get convolution forward algorithm");
   params.v_convAlgo = (void *)convAlgo;
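   // With CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, cuDNN returns the
   // fastest algorithm whose workspace fits within workspaceMem.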

   // Ask cuDNN how much scratch space the chosen algorithm needs.
   size_t *temp = new size_t();
   status       = cudnnGetConvolutionForwardWorkspaceSize(
         handle,
         inputDescriptor,
         filterDescriptor,
         convDescriptor,
         outputDescriptor,
         *convAlgo,
         temp);
   params.workspaceSize = temp;
   cudnnHandleError(status, "Get convolution forward workspace size");

   // Allocate the workspace on the device.
   handleError(
         cudaMalloc(&params.cudnn_workspace, *params.workspaceSize), "Cudnn workspace cudaMalloc");
#endif // PV_USE_CUDNN
}

int CudaRecvPost::do_run() {
#ifdef PV_USE_CUDNN
   cudnnHandle_t handle                     = (cudnnHandle_t)device->getCudnnHandle();
   cudnnTensorDescriptor_t inputDescriptor  = (cudnnTensorDescriptor_t)params.v_inputDescriptor;
   cudnnFilterDescriptor_t filterDescriptor = (cudnnFilterDescriptor_t)params.v_filterDescriptor;
   cudnnTensorDescriptor_t outputDescriptor = (cudnnTensorDescriptor_t)params.v_outputDescriptor;
   cudnnConvolutionDescriptor_t convDescriptor =
         (cudnnConvolutionDescriptor_t)params.v_convDescriptor;
   cudnnConvolutionFwdAlgo_t *convAlgo = (cudnnConvolutionFwdAlgo_t *)params.v_convAlgo;

   float scalingFactor = 1;
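   // scalingFactor serves as both alpha and beta below; with beta == 1,
   // cudnnConvolutionForward accumulates into cudnn_gSyn instead of overwriting it.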
   cudnnStatus_t status = cudnnConvolutionForward(
         handle,
         &(scalingFactor),
         inputDescriptor,
         params.cudnn_preData,
         filterDescriptor,
         params.cudnn_weights,
         convDescriptor,
         *convAlgo,
         params.cudnn_workspace,
         *params.workspaceSize,
         &(scalingFactor),
         outputDescriptor,
         params.cudnn_gSyn);
   cudnnHandleError(status, "Convolution run");
#endif // PV_USE_CUDNN
   return 0;
}

#ifdef PV_USE_CUDNN
void CudaRecvPost::permuteDatastorePVToCudnn() {
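   // PetaVision keeps feature as the innermost index (NHWC-style), while the
   // descriptors above use CUDNN_TENSOR_NCHW; the permute kernels bridge the two.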
   // Extended pre-layer dimensions (restricted size plus borders).
   int ny     = params.preNy + params.preNbup + params.preNbdn;
   int nx     = params.preNx + params.preNblt + params.preNbrt;
   int nf     = params.preNf;
   int nbatch = params.nbatch;

   // Calculate grid and block sizes.
   int numNeurons = nbatch * ny * nx * nf;
   int blockSize  = device->get_max_threads();
   // Round the grid up so every neuron is covered.
   int gridSize = std::ceil((float)numNeurons / blockSize);

   device->syncDevice();

   callPermuteDatastorePVToCudnnKernel(
         gridSize,
         blockSize,
         params.preData,
         params.cudnn_preData,
         nbatch,
         ny,
         nx,
         nf,
         params.diffX,
         params.diffY);
   handleCallError("Permute PV to CUDNN");
}

void CudaRecvPost::permuteGSynPVToCudnn(int channel) {
   // Restricted post-layer dimensions.
   int ny     = params.nyRes;
   int nx     = params.nxRes;
   int nf     = params.nf;
   int nbatch = params.nbatch;

   int numNeurons       = nbatch * ny * nx * nf;
   float *gSynPatchHead = &(params.postGsyn[numNeurons * channel]);
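   // gSynPatchHead points at this channel's contiguous block of numNeurons
   // values within postGsyn.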

   int blockSize = device->get_max_threads();
   // Round the grid up so every neuron is covered.
   int gridSize = std::ceil((float)numNeurons / (float)blockSize);
   callPermuteGSynPVToCudnnKernel(
         gridSize,
         blockSize,
         gSynPatchHead,
         params.cudnn_gSyn,
         nbatch,
         ny,
         nx,
         nf,
         params.manyScaleX,
         params.manyScaleY);
   handleCallError("Permute GSyn PV to CUDNN");
}

void CudaRecvPost::permuteGSynCudnnToPV(int channel) {
   // Restricted post-layer dimensions.
   int ny     = params.nyRes;
   int nx     = params.nxRes;
   int nf     = params.nf;
   int nbatch = params.nbatch;

   int numNeurons       = nbatch * ny * nx * nf;
   float *gSynPatchHead = &(params.postGsyn[numNeurons * channel]);

   int blockSize = device->get_max_threads();
   // Round the grid up so every neuron is covered.
   int gridSize = std::ceil((float)numNeurons / blockSize);
   callPermuteGSynCudnnToPVKernel(
         gridSize,
         blockSize,
         gSynPatchHead,
         params.cudnn_gSyn,
         nbatch,
         ny,
         nx,
         nf,
         params.manyScaleX,
         params.manyScaleY);
   handleCallError("Permute GSyn CUDNN to PV");
}
#endif // PV_USE_CUDNN