8 #include "arch/cuda/cuda_util.hpp"     9 #include "utils/PVAssert.hpp"    11 #include <cudakernels/CudaPoolingDeliverKernel.hpp>    16 CudaPoolingDeliverKernel::CudaPoolingDeliverKernel(CudaDevice *inDevice) : CudaKernel(inDevice) {
    17    kernelName = 
"CudaPoolingDeliverKernel";
CudaPoolingDeliverKernel::~CudaPoolingDeliverKernel() {
   cudnnDestroyPoolingDescriptor(mPoolingDescriptor);
   cudnnDestroyTensorDescriptor((cudnnTensorDescriptor_t)mDataStoreDescriptor);
   cudnnDestroyTensorDescriptor((cudnnTensorDescriptor_t)mGSynDescriptor);
}
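// setArgs translates PetaVision layer geometry into the cuDNN objects the
// kernel needs: a 2d pooling descriptor built from the post-perspective patch
// size (nypPost x nxpPost) and the pre-to-post strides, plus NCHW tensor
// descriptors for the presynaptic DataStore (input) and the postsynaptic GSyn
// channel (output). It also allocates the device staging buffers
// (mCudnnDataStore, mCudnnGSyn) that hold cuDNN-ordered copies of the data.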
void CudaPoolingDeliverKernel::setArgs(
      PVLayerLoc const *preLoc,
      PVLayerLoc const *postLoc,
      int nxpPost,
      int nypPost,
      cudnnPoolingMode_t poolingMode,
      int multiplier,
      CudaBuffer *dataStoreBuffer,
      CudaBuffer *gSynBuffer,
      int channel) {
   FatalIf(
         preLoc->nx < postLoc->nx,
         "Pooling is not defined for one-to-many connections (pre->nx=%d, post->nx=%d)\n",
         preLoc->nx, postLoc->nx);
   FatalIf(
         preLoc->ny < postLoc->ny,
         "Pooling is not defined for one-to-many connections (pre->ny=%d, post->ny=%d)\n",
         preLoc->ny, postLoc->ny);

   mPreLoc      = preLoc;
   mPostLoc     = postLoc;
   mPoolingMode = poolingMode;
   mMultiplier  = (float)multiplier;
   int strideX = calcStride(preLoc->nx, postLoc->nx);
   int strideY = calcStride(preLoc->ny, postLoc->ny);
   cudnnStatus_t status;
   status = cudnnCreatePoolingDescriptor(&mPoolingDescriptor);
   cudnnHandleError(status, "Create pooling descriptor");
   status = cudnnSetPooling2dDescriptor(
         mPoolingDescriptor,
         poolingMode,
#if CUDNN_MAJOR >= 5
         CUDNN_NOT_PROPAGATE_NAN, // NaN-propagation argument exists only in cuDNN v5+
#endif
         nypPost, nxpPost, 0 /*vertical pad*/, 0 /*horizontal pad*/, strideY, strideX);
   cudnnHandleError(status, "Set pooling descriptor");
   const PVHalo *preHalo = &preLoc->halo;
   mBorderExcessX        = calcBorderExcess(preLoc->nx, postLoc->nx, preHalo->lt, nxpPost);
   mBorderExcessY        = calcBorderExcess(preLoc->ny, postLoc->ny, preHalo->up, nypPost);
   status                = cudnnCreateTensorDescriptor(&mDataStoreDescriptor);
   cudnnHandleError(status, "Create input tensor descriptor");
   status = cudnnSetTensor4dDescriptor(
         mDataStoreDescriptor,
         CUDNN_TENSOR_NCHW,
         CUDNN_DATA_FLOAT,
         preLoc->nbatch,
         preLoc->nf,
         preLoc->ny + preHalo->up + preHalo->dn - 2 * mBorderExcessY,
         preLoc->nx + preHalo->lt + preHalo->rt - 2 * mBorderExcessX);
   mDataStore = (float *)dataStoreBuffer->getPointer();
   status = cudnnCreateTensorDescriptor(&mGSynDescriptor);
   cudnnHandleError(status, "Create output tensor descriptor");
   status = cudnnSetTensor4dDescriptor(
         mGSynDescriptor,
         CUDNN_TENSOR_NCHW,
         CUDNN_DATA_FLOAT,
         postLoc->nbatch,
         postLoc->nf,
         postLoc->ny,
         postLoc->nx);
   cudnnHandleError(status, "Set output tensor descriptor");
   std::string str(kernelName);
   mCudnnDataStore = device->createBuffer(dataStoreBuffer->getSize(), &str);

   int numGSynNeuronsAcrossBatch = postLoc->nx * postLoc->ny * postLoc->nf * postLoc->nbatch;
   float *gSynHead               = (float *)gSynBuffer->getPointer();
   mGSyn                         = &gSynHead[channel * numGSynNeuronsAcrossBatch];

   // Staging buffer sized in bytes for one GSyn channel.
   mCudnnGSyn = device->createBuffer(numGSynNeuronsAcrossBatch * sizeof(float), &str);
}
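// calcBorderExcess reports how much of the pre-layer halo on one side exceeds
// what the pooling window actually needs, so the input tensor descriptor can
// shrink the extended region. Worked example (hypothetical values): with
// preRestricted = 64 and postRestricted = 32 the pre-to-post scale is 2; a
// post-perspective patch of 4 needs (4 - 2) / 2 = 1 ring of border, so a halo
// of border = 2 leaves an excess of 2 - 1 = 1 on that side.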
int CudaPoolingDeliverKernel::calcBorderExcess(
      int preRestricted,
      int postRestricted,
      int border,
      int patchSizePostPerspective) {
   int preToPostScale = preRestricted / postRestricted;
   int borderNeeded   = (patchSizePostPerspective - preToPostScale) / 2;
   return border - borderNeeded;
}
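// Because setArgs rejects one-to-many connections, pooling here is strictly
// many(pre)-to-one(post): the many-scale is always 1, and the stride is the
// integer pre-to-post density ratio (e.g. pre nx = 64 onto post nx = 32 gives
// a stride of 2).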
int CudaPoolingDeliverKernel::calcManyScale(int preRestricted, int postRestricted) { return 1; }
int CudaPoolingDeliverKernel::calcStride(int preRestricted, int postRestricted) {
   return preRestricted / postRestricted;
}
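// do_run stages the pooling delivery through cuDNN: the PV-ordered DataStore
// and GSyn buffers are first permuted into cuDNN's NCHW layout, then
// cudnnPoolingForward applies the pooling window (scaled by mMultiplier and
// accumulated into the staged GSyn), and finally the result is permuted back
// into PV ordering.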
int CudaPoolingDeliverKernel::do_run() {
   float scalingFactor = 1.0f;

   int const blockSize = device->get_max_threads();
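   // Permute the PV-organized (extended, batch-major) DataStore into cuDNN's
   // NCHW ordering, trimming the halo excess computed in setArgs.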
   PVHalo const *halo = &mPreLoc->halo;
   int const nxPreExt = mPreLoc->nx + halo->lt + halo->rt;
   int const nyPreExt = mPreLoc->ny + halo->dn + halo->up;
   int const nf       = mPreLoc->nf;
   int const nbatch   = mPreLoc->nbatch;

   int numNeurons = nbatch * nyPreExt * nxPreExt * nf;
   int const gridSizePre        = (int)std::ceil((float)numNeurons / (float)blockSize);
   float *cudnnDataStorePointer = (float *)mCudnnDataStore->getPointer();
   callPermuteDatastorePVToCudnnKernel(
         gridSizePre,
         blockSize,
         mDataStore,
         cudnnDataStorePointer,
         nbatch,
         nyPreExt,
         nxPreExt,
         nf,
         mBorderExcessX,
         mBorderExcessY);
   handleCallError("Permute DataStore PV to CUDNN");
   int const nxPost = mPostLoc->nx;
   int const nyPost = mPostLoc->ny;
   int const nfPost = mPostLoc->nf;
   pvAssert(mPostLoc->nbatch == mPreLoc->nbatch);
   numNeurons              = nbatch * nxPost * nyPost * nfPost;
   float *cudnnGSynPointer = (float *)mCudnnGSyn->getPointer();

   int const gridSizePost = (int)std::ceil((float)numNeurons / (float)blockSize);
   callPermuteGSynPVToCudnnKernel(
         gridSizePost, blockSize, mGSyn, cudnnGSynPointer, nbatch, nyPost, nxPost, nfPost, 1, 1);
   handleCallError("Permute GSyn PV to CUDNN");
   cudnnPoolingMode_t checkMode;
   int h, w, vPad, hPad, vStride, hStride;
#if CUDNN_MAJOR >= 5
   cudnnNanPropagation_t cudnnNanPropagation;
   cudnnGetPooling2dDescriptor(
         (cudnnPoolingDescriptor_t)mPoolingDescriptor,
         &checkMode,
         &cudnnNanPropagation,
         &h, &w, &vPad, &hPad, &vStride, &hStride);
#elif CUDNN_MAJOR == 4
   cudnnGetPooling2dDescriptor(
         (cudnnPoolingDescriptor_t)mPoolingDescriptor,
         &checkMode,
         &h, &w, &vPad, &hPad, &vStride, &hStride);
#else
#error The cuDNN version is required to be either v4 or v5.
#endif

   cudnnStatus_t status = cudnnPoolingForward(
         (cudnnHandle_t)device->getCudnnHandle(),
         mPoolingDescriptor,
         &mMultiplier /* alpha: scales each pooled value */,
         mDataStoreDescriptor,
         cudnnDataStorePointer,
         &scalingFactor /* beta = 1: accumulate into existing GSyn */,
         mGSynDescriptor,
         cudnnGSynPointer);
   cudnnHandleError(status, "Forward pooling run");
   // Permute the accumulated result back into PV ordering.
   callPermuteGSynCudnnToPVKernel(
         gridSizePost, blockSize, mGSyn, cudnnGSynPointer, nbatch, nyPost, nxPost, nfPost, 1, 1);
   handleCallError("Permute GSyn CUDNN back to PV");
   return 0;
}

} // end namespace PVCuda