8 #include "PostsynapticPerspectiveGPUDelivery.hpp" 9 #include "columns/HyPerCol.hpp" 13 PostsynapticPerspectiveGPUDelivery::PostsynapticPerspectiveGPUDelivery(
PostsynapticPerspectiveGPUDelivery::PostsynapticPerspectiveGPUDelivery() {}
PostsynapticPerspectiveGPUDelivery::~PostsynapticPerspectiveGPUDelivery() {
   delete mDevicePostToPreActivity;
}
int PostsynapticPerspectiveGPUDelivery::initialize(char const *name, HyPerCol *hc) {
   return BaseObject::initialize(name, hc);
}
void PostsynapticPerspectiveGPUDelivery::setObjectType() {
   mObjectType = "PostsynapticPerspectiveGPUDelivery";
}
Response::Status PostsynapticPerspectiveGPUDelivery::communicateInitInfo(
      std::shared_ptr<CommunicateInitInfoMessage const> message) {
   auto status = HyPerDelivery::communicateInitInfo(message);
   if (!Response::completed(status)) {
      return status;
   }

   // The pre datastore, the post weights, and the post GSyn all need device copies.
   getPreLayer()->setAllocDeviceDatastore();
   mWeightsPair->getPostWeights()->useGPU();
   getPostLayer()->setAllocDeviceGSyn();

   // A sparse pre layer also needs its active-indices buffer on the device.
   if (!mUpdateGSynFromPostPerspective && getPreLayer()->getSparseFlag()) {
      getPreLayer()->setAllocDeviceActiveIndices();
   }
   return Response::SUCCESS;
}
Response::Status PostsynapticPerspectiveGPUDelivery::setCudaDevice(
      std::shared_ptr<SetCudaDeviceMessage const> message) {
   pvAssert(mUsingGPUFlag);
   auto status = HyPerDelivery::setCudaDevice(message);
   if (status != Response::SUCCESS) {
      return status;
   }
   mWeightsPair->getPostWeights()->setCudaDevice(message->mCudaDevice);
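   // Register this connection's convolution kernel with the CUDA device; the device
   // presumably uses this count to budget cuDNN workspace memory across kernels.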
   parent->getDevice()->incrementConvKernels();
   return status;
}
Response::Status PostsynapticPerspectiveGPUDelivery::allocateDataStructures() {
   FatalIf(
         mCudaDevice == nullptr,
         "%s received AllocateData without having received SetCudaDevice.\n",
         getDescription_c());
   if (!mWeightsPair->getDataStructuresAllocatedFlag()) {
      return Response::POSTPONE;
   }

   auto status = HyPerDelivery::allocateDataStructures();
   if (!Response::completed(status)) {
      return status;
   }

   initializeRecvKernelArgs();

   return Response::SUCCESS;
}
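// initializeRecvKernelArgs gathers the layer geometry and device buffers that the CUDA
// receive kernel needs, and builds the post-to-pre activity lookup table on the device.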
void PostsynapticPerspectiveGPUDelivery::initializeRecvKernelArgs() {
   PVCuda::CudaDevice *device = parent->getDevice();
   Weights *postWeights       = mWeightsPair->getPostWeights();
   // ...

   PVLayerLoc const *preLoc  = getPreLayer()->getLayerLoc();
   PVLayerLoc const *postLoc = getPostLayer()->getLayerLoc();

   PVCuda::CudaBuffer *d_PreData           = getPreLayer()->getDeviceDatastore();
   PVCuda::CudaBuffer *d_PostGSyn          = getPostLayer()->getDeviceGSyn();
   PVCuda::CudaBuffer *d_PatchToDataLookup = postWeights->getDevicePatchToDataLookup();
   PVCuda::CudaBuffer *d_WData             = postWeights->getDeviceData();

#ifdef PV_USE_CUDNN
   PVCuda::CudaBuffer *cudnn_preData = getPreLayer()->getCudnnDatastore();
   PVCuda::CudaBuffer *cudnn_gSyn    = getPostLayer()->getCudnnGSyn();
   PVCuda::CudaBuffer *cudnn_WData   = postWeights->getCUDNNData();
   pvAssert(cudnn_preData);
   pvAssert(cudnn_gSyn);
   pvAssert(cudnn_WData);
#endif // PV_USE_CUDNN

   pvAssert(d_PreData);
   pvAssert(d_PostGSyn);
   pvAssert(d_PatchToDataLookup);
   pvAssert(d_WData);
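   // sy: stride, in floats, from one row of the pre layer's extended activity buffer to the next.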
   int sy = (preLoc->nx + preLoc->halo.rt + preLoc->halo.lt) * preLoc->nf;
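   // Halo widths of the postsynaptic layer, needed to convert restricted indices to extended ones.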
   int oNblt = postLoc->halo.lt;
   int oNbrt = postLoc->halo.rt;
   int oNbup = postLoc->halo.up;
   int oNbdn = postLoc->halo.dn;

   int postNx = postLoc->nx;
   int postNy = postLoc->ny;
   int postNf = postLoc->nf;

   int preNx   = preLoc->nx;
   int preNy   = preLoc->ny;
   int preNf   = preLoc->nf;
   int preNblt = preLoc->halo.lt;
   int preNbrt = preLoc->halo.rt;
   int preNbup = preLoc->halo.up;
   int preNbdn = preLoc->halo.dn;

   int nbatch = preLoc->nbatch;
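   // Ratio of pre to post resolution in each dimension, passed to the receive kernel so it can
   // map each postsynaptic neuron onto the corresponding region of presynaptic activity.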
   float preToPostScaleX = (float)preLoc->nx / ((float)postLoc->nx);
   float preToPostScaleY = (float)preLoc->ny / ((float)postLoc->ny);

   // ...
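   // Build a host-side table mapping each restricted postsynaptic index to the index in the
   // pre layer's extended activity buffer where its (unshrunken) patch begins, then copy it
   // to the device as mDevicePostToPreActivity.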
   int const postNumRestricted = postNx * postNy * postNf;
   mDevicePostToPreActivity =
         parent->getDevice()->createBuffer(postNumRestricted * sizeof(long), &description);
   auto *h_PostToPreActivityVector = new vector<long>(postNumRestricted);
   auto *h_PostToPreActivity       = h_PostToPreActivityVector->data();
   for (int k = 0; k < postNumRestricted; k++) {
      int const kExtended = kIndexExtended(k, postNx, postNy, postNf, oNblt, oNbrt, oNbup, oNbdn);
      h_PostToPreActivity[k] = postGeometry->getUnshrunkenStart(kExtended);
   }
   mDevicePostToPreActivity->copyToDevice(h_PostToPreActivity);
   delete h_PostToPreActivityVector;
   h_PostToPreActivityVector = nullptr;
   h_PostToPreActivity       = nullptr;
   if (parent->columnId() == 0) {
      InfoLog() << "preToPostScale: (" << preToPostScaleX << "," << preToPostScaleY << ")\n";
   }
   mRecvKernel->setArgs(
         // ... (the geometry, stride, and scale values computed above, followed by the device buffers)
         mDevicePostToPreActivity,
         // ...
         d_PatchToDataLookup);
}
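// deliver() copies the presynaptic activity to the device when it has changed, permutes the
// buffers into cuDNN's layout, launches the receive kernel, and permutes the accumulated GSyn
// back into PetaVision's layout.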
void PostsynapticPerspectiveGPUDelivery::deliver() {
   // Nothing to deliver if the connection's channel is CHANNEL_NOUPDATE.
   if (getChannelCode() == CHANNEL_NOUPDATE) {
      return;
   }
   float *postChannel = mPostLayer->getChannel(getChannelCode());
   pvAssert(postChannel);

   pvAssert(mRecvKernel);

   PVLayerLoc const *preLoc  = mPreLayer->getLayerLoc();
   PVLayerLoc const *postLoc = mPostLayer->getLayerLoc();
   Weights *weights          = mWeightsPair->getPostWeights();

   int const nxPreExtended  = preLoc->nx + preLoc->halo.lt + preLoc->halo.rt;
   int const nyPreExtended  = preLoc->ny + preLoc->halo.dn + preLoc->halo.up;
   int const numPreExtended = nxPreExtended * nyPreExtended * preLoc->nf;

   int const numPostRestricted = postLoc->nx * postLoc->ny * postLoc->nf;

   int nbatch = preLoc->nbatch;
   pvAssert(nbatch == postLoc->nbatch);
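   // Strides: sy advances one row within the restricted postsynaptic layer; syw advances one
   // row within a weight patch.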
   const int sy  = postLoc->nx * postLoc->nf;
   const int syw = weights->getGeometry()->getPatchStrideY();

   bool const preLayerIsSparse = mPreLayer->getSparseFlag();

   int const numAxonalArbors = mArborList->getNumAxonalArbors();
   for (int arbor = 0; arbor < numAxonalArbors; arbor++) {
      int delay                = mArborList->getDelay(arbor);
      PVLayerCube activityCube = mPreLayer->getPublisher()->createCube(delay);

      mRecvKernel->set_dt_factor(mDeltaTimeFactor);

      const int postNx = postLoc->nx;
      const int postNy = postLoc->ny;
      const int postNf = postLoc->nf;

      bool updatePreAct = false;
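      // Copy the presynaptic activity to the device only if it has changed since the last copy.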
      if (mPreLayer->getUpdatedDeviceDatastoreFlag()) {
         float *h_preDatastore              = activityCube.data;
         PVCuda::CudaBuffer *d_preDatastore = mPreLayer->getDeviceDatastore();
         pvAssert(d_preDatastore);
         d_preDatastore->copyToDevice(h_preDatastore);
         mPreLayer->setUpdatedDeviceDatastoreFlag(false);
         updatePreAct = true;
      }

      // Put the datastore and GSyn buffers into cuDNN's layout before launching the kernel.
      mRecvKernel->permuteDatastorePVToCudnn();

      mRecvKernel->permuteGSynPVToCudnn(getChannelCode());

      // Launch the receive kernel over the restricted postsynaptic dimensions.
      int totX = postNx;
      int totY = postNy;
      int totF = postNf;
      mRecvKernel->run(totX, totY, totF, 1L, 1L, 1L);

      // Bring the accumulated GSyn back into PetaVision's layout.
      mRecvKernel->permuteGSynCudnnToPV(getChannelCode());
   }

   mPostLayer->setUpdatedDeviceGSynFlag(false);
}
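// deliverUnitInput computes, on the CPU, the response each postsynaptic neuron would receive
// if every presynaptic neuron had unit activity: for each neuron it sums the weights over its
// patch and adds mDeltaTimeFactor times that sum into recvBuffer.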
void PostsynapticPerspectiveGPUDelivery::deliverUnitInput(float *recvBuffer) {
   const int numPostRestricted = mPostLayer->getNumNeurons();

   const PVLayerLoc *targetLoc = mPostLayer->getLayerLoc();

   const int targetNx = targetLoc->nx;
   const int targetNy = targetLoc->ny;
   const int targetNf = targetLoc->nf;
   const int nbatch   = targetLoc->nbatch;

   const PVHalo *targetHalo = &targetLoc->halo;

   Weights *postWeights = mWeightsPair->getPostWeights();
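   // Interleave the restricted neuron indices across OpenMP threads: with at least four
   // features, use targetNf / 4 interleaved streams; otherwise visit neurons consecutively.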
   int neuronIndexStride = targetNf < 4 ? 1 : targetNf / 4;
   int const numAxonalArbors = mArborList->getNumAxonalArbors();
   for (int arbor = 0; arbor < numAxonalArbors; arbor++) {
      for (int b = 0; b < nbatch; b++) {
         float *recvBatch = recvBuffer + b * numPostRestricted;

         // ...
         for (int ky = 0; ky < yPatchSize; ky++) {
#ifdef PV_USE_OPENMP_THREADS
#pragma omp parallel for schedule(static)
#endif
            for (int feature = 0; feature < neuronIndexStride; feature++) {
               for (int idx = feature; idx < numPostRestricted; idx += neuronIndexStride) {
                  float *recvLocation = recvBatch + idx;

                  int kTargetExt = kIndexExtended(
                        idx, targetNx, targetNy, targetNf,
                        targetHalo->lt, targetHalo->rt, targetHalo->up, targetHalo->dn);
                  // ...
                  float *weightValues = weightBuf + ky * syp;

                  float dv = 0.0f;
                  for (int k = 0; k < numPerStride; ++k) {
                     dv += weightValues[k];
                  }
                  *recvLocation += mDeltaTimeFactor * dv;
               }
            }
         }
      }
   }
}