// PetaVision Alpha
// PostsynapticPerspectiveGPUDelivery.cpp
/*
 * PostsynapticPerspectiveGPUDelivery.cpp
 *
 * Created on: Aug 24, 2017
 * Author: Pete Schultz
 */

#include "PostsynapticPerspectiveGPUDelivery.hpp"
#include "columns/HyPerCol.hpp"

namespace PV {

13 PostsynapticPerspectiveGPUDelivery::PostsynapticPerspectiveGPUDelivery(
14  char const *name,
15  HyPerCol *hc) {
16  initialize(name, hc);
17 }
18 
19 PostsynapticPerspectiveGPUDelivery::PostsynapticPerspectiveGPUDelivery() {}
20 
21 PostsynapticPerspectiveGPUDelivery::~PostsynapticPerspectiveGPUDelivery() {
22  delete mRecvKernel;
23  delete mDevicePostToPreActivity;
24 }
25 
26 int PostsynapticPerspectiveGPUDelivery::initialize(char const *name, HyPerCol *hc) {
27  return BaseObject::initialize(name, hc);
28 }
29 
30 void PostsynapticPerspectiveGPUDelivery::setObjectType() {
31  mObjectType = "PostsynapticPerspectiveGPUDelivery";
32 }
33 
35  int status = HyPerDelivery::ioParamsFillGroup(ioFlag);
36  return status;
37 }
38 
40  mReceiveGpu = true; // If it's false, we should be using a different class.
41 }
42 
43 Response::Status PostsynapticPerspectiveGPUDelivery::communicateInitInfo(
44  std::shared_ptr<CommunicateInitInfoMessage const> message) {
45  auto status = HyPerDelivery::communicateInitInfo(message);
46  if (!Response::completed(status)) {
47  return status;
48  }
49  // HyPerDelivery::communicateInitInfo() postpones until mWeightsPair communicates.
50  pvAssert(mWeightsPair and mWeightsPair->getInitInfoCommunicatedFlag());
51  mWeightsPair->needPost();
52  // Tell pre and post layers to allocate memory on gpu, which they will do
53  // during the AllocateDataStructures stage.
54 
55  // we need pre datastore, weights, and post gsyn for the channelCode allocated on the GPU.
56  getPreLayer()->setAllocDeviceDatastore();
57  mWeightsPair->getPostWeights()->useGPU();
58  getPostLayer()->setAllocDeviceGSyn();
59 
60  // If recv from pre and pre layer is sparse, allocate activeIndices
61  if (!mUpdateGSynFromPostPerspective && getPreLayer()->getSparseFlag()) {
62  getPreLayer()->setAllocDeviceActiveIndices();
63  }
64  return Response::SUCCESS;
65 }
66 
67 Response::Status PostsynapticPerspectiveGPUDelivery::setCudaDevice(
68  std::shared_ptr<SetCudaDeviceMessage const> message) {
69  pvAssert(mUsingGPUFlag);
70  auto status = HyPerDelivery::setCudaDevice(message);
71  if (status != Response::SUCCESS) {
72  return status;
73  }
74  mWeightsPair->getPostWeights()->setCudaDevice(message->mCudaDevice);
75  // Increment number of postKernels for cuDNN workspace memory
76  parent->getDevice()->incrementConvKernels();
77  return status;
78 }
79 
80 Response::Status PostsynapticPerspectiveGPUDelivery::allocateDataStructures() {
81  FatalIf(
82  mCudaDevice == nullptr,
83  "%s received AllocateData without having received SetCudaDevice.\n",
84  getDescription_c());
85  if (!mWeightsPair->getDataStructuresAllocatedFlag()) {
86  return Response::POSTPONE;
87  }
88 
89  auto status = HyPerDelivery::allocateDataStructures();
90  if (!Response::completed(status)) {
91  return status;
92  }
93 
94  initializeRecvKernelArgs();
95 
96  return Response::SUCCESS;
97 }
98 
99 void PostsynapticPerspectiveGPUDelivery::initializeRecvKernelArgs() {
100  PVCuda::CudaDevice *device = parent->getDevice();
101  Weights *postWeights = mWeightsPair->getPostWeights();
102  mRecvKernel = new PVCuda::CudaRecvPost(device);
103 
104  PVLayerLoc const *preLoc = getPreLayer()->getLayerLoc();
105  PVLayerLoc const *postLoc = getPostLayer()->getLayerLoc();
106 
107  PVCuda::CudaBuffer *d_PreData = getPreLayer()->getDeviceDatastore();
108  PVCuda::CudaBuffer *d_PostGSyn = getPostLayer()->getDeviceGSyn();
109  PVCuda::CudaBuffer *d_PatchToDataLookup = postWeights->getDevicePatchToDataLookup();
110  PVCuda::CudaBuffer *d_WData = postWeights->getDeviceData();
111 
112 #ifdef PV_USE_CUDNN
113  PVCuda::CudaBuffer *cudnn_preData = getPreLayer()->getCudnnDatastore();
114  PVCuda::CudaBuffer *cudnn_gSyn = getPostLayer()->getCudnnGSyn();
115  PVCuda::CudaBuffer *cudnn_WData = postWeights->getCUDNNData();
116  pvAssert(cudnn_preData);
117  pvAssert(cudnn_gSyn);
118  pvAssert(cudnn_WData);
119 #endif
120 
121  pvAssert(d_PreData);
122  pvAssert(d_PostGSyn);
123  pvAssert(d_PatchToDataLookup);
124  pvAssert(d_WData);
125 
126  int sy = (preLoc->nx + preLoc->halo.rt + preLoc->halo.lt) * preLoc->nf;
127  int syp = postWeights->getPatchStrideY();
128  int numPerStride = postWeights->getPatchSizeX() * postWeights->getPatchSizeF();
129 
130  int oNblt = postLoc->halo.lt;
131  int oNbrt = postLoc->halo.rt;
132  int oNbup = postLoc->halo.up;
133  int oNbdn = postLoc->halo.dn;
134 
135  // nxp, nyp, and nfp should be orig conn's
136  int oNxp = postWeights->getPatchSizeX();
137  int oNyp = postWeights->getPatchSizeY();
138  int oNfp = postWeights->getPatchSizeF();
139  int postNx = postLoc->nx;
140  int postNy = postLoc->ny;
141  int postNf = postLoc->nf;
142 
143  int preNx = preLoc->nx;
144  int preNy = preLoc->ny;
145  int preNf = preLoc->nf;
146  int preNblt = preLoc->halo.lt;
147  int preNbrt = preLoc->halo.rt;
148  int preNbup = preLoc->halo.up;
149  int preNbdn = preLoc->halo.dn;
150 
151  int nbatch = preLoc->nbatch;
152 
153  // Set local sizes here
154  float preToPostScaleX = (float)preLoc->nx / ((float)postLoc->nx);
155  float preToPostScaleY = (float)preLoc->ny / ((float)postLoc->ny);
156 
157  // The CudaRecvPost kernel needs a buffer containing, for each postsynaptic GSyn neuron,
158  // the offset of the start of the receptive field into the presynaptic activity buffer.
159  // It expects a buffer of long ints, of length post->getNumNeurons().
160  //
161  // The relevant information is in the PatchGeometry's mUnshrunkenStart buffer, but this
162  // has length post->getNumExtended().
163  int const postNumRestricted = postNx * postNy * postNf;
164  mDevicePostToPreActivity =
165  parent->getDevice()->createBuffer(postNumRestricted * sizeof(long), &description);
166  auto *h_PostToPreActivityVector = new vector<long>(postNumRestricted);
167  auto *h_PostToPreActivity = h_PostToPreActivityVector->data();
168  auto postGeometry = postWeights->getGeometry();
169  for (int k = 0; k < postNumRestricted; k++) {
170  int const kExtended = kIndexExtended(k, postNx, postNy, postNf, oNblt, oNbrt, oNbup, oNbdn);
171  h_PostToPreActivity[k] = postGeometry->getUnshrunkenStart(kExtended);
172  }
173  mDevicePostToPreActivity->copyToDevice(h_PostToPreActivity);
174  delete h_PostToPreActivityVector;
175  h_PostToPreActivityVector = nullptr;
176  h_PostToPreActivity = nullptr;
177 
178  // See the size of buffer needed based on x and y
179  // oNxp is the patch size from the post point of view
180 
181  if (parent->columnId() == 0) {
182  InfoLog() << "preToPostScale: (" << preToPostScaleX << "," << preToPostScaleY << ")\n";
183  }
184 
185  mRecvKernel->setArgs(
186  nbatch,
187  postNx, // num post neurons
188  postNy,
189  postNf,
190 
191  oNblt, // Border of orig
192  oNbrt, // Border of orig
193  oNbdn, // Border of orig
194  oNbup, // Border of orig
195 
196  preNx,
197  preNy,
198  preNf,
199  preNblt,
200  preNbrt,
201  preNbup,
202  preNbdn,
203 
204  oNxp,
205  oNyp,
206  oNfp,
207 
208  preToPostScaleX,
209  preToPostScaleY,
210 
211  sy,
212  syp,
213  numPerStride,
214  mDeltaTimeFactor,
215  postWeights->getSharedFlag(),
216 
217  mDevicePostToPreActivity,
218  d_PreData,
219  d_WData,
220  d_PostGSyn,
221 #ifdef PV_USE_CUDNN
222  cudnn_preData,
223  cudnn_WData,
224  cudnn_gSyn,
225 #endif
226  d_PatchToDataLookup);
227 }
228 
230  // Check if we need to update based on connection's channel
231  if (getChannelCode() == CHANNEL_NOUPDATE) {
232  return;
233  }
234  float *postChannel = mPostLayer->getChannel(getChannelCode());
235  pvAssert(postChannel);
236 
237  pvAssert(mRecvKernel);
238 
239  PVLayerLoc const *preLoc = mPreLayer->getLayerLoc();
240  PVLayerLoc const *postLoc = mPostLayer->getLayerLoc();
241  Weights *weights = mWeightsPair->getPostWeights();
242 
243  int const nxPreExtended = preLoc->nx + preLoc->halo.rt + preLoc->halo.rt;
244  int const nyPreExtended = preLoc->ny + preLoc->halo.dn + preLoc->halo.up;
245  int const numPreExtended = nxPreExtended * nyPreExtended * preLoc->nf;
246 
247  int const numPostRestricted = postLoc->nx * postLoc->ny * postLoc->nf;
248 
249  int nbatch = preLoc->nbatch;
250  pvAssert(nbatch == postLoc->nbatch);
251 
252  const int sy = postLoc->nx * postLoc->nf; // stride in restricted layer
253  const int syw = weights->getGeometry()->getPatchStrideY(); // stride in patch
254 
255  bool const preLayerIsSparse = mPreLayer->getSparseFlag();
256 
257  int numAxonalArbors = mArborList->getNumAxonalArbors();
258  for (int arbor = 0; arbor < numAxonalArbors; arbor++) {
259  int delay = mArborList->getDelay(arbor);
260  PVLayerCube activityCube = mPreLayer->getPublisher()->createCube(delay);
261 
262  mRecvKernel->set_dt_factor(mDeltaTimeFactor);
263 
264  const int postNx = postLoc->nx;
265  const int postNy = postLoc->ny;
266  const int postNf = postLoc->nf;
267 
268  bool updatePreAct = false;
269  // Update pre activity, post gsyn, and conn weights
270  // Only if they're updated
271  if (mPreLayer->getUpdatedDeviceDatastoreFlag()) {
272  float *h_preDatastore = activityCube.data;
273  PVCuda::CudaBuffer *d_preDatastore = mPreLayer->getDeviceDatastore();
274  pvAssert(d_preDatastore);
275  d_preDatastore->copyToDevice(h_preDatastore);
276  // Device now has updated
277  mPreLayer->setUpdatedDeviceDatastoreFlag(false);
278  updatePreAct = true;
279  }
280 
281  // Permutation buffer is local to the kernel, NOT the layer
282  // Therefore, we must permute Datastore every time
283  mRecvKernel->permuteDatastorePVToCudnn();
284  //}
285 
286  // Permute GSyn
287  mRecvKernel->permuteGSynPVToCudnn(getChannelCode());
288 
289  int totF = postNf;
290  int totX = postNx;
291  int totY = postNy;
292  // Make sure local sizes are divisible by f, x, and y
293  mRecvKernel->run(totX, totY, totF, 1L, 1L, 1L);
294 
295 #ifdef PV_USE_CUDNN
296  mRecvKernel->permuteGSynCudnnToPV(getChannelCode());
297 #endif
298  }
299  // GSyn already living on GPU
300  mPostLayer->setUpdatedDeviceGSynFlag(false);
301 }
302 
303 // This is a copy of PostsynapticPerspectiveConvolveDelivery.
304 // The spirit of this class says we should put this method on the GPU,
305 // but the priority for doing so is rather low.
306 void PostsynapticPerspectiveGPUDelivery::deliverUnitInput(
307 
308  float *recvBuffer) {
309  // Get number of neurons restricted target
310  const int numPostRestricted = mPostLayer->getNumNeurons();
311 
312  const PVLayerLoc *targetLoc = mPostLayer->getLayerLoc();
313 
314  const int targetNx = targetLoc->nx;
315  const int targetNy = targetLoc->ny;
316  const int targetNf = targetLoc->nf;
317  const int nbatch = targetLoc->nbatch;
318 
319  const PVHalo *targetHalo = &targetLoc->halo;
320 
321  // Get source layer's patch y stride
322  Weights *postWeights = mWeightsPair->getPostWeights();
323  int syp = postWeights->getPatchStrideY();
324  int yPatchSize = postWeights->getPatchSizeY();
325  int numPerStride = postWeights->getPatchSizeX() * postWeights->getPatchSizeF();
326  int neuronIndexStride = targetNf < 4 ? 1 : targetNf / 4;
327 
328  int numAxonalArbors = mArborList->getNumAxonalArbors();
329  for (int arbor = 0; arbor < numAxonalArbors; arbor++) {
330  for (int b = 0; b < nbatch; b++) {
331  float *recvBatch = recvBuffer + b * numPostRestricted;
332 
333  // Iterate over each line in the y axis, the goal is to keep weights in the cache
334  for (int ky = 0; ky < yPatchSize; ky++) {
335 // Threading over feature was the important change that improved cache performance by
336 // 5-10x. dynamic scheduling also gave another performance increase over static.
337 #ifdef PV_USE_OPENMP_THREADS
338 #pragma omp parallel for schedule(static)
339 #endif
340  for (int feature = 0; feature < neuronIndexStride; feature++) {
341  for (int idx = feature; idx < numPostRestricted; idx += neuronIndexStride) {
342  float *recvLocation = recvBatch + idx;
343 
344  int kTargetExt = kIndexExtended(
345  idx,
346  targetNx,
347  targetNy,
348  targetNf,
349  targetHalo->lt,
350  targetHalo->rt,
351  targetHalo->dn,
352  targetHalo->up);
353  float *weightBuf = postWeights->getDataFromPatchIndex(arbor, kTargetExt);
354  float *weightValues = weightBuf + ky * syp;
355 
356  float dv = 0.0f;
357  for (int k = 0; k < numPerStride; ++k) {
358  dv += weightValues[k];
359  }
360  *recvLocation += mDeltaTimeFactor * dv;
361  }
362  }
363  }
364  }
365  }
366 }

} // end namespace PV

/* Doxygen cross-reference residue from the generated source listing (not part of the
 * original source file; preserved here as a comment):
 * virtual int ioParamsFillGroup(enum ParamsIOFlag ioFlag) override
 * virtual void ioParam_receiveGpu(enum ParamsIOFlag ioFlag) override
 * receiveGpu: PostsynapticPerspectiveGPUDelivery always sets receiveGpu to true. The
 *   receiveGpu=false case is handled by a different class.
 * bool getSharedFlag() const                          — Weights.hpp:142
 * int getPatchSizeX() const                           — Weights.hpp:219
 * PVLayerCube createCube(int delay=0)                 — Publisher.cpp:60
 * bool getDataStructuresAllocatedFlag() const         — BaseObject.hpp:102
 * static bool completed(Status &a)                    — Response.hpp:49
 * int getPatchSizeY() const                           — Weights.hpp:222
 * std::shared_ptr<PatchGeometry> getGeometry() const  — Weights.hpp:148
 * int getPatchStrideY() const                         — Weights.hpp:248
 * int getNumAxonalArbors() const                      — ArborList.hpp:52
 * float *getDataFromPatchIndex(int arbor, int patchIndex) — Weights.cpp:205
 * virtual int ioParamsFillGroup(enum ParamsIOFlag ioFlag) override
 * int getPatchSizeF() const                           — Weights.hpp:225
 * bool getInitInfoCommunicatedFlag() const            — BaseObject.hpp:95
 */