PresynapticPerspectiveGPUDelivery.cpp
/*
 * PresynapticPerspectiveGPUDelivery.cpp
 *
 * Created on: Jan 10, 2018
 * Author: Pete Schultz
 */

#include "PresynapticPerspectiveGPUDelivery.hpp"
#include "columns/HyPerCol.hpp"

namespace PV {

PresynapticPerspectiveGPUDelivery::PresynapticPerspectiveGPUDelivery(
      char const *name,
      HyPerCol *hc) {
   initialize(name, hc);
}

PresynapticPerspectiveGPUDelivery::PresynapticPerspectiveGPUDelivery() {}

PresynapticPerspectiveGPUDelivery::~PresynapticPerspectiveGPUDelivery() {}

int PresynapticPerspectiveGPUDelivery::initialize(char const *name, HyPerCol *hc) {
   return BaseObject::initialize(name, hc);
}

void PresynapticPerspectiveGPUDelivery::setObjectType() {
   mObjectType = "PresynapticPerspectiveGPUDelivery";
}

int PresynapticPerspectiveGPUDelivery::ioParamsFillGroup(enum ParamsIOFlag ioFlag) {
   int status = HyPerDelivery::ioParamsFillGroup(ioFlag);
   return status;
}

void PresynapticPerspectiveGPUDelivery::ioParam_receiveGpu(enum ParamsIOFlag ioFlag) {
   mReceiveGpu = true; // If it's false, we should be using a different class.
}

Response::Status PresynapticPerspectiveGPUDelivery::communicateInitInfo(
      std::shared_ptr<CommunicateInitInfoMessage const> message) {
   auto status = HyPerDelivery::communicateInitInfo(message);
   if (!Response::completed(status)) {
      return status;
   }
   // HyPerDelivery::communicateInitInfo() postpones until mWeightsPair communicates.
   pvAssert(mWeightsPair and mWeightsPair->getInitInfoCommunicatedFlag());
   mWeightsPair->needPre();
   // Tell pre and post layers to allocate memory on the GPU, which they will do
   // during the AllocateDataStructures stage.

   // We need the pre datastore, weights, and post GSyn for the channelCode allocated on the GPU.
   getPreLayer()->setAllocDeviceDatastore();
   mWeightsPair->getPreWeights()->useGPU();
   getPostLayer()->setAllocDeviceGSyn();

   // If receiving from pre and the pre layer is sparse, allocate activeIndices.
   if (!mUpdateGSynFromPostPerspective && getPreLayer()->getSparseFlag()) {
      getPreLayer()->setAllocDeviceActiveIndices();
   }

   return status;
}

Response::Status PresynapticPerspectiveGPUDelivery::setCudaDevice(
      std::shared_ptr<SetCudaDeviceMessage const> message) {
   pvAssert(mUsingGPUFlag);
   auto status = HyPerDelivery::setCudaDevice(message);
   if (status != Response::SUCCESS) {
      return status;
   }
   mWeightsPair->getPreWeights()->setCudaDevice(message->mCudaDevice);
   mCudaDevice = message->mCudaDevice;
   return status;
}

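// Allocation must wait until the WeightsPair has allocated its data structures,
// since initializeRecvKernelArgs() needs the device-side weight buffers to
// exist; hence the POSTPONE below.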
Response::Status PresynapticPerspectiveGPUDelivery::allocateDataStructures() {
   FatalIf(
         mCudaDevice == nullptr,
         "%s received AllocateData without having received SetCudaDevice.\n",
         getDescription_c());
   if (!mWeightsPair->getDataStructuresAllocatedFlag()) {
      return Response::POSTPONE;
   }

   auto status = HyPerDelivery::allocateDataStructures();
   if (!Response::completed(status)) {
      return status;
   }

   initializeRecvKernelArgs();

   allocateThreadGSyn(); // Needed for deliverUnitInput, because it doesn't use GPU yet.
   return Response::SUCCESS;
}

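// Creates the CudaRecvPre kernel and binds its arguments: layer and patch
// geometry, strides, and the device-side buffers for pre activity, weights, and
// post GSyn. The patch and GSynPatchStart tables are copied to the device once,
// since they never change during a run.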
void PresynapticPerspectiveGPUDelivery::initializeRecvKernelArgs() {
   PVCuda::CudaDevice *device = parent->getDevice();
   Weights *preWeights = mWeightsPair->getPreWeights();
   mRecvKernel = new PVCuda::CudaRecvPre(device);

   const PVLayerLoc *preLoc = getPreLayer()->getLayerLoc();
   const PVLayerLoc *postLoc = getPostLayer()->getLayerLoc();
   const PVHalo *preHalo = &getPreLayer()->getLayerLoc()->halo;
   const PVHalo *postHalo = &getPostLayer()->getLayerLoc()->halo;

   PVCuda::CudaBuffer *d_PreData = getPreLayer()->getDeviceDatastore();
   PVCuda::CudaBuffer *d_PostGSyn = getPostLayer()->getDeviceGSyn();
   PVCuda::CudaBuffer *d_PatchToDataLookup = preWeights->getDevicePatchToDataLookup();
   PVCuda::CudaBuffer *d_WData = preWeights->getDeviceData();

   pvAssert(d_PreData);
   pvAssert(d_PostGSyn);
   pvAssert(d_PatchToDataLookup);
   pvAssert(d_WData);

   // We create mDevicePatches and mDeviceGSynPatchStart here, as opposed to creating them in
   // the Weights object, because they are only needed by presynaptic-perspective delivery.
   auto preGeometry = preWeights->getGeometry();
   std::size_t const numPatches = (std::size_t)preGeometry->getNumPatches();
   std::size_t size;

   Patch const *hostPatches = &preGeometry->getPatch(0); // Patches allocated as one vector
   size = (std::size_t)numPatches * sizeof(*hostPatches);
   mDevicePatches = mCudaDevice->createBuffer(size, &description);
   pvAssert(mDevicePatches);
   // Copy patch geometry information onto CUDA device because it never changes.
   mDevicePatches->copyToDevice(hostPatches);

   auto const *hostGSynPatchStart = preGeometry->getGSynPatchStart().data();
   size = (std::size_t)numPatches * sizeof(*hostGSynPatchStart);
   mDeviceGSynPatchStart = mCudaDevice->createBuffer(size, &description);
   // Copy GSynPatchStart array onto CUDA device because it never changes.
   pvAssert(mDeviceGSynPatchStart);
   mDeviceGSynPatchStart->copyToDevice(hostGSynPatchStart);

   int nxp = mWeightsPair->getPreWeights()->getPatchSizeX();
   int nyp = mWeightsPair->getPreWeights()->getPatchSizeY();
   int nfp = mWeightsPair->getPreWeights()->getPatchSizeF();

   int sy = postLoc->nx * postLoc->nf; // stride in restricted post layer
   int syw = preWeights->getPatchStrideY(); // stride in weight patch
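   // A row step in the post layer's restricted GSyn buffer advances by sy = nx * nf
   // values, while the corresponding row step within a weight patch advances by syw;
   // row y of a patch pairs GSyn offset y * sy with weight offset y * syw.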

   bool isSparse = getPreLayer()->getSparseFlag();

   int numPreExt = getPreLayer()->getNumExtended();
   int numPostRes = getPostLayer()->getNumNeurons();

   int nbatch = postLoc->nbatch;

   PVCuda::CudaBuffer *d_activeIndices = NULL;
   PVCuda::CudaBuffer *d_numActive = NULL;
   if (isSparse) {
      d_numActive = getPreLayer()->getDeviceNumActive();
      pvAssert(d_numActive);
      d_activeIndices = getPreLayer()->getDeviceActiveIndices();
      pvAssert(d_activeIndices);
   }
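
   // Bind the kernel arguments once here; per-timestep state (the dt factor and
   // the contents of the device buffers) is refreshed in deliver() before each launch.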
   mRecvKernel->setArgs(
         nbatch,
         numPreExt,
         numPostRes,
         nxp,
         nyp,
         nfp,

         sy,
         syw,
         mDeltaTimeFactor,
         preWeights->getSharedFlag(),
         mChannelCode,
         mDevicePatches,
         mDeviceGSynPatchStart,

         d_PreData,
         preWeights->getDeviceData(),
         d_PostGSyn,
         d_PatchToDataLookup,

         isSparse,
         d_numActive,
         d_activeIndices);
}

void PresynapticPerspectiveGPUDelivery::allocateThreadGSyn() {
   // If multithreaded, allocate a GSyn buffer for each thread, to avoid collisions.
   int const numThreads = parent->getNumThreads();
   if (numThreads > 1) {
      mThreadGSyn.resize(numThreads);
      // mThreadGSyn is only a buffer for one batch element. We're threading over presynaptic
      // neuron index, not batch element, so batch elements are processed serially.
      for (auto &th : mThreadGSyn) {
         th.resize(mPostLayer->getNumNeurons());
      }
   }
}
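
// Delivers presynaptic input on the GPU: for each arbor, host-side buffers that
// changed since the last call (the pre datastore, plus active indices when the
// pre layer is sparse) are copied to the device, then CudaRecvPre is launched.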
void PresynapticPerspectiveGPUDelivery::deliver() {
   // Check if we need to update based on the connection's channel
   if (getChannelCode() == CHANNEL_NOUPDATE) {
      return;
   }
   float *postChannel = mPostLayer->getChannel(getChannelCode());
   pvAssert(postChannel);

   pvAssert(mRecvKernel);

   PVLayerLoc const *preLoc = mPreLayer->getLayerLoc();
   PVLayerLoc const *postLoc = mPostLayer->getLayerLoc();
   Weights *weights = mWeightsPair->getPreWeights();

   int const nxPreExtended = preLoc->nx + preLoc->halo.lt + preLoc->halo.rt;
   int const nyPreExtended = preLoc->ny + preLoc->halo.dn + preLoc->halo.up;
   int const numPreExtended = nxPreExtended * nyPreExtended * preLoc->nf;

   int const numPostRestricted = postLoc->nx * postLoc->ny * postLoc->nf;

   int nbatch = preLoc->nbatch;
   pvAssert(nbatch == postLoc->nbatch);

   const int sy = postLoc->nx * postLoc->nf; // stride in restricted layer
   const int syw = weights->getGeometry()->getPatchStrideY(); // stride in patch

   bool const preLayerIsSparse = mPreLayer->getSparseFlag();

   int numAxonalArbors = mArborList->getNumAxonalArbors();
   for (int arbor = 0; arbor < numAxonalArbors; arbor++) {
      int delay = mArborList->getDelay(arbor);
      PVLayerCube activityCube = mPreLayer->getPublisher()->createCube(delay);

      mRecvKernel->set_dt_factor(mDeltaTimeFactor);

      // The connection receives on the GPU, so refresh the device buffers, copying
      // the pre datastore (and sparse indices) only if they have changed since the
      // last delivery.
      if (getPreLayer()->getUpdatedDeviceDatastoreFlag()) {
         float *h_preDatastore = activityCube.data;
         PVCuda::CudaBuffer *d_preDatastore = getPreLayer()->getDeviceDatastore();
         pvAssert(d_preDatastore);
         d_preDatastore->copyToDevice(h_preDatastore);

         // Copy active indices and num active if needed
         if (activityCube.isSparse) {
            PVCuda::CudaBuffer *d_ActiveIndices;
            PVCuda::CudaBuffer *d_numActive;
            d_ActiveIndices = getPreLayer()->getDeviceActiveIndices();
            d_numActive = getPreLayer()->getDeviceNumActive();
            pvAssert(d_ActiveIndices);
            SparseList<float>::Entry const *h_ActiveIndices =
                  (SparseList<float>::Entry *)activityCube.activeIndices;
            long const *h_numActive = activityCube.numActive;
            pvAssert(h_ActiveIndices);
            d_numActive->copyToDevice(h_numActive);
            d_ActiveIndices->copyToDevice(h_ActiveIndices);
         }
         // Device now has the updated datastore.
         getPreLayer()->setUpdatedDeviceDatastoreFlag(false);
      }

      // Kernel threads run over active neurons in the X direction and post patch
      // elements in the Y direction.
      std::vector<long> totActiveNeuron(parent->getNBatch());
      long maxTotalActiveNeuron = 0;
      for (int b = 0; b < parent->getNBatch(); b++) {
         if (activityCube.isSparse) {
            totActiveNeuron[b] = activityCube.numActive[b];
         }
         else {
            totActiveNeuron[b] = getPreLayer()->getNumExtended();
         }
         if (totActiveNeuron[b] > maxTotalActiveNeuron) {
            maxTotalActiveNeuron = totActiveNeuron[b];
         }
      }

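      // Launch one work item per (active neuron, patch element) pair; the block
      // size is the overall patch size, capped at the device's maximum threads
      // per block.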
      if (maxTotalActiveNeuron > 0) {
         long totPatchSize = (long)weights->getPatchSizeOverall();
         long totThreads = maxTotalActiveNeuron * totPatchSize;
         int maxThreads = parent->getDevice()->get_max_threads();
         int numLocalThreads = totPatchSize < maxThreads ? totPatchSize : maxThreads;

         mRecvKernel->run_nocheck(totThreads, numLocalThreads);
      }
   }
   // GSyn already living on GPU
   mPostLayer->setUpdatedDeviceGSynFlag(false);
}

// This is a copy of PresynapticPerspectiveConvolveDelivery.
// The spirit of this class says we should put this method on the GPU,
// but the priority for doing so is rather low.
void PresynapticPerspectiveGPUDelivery::deliverUnitInput(float *recvBuffer) {
   PVLayerLoc const *postLoc = mPostLayer->getLayerLoc();
   Weights *weights = mWeightsPair->getPreWeights();

   int const numPostRestricted = postLoc->nx * postLoc->ny * postLoc->nf;

   int nbatch = postLoc->nbatch;

   const int sy = postLoc->nx * postLoc->nf; // stride in restricted layer
   const int syw = weights->getGeometry()->getPatchStrideY(); // stride in patch

   int numAxonalArbors = mArborList->getNumAxonalArbors();
   for (int arbor = 0; arbor < numAxonalArbors; arbor++) {
      for (int b = 0; b < nbatch; b++) {
         float *recvBatch = recvBuffer + b * numPostRestricted;
         SparseList<float>::Entry const *activeIndicesBatch = NULL;

         int numNeurons = mPreLayer->getNumExtended();

#ifdef PV_USE_OPENMP_THREADS
         // Clear all thread gsyn buffers.
         if (!mThreadGSyn.empty()) {
#pragma omp parallel for schedule(static)
            for (int ti = 0; ti < parent->getNumThreads(); ++ti) {
               for (int ni = 0; ni < numPostRestricted; ++ni) {
                  mThreadGSyn[ti][ni] = 0.0;
               }
            }
         }
#endif

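         // Each OpenMP thread accumulates into its own mThreadGSyn buffer, so the
         // parallel loop over presynaptic neurons below cannot race on recvBatch;
         // the per-thread buffers are summed back into recvBatch after the loop.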
         std::size_t const *gSynPatchStart = weights->getGeometry()->getGSynPatchStart().data();
         for (int y = 0; y < weights->getPatchSizeY(); y++) {
#ifdef PV_USE_OPENMP_THREADS
#pragma omp parallel for schedule(guided)
#endif
            for (int idx = 0; idx < numNeurons; idx++) {
               int kPreExt = idx;

               // Weight
               Patch const *patch = &weights->getPatch(kPreExt);

               if (y >= patch->ny) {
                  continue;
               }

               // gSyn
               float *recvPatchHead = recvBatch;

#ifdef PV_USE_OPENMP_THREADS
               if (!mThreadGSyn.empty()) {
                  recvPatchHead = mThreadGSyn[omp_get_thread_num()].data();
               }
#endif // PV_USE_OPENMP_THREADS

               float *postPatchStart = &recvPatchHead[gSynPatchStart[kPreExt]];

               const int nk = patch->nx * weights->getPatchSizeF();
               float const *weightDataHead = weights->getDataFromPatchIndex(arbor, kPreExt);
               float const *weightDataStart = &weightDataHead[patch->offset];

               float *v = postPatchStart + y * sy;
               float const *weightValues = weightDataStart + y * syw;
               for (int k = 0; k < nk; k++) {
                  v[k] += mDeltaTimeFactor * weightValues[k];
               }
            }
         }
#ifdef PV_USE_OPENMP_THREADS
         // Accumulate back into gSyn. Should this be done in HyPerLayer where it can be done
         // once, as opposed to once per connection?
         if (!mThreadGSyn.empty()) {
            float *recvPatchHead = recvBatch;
            int numNeurons = mPostLayer->getNumNeurons();
            for (int ti = 0; ti < parent->getNumThreads(); ti++) {
               float *onethread = mThreadGSyn[ti].data();
               // Looping over neurons is thread safe
#pragma omp parallel for
               for (int ni = 0; ni < numNeurons; ni++) {
                  recvPatchHead[ni] += onethread[ni];
               }
            }
         }
#endif // PV_USE_OPENMP_THREADS
      }
   }
}

} // end namespace PV