#include "PresynapticPerspectiveGPUDelivery.hpp"
#include "columns/HyPerCol.hpp"

namespace PV {

PresynapticPerspectiveGPUDelivery::PresynapticPerspectiveGPUDelivery(
      char const *name,
      HyPerCol *hc) {
   initialize(name, hc);
}

PresynapticPerspectiveGPUDelivery::PresynapticPerspectiveGPUDelivery() {}

PresynapticPerspectiveGPUDelivery::~PresynapticPerspectiveGPUDelivery() {}

int PresynapticPerspectiveGPUDelivery::initialize(char const *name, HyPerCol *hc) {
   return BaseObject::initialize(name, hc);
}
void PresynapticPerspectiveGPUDelivery::setObjectType() {
   mObjectType = "PresynapticPerspectiveGPUDelivery";
}
Response::Status PresynapticPerspectiveGPUDelivery::communicateInitInfo(
      std::shared_ptr<CommunicateInitInfoMessage const> message) {
   auto status = HyPerDelivery::communicateInitInfo(message);
   if (!Response::completed(status)) {
      return status;
   }
   pvAssert(mWeightsPair && mWeightsPair->getInitInfoCommunicatedFlag());

   // Request device-side allocation of the buffers the receive kernel needs.
   getPreLayer()->setAllocDeviceDatastore();
   mWeightsPair->getPreWeights()->useGPU();
   getPostLayer()->setAllocDeviceGSyn();

   // Active indices are only needed for sparse presynaptic activity.
   if (!mUpdateGSynFromPostPerspective && getPreLayer()->getSparseFlag()) {
      getPreLayer()->setAllocDeviceActiveIndices();
   }

   return status;
}
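
// setCudaDevice() records which CUDA device this delivery object uses and forwards the
// handle to the presynaptic weights, so that the weight data is mirrored on the same device.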
Response::Status PresynapticPerspectiveGPUDelivery::setCudaDevice(
      std::shared_ptr<SetCudaDeviceMessage const> message) {
   pvAssert(mUsingGPUFlag);
   auto status = HyPerDelivery::setCudaDevice(message);
   if (status != Response::SUCCESS) {
      return status;
   }
   mWeightsPair->getPreWeights()->setCudaDevice(message->mCudaDevice);
   mCudaDevice = message->mCudaDevice;
   return status;
}
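
// allocateDataStructures() enforces the GPU setup ordering: it is an error to reach this
// stage without a CUDA device, and the method postpones until the WeightsPair has allocated
// its own data structures (so the device-side weight buffers exist). Only then are the
// kernel arguments and the per-thread CPU buffers set up.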
Response::Status PresynapticPerspectiveGPUDelivery::allocateDataStructures() {
   FatalIf(
         mCudaDevice == nullptr,
         "%s received AllocateData without having received SetCudaDevice.\n",
         getDescription_c());
   if (!mWeightsPair->getDataStructuresAllocatedFlag()) {
      return Response::POSTPONE;
   }

   auto status = HyPerDelivery::allocateDataStructures();
   if (!Response::completed(status)) {
      return status;
   }

   initializeRecvKernelArgs();
   allocateThreadGSyn();
   return Response::SUCCESS;
}
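
// initializeRecvKernelArgs() gathers the layer geometry, the strides, and the device-side
// buffer handles, and hands them to the CUDA receive kernel object (mRecvKernel). The patch
// list and the GSynPatchStart table are uploaded here rather than by the Weights object,
// presumably because only presynaptic-perspective delivery needs them on the device.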
void PresynapticPerspectiveGPUDelivery::initializeRecvKernelArgs() {
   PVCuda::CudaDevice *device = parent->getDevice();
   Weights *preWeights        = mWeightsPair->getPreWeights();
   mRecvKernel                = new PVCuda::CudaRecvPre(device);

   const PVLayerLoc *preLoc  = getPreLayer()->getLayerLoc();
   const PVLayerLoc *postLoc = getPostLayer()->getLayerLoc();
   const PVHalo *preHalo     = &getPreLayer()->getLayerLoc()->halo;
   const PVHalo *postHalo    = &getPostLayer()->getLayerLoc()->halo;

   PVCuda::CudaBuffer *d_PreData           = getPreLayer()->getDeviceDatastore();
   PVCuda::CudaBuffer *d_PostGSyn          = getPostLayer()->getDeviceGSyn();
   PVCuda::CudaBuffer *d_PatchToDataLookup = preWeights->getDevicePatchToDataLookup();
   PVCuda::CudaBuffer *d_WData             = preWeights->getDeviceData();

   pvAssert(d_PreData);
   pvAssert(d_PostGSyn);
   pvAssert(d_PatchToDataLookup);
   pvAssert(d_WData);
   auto preGeometry             = preWeights->getGeometry();
   std::size_t const numPatches = (std::size_t)preGeometry->getNumPatches();
   std::size_t size;

   Patch const *hostPatches = &preGeometry->getPatch(0); // Patches are stored contiguously.
   size                     = (std::size_t)numPatches * sizeof(*hostPatches);
   mDevicePatches           = mCudaDevice->createBuffer(size, &description);
   pvAssert(mDevicePatches);
   mDevicePatches->copyToDevice(hostPatches);
   auto const *hostGSynPatchStart = preGeometry->getGSynPatchStart().data();
   size                           = (std::size_t)numPatches * sizeof(*hostGSynPatchStart);
   mDeviceGSynPatchStart          = mCudaDevice->createBuffer(size, &description);
   pvAssert(mDeviceGSynPatchStart);
   mDeviceGSynPatchStart->copyToDevice(hostGSynPatchStart);
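
   // The patch list and the GSynPatchStart table describe fixed geometry, so they are
   // (presumably) uploaded once here and never refreshed from deliver().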
   int nxp = preWeights->getPatchSizeX();
   int nyp = preWeights->getPatchSizeY();
   int nfp = preWeights->getPatchSizeF();

   int sy  = postLoc->nx * postLoc->nf; // stride, in floats, between rows of the restricted post buffer
   int syw = preWeights->getGeometry()->getPatchStrideY(); // stride between rows of a weight patch

   bool isSparse = getPreLayer()->getSparseFlag();

   int numPreExt  = getPreLayer()->getNumExtended();
   int numPostRes = getPostLayer()->getNumNeurons();

   int nbatch = postLoc->nbatch;

   PVCuda::CudaBuffer *d_activeIndices = NULL;
   PVCuda::CudaBuffer *d_numActive     = NULL;
   if (isSparse) {
      d_numActive = getPreLayer()->getDeviceNumActive();
      pvAssert(d_numActive);
      d_activeIndices = getPreLayer()->getDeviceActiveIndices();
      pvAssert(d_activeIndices);
   }
   // Hand the layer geometry, strides, and device buffer handles to the receive kernel.
   mRecvKernel->setArgs(
         nbatch,
         numPreExt,
         numPostRes,
         nxp,
         nyp,
         nfp,

         sy,
         syw,
         mDeltaTimeFactor,
         preWeights->getSharedFlag(),
         mDevicePatches,
         mDeviceGSynPatchStart,

         d_PreData,
         preWeights->getDeviceData(),
         d_PostGSyn,
         d_PatchToDataLookup,

         isSparse,
         d_numActive,
         d_activeIndices);
}
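
// allocateThreadGSyn() gives each OpenMP thread its own GSyn-sized accumulation buffer.
// In the CPU path (deliverUnitInput), patches from different presynaptic neurons overlap
// on the same postsynaptic neurons, so letting each thread accumulate into a private buffer
// and summing the buffers afterwards avoids write collisions.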
void PresynapticPerspectiveGPUDelivery::allocateThreadGSyn() {
   // If multithreaded, allocate a GSyn buffer for each thread, to avoid collisions.
   int const numThreads = parent->getNumThreads();
   if (numThreads > 1) {
      mThreadGSyn.resize(numThreads);
      // Each thread buffer covers one batch element; batch elements are processed serially.
      for (auto &th : mThreadGSyn) {
         th.resize(mPostLayer->getNumNeurons());
      }
   }
}
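
// deliver() is the GPU path. It refreshes the device copy of the presynaptic activity (and,
// for sparse layers, the active-index lists) only when the layer flags them as updated, then
// launches the receive kernel once per arbor. The accumulated GSyn stays on the device.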
void PresynapticPerspectiveGPUDelivery::deliver() {
   // Check if we need to update based on the connection's channel.
   if (getChannelCode() == CHANNEL_NOUPDATE) {
      return;
   }
   float *postChannel = mPostLayer->getChannel(getChannelCode());
   pvAssert(postChannel);

   pvAssert(mRecvKernel);
   PVLayerLoc const *preLoc  = mPreLayer->getLayerLoc();
   PVLayerLoc const *postLoc = mPostLayer->getLayerLoc();
   Weights *weights          = mWeightsPair->getPreWeights();
   int const nxPreExtended  = preLoc->nx + preLoc->halo.lt + preLoc->halo.rt;
   int const nyPreExtended  = preLoc->ny + preLoc->halo.dn + preLoc->halo.up;
   int const numPreExtended = nxPreExtended * nyPreExtended * preLoc->nf;

   int const numPostRestricted = postLoc->nx * postLoc->ny * postLoc->nf;

   int nbatch = preLoc->nbatch;
   pvAssert(nbatch == postLoc->nbatch);

   const int sy  = postLoc->nx * postLoc->nf; // stride, in floats, between rows of the restricted post buffer
   const int syw = weights->getGeometry()->getPatchStrideY(); // stride between rows of a weight patch

   bool const preLayerIsSparse = mPreLayer->getSparseFlag();

   int numAxonalArbors = mArborList->getNumAxonalArbors();
   for (int arbor = 0; arbor < numAxonalArbors; arbor++) {
      int delay                = mArborList->getDelay(arbor);
      PVLayerCube activityCube = mPreLayer->getPublisher()->createCube(delay);

      mRecvKernel->set_dt_factor(mDeltaTimeFactor);

      const PVLayerLoc *preLoc  = getPreLayer()->getLayerLoc();
      const PVLayerLoc *postLoc = getPostLayer()->getLayerLoc();
      // Update the device copies only when the host data has changed.
      if (getPreLayer()->getUpdatedDeviceDatastoreFlag()) {
         float *h_preDatastore              = activityCube.data;
         PVCuda::CudaBuffer *d_preDatastore = getPreLayer()->getDeviceDatastore();
         pvAssert(d_preDatastore);
         d_preDatastore->copyToDevice(h_preDatastore);
         // For sparse layers, also refresh the active-index lists and their counts.
         if (activityCube.isSparse) {
            PVCuda::CudaBuffer *d_ActiveIndices;
            PVCuda::CudaBuffer *d_numActive;
            d_ActiveIndices = getPreLayer()->getDeviceActiveIndices();
            d_numActive     = getPreLayer()->getDeviceNumActive();
            pvAssert(d_ActiveIndices);
            auto const *h_ActiveIndices = activityCube.activeIndices;
            long const *h_numActive     = activityCube.numActive;
            pvAssert(h_ActiveIndices);
            d_numActive->copyToDevice(h_numActive);
            d_ActiveIndices->copyToDevice(h_ActiveIndices);
         }
         // The device copy is now current.
         getPreLayer()->setUpdatedDeviceDatastoreFlag(false);
      }
      // Find the largest number of active presynaptic neurons over the batch.
      long totActiveNeuron[parent->getNBatch()];
      long maxTotalActiveNeuron = 0;
      for (int b = 0; b < parent->getNBatch(); b++) {
         if (activityCube.isSparse) {
            totActiveNeuron[b] = activityCube.numActive[b];
         }
         else {
            totActiveNeuron[b] = getPreLayer()->getNumExtended();
         }
         if (totActiveNeuron[b] > maxTotalActiveNeuron) {
            maxTotalActiveNeuron = totActiveNeuron[b];
         }
      }
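
      // Launch geometry: totThreads is one kernel thread per (presynaptic neuron, weight)
      // pair, and numLocalThreads (the threads-per-block value passed to run_nocheck) covers
      // a whole patch when it fits under the device limit. For illustration (hypothetical
      // numbers): a 5x5x8 patch gives totPatchSize = 200; with at most 10,000 active neurons
      // per batch element, totThreads = 2,000,000 and numLocalThreads = 200 on a device whose
      // maximum is 1024.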
      if (maxTotalActiveNeuron > 0) {
         long totPatchSize   = (long)weights->getPatchSizeOverall();
         long totThreads     = maxTotalActiveNeuron * totPatchSize;
         int maxThreads      = parent->getDevice()->get_max_threads();
         int numLocalThreads = totPatchSize < maxThreads ? totPatchSize : maxThreads;

         mRecvKernel->run_nocheck(totThreads, numLocalThreads);
      }
   }

   // GSyn is already up to date on the device.
   mPostLayer->setUpdatedDeviceGSynFlag(false);
}
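
// deliverUnitInput() computes the postsynaptic response to a nominal unit input from every
// presynaptic neuron (the activity factor in the inner loop is implicitly 1). This path runs
// on the CPU rather than using the GPU kernel that deliver() launches.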
void PresynapticPerspectiveGPUDelivery::deliverUnitInput(float *recvBuffer) {
   PVLayerLoc const *postLoc = mPostLayer->getLayerLoc();
   Weights *weights          = mWeightsPair->getPreWeights();

   int const numPostRestricted = postLoc->nx * postLoc->ny * postLoc->nf;

   int nbatch = postLoc->nbatch;

   const int sy  = postLoc->nx * postLoc->nf;
   const int syw = weights->getGeometry()->getPatchStrideY();
   int numAxonalArbors = mArborList->getNumAxonalArbors();
   for (int arbor = 0; arbor < numAxonalArbors; arbor++) {
      for (int b = 0; b < nbatch; b++) {
         float *recvBatch = recvBuffer + b * numPostRestricted;

         int numNeurons = mPreLayer->getNumExtended();
#ifdef PV_USE_OPENMP_THREADS
         // Zero each thread's private GSyn buffer before accumulating into it.
         if (!mThreadGSyn.empty()) {
#pragma omp parallel for schedule(static)
            for (int ti = 0; ti < parent->getNumThreads(); ++ti) {
               for (int ni = 0; ni < numPostRestricted; ++ni) {
                  mThreadGSyn[ti][ni] = 0.0f;
               }
            }
         }
#endif // PV_USE_OPENMP_THREADS
         std::size_t const *gSynPatchStart = weights->getGeometry()->getGSynPatchStart().data();
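
         // Each presynaptic neuron's patch is delivered one row (y-line) at a time: stepping
         // y advances by sy floats in the post buffer and by syw floats in the weight patch,
         // and each row contributes nk = patch->nx * getPatchSizeF() contiguous values. For
         // illustration (hypothetical sizes): with postLoc->nx = 64 and nf = 3, sy = 192; a
         // 5x5 patch with 3 features gives nk = 15.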
         // Iterate over each line in the y axis; the goal is to keep weights in the cache.
         for (int y = 0; y < weights->getPatchSizeY(); y++) {
#ifdef PV_USE_OPENMP_THREADS
#pragma omp parallel for schedule(guided)
#endif
            for (int idx = 0; idx < numNeurons; idx++) {
               int kPreExt = idx;

               // Weight patch for this presynaptic neuron
               Patch const *patch = &weights->getPatch(kPreExt);

               // Skip this line if it falls outside the (possibly shrunken) patch.
               if (y >= patch->ny) {
                  continue;
               }
               float *recvPatchHead = recvBatch;

#ifdef PV_USE_OPENMP_THREADS
               if (!mThreadGSyn.empty()) {
                  recvPatchHead = mThreadGSyn[omp_get_thread_num()].data();
               }
#endif // PV_USE_OPENMP_THREADS

               float *postPatchStart = &recvPatchHead[gSynPatchStart[kPreExt]];
               int const nk                 = patch->nx * weights->getPatchSizeF();
               float const *weightDataHead  = weights->getDataFromPatchIndex(arbor, kPreExt);
               float const *weightDataStart = &weightDataHead[patch->offset];

               float *v                  = postPatchStart + y * sy;
               float const *weightValues = weightDataStart + y * syw;
               for (int k = 0; k < nk; k++) {
                  v[k] += mDeltaTimeFactor * weightValues[k];
               }
            }
         }
#ifdef PV_USE_OPENMP_THREADS
         // Sum the per-thread buffers back into the shared receive buffer.
         if (!mThreadGSyn.empty()) {
            float *recvPatchHead = recvBatch;
            int numNeurons       = mPostLayer->getNumNeurons();
            for (int ti = 0; ti < parent->getNumThreads(); ti++) {
               float *onethread = mThreadGSyn[ti].data();
// Each iteration writes a distinct ni, so looping over neurons is thread safe.
#pragma omp parallel for
               for (int ni = 0; ni < numNeurons; ni++) {
                  recvPatchHead[ni] += onethread[ni];
               }
            }
         }
#endif // PV_USE_OPENMP_THREADS
      }
   }
}

} // end namespace PV