8 #include "Checkpointer.hpp" 10 #include "checkpointing/CheckpointingMessages.hpp" 18 #include <sys/types.h> 20 #define DEFAULT_OUTPUT_PATH "output" 24 Checkpointer::Checkpointer(
25 std::string
const &name,
26 MPIBlock
const *globalMPIBlock,
27 Arguments
const *arguments)
29 initMPIBlock(globalMPIBlock, arguments);
30 initBlockDirectoryName();
32 mOutputPath = arguments->getStringArgument(
"OutputPath");
33 mWarmStart = arguments->getBooleanArgument(
"Restart");
34 mCheckpointReadDirectory = arguments->getStringArgument(
"CheckpointReadDirectory");
35 if (!mCheckpointReadDirectory.empty()) {
36 extractCheckpointReadDirectory();
39 mTimeInfoCheckpointEntry = std::make_shared<CheckpointEntryData<Checkpointer::TimeInfo>>(
40 std::string(
"timeinfo"), mMPIBlock, &mTimeInfo, (size_t)1,
true );
42 mCheckpointTimer =
new Timer(mName.c_str(),
"column",
"checkpoint");
43 registerTimer(mCheckpointTimer);
46 Checkpointer::~Checkpointer() {
47 free(mCheckpointWriteDir);
48 free(mCheckpointWriteTriggerModeString);
49 free(mCheckpointWriteWallclockUnit);
50 free(mLastCheckpointDir);
51 free(mInitializeFromCheckpointDir);
52 delete mCheckpointTimer;
56 void Checkpointer::initMPIBlock(MPIBlock
const *globalMPIBlock, Arguments
const *arguments) {
57 pvAssert(mMPIBlock ==
nullptr);
58 int cellNumRows = arguments->getIntegerArgument(
"CheckpointCellNumRows");
59 int cellNumColumns = arguments->getIntegerArgument(
"CheckpointCellNumColumns");
60 int cellBatchDimension = arguments->getIntegerArgument(
"CheckpointCellBatchDimension");
62 mMPIBlock =
new MPIBlock(
63 globalMPIBlock->getComm(),
64 globalMPIBlock->getNumRows(),
65 globalMPIBlock->getNumColumns(),
66 globalMPIBlock->getBatchDimension(),
72 void Checkpointer::initBlockDirectoryName() {
73 mBlockDirectoryName.clear();
74 if (mMPIBlock->getGlobalNumRows() != mMPIBlock->getNumRows()
75 or mMPIBlock->getGlobalNumColumns() != mMPIBlock->getNumColumns()
76 or mMPIBlock->getGlobalBatchDimension() != mMPIBlock->getBatchDimension()) {
77 int const blockColumnIndex = mMPIBlock->getStartColumn() / mMPIBlock->getNumColumns();
78 int const blockRowIndex = mMPIBlock->getStartRow() / mMPIBlock->getNumRows();
79 int const blockBatchIndex = mMPIBlock->getStartBatch() / mMPIBlock->getBatchDimension();
80 mBlockDirectoryName.append(
"block_");
81 mBlockDirectoryName.append(
"col" + std::to_string(blockColumnIndex));
82 mBlockDirectoryName.append(
"row" + std::to_string(blockRowIndex));
83 mBlockDirectoryName.append(
"elem" + std::to_string(blockBatchIndex));
88 FatalIf(path[0] ==
'/',
"makeOutputPathFilename called with absolute path argument\n");
89 std::string fullPath(mOutputPath);
90 if (!mBlockDirectoryName.empty()) {
91 fullPath.append(
"/").append(mBlockDirectoryName);
93 fullPath.append(
"/").append(path);
97 void Checkpointer::ioParams(
enum ParamsIOFlag ioFlag,
PVParams *params) {
98 ioParamsFillGroup(ioFlag, params);
104 if (mWarmStart and ioFlag == PARAMS_IO_READ) {
106 pvAssert(mCheckpointReadDirectory.empty());
107 if (mCheckpointWriteFlag) {
109 findWarmStartDirectory();
112 mCheckpointReadDirectory = mLastCheckpointDir;
// Reads or writes (depending on ioFlag) every parameter of the Checkpointer's
// params group by calling one ioParam_ handler per parameter.
//
// The call order is significant. Several handlers assert that an earlier
// parameter has already been read before using it:
//   - checkpointWrite must come before checkpointWriteDir, the trigger-mode
//     and interval handlers, and numCheckpointsKept;
//   - checkpointWriteTriggerMode must come before the step/time/clock
//     interval handlers;
//   - deleteOlderCheckpoints must come before numCheckpointsKept.
// Do not reorder these calls without checking those assertions.
117 void Checkpointer::ioParamsFillGroup(
enum ParamsIOFlag ioFlag,
PVParams *params) {
118 ioParam_outputPath(ioFlag, params);
119 ioParam_verifyWrites(ioFlag, params);
// checkpointWrite is read first; the handlers below consult mCheckpointWriteFlag.
120 ioParam_checkpointWrite(ioFlag, params);
121 ioParam_checkpointWriteDir(ioFlag, params);
// The trigger mode selects which of the three interval parameters applies.
122 ioParam_checkpointWriteTriggerMode(ioFlag, params);
123 ioParam_checkpointWriteStepInterval(ioFlag, params);
124 ioParam_checkpointWriteTimeInterval(ioFlag, params);
125 ioParam_checkpointWriteClockInterval(ioFlag, params);
126 ioParam_checkpointWriteClockUnit(ioFlag, params);
127 ioParam_checkpointIndexWidth(ioFlag, params);
128 ioParam_suppressNonplasticCheckpoints(ioFlag, params);
// numCheckpointsKept is only processed when deleteOlderCheckpoints is set.
129 ioParam_deleteOlderCheckpoints(ioFlag, params);
130 ioParam_numCheckpointsKept(ioFlag, params);
131 ioParam_lastCheckpointDir(ioFlag, params);
132 ioParam_initializeFromCheckpointDir(ioFlag, params);
136 params->ioParamValue(ioFlag, mName.c_str(),
"verifyWrites", &mVerifyWrites, mVerifyWrites);
143 if (mOutputPath.empty()) {
144 if (params->stringPresent(mName.c_str(),
"outputPath")) {
145 mOutputPath = std::string(params->stringValue(mName.c_str(),
"outputPath"));
148 mOutputPath = std::string(DEFAULT_OUTPUT_PATH);
149 if (getMPIBlock()->getGlobalRank() == 0) {
151 "Output path specified neither in command line nor in params file.\n" 152 "Output path set to default \"%s\"\n",
153 DEFAULT_OUTPUT_PATH);
158 case PARAMS_IO_WRITE: params->writeParamString(
"outputPath", mOutputPath.c_str());
break;
164 params->ioParamValue(
165 ioFlag, mName.c_str(),
"checkpointWrite", &mCheckpointWriteFlag, mCheckpointWriteFlag);
169 pvAssert(!params->presentAndNotBeenRead(mName.c_str(),
"checkpointWrite"));
170 if (mCheckpointWriteFlag) {
171 params->ioParamStringRequired(
172 ioFlag, mName.c_str(),
"checkpointWriteDir", &mCheckpointWriteDir);
173 if (ioFlag == PARAMS_IO_READ) {
174 ensureDirExists(mMPIBlock, mCheckpointWriteDir);
180 pvAssert(!params->presentAndNotBeenRead(mName.c_str(),
"checkpointWrite"));
181 if (mCheckpointWriteFlag) {
182 params->ioParamString(
185 "checkpointWriteTriggerMode",
186 &mCheckpointWriteTriggerModeString,
188 if (ioFlag == PARAMS_IO_READ) {
189 pvAssert(mCheckpointWriteTriggerModeString);
190 if (!strcmp(mCheckpointWriteTriggerModeString,
"step")
191 || !strcmp(mCheckpointWriteTriggerModeString,
"Step")
192 || !strcmp(mCheckpointWriteTriggerModeString,
"STEP")) {
193 mCheckpointWriteTriggerMode = STEP;
194 registerCheckpointData(
196 std::string(
"nextCheckpointStep"),
197 &mNextCheckpointStep,
203 !strcmp(mCheckpointWriteTriggerModeString,
"time")
204 || !strcmp(mCheckpointWriteTriggerModeString,
"Time")
205 || !strcmp(mCheckpointWriteTriggerModeString,
"TIME")) {
206 mCheckpointWriteTriggerMode = SIMTIME;
207 registerCheckpointData(
209 std::string(
"nextCheckpointTime"),
210 &mNextCheckpointSimtime,
216 !strcmp(mCheckpointWriteTriggerModeString,
"clock")
217 || !strcmp(mCheckpointWriteTriggerModeString,
"Clock")
218 || !strcmp(mCheckpointWriteTriggerModeString,
"CLOCK")) {
219 mCheckpointWriteTriggerMode = WALLCLOCK;
222 if (mMPIBlock->getRank() == 0) {
223 ErrorLog() <<
"Parameter group \"" << mName <<
"\" checkpointWriteTriggerMode \"" 224 << mCheckpointWriteTriggerModeString <<
"\" is not recognized.\n";
226 MPI_Barrier(mMPIBlock->getComm());
234 pvAssert(!params->presentAndNotBeenRead(mName.c_str(),
"checkpointWrite"));
235 if (mCheckpointWriteFlag) {
236 pvAssert(!params->presentAndNotBeenRead(mName.c_str(),
"checkpointWriteTriggerMode"));
237 if (mCheckpointWriteTriggerMode == STEP) {
238 params->ioParamValue(
241 "checkpointWriteStepInterval",
242 &mCheckpointWriteStepInterval,
243 mCheckpointWriteStepInterval);
249 pvAssert(!params->presentAndNotBeenRead(mName.c_str(),
"checkpointWrite"));
250 if (mCheckpointWriteFlag) {
251 pvAssert(!params->presentAndNotBeenRead(mName.c_str(),
"checkpointWriteTriggerMode"));
252 if (mCheckpointWriteTriggerMode == SIMTIME) {
253 params->ioParamValue(
256 "checkpointWriteTimeInterval",
257 &mCheckpointWriteSimtimeInterval,
258 mCheckpointWriteSimtimeInterval);
264 enum ParamsIOFlag ioFlag,
266 assert(!params->presentAndNotBeenRead(mName.c_str(),
"checkpointWrite"));
267 if (mCheckpointWriteFlag) {
268 pvAssert(!params->presentAndNotBeenRead(mName.c_str(),
"checkpointWriteTriggerMode"));
269 if (mCheckpointWriteTriggerMode == WALLCLOCK) {
270 params->ioParamValueRequired(
273 "checkpointWriteClockInterval",
274 &mCheckpointWriteWallclockInterval);
280 pvAssert(!params->presentAndNotBeenRead(mName.c_str(),
"checkpointWrite"));
281 if (mCheckpointWriteFlag) {
282 pvAssert(!params->presentAndNotBeenRead(mName.c_str(),
"checkpointWriteTriggerMode"));
283 if (mCheckpointWriteTriggerMode == WALLCLOCK) {
285 !params->presentAndNotBeenRead(
286 mName.c_str(),
"checkpointWriteTriggerClockInterval"));
287 params->ioParamString(
290 "checkpointWriteClockUnit",
291 &mCheckpointWriteWallclockUnit,
293 if (ioFlag == PARAMS_IO_READ) {
294 pvAssert(mCheckpointWriteWallclockUnit);
295 for (
size_t n = 0; n < strlen(mCheckpointWriteWallclockUnit); n++) {
296 mCheckpointWriteWallclockUnit[n] = tolower(mCheckpointWriteWallclockUnit[n]);
298 if (!strcmp(mCheckpointWriteWallclockUnit,
"second")
299 || !strcmp(mCheckpointWriteWallclockUnit,
"seconds")
300 || !strcmp(mCheckpointWriteWallclockUnit,
"sec")
301 || !strcmp(mCheckpointWriteWallclockUnit,
"s")) {
302 free(mCheckpointWriteWallclockUnit);
303 mCheckpointWriteWallclockUnit = strdup(
"seconds");
304 mCheckpointWriteWallclockIntervalSeconds = mCheckpointWriteWallclockInterval;
307 !strcmp(mCheckpointWriteWallclockUnit,
"minute")
308 || !strcmp(mCheckpointWriteWallclockUnit,
"minutes")
309 || !strcmp(mCheckpointWriteWallclockUnit,
"min")
310 || !strcmp(mCheckpointWriteWallclockUnit,
"m")) {
311 free(mCheckpointWriteWallclockUnit);
312 mCheckpointWriteWallclockUnit = strdup(
"minutes");
313 mCheckpointWriteWallclockIntervalSeconds =
314 mCheckpointWriteWallclockInterval * (time_t)60;
317 !strcmp(mCheckpointWriteWallclockUnit,
"hour")
318 || !strcmp(mCheckpointWriteWallclockUnit,
"hours")
319 || !strcmp(mCheckpointWriteWallclockUnit,
"hr")
320 || !strcmp(mCheckpointWriteWallclockUnit,
"h")) {
321 free(mCheckpointWriteWallclockUnit);
322 mCheckpointWriteWallclockUnit = strdup(
"hours");
323 mCheckpointWriteWallclockIntervalSeconds =
324 mCheckpointWriteWallclockInterval * (time_t)3600;
327 !strcmp(mCheckpointWriteWallclockUnit,
"day")
328 || !strcmp(mCheckpointWriteWallclockUnit,
"days")) {
329 free(mCheckpointWriteWallclockUnit);
330 mCheckpointWriteWallclockUnit = strdup(
"days");
331 mCheckpointWriteWallclockIntervalSeconds =
332 mCheckpointWriteWallclockInterval * (time_t)86400;
335 if (mMPIBlock->getRank() == 0) {
337 "checkpointWriteClockUnit \"%s\" is unrecognized. Use \"seconds\", " 338 "\"minutes\", \"hours\", or \"days\".\n",
339 mCheckpointWriteWallclockUnit);
341 MPI_Barrier(mMPIBlock->getComm());
345 mCheckpointWriteWallclockUnit ==
nullptr,
346 "Error in global rank %d process converting checkpointWriteClockUnit: %s\n",
347 mMPIBlock->getRank(),
355 assert(!params->presentAndNotBeenRead(mName.c_str(),
"checkpointWrite"));
356 if (mCheckpointWriteFlag) {
357 params->ioParamValue(
360 "deleteOlderCheckpoints",
361 &mDeleteOlderCheckpoints,
367 pvAssert(!params->presentAndNotBeenRead(mName.c_str(),
"checkpointWrite"));
368 if (mCheckpointWriteFlag) {
369 pvAssert(!params->presentAndNotBeenRead(mName.c_str(),
"deleteOlderCheckpoints"));
370 if (mDeleteOlderCheckpoints) {
371 params->ioParamValue(ioFlag, mName.c_str(),
"numCheckpointsKept", &mNumCheckpointsKept, 1);
372 if (ioFlag == PARAMS_IO_READ && mNumCheckpointsKept <= 0) {
373 if (mMPIBlock->getRank() == 0) {
374 ErrorLog() <<
"HyPerCol \"" << mName
375 <<
"\": numCheckpointsKept must be positive (value was " 376 << mNumCheckpointsKept <<
")\n";
378 MPI_Barrier(mMPIBlock->getComm());
381 if (ioFlag == PARAMS_IO_READ) {
382 if (mNumCheckpointsKept < 0) {
383 if (mMPIBlock->getRank() == 0) {
384 ErrorLog() <<
"HyPerCol \"" << mName
385 <<
"\": numCheckpointsKept must be positive (value was " 386 << mNumCheckpointsKept <<
")\n";
388 MPI_Barrier(mMPIBlock->getComm());
391 if (mOldCheckpointDirectories.size() != 0) {
392 WarnLog() <<
"ioParamsFillGroup called after list of old checkpoint directories was " 393 "created. Reinitializing.\n";
395 mOldCheckpointDirectories.resize(mNumCheckpointsKept,
"");
396 mOldCheckpointDirectoriesIndex = 0;
403 assert(!params->presentAndNotBeenRead(mName.c_str(),
"checkpointWrite"));
404 if (mCheckpointWriteFlag) {
405 params->ioParamValue(
408 "checkpointIndexWidth",
409 &mCheckpointIndexWidth,
410 mCheckpointIndexWidth);
415 enum ParamsIOFlag ioFlag,
417 assert(!params->presentAndNotBeenRead(mName.c_str(),
"checkpointWrite"));
418 if (mCheckpointWriteFlag) {
419 params->ioParamValue(
422 "suppressNonplasticCheckpoints",
423 &mSuppressNonplasticCheckpoints,
424 mSuppressNonplasticCheckpoints);
429 assert(!params->presentAndNotBeenRead(mName.c_str(),
"checkpointWrite"));
430 if (!mCheckpointWriteFlag) {
431 params->ioParamStringRequired(
432 ioFlag, mName.c_str(),
"lastCheckpointDir", &mLastCheckpointDir);
437 params->ioParamString(
440 "initializeFromCheckpointDir",
441 &mInitializeFromCheckpointDir,
444 if (ioFlag == PARAMS_IO_READ and mInitializeFromCheckpointDir !=
nullptr 445 and mInitializeFromCheckpointDir[0] !=
'\0') {
446 verifyDirectory(mInitializeFromCheckpointDir,
"InitializeFromCheckpointDir.\n");
450 void Checkpointer::provideFinalStep(
long int finalStep) {
451 if (mCheckpointIndexWidth < 0) {
452 mWidthOfFinalStepNumber = (int)std::floor(std::log10((
float)finalStep)) + 1;
457 mObserverTable.addObject(observer->getDescription(), observer);
460 bool Checkpointer::registerCheckpointEntry(
461 std::shared_ptr<CheckpointEntry> checkpointEntry,
462 bool constantEntireRun) {
463 if (mSuppressNonplasticCheckpoints && constantEntireRun) {
466 std::string
const &name = checkpointEntry->getName();
467 for (
auto &c : mCheckpointRegistry) {
468 if (c->getName() == checkpointEntry->getName()) {
472 mCheckpointRegistry.push_back(checkpointEntry);
476 void Checkpointer::registerTimer(
Timer const *timer) { mTimers.push_back(timer); }
478 void Checkpointer::readNamedCheckpointEntry(
479 std::string
const &objName,
480 std::string
const &dataName,
481 bool constantEntireRun) {
482 std::string checkpointEntryName(objName);
483 if (!(objName.empty() || dataName.empty())) {
484 checkpointEntryName.append(
"_");
486 checkpointEntryName.append(dataName);
487 readNamedCheckpointEntry(checkpointEntryName, constantEntireRun);
490 void Checkpointer::readNamedCheckpointEntry(
491 std::string
const &checkpointEntryName,
492 bool constantEntireRun) {
493 if (mSuppressNonplasticCheckpoints and constantEntireRun) {
496 std::string checkpointDirectory = generateBlockPath(mInitializeFromCheckpointDir);
497 for (
auto &c : mCheckpointRegistry) {
498 if (c->getName() == checkpointEntryName) {
499 double timestamp = 0.0;
500 c->read(checkpointDirectory, ×tamp);
504 Fatal() <<
"initializeFromCheckpoint failed to find checkpointEntryName " << checkpointEntryName
508 void Checkpointer::findWarmStartDirectory() {
509 char warmStartDirectoryBuffer[PV_PATH_MAX];
510 if (mMPIBlock->getRank() == 0) {
511 if (mCheckpointWriteFlag) {
513 pvAssert(mCheckpointWriteDir);
514 std::string cpDirString = mCheckpointWriteDir;
515 if (cpDirString.c_str()[cpDirString.length() - 1] !=
'/') {
519 int statstatus = PV_stat(cpDirString.c_str(), &statbuf);
520 if (statstatus == 0) {
521 if (statbuf.st_mode & S_IFDIR) {
522 char *dirs[] = {mCheckpointWriteDir,
nullptr};
523 FTS *fts = fts_open(dirs, FTS_LOGICAL,
nullptr);
524 FTSENT *ftsent = fts_read(fts);
526 long int cpIndex = LONG_MIN;
527 std::string indexedDir;
528 for (ftsent = fts_children(fts, 0); ftsent !=
nullptr; ftsent = ftsent->fts_link) {
529 if (ftsent->fts_statp->st_mode & S_IFDIR) {
531 int k = sscanf(ftsent->fts_name,
"Checkpoint%ld", &x);
534 indexedDir = ftsent->fts_name;
541 "restarting but checkpointWriteFlag is set and " 542 "checkpointWriteDir directory \"%s\" does not have any " 544 mCheckpointWriteDir);
545 mCheckpointReadDirectory = cpDirString;
546 mCheckpointReadDirectory.append(indexedDir);
550 "checkpoint read directory \"%s\" is " 551 "not a directory.\n",
552 mCheckpointWriteDir);
555 else if (errno == ENOENT) {
557 "restarting but neither Last nor checkpointWriteDir " 558 "directory \"%s\" exists.\n",
559 mCheckpointWriteDir);
563 pvAssert(mLastCheckpointDir);
565 mLastCheckpointDir[0] ==
'\0',
566 "Restart flag set, but unable to determine restart directory.\n");
567 mCheckpointReadDirectory = strdup(mLastCheckpointDir);
570 mCheckpointReadDirectory.size() >= PV_PATH_MAX,
571 "Restart flag set, but inferred checkpoint read directory is too long (%zu " 573 mCheckpointReadDirectory.size());
575 warmStartDirectoryBuffer,
576 mCheckpointReadDirectory.c_str(),
577 mCheckpointReadDirectory.size());
578 warmStartDirectoryBuffer[mCheckpointReadDirectory.size()] =
'\0';
580 MPI_Bcast(warmStartDirectoryBuffer, PV_PATH_MAX, MPI_CHAR, 0, mMPIBlock->getComm());
581 if (mMPIBlock->getRank() != 0) {
582 mCheckpointReadDirectory = warmStartDirectoryBuffer;
586 void Checkpointer::readStateFromCheckpoint() {
587 if (getInitializeFromCheckpointDir() and getInitializeFromCheckpointDir()[0]) {
591 mMPIBlock->getRank() == 0 );
596 std::vector<std::string> checkpointReadDirs;
597 checkpointReadDirs.reserve(mMPIBlock->getBatchDimension());
598 std::size_t dirStart = (std::size_t)0;
599 while (dirStart < mCheckpointReadDirectory.size()) {
600 std::size_t dirStop = mCheckpointReadDirectory.find(
':', dirStart);
601 if (dirStop == std::string::npos) {
602 dirStop = mCheckpointReadDirectory.size();
604 checkpointReadDirs.push_back(mCheckpointReadDirectory.substr(dirStart, dirStop - dirStart));
606 checkpointReadDirs.size() > (std::size_t)mMPIBlock->getBatchDimension(),
607 "Checkpoint read parsing error: Too many colon separated " 608 "checkpoint read directories. " 609 "Only specify %d checkpoint directories.\n",
610 mMPIBlock->getBatchDimension());
611 dirStart = dirStop + 1;
614 int const count = (int)checkpointReadDirs.size();
616 count != mMPIBlock->getBatchDimension() && count != 1,
617 "Checkpoint read parsing error: Not enough colon separated " 618 "checkpoint read directories. " 619 "Running with %d batch MPIs but only %d colon separated checkpoint " 621 mMPIBlock->getBatchDimension(),
624 int const checkpointIndex = count == 1 ? 0 : mMPIBlock->getBatchIndex();
625 std::string dirString = expandLeadingTilde(checkpointReadDirs[checkpointIndex].c_str());
626 mCheckpointReadDirectory = dirString.c_str();
627 pvAssert(!mCheckpointReadDirectory.empty());
631 "Setting CheckpointReadDirectory to %s.\n",
632 mMPIBlock->getGlobalRank(),
633 mCheckpointReadDirectory.c_str());
637 void Checkpointer::checkpointRead(
double *simTimePointer,
long int *currentStepPointer) {
638 verifyDirectory(mCheckpointReadDirectory.c_str(),
"CheckpointReadDirectory");
639 std::string checkpointReadDirectory = generateBlockPath(mCheckpointReadDirectory);
641 for (
auto &c : mCheckpointRegistry) {
642 c->read(checkpointReadDirectory, &readTime);
644 mTimeInfoCheckpointEntry->read(checkpointReadDirectory.c_str(), &readTime);
645 if (simTimePointer) {
646 *simTimePointer = mTimeInfo.mSimTime;
648 if (currentStepPointer) {
649 *currentStepPointer = mTimeInfo.mCurrentCheckpointStep;
653 std::make_shared<ProcessCheckpointReadMessage const>(checkpointReadDirectory),
654 mMPIBlock->getRank() == 0 );
657 void Checkpointer::checkpointWrite(
double simTime) {
658 mTimeInfo.mSimTime = simTime;
660 if (!mCheckpointWriteFlag) {
663 bool isScheduled = scheduledCheckpoint();
666 if (receivedSignal()) {
667 checkpointWriteSignal();
669 else if (isScheduled) {
672 mTimeInfo.mCurrentCheckpointStep++;
677 int checkpointSignal;
678 if (mMPIBlock->getGlobalRank() == 0) {
679 int sigstatus = PV_SUCCESS;
682 sigstatus = sigpending(&pollusr1);
683 assert(sigstatus == 0);
684 checkpointSignal = sigismember(&pollusr1, SIGUSR1);
685 assert(checkpointSignal == 0 || checkpointSignal == 1);
686 if (checkpointSignal) {
687 sigstatus = sigemptyset(&pollusr1);
688 assert(sigstatus == 0);
689 sigstatus = sigaddset(&pollusr1, SIGUSR1);
690 assert(sigstatus == 0);
692 sigwait(&pollusr1, &result);
693 assert(result == SIGUSR1);
696 MPI_Bcast(&checkpointSignal, 1 , MPI_INT, 0, mMPIBlock->getGlobalComm());
697 return (checkpointSignal != 0);
701 bool isScheduled =
false;
702 switch (mCheckpointWriteTriggerMode) {
707 case STEP: isScheduled = scheduledStep();
break;
708 case SIMTIME: isScheduled = scheduledSimTime();
break;
709 case WALLCLOCK: isScheduled = scheduledWallclock();
break;
710 default: pvAssert(0);
break;
716 bool isScheduled =
false;
717 pvAssert(mCheckpointWriteStepInterval > 0);
718 if (mTimeInfo.mCurrentCheckpointStep % mCheckpointWriteStepInterval == 0) {
719 mNextCheckpointStep = mTimeInfo.mCurrentCheckpointStep + mCheckpointWriteStepInterval;
726 bool isScheduled =
false;
727 if (mTimeInfo.mSimTime >= mNextCheckpointSimtime) {
728 mNextCheckpointSimtime += mCheckpointWriteSimtimeInterval;
735 bool isScheduled =
false;
736 std::time_t currentTime;
737 if (mMPIBlock->getGlobalRank() == 0) {
738 currentTime = std::time(
nullptr);
740 MPI_Bcast(¤tTime,
sizeof(currentTime), MPI_CHAR, 0, mMPIBlock->getComm());
741 if (currentTime == (std::time_t)(-1)) {
744 double elapsed = std::difftime(currentTime, mLastCheckpointWallclock);
745 if (elapsed >= mCheckpointWriteWallclockInterval) {
747 mLastCheckpointWallclock = currentTime;
754 "Global rank %d: checkpointing in response to SIGUSR1 at time %f.\n",
755 mMPIBlock->getGlobalRank(),
757 std::string checkpointDirectory = makeCheckpointDirectoryFromCurrentStep();
758 checkpointToDirectory(checkpointDirectory);
761 std::string Checkpointer::makeCheckpointDirectoryFromCurrentStep() {
762 std::stringstream checkpointDirStream;
763 checkpointDirStream << mCheckpointWriteDir <<
"/Checkpoint";
764 int fieldWidth = mCheckpointIndexWidth < 0 ? mWidthOfFinalStepNumber : mCheckpointIndexWidth;
765 checkpointDirStream.fill(
'0');
766 checkpointDirStream.width(fieldWidth);
767 checkpointDirStream << mTimeInfo.mCurrentCheckpointStep;
768 std::string checkpointDirectory = checkpointDirStream.str();
769 return checkpointDirectory;
773 std::string checkpointDirectory = makeCheckpointDirectoryFromCurrentStep();
774 if (checkpointDirectory != mCheckpointReadDirectory) {
777 if (mMPIBlock->getGlobalRank() == 0) {
778 InfoLog() <<
"Checkpointing to \"" << checkpointDirectory
779 <<
"\", simTime = " << mTimeInfo.mSimTime <<
"\n";
783 if (mMPIBlock->getGlobalRank() == 0) {
785 "Skipping checkpoint to \"%s\"," 786 " which would clobber the checkpointRead checkpoint.\n",
787 checkpointDirectory.c_str());
791 checkpointToDirectory(checkpointDirectory);
793 if (mDeleteOlderCheckpoints) {
794 rotateOldCheckpoints(checkpointDirectory);
799 std::string checkpointDirectory = generateBlockPath(directory);
800 mCheckpointTimer->start();
801 if (mMPIBlock->getRank() == 0) {
802 InfoLog() <<
"Checkpointing to directory \"" << checkpointDirectory
803 <<
"\" at simTime = " << mTimeInfo.mSimTime <<
"\n";
804 struct stat timeinfostat;
805 std::string timeinfoFilename(checkpointDirectory);
806 timeinfoFilename.append(
"/timeinfo.bin");
807 int statstatus = stat(timeinfoFilename.c_str(), &timeinfostat);
808 if (statstatus == 0) {
809 WarnLog() <<
"Checkpoint directory \"" << checkpointDirectory
810 <<
"\" has existing timeinfo.bin, which is now being deleted.\n";
811 mTimeInfoCheckpointEntry->remove(checkpointDirectory);
816 std::make_shared<PrepareCheckpointWriteMessage const>(checkpointDirectory),
817 mMPIBlock->getRank() == 0 );
818 ensureDirExists(mMPIBlock, checkpointDirectory.c_str());
819 for (
auto &c : mCheckpointRegistry) {
820 c->write(checkpointDirectory, mTimeInfo.mSimTime, mVerifyWrites);
822 mTimeInfoCheckpointEntry->write(checkpointDirectory, mTimeInfo.mSimTime, mVerifyWrites);
823 mCheckpointTimer->stop();
824 mCheckpointTimer->start();
825 writeTimers(checkpointDirectory);
826 mCheckpointTimer->stop();
827 if (mMPIBlock->getRank() == 0) {
828 InfoLog().printf(
"checkpointWrite complete. simTime = %f\n", mTimeInfo.mSimTime);
833 void Checkpointer::finalCheckpoint(
double simTime) {
834 mTimeInfo.mSimTime = simTime;
835 if (mCheckpointWriteFlag) {
838 else if (mLastCheckpointDir !=
nullptr && mLastCheckpointDir[0] !=
'\0') {
839 checkpointToDirectory(std::string(mLastCheckpointDir));
844 std::string &oldestCheckpointDir = mOldCheckpointDirectories[mOldCheckpointDirectoriesIndex];
845 if (!oldestCheckpointDir.empty()) {
846 if (mMPIBlock->getRank() == 0) {
847 std::string targetDirectory = generateBlockPath(oldestCheckpointDir);
848 struct stat lcp_stat;
849 int statstatus = stat(targetDirectory.c_str(), &lcp_stat);
850 if (statstatus != 0 || !(lcp_stat.st_mode & S_IFDIR)) {
851 if (statstatus == 0) {
853 "Failed to delete older checkpoint: failed to stat \"%s\": %s.\n",
854 targetDirectory.c_str(),
859 "Deleting older checkpoint: \"%s\" exists but is not a directory.\n",
860 targetDirectory.c_str());
864 std::string rmrf_string(
"");
865 rmrf_string = rmrf_string +
"rm -r '" + targetDirectory +
"'";
866 int rmrf_result = system(rmrf_string.c_str());
867 if (rmrf_result != 0) {
869 "unable to delete older checkpoint \"%s\": rm command returned %d\n",
870 targetDirectory.c_str(),
871 WEXITSTATUS(rmrf_result));
874 MPI_Barrier(mMPIBlock->getGlobalComm());
875 if (mMPIBlock->getGlobalRank() == 0) {
877 struct stat oldcp_stat;
878 int statstatus = stat(oldestCheckpointDir.c_str(), &oldcp_stat);
879 if (statstatus == 0 && (oldcp_stat.st_mode & S_IFDIR)) {
880 int rmdirstatus = rmdir(oldestCheckpointDir.c_str());
883 "Unable to delete older checkpoint \"%s\": rmdir command returned %d " 885 oldestCheckpointDir.c_str(),
887 std::strerror(errno));
892 mOldCheckpointDirectories[mOldCheckpointDirectoriesIndex] = newCheckpointDirectory;
893 mOldCheckpointDirectoriesIndex++;
894 if (mOldCheckpointDirectoriesIndex == mNumCheckpointsKept) {
895 mOldCheckpointDirectoriesIndex = 0;
899 void Checkpointer::writeTimers(
PrintStream &stream)
const {
900 for (
auto timer : mTimers) {
901 timer->fprint_time(stream);
905 std::string Checkpointer::generateBlockPath(std::string
const &baseDirectory) {
906 std::string path(baseDirectory);
907 if (!mBlockDirectoryName.empty()) {
908 path.append(
"/").append(mBlockDirectoryName);
913 void Checkpointer::verifyDirectory(
char const *directory, std::string
const &description) {
914 int status = PV_SUCCESS;
915 if (mMPIBlock->getRank() == 0) {
916 if (directory ==
nullptr || directory[0] ==
'\0') {
917 ErrorLog() <<
"Checkpointer \"" << mName <<
"\": " << description <<
" is not set.\n";
920 struct stat directoryStat;
921 int statResult = stat(expandLeadingTilde(directory).c_str(), &directoryStat);
922 if (statResult != 0) {
923 ErrorLog() <<
"Checkpointer \"" << mName <<
"\": checking status of " << description
924 <<
" \"" << directory <<
"\" returned error \"" << strerror(errno) <<
"\".\n";
927 bool isDirectory = S_ISDIR(directoryStat.st_mode);
929 ErrorLog() <<
"Checkpointer \"" << mName <<
"\": " << description <<
" \"" << directory
930 <<
" is not a directory.\n";
938 void Checkpointer::writeTimers(std::string
const &directory) {
939 if (mMPIBlock->getRank() == 0) {
940 std::string timerpathstring = directory;
941 timerpathstring +=
"/";
942 timerpathstring +=
"timers.txt";
944 const char *timerpath = timerpathstring.c_str();
945 FileStream timerstream(timerpath, std::ios_base::out, mVerifyWrites);
946 writeTimers(timerstream);
950 std::string
const Checkpointer::mDefaultOutputPath =
"output";
void ioParam_suppressNonplasticCheckpoints(enum ParamsIOFlag ioFlag, PVParams *params)
void checkpointWriteSignal()
void ioParam_checkpointWriteClockUnit(enum ParamsIOFlag ioFlag, PVParams *params)
checkpointWriteClockUnit: If checkpointWrite on clock, specifies the units used in checkpointWrite...
void rotateOldCheckpoints(std::string const &newCheckpointDirectory)
virtual void addObserver(Observer *observer) override
void ioParam_deleteOlderCheckpoints(enum ParamsIOFlag ioFlag, PVParams *params)
deleteOlderCheckpoints: If checkpointWrite, specifies if the run should delete older checkpoints when...
void ioParam_checkpointWriteTriggerMode(enum ParamsIOFlag ioFlag, PVParams *params)
mCheckpointWriteTriggerMode: If checkpointWrite is set, specifies the method to checkpoint.
virtual void ioParam_verifyWrites(enum ParamsIOFlag ioFlag, PVParams *params)
verifyWrites: If true, calls to FileStream::write are checked by opening the file in read mode and re...
void ioParam_initializeFromCheckpointDir(enum ParamsIOFlag ioFlag, PVParams *params)
initializeFromCheckpointDir: Sets directory used by Checkpointer::initializeFromCheckpoint(). Layers and connections use this directory if they set their initializeFromCheckpointFlag parameter.
bool scheduledCheckpoint()
int getGlobalRank() const
virtual void ioParam_outputPath(enum ParamsIOFlag ioFlag, PVParams *params)
mOutputPath: Specifies the absolute or relative output path of the run
void ioParam_checkpointWriteDir(enum ParamsIOFlag ioFlag, PVParams *params)
checkpointWriteDir: If checkpointWrite is set, specifies the output checkpoint directory.
void ioParam_checkpointWriteStepInterval(enum ParamsIOFlag ioFlag, PVParams *params)
checkpointWriteStepInterval: If checkpointWrite on step, specifies the number of steps between checkp...
bool scheduledWallclock()
void checkpointToDirectory(std::string const &checkpointDirectory)
void ioParam_lastCheckpointDir(enum ParamsIOFlag ioFlag, PVParams *params)
lastCheckpointDir: If checkpointWrite is not set, this required parameter specifies the directory to ...
void ioParam_checkpointIndexWidth(enum ParamsIOFlag ioFlag, PVParams *params)
If checkpointWrite is true, checkpointIndexWidth specifies the minimum width for the step number appe...
void ioParam_checkpointWrite(enum ParamsIOFlag ioFlag, PVParams *params)
checkpointWrite: Flag to determine if the run writes checkpoints.
void ioParam_numCheckpointsKept(enum ParamsIOFlag ioFlag, PVParams *params)
mNumCheckpointsKept: If mDeleteOlderCheckpoints is set, keep this many checkpoints before deleting th...
void ioParam_checkpointWriteTimeInterval(enum ParamsIOFlag ioFlag, PVParams *params)
checkpointWriteTimeInterval: If checkpointWrite on time, specifies the amount of simulation time betwe...
void extractCheckpointReadDirectory()
void ioParam_checkpointWriteClockInterval(enum ParamsIOFlag ioFlag, PVParams *params)
checkpointWriteClockInterval: If checkpointWrite on clock, specifies the amount of clock time between ...
std::string makeOutputPathFilename(std::string const &path)