Skip to content

Instantly share code, notes, and snippets.

@planetA
Created April 29, 2017 01:03
Show Gist options
  • Save planetA/cc0cbf7cecec56049c79d7d8d53f7d5d to your computer and use it in GitHub Desktop.
Save planetA/cc0cbf7cecec56049c79d7d8d53f7d5d to your computer and use it in GitHub Desktop.
[11908] NOTE at dmtcp_coordinator.cpp:855 in onConnect; REASON='worker connected'
hello_remote.from = 55b24a7c80853818-30373-22c2200d527e
[11908] NOTE at dmtcp_coordinator.cpp:855 in onConnect; REASON='worker connected'
hello_remote.from = 55b24a7c80853818-30374-22c2202e8d96
[11908] NOTE at dmtcp_coordinator.cpp:855 in onConnect; REASON='worker connected'
hello_remote.from = 55b24a7c80853818-30376-22c2205c4835
[11908] NOTE at dmtcp_coordinator.cpp:643 in onData; REASON='Updating process Information after exec()'
progname = simple
msg.from = 55b24a7c80853818-221000-22c220b60ada
client->identity() = 55b24a7c80853818-30374-22c2202e8d96
[11908] NOTE at dmtcp_coordinator.cpp:855 in onConnect; REASON='worker connected'
hello_remote.from = 55b24a7c80853818-30379-22c22079bf81
[11908] NOTE at dmtcp_coordinator.cpp:643 in onData; REASON='Updating process Information after exec()'
progname = simple
msg.from = 55b24a7c80853818-220000-22c220dd7c92
client->identity() = 55b24a7c80853818-30373-22c2200d527e
[11908] NOTE at dmtcp_coordinator.cpp:643 in onData; REASON='Updating process Information after exec()'
progname = simple
msg.from = 55b24a7c80853818-222000-22c220e595fb
client->identity() = 55b24a7c80853818-30376-22c2205c4835
[11908] NOTE at dmtcp_coordinator.cpp:643 in onData; REASON='Updating process Information after exec()'
progname = simple
msg.from = 55b24a7c80853818-223000-22c223c0d3c2
client->identity() = 55b24a7c80853818-30379-22c22079bf81
[11908] NOTE at dmtcp_coordinator.cpp:693 in onDisconnect; REASON='client disconnected'
client->identity() = 55b24a7c80853818-221000-22c220b60ada
client->progname() = simple
[11908] NOTE at dmtcp_coordinator.cpp:1071 in startCheckpoint; REASON='starting checkpoint, suspending all nodes'
s.numPeers = 3
[11908] NOTE at dmtcp_coordinator.cpp:1073 in startCheckpoint; REASON='Incremented computationGeneration'
compId.computationGeneration() = 1
[11908] NOTE at dmtcp_coordinator.cpp:693 in onDisconnect; REASON='client disconnected'
client->identity() = 55b24a7c80853818-222000-22c220e595fb
client->progname() = simple
[11908] NOTE at dmtcp_coordinator.cpp:413 in updateMinimumState; REASON='locking all nodes'
[11908] NOTE at dmtcp_coordinator.cpp:419 in updateMinimumState; REASON='draining all nodes'
[11908] NOTE at dmtcp_coordinator.cpp:425 in updateMinimumState; REASON='checkpointing all nodes'
[11908] NOTE at dmtcp_coordinator.cpp:449 in updateMinimumState; REASON='building name service database'
[11908] NOTE at dmtcp_coordinator.cpp:465 in updateMinimumState; REASON='entertaining queries now'
[11908] NOTE at dmtcp_coordinator.cpp:470 in updateMinimumState; REASON='refilling all nodes'
[11908] NOTE at dmtcp_coordinator.cpp:510 in updateMinimumState; REASON='restarting all nodes'
JTIMER(checkpoint) : 0.496187
[11908] NOTE at dmtcp_coordinator.cpp:693 in onDisconnect; REASON='client disconnected'
client->identity() = 55b24a7c80853818-223000-22c223c0d3c2
client->progname() = simple
[11908] NOTE at dmtcp_coordinator.cpp:1071 in startCheckpoint; REASON='starting checkpoint, suspending all nodes'
s.numPeers = 1
[11908] NOTE at dmtcp_coordinator.cpp:1073 in startCheckpoint; REASON='Incremented computationGeneration'
compId.computationGeneration() = 2
[11908] NOTE at dmtcp_coordinator.cpp:413 in updateMinimumState; REASON='locking all nodes'
[11908] NOTE at dmtcp_coordinator.cpp:419 in updateMinimumState; REASON='draining all nodes'
[11908] NOTE at dmtcp_coordinator.cpp:425 in updateMinimumState; REASON='checkpointing all nodes'
[11908] NOTE at dmtcp_coordinator.cpp:449 in updateMinimumState; REASON='building name service database'
[11908] NOTE at dmtcp_coordinator.cpp:465 in updateMinimumState; REASON='entertaining queries now'
[11908] NOTE at dmtcp_coordinator.cpp:470 in updateMinimumState; REASON='refilling all nodes'
[11908] NOTE at dmtcp_coordinator.cpp:510 in updateMinimumState; REASON='restarting all nodes'
JTIMER(checkpoint) : 0.459036
[11908] NOTE at dmtcp_coordinator.cpp:1071 in startCheckpoint; REASON='starting checkpoint, suspending all nodes'
s.numPeers = 1
[11908] NOTE at dmtcp_coordinator.cpp:1073 in startCheckpoint; REASON='Incremented computationGeneration'
compId.computationGeneration() = 3
[11908] NOTE at dmtcp_coordinator.cpp:413 in updateMinimumState; REASON='locking all nodes'
[11908] NOTE at dmtcp_coordinator.cpp:419 in updateMinimumState; REASON='draining all nodes'
[11908] NOTE at dmtcp_coordinator.cpp:425 in updateMinimumState; REASON='checkpointing all nodes'
[11908] NOTE at dmtcp_coordinator.cpp:449 in updateMinimumState; REASON='building name service database'
[11908] NOTE at dmtcp_coordinator.cpp:465 in updateMinimumState; REASON='entertaining queries now'
[11908] NOTE at dmtcp_coordinator.cpp:470 in updateMinimumState; REASON='refilling all nodes'
[11908] NOTE at dmtcp_coordinator.cpp:510 in updateMinimumState; REASON='restarting all nodes'
JTIMER(checkpoint) : 0.467209
[11908] NOTE at dmtcp_coordinator.cpp:693 in onDisconnect; REASON='client disconnected'
client->identity() = 55b24a7c80853818-220000-22c220dd7c92
client->progname() = simple
#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <dmtcp.h>
/*
 * Shared body of the two test-termination macros below.
 *
 * Writes one line to stderr of the form
 *     DMTCPTESTEND:<tag>:<file>:<line> <formatted message>
 * and then terminates the process with the given exit status.  __FILE__ and
 * __LINE__ refer to the place where CRASH()/OK() was written.
 *
 * Relies on two GNU extensions that the original macros also used: the
 * ({ ... }) statement expression and the comma-swallowing ##__VA_ARGS__.
 */
#define DMTCP_TEST_END(tag, status, format, ...) ({ \
    fprintf(stderr, "DMTCPTESTEND:" tag ":%s:%d " format, \
            __FILE__, __LINE__, ##__VA_ARGS__); \
    exit(status); \
})

/* Report a failure on stderr and exit with EXIT_FAILURE. */
#define CRASH(format, ...) \
    DMTCP_TEST_END("DMTCPCRASH", EXIT_FAILURE, format, ##__VA_ARGS__)

/* Report success on stderr and exit with EXIT_SUCCESS. */
#define OK(format, ...) \
    DMTCP_TEST_END("DMTCPOK", EXIT_SUCCESS, format, ##__VA_ARGS__)
/*
 * Name of the environment variable that carries this process's MPI rank.
 * A build may override it (e.g. -DMPIRANK='"SOME_VAR"'); when it does not,
 * fall back to the variable exported by Open MPI's mpirun so that the file
 * compiles on its own.
 */
#ifndef MPIRANK
#define MPIRANK "OMPI_COMM_WORLD_RANK"
#endif

/* Rank of this process. Stays 0 (we are the only process) if MPIRANK is unset. */
int rank = 0;

/*
 * Initialise the global `rank` from the MPIRANK environment variable.
 *
 * argc, argv: accepted for symmetry with main(); currently unused.
 *
 * If the variable is not set, `rank` keeps its current value.  If it is set
 * but is not a plain non-negative decimal integer that fits in an int, the
 * process reports the problem on stderr (same line format as CRASH) and
 * exits with EXIT_FAILURE instead of silently running as rank 0.
 */
void parse_args(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    const char *rank_str = getenv(MPIRANK);
    if (rank_str == NULL) {
        return;
    }

    /* strtol, unlike atoi, lets us tell "0" apart from unparsable input. */
    errno = 0;
    char *end = NULL;
    long value = strtol(rank_str, &end, 10);

    if (end == rank_str || *end != '\0' || errno == ERANGE ||
        value < 0 || value > INT_MAX) {
        /* Explicit check rather than assert(): must survive -DNDEBUG. */
        fprintf(stderr, "DMTCPTESTEND:DMTCPCRASH:%s:%d invalid rank in %s: '%s'\n",
                __FILE__, __LINE__, MPIRANK, rank_str);
        exit(EXIT_FAILURE);
    }

    rank = (int)value;
}
/*
 * Test driver: every process prints its rank and runs three rounds; only the
 * process whose rank is 0 sleeps and calls dmtcp_checkpoint() in each round.
 * All other ranks fall straight through the loop.  The program always ends
 * through OK() or CRASH(), both of which call exit().
 */
int main(int argc, char **argv)
{
    /* Refuse to run when DMTCP reports that it is not enabled. */
    if (!dmtcp_is_enabled()) {
        CRASH("dmtcpcheckpoint: DMTCP not enabled.\n");
    }

    parse_args(argc, argv);
    printf("My rank is %d\n", rank);

    for (int round = 0; round < 3; ++round) {
        printf("Going sleep\n");
        /* The sleep(1) that belongs here is intentionally disabled. */
        printf("Woke up. Making checkpoint. %d\n", rank);
        fflush(stdout);

        /* Only rank 0 requests checkpoints; other ranks skip the rest. */
        if (rank == 0) {
            sleep(1);
            printf("Entering ckpt. %d\n", rank);
            if (dmtcp_checkpoint() == DMTCP_NOT_PRESENT) {
                CRASH("DMTCP not present.\n");
            }
            printf("Woke up. %d\n", rank);
        }
    }

    printf("Preparing to leave the program\n");
    /*
     * Author's note: "Comment this out".
     * NOTE(review): presumably removing this sleep changes when the non-zero
     * ranks exit relative to rank 0's checkpoints -- confirm.
     */
    sleep(1);
    printf("Leaving the program: %d\n", rank);
    OK("Normal exit\n");
}
$ mpirun -n 4 --oversubscribe -tag-output ../dmtcp/bin/dmtcp_launch --coord-host localhost --coord-port 7779 ./simple checkpoint
[1,1]<stdout>:My rank is 1
[1,1]<stdout>:Going sleep
[1,1]<stdout>:Woke up. Making checkpoint. 1
[1,1]<stdout>:Going sleep
[1,1]<stdout>:Woke up. Making checkpoint. 1
[1,1]<stdout>:Going sleep
[1,1]<stdout>:Woke up. Making checkpoint. 1
[1,1]<stdout>:Preparing to leave the program
[1,0]<stdout>:My rank is 0
[1,0]<stdout>:Going sleep
[1,0]<stdout>:Woke up. Making checkpoint. 0
[1,2]<stdout>:My rank is 2
[1,2]<stdout>:Going sleep
[1,2]<stdout>:Woke up. Making checkpoint. 2
[1,2]<stdout>:Going sleep
[1,2]<stdout>:Woke up. Making checkpoint. 2
[1,2]<stdout>:Going sleep
[1,2]<stdout>:Woke up. Making checkpoint. 2
[1,2]<stdout>:Preparing to leave the program
[1,3]<stdout>:My rank is 3
[1,3]<stdout>:Going sleep
[1,3]<stdout>:Woke up. Making checkpoint. 3
[1,3]<stdout>:Going sleep
[1,3]<stdout>:Woke up. Making checkpoint. 3
[1,3]<stdout>:Going sleep
[1,3]<stdout>:Woke up. Making checkpoint. 3
[1,3]<stdout>:Preparing to leave the program
[1,1]<stdout>:Leaving the program: 1
[1,1]<stderr>:DMTCPTESTEND:DMTCPOK:simple.c:61 Normal exit
[1,0]<stdout>:Entering ckpt. 0
[1,2]<stdout>:Leaving the program: 2
[1,2]<stderr>:DMTCPTESTEND:DMTCPOK:simple.c:61 Normal exit
[1,3]<stdout>:Leaving the program: 3
[1,0]<stdout>:Woke up. 0
[1,0]<stdout>:Going sleep
[1,3]<stderr>:DMTCPTESTEND:DMTCPOK:simple.c:61 Normal exit
[1,0]<stdout>:Woke up. Making checkpoint. 0
[1,0]<stdout>:Entering ckpt. 0
[1,0]<stdout>:Woke up. 0
[1,0]<stdout>:Going sleep
[1,0]<stdout>:Woke up. Making checkpoint. 0
[1,0]<stdout>:Entering ckpt. 0
[1,0]<stdout>:Woke up. 0
[1,0]<stdout>:Preparing to leave the program
[1,0]<stderr>:DMTCPTESTEND:DMTCPOK:simple.c:61 Normal exit
[1,0]<stdout>:Leaving the program: 0
@ruseykin2
Copy link

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment