#!/bin/sh
# Support for checkpointing with DMTCP
# under SGE.
# Copyright (C) 2012, 2015 Dave Love, University of Liverpool
# This file is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.
# See file GPL-3 in the SGE LICENCES directory.
# Borrows from the shim_dmtcp script distributed in the Debian version
# of Condor, but not sufficiently to be a copyright derived work.
# Originally intended for use as a starter_method, but this version
# won't directly work that way.
# Requires DMTCP v2 commands.
# Fixme: Consider SGE_STARTER_SHELL_PATH,
# SGE_STARTER_SHELL_START_MODE for possible use as a starter.
# Version reported by -v/--version.
prog_version=1
# Treat the literal value "undefined" as task 1 (presumably what SGE
# supplies for non-array jobs -- confirm against submit(1)).
if [ "$SGE_TASK_ID" = undefined ]; then
    SGE_TASK_ID=1
fi
# Name used as a prefix in messages.  Parameter expansion instead of
# an unquoted `basename $0', which breaks when the path has spaces.
self=${0##*/}
# Print the help text and exit.
# Arguments: $1 - optional exit status.  If absent or empty, the text
#                 is written to stdout and the script exits 0;
#                 otherwise the text is written to stderr and the
#                 script exits with status $1.
# Globals:   self (read), help (written)
usage () {
    # Only $self is meant to expand in the text below.  Every other `$'
    # is escaped so that the variable names are shown literally rather
    # than being replaced by (usually empty) values from this shell.
    help="\
Usage: $self [options] [[--] <command> [<arg> ...]]
DMTCP checkpointing support for SGE, covering running, checkpoint,
migrate and clean.
Actions (only one specified, default -r):
  -c <command>  Behave like \"-r sh -c '<command>'\", e.g. for use with qsub -S.
                Quoting may be problematic due to an extra shell expansion.
  -k            Clean up (stop processes and delete <dir>)
  -m            Migrate (checkpoint and exit with code 99)
  -p            Make a checkpoint
  -r            Run <command> under checkpointing (default action).
                Tries to restart an existing checkpoint if RESTARTED is not 0.
                Due to an apparent bug in DMTCP 2.4, at least, <command> is
                not found on PATH, and so must be a file name.
Options:
  -d <dir>      Directory in which to write checkpoints (default
                \$SGE_CKPT_DIR/\$JOB_ID.\$SGE_TASK_ID)
  -s <signal>   Signal to cause a checkpoint (e.g. \"10\", \"USR1\"),
                probably as in checkpoint(5).
  -h, --help    Print this help and exit
  --version     Print the program version and exit
Signals caught: USR1, and <signal>, as above, causes a checkpoint;
                USR2 initiates the migrate action, and returns 99
                to cause rescheduling.
Files ~/.dmtcpckpt and .dmtcpckpt are sourced in that order, if they
exist, to supply hook variables.  Of these, variables
{pre,post}_{migrate,checkpoint}_hook and pre_launch_hook are evaluated
as commands before/after the relevant actions, and \$launch_opts is
expanded as extra options for dmtcp_launch when <command> is run
initially, e.g. to use plugins, or add --ckpt-open-files or
--interval.  The migrate and checkpoint actions call dmtcp_command
with the appropriate coordinator commands as arguments.
The RESTARTED environment variable must have a valid value per submit(1),
and is used to decide whether or not to start from scratch.
Example checkpoint(5):
    ckpt_name        dmtcp
    interface        application-level
    ckpt_command     \$sge_root/site/dmtcpckpt -p
    migr_command     \$sge_root/site/dmtcpckpt -m
    restart_command  NONE
    clean_command    \$sge_root/site/dmtcpckpt -k
    ckpt_dir         /scratch/\$job_owner/checkpoints
    signal           NONE
    when             xs
"
    if [ -z "$1" ]; then
        # Help was asked for explicitly: normal output, success.
        printf '%s\n' "$help"
        exit 0
    else
        # Called because of a usage error: diagnostics channel, failure.
        printf '%s\n' "$help" >&2
        exit "$1"
    fi
}
# Report a fatal error and terminate.
# Arguments: the words of the message (joined with single spaces).
# Outputs:   "<self>: <message>" on stderr.
# Exits the current shell with status 1.  When called inside a command
# substitution or a background list, only that subshell exits.
# Globals:   self (read)
error () {
    # printf with "$*" rather than echo with "$@": the message is
    # emitted verbatim (no backslash interpretation by echo), and the
    # arguments are joined explicitly instead of being spliced into a
    # string.
    printf '%s\n' "$self: $*" >&2
    exit 1
}
# Parse the command line with getopt(1) (the -o/-l/-n form used here
# needs the enhanced getopt).
# The short-option string must list every option handled in the loop
# below, including "s:" (checkpoint signal) and "v" (version).
# A getopt failure must reach usage: nothing may follow the assignment
# on the same command, or its exit status is lost.
OPTS=$(getopt -o hc:rkmd:ps:v -l help,version -n "$self" -- "$@") ||
    usage $?
# getopt emits a shell-quoted argument list; eval re-reads that quoting.
eval set -- "$OPTS"
# Start from a known state so that same-named variables in the
# environment cannot select an action.
do_ckpt= do_run= do_clean= do_migrate= do_c=
cpsignal=			# arg of -s
cmd=				# arg of -c
while [ $# -gt 0 ]; do
    case $1 in
        -h|--help) usage;;
        -p) do_ckpt=1;;
        -c) do_c=1; cmd=$2; shift;;
        -r) do_run=1;;
        -k) do_clean=1;;
        -m) do_migrate=1;;
        -d) cpdir=$2; shift;;
        -s) cpsignal=$2; shift;;
        -v|--version) echo "$prog_version"; exit;;
        --) shift; break;;
        *) break;;
    esac
    shift
done
# Only one action is allowed.  Each selected action contributes a "1",
# so two or more give a string starting "11".
case "$do_ckpt$do_run$do_clean$do_migrate$do_c" in
    '') [ $# -eq 0 ] && usage 1	# no action and no command to run
        do_run=1;;			# default action is -r
    11*) usage 1;;
esac
# Insist on the version 2 DMTCP tools, judging by the banner that
# dmtcp_command prints for --version.
case $(dmtcp_command --version 2>&1) in
    "dmtcp_command (DMTCP) 2"*)
        ;;
    *)
        error "DMTCP version 2 required"
        ;;
esac
# With -c, the command to run becomes a shell invocation of its argument.
if [ -n "$cmd" ]; then
    set -- /bin/sh -c "$cmd"
fi
# cpdir abbreviates the checkpoint directory; unless given with -d it
# is derived from the SGE job environment.
: "${cpdir:=$SGE_CKPT_DIR/$JOB_ID.$SGE_TASK_ID}"
# Settings exported for the DMTCP commands run later.
DMTCP_CHECKPOINT_DIR=$cpdir
DMTCP_QUIET=1
export DMTCP_CHECKPOINT_DIR DMTCP_QUIET
# Optional per-user, then per-directory, configuration of the hook
# variables (see the help text).
# `.' is used because `source' is not a POSIX sh command.  The explicit
# `./' matters: `.' looks up a name without a slash on PATH, not in the
# current directory.
[ -f ~/.dmtcpckpt ] && . ~/.dmtcpckpt
[ -f .dmtcpckpt ] && . ./.dmtcpckpt
# Deal with -notify or checkpointing signals
launch_pid=		    # pid of (re-)started command being waited on
# For pending STOP (USR1) or a defined checkpointing one.
# $cpsignal is deliberately unquoted so that an empty value adds no
# extra signal to the trap.  After checkpointing, resume waiting for
# the job, since the signal interrupts the original wait.
trap 'checkpoint; wait $launch_pid' USR1 $cpsignal
# Pending KILL -- reschedule; perhaps reschedule should be optional
trap 'migrate; exit 99' USR2
# Print the contents of $cpdir/coord-port (the coordinator port) on
# stdout.  If the file cannot be read, report that through error.
# Globals: cpdir (read)
get_port () {
    # cat is used instead of read because read returns non-zero when
    # the file has no trailing newline.
    if ! cat "$cpdir/coord-port"; then
        error "Can't read port from $cpdir/coord-port"
    fi
}
# Clean up: send the coordinator the -k command, delete the checkpoint
# directory, and exit 0 whatever happened.
# Globals: cpdir (read), _clean_port (written)
clean () {
    # Only contact the coordinator when a port could actually be read.
    # Otherwise the command line would be "dmtcp_command -p -k", with
    # -k taken as the port value.
    if _clean_port=$(get_port) && [ -n "$_clean_port" ]; then
        # may fail if the coordinator is dead, hence the discarded stderr
        dmtcp_command -p "$_clean_port" -k 2>/dev/null
    fi
    rm -rf "$cpdir"
    exit 0
}
# Migration action: run the pre-migrate hook, take a checkpoint, send
# the coordinator the -q command, then run the post-migrate hook.
# A failure of the checkpoint or of the -q command is reported through
# error.  The hook variables are expanded unquoted and run as commands;
# an empty hook is a no-op.
migrate () {
    $pre_migrate_hook
    if ! { checkpoint && dmtcp_command -p $(get_port) -q; }; then
        error "migration failed"
    fi
    $post_migrate_hook
}
# Checkpoint action: run the pre-checkpoint hook, send the coordinator
# the -bc command, then run the post-checkpoint hook.
# Returns: the failure status of the checkpoint request if it failed
#          (1 if the coordinator port could not be read); otherwise
#          the status of the post-checkpoint hook (0 when it is empty).
# Reporting the failure lets migrate's "checkpoint &&" stop before it
# tells the coordinator to quit.
# The hook variables are expanded unquoted and run as commands.
# Globals: _ckpt_port, _ckpt_status, _ckpt_hook_status (written)
checkpoint () {
    $pre_checkpoint_hook
    if _ckpt_port=$(get_port); then
        dmtcp_command -p "$_ckpt_port" -bc
        _ckpt_status=$?
    else
        # get_port has already reported why the port is unavailable
        _ckpt_status=1
    fi
    # The post hook still runs after a failed checkpoint.
    $post_checkpoint_hook
    _ckpt_hook_status=$?
    [ "$_ckpt_status" -eq 0 ] || return "$_ckpt_status"
    return "$_ckpt_hook_status"
}
# Launch from scratch: recreate the checkpoint directory, run the
# pre-launch hook, and start the command ("$@") under dmtcp_launch
# with a new coordinator.
# Globals: cpdir, pre_launch_hook, launch_opts (read); launch_pid (written)
# Returns: the status of waiting for the launched background job.
from_scratch () {
    rm -rf "$cpdir"
    # The directory is created in a subshell so the tightened umask
    # (no access for others) does not affect the rest of the script.
    if ! (umask o-rwx; mkdir -p "$cpdir"); then
        error "Can't make checkpoint directory \"$cpdir\""
    fi
    $pre_launch_hook
    # Launch in background and wait, so that signals can be caught.
    # Fixme: -p 0 doesn't retry when the random port is in use --
    # either grab one initially or fix the dmtcp code.
    {
        dmtcp_launch --new-coordinator --port-file "$cpdir/coord-port" \
            -p 0 $launch_opts "$@" ||
            error 'launch failed'
    } &
    launch_pid=$!
    wait $launch_pid
}
# Re-start from an existing checkpoint if $cpdir holds a DMTCP restart
# script; otherwise launch the command ("$@") from scratch.
# Globals: cpdir, self (read); launch_pid (written)
# Returns: the status of from_scratch, or of waiting for the restarted
#          background job.
restarter () {
    if [ ! -f "$cpdir/dmtcp_restart_script.sh" ]; then
        echo "$self: no checkpoint -- doing fresh start"
        from_scratch "$@"
        return
    fi
    echo "$self: restarting from checkpoint"
    # Fixme: Deal properly with port; restart script doesn't have
    # --port-file to use with -p 0.
    # Run in the background and wait, as from_scratch does.
    {
        "$cpdir/dmtcp_restart_script.sh" -p $(get_port) -h $(hostname) ||
            error 'restart failed'
    } &
    launch_pid=$!
    wait $launch_pid
}
# Run action: choose between a fresh launch and a restart according to
# the RESTARTED environment variable, passing the command ("$@") on.
#   0       fresh launch
#   1 or 2  restarted somehow: try to restart from a checkpoint
#   other   complain on stdout, then launch fresh
run () {
    if [ "$RESTARTED" = 0 ]; then
        from_scratch "$@"
    elif [ "$RESTARTED" = 1 ] || [ "$RESTARTED" = 2 ]; then
        restarter "$@"
    else
        echo "$self: Bad value ($RESTARTED) of RESTARTED -- doing fresh start"
        from_scratch "$@"
    fi
}
# Carry out the selected action.
if [ -n "$do_run" ]; then
    if [ $# -gt 0 ]; then
        # -r with a command: fresh launch or restart, as RESTARTED says
        run "$@"
    else
        # -r without a command: go straight to the restart path
        restarter "$@"
    fi
elif [ -n "$do_c" ]; then
    run "$@"
elif [ -n "$do_ckpt" ]; then
    checkpoint
elif [ -n "$do_migrate" ]; then
    migrate
elif [ -n "$do_clean" ]; then
    clean
else
    error "internal error: no action"
fi