#!/bin/sh

# Support for checkpointing with DMTCP under SGE.

# Copyright (C) 2012, 2015 Dave Love, University of Liverpool

# This file is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.
# See file GPL-3 in the SGE LICENCES directory.

# Borrows from the shim_dmtcp script distributed in the Debian version
# of Condor, but not sufficiently to be a copyright derived work.

# Originally intended for use as a starter_method, but this version
# won't directly work that way.

# Requires DMTCP v2 commands.

# Fixme: Consider SGE_STARTER_SHELL_PATH,
# SGE_STARTER_SHELL_START_MODE for possible use as a starter.

prog_version=1

# Treat the literal task id "undefined" as task 1.
[ "$SGE_TASK_ID" = undefined ] && SGE_TASK_ID=1

self=$(basename "$0")

# Print the help text and exit.
# $1 - optional exit status; when absent/empty the text goes to stdout
#      and the exit status is 0, otherwise it goes to stderr.
# Variable references that are meant to be shown literally are escaped
# so that they are not expanded inside the double-quoted string.
usage () {
    help="\
Usage: $self [options] [[--] COMMAND [ARGS]]
DMTCP checkpointing support for SGE, covering running, checkpoint,
migrate and clean.

Actions (only one specified, default -r):
  -c COMMAND  Behave like \"-r sh -c 'COMMAND'\", e.g. for use with qsub -S.
              Quoting may be problematic due to an extra shell expansion.
  -k          Clean up (stop processes and delete DIR)
  -m          Migrate (checkpoint and exit with code 99)
  -p          Make a checkpoint
  -r          Run COMMAND under checkpointing (default action).  Tries to
              restart an existing checkpoint if RESTARTED is not 0.  Due
              to an apparent bug in DMTCP 2.4, at least, COMMAND is not
              found on PATH, and so must be a file name.

Options:
  -d DIR      Directory in which to write checkpoints
              (default \$SGE_CKPT_DIR/\$JOB_ID.\$SGE_TASK_ID)
  -s SIGNAL   Signal to cause a checkpoint (e.g. \"10\", \"USR1\"), probably
              as in checkpoint(5).

Signals caught:  USR1, and SIGNAL, as above, causes a checkpoint; USR2
initiates the migrate action, and returns 99 to cause rescheduling.

Files ~/.dmtcpckpt and .dmtcpckpt are sourced in that order, if they
exist, to supply hook variables.  Of these, variables
{pre,post}_{migrate,checkpoint}_hook and pre_launch_hook are evaluated
as commands before/after the relevant actions, and \$launch_opts is
expanded as extra options for dmtcp_launch when COMMAND is run
initially, e.g. to use plugins, or add --ckpt-open-files or --interval.

The migrate and checkpoint actions call dmtcp_command with the
appropriate coordinator commands as arguments.

The RESTARTED environment variable must have a valid value per
submit(1), and is used to decide whether or not to start from scratch.

Example checkpoint(5):

  ckpt_name          dmtcp
  interface          application-level
  ckpt_command       \$sge_root/site/dmtcpckpt -p
  migr_command       \$sge_root/site/dmtcpckpt -m
  restart_command    NONE
  clean_command      \$sge_root/site/dmtcpckpt -k
  ckpt_dir           /scratch/\$job_owner/checkpoints
  signal             NONE
  when               xs
"
    if [ -z "$1" ]; then
        echo "$help"
        exit 0
    else
        echo "$help" >&2
        exit "$1"
    fi
}

# Print "<self>: <message>" on stderr and exit with status 1.
# Note that inside a command substitution or background list this only
# terminates that subshell.
error () {
    echo "$self: $*" >&2
    exit 1
}

# The option string must list every option handled in the loop below
# (including -s SIGNAL and -v); a getopt failure shows the usage text.
OPTS=$(getopt -o hc:rkmd:ps:v -l help,version -n "$self" -- "$@") ||
    usage $?
eval set -- "$OPTS"

cpsignal=                       # arg of -s
cmd=                            # arg of -c
# Action flags; cleared so that they can't be inherited from the environment
do_ckpt= do_c= do_run= do_clean= do_migrate=
while [ $# -gt 0 ]; do
    case "$1" in
        -h|--help) usage;;
        -p) do_ckpt=1;;
        -c) do_c=1; cmd=$2; shift;;
        -r) do_run=1;;
        -k) do_clean=1;;
        -m) do_migrate=1;;
        -d) cpdir=$2; shift;;
        -s) cpsignal=$2; shift;;
        -v|--version) echo "$prog_version"; exit;;
        --) shift; break;;
        *) break;;
    esac
    shift
done

# only one allowed
case "$do_ckpt$do_run$do_clean$do_migrate$do_c" in
    '') [ $# -eq 0 ] && usage 1 # need command
        do_run=1;;
    11*) usage 1;;
esac

case $(dmtcp_command --version 2>&1) in
    "dmtcp_command (DMTCP) 2"*) :;;
    *) error "DMTCP version 2 required";;
esac

# -c COMMAND: run the string through /bin/sh as the command to launch
[ -n "$cmd" ] && set -- /bin/sh -c "$cmd"

# cpdir is a useful abbreviation
[ -n "$cpdir" ] || cpdir="$SGE_CKPT_DIR/$JOB_ID.$SGE_TASK_ID"

DMTCP_CHECKPOINT_DIR=$cpdir; export DMTCP_CHECKPOINT_DIR
DMTCP_QUIET=1; export DMTCP_QUIET

# possible config of hooks
# ("." rather than "source", which isn't available in every /bin/sh;
# the explicit ./ stops "." searching PATH for the second file)
[ -f ~/.dmtcpckpt ] && . ~/.dmtcpckpt
[ -f .dmtcpckpt ] && . ./.dmtcpckpt

# Deal with -notify or checkpointing signals

launch_pid=         # pid of (re-)started command being waited on

# For pending STOP (USR1) or a defined checkpointing one.
# $launch_pid and $cpsignal are deliberately unquoted: either may be empty.
# NOTE(review): the wait interrupted by the signal still returns its
# (>128) status to the caller once this handler finishes -- confirm
# whether the job's exit status should be preserved instead.
trap 'checkpoint; wait $launch_pid' USR1 $cpsignal

# Pending KILL -- reschedule; perhaps reschedule should be optional
trap 'migrate; exit 99' USR2

# read coordinator port from cpdir
# Prints the contents of $cpdir/coord-port.  Callers use it in a command
# substitution, so on failure they must check the substitution's status
# themselves: the exit in error() only leaves the subshell.
get_port () {
    # Would use read, but it returns non-zero due with no newline in the file
    cat "$cpdir/coord-port" ||
        error "Can't read port from $cpdir/coord-port"
}

# Clean up
# Best effort: ask the coordinator to kill the computation, then remove
# the checkpoint directory and exit 0 regardless.
clean () {
    # may fail if communicator is dead
    clean_port=$(get_port 2>/dev/null) &&
        dmtcp_command -p "$clean_port" -k 2>/dev/null
    rm -rf "$cpdir"
    exit 0
}

# Migration action
# Runs the pre hook, checkpoints, tells the coordinator to quit, then
# runs the post hook.  Exits via error() if any of the middle steps fail.
migrate () {
    $pre_migrate_hook
    checkpoint || error "migration failed"
    migrate_port=$(get_port) || error "migration failed"
    dmtcp_command -p "$migrate_port" -q || error "migration failed"
    $post_migrate_hook
}

# Checkpoint action
# Runs the pre hook, requests a blocking checkpoint from the
# coordinator, then runs the post hook.  Returns non-zero if the port
# can't be read or the dmtcp_command call fails, so that migrate doesn't
# stop a job whose checkpoint wasn't taken.
checkpoint () {
    $pre_checkpoint_hook
    ckpt_port=$(get_port) || return 1
    ckpt_status=0
    dmtcp_command -p "$ckpt_port" -bc || ckpt_status=$?
    $post_checkpoint_hook
    return $ckpt_status
}

# Launch from scratch
# "$@" - command and arguments to run under dmtcp_launch.
# Recreates the checkpoint directory (no access for others), runs the
# pre-launch hook and waits for the launched command.
from_scratch () {
    rm -rf "$cpdir"
    (umask o-rwx; mkdir -p "$cpdir") ||
        error "Can't make checkpoint directory \"$cpdir\""
    $pre_launch_hook
    # Launch in background and wait, so that signals can be caught.
    # Fixme: -p 0 doesn't retry when the random port is in use --
    # either grab one initially or fix the dmtcp code.
    # $launch_opts is deliberately unquoted: it holds a list of options.
    dmtcp_launch --new-coordinator --port-file "$cpdir/coord-port" -p 0 \
        $launch_opts "$@" || error 'launch failed' &
    launch_pid=$!
    wait "$launch_pid"
}

# Re-start from existing checkpoint, else launch
# "$@" - command and arguments, used only when there is no restart script.
restarter () {
    if [ -f "$cpdir/dmtcp_restart_script.sh" ]; then
        echo "$self: restarting from checkpoint"
        # Fixme: Deal properly with port; restart script doesn't have
        # --port-file to use with -p 0.
        restart_port=$(get_port) || error 'restart failed'
        "$cpdir/dmtcp_restart_script.sh" -p "$restart_port" \
            -h "$(hostname)" || error 'restart failed' &
        launch_pid=$!
        wait "$launch_pid"
    else
        echo "$self: no checkpoint -- doing fresh start"
        from_scratch "$@"
    fi
}

# Start or restart the command according to RESTARTED (see submit(1)).
# "$@" - command and arguments.
run () {
    case "$RESTARTED" in
        0) from_scratch "$@";;
        1|2) restarter "$@";;   # restarted somehow
        *) echo "$self: Bad value ($RESTARTED) of RESTARTED -- doing fresh start"
           from_scratch "$@";;
    esac
}

# Dispatch on the selected action
if [ -n "$do_run" ] && [ $# -gt 0 ]; then
    run "$@"
elif [ -n "$do_run" ]; then
    restarter "$@"
elif [ -n "$do_c" ]; then
    run "$@"
elif [ -n "$do_ckpt" ]; then
    checkpoint
elif [ -n "$do_migrate" ]; then
    migrate
elif [ -n "$do_clean" ]; then
    clean
else
    error "internal error: no action"
fi