#!/bin/sh
# -*-coding: utf-8;-*-

# Copyright (C) 2010  Dave Love <d.love@liv.ac.uk>
# License:  GNU GPL v. 3 <http://www.gnu.org/licenses/lgpl.html>
#                   or any later version.

# From the cluster head node, run padb against a tightly-integrated
# MPI job running under SGE.  This involves identifying the
# master node and then remote shelling out to that with at least
# TMPDIR set to what SGE provides.
# Assumptions:
# * rmgr is set in padb.conf to disambiguate the possibilities or
#   passed as an arg to this script;
# * mpirun (specifically that name for orterun) is being run by the
#   job on the master node, and only one instance of it;
# * pstree is available.

# Fixme:
# * Assuming each SGE job only has a single MPI `job' associated
#   with it (check with padb --list-rmgrs?), fill in the MPI jobid info,
#   as appropriate, in the args passed to padb:  Add a jobid if there
#   isn't one, and -a or -A isn't given, and add it to any bare
#   --full-report arg;
# * Use ps instead of pstree and grovelling /proc?  Maybe use the cpuset
#   if we have it;
# * Extract the spool directory from the SGE config?
# * Allow orterun aliases other than mpirun.

# Customization:

# Command used to reach the compute nodes.  Switch it, e.g. to `qrsh',
# when ssh to the compute nodes isn't available.
remote_shell="ssh"
#remote_shell=/opt/sge/bin/lx-amd64/qrsh

# Hook text placed directly in front of the command that runs the MPI
# stuff on the remote host.  Anything other than environment
# assignments (as in this example) must end in a newline or `;'.
# PATH and LD_LIBRARY_PATH are propagated separately, so they needn't
# be set here.  The OMPI... setting undoes the normal restriction on
# our system of running open-mpi only under the resource manager.
remote_preamble='OMPI_MCA_orte_allocation_required=0'

# Job spool directory on the master host.
# NOTE(review): the double quotes expand host, jobnum and task at this
# point, and this script only assigns them further down -- confirm that
# they have the intended values here.
spooldir="${SGE_ROOT}/${SGE_CELL}/spool/${host}/active_jobs/${jobnum}.${task}"

# end likely customization

# error MESSAGE...
# Print the arguments, joined by spaces, as one line on stderr and exit
# the script with status 1.
error () {
    # printf rather than echo, so that a message which starts with `-n'
    # or contains backslashes is printed exactly as given.
    printf '%s\n' "$*" 1>&2
    exit 1
}

# usage [STATUS]
# Print the usage message and exit with STATUS (default 0).
# With STATUS 0 (help was asked for) the text goes to stdout; with any
# other STATUS it is an error report and goes to stderr.
usage () {
    usage_status=${1:-0}
    usage_prog=${0##*/}
    msg="Usage: $usage_prog <jobnum>[.<tasknum>] <padb args>...
Run padb(1) with given args against the parallel job with the given job/task
number.  E.g. $usage_prog 123.1 --proc-summary
The -a option is supplied to padb on the assumption that there is only
one MPI session associated with the job."
    if [ "$usage_status" = 0 ]; then
	printf '%s\n' "$msg"
    else
	printf '%s\n' "$msg" 1>&2
    fi
    exit "$usage_status"
}

# Argument checks.  --help prints the usage and exits; otherwise we need
# a job spec plus at least one padb argument, and the spec must have the
# form <digits> or <digits>.<digits>.
case $1 in
    --help) usage ;;
esac
[ $# -ge 2 ] || usage 1	# need multiple args
# Besides non-numeric specs, reject an empty spec, a leading or trailing
# dot and more than one dot: the job and task numbers taken from the spec
# are used later to select qstat lines and to name the spool directory.
case $1 in
    *[!0-9.]* | '' | .* | *. | *.*.*) usage 1 ;;	# & first numeric
esac

# Take the job spec off the argument list, leaving only the padb
# arguments, and split it at its last dot into job and task number.
# Without a dot the task defaults to 1; task_specified is set only when
# a task number was given explicitly.
job_spec=$1
shift
if [ "${job_spec#*.}" != "$job_spec" ]; then
    # explicit task number
    task=${job_spec##*.}
    jobnum=${job_spec%.*}
    task_specified=1
else
    jobnum=$job_spec
    task=1
fi

# Ensure the host queue isn't truncated.
SGE_LONG_QNAMES=255
export SGE_LONG_QNAMES

# select_task_lines JOBNUM [TASK]
# Filter `qstat -s r' output on stdin down to the lines whose first
# column (the job number) is exactly JOBNUM, so that job 12 doesn't pick
# up job 3123 or a date containing `12'.  With TASK, keep only lines
# whose tenth column equals TASK, or is empty when TASK is 1 (the latter
# presumably being a job that isn't an array job).
# The values are passed with -v rather than pasted into the awk program.
select_task_lines () {
    awk -v job="$1" -v task="$2" '
        $1 != job { next }
        task == "" || $10 == task || ($10 task) == "1"'
}

if [ "$task_specified" = 1 ]; then
    tasks=$(qstat -s r | select_task_lines "$jobnum" "$task")
    [ -n "$tasks" ] || error "Can't find running task $jobnum.$task"
else
    tasks=$(qstat -s r | select_task_lines "$jobnum")
    [ -n "$tasks" ] || error "Job no. $jobnum isn't running"
    [ "$(printf '%s\n' "$tasks" | grep -c '')" -eq 1 ] ||
	error "Job $jobnum has multiple tasks -- specify $jobnum.<n>"
    # The single running task needn't be number 1 (e.g. one task of an
    # array job), and the task number goes into the spool directory
    # path, so take it from the qstat line when there is one.
    task=$(printf '%s\n' "$tasks" | awk '{print ($10 == "" ? 1 : $10)}')
fi
# The eighth column is the queue instance (queue@host) of the task.
master=$(printf '%s\n' "$tasks" | awk '{print $8}')

# short_host QUEUE_INSTANCE
# Print the host part of a queue instance such as queue@node01.domain,
# cut at its first dot.  Prints nothing if there is no `@'.
short_host () {
    case $1 in
	*@*) short=${1##*@}
	     printf '%s\n' "${short%%.*}" ;;
    esac
}

# job_spooldir HOST JOBNUM TASK
# Print the job spool directory on the master host.  Adapt the path
# here if the spool area lives somewhere else.
job_spooldir () {
    printf '%s\n' "$SGE_ROOT/$SGE_CELL/spool/$1/active_jobs/$2.$3"
}

# We may get the full host name in the extracted queue, but we need the
# short form here for the spool directory.
host=$(short_host "$master")
[ -n "$host" ] || error "Can't extract the master host from queue \`$master'"

# The spool path depends on host, jobnum and task, so it can only be
# built now that they are set.  The `spooldir=' assignment in the
# customization section is expanded before any of them has a value.
spooldir=$(job_spooldir "$host" "$jobnum" "$task")

# Stuff from the spool directory.
env="$spooldir/environment"
[ -f "$env" ] || error "Can't find spool file $env"
grep -q ^PE_HOSTFILE "$env" || error "Not a parallel job: $jobnum.$task"
# The pid is pasted into the command run on the master host, so insist
# on a plain number; an empty value would make pstree there list every
# process instead of just the job's.
shepherd_pid=
read -r shepherd_pid < "$spooldir/pid"
case $shepherd_pid in
    '' | *[!0-9]*) error "Can't get the shepherd pid from $spooldir/pid" ;;
esac

# Finally remote shell out padb after fiddling with the environment to
# propagate PATH and LD_LIBRARY_PATH, which are probably enough to avoid
# having to load a module, for instance, to run MPI stuff.
# We also need the job's TMPDIR into which MPI information will have been
# written, at least for the ORTE case (otherwise ompi-ps fails,
# expecting to find the sessions directly under /tmp).  The directory
# structure for SGE task 123.1 in queue parallel might look like:
#   /tmp/123.1.parallel/openmpi-sessions-fred@node01_0/456
#        ^ SGE job      ^ ORTE `jobs' in padb terms
# Initially, use pstree to find the pid of the `mpirun' command in the
# process tree under the shepherd; the output typically looks like:
#   sge_shepherd(12539)───29960(12540)───mpirun(12552)─┬─qrsh(12556)─┬─{qrsh}(12557)
#    ...
# With the mpirun pid, we can extract the relevant components from the
# null-separated pairs in the /proc filesystem.  Eventually we can
# invoke padb in the right environment.  Note that env is necessary.

# shell_quote STRING
# Print STRING wrapped in single quotes, with any embedded single quote
# written as '\'', so that a shell reads it back as one literal word.
shell_quote () {
    printf "'%s'" "$(printf '%s\n' "$1" | sed "s/'/'\\\\''/g")"
}

# The padb arguments become part of a command line that the shell on
# the remote side parses again, so quote each of them; otherwise an
# argument containing spaces or shell metacharacters arrives split up
# or gets interpreted over there.
padb_args=
for arg in "$@"; do
    padb_args="$padb_args $(shell_quote "$arg")"
done

# remote_shell is left unquoted on purpose so that it may carry options.
# Everything escaped with a backslash below is evaluated on the remote
# host; $shepherd_pid, $remote_preamble and $padb_args are filled in here.
# shellcheck disable=SC2086
exec $remote_shell "$host" "\
mpirun_pid=\$(expr \"\$(pstree -p $shepherd_pid)\" : '.*mpirun(\\([0-9]*\\))')
vars=\$(tr '\\000' '\\n' </proc/\$mpirun_pid/environ |
        grep -E '^LD_LIBRARY_PATH=|^PATH=|^TMPDIR=' |
        tr '\\n' ' ')
$remote_preamble env \$vars padb -a$padb_args"
