#!/bin/sh
# -*-coding: utf-8;-*-
# Copyright (C) 2010 Dave Love
# License: GNU GPL v. 3
# or any later version.

# From the cluster head node, run padb against a tightly-integrated
# MPI job running under SGE.  This involves identifying the
# master node and then remote shelling out to that with at least
# TMPDIR set to what SGE provides.

# Assumptions:
# * rmgr is set in padb.conf to disambiguate the possibilities or
#   passed as an arg to this script;
# * mpirun (specifically that name for orterun) is being run by the
#   job on the master node, and only one instance of it;
# * pstree is available.

# Fixme:
# * Assuming each SGE job only has a single MPI `job' associated
#   with it (check with padb --list-rmgrs?), fill in the MPI jobid info,
#   as appropriate, in the args passed to padb:  Add a jobid if there
#   isn't one, and -a or -A isn't given, and add it to any bare
#   --full-report arg;
# * Use ps instead of pstree and grovelling /proc?  Maybe use the cpuset
#   if we have it;
# * Extract the spool directory from the SGE config?
# * Allow orterun aliases other than mpirun.

# Customization:

# Change this, e.g. to `qrsh', if you don't have ssh to the compute nodes.
remote_shell=ssh
#remote_shell=/opt/sge/bin/lx-amd64/qrsh

# This is a hook for doing anything necessary for the command to run
# the MPI stuff on the remote host.  It needs a trailing newline or
# `;' unless it's setting up the environment, as in this example.
# Note that we deal with setting PATH and LD_LIBRARY_PATH anyway.
# The OMPI... here undoes the normal restriction on our system of
# running open-mpi only under the resource manager.
remote_preamble="OMPI_MCA_orte_allocation_required=0"

# end likely customization

# Print the arguments on stderr and abandon ship.
error () {
  echo "$@" 1>&2
  exit 1
}

# Print the usage message and exit with status $1 (0 if no arg).
# An explicit help request goes to stdout; a usage error goes to stderr.
usage () {
  msg="Usage: $(basename "$0") <job-number>[.<task>] <padb-args>...

Run padb(1) with given args against the parallel job with the given
job/task number.  E.g.
  $(basename "$0") 123.1 --proc-summary
The -a option is supplied to padb on the assumption that there is only
one MPI session associated with the job."
  if [ -z "$1" ] || [ "$1" = 0 ]; then
    echo "$msg"                 # help requested: normal output
  else
    echo "$msg" 1>&2            # usage error: diagnostics belong on stderr
  fi
  exit ${1:-0}
}

# Argument sanity: a numeric job[.task] followed by at least one padb arg.
if [ "$1" = --help ]; then
  usage
elif [ $# -lt 2 ]; then
  usage 1                       # need multiple args
elif expr "$1" : '.*[^.0-9]' >/dev/null; then
  usage 1                       # & first numeric
fi

jobnum=$1
shift
case $jobnum in
  *.*)                          # explicit task number given
    task=$(expr "$jobnum" : '.*\.\(.*\)$')
    jobnum=$(expr "$jobnum" : '\(.*\)\.')
    task_specified=1 ;;
  *)
    task=1 ;;
esac

# Ensure the host queue isn't truncated in qstat output.
SGE_LONG_QNAMES=255
export SGE_LONG_QNAMES

# Locate the master queue instance (queue@host) of the running job.
if [ "$task_specified" = 1 ]; then
  # qstat field 10 is the task id; it is empty for non-array jobs, in
  # which case the concatenation `$10 task' equals "1" iff task is 1.
  master=$(qstat -s r |
    awk '/'$jobnum'/ {if ($10=='$task'||($10 "'$task'"=="1")) print $8}')
  [ -z "$master" ] && error "Can't find running task $jobnum.$task"
else
  tasks=$(qstat -s r | grep "^ *$jobnum ") ||
    error "Job no. $jobnum isn't running"
  if [ "$(echo "$tasks" | wc -l)" -eq 1 ]; then
    master=$(echo "$tasks" | awk '{print $8}')
  else
    error "Job $jobnum has multiple tasks -- specify $jobnum.<task>."
  fi
fi

# We may get the full host name in the extracted queue, but we need the
# short form here for the spool directory.
host=$(expr "$master" : '.*@\([^.]*\)')

# Job spool directory on the master host.  Assigned here -- not up in
# the customization section -- because the expansion happens at
# assignment time and needs $host, $jobnum and $task to be set.
spooldir="$SGE_ROOT/$SGE_CELL/spool/$host/active_jobs/$jobnum.$task"

# Stuff from the spool directory.
env="$spooldir/environment"
[ -f "$env" ] || error "Can't find spool file $env"
grep -q ^PE_HOSTFILE "$env" || error "Not a parallel job: $jobnum.$task"
read shepherd_pid < "$spooldir/pid"
The directory # structure for SGE task 123.1 in queue parallel might look like: # /tmp/123.1.parallel/openmpi-sessions-fred@node01_0/456 # ^ SGE job ^ ORTE `jobs' in padb terms # Initially, use pstree to find the pid of the `mpirun' command in the # process tree under the shepherd; the output typically looks like: # sge_shepherd(12539)───29960(12540)───mpirun(12552)─┬─qrsh(12556)─┬─{qrsh}(12557) # ... # With the mpirun pid, we can extract the relevant components from the # null-separated pairs in the /proc filesystem. Eventually we can # invoke padb in the right environment. Note that env is necessary. exec $remote_shell $host "\ mpirun_pid=\$(expr \"\$(pstree -p $shepherd_pid)\" : '.*mpirun(\\([0-9]*\\))') vars=\$(tr '\\000' '\\n'