#!/bin/sh -f
#
#
#___INFO__MARK_BEGIN__
##########################################################################
#
#  The Contents of this file are made available subject to the terms of
#  the Sun Industry Standards Source License Version 1.2
#
#  Sun Microsystems Inc., March, 2001
#
#
#  Sun Industry Standards Source License Version 1.2
#  =================================================
#  The contents of this file are subject to the Sun Industry Standards
#  Source License Version 1.2 (the "License"); You may not use this file
#  except in compliance with the License. You may obtain a copy of the
#  License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html
#
#  Software provided under this License is provided on an "AS IS" basis,
#  WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
#  WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
#  MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
#  See the License for the specific provisions governing your rights and
#  obligations concerning the Software.
#
#  The Initial Developer of the Original Code is: Sun Microsystems, Inc.
#
#  Copyright: 2001 by Sun Microsystems, Inc.
#
#  All Rights Reserved.
#
##########################################################################
#___INFO__MARK_END__

# startup of MPICH2 conforming with Grid Engine
# parallel environment interface
#
# usage: startmpich2.sh [options] <pe_hostfile> <mpich2_root>
#
#        options are:
#                     -catch_hostname 
#                      force use of hostname wrapper in $TMPDIR when starting mpirun   
#
#                     -catch_rsh
#                      force use of rsh wrapper in $TMPDIR when starting mpirun 
#
#                      This enables a tight integration of the MPICH2
#                      jobs.

# PeHostfile2MachineFile <pe_hostfile>
#
# Convert a Grid Engine PE hostfile into a machine file written to stdout.
# Every input line is read as "<host> <nslots> [further columns ...]"; the
# short host name (the part before the first ".") is printed once per slot.
#
# Columns may be separated by any run of blanks or tabs. Lines whose second
# column is not a plain non-negative integer (including empty lines) are
# skipped. A last line without a trailing newline is still processed.
#
# Arguments: $1 - path of the PE hostfile
# Outputs:   one short host name per slot on stdout
# Returns:   non-zero if the hostfile cannot be opened
PeHostfile2MachineFile()
{
   # "rest" only absorbs the remaining columns so they do not end up in nslots
   while read -r host nslots rest || [ -n "$host" ]; do
      # ignore lines without a usable slot count
      case "$nslots" in
         ''|*[!0-9]*) continue ;;
      esac

      # strip the domain part, keep the short host name
      host=${host%%.*}

      i=1
      while [ "$i" -le "$nslots" ]; do
         # add here code to map regular hostnames into ATM hostnames
         echo "$host"
         i=$(($i + 1))
      done
   done < "$1"
}

# print the raw argument list so the invocation can be traced in the output
echo $*

# polling parameters used by the smpd availability check further down:
# number of attempts and the pause (seconds) after an unsuccessful attempt
RETRIES=10
SLEEPTIME=5

# consume the leading option flags; the first word that is not a known
# option ends the scan and is left in place as first positional argument
catch_hostname=0
catch_rsh=0
until [ $# -eq 0 ]; do
   if [ "$1" = "-catch_rsh" ]; then
      catch_rsh=1
   elif [ "$1" = "-catch_hostname" ]; then
      catch_hostname=1
   else
      break
   fi
   shift
done

# name of this script, used as prefix of every message we print
me=`basename "$0"`

# test number of args
if [ $# -lt 2 ]; then
   echo "$me: got wrong number of arguments" >&2
   exit 1
fi

# get arguments
pe_hostfile=$1
MPICH2_ROOT=$2
export MPICH2_ROOT

# All paths in the checks below are quoted: an unquoted empty value would
# collapse e.g. "[ ! -r ]" into a one-argument test that is always false,
# letting an empty argument pass the check unnoticed.

# ensure we are able to exec our starter
if [ ! -x "$SGE_ROOT/mpich2_smpd/bin/$ARC/start_mpich2" ]; then
   echo "$me: can't execute $SGE_ROOT/mpich2_smpd/bin/$ARC/start_mpich2" >&2
   exit 1
fi

# ensure we are able to start smpd
if [ ! -x "$MPICH2_ROOT/bin/smpd" ]; then
   echo "$me: can't execute $MPICH2_ROOT/bin/smpd" >&2
   exit 1
fi

# ensure pe_hostfile is readable
if [ ! -r "$pe_hostfile" ]; then
   echo "$me: can't read $pe_hostfile" >&2
   exit 1
fi

# create machine-file
# remove column with number of slots per queue
# mpi does not support them in this form
machines="$TMPDIR/machines"

# quoted so that a hostfile path or $TMPDIR containing whitespace neither
# splits the argument nor makes the redirection target ambiguous
PeHostfile2MachineFile "$pe_hostfile" >> "$machines"

# trace machines file
cat "$machines"

#
# Make script wrapper for 'rsh' available in jobs tmp dir
# (only when -catch_rsh was given on the command line)
#
if [ "$catch_rsh" = 1 ]; then
   rsh_wrapper="$SGE_ROOT/mpich2_smpd/rsh"
   if [ ! -x "$rsh_wrapper" ]; then
      echo "$me: can't execute $rsh_wrapper" >&2
      echo "     maybe it resides at a file system not available at this machine" >&2
      exit 1
   fi

   # the link is named after the remote shell command of the architecture:
   # "remsh" on the listed HP architectures, "rsh" everywhere else
   rshcmd=rsh
   case "$ARC" in
      hp|hp10|hp11|hp11-64) rshcmd=remsh ;;
      *) ;;
   esac
   # note: This could also be done using rcp, ftp or s.th.
   #       else. We use a symbolic link since it is the
   #       cheapest in case of a shared filesystem
   #
   ln -s "$rsh_wrapper" "$TMPDIR/$rshcmd"

   # drop the shell's remembered command locations; presumably so that a
   # later lookup of $rshcmd can find the new link (requires $TMPDIR to be
   # on PATH, which is not set up here)
   hash -r
fi

#
# Make script wrapper for 'hostname' available in jobs tmp dir
# (only when -catch_hostname was given on the command line)
#
if [ "$catch_hostname" = 1 ]; then
   hostname_wrapper="$SGE_ROOT/mpich2_smpd/hostname"
   if [ ! -x "$hostname_wrapper" ]; then
      echo "$me: can't execute $hostname_wrapper" >&2
      echo "     maybe it resides at a file system not available at this machine" >&2
      exit 1
   fi

   # note: This could also be done using rcp, ftp or s.th.
   #       else. We use a symbolic link since it is the
   #       cheapest in case of a shared filesystem
   #
   # paths are quoted so that whitespace in $SGE_ROOT or $TMPDIR does not
   # split the ln arguments
   ln -s "$hostname_wrapper" "$TMPDIR/hostname"
fi

# Port for the smpd daemons of this job, derived from the job id and always
# in the range 20000..24999. Job ids that differ by a multiple of 5000 map
# to the same port. An unset JOB_ID is treated as 0.
port=$((${JOB_ID:-0} % 5000 + 20000))

# Start one smpd per distinct host of the machine file. "sort -u" is used
# instead of "uniq" because uniq only collapses adjacent duplicates, so a
# host listed on non-adjacent lines would be started twice.
for host in `sort -u "$machines"`; do
   "$SGE_ROOT/mpich2_smpd/bin/$ARC/start_mpich2" -n "$host" "$MPICH2_ROOT/bin/smpd" "$port"
done

# Poll until an smpd answers on every host of the machine file.
#
# Up to $RETRIES rounds are made, with a pause of $SLEEPTIME seconds after
# each unsuccessful round. A host counts as up when the first word of the
# first line printed by "smpd -status" is "smpd". The script exits 0 as soon
# as the number of responding hosts equals $NHOSTS, and exits 1 when all
# rounds are used up.
#
# Only POSIX constructs are used here ("=" in test, "$(( ))" for counting):
# this script runs under /bin/sh, where "(( ... ))" is not an arithmetic
# command and would leave the counters unchanged.
actual_retry=1
while [ "$actual_retry" -le "$RETRIES" ] ; do
   echo "$me: check for smpd daemons ($actual_retry of $RETRIES)"
   numhosts=0

   # sort -u: every host is probed and counted once, even if it appears on
   # non-adjacent lines of the machine file
   for host in `sort -u "$machines"`; do
      first_word=`"$MPICH2_ROOT/bin/smpd" -port "$port" -status "$host" | head -n 1 | cut -f 1 -d ' '`
      if [ "$first_word" = "smpd" ] ; then
         numhosts=$(($numhosts + 1))
         echo "$me: found running smpd on $host"
      else
         echo "$me: missing smpd on $host"
      fi
   done

   if [ "$NHOSTS" -eq "$numhosts" ] ; then
      echo "$me: got all $numhosts of $NHOSTS nodes"
      exit 0
   else
      sleep "$SLEEPTIME"
   fi

   actual_retry=$(($actual_retry + 1))
done

# signal failure to caller
echo "$me: got only $numhosts of $NHOSTS nodes, aborting"
exit 1
