MPI-Start as EMI-ES ParallelEnvironment backend


1. CREAM

CREAM's support for the ParallelEnvironment uses mpi-start out of the box, so no extra configuration is needed. The command line that CREAM generates by default looks like this:

/usr/bin/mpi-start -t <ParallelEnvironmentType> -npnode <ProcessesPerHost> -d THREADS_PER_PROCESS=<ThreadsPerProcess> <UserExecutable> <UserExecutableArgs>


2. UNICORE

The UNICORE EMI-ES ParallelEnvironment is also a complete implementation of the specification.

Configuration is done via XML files that specify the mapping from the job specification to the final executable. For each MPI flavor that mpi-start needs to support, an entry must be included in the IDB. For example, for OpenMPI:

   1 <ee:ExecutionEnvironment xmlns:ee="http://www.unicore.eu/unicore/jsdl-extensions">
   2     <ee:Name>OpenMPI</ee:Name>
   3     <ee:Version>1.5.4</ee:Version>
   4     <ee:Description>Runs OpenMPI job using mpi-start to start the job</ee:Description>
   5     <ee:ExecutableName>/usr/bin/mpi-start</ee:ExecutableName>
   6     <ee:CommandlineTemplate>#EXECUTABLE -t openmpi #ARGS #USERCOMMAND #USERARGS</ee:CommandlineTemplate>
   7     <ee:Argument>
   8       <ee:Name>Output</ee:Name>
   9       <ee:IncarnatedValue>-o </ee:IncarnatedValue>
  10       <ee:ArgumentMetadata>
  11         <ee:Type>string</ee:Type>
  12         <ee:Description>Write the job output to a FILE instead of to the standard output stream</ee:Description>
  13       </ee:ArgumentMetadata>
  14     </ee:Argument>
  15     <ee:Argument>
  16       <ee:Name>PostHook</ee:Name>
  17       <ee:IncarnatedValue>-post </ee:IncarnatedValue>
  18       <ee:ArgumentMetadata>
  19         <ee:Type>string</ee:Type>
  20         <ee:Description>Use the file as post hook</ee:Description>
  21       </ee:ArgumentMetadata>
  22     </ee:Argument>
  23     <ee:Argument>
  24       <ee:Name>PreHook</ee:Name>
  25       <ee:IncarnatedValue>-pre </ee:IncarnatedValue>
  26       <ee:ArgumentMetadata>
  27         <ee:Type>string</ee:Type>
  28         <ee:Description>Use the file as pre hook</ee:Description>
  29       </ee:ArgumentMetadata>
  30     </ee:Argument>
  31     <ee:Argument>
  32       <ee:Name>Error</ee:Name>
  33       <ee:IncarnatedValue>-e </ee:IncarnatedValue>
  34       <ee:ArgumentMetadata>
  35         <ee:Type>string</ee:Type>
  36         <ee:Description>Write the job error to a FILE instead of to the standard error stream</ee:Description>
  37       </ee:ArgumentMetadata>
  38     </ee:Argument>
  39     <ee:Option> 
  40       <ee:Name>Verbose</ee:Name> 
  41       <ee:IncarnatedValue>-v</ee:IncarnatedValue> 
  42       <ee:OptionMetadata> 
  43         <ee:Description>Enable verbose mode</ee:Description> 
  44       </ee:OptionMetadata> 
  45     </ee:Option> 
  46     <ee:Option> 
  47       <ee:Name>PerNode</ee:Name> 
  48       <ee:IncarnatedValue>-pnode</ee:IncarnatedValue> 
  49       <ee:OptionMetadata> 
  50         <ee:Description>Start one process per available node</ee:Description> 
  51       </ee:OptionMetadata> 
  52     </ee:Option> 
  53     <ee:Option> 
  54       <ee:Name>PerSocket</ee:Name> 
  55       <ee:IncarnatedValue>-psocket</ee:IncarnatedValue> 
  56       <ee:OptionMetadata> 
  57         <ee:Description>Start one process per available socket</ee:Description> 
  58       </ee:OptionMetadata> 
  59     </ee:Option> 
  60     <ee:Option> 
  61       <ee:Name>PerCore</ee:Name> 
  62       <ee:IncarnatedValue>-pcore</ee:IncarnatedValue> 
  63       <ee:OptionMetadata> 
  64         <ee:Description>Start one process per available core</ee:Description> 
  65       </ee:OptionMetadata> 
  66     </ee:Option> 
  67 </ee:ExecutionEnvironment>

3. ARC

ARC supports the ParallelEnvironment through RunTimeEnvironments (RTEs). You need to create an RTE for the ParallelEnvironment that uses mpi-start:

   1 #!/bin/sh
   2 # ARC RunTimeEnvironment script that launches the job through mpi-start.
   3 # It dispatches on its first argument:
   4 #   0      prefix joboption_args with an mpi-start command line built from
   5 #          the joboption_penv_* variables
   6 #   1, 2   nothing to do
   7 #   other  return 1
   8 case "$1" in
   9 0)
  10         # Lower-case the PE type (e.g. OpenMPI -> openmpi) for mpi-start's -t.
  11         # The character classes are quoted so the shell cannot expand them as
  12         # glob patterns against file names in the current directory.
  13         TYPE=`echo "$joboption_penv_type" | tr '[:upper:]' '[:lower:]'`
  14         OPTS="-t $TYPE"
  15         # Optional settings are only passed on when they are non-empty.
  16         if [ "x$joboption_penv_procperslot" != "x" ] ; then
  17                 OPTS="$OPTS -npnode $joboption_penv_procperslot"
  18         fi
  19         if [ "x$joboption_penv_threadsperslot" != "x" ] ; then
  20                 # NOTE(review): the CREAM command line passes this setting as
  21                 # THREADS_PER_PROCESS - confirm which name mpi-start expects.
  22                 OPTS="$OPTS -d THREADS_PER_CORE=$joboption_penv_threadsperslot"
  23         fi
  24         # "--" separates the mpi-start options from the user's command line.
  25         joboption_args="mpi-start $OPTS -- $joboption_args"
  26 ;;
  27 1)
  28 ;;
  29 2)
  30 ;;
  31 *)
  32         # "return" (not "exit"): presumably ARC sources this script - confirm.
  33         return 1
  34 ;;
  35 esac

In the user job, include both the RTE (e.g. MPISTART) and the PE in the job description (only the relevant elements are shown):

   1 <RuntimeEnvironment><Name>MPISTART</Name></RuntimeEnvironment>
   2 <ParallelEnvironment>
   3   <Type>OpenMPI</Type>
   4   <!-- Any other PE options -->
   5 </ParallelEnvironment>

4. Testing

Testing the use of PEs with mpi-start was done as part of mpi-start 1.5.0 testing. Check the integration tests for details.

5. Sample Job

This is a sample job that can be submitted to the EMI-ES endpoints:

   1 <ActivityDescription xmlns="http://www.eu-emi.eu/es/2010/12/adl">
   2     <ActivityIdentification>
   3         <Name>test job</Name>
   4         <Description>A test job showing the features of EMI-ES</Description>
   5         <Type>single</Type>
   6         <Annotation>test</Annotation>
   7     </ActivityIdentification>
   8     <Application>
   9         <Executable><Path>cpi.c</Path></Executable>
  10         <Error>std.err</Error>
  11         <Output>std.out</Output>
  12         <Environment><Name>I2G_MPI_PRE_RUN_HOOK</Name><Value>pre.sh</Value></Environment>
  13         <Environment><Name>I2G_MPI_START_VERBOSE</Name><Value>1</Value></Environment>
  14     </Application>
  15     <Resources>
  16         <SlotRequirement>
  17             <NumberOfSlots>2</NumberOfSlots>
  18         </SlotRequirement>
  19         <ParallelEnvironment>
  20                 <Type>OpenMPI</Type>
  21         </ParallelEnvironment>
  22     </Resources>
  23     <DataStaging>
  24         <InputFile> <Name>pre.sh</Name> <Source><URI>pre.sh</URI></Source> </InputFile>
  25         <InputFile> <Name>cpi.c</Name> <Source><URI>cpi.c</URI></Source> </InputFile>
  26     </DataStaging>
  27 </ActivityDescription>

The hook (pre.sh) is the following:

   1 #!/bin/sh
   2 # mpi-start pre-run hook: compiles the C source named by I2G_MPI_APPLICATION
   3 # and leaves I2G_MPI_APPLICATION naming the resulting binary.
   4 # Returns 0 on success, or the compiler's exit status on failure.
   5 # info_msg and error_msg are not defined here; presumably mpi-start provides
   6 # them to hooks - confirm.
   7 pre_run_hook () {
   8   # Compile the program.
   9   info_msg "Compiling ${I2G_MPI_APPLICATION}"
  10   # Drop the ".c" suffix so the variable names the binary, not the source.
  11   export I2G_MPI_APPLICATION=`echo "$I2G_MPI_APPLICATION" | sed -e "s/\.c$//"`
  12   # Actually compile the program.
  13   # cmd must be set before it is logged and run: with an empty cmd nothing is
  14   # compiled, $? is 0, and the hook would report success anyway.
  15   # NOTE(review): MPI_MPICC_OPTS is the compiler-flag variable used by the
  16   # mpi-start hook examples - confirm it is set by your mpi-start version.
  17   cmd="mpicc ${MPI_MPICC_OPTS} -o ${I2G_MPI_APPLICATION} ${I2G_MPI_APPLICATION}.c"
  18   info_msg "$cmd"
  19   # Left unquoted on purpose so the command is split into words and executed.
  20   $cmd
  21   st=$?
  22   if [ $st -ne 0 ]; then
  23     error_msg "Error compiling program.  Exiting..."
  24     return $st
  25   fi
  26   # Everything's OK.
  27   info_msg "Successfully compiled ${I2G_MPI_APPLICATION}"
  28   return 0
  29 }

And the C code (cpi.c):

   1 #include "mpi.h"
   2 #include <stdio.h>
   3 #include <math.h>
   4 /* Integrand: the integral of 4/(1+a^2) over [0,1] equals pi. */
   5 double f( double );
   6 double f( double a )
   7 {
   8     return (4.0 / (1.0 + a*a));
   9 }
  10 /*
  11  * Approximates pi with the midpoint rule on n_intervals sub-intervals of
  12  * [0,1]. Rank r sums intervals r+1, r+1+numprocs, ...; MPI_Reduce adds the
  13  * partial sums on rank 0, which prints the result, its error and the
  14  * wall-clock time. Always returns 0.
  15  */
  16 int main( int argc, char *argv[])
  17 {
  18     int n_intervals = 16384;
  19     /* n starts at 0 because it is printed before the loop first assigns it. */
  20     int done = 0, n = 0, myid, numprocs, i;
  21     double PI25DT = 3.141592653589793238462643;
  22     double mypi, pi, h, sum, x;
  23     double startwtime = 0.0, endwtime;
  24     int  namelen;
  25     char processor_name[MPI_MAX_PROCESSOR_NAME];
  26     MPI_Init(&argc,&argv);
  27     MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
  28     MPI_Comm_rank(MPI_COMM_WORLD,&myid);
  29     MPI_Get_processor_name(processor_name,&namelen);
  30     fprintf(stderr,"Process %d on %s: n=%d\n",myid, processor_name,n);
  31     /*
  32      * NOTE(review): numprocs is presumably never below 1, which would make
  33      * the else branch unreachable; its message suggests "> 1" may have been
  34      * intended. Behaviour is left unchanged - confirm the intent.
  35      */
  36     if (numprocs >= 1) {
  37         if( myid == 0 ) fprintf(stderr,"Using %d intervals\n",n_intervals);
  38         n = 0;
  39         while (!done)
  40         {
  41             if (myid == 0) {
  42                 startwtime = MPI_Wtime();
  43             }
  44             /* First pass picks n_intervals; the next pass picks 0, which ends the loop. */
  45             if( n == 0  ) n = n_intervals; else n = 0;
  46             MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
  47             if (n == 0) {
  48                 done = 1;
  49             } else {
  50                 h   = 1.0 / (double) n;
  51                 sum = 0.0;
  52                 /* Interleave the intervals across ranks; x is the interval midpoint. */
  53                 for (i = myid + 1; i <= n; i += numprocs)
  54                 {
  55                     x = h * ((double)i - 0.5);
  56                     sum += f(x);
  57                 }
  58                 mypi = h * sum;
  59                 MPI_Reduce(&mypi, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  60                 /* pi is only read on rank 0, the root of the reduction. */
  61                 if (myid == 0)
  62                 {
  63                     printf("pi is approximately %.16f, Error is %.16f\n",
  64                            pi, fabs(pi - PI25DT));
  65                     endwtime = MPI_Wtime();
  66                     printf("wall clock time = %f\n",
  67                            endwtime-startwtime);
  68                 }
  69             }
  70         }
  71     } else {
  72         fprintf(stderr,"Only 1 process, not doing anything");
  73     }
  74     MPI_Finalize();
  75     return 0;
  76 }