Skip to content

Commit

Permalink
Merge branch 'slurm-2.5'
Browse files Browse the repository at this point in the history
  • Loading branch information
jette committed Jan 11, 2013
2 parents dd14254 + d534d48 commit 1d49940
Show file tree
Hide file tree
Showing 12 changed files with 68 additions and 31 deletions.
1 change: 1 addition & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ documents those changes that are of interest to users and admins.
-- Fix logic to optimize GRES topology with respect to allocated CPUs.
-- Add job_submit/all_partitions plugin to set a job's default partition
to ALL available partitions in the cluster.
-- Modify switch/nrt logic to permit build without libnrt.so library.

* Changes in SLURM 2.5.1
========================
Expand Down
7 changes: 4 additions & 3 deletions auxdir/x_ac_nrt.m4
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ AC_DEFUN([X_AC_NRT],
if test -f "$nrt_dir/nrt.h" -a -f "$nrt_dir/permapi.h"; then
ac_have_nrt_h="yes"
NRT_CPPFLAGS="-I$nrt_dir"
break;
AC_DEFINE(HAVE_NRT_H, 1, [define if you have nrt.h])
AC_DEFINE(HAVE_PERMAPI_H, 1, [define if you have permapi_h])
break;
fi
done
if test "x$ac_have_nrt_h" != "xyes" ; then
Expand Down Expand Up @@ -58,13 +60,12 @@ AC_DEFUN([X_AC_NRT],
if test "x$ac_have_libnrt" != "xyes" ; then
AC_MSG_RESULT([no])
AC_MSG_NOTICE([Cannot support IBM NRT API without libnrt.so])
else
AC_MSG_RESULT([yes])
AC_DEFINE(HAVE_LIBNRT, 1, [define if you have libnrt.so])
fi
if test "x$ac_have_nrt_h" = "xyes" && test "x$ac_have_libnrt" = "xyes"; then
if test "x$ac_have_nrt_h" = "xyes"; then
ac_have_nrt="yes"
fi
AM_CONDITIONAL(HAVE_NRT, test "x$ac_have_nrt" = "xyes")
Expand Down
6 changes: 6 additions & 0 deletions config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,9 @@
/* Define to 1 if you have the <netdb.h> header file. */
#undef HAVE_NETDB_H

/* define if you have nrt.h */
#undef HAVE_NRT_H

/* define if numa library installed */
#undef HAVE_NUMA

Expand All @@ -197,6 +200,9 @@
/* Define to 1 if you have the <paths.h> header file. */
#undef HAVE_PATHS_H

/* define if you have permapi.h */
#undef HAVE_PERMAPI_H

/* Define to 1 if using PostgreSQL libaries */
#undef HAVE_PGSQL

Expand Down
12 changes: 8 additions & 4 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -21755,7 +21755,13 @@ $as_echo_n "checking Checking NRT and PERMAPI header files... " >&6; }
if test -f "$nrt_dir/nrt.h" -a -f "$nrt_dir/permapi.h"; then
ac_have_nrt_h="yes"
NRT_CPPFLAGS="-I$nrt_dir"
break;

$as_echo "#define HAVE_NRT_H 1" >>confdefs.h


$as_echo "#define HAVE_PERMAPI_H 1" >>confdefs.h

break;
fi
done
if test "x$ac_have_nrt_h" != "xyes" ; then
Expand Down Expand Up @@ -21796,8 +21802,6 @@ $as_echo_n "checking whether to enable IBM NRT support... " >&6; }
if test "x$ac_have_libnrt" != "xyes" ; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
{ $as_echo "$as_me:${as_lineno-$LINENO}: Cannot support IBM NRT API without libnrt.so" >&5
$as_echo "$as_me: Cannot support IBM NRT API without libnrt.so" >&6;}
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }
Expand All @@ -21806,7 +21810,7 @@ $as_echo "#define HAVE_LIBNRT 1" >>confdefs.h

fi

if test "x$ac_have_nrt_h" = "xyes" && test "x$ac_have_libnrt" = "xyes"; then
if test "x$ac_have_nrt_h" = "xyes"; then
ac_have_nrt="yes"
fi
if test "x$ac_have_nrt" = "xyes"; then
Expand Down
27 changes: 16 additions & 11 deletions doc/html/ibm-pe.shtml
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,8 @@ Allocate one switch window per task on each node and every network supporting
MPI and a second window supporting PAMI.
<br><br>
<b>--network=devtype=ib,instances=2,lapi,mpi</b><br>
On every Infiniband network connection, allocate two switch windows each for
both lapi and mpi interfaces. If each node has one Infinband network connection,
On every InfiniBand network connection, allocate two switch windows each for
both lapi and mpi interfaces. If each node has one InfiniBand network connection,
this would result in four switch windows per task.
</p>

Expand Down Expand Up @@ -249,7 +249,7 @@ $ ls -l ~/.slurm/slurm_cmdfile.*
$ rm ~/.slurm/slurm_cmdfile.*
</pre>

<p>The -L/--label option differs slighly in that when the output from multiple
<p>The -L/--label option differs slightly in that when the output from multiple
tasks are identical, they are combined on a single line with the prefix
identifying which task(s) generated the output. In addition, there is a colon
but no space between the task IDs and output. For example:</p>
Expand All @@ -270,7 +270,7 @@ but no space between the task IDs and output. For example:</p>

<p>In addition, when srun's --multi-prog option (for Multiple Program,
Multiple Data configurations) is used with the -L/--label option then a job
step ID, colon and space will preceed the task ID and colon. For example:</p>
step ID, colon and space will precede the task ID and colon. For example:</p>
<pre>
# SLURM OUTPUT
0: zero
Expand All @@ -284,7 +284,7 @@ step ID, colon and space will preceed the task ID and colon. For example:</p>
</pre>

<p>The srun command is not able to report task status upon receipt of a SIGINT
signal (ctrl-c interupt from keyboard), however two SIGINT signals within a
signal (ctrl-c interrupt from keyboard), however two SIGINT signals within a
one second interval will terminate the job as on other SLURM configurations.</p>

<h3>Environment Variables</h3>
Expand Down Expand Up @@ -381,6 +381,11 @@ ProctrackType=proctrack/cgroup
<p>In order for these plugins to be built, the locations of the POE Resource
Manager header file (permapi.h), the NRT header file (nrt.h) and the NRT library
(libnrt.so) must be identified at the time SLURM is built.
Slurm may be built and used on nodes without the NRT library (libnrt.so)
installed, but both the permapi.h and nrt.h header files must be available
when Slurm is built. Note that a switch/nrt plugin built on such a node cannot
be used on a compute node. While Slurm is being built, the permapi.h and nrt.h
header files may be installed in a temporary location, such as /tmp.
SLURM searches for the header files in the /usr/include directory by default.
If the files are not installed there, you can specify a different location using
the <b>--with-nrth=PATH</b> option to the configure program, where "PATH" is
Expand Down Expand Up @@ -438,12 +443,12 @@ export MP_TIMEOUT=600
Environment. Jobs can be explicitly preempted and later resumed using the
<b>scontrol suspend &lt;jobid&gt;</b> and <b>scontrol resume &lt;jobid&gt;</b>
commands. This functionality relies upon NRT functions to suspend/resume
programs and reset MPI timeouts. Note that SLURM suports the preemption only
programs and reset MPI timeouts. Note that SLURM supports the preemption only
of whole jobs rather than individual job steps. A suspended job will relinquish
CPU resources, but retain memory and switch window resources. Note that the
long term suspension of jobs with any allocated Collective Acceleration
Units (CAU) is disabled and an error message to that effect will be generated
in response to such a request. In addition, verion 1200 or higher of IBM's NRT
in response to such a request. In addition, version 1200 or higher of IBM's NRT
API is required to support this functionality.</p>

<h3>Design Notes</h3>
Expand All @@ -460,7 +465,7 @@ node.</p>
<p>It is possible to configure SLURM and LoadLeveler to simultaneously exist
on a cluster, however each scheduler must be configured to manage different
compute nodes (e.g. LoadLeveler can manage compute nodes "tux[1-8]" and SLURM
can manaage compute nodes "tux[9-16]" on the same cluster). In addition, the
can manage compute nodes "tux[9-16]" on the same cluster). In addition, the
/etc/poe.limits file on each node must identify the MP_PE_RMLIB appropriate
for that node (e.g. IBM's or SLURM's libpermapi.so)</p>

Expand All @@ -469,7 +474,7 @@ Then poe uses the <b>launch/slurm</b> plugin to launch the "pmd" process on the
compute nodes, so two launch plugins are actually used.</p>

<p>Depending upon job size and network options, allocating and deallocating
switch resources can take multple seconds per node and the process of launching
switch resources can take multiple seconds per node and the process of launching
applications on multiple nodes is not well parallelized.
This is outside of SLURM's control.</p>

Expand All @@ -478,7 +483,7 @@ This is outside of SLURM's control.</p>
<p>It is possible to generate detailed logging of all switch/nrt actions and
data by configuring <b>DebugFlags=switch</b>.</p>

<p>The envirnoment variable <b>MP_INFOLEVEL</b> can be used to enable the
<p>The environment variable <b>MP_INFOLEVEL</b> can be used to enable the
logging of POE debug messages. To enable fairly detailed logging, set
<b>MP_INFOLEVEL=6</b>.</p>

Expand All @@ -494,6 +499,6 @@ startsrc -s pnsd -a -D

<p class="footer"><a href="#top">top</a></p>

<p style="text-align:center;">Last modified 21 November 2012</p></td>
<p style="text-align:center;">Last modified 10 January 2013</p></td>

<!--#include virtual="footer.txt"-->
10 changes: 6 additions & 4 deletions src/plugins/job_submit/all_partitions/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -130,12 +130,14 @@ am__uninstall_files_from_dir = { \
am__installdirs = "$(DESTDIR)$(pkglibdir)"
LTLIBRARIES = $(pkglib_LTLIBRARIES)
job_submit_all_partitions_la_LIBADD =
am_job_submit_all_partitions_la_OBJECTS = job_submit_all_partitions.lo
am_job_submit_all_partitions_la_OBJECTS = \
job_submit_all_partitions.lo
job_submit_all_partitions_la_OBJECTS = \
$(am_job_submit_all_partitions_la_OBJECTS)
job_submit_all_partitions_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
$(job_submit_all_partitions_la_LDFLAGS) $(LDFLAGS) -o $@
job_submit_all_partitions_la_LINK = $(LIBTOOL) --tag=CC \
$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \
$(AM_CFLAGS) $(CFLAGS) $(job_submit_all_partitions_la_LDFLAGS) \
$(LDFLAGS) -o $@
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) -I$(top_builddir)/slurm
depcomp = $(SHELL) $(top_srcdir)/auxdir/depcomp
am__depfiles_maybe = depfiles
Expand Down
12 changes: 11 additions & 1 deletion src/plugins/switch/nrt/libpermapi/shr_64.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/

#include <permapi.h>
#include <ctype.h>
#include <dlfcn.h>
#include <fcntl.h>
Expand All @@ -47,6 +46,17 @@
# include "config.h"
#endif

#if HAVE_NRT_H
# include <nrt.h>
#else
# error "Must have nrt.h to compile this module!"
#endif
#if HAVE_PERMAPI_H
# include <permapi.h>
#else
# error "Must have permapi.h to compile this module!"
#endif

#include "src/common/slurm_xlator.h"
#include "slurm/slurm.h"
#include "slurm/slurm_errno.h"
Expand Down
12 changes: 10 additions & 2 deletions src/plugins/switch/nrt/nrt.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,10 @@
# include "config.h"
#endif

#if HAVE_LIBNRT
#if HAVE_NRT_H
# include <nrt.h>
#else
# error "Must have libnrt to compile this module!"
# error "Must have nrt.h to compile this module!"
#endif

#include <arpa/inet.h>
Expand All @@ -76,6 +76,14 @@
#include "src/plugins/switch/nrt/nrt_keys.h"
#include "src/plugins/switch/nrt/slurm_nrt.h"

/* If the head node has nrt.h, but no libnrt.so, we need to build the
* switch/nrt plugin in order to manage the nrt data structures, but
* will not make use of the nrt_command function. */
#if !HAVE_LIBNRT
/*
 * Stand-in definition of nrt_command(), compiled only when HAVE_LIBNRT is
 * not set so that the plugin still has a definition of this symbol.
 * All three arguments are ignored; every call is reported through fatal().
 */
int nrt_command(int version, nrt_cmd_type_t cmd_type, void *cmd)
{
	fatal("nrt_command not supported without libnrt");
	/* Satisfies the int return type; presumably never reached if
	 * fatal() does not return -- TODO confirm. */
	return 0;
}
#endif

extern int drain_nodes ( char *nodes, char *reason, uint32_t reason_uid );

/*
Expand Down
4 changes: 2 additions & 2 deletions src/plugins/switch/nrt/nrt_keys.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@
#ifndef _NRT_KEYS_INCLUDED
#define _NRT_KEYS_INCLUDED

#if HAVE_LIBNRT
#if HAVE_NRT_H
# include <nrt.h>
#else
# error "Must have libnrt to compile this module!"
# error "Must have nrt.h to compile this module!"
#endif

enum {
Expand Down
4 changes: 2 additions & 2 deletions src/plugins/switch/nrt/slurm_nrt.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,10 @@
#ifndef _SLURM_NRT_INCLUDED
#define _SLURM_NRT_INCLUDED

#if HAVE_LIBNRT
#if HAVE_NRT_H
# include <nrt.h>
#else
# error "Must have libnrt to compile this module!"
# error "Must have nrt.h to compile this module!"
#endif

/* opaque data structures - no peeking! */
Expand Down
2 changes: 1 addition & 1 deletion testsuite/expect/test6.13
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ if {[wait_for_job $job_id RUNNING] != 0} {
}
# Allow time for the step to start
# This could take a while with launch/poe
sleep 10
sleep 20

#
# Test verbose scancel
Expand Down
2 changes: 1 addition & 1 deletion testsuite/expect/test6.7
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ if {[wait_for_job $job_id RUNNING] != 0} {
}
# Allow time for the step to start
# This could take a while with launch/poe
sleep 10
sleep 20

#
# Test verbose scancel
Expand Down

0 comments on commit 1d49940

Please sign in to comment.