Difference between revisions of "Multiple Program Runs in one Slurm Job"
m |
|||
Line 17: | Line 17: | ||
#SBATCH --job-name=run2x24 | #SBATCH --job-name=run2x24 | ||
− | |||
− | |||
#SBATCH --output=%j.log | #SBATCH --output=%j.log | ||
#SBATCH --error=%j.err | #SBATCH --error=%j.err | ||
− | |||
− | |||
#SBATCH --time=00-01:00:00 | #SBATCH --time=00-01:00:00 | ||
− | |||
#SBATCH --mem=180G | #SBATCH --mem=180G | ||
− | + | ### exclusive usage of a single node | |
− | ### | + | #SBATCH --exclusive |
+ | ### use all cores, one thread per core | ||
#SBATCH --ntasks=1 --nodes=1 | #SBATCH --ntasks=1 --nodes=1 | ||
#SBATCH --cpus-per-task=48 | #SBATCH --cpus-per-task=48 | ||
#SBATCH --threads-per-core=1 | #SBATCH --threads-per-core=1 | ||
− | ### | + | ### prepare your environment for running gaussian |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
module load CHEMISTRY gaussian | module load CHEMISTRY gaussian | ||
− | ### the gaussian module allocates the scratch directory | + | ### make sure this environment variable points to a suitable location |
+ | ### here the gaussian module allocates the scratch directory | ||
echo $GAUSS_SCRDIR | echo $GAUSS_SCRDIR | ||
### adjust working directory and input file names and output directory names | ### adjust working directory and input file names and output directory names | ||
− | export WDIR= | + | export WDIR=.... |
− | export INP1= | + | export INP1=small1.inp24 |
− | export INP2= | + | export INP2=small2.inp24 |
export OUT1=run1 | export OUT1=run1 | ||
Line 60: | Line 46: | ||
### Scratch files will be put in $GAUSS_SCRDIR/$SLURM_JOB_ID/$OUTx | ### Scratch files will be put in $GAUSS_SCRDIR/$SLURM_JOB_ID/$OUTx | ||
### Input files are assumed to be in $WDIR/$INPx | ### Input files are assumed to be in $WDIR/$INPx | ||
− | |||
mkdir -p $WDIR/$SLURM_JOB_ID/$OUT1 | mkdir -p $WDIR/$SLURM_JOB_ID/$OUT1 | ||
mkdir -p $WDIR/$SLURM_JOB_ID/$OUT2 | mkdir -p $WDIR/$SLURM_JOB_ID/$OUT2 | ||
Line 82: | Line 67: | ||
<syntaxhighlight lang="bash"> | <syntaxhighlight lang="bash"> | ||
#!/usr/local_rwth/bin/zsh | #!/usr/local_rwth/bin/zsh | ||
− | + | #SBATCH --job-name=run4x12 | |
− | #SBATCH --job-name= | ||
− | |||
− | |||
#SBATCH --output=%j.log | #SBATCH --output=%j.log | ||
#SBATCH --error=%j.err | #SBATCH --error=%j.err | ||
− | |||
− | |||
− | |||
− | |||
#SBATCH --time=00-01:00:00 | #SBATCH --time=00-01:00:00 | ||
+ | #SBATCH --mem=180G | ||
− | #SBATCH -- | + | ### exclusive usage of a single node |
− | + | #SBATCH --exclusive | |
− | ### | + | ### use all cores, one thread per core |
#SBATCH --ntasks=1 --nodes=1 | #SBATCH --ntasks=1 --nodes=1 | ||
#SBATCH --cpus-per-task=48 | #SBATCH --cpus-per-task=48 | ||
#SBATCH --threads-per-core=1 | #SBATCH --threads-per-core=1 | ||
− | ### | + | ### prepare your environment for running gaussian |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
module load CHEMISTRY gaussian | module load CHEMISTRY gaussian | ||
− | ### the gaussian module allocates the scratch directory | + | ### make sure this environment variable points to a suitable location |
+ | ### here the gaussian module allocates the scratch directory | ||
echo $GAUSS_SCRDIR | echo $GAUSS_SCRDIR | ||
### adjust working directory and input file names and output directory names | ### adjust working directory and input file names and output directory names | ||
− | export WDIR= | + | export WDIR=.... |
− | export INP1= | + | export INP1=small1.inp12 |
− | export INP2= | + | export INP2=small2.inp12 |
− | export INP3= | + | export INP3=small3.inp12 |
− | export INP4= | + | export INP4=small4.inp12 |
export OUT1=run1 | export OUT1=run1 | ||
Line 142: | Line 111: | ||
mkdir -p $GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT3 | mkdir -p $GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT3 | ||
mkdir -p $GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT4 | mkdir -p $GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT4 | ||
− | |||
− | |||
− | |||
( cd $WDIR/$SLURM_JOB_ID/$OUT1; \ | ( cd $WDIR/$SLURM_JOB_ID/$OUT1; \ | ||
Line 164: | Line 130: | ||
wait $pid1 $pid2 $pid3 $pid4 | wait $pid1 $pid2 $pid3 $pid4 | ||
− | |||
− | |||
− | |||
− | |||
− | |||
</syntaxhighlight> | </syntaxhighlight> | ||
Revision as of 18:04, 20 March 2019
Work in progress ...
Mehrfaches Starten von Programmen mit etwa gleicher Laufzeit in einem Batchjob auf einem Multicore-Knoten am Beispiel von Gaussian.
Basic usage
Problem: Ein Programm (hier Gaussian) skaliert nicht gut über alle Cores eines Multicore-Knotens. Bei nicht-exklusiver Nutzung von solchen Rechenknoten laufen Jobs mehrerer Nutzer gleichzeitig und beeinflussen sich gegenseitig in ihrer Laufzeit. Eine zuverlässige Abschätzung der Laufzeit zur Angabe des Rechenzeitlimits fällt dadurch schwer. Eine Maßnahme dagegen wäre das Starten von mehreren Programmläufen innerhalb eines Batchjobs, der einen Knoten exklusiv nutzt. Bei der üblichen NUMA-Architektur von solchen Multicore-Knoten ist es wichtig, die einzelnen Programmläufe sorgfältig zu platzieren - z.B. ein Programmlauf pro NUMA-Node.
#!/usr/local_rwth/bin/zsh
### Slurm job script: start two Gaussian (g09) runs concurrently inside one
### exclusive single-node job. Each run is bound with numactl to its own
### pair of NUMA nodes (0,1 and 2,3) for both CPU and memory.
#SBATCH --job-name=run2x24
#SBATCH --output=%j.log
#SBATCH --error=%j.err
#SBATCH --time=00-01:00:00
#SBATCH --mem=180G
### exclusive usage of a single node
#SBATCH --exclusive
### use all cores, one thread per core
#SBATCH --ntasks=1 --nodes=1
#SBATCH --cpus-per-task=48
#SBATCH --threads-per-core=1
### prepare your environment for running gaussian
module load CHEMISTRY gaussian
### make sure this environment variable points to a suitable location
### here the gaussian module allocates the scratch directory
echo "$GAUSS_SCRDIR"
### adjust working directory and input file names and output directory names
### (WDIR=.... is a placeholder and must be replaced by a real directory)
export WDIR=....
export INP1=small1.inp24
export INP2=small2.inp24
export OUT1=run1
export OUT2=run2
### the program will run in $WDIR/$SLURM_JOB_ID/$OUTx
### Scratch files will be put in $GAUSS_SCRDIR/$SLURM_JOB_ID/$OUTx
### Input files are assumed to be in $WDIR/$INPx
### abort early if any run or scratch directory cannot be created
mkdir -p "$WDIR/$SLURM_JOB_ID/$OUT1" || exit 1
mkdir -p "$WDIR/$SLURM_JOB_ID/$OUT2" || exit 1
mkdir -p "$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT1" || exit 1
mkdir -p "$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT2" || exit 1
### print the effective binding of each placement into the job log
numactl --cpubind=0,1 --membind=0,1 -- numactl --show
numactl --cpubind=2,3 --membind=2,3 -- numactl --show
### each run lives in its own subshell, so the cd and the per-run
### GAUSS_SCRDIR do not leak into the other run or into the parent script;
### a failed cd aborts that run instead of starting g09 in the wrong directory
( cd "$WDIR/$SLURM_JOB_ID/$OUT1" || exit 1
  export GAUSS_SCRDIR="$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT1"
  ### ../../$INP1 resolves to $WDIR/$INP1 from inside the run directory
  numactl --cpubind=0,1 --membind=0,1 -- timex g09 < "../../$INP1" > g09.out 2> g09.err ) &
pid1=$!
( cd "$WDIR/$SLURM_JOB_ID/$OUT2" || exit 1
  export GAUSS_SCRDIR="$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT2"
  numactl --cpubind=2,3 --membind=2,3 -- timex g09 < "../../$INP2" > g09.out 2> g09.err ) &
pid2=$!
### keep the batch script alive until both background runs have finished
wait "$pid1" "$pid2"
#!/usr/local_rwth/bin/zsh
### Slurm job script: start four Gaussian (g09) runs concurrently inside one
### exclusive single-node job. Each run is bound with numactl to exactly one
### NUMA node (0, 1, 2, 3) for both CPU and memory.
#SBATCH --job-name=run4x12
#SBATCH --output=%j.log
#SBATCH --error=%j.err
#SBATCH --time=00-01:00:00
#SBATCH --mem=180G
### exclusive usage of a single node
#SBATCH --exclusive
### use all cores, one thread per core
#SBATCH --ntasks=1 --nodes=1
#SBATCH --cpus-per-task=48
#SBATCH --threads-per-core=1
### prepare your environment for running gaussian
module load CHEMISTRY gaussian
### make sure this environment variable points to a suitable location
### here the gaussian module allocates the scratch directory
echo "$GAUSS_SCRDIR"
### adjust working directory and input file names and output directory names
### (WDIR=.... is a placeholder and must be replaced by a real directory)
export WDIR=....
export INP1=small1.inp12
export INP2=small2.inp12
export INP3=small3.inp12
export INP4=small4.inp12
export OUT1=run1
export OUT2=run2
export OUT3=run3
export OUT4=run4
### the program will run in $WDIR/$SLURM_JOB_ID/$OUTx
### Scratch files will be put in $GAUSS_SCRDIR/$SLURM_JOB_ID/$OUTx
### Input files are assumed to be in $WDIR/$INPx
### abort early if any run or scratch directory cannot be created
mkdir -p "$WDIR/$SLURM_JOB_ID/$OUT1" || exit 1
mkdir -p "$WDIR/$SLURM_JOB_ID/$OUT2" || exit 1
mkdir -p "$WDIR/$SLURM_JOB_ID/$OUT3" || exit 1
mkdir -p "$WDIR/$SLURM_JOB_ID/$OUT4" || exit 1
mkdir -p "$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT1" || exit 1
mkdir -p "$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT2" || exit 1
mkdir -p "$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT3" || exit 1
mkdir -p "$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT4" || exit 1
### each run lives in its own subshell, so the cd and the per-run
### GAUSS_SCRDIR do not leak into the other runs or into the parent script;
### a failed cd aborts that run instead of starting g09 in the wrong directory
( cd "$WDIR/$SLURM_JOB_ID/$OUT1" || exit 1
  export GAUSS_SCRDIR="$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT1"
  ### ../../$INP1 resolves to $WDIR/$INP1 from inside the run directory
  numactl --cpubind=0 --membind=0 -- timex g09 < "../../$INP1" > g09.out 2> g09.err ) &
pid1=$!
( cd "$WDIR/$SLURM_JOB_ID/$OUT2" || exit 1
  export GAUSS_SCRDIR="$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT2"
  numactl --cpubind=1 --membind=1 -- timex g09 < "../../$INP2" > g09.out 2> g09.err ) &
pid2=$!
( cd "$WDIR/$SLURM_JOB_ID/$OUT3" || exit 1
  export GAUSS_SCRDIR="$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT3"
  numactl --cpubind=2 --membind=2 -- timex g09 < "../../$INP3" > g09.out 2> g09.err ) &
pid3=$!
( cd "$WDIR/$SLURM_JOB_ID/$OUT4" || exit 1
  export GAUSS_SCRDIR="$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT4"
  numactl --cpubind=3 --membind=3 -- timex g09 < "../../$INP4" > g09.out 2> g09.err ) &
### the PID of the fourth run must be recorded too; otherwise the wait below
### would not cover it and the script could exit while run 4 is still active
pid4=$!
### keep the batch script alive until all four background runs have finished
wait "$pid1" "$pid2" "$pid3" "$pid4"
Links and more Information
t.b.a.