Difference between revisions of "Multiple Program Runs in one Slurm Job"

From HPC Wiki
Jump to: navigation, search
m (Dieter-anmey-6410@rwth-aachen.de moved page Multiple Program Runs in one Slurm Job to Multiple Program Runs in one Slurm Job)
Line 2: Line 2:
  
  
Short introduction: Sample Page is a page that shows how to layout a Wiki-Page. In the introduction you should describe what this is and what it is used for.
+
Mehrfaches Starten von Programmen mit etwa gleicher Laufzeit in einem Batchjob auf einem Multicore-Knoten am Beispiel von Gaussian.
  
[[File:ProPE_Logo.PNG|thumb|200px|ProPE Logo]]
+
__TOC__
  
  
 
== Basic usage ==
 
== Basic usage ==
 +
Problem: Ein Programm (hier Gaussian) skaliert nicht gut über alle Cores eines Multicore-Knotens.
 +
Bei nicht-exklusiver Nutzung von solchen Rechenknoten laufen Jobs mehrerer Nutzer gleichzeitig und beeinflussen sich gegenseitig in ihrer Laufzeit. Eine zuverlässige Abschätzung der Laufzeit zur Angabe des Rechenzeitlimits fällt dadurch schwer.
 +
Eine Maßnahme dagegen wäre das Starten von mehreren Programmläufen innerhalb eines Batchjobs, der einen Knoten exklusiv nutzt.
 +
Bei üblicher NUMA-Architektur von solchen Multicore-Knoten ist es wichtig, die einzelnen Programmläufe sorgfältig zu platzieren - z.B. ein Programmlauf pro NUMA-Node.
 +
 
<syntaxhighlight lang="bash">
 
<syntaxhighlight lang="bash">
$ cd ..
+
#!/usr/local_rwth/bin/zsh
 +
 
 +
#SBATCH  --job-name=run2x24 
 +
 
 +
### output-Pfad und error-file-Pfad
 +
#SBATCH --output=%j.log
 +
#SBATCH --error=%j.err
 +
 
 +
### Anzahl Stunden  dd-hh:mm:ss
 +
#SBATCH --time=00-01:00:00
 +
 
 +
#SBATCH --mem=180G
 +
 
 +
### Anzahl Prozessoren
 +
#SBATCH --ntasks=1 --nodes=1
 +
#SBATCH --cpus-per-task=48
 +
#SBATCH --threads-per-core=1
 +
 
 +
### exclusive usage of a single node ?
 +
#SBATCH --exclusive
 +
 
 +
### CLAIX-2018
 +
#SBATCH --partition=c18m
 +
 
 +
### use Project account once accounting is implemented ##SBATCH --account rwth0303   
 +
 
 +
### send email at job start and end
 +
#SBATCH --mail-type=ALL
 +
#SBATCH --mail-user=anmey@itc.rwth-aachen.de
 +
 
 +
module load CHEMISTRY gaussian
 +
### the gaussian module allocates the scratch directory
 +
echo $GAUSS_SCRDIR
 +
 
 +
### adjust working directory and input file names and output directory names
 +
export WDIR=/home/da026566/hpc/benchmarks/Gaussian/Raabe
 +
 
 +
export INP1=small.inp24-10gb
 +
export INP2=small.inp24-10gb
 +
 
 +
export OUT1=run1
 +
export OUT2=run2
 +
 
 +
### the program will run in $WDIR/$SLURM_JOB_ID/$OUTx
 +
### Scratch files will be put in $GAUSS_SCRDIR/$SLURM_JOB_ID/$OUTx
 +
### Input files are assumed to be in $WDIR/$INPx
 +
 
 +
mkdir -p $WDIR/$SLURM_JOB_ID/$OUT1
 +
mkdir -p $WDIR/$SLURM_JOB_ID/$OUT2
 +
mkdir -p $GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT1
 +
mkdir -p $GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT2
 +
 
 +
numactl --cpubind=0,1 --membind=0,1 -- numactl -show
 +
numactl --cpubind=2,3 --membind=2,3 -- numactl -show
 +
 
 +
( cd $WDIR/$SLURM_JOB_ID/$OUT1; \
 +
  export GAUSS_SCRDIR=$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT1; \
 +
  numactl --cpubind=0,1 --membind=0,1 -- timex g09 < ../../$INP1 > g09.out 2> g09.err ) &
 +
pid1=$!
 +
( cd $WDIR/$SLURM_JOB_ID/$OUT2; \
 +
  export GAUSS_SCRDIR=$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT2; \
 +
  numactl --cpubind=2,3 --membind=2,3 -- timex g09 < ../../$INP2 > g09.out 2> g09.err ) &
 +
pid2=$!
 +
 
 +
wait $pid1 $pid2
 
</syntaxhighlight>
 
</syntaxhighlight>
tut a, b, c
 
 
<syntaxhighlight lang="bash">
 
<syntaxhighlight lang="bash">
$ ls -l
+
#!/usr/local_rwth/bin/zsh
</syntaxhighlight>
+
 
tut d, e und f
+
#SBATCH  --job-name=Salvarsan_hexa_S_DFT_fine_CAM_small 
 +
 
 +
### output-Pfad und error-file-Pfad
 +
#SBATCH --output=%j.log
 +
#SBATCH --error=%j.err
 +
 
 +
### require access to Lustre Filesystem (HPCWORK) ? ## SBATCH -C hpcwork
 +
 
 +
### Anzahl Stunden  dd-hh:mm:ss
 +
#SBATCH --time=00-01:00:00
 +
 
 +
#SBATCH --mem=45G
 +
 
 +
### Anzahl Prozessoren
 +
#SBATCH --ntasks=1 --nodes=1
 +
#SBATCH --cpus-per-task=48
 +
#SBATCH --threads-per-core=1
 +
 
 +
### exclusive usage of a single node ?
 +
#SBATCH --exclusive
 +
 
 +
### CLAIX-2018
 +
#SBATCH --partition=c18m
 +
 
 +
### use Project account once accounting is implemented ##SBATCH --account rwth0303   
 +
 
 +
### send email at job start and end
 +
#SBATCH --mail-type=ALL
 +
#SBATCH --mail-user=anmey@itc.rwth-aachen.de
 +
 
 +
module load CHEMISTRY gaussian
 +
### the gaussian module allocates the scratch directory
 +
echo $GAUSS_SCRDIR
 +
 
 +
### adjust working directory and input file names and output directory names
 +
export WDIR=/home/da026566/hpc/benchmarks/Gaussian/Raabe
 +
 
 +
export INP1=small.inp12-5gb
 +
export INP2=small.inp12-5gb
 +
export INP3=small.inp12-5gb
 +
export INP4=small.inp12-5gb
 +
 
 +
export OUT1=run1
 +
export OUT2=run2
 +
export OUT3=run3
 +
export OUT4=run4
 +
 
 +
### the program will run in $WDIR/$SLURM_JOB_ID/$OUTx
 +
### Scratch files will be put in $GAUSS_SCRDIR/$SLURM_JOB_ID/$OUTx
 +
### Input files are assumed to be in $WDIR/$INPx
 +
 
 +
mkdir -p $WDIR/$SLURM_JOB_ID/$OUT1
 +
mkdir -p $WDIR/$SLURM_JOB_ID/$OUT2
 +
mkdir -p $WDIR/$SLURM_JOB_ID/$OUT3
 +
mkdir -p $WDIR/$SLURM_JOB_ID/$OUT4
 +
mkdir -p $GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT1
 +
mkdir -p $GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT2
 +
mkdir -p $GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT3
 +
mkdir -p $GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT4
 +
 
 +
uptime
 +
date
 +
 
 +
( cd $WDIR/$SLURM_JOB_ID/$OUT1; \
 +
  export GAUSS_SCRDIR=$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT1; \
 +
  numactl --cpubind=0 --membind=0 -- timex g09 < ../../$INP1 > g09.out  ) &
 +
pid1=$!
 +
( cd $WDIR/$SLURM_JOB_ID/$OUT2; \
 +
  export GAUSS_SCRDIR=$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT2; \
 +
  numactl --cpubind=1 --membind=1 -- timex g09 < ../../$INP2 > g09.out  ) &
 +
pid2=$!
 +
( cd $WDIR/$SLURM_JOB_ID/$OUT3; \
 +
  export GAUSS_SCRDIR=$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT3; \
 +
  numactl --cpubind=2 --membind=2 -- timex g09 < ../../$INP3 > g09.out  ) &
 +
pid3=$!
 +
( cd $WDIR/$SLURM_JOB_ID/$OUT4; \
 +
  export GAUSS_SCRDIR=$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT4; \
 +
  numactl --cpubind=3 --membind=3 -- timex g09 < ../../$INP4 > g09.out  ) &
  
== Tips and Tricks ==
+
wait $pid1 $pid2 $pid3 $pid4
Get the source code of this sample Page and copy it into new pages to start with a reasonable structure.
 
  
== Common Pitfalls ==
+
date
blabla ... Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.
+
uptime
 +
cd $WDIR
 +
ls -l $WDIR/$SLURM_JOB_ID/*/*
 +
ls -l $GAUSS_SCRDIR/$SLURM_JOB_ID/*/*
 +
</syntaxhighlight>
  
  
 
== Links and more Information ==
 
== Links and more Information ==
For some information how to do some things like LaTeX, Code Highlight or pictures, check the [[Wiki Syntax]]
+
t.b.a.

Revision as of 17:55, 20 March 2019

Work in progress ...


Mehrfaches Starten von Programmen mit etwa gleicher Laufzeit in einem Batchjob auf einem Multicore-Knoten am Beispiel von Gaussian.


Basic usage

Problem: Ein Programm (hier Gaussian) skaliert nicht gut über alle Cores eines Multicore-Knotens. Bei nicht-exklusiver Nutzung von solchen Rechenknoten laufen Jobs mehrerer Nutzer gleichzeitig und beeinflussen sich gegenseitig in ihrer Laufzeit. Eine zuverlässige Abschätzung der Laufzeit zur Angabe des Rechenzeitlimits fällt dadurch schwer. Eine Maßnahme dagegen wäre das Starten von mehreren Programmläufen innerhalb eines Batchjobs, der einen Knoten exklusiv nutzt. Bei üblicher NUMA-Architektur von solchen Multicore-Knoten ist es wichtig, die einzelnen Programmläufe sorgfältig zu platzieren - z.B. ein Programmlauf pro NUMA-Node.

#!/usr/local_rwth/bin/zsh

### Slurm batch script: starts two Gaussian (g09) runs concurrently inside one
### exclusive single-node job. Each run gets its own working directory and
### scratch directory and is bound with numactl to its own pair of NUMA nodes
### (0,1 and 2,3).

#SBATCH --job-name=run2x24

### output path and error file path
#SBATCH --output=%j.log
#SBATCH --error=%j.err

### time limit, format dd-hh:mm:ss
#SBATCH --time=00-01:00:00

#SBATCH --mem=180G

### number of processors
#SBATCH --ntasks=1 --nodes=1
#SBATCH --cpus-per-task=48
#SBATCH --threads-per-core=1

### exclusive usage of a single node ?
#SBATCH --exclusive

### CLAIX-2018
#SBATCH --partition=c18m

### use Project account once accounting is implemented ##SBATCH --account rwth0303

### send email at job start and end
#SBATCH --mail-type=ALL
#SBATCH --mail-user=anmey@itc.rwth-aachen.de

module load CHEMISTRY gaussian
### the gaussian module allocates the scratch directory
### stop here if it did not: the mkdir calls below would otherwise start at "/"
: "${GAUSS_SCRDIR:?GAUSS_SCRDIR is not set; check the gaussian module}"
echo "$GAUSS_SCRDIR"

### adjust working directory and input file names and output directory names
export WDIR=/home/da026566/hpc/benchmarks/Gaussian/Raabe

export INP1=small.inp24-10gb
export INP2=small.inp24-10gb

export OUT1=run1
export OUT2=run2

### the program will run in $WDIR/$SLURM_JOB_ID/$OUTx
### Scratch files will be put in $GAUSS_SCRDIR/$SLURM_JOB_ID/$OUTx
### Input files are assumed to be in $WDIR/$INPx

mkdir -p "$WDIR/$SLURM_JOB_ID/$OUT1"
mkdir -p "$WDIR/$SLURM_JOB_ID/$OUT2"
mkdir -p "$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT1"
mkdir -p "$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT2"

### diagnostic only: print the NUMA policy that each run will be started with
numactl --cpubind=0,1 --membind=0,1 -- numactl --show
numactl --cpubind=2,3 --membind=2,3 -- numactl --show

### Each run lives in its own subshell so that the cd and the GAUSS_SCRDIR
### override stay private to that run. "|| exit 1" keeps a failed cd from
### starting g09 in the wrong directory, where both runs would write the
### same g09.out / g09.err.
( cd "$WDIR/$SLURM_JOB_ID/$OUT1" || exit 1
  export GAUSS_SCRDIR="$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT1"
  numactl --cpubind=0,1 --membind=0,1 -- timex g09 < "../../$INP1" > g09.out 2> g09.err ) &
pid1=$!
( cd "$WDIR/$SLURM_JOB_ID/$OUT2" || exit 1
  export GAUSS_SCRDIR="$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT2"
  numactl --cpubind=2,3 --membind=2,3 -- timex g09 < "../../$INP2" > g09.out 2> g09.err ) &
pid2=$!

### wait for both runs individually so that a failure of either one is
### reflected in the exit code of the batch job
rc=0
wait "$pid1" || rc=$?
wait "$pid2" || rc=$?
exit "$rc"
#!/usr/local_rwth/bin/zsh

### Slurm batch script: starts four Gaussian (g09) runs concurrently inside
### one exclusive single-node job. Each run gets its own working directory
### and scratch directory and is bound with numactl to one NUMA node (0..3).

#SBATCH --job-name=Salvarsan_hexa_S_DFT_fine_CAM_small

### output path and error file path
#SBATCH --output=%j.log
#SBATCH --error=%j.err

### require access to Lustre Filesystem (HPCWORK) ? ## SBATCH -C hpcwork

### time limit, format dd-hh:mm:ss
#SBATCH --time=00-01:00:00

#SBATCH --mem=45G

### number of processors
#SBATCH --ntasks=1 --nodes=1
#SBATCH --cpus-per-task=48
#SBATCH --threads-per-core=1

### exclusive usage of a single node ?
#SBATCH --exclusive

### CLAIX-2018
#SBATCH --partition=c18m

### use Project account once accounting is implemented ##SBATCH --account rwth0303

### send email at job start and end
#SBATCH --mail-type=ALL
#SBATCH --mail-user=anmey@itc.rwth-aachen.de

module load CHEMISTRY gaussian
### the gaussian module allocates the scratch directory
### stop here if it did not: the mkdir calls below would otherwise start at "/"
: "${GAUSS_SCRDIR:?GAUSS_SCRDIR is not set; check the gaussian module}"
echo "$GAUSS_SCRDIR"

### adjust working directory and input file names and output directory names
export WDIR=/home/da026566/hpc/benchmarks/Gaussian/Raabe

export INP1=small.inp12-5gb
export INP2=small.inp12-5gb
export INP3=small.inp12-5gb
export INP4=small.inp12-5gb

export OUT1=run1
export OUT2=run2
export OUT3=run3
export OUT4=run4

### the program will run in $WDIR/$SLURM_JOB_ID/$OUTx
### Scratch files will be put in $GAUSS_SCRDIR/$SLURM_JOB_ID/$OUTx
### Input files are assumed to be in $WDIR/$INPx

mkdir -p "$WDIR/$SLURM_JOB_ID/$OUT1"
mkdir -p "$WDIR/$SLURM_JOB_ID/$OUT2"
mkdir -p "$WDIR/$SLURM_JOB_ID/$OUT3"
mkdir -p "$WDIR/$SLURM_JOB_ID/$OUT4"
mkdir -p "$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT1"
mkdir -p "$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT2"
mkdir -p "$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT3"
mkdir -p "$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT4"

### record load and wall-clock time before the runs start
uptime
date

### Each run lives in its own subshell so that the cd and the GAUSS_SCRDIR
### override stay private to that run. "|| exit 1" keeps a failed cd from
### starting g09 in the wrong directory, where all runs would write the
### same g09.out. stderr is not redirected here, so it ends up in the job's
### error file.
( cd "$WDIR/$SLURM_JOB_ID/$OUT1" || exit 1
  export GAUSS_SCRDIR="$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT1"
  numactl --cpubind=0 --membind=0 -- timex g09 < "../../$INP1" > g09.out ) &
pid1=$!
( cd "$WDIR/$SLURM_JOB_ID/$OUT2" || exit 1
  export GAUSS_SCRDIR="$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT2"
  numactl --cpubind=1 --membind=1 -- timex g09 < "../../$INP2" > g09.out ) &
pid2=$!
( cd "$WDIR/$SLURM_JOB_ID/$OUT3" || exit 1
  export GAUSS_SCRDIR="$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT3"
  numactl --cpubind=2 --membind=2 -- timex g09 < "../../$INP3" > g09.out ) &
pid3=$!
( cd "$WDIR/$SLURM_JOB_ID/$OUT4" || exit 1
  export GAUSS_SCRDIR="$GAUSS_SCRDIR/$SLURM_JOB_ID/$OUT4"
  numactl --cpubind=3 --membind=3 -- timex g09 < "../../$INP4" > g09.out ) &
### the PID of the fourth run must be captured as well; without it the wait
### below would only cover the first three runs
pid4=$!

### wait for every run individually so that a failure of any one of them is
### reflected in the exit code of the batch job
rc=0
wait "$pid1" || rc=$?
wait "$pid2" || rc=$?
wait "$pid3" || rc=$?
wait "$pid4" || rc=$?

### record time and load after the runs, then list the result and scratch files
date
uptime
cd "$WDIR"
ls -l "$WDIR/$SLURM_JOB_ID"/*/*
ls -l "$GAUSS_SCRDIR/$SLURM_JOB_ID"/*/*

exit "$rc"


Links and more Information

t.b.a.