...

Note
titleUse this for GPU-aware MPI codes

To use GPU-aware Cray MPICH, users must load the following modules and set the following environment variable:

module load craype-accel-amd-gfx90a
module load rocm/<VERSION>
export MPICH_GPU_SUPPORT_ENABLED=1
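
A minimal sketch of where these settings sit in practice is given below. The executable name my_gpu_aware_code is only a placeholder, and the srun line follows the 8-GPUs-per-node pattern used in the example scripts further below; adapt both to your own code and allocation.

Code Block
languagebash
themeEmacs
titleListing N. Sketch: GPU-aware MPI settings in a batch script (placeholder executable)
#Load the GPU-aware MPI environment (adapt the ROCm version to your setup):
module load craype-accel-amd-gfx90a
module load rocm/<VERSION>
export MPICH_GPU_SUPPORT_ENABLED=1 #Enables GPU-aware MPI communication among GPUs

#... rest of the batch script ...

#Placeholder executable; replace with your own GPU-aware MPI code:
srun -l -u -N 1 -n 8 -c 8 --gres=gpu:8 --gpus-per-task=1 --gpu-bind=closest ./my_gpu_aware_code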

...

Column
width900px


Code Block
languagebash
themeDJango
titleTerminal N. Use of the "hello_jobstep" code from an salloc session (compiling)
$ cd $MYSCRATCH
$ git clone https://github.com/PawseySC/hello_jobstep.git
Cloning into 'hello_jobstep'...
...
Resolving deltas: 100% (41/41), done.
$ cd hello_jobstep


$ module load PrgEnv-cray craype-accel-amd-gfx90a rocm/<VERSION>
$ make hello_jobstep
CC -std=c++11 -fopenmp --rocm-path=/opt/rocm -x hip -D__HIP_ARCH_GFX90A__=1 --offload-arch=gfx90a -I/opt/rocm/include -c hello_jobstep.cpp
CC -fopenmp --rocm-path=/opt/rocm -L/opt/rocm/lib -lamdhip64 hello_jobstep.o -o hello_jobstep
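
Assuming the salloc session requested a single "allocation-pack" (1 GPU), a sketch of running the freshly compiled binary from the same session is shown below; the srun parameters follow the single-GPU example script given later on this page.

Code Block
languagebash
themeDJango
titleTerminal N. Sketch: running "hello_jobstep" from the same salloc session (assumes 1 GPU in the allocation)
$ export OMP_NUM_THREADS=1
$ srun -l -u -N 1 -n 1 -c 8 --gres=gpu:1 --gpus-per-task=1 --gpu-bind=closest ./hello_jobstep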


...

Column
width900px


Code Block
languagebash
themeEmacs
titleListing N. exampleScript_1NodeShared_1GPU.sh
linenumberstrue
#!/bin/bash --login
#SBATCH --job-name=1GPUSharedNode
#SBATCH --partition=gpu
#SBATCH --nodes=1              #1 node in this example 
#SBATCH --gres=gpu:1           #1 GPU per node (1 "allocation-pack" in total for the job)
#SBATCH --time=00:05:00
#SBATCH --account=<yourProject>-gpu #IMPORTANT: use your own project and the -gpu suffix
#(Note that there is no request for exclusive access to the node)

#----
#Loading needed modules (adapt this for your own purposes):
module load PrgEnv-cray
module load rocm/<VERSION> craype-accel-amd-gfx90a
echo -e "\n\n#------------------------#"
module list

#----
#Printing the status of the given allocation
echo -e "\n\n#------------------------#"
echo "Printing from scontrol:"
scontrol show job ${SLURM_JOBID}

#----
#Definition of the executable (we assume the example code has been compiled and is available in $MYSCRATCH):
exeDir=$MYSCRATCH/hello_jobstep
exeName=hello_jobstep
theExe=$exeDir/$exeName

#----
#MPI & OpenMP settings
#Not needed for 1 GPU: export MPICH_GPU_SUPPORT_ENABLED=1 #This allows for GPU-aware MPI communication among GPUs
export OMP_NUM_THREADS=1           #This controls the real CPU-cores per task for the executable

#----
#Execution
#Note: srun needs the explicit indication of the full set of parameters for the use of resources in the job step.
#      These are independent of the allocation parameters (which are not inherited by srun).
#      For optimal GPU binding using slurm options,
#      "--gpus-per-task=1" and "--gpu-bind=closest" create the optimal binding of GPUs.
#      (Although in this case these can be omitted, as only 1 "allocation-pack" has been requested.)
#      "-c 8" is used to force allocation of 1 task per CPU chiplet. Then, the REAL number of threads
#         for the code SHOULD be defined by the environment variables above.
#      (The "-l" option is for displaying, at the beginning of each line, the taskID that generates the output.)
#      (The "-u" option is for unbuffered output, so that output is displayed as soon as it's generated.)
#      (If the output needs to be sorted for clarity, then add "| sort -n" at the end of the command.)
echo -e "\n\n#------------------------#"
echo "Test code execution:"
srun -l -u -N 1 -n 1 -c 8 --gres=gpu:1 --gpus-per-task=1 --gpu-bind=closest ${theExe}

#----
#Printing information of finished job steps:
echo -e "\n\n#------------------------#"
echo "Printing information of finished jobs steps using sacct:"
sacct -j ${SLURM_JOBID} -o jobid%20,Start%20,elapsed%20

#----
#Done
echo -e "\n\n#------------------------#"
echo "Done"


...

Column
width900px


Code Block
languagebash
themeEmacs
titleListing N. exampleScript_2NodesExclusive_16GPUs_8VisiblePerTask.sh
linenumberstrue
#!/bin/bash --login
#SBATCH --job-name=16GPUExclusiveNode-8GPUsVisiblePerTask
#SBATCH --partition=gpu
#SBATCH --nodes=2              #2 nodes in this example 
#SBATCH --exclusive            #All resources of the node are exclusive to this job
#                              #8 GPUs per node (16 "allocation-packs" in total for the job)
#SBATCH --time=00:05:00
#SBATCH --account=<yourProject>-gpu #IMPORTANT: use your own project and the -gpu suffix

#----
#Loading needed modules (adapt this for your own purposes):
#For the hello_jobstep example:
module load PrgEnv-cray
module load rocm/<VERSION> craype-accel-amd-gfx90a
#OR for a tensorflow example:
#module load tensorflow/<version>
echo -e "\n\n#------------------------#"
module list

#----
#Printing the status of the given allocation
echo -e "\n\n#------------------------#"
echo "Printing from scontrol:"
scontrol show job ${SLURM_JOBID}

#----
#Definition of the executable (we assume the example code has been compiled and is available in $MYSCRATCH):
exeDir=$MYSCRATCH/hello_jobstep
exeName=hello_jobstep
theExe=$exeDir/$exeName

#----
#MPI & OpenMP settings if needed (these won't work for Tensorflow):
export MPICH_GPU_SUPPORT_ENABLED=1 #This allows for GPU-aware MPI communication among GPUs
export OMP_NUM_THREADS=1           #This controls the real CPU-cores per task for the executable

#----
#TensorFlow settings if needed:
#  The following two variables control the real number of threads in TensorFlow code:
#export TF_NUM_INTEROP_THREADS=1    #Number of threads for independent operations
#export TF_NUM_INTRAOP_THREADS=1    #Number of threads within individual operations 

#----
#Execution
#Note: srun needs the explicit indication of the full set of parameters for the use of resources in the job step.
#      These are independent of the allocation parameters (which are not inherited by srun).
#      Each task needs access to all 8 available GPUs on the node where it's running.
#      So, no optimal binding can be provided by the scheduler.
#      Therefore, "--gpus-per-task" and "--gpu-bind" are not used.
#      Optimal use of resources is now the responsibility of the code.
#      "-c 8" is used to force allocation of 1 task per CPU chiplet. Then, the REAL number of threads
#         for the code SHOULD be defined by the environment variables above.
#      (The "-l" option is for displaying, at the beginning of each line, the taskID that generates the output.)
#      (The "-u" option is for unbuffered output, so that output is displayed as soon as it's generated.)
#      (If the output needs to be sorted for clarity, then add "| sort -n" at the end of the command.)
echo -e "\n\n#------------------------#"
echo "Test code execution:"
srun -l -u -N 2 -n 16 -c 8 --gres=gpu:8 ${theExe}
#srun -l -u -N 2 -n 16 -c 8 --gres=gpu:8 python3 ${tensorFlowScript}

#----
#Printing information of finished job steps:
echo -e "\n\n#------------------------#"
echo "Printing information of finished jobs steps using sacct:"
sacct -j ${SLURM_JOBID} -o jobid%20,Start%20,elapsed%20

#----
#Done
echo -e "\n\n#------------------------#"
echo "Done"


...

Column
width900px


Code Block
languagebash
themeEmacs
titleListing N. exampleScript_1NodeShared_6GPUs_2VisiblePerTask.sh
linenumberstrue
#!/bin/bash --login
#SBATCH --job-name=6GPUSharedNode-2GPUsVisiblePerTask
#SBATCH --partition=gpu
#SBATCH --nodes=1              #1 node in this example 
#SBATCH --gres=gpu:6           #6 GPUs per node (6 "allocation-packs" in total for the job)
#SBATCH --time=00:05:00
#SBATCH --account=<yourProject>-gpu #IMPORTANT: use your own project and the -gpu suffix

#----
#Loading needed modules (adapt this for your own purposes):
module load PrgEnv-cray
module load rocm/<VERSION> craype-accel-amd-gfx90a
echo -e "\n\n#------------------------#"
module list

#----
#Printing the status of the given allocation
echo -e "\n\n#------------------------#"
echo "Printing from scontrol:"
scontrol show job ${SLURM_JOBID}

#----
#Definition of the executable (we assume the example code has been compiled and is available in $MYSCRATCH):
exeDir=$MYSCRATCH/hello_jobstep
exeName=hello_jobstep
theExe=$exeDir/$exeName

#----
#MPI & OpenMP settings if needed (these won't work for Tensorflow):
export MPICH_GPU_SUPPORT_ENABLED=1 #This allows for GPU-aware MPI communication among GPUs
export OMP_NUM_THREADS=1           #This controls the real CPU-cores per task for the executable

#----
#Execution
#Note: srun needs the explicit indication of the full set of parameters for the use of resources in the job step.
#      These are independent of the allocation parameters (which are not inherited by srun).
#      For the best possible GPU binding using slurm options,
#      "--gpus-per-task=2" and "--gpu-bind=closest" will provide the best GPUs to the tasks.
#      But best is still not optimal.
#      Each task has access to 2 of the available GPUs on the node where it's running.
#      Optimal use of the resources of each of the 2 GPUs accessible per task is now the responsibility of the code.
#      IMPORTANT: Note the use of "-c 16" to "reserve" 2 chiplets per task, which is consistent with
#                 the use of "--gpus-per-task=2" to "reserve" 2 GPUs per task. Then, the REAL number of
#                 threads for the code SHOULD be defined by the environment variables above.
#      (The "-l" option is for displaying, at the beginning of each line, the taskID that generates the output.)
#      (The "-u" option is for unbuffered output, so that output is displayed as soon as it's generated.)
#      (If the output needs to be sorted for clarity, then add "| sort -n" at the end of the command.)
echo -e "\n\n#------------------------#"
echo "Test code execution:"
srun -l -u -N 1 -n 3 -c 16 --gres=gpu:6 --gpus-per-task=2 --gpu-bind=closest ${theExe}

#----
#Printing information of finished job steps:
echo -e "\n\n#------------------------#"
echo "Printing information of finished jobs steps using sacct:"
sacct -j ${SLURM_JOBID} -o jobid%20,Start%20,elapsed%20

#----
#Done
echo -e "\n\n#------------------------#"
echo "Done"


...

Column
width900px


Code Block
languagebash
themeEmacs
titleListing N. exampleScript_1NodeExclusive_8GPUs_jobPacking.sh
linenumberstrue
#!/bin/bash --login
#SBATCH --job-name=JobPacking8GPUsExclusive-bindMethod1
#SBATCH --partition=gpu
#SBATCH --nodes=1              #1 node in this example 
#SBATCH --exclusive            #All resources of the node are exclusive to this job
#                              #8 GPUs per node (8 "allocation-packs" in total for the job)
#SBATCH --time=00:05:00
#SBATCH --account=<yourProject>-gpu #IMPORTANT: use your own project and the -gpu suffix

#----
#Loading needed modules (adapt this for your own purposes):
module load PrgEnv-cray
module load rocm/<VERSION> craype-accel-amd-gfx90a
echo -e "\n\n#------------------------#"
module list

#----
#Printing the status of the given allocation
echo -e "\n\n#------------------------#"
echo "Printing from scontrol:"
scontrol show job ${SLURM_JOBID}

#----
#Job Packing Wrapper: Each srun-task will use a different instance of the executable.
jobPackingWrapper="jobPackingWrapper.sh"

#----
#MPI & OpenMP settings
#Not needed for 1-GPU job steps: export MPICH_GPU_SUPPORT_ENABLED=1 #This allows for GPU-aware MPI communication among GPUs
export OMP_NUM_THREADS=1           #This controls the real CPU-cores per task for the executable

#----
#Execution
#Note: srun needs the explicit indication of the full set of parameters for the use of resources in the job step.
#      These are independent of the allocation parameters (which are not inherited by srun).
#      "-c 8" is used to force allocation of 1 task per CPU chiplet. Then, the REAL number of threads
#         for the code SHOULD be defined by the environment variables above.
#      (The "-l" option is for displaying, at the beginning of each line, the taskID that generates the output.)
#      (The "-u" option is for unbuffered output, so that output is displayed as soon as it's generated.)
echo -e "\n\n#------------------------#"
echo "Test code execution:"
srun -l -u -N 1 -n 8 -c 8 --gres=gpu:8 --gpus-per-task=1 --gpu-bind=closest ./${jobPackingWrapper}

#----
#Printing information of finished job steps:
echo -e "\n\n#------------------------#"
echo "Printing information of finished jobs steps using sacct:"
sacct -j ${SLURM_JOBID} -o jobid%20,Start%20,elapsed%20

#----
#Done
echo -e "\n\n#------------------------#"
echo "Done" 


...