...

export MPICH_GPU_SUPPORT_ENABLED=1
export OMP_NUM_THREADS=1
srun -N 1 -n 8 -c 8 --gpus-per-node=8 --gpus-per-task=1 --gpu-bind=closest myMPIExecutable

...

Column
width900px


Code Block
languagebash
themeDJango
titleTerminal N. Example error message for a GPU-aware MPI code
$ srun -N 1 -n 8 -c 8 --gpus-per-node=8 --gpus-per-task=1 --gpu-bind=closest ./myCode_mpiGPU.exe
GTL_DEBUG: [0] hsa_amd_ipc_memory_attach (in gtlt_hsa_ops.c at line 1544):
HSA_STATUS_ERROR_INVALID_ARGUMENT: One of the actual arguments does not meet a precondition stated in the documentation of the corresponding formal argument.
MPICH ERROR [Rank 0] [job id 339192.2] [Mon Sep  4 13:00:27 2023] [nid001004] - Abort(407515138) (rank 0 in comm 0):
Fatal error in PMPI_Waitall: Invalid count, error stack:
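
This kind of failure is commonly seen when a GPU-aware MPI code is launched without GPU support enabled in the MPI library. A minimal sketch of the usual first check, based on the MPICH_GPU_SUPPORT_ENABLED setting used throughout this page (the executable name is the same placeholder as above):

Column
width900px


Code Block
languagebash
themeDJango
titleTerminal N. Re-running with GPU support enabled in the MPI library (illustrative only)
$ export MPICH_GPU_SUPPORT_ENABLED=1
$ srun -N 1 -n 8 -c 8 --gpus-per-node=8 --gpus-per-task=1 --gpu-bind=closest ./myCode_mpiGPU.exe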


...

export MPICH_GPU_SUPPORT_ENABLED=1
export OMP_NUM_THREADS=1
srun -N 1 -n 8 -c 8 --gpus-per-node=8 --cpu-bind=${CPU_BIND} ./selectGPU_X.sh myMPIExecutable

The wrapper will be run by each of the 8 tasks spawned by srun (-n 8) and will assign a single, different value of ROCR_VISIBLE_DEVICES to each task. Furthermore, the task with SLURM_LOCALID=0 will receive GCD 0 (Bus C1) as the only visible Slurm GPU for that task, the task with SLURM_LOCALID=1 will receive GCD 1 (Bus C6), and so forth. A minimal sketch of such a wrapper is shown below.
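
The exact contents of selectGPU_X.sh are not listed on this page; the sketch below only illustrates the behaviour just described (one GCD per task, chosen from SLURM_LOCALID) and is an assumption, not the real script:

Column
width900px


Code Block
languagebash
themeEmacs
titleListing N. Minimal sketch of a per-task GPU selection wrapper (illustrative only)
#!/bin/bash
#Give this task a single GCD (logical/Slurm GPU) based on its local ID.
#The real selectGPU_X.sh may reorder devices to match the node topology;
#the direct mapping below is an assumption for illustration only.
export ROCR_VISIBLE_DEVICES=$SLURM_LOCALID
#Run the command passed as arguments (e.g. myMPIExecutable):
exec "$@"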

...

export MPICH_GPU_SUPPORT_ENABLED=1
export OMP_NUM_THREADS=1
CPU_BIND="map_cpu:49,57,17,25,0,9,33,41"
srun -N 1 -n 8 -c 8 --gpus-per-node=8 --cpu-bind=${CPU_BIND} ./selectGPU_X.sh myMPIExecutable

This provides the optimal binding for a job that requires 8 single-threaded CPU tasks with 1 GCD (logical/Slurm GPU) per task. The CPU affinity that each task actually receives can be checked as sketched below.
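
As a quick check (not part of the original recipe, and assuming the standard taskset utility is available on the compute nodes), each task can print the affinity it was given:

Column
width900px


Code Block
languagebash
themeDJango
titleTerminal N. Quick check of the per-task CPU affinity (illustrative only)
$ srun -N 1 -n 8 -c 8 --gpus-per-node=8 --cpu-bind=${CPU_BIND} \
  bash -c 'echo "${SLURM_LOCALID}: $(taskset -cp $$)"' | sort -n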

...

Column
width900px


Code Block
languagebash
themeDJango
titleTerminal N. Explaining the use of the script "generate_CPU_BIND.sh" from an salloc session
$ salloc -N 1 --gpus-per-node=3 -A <yourProject>-gpu --partition=gpu-dev
salloc: Granted job allocation 1370877


$ scontrol show jobid $SLURM_JOBID
JobId=1370877 JobName=interactive
   UserId=quokka(20146) GroupId=quokka(20146) MCS_label=N/A
   Priority=16818 Nice=0 Account=rottnest0001-gpu QOS=normal
   JobState=RUNNING Reason=None Dependency=(null)
   Requeue=1 Restarts=0 BatchFlag=0 Reboot=0 ExitCode=0:0
   RunTime=00:00:48 TimeLimit=01:00:00 TimeMin=N/A
   SubmitTime=16:45:41 EligibleTime=16:45:41
   AccrueTime=Unknown
   StartTime=16:45:41 EndTime=17:45:41 Deadline=N/A
   SuspendTime=None SecsPreSuspend=0 LastSchedEval=16:45:41 Scheduler=Main
   Partition=gpu AllocNode:Sid=joey-02:253180
   ReqNodeList=(null) ExcNodeList=(null)
   NodeList=nid001004
   BatchHost=nid001004
   NumNodes=1 NumCPUs=48 NumTasks=1 CPUs/Task=1 ReqB:S:C:T=0:0:*:1
   TRES=cpu=48,mem=88320M,node=1,billing=192,gres/gpu=3
   Socks/Node=* NtasksPerN:B:S:C=0:0:*:1 CoreSpec=*
   MinCPUsNode=1 MinMemoryNode=0 MinTmpDiskNode=0
   Features=(null) DelayBoot=00:00:00
   OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
   Command=(null)
   WorkDir=/scratch/rottnest0001/quokka/hello_jobstep
   Power=
   CpusPerTres=gres:gpu:8
   MemPerTres=gpu:29440
   TresPerNode=gres:gpu:3   


$ rocm-smi --showhw
======================= ROCm System Management Interface =======================
============================ Concise Hardware Info =============================
GPU  DID   GFX RAS   SDMA RAS  UMC RAS   VBIOS           BUS 
0    7408  DISABLED  ENABLED   DISABLED  113-D65201-042  0000:C9:00.0 
1    7408  DISABLED  ENABLED   DISABLED  113-D65201-042  0000:D1:00.0 
2    7408  DISABLED  ENABLED   DISABLED  113-D65201-042  0000:D6:00.0 
================================================================================
============================= End of ROCm SMI Log ==============================


$ generate_CPU_BIND.sh map_cpu
map_cpu:21,2,14


$ generate_CPU_BIND.sh mask_cpu
mask_cpu:0000000000FF0000,00000000000000FF,000000000000FF00
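
Each comma-separated hexadecimal mask selects the 8 cores reserved for one task; for example, 0000000000FF0000 has bits 16 to 23 set, i.e. cores 16-23. The small loop below is not part of generate_CPU_BIND.sh; it is only an illustrative way of expanding such a mask into the list of cores it selects:

Column
width900px


Code Block
languagebash
themeDJango
titleTerminal N. Expanding a mask_cpu hexadecimal mask into a list of cores (illustrative only)
$ mask=0x0000000000FF0000
$ for core in $(seq 0 63); do (( (mask >> core) & 1 )) && echo -n "$core "; done; echo
16 17 18 19 20 21 22 23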


...

export MPICH_GPU_SUPPORT_ENABLED=1
export OMP_NUM_THREADS=1
CPU_BIND=$(generate_CPU_BIND.sh map_cpu)
srun -N 1 -n 8 -c 8 --gpus-per-node=8 --cpu-bind=${CPU_BIND} ./selectGPU_X.sh myMPIExecutable

...

Column
width900px


Code Block
languagebash
themeDJango
titleTerminal N. Explaining the use of the "hello_jobstep" code from an salloc session (allocation and check)
$ salloc -N 1 --gpus-per-node=3 -A <yourProject>-gpu --partition=gpu-dev
salloc: Granted job allocation 339185

$ scontrol show jobid $SLURM_JOBID
JobId=339185 JobName=interactive
   UserId=quokka(20146) GroupId=quokka(20146) MCS_label=N/A
   Priority=16818 Nice=0 Account=rottnest0001-gpu QOS=normal
   JobState=RUNNING Reason=None Dependency=(null)
   Requeue=1 Restarts=0 BatchFlag=0 Reboot=0 ExitCode=0:0
   RunTime=00:00:48 TimeLimit=01:00:00 TimeMin=N/A
   SubmitTime=16:45:41 EligibleTime=16:45:41
   AccrueTime=Unknown
   StartTime=16:45:41 EndTime=17:45:41 Deadline=N/A
   SuspendTime=None SecsPreSuspend=0 LastSchedEval=16:45:41 Scheduler=Main
   Partition=gpu AllocNode:Sid=joey-02:253180
   ReqNodeList=(null) ExcNodeList=(null)
   NodeList=nid001004
   BatchHost=nid001004
   NumNodes=1 NumCPUs=48 NumTasks=1 CPUs/Task=1 ReqB:S:C:T=0:0:*:1
   TRES=cpu=48,mem=88320M,node=1,billing=192,gres/gpu=3
   Socks/Node=* NtasksPerN:B:S:C=0:0:*:1 CoreSpec=*
   MinCPUsNode=1 MinMemoryNode=0 MinTmpDiskNode=0
   Features=(null) DelayBoot=00:00:00
   OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
   Command=(null)
   WorkDir=/scratch/rottnest0001/quokka/hello_jobstep
   Power=
   CpusPerTres=gres:gpu:8
   MemPerTres=gpu:29440
   TresPerNode=gres:gpu:3   


...

Column
width900px


Code Block
languagebash
themeDJango
titleTerminal N. Testing srun settings (method 1) for optimal binding for a pure MPI job with 1 GPU per task.
$ export OMP_NUM_THREADS=1; srun -N 1 -n 3 -c 8 --gpus-per-node=3 --gpus-per-task=1 --gpu-bind=closest ./hello_jobstep | sort -n
MPI 000 - OMP 000 - HWT 002 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID d1
MPI 001 - OMP 000 - HWT 009 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID d6
MPI 002 - OMP 000 - HWT 017 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID c9


...

Column
width900px


Code Block
languagebash
themeDJango
titleTerminal N. Testing "manual" method (method 2) for optimal binding for pure MPI job 1 GPU per task.
$ CPU_BIND=$(generate_CPU_BIND.sh map_cpu)
$ echo $CPU_BIND
map_cpu:16,3,15

$ export OMP_NUM_THREADS=1; srun -N 1 -n 3 -c 8 --gpus-per-node=3 --cpu-bind=${CPU_BIND} ./selectGPU_X.sh ./hello_jobstep | sort -n
MPI 000 - OMP 000 - HWT 016 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID c9
MPI 001 - OMP 000 - HWT 003 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 1 - GPU_Bus_ID d1
MPI 002 - OMP 000 - HWT 015 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 2 - GPU_Bus_ID d6


...

Column
width900px


Code Block
languagebash
themeDJango
titleTerminal N. Testing srun settings (method 1) for optimal binding for a case with 4 CPU threads per task and 1 GPU per task
$ export OMP_NUM_THREADS=4; srun -N 1 -n 3 -c 8 --gpus-per-node=3 --gpus-per-task=1 --gpu-bind=closest ./hello_jobstep | sort -n
MPI 000 - OMP 000 - HWT 000 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID d1
MPI 000 - OMP 001 - HWT 003 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID d1
MPI 000 - OMP 002 - HWT 005 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID d1
MPI 000 - OMP 003 - HWT 006 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID d1
MPI 001 - OMP 000 - HWT 008 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID d6
MPI 001 - OMP 001 - HWT 011 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID d6
MPI 001 - OMP 002 - HWT 013 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID d6
MPI 001 - OMP 003 - HWT 014 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID d6
MPI 002 - OMP 000 - HWT 016 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID c9
MPI 002 - OMP 001 - HWT 019 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID c9
MPI 002 - OMP 002 - HWT 021 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID c9
MPI 002 - OMP 003 - HWT 022 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID c9
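
The four OpenMP threads of each task land somewhere within that task's 8-core slot (cores 0-7, 8-15 and 16-23 above). If a fixed thread placement inside each slot is also desired, the standard OpenMP binding variables can be added; they are not used in the examples on this page and are shown only as an optional refinement:

Column
width900px


Code Block
languagebash
themeDJango
titleTerminal N. Optional OpenMP thread pinning inside each task's core slot (illustrative only)
$ export OMP_NUM_THREADS=4
$ export OMP_PLACES=cores      #one place per physical core
$ export OMP_PROC_BIND=close   #pack the threads onto neighbouring cores of the slot
$ srun -N 1 -n 3 -c 8 --gpus-per-node=3 --gpus-per-task=1 --gpu-bind=closest ./hello_jobstep | sort -n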


...

Column
width900px


Code Block
languagebash
themeDJango
titleTerminal N. Testing "manual" method (method 2) for optimal binding for a case with 4 CPU threads per task and 1 GPU per task
$ CPU_BIND=$(generate_CPU_BIND.sh mask_cpu)
$ echo $CPU_BIND
mask_cpu:0000000000FF0000,00000000000000FF,000000000000FF00

$ export OMP_NUM_THREADS=4; srun -N 1 -n 3 -c 8 --gpus-per-node=3 --cpu-bind=${CPU_BIND} ./selectGPU_X.sh ./hello_jobstep | sort -n
MPI 000 - OMP 000 - HWT 016 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID c9
MPI 000 - OMP 001 - HWT 018 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID c9
MPI 000 - OMP 002 - HWT 021 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID c9
MPI 000 - OMP 003 - HWT 022 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID c9
MPI 001 - OMP 000 - HWT 000 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 1 - GPU_Bus_ID d1
MPI 001 - OMP 001 - HWT 003 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 1 - GPU_Bus_ID d1
MPI 001 - OMP 002 - HWT 005 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 1 - GPU_Bus_ID d1
MPI 001 - OMP 003 - HWT 006 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 1 - GPU_Bus_ID d1
MPI 002 - OMP 000 - HWT 008 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 2 - GPU_Bus_ID d6
MPI 002 - OMP 001 - HWT 011 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 2 - GPU_Bus_ID d6
MPI 002 - OMP 002 - HWT 013 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 2 - GPU_Bus_ID d6
MPI 002 - OMP 003 - HWT 014 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 2 - GPU_Bus_ID d6


...

#SBATCH --nodes=1              #1 node in this example 
#SBATCH --gpus-per-node=1      #1 GPU per node (1 "allocation-pack" in total for the job)

...

Column
width900px


Code Block
languagebash
themeEmacs
titleListing N. exampleScript_1NodeShared_1GPU.sh
linenumberstrue
#!/bin/bash --login
#SBATCH --job-name=1GPUSharedNode
#SBATCH --partition=gpu
#SBATCH --nodes=1              #1 node in this example 
#SBATCH --gpus-per-node=1      #1 GPU per node (1 "allocation-pack" in total for the job)
#SBATCH --time=00:05:00
#SBATCH --account=<yourProject>-gpu #IMPORTANT: use your own project and the -gpu suffix
#(Note that there is no request for exclusive access to the node)

#----
#Loading needed modules (adapt this for your own purposes):
module load PrgEnv-cray
module load rocm craype-accel-amd-gfx90a
echo -e "\n\n#------------------------#"
module list

#----
#Printing the status of the given allocation
echo -e "\n\n#------------------------#"
echo "Printing from scontrol:"
scontrol show job ${SLURM_JOBID}

#----
#Definition of the executable (we assume the example code has been compiled and is available in $MYSCRATCH):
exeDir=$MYSCRATCH/hello_jobstep
exeName=hello_jobstep
theExe=$exeDir/$exeName

#----
#MPI & OpenMP settings
#Not needed for a single GPU: export MPICH_GPU_SUPPORT_ENABLED=1 #This allows for GPU-aware MPI communication among GPUs
export OMP_NUM_THREADS=1           #This controls the real CPU-cores per task for the executable

#----
#Execution
#Note: srun needs explicit indication of the full parameters for the use of resources in the job step.
#      These are independent of the allocation parameters (which are not inherited by srun).
#      For optimal GPU binding using Slurm options,
#      "--gpus-per-task=1" and "--gpu-bind=closest" create the optimal binding of GPUs.
#      (Although in this case this can be avoided, as only 1 "allocation-pack" has been requested.)
echo -e "\n\n#------------------------#"
echo "Test code execution:"
srun -l -u -N 1 -n 1 -c 8 --gpus-per-node=1 ${theExe} | sort -n

#----
#Printing information of finished job steps:
echo -e "\n\n#------------------------#"
echo "Printing information of finished jobs steps using sacct:"
sacct -j ${SLURM_JOBID} -o jobid%20,Start%20,elapsed%20

#----
#Done
echo -e "\n\n#------------------------#"
echo "Done"


...

#SBATCH --nodes=1              #1 node in this example 
#SBATCH --gpus-per-node=3      #3 GPUs per node (3 "allocation-packs" in total for the job)

...

#SBATCH --nodes=1              #1 node in this example 
#SBATCH --gpus-per-node=3      #3 GPUs per node (3 "allocation-packs" in total for the job)

...

Column
width900px


Code Block
languagebash
themeEmacs
titleListing N. exampleScript_1NodeExclusive_8GPUs_jobPacking.sh
linenumberstrue
#!/bin/bash --login
#SBATCH --job-name=JobPacking8GPUsExclusive-bindMethod1
#SBATCH --partition=gpu
#SBATCH --nodes=1              #1 node in this example 
#SBATCH --exclusive            #All resources of the node are exclusive to this job
#                              #8 GPUs per node (8 "allocation-packs" in total for the job)
#SBATCH --time=00:05:00
#SBATCH --account=<yourProject>-gpu #IMPORTANT: use your own project and the -gpu suffix

#----
#Loading needed modules (adapt this for your own purposes):
module load PrgEnv-cray
module load rocm craype-accel-amd-gfx90a
echo -e "\n\n#------------------------#"
module list

#----
#Printing the status of the given allocation
echo -e "\n\n#------------------------#"
echo "Printing from scontrol:"
scontrol show job ${SLURM_JOBID}

#----
#Job Packing Wrapper: each srun task will use a different instance of the executable
#(a minimal sketch of such a wrapper is given after this script).
jobPackingWrapper="jobPackingWrapper.sh"

#----
#MPI & OpenMP settings
#No need for 1-GPU steps: export MPICH_GPU_SUPPORT_ENABLED=1 #This allows for GPU-aware MPI communication among GPUs
export OMP_NUM_THREADS=1           #This controls the real CPU-cores per task for the executable

#----
#Execution
#Note: srun needs explicit indication of the full parameters for the use of resources in the job step.
#      These are independent of the allocation parameters (which are not inherited by srun).
echo -e "\n\n#------------------------#"
echo "Test code execution:"
srun -l -u -N 1 -n 8 -c 8 --gpus-per-node=8 --gpus-per-task=1 --gpu-bind=closest ./${jobPackingWrapper}

#----
#Printing information of finished job steps:
echo -e "\n\n#------------------------#"
echo "Printing information of finished jobs steps using sacct:"
sacct -j ${SLURM_JOBID} -o jobid%20,Start%20,elapsed%20

#----
#Done
echo -e "\n\n#------------------------#"
echo "Done" 


...