...
export MPICH_GPU_SUPPORT_ENABLED=1
export OMP_NUM_THREADS=1
srun -N 1 -n 8 -c 8 --gres=gpu:8 --gpus-per-task=1 --gpu-bind=closest myMPIExecutable
...
Terminal N. Example error message for some GPU-aware MPI
$ srun -N 1 -n 8 -c 8 --gres=gpu:8 --gpus-per-task=1 --gpu-bind=closest ./myCode_mpiGPU.exe
GTL_DEBUG: [0] hsa_amd_ipc_memory_attach (in gtlt_hsa_ops.c at line 1544):
HSA_STATUS_ERROR_INVALID_ARGUMENT: One of the actual arguments does not meet a precondition stated in the documentation of the corresponding formal argument.
MPICH ERROR [Rank 0] [job id 339192.2] [Mon Sep 4 13:00:27 2023] [nid001004] - Abort(407515138) (rank 0 in comm 0):
Fatal error in PMPI_Waitall: Invalid count, error stack:
...
export MPICH_GPU_SUPPORT_ENABLED=1
export OMP_NUM_THREADS=1
srun -N 1 -n 8 -c 8 --gres=gpu:8 --cpu-bind=${CPU_BIND} ./selectGPU_X.sh myMPIExecutable
The wrapper will be run by each of the 8 tasks spawned by srun (-n 8) and will assign a different, single value of ROCR_VISIBLE_DEVICES to each task. Furthermore, the task with SLURM_LOCALID=0 will receive GCD 0 (Bus C1) as its only visible Slurm GPU, the task with SLURM_LOCALID=1 will receive GCD 1 (Bus C6), and so forth.
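The wrapper itself is not reproduced here. As an illustration only, the following is a minimal sketch of what a wrapper like selectGPU_X.sh could contain, assuming one GCD per task and the direct SLURM_LOCALID-to-device mapping described above:
#!/bin/bash
#Hypothetical sketch of a one-GCD-per-task selection wrapper (illustrative only; not the site-provided script).
#Expose to this task only the Slurm GPU whose index equals the task's local ID on the node:
export ROCR_VISIBLE_DEVICES=$SLURM_LOCALID
#Then replace this shell with the command given as arguments (e.g. myMPIExecutable):
exec "$@"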
...
export MPICH_GPU_SUPPORT_ENABLED=1
export OMP_NUM_THREADS=1
CPU_BIND="map_cpu:49,57,17,25,0,9,33,41"
srun -N 1 -n 8 -c 8 --gres=gpu:8 --cpu-bind=${CPU_BIND} ./selectGPU_X.sh myMPIExecutable
This provides the optimal binding for a job that uses 8 single-threaded CPU tasks with 1 GCD (logical/Slurm GPU) per task.
...
Terminal N. Explaining the use of the script "generate_CPU_BIND.sh" from an salloc session
$ salloc -N 1 --gres=gpu:3 -A yourProject-gpu --partition=gpu-dev
salloc: Granted job allocation 1370877
$ scontrol show jobid $SLURM_JOBID
JobId=1370877 JobName=interactive
UserId=quokka(20146) GroupId=quokka(20146) MCS_label=N/A
Priority=16818 Nice=0 Account=rottnest0001-gpu QOS=normal
JobState=RUNNING Reason=None Dependency=(null)
Requeue=1 Restarts=0 BatchFlag=0 Reboot=0 ExitCode=0:0
RunTime=00:00:48 TimeLimit=01:00:00 TimeMin=N/A
SubmitTime=16:45:41 EligibleTime=16:45:41
AccrueTime=Unknown
StartTime=16:45:41 EndTime=17:45:41 Deadline=N/A
SuspendTime=None SecsPreSuspend=0 LastSchedEval=16:45:41 Scheduler=Main
Partition=gpu AllocNode:Sid=joey-02:253180
ReqNodeList=(null) ExcNodeList=(null)
NodeList=nid001004
BatchHost=nid001004
NumNodes=1 NumCPUs=48 NumTasks=1 CPUs/Task=1 ReqB:S:C:T=0:0:*:1
TRES=cpu=48,mem=88320M,node=1,billing=192,gres/gpu=3
Socks/Node=* NtasksPerN:B:S:C=0:0:*:1 CoreSpec=*
MinCPUsNode=1 MinMemoryNode=0 MinTmpDiskNode=0
Features=(null) DelayBoot=00:00:00
OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
Command=(null)
WorkDir=/scratch/rottnest0001/quokka/hello_jobstep
Power=
CpusPerTres=gres:gpu:8
MemPerTres=gpu:29440
TresPerNode=gres:gpu:3
$ rocm-smi --showhw
======================= ROCm System Management Interface =======================
============================ Concise Hardware Info =============================
GPU DID GFX RAS SDMA RAS UMC RAS VBIOS BUS
0 7408 DISABLED ENABLED DISABLED 113-D65201-042 0000:C9:00.0
1 7408 DISABLED ENABLED DISABLED 113-D65201-042 0000:D1:00.0
2 7408 DISABLED ENABLED DISABLED 113-D65201-042 0000:D6:00.0
================================================================================
============================= End of ROCm SMI Log ==============================
$ generate_CPU_BIND.sh map_cpu
map_cpu:21,2,14
$ generate_CPU_BIND.sh mask_cpu
mask_cpu:0000000000FF0000,00000000000000FF,000000000000FF00
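The internals of generate_CPU_BIND.sh are not shown on this page. For illustration only, the following is a rough, hypothetical sketch of the kind of logic such a generator could use. The bus-to-chiplet table is inferred from the binding examples on this page (C1, C6, C9, D1 and D6 appear above; CE, D9 and DE are extrapolated), the parsing assumes the rocm-smi --showhw output format shown in the terminal above, and the real script may differ, for instance in how it chooses cores within each chiplet:
#!/bin/bash
#Hypothetical sketch of a generator like generate_CPU_BIND.sh (illustrative only; not the site-provided script).
#Usage: generate_CPU_BIND_sketch.sh map_cpu|mask_cpu
mode=${1:-map_cpu}
#First core of the 8-core CPU chiplet assumed closest to each GCD, keyed by PCI bus ID:
declare -A chipletStart=( [C1]=48 [C6]=56 [C9]=16 [CE]=24 [D1]=0 [D6]=8 [D9]=32 [DE]=40 )
entries=()
#rocm-smi --showhw prints one row per visible GCD with the PCI bus in the last column
#(see the "Concise Hardware Info" output earlier in this terminal):
while read -r bus; do
    busByte=${bus:5:2}                                        #e.g. 0000:C9:00.0 -> C9
    start=${chipletStart[$busByte]}
    [ -n "$start" ] || { echo "Unknown GPU bus $busByte" >&2; exit 1; }
    if [ "$mode" = "map_cpu" ]; then
        entries+=( "$start" )                                 #one core per task (the real script seems
                                                              #to pick a specific core within the chiplet)
    else
        entries+=( "$(printf '%016X' $(( 0xFF << start )))" ) #8-core mask covering the chiplet
    fi
done < <(rocm-smi --showhw | awk '$1 ~ /^[0-9]+$/ {print $NF}')
#Print the binding string ordered as rocm-smi lists the GCDs, which is the same order
#in which selectGPU_X.sh hands them to the tasks via SLURM_LOCALID:
IFS=,
echo "${mode}:${entries[*]}"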
...
export MPICH_GPU_SUPPORT_ENABLED=1
export OMP_NUM_THREADS=1
CPU_BIND=$(generate_CPU_BIND.sh map_cpu)
srun -N 1 -n 8 -c 8 --gres=gpu:8 --cpu-bind=${CPU_BIND} ./selectGPU_X.sh myMPIExecutable
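For tasks that run several CPU threads each, the same recipe should work with mask_cpu instead of map_cpu, so that every task is bound to a full 8-core chiplet rather than to a single core (this mirrors the 4-thread test shown further below). A hedged sketch of that variant:
export MPICH_GPU_SUPPORT_ENABLED=1
export OMP_NUM_THREADS=4                         #e.g. 4 CPU threads per task
CPU_BIND=$(generate_CPU_BIND.sh mask_cpu)        #per-chiplet masks instead of single cores
srun -N 1 -n 8 -c 8 --gres=gpu:8 --cpu-bind=${CPU_BIND} ./selectGPU_X.sh myMPIExecutable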
...
Terminal N. Explaining the use of the "hello_jobstep" code from an salloc session (allocation and check)
$ salloc -N 1 --gres=gpu:3 -A <yourProject>-gpu --partition=gpu-dev
salloc: Granted job allocation 339185
$ scontrol show jobid $SLURM_JOBID
JobId=339185 JobName=interactive
UserId=quokka(20146) GroupId=quokka(20146) MCS_label=N/A
Priority=16818 Nice=0 Account=rottnest0001-gpu QOS=normal
JobState=RUNNING Reason=None Dependency=(null)
Requeue=1 Restarts=0 BatchFlag=0 Reboot=0 ExitCode=0:0
RunTime=00:00:48 TimeLimit=01:00:00 TimeMin=N/A
SubmitTime=16:45:41 EligibleTime=16:45:41
AccrueTime=Unknown
StartTime=16:45:41 EndTime=17:45:41 Deadline=N/A
SuspendTime=None SecsPreSuspend=0 LastSchedEval=16:45:41 Scheduler=Main
Partition=gpu AllocNode:Sid=joey-02:253180
ReqNodeList=(null) ExcNodeList=(null)
NodeList=nid001004
BatchHost=nid001004
NumNodes=1 NumCPUs=48 NumTasks=1 CPUs/Task=1 ReqB:S:C:T=0:0:*:1
TRES=cpu=48,mem=88320M,node=1,billing=192,gres/gpu=3
Socks/Node=* NtasksPerN:B:S:C=0:0:*:1 CoreSpec=*
MinCPUsNode=1 MinMemoryNode=0 MinTmpDiskNode=0
Features=(null) DelayBoot=00:00:00
OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
Command=(null)
WorkDir=/scratch/rottnest0001/quokka/hello_jobstep
Power=
CpusPerTres=gres:gpu:8
MemPerTres=gpu:29440
TresPerNode=gres:gpu:3
...
Terminal N. Testing srun settings (method 1) for optimal binding for a pure MPI job with 1 GPU per task
$ export OMP_NUM_THREADS=1; srun -N 1 -n 3 -c 8 --gres=gpu:3 --gpus-per-task=1 --gpu-bind=closest ./hello_jobstep | sort -n
MPI 000 - OMP 000 - HWT 002 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID d1
MPI 001 - OMP 000 - HWT 009 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID d6
MPI 002 - OMP 000 - HWT 017 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID c9
...
Terminal N. Testing the "manual" method (method 2) for optimal binding for a pure MPI job with 1 GPU per task
$ CPU_BIND=$(generate_CPU_BIND.sh map_cpu)
$ echo $CPU_BIND
map_cpu:16,3,15
$ export OMP_NUM_THREADS=1; srun -N 1 -n 3 -c 8 --gres=gpu:3 --cpu-bind=${CPU_BIND} ./selectGPU_X.sh ./hello_jobstep | sort -n
MPI 000 - OMP 000 - HWT 016 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID c9
MPI 001 - OMP 000 - HWT 003 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 1 - GPU_Bus_ID d1
MPI 002 - OMP 000 - HWT 015 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 2 - GPU_Bus_ID d6
...
Terminal N. Testing srun settings (method 1) for optimal binding for a case with 4 CPU threads per task and 1 GPU per task
$ export OMP_NUM_THREADS=4; srun -N 1 -n 3 -c 8 --gres=gpu:3 --gpus-per-task=1 --gpu-bind=closest ./hello_jobstep | sort -n
MPI 000 - OMP 000 - HWT 000 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID d1
MPI 000 - OMP 001 - HWT 003 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID d1
MPI 000 - OMP 002 - HWT 005 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID d1
MPI 000 - OMP 003 - HWT 006 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID d1
MPI 001 - OMP 000 - HWT 008 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID d6
MPI 001 - OMP 001 - HWT 011 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID d6
MPI 001 - OMP 002 - HWT 013 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID d6
MPI 001 - OMP 003 - HWT 014 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID d6
MPI 002 - OMP 000 - HWT 016 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID c9
MPI 002 - OMP 001 - HWT 019 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID c9
MPI 002 - OMP 002 - HWT 021 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID c9
MPI 002 - OMP 003 - HWT 022 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID c9
...
Terminal N. Testing the "manual" method (method 2) for optimal binding for a case with 4 CPU threads per task and 1 GPU per task
$ CPU_BIND=$(generate_CPU_BIND.sh mask_cpu)
$ echo $CPU_BIND
mask_cpu:0000000000FF0000,00000000000000FF,000000000000FF00
$ export OMP_NUM_THREADS=4; srun -N 1 -n 3 -c 8 --gres=gpu:3 --cpu-bind=${CPU_BIND} ./selectGPU_X.sh ./hello_jobstep | sort -n
MPI 000 - OMP 000 - HWT 016 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID c9
MPI 000 - OMP 001 - HWT 018 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID c9
MPI 000 - OMP 002 - HWT 021 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID c9
MPI 000 - OMP 003 - HWT 022 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 0 - GPU_Bus_ID c9
MPI 001 - OMP 000 - HWT 000 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 1 - GPU_Bus_ID d1
MPI 001 - OMP 001 - HWT 003 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 1 - GPU_Bus_ID d1
MPI 001 - OMP 002 - HWT 005 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 1 - GPU_Bus_ID d1
MPI 001 - OMP 003 - HWT 006 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 1 - GPU_Bus_ID d1
MPI 002 - OMP 000 - HWT 008 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 2 - GPU_Bus_ID d6
MPI 002 - OMP 001 - HWT 011 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 2 - GPU_Bus_ID d6
MPI 002 - OMP 002 - HWT 013 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 2 - GPU_Bus_ID d6
MPI 002 - OMP 003 - HWT 014 - Node nid001004 - RunTime_GPU_ID 0 - ROCR_VISIBLE_GPU_ID 2 - GPU_Bus_ID d6
...
#SBATCH --nodes=1 #1 node in this example
#SBATCH --gres=gpu:1 #1 GPU per node (1 "allocation-pack" in total for the job)
...
Listing N. exampleScript_1NodeShared_1GPU.sh
#!/bin/bash --login
#SBATCH --job-name=1GPUSharedNode
#SBATCH --partition=gpu
#SBATCH --nodes=1 #1 node in this example
#SBATCH --gres=gpu:1 #1 GPU per node (1 "allocation-pack" in total for the job)
#SBATCH --time=00:05:00
#SBATCH --account=<yourProject>-gpu #IMPORTANT: use your own project and the -gpu suffix
#(Note that there is no request for exclusive access to the node)
#----
#Loading needed modules (adapt this for your own purposes):
module load PrgEnv-cray
module load rocm craype-accel-amd-gfx90a
echo -e "\n\n#------------------------#"
module list
#----
#Printing the status of the given allocation
echo -e "\n\n#------------------------#"
echo "Printing from scontrol:"
scontrol show job ${SLURM_JOBID}
#----
#Definition of the executable (we assume the example code has been compiled and is available in $MYSCRATCH):
exeDir=$MYSCRATCH/hello_jobstep
exeName=hello_jobstep
theExe=$exeDir/$exeName
#----
#MPI & OpenMP settings
#Not needed for 1GPU:export MPICH_GPU_SUPPORT_ENABLED=1 #This allows for GPU-aware MPI communication among GPUs
export OMP_NUM_THREADS=1 #This controls the real CPU-cores per task for the executable
#----
#Execution
#Note: srun needs the explicit indication of the full set of parameters for the use of resources in the job step.
#      These are independent of the allocation parameters (which are not inherited by srun)
#      For optimal GPU binding using Slurm options,
#      "--gpus-per-task=1" and "--gpu-bind=closest" create the optimal binding of GPUs
#      (Although in this case they can be omitted, as only 1 "allocation-pack" has been requested)
echo -e "\n\n#------------------------#"
echo "Test code execution:"
srun -l -u -N 1 -n 1 -c 8 --gres=gpu:1 ${theExe} | sort -n
#----
#Printing information of finished job steps:
echo -e "\n\n#------------------------#"
echo "Printing information of finished jobs steps using sacct:"
sacct -j ${SLURM_JOBID} -o jobid%20,Start%20,elapsed%20
#----
#Done
echo -e "\n\n#------------------------#"
echo "Done" |
|
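Assuming the listing is saved under the name given in its title, it can then be submitted and monitored in the usual way (the output goes to Slurm's default slurm-<jobid>.out file unless a different name is configured in the script):
sbatch exampleScript_1NodeShared_1GPU.sh     #submit the listing above
squeue -u $USER                              #check the state of your jobs
#After completion, inspect the output file (replace <jobid> with the job ID reported by sbatch):
less slurm-<jobid>.out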
...
#SBATCH --nodes=1 #1 node in this example
#SBATCH --gres=gpu:3 #3 GPUs per node (3 "allocation-packs" in total for the job)
...
#SBATCH --nodes=1 #1 node in this example
#SBATCH --gres=gpu:3 #3 GPUs per node (3 "allocation-packs" in total for the job)
...
Listing N. exampleScript_1NodeExclusive_8GPUs_jobPacking.sh
#!/bin/bash --login
#SBATCH --job-name=JobPacking8GPUsExclusive-bindMethod1
#SBATCH --partition=gpu
#SBATCH --nodes=1 #1 node in this example
#SBATCH --exclusive #All resources of the node are exclusive to this job
# #8 GPUs per node (8 "allocation-packs" in total for the job)
#SBATCH --time=00:05:00
#SBATCH --account=<yourProject>-gpu #IMPORTANT: use your own project and the -gpu suffix
#----
#Loading needed modules (adapt this for your own purposes):
module load PrgEnv-cray
module load rocm craype-accel-amd-gfx90a
echo -e "\n\n#------------------------#"
module list
#----
#Printing the status of the given allocation
echo -e "\n\n#------------------------#"
echo "Printing from scontrol:"
scontrol show job ${SLURM_JOBID}
#----
#Job Packing Wrapper: Each srun task will run a different instance of the executable (a sketch of such a wrapper is given after this listing).
jobPackingWrapper="jobPackingWrapper.sh"
#----
#MPI & OpenMP settings
#No need for 1GPU steps:export MPICH_GPU_SUPPORT_ENABLED=1 #This allows for GPU-aware MPI communication among GPUs
export OMP_NUM_THREADS=1 #This controls the real CPU-cores per task for the executable
#----
#Execution
#Note: srun needs the explicit indication of the full set of parameters for the use of resources in the job step.
#      These are independent of the allocation parameters (which are not inherited by srun)
echo -e "\n\n#------------------------#"
echo "Test code execution:"
srun -l -u -N 1 -n 8 -c 8 --gres=gpu:8 --gpus-per-task=1 --gpu-bind=closest ./${jobPackingWrapper}
#----
#Printing information of finished job steps:
echo -e "\n\n#------------------------#"
echo "Printing information of finished jobs steps using sacct:"
sacct -j ${SLURM_JOBID} -o jobid%20,Start%20,elapsed%20
#----
#Done
echo -e "\n\n#------------------------#"
echo "Done" |
|
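The wrapper jobPackingWrapper.sh referenced in the listing is not shown on this page. As an illustration only, here is a minimal sketch of what such a wrapper could look like; the case_<rank> directory layout and the hello_jobstep executable name are assumptions made for this example:
#!/bin/bash
#Hypothetical sketch of jobPackingWrapper.sh (illustrative only; the real wrapper is not listed here).
#Each srun task runs its own independent instance of the executable, selected by the task rank,
#from its own working directory and with its own output file:
caseDir=case_${SLURM_PROCID}                 #assumed per-instance directory layout
cd "$caseDir" || exit 1
#The GPU binding is already handled by the srun options (--gpus-per-task=1 --gpu-bind=closest),
#so the wrapper only needs to launch the per-instance executable:
exec ../hello_jobstep > output_${SLURM_PROCID}.log 2>&1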
...