Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

Column
width900px


Code Block
languagebash
themeEmacs
titleListing N. exampleScript_1NodeShared_1GPU.sh
linenumberstrue
#!/bin/bash --login
#SBATCH --job-name=1GPUSharedNode
#SBATCH --partition=gpu
#SBATCH --nodes=1              #1 node in this example
#SBATCH --gres=gpu:1           #1 GPU per node (1 "allocation-pack" in total for the job)
#SBATCH --time=00:05:00
#SBATCH --account=<yourProject>-gpu #IMPORTANT: use your own project and the -gpu suffix
#(Note that there is no request for exclusive access to the node)

#----
#Purpose: example job that runs the hello_jobstep test code as a single task
#         with 1 GPU, on a node that may be shared with other jobs.

#----
#Loading needed modules (adapt this for your own purposes):
module load PrgEnv-cray
module load rocm craype-accel-amd-gfx90a
echo -e "\n\n#------------------------#"
module list

#----
#Printing the status of the given allocation
echo -e "\n\n#------------------------#"
echo "Printing from scontrol:"
scontrol show job "${SLURM_JOBID}"

#----
#Definition of the executable (we assume the example code has been compiled and is available in $MYSCRATCH):
exeDir="${MYSCRATCH}/hello_jobstep"
exeName="hello_jobstep"
theExe="${exeDir}/${exeName}"

#----
#MPI & OpenMP settings
#Not needed for 1 GPU: export MPICH_GPU_SUPPORT_ENABLED=1 #This allows for GPU-aware MPI communication among GPUs
export OMP_NUM_THREADS=1           #This controls the real CPU-cores per task for the executable

#----
#Execution
#Note: srun needs the explicit indication of the full parameters for use of resources in the job step.
#      These are independent from the allocation parameters (which are not inherited by srun)
#      For optimal GPU binding using slurm options,
#      "--gpus-per-task=1" and "--gpu-bind=closest" create the optimal binding of GPUs
#      (Although in this case this can be avoided as only 1 "allocation-pack" has been requested)
#      (The "-l" option is for displaying, at the beginning of each line, the taskID that generates the output.)
#      (The "-u" option is for unbuffered output, so that output is displayed as soon as it's generated.)
#      (The "| sort -n" at the end of the command sorts the output for clarity; remove it if sorting is not needed.)
#      (The pipe must stay on the same line as srun: a line that begins with "|" is a bash syntax error.)
echo -e "\n\n#------------------------#"
echo "Test code execution:"
srun -l -u -N 1 -n 1 -c 8 --gres=gpu:1 --gpus-per-task=1 --gpu-bind=closest "${theExe}" | sort -n

#----
#Printing information of finished job steps:
echo -e "\n\n#------------------------#"
echo "Printing information of finished jobs steps using sacct:"
sacct -j "${SLURM_JOBID}" -o jobid%20,Start%20,elapsed%20

#----
#Done
echo -e "\n\n#------------------------#"
echo "Done"


...

Column
width900px


Code Block
languagebash
themeEmacs
titleListing N. exampleScript_2NodesExclusive_16GPUs_8VisiblePerTask.sh
linenumberstrue
#!/bin/bash --login
#SBATCH --job-name=16GPUExclusiveNode-8GPUsVisiblePerTask
#SBATCH --partition=gpu
#SBATCH --nodes=2              #2 nodes in this example
#SBATCH --exclusive            #All resources of the node are exclusive to this job
#                              #8 GPUs per node (16 "allocation-packs" in total for the job)
#SBATCH --time=00:05:00
#SBATCH --account=<yourProject>-gpu #IMPORTANT: use your own project and the -gpu suffix

#----
#Purpose: example job that runs the hello_jobstep test code as 16 tasks over
#         2 exclusive nodes, with all 8 GPUs of a node visible to each task.

#----
#Loading needed modules (these may not be needed for Tensorflow) (adapt this for your own purposes):
module load PrgEnv-cray
module load rocm craype-accel-amd-gfx90a
echo -e "\n\n#------------------------#"
module list

#----
#Printing the status of the given allocation
echo -e "\n\n#------------------------#"
echo "Printing from scontrol:"
scontrol show job "${SLURM_JOBID}"

#----
#Definition of the executable (we assume the example code has been compiled and is available in $MYSCRATCH):
exeDir="${MYSCRATCH}/hello_jobstep"
exeName="hello_jobstep"
theExe="${exeDir}/${exeName}"

#----
#MPI & OpenMP settings if needed (these may not be needed for Tensorflow):
export MPICH_GPU_SUPPORT_ENABLED=1 #This allows for GPU-aware MPI communication among GPUs
export OMP_NUM_THREADS=1           #This controls the real CPU-cores per task for the executable

#----
#Execution
#Note: srun needs the explicit indication of the full parameters for use of resources in the job step.
#      These are independent from the allocation parameters (which are not inherited by srun)
#      No optimal binding is provided by the scheduler.
#      Therefore, "--gpus-per-task" and "--gpu-bind" are not used.
#      Each task has access to all the 8 available GPUs in the node where it's running.
#      Optimal use of resources is now the responsibility of the code.
#      (The "-l" option is for displaying, at the beginning of each line, the taskID that generates the output.)
#      (The "-u" option is for unbuffered output, so that output is displayed as soon as it's generated.)
#      (The "| sort -n" at the end of the command sorts the output for clarity; remove it if sorting is not needed.)
#      (The pipe must stay on the same line as srun: a line that begins with "|" is a bash syntax error.)
echo -e "\n\n#------------------------#"
echo "Test code execution:"
srun -l -u -N 2 -n 16 -c 8 --gres=gpu:8 "${theExe}" | sort -n

#----
#Printing information of finished job steps:
echo -e "\n\n#------------------------#"
echo "Printing information of finished jobs steps using sacct:"
sacct -j "${SLURM_JOBID}" -o jobid%20,Start%20,elapsed%20

#----
#Done
echo -e "\n\n#------------------------#"
echo "Done"


...

Column
width900px


Code Block
languagebash
themeEmacs
titleListing N. exampleScript_1NodeShared_6GPUs_2VisiblePerTask.sh
linenumberstrue
#!/bin/bash --login
#SBATCH --job-name=6GPUSharedNode-2GPUsVisiblePerTask
#SBATCH --partition=gpu
#SBATCH --nodes=1              #1 node in this example
#SBATCH --gres=gpu:6           #6 GPUs per node (6 "allocation-packs" in total for the job)
#SBATCH --time=00:05:00
#SBATCH --account=<yourProject>-gpu #IMPORTANT: use your own project and the -gpu suffix

#----
#Purpose: example job that runs the hello_jobstep test code as 3 tasks on a
#         single node that may be shared, with 2 GPUs visible to each task.

#----
#Loading needed modules (these may not be needed for Tensorflow) (adapt this for your own purposes):
module load PrgEnv-cray
module load rocm craype-accel-amd-gfx90a
echo -e "\n\n#------------------------#"
module list

#----
#Printing the status of the given allocation
echo -e "\n\n#------------------------#"
echo "Printing from scontrol:"
scontrol show job "${SLURM_JOBID}"

#----
#Definition of the executable (we assume the example code has been compiled and is available in $MYSCRATCH):
exeDir="${MYSCRATCH}/hello_jobstep"
exeName="hello_jobstep"
theExe="${exeDir}/${exeName}"

#----
#MPI & OpenMP settings if needed (these may not be needed for Tensorflow):
export MPICH_GPU_SUPPORT_ENABLED=1 #This allows for GPU-aware MPI communication among GPUs
export OMP_NUM_THREADS=1           #This controls the real CPU-cores per task for the executable

#----
#Execution
#Note: srun needs the explicit indication of the full parameters for use of resources in the job step.
#      These are independent from the allocation parameters (which are not inherited by srun)
#      For best possible GPU binding using slurm options,
#      "--gpus-per-task=2" and "--gpu-bind=closest" will provide the best GPUs to the tasks.
#      But best is still not optimal.
#      Each task has access to 2 available GPUs in the node where it's running.
#      Optimal use of resources of each of the 2 GPUs accessible per task is now the responsibility of the code.
#      IMPORTANT: Note the use of "-c 16" to "reserve" 2 chiplets per task and be consistent with
#                 the use of "--gpus-per-task=2" to "reserve" 2 GPUs per task
#      (The "-l" option is for displaying, at the beginning of each line, the taskID that generates the output.)
#      (The "-u" option is for unbuffered output, so that output is displayed as soon as it's generated.)
#      (The "| sort -n" at the end of the command sorts the output for clarity; remove it if sorting is not needed.)
echo -e "\n\n#------------------------#"
echo "Test code execution:"
srun -l -u -N 1 -n 3 -c 16 --gres=gpu:6 --gpus-per-task=2 --gpu-bind=closest "${theExe}" | sort -n

#----
#Printing information of finished job steps:
echo -e "\n\n#------------------------#"
echo "Printing information of finished jobs steps using sacct:"
sacct -j "${SLURM_JOBID}" -o jobid%20,Start%20,elapsed%20

#----
#Done
echo -e "\n\n#------------------------#"
echo "Done"


...

Column
width900px


Code Block
languagebash
themeEmacs
titleListing N. exampleScript_1NodeExclusive_8GPUs_jobPacking.sh
linenumberstrue
#!/bin/bash --login
#SBATCH --job-name=JobPacking8GPUsExclusive-bindMethod1
#SBATCH --partition=gpu
#SBATCH --nodes=1              #1 node in this example
#SBATCH --exclusive            #All resources of the node are exclusive to this job
#                              #8 GPUs per node (8 "allocation-packs" in total for the job)
#SBATCH --time=00:05:00
#SBATCH --account=<yourProject>-gpu #IMPORTANT: use your own project and the -gpu suffix

#----
#Purpose: example of job packing on one exclusive node: a single srun step
#         launches 8 tasks, each with 1 GPU, and every task runs the wrapper script.

#----
#Loading needed modules (adapt this for your own purposes):
module load PrgEnv-cray
module load rocm craype-accel-amd-gfx90a
echo -e "\n\n#------------------------#"
module list

#----
#Printing the status of the given allocation
echo -e "\n\n#------------------------#"
echo "Printing from scontrol:"
scontrol show job "${SLURM_JOBID}"

#----
#Job Packing Wrapper: Each srun-task will use a different instance of the executable.
#(The wrapper is launched as "./jobPackingWrapper.sh", so it needs to be in the working directory of the job.)
jobPackingWrapper="jobPackingWrapper.sh"

#----
#MPI & OpenMP settings
#Not needed for 1 GPU steps: export MPICH_GPU_SUPPORT_ENABLED=1 #This allows for GPU-aware MPI communication among GPUs
export OMP_NUM_THREADS=1           #This controls the real CPU-cores per task for the executable

#----
#Execution
#Note: srun needs the explicit indication of the full parameters for use of resources in the job step.
#      These are independent from the allocation parameters (which are not inherited by srun)
#      (The "-l" option is for displaying, at the beginning of each line, the taskID that generates the output.)
#      (The "-u" option is for unbuffered output, so that output is displayed as soon as it's generated.)
echo -e "\n\n#------------------------#"
echo "Test code execution:"
srun -l -u -N 1 -n 8 -c 8 --gres=gpu:8 --gpus-per-task=1 --gpu-bind=closest "./${jobPackingWrapper}"

#----
#Printing information of finished job steps:
echo -e "\n\n#------------------------#"
echo "Printing information of finished jobs steps using sacct:"
sacct -j "${SLURM_JOBID}" -o jobid%20,Start%20,elapsed%20

#----
#Done
echo -e "\n\n#------------------------#"
echo "Done"


...