#!/bin/bash --login
#SBATCH --job-name=16GPUExclusiveNode-8GPUsVisiblePerTask
#SBATCH --partition=gpu
#SBATCH --nodes=2 #2 nodes in this example
#SBATCH --exclusive #All resources of the node are exclusive to this job
#                  (8 GPUs per node, i.e. 16 "allocation packs" in total for the job)
#SBATCH --time=00:05:00
#SBATCH --account=<yourProject>-gpu #IMPORTANT: use your own project and the -gpu suffix
#----
#Loading needed modules (adapt this for your own purposes):
#For the hello_jobstep example:
module load PrgEnv-cray
module load rocm/<VERSION> craype-accel-amd-gfx90a
#OR for a tensorflow example:
#module load tensorflow/<version>
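#(Optional, hedged sketch: if unsure which rocm or tensorflow versions are installed on the
# system, the available versions can usually be listed with the module tool before filling in
# the placeholders above, e.g.:)
#module avail rocm
#module avail tensorflow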
echo -e "\n\n#------------------------#"
module list
#----
#Printing the status of the given allocation
echo -e "\n\n#------------------------#"
echo "Printing from scontrol:"
scontrol show job ${SLURM_JOBID}
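#(Optional, hedged sketch: for a more compact view of the same allocation, squeue can also be
# queried for this job; the exact columns shown depend on the site configuration:)
#squeue -j ${SLURM_JOBID}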
#----
#Definition of the executable (we assume the example code has been compiled and is available in $MYSCRATCH):
exeDir=$MYSCRATCH/hello_jobstep
exeName=hello_jobstep
theExe=$exeDir/$exeName
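#(Optional, hedged sketch: a simple sanity check that the assumed executable really exists and
# is executable before launching the job step; it aborts the script early otherwise:)
if [ ! -x "${theExe}" ]; then
   echo "Executable ${theExe} not found or not executable. Exiting." >&2
   exit 1
fi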
#----
#MPI & OpenMP settings if needed (these do not apply to the TensorFlow case):
export MPICH_GPU_SUPPORT_ENABLED=1 #This allows for GPU-aware MPI communication among GPUs
export OMP_NUM_THREADS=1 #This controls the real number of CPU cores used per task by the executable
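#(Optional, hedged sketch: if the code is multithreaded, the standard OpenMP binding variables
# can also be set; the values here are illustrative only and should be tuned for the code:)
#export OMP_PLACES=cores     #Bind each OpenMP thread to a physical core
#export OMP_PROC_BIND=close  #Keep threads close to the parent task's cores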
#----
#TensorFlow settings if needed:
# The following two variables control the real number of threads in Tensorflow code:
#export TF_NUM_INTEROP_THREADS=1 #Number of threads for independent operations
#export TF_NUM_INTRAOP_THREADS=1 #Number of threads within individual operations
#----
#Execution
#Note: srun needs explicit indication of the full set of parameters for the use of resources in the job step.
#      These are independent of the allocation parameters (which are not inherited by srun).
#      Each task needs access to all 8 GPUs available on the node where it is running.
#      So, no optimal binding can be provided by the scheduler.
#      Therefore, "--gpus-per-task" and "--gpu-bind" are not used.
#      Optimal use of resources is now the responsibility of the code.
#      "-c 8" is used to force the allocation of 1 task per CPU chiplet. Then, the REAL number of threads
#      for the code SHOULD be defined by the environment variables above.
# (The "-l" option is for displaying, at the beginning of each line, the taskID that generates the output.)
# (The "-u" option is for unbuffered output, so that output is displayed as soon as it's generated.)
# (If the output needs to be sorted for clarity, then add "| sort -n" at the end of the command.)
echo -e "\n\n#------------------------#"
echo "Test code execution:"
srun -l -u -N 2 -n 16 -c 8 --gres=gpu:8 ${theExe}
#srun -l -u -N 2 -n 16 -c 8 --gres=gpu:8 python3 ${tensorFlowScript}
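#(Optional, hedged variant of the command above: as mentioned in the notes, the output of all
# tasks can be sorted by taskID for readability by piping through "sort -n":)
#srun -l -u -N 2 -n 16 -c 8 --gres=gpu:8 ${theExe} | sort -n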
#----
#Printing information of finished job steps:
echo -e "\n\n#------------------------#"
echo "Printing information of finished jobs steps using sacct:"
sacct -j ${SLURM_JOBID} -o jobid%20,Start%20,elapsed%20
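#(Optional, hedged sketch: sacct accepts additional output fields; for example, the list of
# nodes and the final state of each job step can be requested as well:)
#sacct -j ${SLURM_JOBID} -o jobid%20,NodeList%20,State%20,elapsed%20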
#----
#Done
echo -e "\n\n#------------------------#"
echo "Done" |