
Commit b1e89bb

Merge pull request #270 from aws-samples/improvements/#269_nccl_optimization
Improvements/#269 nccl optimization
2 parents 70d9937 + 5567b42 commit b1e89bb

File tree: 7 files changed, +253 -278 lines

micro-benchmarks/nccl-tests/README.md

Lines changed: 132 additions & 67 deletions (large diff not rendered here)

micro-benchmarks/nccl-tests/kubernetes/nccl-tests.yaml

Lines changed: 18 additions & 19 deletions
@@ -22,49 +22,48 @@ spec:
             value: /opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:$LD_LIBRARY_PATH
           - name: PATH
             value: $PATH:/opt/amazon/efa/bin:/usr/bin
-          - name: XLA_FLAGS
-            value: "--xla_gpu_cuda_data_dir=/usr/local/cuda"
-          - name: TF_XLA_FLAGS
-            value: "--tf_xla_cpu_global_jit"
-          - name: NCCL_DEBUG
-            value: INFO
           command:
           - /opt/amazon/openmpi/bin/mpirun
           - --allow-run-as-root
           - --tag-output
           - -np
           - "16"
-          - -bind-to
+          - -N
+          - "8"
+          - --bind-to
           - none
-          - -map-by
-          - slot
           - -x
           - PATH
           - -x
           - LD_LIBRARY_PATH
           - -x
-          - XLA_FLAGS
+          - FI_PROVIDER=efa
           - -x
-          - TF_XLA_FLAGS
+          - FI_EFA_USE_DEVICE_RDMA=1
+          - -x
+          - FI_EFA_FORK_SAFE=1
           - -x
           - NCCL_DEBUG=INFO
           - -x
-          - NCCL_NVLS_ENABLE=1
+          - NCCL_BUFFSIZE=8388608
+          - -x
+          - NCCL_P2P_NET_CHUNKSIZE=524288
           - --mca
           - pml
-          - ^cm
+          - ^cm,ucx
+          - --mca
+          - btl
+          - tcp,self
           - --mca
-          - pml_rsh_agent=ssh
-          - --oversubscribe
+          - btl_tcp_if_exclude
+          - lo,docker0,veth_def_agent
           - /opt/nccl-tests/build/all_reduce_perf
           - -b
-          - "1"
+          - "8"
           - -e
-          - 2G
+          - "16G"
           - -f
           - "2"
-          - -t
-          - "1"
           - -g
           - "1"
           - -c
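
As a usage sketch (not part of this commit): assuming the Kubeflow MPI Operator is already installed in the cluster, the updated manifest can be submitted and followed with standard kubectl commands; the launcher pod name below is a placeholder.

    # Submit the 2-node (-np 16, -N 8) all_reduce_perf MPIJob defined above
    kubectl apply -f micro-benchmarks/nccl-tests/kubernetes/nccl-tests.yaml
    # Find the launcher pod, then stream its output (NCCL_DEBUG=INFO lines plus the bandwidth table)
    kubectl get pods
    kubectl logs -f <launcher-pod-name>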

micro-benchmarks/nccl-tests/nccl-tests.Dockerfile

Lines changed: 58 additions & 34 deletions
@@ -2,47 +2,55 @@
 # SPDX-License-Identifier: MIT-0
 FROM nvidia/cuda:12.2.0-devel-ubuntu22.04

+ARG GDRCOPY_VERSION=v2.4.1
 ARG EFA_INSTALLER_VERSION=1.31.0
 ARG AWS_OFI_NCCL_VERSION=v1.8.1-aws
-ARG NCCL_TESTS_VERSION=2.13.9
-ARG NCCL_VERSION=2.20.3
+ARG NCCL_VERSION=v2.20.3-1
+ARG NCCL_TESTS_VERSION=v2.13.9

 RUN apt-get update -y
 RUN apt-get remove -y --allow-change-held-packages \
-    libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 libnccl2 libnccl-dev
+    ibverbs-utils \
+    libibverbs-dev \
+    libibverbs1 \
+    libmlx5-1 \
+    libnccl2 \
+    libnccl-dev

 RUN rm -rf /opt/hpcx \
     && rm -rf /usr/local/mpi \
     && rm -f /etc/ld.so.conf.d/hpcx.conf \
     && ldconfig
+
 ENV OPAL_PREFIX=

 RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
+    apt-utils \
+    autoconf \
+    automake \
+    build-essential \
+    check \
+    cmake \
+    curl \
+    debhelper \
+    devscripts \
     git \
     gcc \
-    vim \
+    gdb \
     kmod \
+    libsubunit-dev \
+    libtool \
     openssh-client \
     openssh-server \
-    build-essential \
-    curl \
-    autoconf \
-    libtool \
-    gdb \
-    automake \
+    pkg-config \
     python3-distutils \
-    cmake \
-    apt-utils \
-    devscripts \
-    debhelper \
-    libsubunit-dev \
-    check \
-    pkg-config
+    vim

 RUN mkdir -p /var/run/sshd
 RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
     echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
     sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
+
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/lib:$LD_LIBRARY_PATH
 ENV PATH /opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH

@@ -52,12 +60,14 @@ RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \

 #################################################
 ## Install NVIDIA GDRCopy
-#RUN git clone https://github.com/NVIDIA/gdrcopy.git /opt/gdrcopy \
-# && cd /opt/gdrcopy \
-# && make lib_install install \
-# && cd /opt/gdrcopy/tests \
-# && make \
-# && mv copylat copybw sanity apiperf /usr/bin/
+RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \
+    && cd /tmp/gdrcopy \
+    && make prefix=/opt/gdrcopy install
+
+ENV LD_LIBRARY_PATH /opt/gdrcopy/lib:/usr/local/cuda/compat:$LD_LIBRARY_PATH
+ENV LIBRARY_PATH /opt/gdrcopy/lib:/usr/local/cuda/compat/:$LIBRARY_PATH
+ENV CPATH /opt/gdrcopy/include:$CPATH
+ENV PATH /opt/gdrcopy/bin:$PATH

 #################################################
 ## Install EFA installer
@@ -70,36 +80,50 @@ RUN cd $HOME \

 ###################################################
 ## Install NCCL
-RUN git clone -b v${NCCL_VERSION}-1 https://github.com/NVIDIA/nccl.git /opt/nccl \
+RUN git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \
     && cd /opt/nccl \
     && make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \
-    NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90"
+    NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90"

 ###################################################
 ## Install AWS-OFI-NCCL plugin
-RUN apt-get install libtool autoconf cmake nasm unzip pigz parallel nfs-common build-essential hwloc libhwloc-dev libjemalloc2 libnuma-dev numactl libjemalloc-dev preload htop iftop liblapack-dev libgfortran5 ipcalc wget curl devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms -y
-RUN export OPAL_PREFIX="" \
-    && git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \
-    && cd /opt/aws-ofi-nccl \
-    && git checkout ${AWS_OFI_NCCL_VERSION} \
-    && ./autogen.sh \
+RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev
+RUN curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
+    && tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
+    && cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
     && ./configure --prefix=/opt/aws-ofi-nccl/install \
        --with-mpi=/opt/amazon/openmpi \
        --with-libfabric=/opt/amazon/efa \
        --with-cuda=/usr/local/cuda \
        --enable-platform-aws \
-    && make -j $(nproc) && make install
+    && make -j $(nproc) \
+    && make install \
+    && cd .. \
+    && rm -rf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
+    && rm aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz

 ###################################################
 ## Install NCCL-tests
-RUN git clone -b v${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
+RUN git clone -b ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
     && cd /opt/nccl-tests \
     && make -j $(nproc) \
     MPI=1 \
     MPI_HOME=/opt/amazon/openmpi/ \
     CUDA_HOME=/usr/local/cuda \
     NCCL_HOME=/opt/nccl/build \
-    NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90"
+    NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90"

 RUN rm -rf /var/lib/apt/lists/*
+
+## Set Open MPI variables to exclude network interface and conduit.
+ENV OMPI_MCA_pml=^cm,ucx \
+    OMPI_MCA_btl=tcp,self \
+    OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent\
+    OPAL_PREFIX=/opt/amazon/openmpi \
+    NCCL_SOCKET_IFNAME=^docker,lo
+
+## Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516
+ENV PMIX_MCA_gds=hash
+
+## Set LD_PRELOAD for NCCL library
 ENV LD_PRELOAD /opt/nccl/build/lib/libnccl.so
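
A build sketch for the revised Dockerfile (the image tag is illustrative, not from this commit); each ARG declared above can be overridden at build time and is shown here with its new default:

    # Build from the repository root; the tag name is an assumption
    docker build -f micro-benchmarks/nccl-tests/nccl-tests.Dockerfile \
        --build-arg GDRCOPY_VERSION=v2.4.1 \
        --build-arg EFA_INSTALLER_VERSION=1.31.0 \
        --build-arg AWS_OFI_NCCL_VERSION=v1.8.1-aws \
        --build-arg NCCL_VERSION=v2.20.3-1 \
        --build-arg NCCL_TESTS_VERSION=v2.13.9 \
        -t nccl-tests:latest .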

micro-benchmarks/nccl-tests/slurm/nccl-3collectives.sbatch

Lines changed: 0 additions & 74 deletions
This file was deleted.
(The hunk below belongs to another of the 7 changed files; its file path is not rendered on this page.)
@@ -1,25 +1,39 @@
 #!/bin/bash
-#SBATCH -N 2
-#SBATCH --exclusive
-
 # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 # SPDX-License-Identifier: MIT-0

+#SBATCH --job-name=nccl-all_reduce_perf # name of your job
+#SBATCH --nodes=2 # number of nodes to use, 24 p4d(e) = 192 A100 GPUs
+#SBATCH --ntasks-per-node 8 # Number of GPU per node
+#SBATCH --output %x_%j.out
+#SBATCH --error %x_%j.err
+#SBATCH --exclusive
+
 # This script is designed to run on the Deep Learning AMI, Ubuntu 20.04
 # See https://aws.amazon.com/releasenotes/aws-deep-learning-base-gpu-ami-ubuntu-20-04/
 set -ex

 # Get Hostname to Instance ID mapping
 mpirun -N 1 bash -c 'echo $(hostname) ➡️ $(cat /sys/devices/virtual/dmi/id/board_asset_tag | tr -d " ")'

+
+### NCCL_BUFFSIZE increase the send queue depth and can turn NCCL communications into non-blocking.
+### https://www.usenix.org/system/files/atc23-choi.pdf
+
+### NCCL_P2P_NET_CHUNKSIZE Improve performance by increasing buffer size for Send/Recv, Gather, Scatter and Alltoall communications
+### https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html
+
 # run all_reduce test
 mpirun -n $((8 * SLURM_JOB_NUM_NODES)) -N 8 \
        -x FI_PROVIDER=efa \
        -x FI_EFA_USE_DEVICE_RDMA=1 \
-       -x RDMAV_FORK_SAFE=1 \
+       -x FI_EFA_FORK_SAFE=1 \
        -x LD_LIBRARY_PATH=/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/opt/aws-ofi-nccl/lib:/usr/local/lib:/usr/lib:$LD_LIBRARY_PATH \
        -x NCCL_DEBUG=INFO \
-       --mca pml ^cm \
+       -x NCCL_BUFFSIZE=8388608 \
+       -x NCCL_P2P_NET_CHUNKSIZE=524288 \
+       --mca pml ^cm,ucx \
        --mca btl tcp,self \
        --mca btl_tcp_if_exclude lo,docker0,veth_def_agent \
-       --bind-to none /usr/local/cuda-12.2/efa/test-cuda-12.2/all_reduce_perf -b 8 -e 2G -f 2 -g 1 -c 1 -n 100
+       --bind-to none /usr/local/cuda-12.2/efa/test-cuda-12.2/all_reduce_perf -b 8 -e 16G -f 2 -g 1 -c 1 -n 100
+
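
The two NCCL settings introduced above are plain byte counts, and the same values appear in both the Slurm script and the Kubernetes manifest. A small sketch of the arithmetic and of a submission command (the .sbatch filename is an assumption, since this hunk's file header is not rendered):

    # NCCL tuning values added by this commit, expressed in bytes
    #   NCCL_BUFFSIZE=8388608         -> 8 * 1024 * 1024 = 8 MiB transport buffers
    #   NCCL_P2P_NET_CHUNKSIZE=524288 -> 512 * 1024      = 512 KiB network chunks for point-to-point ops
    # Submit the 2-node, 8-tasks-per-node job (script name is a placeholder)
    sbatch nccl-tests.sbatch
    # Follow the output file declared via --output (%x = job name, %j = job id)
    tail -f nccl-all_reduce_perf_<jobid>.out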
