2
2
# SPDX-License-Identifier: MIT-0
3
3
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
4
4
5
+ ARG GDRCOPY_VERSION=v2.4.1
5
6
ARG EFA_INSTALLER_VERSION=1.31.0
6
7
ARG AWS_OFI_NCCL_VERSION=v1.8.1-aws
7
- ARG NCCL_TESTS_VERSION=2.13.9
8
- ARG NCCL_VERSION=2.20.3
8
+ ARG NCCL_VERSION=v2.20.3-1
9
+ ARG NCCL_TESTS_VERSION=v2.13.9
9
10
10
11
RUN apt-get update -y
11
12
RUN apt-get remove -y --allow-change-held-packages \
12
- libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 libnccl2 libnccl-dev
13
+ ibverbs-utils \
14
+ libibverbs-dev \
15
+ libibverbs1 \
16
+ libmlx5-1 \
17
+ libnccl2 \
18
+ libnccl-dev
13
19
14
20
RUN rm -rf /opt/hpcx \
15
21
&& rm -rf /usr/local/mpi \
16
22
&& rm -f /etc/ld.so.conf.d/hpcx.conf \
17
23
&& ldconfig
24
+
18
25
ENV OPAL_PREFIX=
19
26
20
27
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
28
+ apt-utils \
29
+ autoconf \
30
+ automake \
31
+ build-essential \
32
+ check \
33
+ cmake \
34
+ curl \
35
+ debhelper \
36
+ devscripts \
21
37
git \
22
38
gcc \
23
- vim \
39
+ gdb \
24
40
kmod \
41
+ libsubunit-dev \
42
+ libtool \
25
43
openssh-client \
26
44
openssh-server \
27
- build-essential \
28
- curl \
29
- autoconf \
30
- libtool \
31
- gdb \
32
- automake \
45
+ pkg-config \
33
46
python3-distutils \
34
- cmake \
35
- apt-utils \
36
- devscripts \
37
- debhelper \
38
- libsubunit-dev \
39
- check \
40
- pkg-config
47
+ vim
41
48
42
49
RUN mkdir -p /var/run/sshd
43
50
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
44
51
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
45
52
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
53
+
46
54
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/lib:$LD_LIBRARY_PATH
47
55
ENV PATH /opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH
48
56
@@ -52,12 +60,14 @@ RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \
52
60
53
61
# ################################################
54
62
# # Install NVIDIA GDRCopy
55
- # RUN git clone https://github.com/NVIDIA/gdrcopy.git /opt/gdrcopy \
56
- # && cd /opt/gdrcopy \
57
- # && make lib_install install \
58
- # && cd /opt/gdrcopy/tests \
59
- # && make \
60
- # && mv copylat copybw sanity apiperf /usr/bin/
63
+ RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \
64
+ && cd /tmp/gdrcopy \
65
+ && make prefix=/opt/gdrcopy install
66
+
67
+ ENV LD_LIBRARY_PATH /opt/gdrcopy/lib:/usr/local/cuda/compat:$LD_LIBRARY_PATH
68
+ ENV LIBRARY_PATH /opt/gdrcopy/lib:/usr/local/cuda/compat/:$LIBRARY_PATH
69
+ ENV CPATH /opt/gdrcopy/include:$CPATH
70
+ ENV PATH /opt/gdrcopy/bin:$PATH
61
71
62
72
# ################################################
63
73
# # Install EFA installer
@@ -70,36 +80,50 @@ RUN cd $HOME \
70
80
71
81
# ##################################################
72
82
# # Install NCCL
73
- RUN git clone -b v ${NCCL_VERSION}-1 https://github.com/NVIDIA/nccl.git /opt/nccl \
83
+ RUN git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \
74
84
&& cd /opt/nccl \
75
85
&& make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \
76
- NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90"
86
+ NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90"
77
87
78
88
# ##################################################
79
89
# # Install AWS-OFI-NCCL plugin
80
- RUN apt-get install libtool autoconf cmake nasm unzip pigz parallel nfs-common build-essential hwloc libhwloc-dev libjemalloc2 libnuma-dev numactl libjemalloc-dev preload htop iftop liblapack-dev libgfortran5 ipcalc wget curl devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms -y
81
- RUN export OPAL_PREFIX="" \
82
- && git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \
83
- && cd /opt/aws-ofi-nccl \
84
- && git checkout ${AWS_OFI_NCCL_VERSION} \
85
- && ./autogen.sh \
90
+ RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev
91
+ # Download, build and install the aws-ofi-nccl release tarball, then remove the
+ # sources in the same layer. The tarball and directory names are the release tag
+ # without its leading "v"; the prefix is stripped with POSIX ${VAR#v} because
+ # ${VAR//v} is a bash-only expansion that /bin/sh (dash) rejects with
+ # "Bad substitution" unless a SHELL instruction elsewhere selects bash.
+ RUN curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION#v}.tar.gz \
92
+ && tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION#v}.tar.gz \
93
+ && cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION#v} \
86
94
&& ./configure --prefix=/opt/aws-ofi-nccl/install \
87
95
--with-mpi=/opt/amazon/openmpi \
88
96
--with-libfabric=/opt/amazon/efa \
89
97
--with-cuda=/usr/local/cuda \
90
98
--enable-platform-aws \
91
- && make -j $(nproc) && make install
99
+ && make -j $(nproc) \
100
+ && make install \
101
+ && cd .. \
102
+ && rm -rf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION#v} \
103
+ && rm aws-ofi-nccl-${AWS_OFI_NCCL_VERSION#v}.tar.gz
92
104
93
105
# ##################################################
94
106
# # Install NCCL-tests
95
- RUN git clone -b v ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
107
+ RUN git clone -b ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
96
108
&& cd /opt/nccl-tests \
97
109
&& make -j $(nproc) \
98
110
MPI=1 \
99
111
MPI_HOME=/opt/amazon/openmpi/ \
100
112
CUDA_HOME=/usr/local/cuda \
101
113
NCCL_HOME=/opt/nccl/build \
102
- NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90"
114
+ NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90"
103
115
104
116
RUN rm -rf /var/lib/apt/lists/*
117
+
118
+ # # Set Open MPI variables to exclude network interface and conduit.
119
+ ENV OMPI_MCA_pml=^cm,ucx \
120
+ OMPI_MCA_btl=tcp,self \
121
+ OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent\
122
+ OPAL_PREFIX=/opt/amazon/openmpi \
123
+ NCCL_SOCKET_IFNAME=^docker,lo
124
+
125
+ # # Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516
126
+ ENV PMIX_MCA_gds=hash
127
+
128
+ # # Set LD_PRELOAD for NCCL library
105
129
ENV LD_PRELOAD /opt/nccl/build/lib/libnccl.so
0 commit comments