Commit c952261

Merge pull request #3 from LambdaLabsML/raj_new_libraries
Dockerize the deepspeed tests.
2 parents 9f4c4bd + 29ed7a1 commit c952261

19 files changed: +359 additions, -68 deletions

.gitignore

Lines changed: 6 additions & 0 deletions
@@ -129,3 +129,9 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+
+# deepspeed logs
+*.log
+*.log
+**/*.log

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
.git
.cache
*.pyc
__pycache__
*.egg-info
venv/
.venv/
.env
*.log
outputs/
checkpoints/
.DS_Store

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
# Log files
*.log
**/*.log

# Output directories
output/
logs/

# Temporary files
*.tmp
*.temp

# Python cache
__pycache__/
*.pyc
*.pyo

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1

# Install system dependencies including SSH and nvtop
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3.10-dev \
    python3-pip \
    python3-pybind11 \
    git \
    curl \
    wget \
    build-essential \
    openssh-server \
    openssh-client \
    nvtop \
    && rm -rf /var/lib/apt/lists/*

# Set python3.10 as default
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 && \
    update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1

# Configure SSH
RUN mkdir -p /var/run/sshd && \
    sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
    sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config

# Setup passwordless SSH for localhost
RUN mkdir -p /root/.ssh && chmod 700 /root/.ssh && \
    ssh-keygen -t rsa -N "" -f /root/.ssh/id_rsa && \
    cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys && \
    chmod 600 /root/.ssh/authorized_keys /root/.ssh/id_rsa && \
    echo "StrictHostKeyChecking no" > /root/.ssh/config && \
    echo "UserKnownHostsFile=/dev/null" >> /root/.ssh/config && \
    chmod 600 /root/.ssh/config

# Upgrade pip
RUN python -m pip install --upgrade pip setuptools wheel

WORKDIR /workspace

# Install PyTorch first with CUDA 11.8 support
RUN pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118

# Copy and install other dependencies
COPY training/requirements_freeze.txt ./
RUN pip install -r requirements_freeze.txt

# Copy project code
COPY . .

CMD ["/bin/bash"]

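The SSH steps above exist because DeepSpeed's multi-node launcher reaches each worker node over SSH before spawning training processes. A minimal sketch of that flow, assuming an illustrative hostfile and training script (neither file is part of this commit):

```
# Illustrative only: host names, slot counts, and train.py are placeholders,
# not files from this repository.
cat > hostfile <<'EOF'
node-001 slots=8
node-002 slots=8
EOF

# The deepspeed launcher SSHes into each listed host (using the passwordless
# key baked into the image) and starts one worker per slot.
deepspeed --hostfile=hostfile train.py
```
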
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
.PHONY: build run stop clean shell logs

build:
	docker compose build

run:
	docker compose up -d

stop:
	docker compose down

clean:
	docker compose down -v
	docker system prune -f

shell:
	docker compose exec deepspeed-training /bin/bash

logs:
	docker compose logs -f

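The targets are thin wrappers over docker compose; a typical session with them (the same flow the Tutorial below walks through) looks like:

```
make build   # build the deepspeed-training image
make run     # start the container in the background
make shell   # open an interactive shell inside it
# ...run training from that shell...
make stop    # tear the container down when finished
```
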
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
services:
  deepspeed-training:
    build:
      context: .
      dockerfile: Dockerfile
    image: deepspeed-training:latest
    container_name: deepspeed-training

    privileged: true
    runtime: nvidia

    command: /bin/bash -c "/usr/sbin/sshd && tail -f /dev/null"

    environment:
      - PROJECT_PATH=${PROJECT_PATH:-/workspace}
      - HF_HOME=${PROJECT_PATH:-/workspace}/.cache/huggingface
      - TRANSFORMERS_CACHE=${PROJECT_PATH:-/workspace}/.cache/huggingface/transformers
      - HF_DATASETS_CACHE=${PROJECT_PATH:-/workspace}/.cache/huggingface/datasets
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility

    volumes:
      # Mount DeepSpeedExamples repo to /workspace/DeepSpeedExamples
      - ${HOST_REPO_PATH}:${PROJECT_PATH}/DeepSpeedExamples:rw

      # Mount cache to /workspace/.cache
      - ${HOST_CACHE_PATH}:${PROJECT_PATH}/.cache:rw

    stdin_open: true
    tty: true
    ipc: host
    shm_size: '32gb'

    working_dir: /workspace/DeepSpeedExamples/applications/DeepSpeed-Chat/training
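
The compose file reads HOST_REPO_PATH, HOST_CACHE_PATH, and PROJECT_PATH from the environment (the Tutorial below creates them in a .env file) and exposes all GPUs through the NVIDIA runtime. An illustrative sanity check once the container is up:

```
# Illustrative checks; the service name deepspeed-training comes from this compose file.
docker compose exec deepspeed-training nvidia-smi
docker compose exec deepspeed-training python -c "import torch; print(torch.cuda.device_count())"
docker compose exec deepspeed-training ls /workspace/DeepSpeedExamples
```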

applications/DeepSpeed-Chat/requirements_freeze.txt

Lines changed: 0 additions & 12 deletions
This file was deleted.
Lines changed: 9 additions & 9 deletions
@@ -1,9 +1,9 @@
-PROJECT_PATH=/home/ubuntu/ml-1cc/benchmark
-OMPI_MCA_btl_tcp_if_include=eno1
-UCX_TLS=self,shm,tcp
-NCCL_P2P_LEVEL=NVL
-NCCL_NET_GDR_LEVEL=PIX
-NCCL_IB_HCA='=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8'
-NCCL_IB_PCI_RELAXED_ORDERING=1
-NCCL_SOCKET_IFNAME=eno1
-NCCL_DEBUG=WARN
+export PROJECT_PATH=/srv/nfs/staging/
+export OMPI_MCA_btl_tcp_if_include=eno1
+export UCX_TLS=self,shm,tcp
+export NCCL_P2P_LEVEL=NVL
+export NCCL_NET_GDR_LEVEL=PIX
+export NCCL_IB_HCA='=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8'
+export NCCL_IB_PCI_RELAXED_ORDERING=1
+export NCCL_SOCKET_IFNAME=eno1
+export NCCL_DEBUG=WARN
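
Switching from plain KEY=VALUE lines to `export KEY=VALUE` makes this file directly source-able, so the NCCL/UCX settings reach child processes such as the DeepSpeed launcher instead of staying shell-local. A minimal sketch, assuming the run scripts source this file (those scripts are not shown in this diff):

```
# Hypothetical wiring; the actual filename and launch script are not part of this diff.
source ./deepspeed_env.sh        # without `export`, these would be shell-local variables only
echo "$NCCL_SOCKET_IFNAME"       # -> eno1
deepspeed train.py               # locally spawned workers inherit the exported NCCL_*/UCX_* values
```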

applications/DeepSpeed-Chat/training/Tutorial.md

Lines changed: 40 additions & 4 deletions
@@ -1,6 +1,42 @@
 # Benchmark DeepSpeed-Chat Training on Lambda Machines
+## Docker-based setup
 
-## Usage
+#### Step 1: Configure Environment
+
+Create a .env file with your paths:
+
+```
+cat > .env << EOF
+# HOST PATHS
+HOST_CACHE_PATH=/srv/nfs/staging/.cache
+HOST_REPO_PATH=/srv/nfs/staging/DeepSpeedExamples
+
+# CONTAINER PATHS (inside Docker)
+PROJECT_PATH=/workspace
+EOF
+```
+
+Adjust HOST\_CACHE\_PATH and HOST\_REPO\_PATH to match your storage locations.
+
+#### Step 2: Build and Run Container
+
+```
+make build   # Build Docker image
+make run     # Start container
+make shell   # Enter container shell
+```
+
+Inside the container, you'll be at /workspace/DeepSpeedExamples/applications/DeepSpeed-Chat/training.
+
+#### Step 3: Run the commands below to start training
+
+```
+./run_batch.sh run_opt-350m_bs24_zero0 hostfiles/1node_1xN/ output/raj_1xN_opt-350m_bs24 3000
+./run_batch.sh run_opt-13b_bs16_zero0 hostfiles/1node_1xN/ output/raj_1xN_opt-13b_bs16_zero0/ 600
+```
+
+
+## Usage (non-Docker setup)
 
 #### Step 1: Set up the environment
 
@@ -10,13 +46,13 @@
 pip install deepspeed==0.10.0 && \
 sudo apt-get update && sudo apt-get install -y python3-pybind11 && \
 wget https://raw.githubusercontent.com/LambdaLabsML/DeepSpeedExamples/master/applications/DeepSpeed-Chat/requirements_freeze.txt && \
-pip install --upgrade -r requirements_freeze.txt && \
-rm requirements_freeze.txt
+xargs -a requirements_freeze.txt -I{} sh -c 'pip install --upgrade "{}" || echo "SKIPPED: {}"'
 ```
 
 Note: you can blast this installation across a cluster with the `install_dependencies.sh` script.
 ```
-./install_dependencies.sh <path-to-list-of-nodes.txt>
+#./install_dependencies.sh <path-to-list-of-nodes.txt>
+#./install_dependencies.sh ./hostfile/1node_1xN
 ```
 
 ```
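
While a Step 3 run is in flight, it can be watched from the host without entering the container; an illustrative combination of the Makefile target and the tools installed in the image:

```
make logs                                      # follow container stdout/stderr (docker compose logs -f)
docker compose exec deepspeed-training nvtop   # GPU utilization; nvtop is installed in the Dockerfile above
```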
Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
accelerate==0.34.2
aiohappyeyeballs==2.6.1
aiohttp==3.13.1
aiosignal==1.4.0
annotated-types==0.7.0
anyio==4.11.0
attrs==25.4.0
certifi==2025.10.5
charset-normalizer==3.4.4
datasets==4.3.0
deepspeed==0.18.1
dill==0.3.7
einops==0.8.1
filelock==3.20.0
fire==0.5.0
frozenlist==1.8.0
fsspec==2023.9.2
h11==0.16.0
hf-xet==1.1.10
hjson==3.1.0
httpcore==1.0.9
httpx==0.28.1
huggingface-hub==0.36.0
idna==3.11
iniconfig==2.3.0
Jinja2==3.1.6
loguru==0.7.0
MarkupSafe==3.0.3
mpmath==1.3.0
msgpack==1.1.2
multidict==6.7.0
multiprocess==0.70.15
networkx==3.1
ninja==1.13.0
numpy==1.26.4
nvidia-cublas-cu12==12.8.4.1
nvidia-cuda-cupti-cu12==12.8.90
nvidia-cuda-nvrtc-cu12==12.8.93
nvidia-cuda-runtime-cu12==12.8.90
nvidia-cudnn-cu12==9.10.2.21
nvidia-cufft-cu12==11.3.3.83
nvidia-cufile-cu12==1.13.1.3
nvidia-curand-cu12==10.3.9.90
nvidia-cusolver-cu12==11.7.3.90
nvidia-cusparse-cu12==12.5.8.93
nvidia-cusparselt-cu12==0.7.1
nvidia-ml-py==13.580.82
nvidia-nccl-cu12==2.27.5
nvidia-nvjitlink-cu12==12.8.93
nvidia-nvshmem-cu12==3.3.20
nvidia-nvtx-cu12==12.8.90
packaging==25.0
pandas==2.3.3
pillow==11.3.0
pluggy==1.6.0
propcache==0.4.1
protobuf==3.20.3
psutil==7.1.1
py-cpuinfo==9.0.0
pyarrow==21.0.0
pydantic==2.12.3
pydantic_core==2.41.4
pytest==7.4.0
python-dateutil==2.9.0.post0
pytz==2023.3
PyYAML==6.0.3
regex==2025.10.23
requests==2.32.5
safetensors==0.6.2
sentencepiece==0.1.99
setuptools==80.9.0
sh==2.0.4
six==1.17.0
sniffio==1.3.1
sympy==1.14.0
termcolor==3.1.0
tokenizers==0.20.1
tqdm==4.67.1
transformers==4.46.3
triton==3.5.0
typing-inspection==0.4.2
typing_extensions==4.15.0
tzdata==2025.2
urllib3==2.5.0
wheel==0.45.1
xxhash==3.6.0
yarl==1.22.0
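
Because the Dockerfile installs this pinned list on top of its own PyTorch build, a quick post-install sanity check inside the container is worthwhile; an illustrative one:

```
# Illustrative verification that the pinned stack imports cleanly and sees the GPUs.
python -c "import torch, transformers, deepspeed; print(torch.__version__, transformers.__version__, deepspeed.__version__)"
python -c "import torch; print(torch.cuda.is_available())"
ds_report   # DeepSpeed's environment report, bundled with the deepspeed package
```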
