Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -129,3 +129,9 @@ dmypy.json

# Pyre type checker
.pyre/


# DeepSpeed logs (a pattern without a slash already matches at every directory depth)
*.log
12 changes: 12 additions & 0 deletions applications/DeepSpeed-Chat/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Keep the build context small and keep local secrets/artifacts out of `COPY . .`.
# Unlike .gitignore, these patterns are matched relative to the build-context
# root, so a `**/` prefix is required to also exclude matches in subdirectories.

# VCS metadata and local caches
.git
.cache

# Python bytecode and packaging artifacts
**/*.pyc
**/__pycache__
**/*.egg-info

# Virtual environments and local environment files
**/venv/
**/.venv/
**/.env

# Logs and training artifacts (the training scripts write under training/output/)
**/*.log
**/logs/
**/output/
**/outputs/
**/checkpoints/

# OS metadata
**/.DS_Store
16 changes: 16 additions & 0 deletions applications/DeepSpeed-Chat/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Log files (a pattern without a slash already matches at every directory depth)
*.log

# Output directories
output/
logs/

# Temporary files
*.tmp
*.temp

# Python cache
__pycache__/
*.pyc
*.pyo
54 changes: 54 additions & 0 deletions applications/DeepSpeed-Chat/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Development/benchmark image for DeepSpeed-Chat training on CUDA 11.8.
# No USER directive is set: the image is configured for root SSH login below.
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04

# Build-time only, so the non-interactive apt setting does not leak into the
# environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1

# Install system dependencies including SSH and nvtop.
# --no-install-recommends keeps the layer small; ca-certificates is listed
# explicitly so curl/wget/git can still verify HTTPS without recommended packages.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl \
        git \
        nvtop \
        openssh-client \
        openssh-server \
        python3-pip \
        python3-pybind11 \
        python3.10 \
        python3.10-dev \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Set python3.10 as the default for both `python` and `python3`
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1

# Configure SSH: create sshd's runtime directory, allow root login and
# public-key authentication
RUN mkdir -p /var/run/sshd \
    && sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config \
    && sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config

# Setup passwordless SSH for localhost: generate a key pair, authorize it for
# root, and disable host-key prompts.
# NOTE(review): the private key is generated at build time and therefore stored
# in an image layer; every container started from this image shares it, and
# anyone who can pull the image can log in as root wherever its sshd is
# reachable. Do not publish this image or expose port 22 outside a trusted network.
RUN mkdir -p /root/.ssh && chmod 700 /root/.ssh \
    && ssh-keygen -t rsa -N "" -f /root/.ssh/id_rsa \
    && cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys \
    && chmod 600 /root/.ssh/authorized_keys /root/.ssh/id_rsa \
    && echo "StrictHostKeyChecking no" > /root/.ssh/config \
    && echo "UserKnownHostsFile=/dev/null" >> /root/.ssh/config \
    && chmod 600 /root/.ssh/config

# Upgrade pip (--no-cache-dir keeps pip's download cache out of the layer)
RUN python -m pip install --no-cache-dir --upgrade pip setuptools wheel

WORKDIR /workspace

# Install PyTorch first with CUDA 11.8 support
RUN pip install --no-cache-dir torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118

# Copy only the requirements file before the source tree so this layer stays
# cached until the pinned dependencies change.
# NOTE(review): requirements_freeze.txt pins nvidia-*-cu12 wheels and
# triton==3.5.0, which look mismatched with the cu118 torch 2.1.0 installed
# above — confirm the intended CUDA/torch combination.
COPY training/requirements_freeze.txt ./
RUN pip install --no-cache-dir -r requirements_freeze.txt

# Copy project code (filtered by .dockerignore)
COPY . .

CMD ["/bin/bash"]
20 changes: 20 additions & 0 deletions applications/DeepSpeed-Chat/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Convenience wrappers around Docker Compose for the DeepSpeed-Chat container.
# Both variables can be overridden on the command line, e.g. `make shell SERVICE=foo`.
COMPOSE ?= docker compose
SERVICE ?= deepspeed-training

.PHONY: build run stop clean prune shell logs

# Build the image described by docker-compose.yml.
build:
	$(COMPOSE) build

# Start the service in the background.
run:
	$(COMPOSE) up -d

# Stop and remove the service's containers and networks.
stop:
	$(COMPOSE) down

# Tear down this project only: containers, networks, and its volumes.
# Deliberately does NOT touch other Docker resources on the host.
clean:
	$(COMPOSE) down -v --remove-orphans

# Host-wide cleanup of ALL unused Docker resources (stopped containers, unused
# networks, dangling images, build cache). Opt-in because it affects every
# project on the machine, not just this one.
prune:
	docker system prune -f

# Open an interactive shell inside the running service container.
shell:
	$(COMPOSE) exec $(SERVICE) /bin/bash

# Follow the service logs.
logs:
	$(COMPOSE) logs -f
34 changes: 34 additions & 0 deletions applications/DeepSpeed-Chat/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Compose service for running DeepSpeed-Chat training inside the CUDA image.
# Expects HOST_REPO_PATH and HOST_CACHE_PATH (and optionally PROJECT_PATH,
# default /workspace) to be provided, e.g. through a .env file next to this file.
services:
  deepspeed-training:
    build:
      context: .
      dockerfile: Dockerfile
    image: deepspeed-training:latest
    container_name: deepspeed-training

    privileged: true
    runtime: nvidia

    # Start sshd (it daemonizes), then block forever so the container stays up
    # and can be entered with `docker compose exec`.
    command: /bin/bash -c "/usr/sbin/sshd && tail -f /dev/null"

    environment:
      - PROJECT_PATH=${PROJECT_PATH:-/workspace}
      - HF_HOME=${PROJECT_PATH:-/workspace}/.cache/huggingface
      - TRANSFORMERS_CACHE=${PROJECT_PATH:-/workspace}/.cache/huggingface/transformers
      - HF_DATASETS_CACHE=${PROJECT_PATH:-/workspace}/.cache/huggingface/datasets
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility

    volumes:
      # Mount DeepSpeedExamples repo to <PROJECT_PATH>/DeepSpeedExamples.
      # The same /workspace default as the environment block is applied so the
      # mount target cannot collapse to "/DeepSpeedExamples" when PROJECT_PATH
      # is unset; a missing host path fails fast with a readable message.
      - "${HOST_REPO_PATH:?set HOST_REPO_PATH in .env}:${PROJECT_PATH:-/workspace}/DeepSpeedExamples:rw"

      # Mount cache to <PROJECT_PATH>/.cache (matches the HF_* variables above)
      - "${HOST_CACHE_PATH:?set HOST_CACHE_PATH in .env}:${PROJECT_PATH:-/workspace}/.cache:rw"

    stdin_open: true
    tty: true
    ipc: host
    # NOTE(review): with `ipc: host` the container presumably uses the host's
    # shared memory, so shm_size may have no effect — confirm and drop one of the two.
    shm_size: '32gb'

    # Follows PROJECT_PATH so it always points inside the mounted repository.
    working_dir: ${PROJECT_PATH:-/workspace}/DeepSpeedExamples/applications/DeepSpeed-Chat/training
12 changes: 0 additions & 12 deletions applications/DeepSpeed-Chat/requirements_freeze.txt

This file was deleted.

18 changes: 9 additions & 9 deletions applications/DeepSpeed-Chat/training/.deepspeed_env
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
PROJECT_PATH=/home/ubuntu/ml-1cc/benchmark
OMPI_MCA_btl_tcp_if_include=eno1
UCX_TLS=self,shm,tcp
NCCL_P2P_LEVEL=NVL
NCCL_NET_GDR_LEVEL=PIX
NCCL_IB_HCA='=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8'
NCCL_IB_PCI_RELAXED_ORDERING=1
NCCL_SOCKET_IFNAME=eno1
NCCL_DEBUG=WARN
export PROJECT_PATH=/srv/nfs/staging/
export OMPI_MCA_btl_tcp_if_include=eno1
export UCX_TLS=self,shm,tcp
export NCCL_P2P_LEVEL=NVL
export NCCL_NET_GDR_LEVEL=PIX
export NCCL_IB_HCA='=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8'
export NCCL_IB_PCI_RELAXED_ORDERING=1
export NCCL_SOCKET_IFNAME=eno1
export NCCL_DEBUG=WARN
44 changes: 40 additions & 4 deletions applications/DeepSpeed-Chat/training/Tutorial.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,42 @@
# Benchmark DeepSpeed-Chat Training on Lambda Machines
## Docker-based setup

## Usage
#### Step 1: Configure Environment

Create a .env file with your paths:

```
cat > .env << EOF
# HOST PATHS
HOST_CACHE_PATH=/srv/nfs/staging/.cache
HOST_REPO_PATH=/srv/nfs/staging/DeepSpeedExamples

# CONTAINER PATHS (inside Docker)
PROJECT_PATH=/workspace
EOF
```

Adjust HOST\_CACHE\_PATH and HOST\_REPO\_PATH to match your storage locations.

#### Step 2: Build and Run Container

```
make build # Build Docker image
make run # Start container
make shell # Enter container shell
```

Inside the container, you'll be at /workspace/DeepSpeedExamples/applications/DeepSpeed-Chat/training.

#### Step 3: Run the commands below to start training

```
./run_batch.sh run_opt-350m_bs24_zero0 hostfiles/1node_1xN/ output/raj_1xN_opt-350m_bs24 3000
./run_batch.sh run_opt-13b_bs16_zero0 hostfiles/1node_1xN/ output/raj_1xN_opt-13b_bs16_zero0/ 600
```


## Usage (non-Docker setup)

#### Step 1: Set up the environment

Expand All @@ -10,13 +46,13 @@
pip install deepspeed==0.10.0 && \
sudo apt-get update && sudo apt-get install -y python3-pybind11 && \
wget https://raw.githubusercontent.com/LambdaLabsML/DeepSpeedExamples/master/applications/DeepSpeed-Chat/training/requirements_freeze.txt && \
pip install --upgrade -r requirements_freeze.txt && \
rm requirements_freeze.txt
xargs -a requirements_freeze.txt -I{} sh -c 'pip install --upgrade "{}" || echo "SKIPPED: {}"'
```

Note: you can run this installation across a whole cluster with the `install_dependencies.sh` script.
```
./install_dependencies.sh <path-to-list-of-nodes.txt>
#./install_dependencies.sh <path-to-list-of-nodes.txt>
#./install_dependencies.sh ./hostfile/1node_1xN
```

```
Expand Down
87 changes: 87 additions & 0 deletions applications/DeepSpeed-Chat/training/requirements_freeze.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
accelerate==0.34.2
aiohappyeyeballs==2.6.1
aiohttp==3.13.1
aiosignal==1.4.0
annotated-types==0.7.0
anyio==4.11.0
attrs==25.4.0
certifi==2025.10.5
charset-normalizer==3.4.4
datasets==4.3.0
deepspeed==0.18.1
dill==0.3.7
einops==0.8.1
filelock==3.20.0
fire==0.5.0
frozenlist==1.8.0
fsspec==2023.9.2
h11==0.16.0
hf-xet==1.1.10
hjson==3.1.0
httpcore==1.0.9
httpx==0.28.1
huggingface-hub==0.36.0
idna==3.11
iniconfig==2.3.0
Jinja2==3.1.6
loguru==0.7.0
MarkupSafe==3.0.3
mpmath==1.3.0
msgpack==1.1.2
multidict==6.7.0
multiprocess==0.70.15
networkx==3.1
ninja==1.13.0
numpy==1.26.4
nvidia-cublas-cu12==12.8.4.1
nvidia-cuda-cupti-cu12==12.8.90
nvidia-cuda-nvrtc-cu12==12.8.93
nvidia-cuda-runtime-cu12==12.8.90
nvidia-cudnn-cu12==9.10.2.21
nvidia-cufft-cu12==11.3.3.83
nvidia-cufile-cu12==1.13.1.3
nvidia-curand-cu12==10.3.9.90
nvidia-cusolver-cu12==11.7.3.90
nvidia-cusparse-cu12==12.5.8.93
nvidia-cusparselt-cu12==0.7.1
nvidia-ml-py==13.580.82
nvidia-nccl-cu12==2.27.5
nvidia-nvjitlink-cu12==12.8.93
nvidia-nvshmem-cu12==3.3.20
nvidia-nvtx-cu12==12.8.90
packaging==25.0
pandas==2.3.3
pillow==11.3.0
pluggy==1.6.0
propcache==0.4.1
protobuf==3.20.3
psutil==7.1.1
py-cpuinfo==9.0.0
pyarrow==21.0.0
pydantic==2.12.3
pydantic_core==2.41.4
pytest==7.4.0
python-dateutil==2.9.0.post0
pytz==2023.3
PyYAML==6.0.3
regex==2025.10.23
requests==2.32.5
safetensors==0.6.2
sentencepiece==0.1.99
setuptools==80.9.0
sh==2.0.4
six==1.17.0
sniffio==1.3.1
sympy==1.14.0
termcolor==3.1.0
tokenizers==0.20.1
tqdm==4.67.1
transformers==4.46.3
triton==3.5.0
typing-inspection==0.4.2
typing_extensions==4.15.0
tzdata==2025.2
urllib3==2.5.0
wheel==0.45.1
xxhash==3.6.0
yarl==1.22.0
6 changes: 1 addition & 5 deletions applications/DeepSpeed-Chat/training/run_opt-1.3b_zero0.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,7 @@ echo >> $NAME_LOG
first_line=$(head -n 1 "$HOSTFILE_NAME")
master_addr=$(echo "$first_line" | awk '{print $1}')

deepspeed_path=$(which deepspeed)
if [ -z "$deepspeed_path" ]; then
# deepspeed was not found in the system path, so hardcode the path
deepspeed_path="/home/ubuntu/.local/bin/deepspeed"
fi
deepspeed_path="deepspeed"

source ./setup_env.sh $MODEL_NAME $STEP_NAME && \
NCCL_DEBUG=INFO PROJECT_PATH=${PROJECT_PATH} $deepspeed_path --hostfile=$HOSTFILE_NAME --master_addr $master_addr $SCRIPT_PATH/main.py \
Expand Down
Loading