Commit c952261

Merge pull request #3 from LambdaLabsML/raj_new_libraries
Dockerize the deepspeed tests.
2 parents 9f4c4bd + 29ed7a1 commit c952261

19 files changed: +359 additions, -68 deletions

.gitignore

Lines changed: 6 additions & 0 deletions
@@ -129,3 +129,9 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+
+# deepspeed logs
+*.log
+*.log
+**/*.log

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
.git
.cache
*.pyc
__pycache__
*.egg-info
venv/
.venv/
.env
*.log
outputs/
checkpoints/
.DS_Store

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
# Log files
*.log
**/*.log

# Output directories
output/
logs/

# Temporary files
*.tmp
*.temp

# Python cache
__pycache__/
*.pyc
*.pyo

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1

# Install system dependencies including SSH and nvtop
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3.10-dev \
    python3-pip \
    python3-pybind11 \
    git \
    curl \
    wget \
    build-essential \
    openssh-server \
    openssh-client \
    nvtop \
    && rm -rf /var/lib/apt/lists/*

# Set python3.10 as default
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 && \
    update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1

# Configure SSH
RUN mkdir -p /var/run/sshd && \
    sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
    sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config

# Setup passwordless SSH for localhost
RUN mkdir -p /root/.ssh && chmod 700 /root/.ssh && \
    ssh-keygen -t rsa -N "" -f /root/.ssh/id_rsa && \
    cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys && \
    chmod 600 /root/.ssh/authorized_keys /root/.ssh/id_rsa && \
    echo "StrictHostKeyChecking no" > /root/.ssh/config && \
    echo "UserKnownHostsFile=/dev/null" >> /root/.ssh/config && \
    chmod 600 /root/.ssh/config

# Upgrade pip
RUN python -m pip install --upgrade pip setuptools wheel

WORKDIR /workspace

# Install PyTorch first with CUDA 11.8 support
RUN pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118

# Copy and install other dependencies
COPY training/requirements_freeze.txt ./
RUN pip install -r requirements_freeze.txt

# Copy project code
COPY . .

CMD ["/bin/bash"]

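The SSH steps above exist because DeepSpeed's multi-node launcher reaches each worker node over SSH before spawning training processes. A minimal sketch of that flow, assuming an illustrative hostfile and training script (neither file is part of this commit):

```
# Illustrative only: host names, slot counts, and train.py are placeholders,
# not files from this repository.
cat > hostfile <<'EOF'
node-001 slots=8
node-002 slots=8
EOF

# The deepspeed launcher SSHes into each listed host (using the passwordless
# key baked into the image) and starts one worker per slot.
deepspeed --hostfile=hostfile train.py
```
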
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
.PHONY: build run stop clean shell logs

build:
	docker compose build

run:
	docker compose up -d

stop:
	docker compose down

clean:
	docker compose down -v
	docker system prune -f

shell:
	docker compose exec deepspeed-training /bin/bash

logs:
	docker compose logs -f

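The targets are thin wrappers over docker compose; a typical session with them (the same flow the Tutorial below walks through) looks like:

```
make build   # build the deepspeed-training image
make run     # start the container in the background
make shell   # open an interactive shell inside it
# ...run training from that shell...
make stop    # tear the container down when finished
```
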
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
services:
  deepspeed-training:
    build:
      context: .
      dockerfile: Dockerfile
    image: deepspeed-training:latest
    container_name: deepspeed-training

    privileged: true
    runtime: nvidia

    command: /bin/bash -c "/usr/sbin/sshd && tail -f /dev/null"

    environment:
      - PROJECT_PATH=${PROJECT_PATH:-/workspace}
      - HF_HOME=${PROJECT_PATH:-/workspace}/.cache/huggingface
      - TRANSFORMERS_CACHE=${PROJECT_PATH:-/workspace}/.cache/huggingface/transformers
      - HF_DATASETS_CACHE=${PROJECT_PATH:-/workspace}/.cache/huggingface/datasets
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility

    volumes:
      # Mount DeepSpeedExamples repo to /workspace/DeepSpeedExamples
      - ${HOST_REPO_PATH}:${PROJECT_PATH}/DeepSpeedExamples:rw

      # Mount cache to /workspace/.cache
      - ${HOST_CACHE_PATH}:${PROJECT_PATH}/.cache:rw

    stdin_open: true
    tty: true
    ipc: host
    shm_size: '32gb'

    working_dir: /workspace/DeepSpeedExamples/applications/DeepSpeed-Chat/training
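
The compose file reads HOST_REPO_PATH, HOST_CACHE_PATH, and PROJECT_PATH from the environment (the Tutorial below creates them in a .env file) and exposes all GPUs through the NVIDIA runtime. An illustrative sanity check once the container is up:

```
# Illustrative checks; the service name deepspeed-training comes from this compose file.
docker compose exec deepspeed-training nvidia-smi
docker compose exec deepspeed-training python -c "import torch; print(torch.cuda.device_count())"
docker compose exec deepspeed-training ls /workspace/DeepSpeedExamples
```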

applications/DeepSpeed-Chat/requirements_freeze.txt

Lines changed: 0 additions & 12 deletions
This file was deleted.
Lines changed: 9 additions & 9 deletions
@@ -1,9 +1,9 @@
-PROJECT_PATH=/home/ubuntu/ml-1cc/benchmark
-OMPI_MCA_btl_tcp_if_include=eno1
-UCX_TLS=self,shm,tcp
-NCCL_P2P_LEVEL=NVL
-NCCL_NET_GDR_LEVEL=PIX
-NCCL_IB_HCA='=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8'
-NCCL_IB_PCI_RELAXED_ORDERING=1
-NCCL_SOCKET_IFNAME=eno1
-NCCL_DEBUG=WARN
+export PROJECT_PATH=/srv/nfs/staging/
+export OMPI_MCA_btl_tcp_if_include=eno1
+export UCX_TLS=self,shm,tcp
+export NCCL_P2P_LEVEL=NVL
+export NCCL_NET_GDR_LEVEL=PIX
+export NCCL_IB_HCA='=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8'
+export NCCL_IB_PCI_RELAXED_ORDERING=1
+export NCCL_SOCKET_IFNAME=eno1
+export NCCL_DEBUG=WARN
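
Switching from plain KEY=VALUE lines to `export KEY=VALUE` makes this file directly source-able, so the NCCL/UCX settings reach child processes such as the DeepSpeed launcher instead of staying shell-local. A minimal sketch, assuming the run scripts source this file (those scripts are not shown in this diff):

```
# Hypothetical wiring; the actual filename and launch script are not part of this diff.
source ./deepspeed_env.sh        # without `export`, these would be shell-local variables only
echo "$NCCL_SOCKET_IFNAME"       # -> eno1
deepspeed train.py               # locally spawned workers inherit the exported NCCL_*/UCX_* values
```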

applications/DeepSpeed-Chat/training/Tutorial.md

Lines changed: 40 additions & 4 deletions
@@ -1,6 +1,42 @@
 # Benchmark DeepSpeed-Chat Training on Lambda Machines
+## Docker-based setup
 
-## Usage
+#### Step 1: Configure Environment
+
+Create a .env file with your paths:
+
+```
+cat > .env << EOF
+# HOST PATHS
+HOST_CACHE_PATH=/srv/nfs/staging/.cache
+HOST_REPO_PATH=/srv/nfs/staging/DeepSpeedExamples
+
+# CONTAINER PATHS (inside Docker)
+PROJECT_PATH=/workspace
+EOF
+```
+
+Adjust HOST\_CACHE\_PATH and HOST\_REPO\_PATH to match your storage locations.
+
+#### Step 2: Build and Run Container
+
+```
+make build   # Build Docker image
+make run     # Start container
+make shell   # Enter container shell
+```
+
+Inside the container, you'll be at /workspace/DeepSpeedExamples/applications/DeepSpeed-Chat/training.
+
+#### Step 3: Run the commands below to start training
+
+```
+./run_batch.sh run_opt-350m_bs24_zero0 hostfiles/1node_1xN/ output/raj_1xN_opt-350m_bs24 3000
+./run_batch.sh run_opt-13b_bs16_zero0 hostfiles/1node_1xN/ output/raj_1xN_opt-13b_bs16_zero0/ 600
+```
+
+
+## Usage (non-Docker setup)
 
 #### Step 1: Set up the environment
 
@@ -10,13 +46,13 @@
 pip install deepspeed==0.10.0 && \
 sudo apt-get update && sudo apt-get install -y python3-pybind11 && \
 wget https://raw.githubusercontent.com/LambdaLabsML/DeepSpeedExamples/master/applications/DeepSpeed-Chat/requirements_freeze.txt && \
-pip install --upgrade -r requirements_freeze.txt && \
-rm requirements_freeze.txt
+xargs -a requirements_freeze.txt -I{} sh -c 'pip install --upgrade "{}" || echo "SKIPPED: {}"'
 ```
 
 Note: you can blast this installation across a cluster with the `install_dependencies.sh` script.
 ```
-./install_dependencies.sh <path-to-list-of-nodes.txt>
+#./install_dependencies.sh <path-to-list-of-nodes.txt>
+#./install_dependencies.sh ./hostfile/1node_1xN
 ```
 
 ```
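
While a Step 3 run is in flight, it can be watched from the host without entering the container; an illustrative combination of the Makefile target and the tools installed in the image:

```
make logs                                      # follow container stdout/stderr (docker compose logs -f)
docker compose exec deepspeed-training nvtop   # GPU utilization; nvtop is installed in the Dockerfile above
```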
Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
accelerate==0.34.2
aiohappyeyeballs==2.6.1
aiohttp==3.13.1
aiosignal==1.4.0
annotated-types==0.7.0
anyio==4.11.0
attrs==25.4.0
certifi==2025.10.5
charset-normalizer==3.4.4
datasets==4.3.0
deepspeed==0.18.1
dill==0.3.7
einops==0.8.1
filelock==3.20.0
fire==0.5.0
frozenlist==1.8.0
fsspec==2023.9.2
h11==0.16.0
hf-xet==1.1.10
hjson==3.1.0
httpcore==1.0.9
httpx==0.28.1
huggingface-hub==0.36.0
idna==3.11
iniconfig==2.3.0
Jinja2==3.1.6
loguru==0.7.0
MarkupSafe==3.0.3
mpmath==1.3.0
msgpack==1.1.2
multidict==6.7.0
multiprocess==0.70.15
networkx==3.1
ninja==1.13.0
numpy==1.26.4
nvidia-cublas-cu12==12.8.4.1
nvidia-cuda-cupti-cu12==12.8.90
nvidia-cuda-nvrtc-cu12==12.8.93
nvidia-cuda-runtime-cu12==12.8.90
nvidia-cudnn-cu12==9.10.2.21
nvidia-cufft-cu12==11.3.3.83
nvidia-cufile-cu12==1.13.1.3
nvidia-curand-cu12==10.3.9.90
nvidia-cusolver-cu12==11.7.3.90
nvidia-cusparse-cu12==12.5.8.93
nvidia-cusparselt-cu12==0.7.1
nvidia-ml-py==13.580.82
nvidia-nccl-cu12==2.27.5
nvidia-nvjitlink-cu12==12.8.93
nvidia-nvshmem-cu12==3.3.20
nvidia-nvtx-cu12==12.8.90
packaging==25.0
pandas==2.3.3
pillow==11.3.0
pluggy==1.6.0
propcache==0.4.1
protobuf==3.20.3
psutil==7.1.1
py-cpuinfo==9.0.0
pyarrow==21.0.0
pydantic==2.12.3
pydantic_core==2.41.4
pytest==7.4.0
python-dateutil==2.9.0.post0
pytz==2023.3
PyYAML==6.0.3
regex==2025.10.23
requests==2.32.5
safetensors==0.6.2
sentencepiece==0.1.99
setuptools==80.9.0
sh==2.0.4
six==1.17.0
sniffio==1.3.1
sympy==1.14.0
termcolor==3.1.0
tokenizers==0.20.1
tqdm==4.67.1
transformers==4.46.3
triton==3.5.0
typing-inspection==0.4.2
typing_extensions==4.15.0
tzdata==2025.2
urllib3==2.5.0
wheel==0.45.1
xxhash==3.6.0
yarl==1.22.0
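
Because the Dockerfile installs this pinned list on top of its own PyTorch build, a quick post-install sanity check inside the container is worthwhile; an illustrative one:

```
# Illustrative verification that the pinned stack imports cleanly and sees the GPUs.
python -c "import torch, transformers, deepspeed; print(torch.__version__, transformers.__version__, deepspeed.__version__)"
python -c "import torch; print(torch.cuda.is_available())"
ds_report   # DeepSpeed's environment report, bundled with the deepspeed package
```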
