Dockerfile: 4 changes (3 additions, 1 deletion)

@@ -11,6 +11,8 @@ RUN useradd -m -u $USER -s /bin/bash $USERNAME \

 COPY --chown=$USERNAME ./requirements.txt ./
 COPY --chown=$USERNAME transformers/ ./transformers
-RUN pip install -r requirements.txt
+
+# Stock version of pip doesn't work with editable transformers.
+RUN pip install --upgrade pip --no-cache-dir && pip install -r requirements.txt --no-cache-dir
 
 COPY --chown=$USERNAME src/ ./src
Makefile: 52 changes (16 additions, 36 deletions)

@@ -6,69 +6,49 @@ style:

 batch_size := 1
 
-install-mqa-transformers:
-	git clone https://github.com/bigcode-project/transformers.git; \
-	cd transformers; \
-	git checkout mayank/multi_query; \
-	pip install .; \
-	cd ..; \
-	rm -rf transformers;
+install:
+	git submodule update --init
+	pip install -r requirements.txt

 # BLOOM AliBi
 hf-1b-bloom-fp32:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype float32 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class BLOOM --dtype float32 --batch_size ${batch_size}
[Review comment, PR author] @mayank31398 I don't see why we should have it as a module. It's a script and not installed as a package.


 hf-1b-bloom-bf16:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype bfloat16 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class BLOOM --dtype bfloat16 --batch_size ${batch_size}
 
 hf-1b-bloom-int8:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype int8 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class BLOOM --dtype int8 --batch_size ${batch_size}
 
 ds-inference-1b-bloom-fp16:
-	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class BLOOM --batch_size ${batch_size}
+	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Pipeline --model_class BLOOM --batch_size ${batch_size}
 
 # GPT2 MHA
 hf-1b-GPT2-mha-fp32:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype float32 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype float32 --batch_size ${batch_size}
 
 hf-1b-GPT2-mha-bf16:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype bfloat16 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype bfloat16 --batch_size ${batch_size}
 
 hf-1b-GPT2-mha-int8:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size}
 
 ds-inference-1b-GPT2-mha-fp16:
-	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --batch_size ${batch_size}
-
-# GPT2 MQA
-hf-1b-GPT2-mqa-fp32:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype float32 --batch_size ${batch_size}
-
-hf-1b-GPT2-mqa-bf16:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype bfloat16 --batch_size ${batch_size}
-
-hf-1b-GPT2-mqa-int8:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype int8 --batch_size ${batch_size}
-
-ds-inference-1b-GPT2-mqa-fp16:
-	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --batch_size ${batch_size}
+	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --batch_size ${batch_size}
 
 # GPT2 MQA1
 hf-1b-GPT2-mqa1-fp32:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype float32 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype float32 --batch_size ${batch_size}
 
 hf-1b-GPT2-mqa1-bf16:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype bfloat16 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype bfloat16 --batch_size ${batch_size}
 
 hf-1b-GPT2-mqa1-int8:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype int8 --batch_size ${batch_size}
-
-ds-inference-1b-GPT2-mqa1-fp16:
-	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype int8 --batch_size ${batch_size}
 
 # Input length experiments
 hf-1b-GPT2-mqa1-int8-input-length:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length}
 
 hf-1b-GPT2-mha-int8-input-length:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length}
requirements.txt: 2 changes (1 addition, 1 deletion)

@@ -1,7 +1,7 @@
 accelerate==0.15.0
 bitsandbytes
 deepspeed==0.7.7
-./transformers
+-e ./transformers
 
 # TODO: Dev only
 isort>=5.5.4
src/__init__.py: empty file added
src/main.py: 18 changes (11 additions, 7 deletions)

@@ -1,19 +1,23 @@
-import pipelines
-from utils import benchmark_end_to_end, get_arg_parser, get_args, get_dummy_batch
+from typing import List, Optional
+
+from src.pipelines import get_pipeline_class
+from src.utils.arguments import parse_args
+from src.utils.benchmark import benchmark_end_to_end
+from src.utils.input import get_dummy_batch
+from src.utils.logging import configure_logging
 
-def main() -> None:
-    # deepspeed.init_distributed("nccl")
-
-    args = get_args(get_arg_parser())
+def main(argv: Optional[List[str]] = None) -> None:
+    args = parse_args(argv=argv)
 
     inputs = get_dummy_batch(args.batch_size, args.max_input_length)
 
-    generate_kwargs = dict(max_new_tokens=args.max_new_tokens, do_sample=False)
+    generate_kwargs = {"max_new_tokens": args.max_new_tokens, "do_sample": False}
 
-    pipeline_class = getattr(pipelines, args.pipeline_class)
+    pipeline_class = get_pipeline_class(args.pipeline_class)
     benchmark_end_to_end(args, pipeline_class, inputs, generate_kwargs)
 
 
 if __name__ == "__main__":
+    configure_logging()
     main()
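
Side note on the new signature: `main(argv: Optional[List[str]] = None)` lets callers pass arguments explicitly instead of relying on `sys.argv`. A minimal sketch of a programmatic invocation (not part of the diff), assuming the repository root is on `PYTHONPATH`; the flag values mirror the hf-1b-bloom-fp32 Makefile target:

# Hypothetical driver, for illustration only: invoke the benchmark
# entry point programmatically. Flag names come from the Makefile
# targets above; everything else is an assumption.
from src.main import main

main(argv=[
    "--hidden_size", "2048",
    "--n_head", "16",
    "--n_layer", "24",
    "--pipeline_class", "HF_Pipeline",
    "--model_class", "BLOOM",
    "--dtype", "float32",
    "--batch_size", "1",
])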
src/pipelines/__init__.py: 14 changes (11 additions, 3 deletions)

@@ -1,3 +1,11 @@
-from .ds_inference import DS_Inference_Pipeline
-from .hf import HF_CPU_Pipeline, HF_GPU_Pipeline
-from .pipeline import Pipeline
+def get_pipeline_class(name):
+    if name == "HF_Pipeline":
+        from src.pipelines.transformers import HF_Pipeline
+
+        return HF_Pipeline
+    elif name == "DS_Pipeline":
+        from src.pipelines.ds import DS_Pipeline
+
+        return DS_Pipeline
+    else:
+        raise NotImplementedError(f"Unsupported pipeline class: {name}")
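
Worth noting (illustration, not part of the diff): resolving pipeline classes by name with imports inside each branch keeps the heavy dependencies lazy, so `deepspeed` is only imported when `DS_Pipeline` is actually requested. A minimal sketch of how the factory is consumed, mirroring `main.py`:

# get_pipeline_class resolves the --pipeline_class flag to a class;
# unknown names raise NotImplementedError.
from src.pipelines import get_pipeline_class

pipeline_class = get_pipeline_class("HF_Pipeline")  # lazy: deepspeed never imported
# pipeline = pipeline_class(args)  # args: argparse.Namespace from parse_args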
src/pipelines/ds.py: 25 changes (25 additions, 0 deletions)

@@ -0,0 +1,25 @@
+import os
+from argparse import Namespace
+
+import deepspeed
+import torch
+
+from src.pipelines.pipeline import Pipeline
+from src.utils.arguments import check_unused
+
+
+class DS_Pipeline(Pipeline):
+    def __init__(self, args: Namespace) -> None:
+        check_unused(args, {"device": torch.device("cuda")}, enforce=True)
+        # TODO: Works with other dtypes?
+        check_unused(args, {"dtype": torch.float16})
+        super().__init__(args)
+
+        self.model = deepspeed.init_inference(
+            self.model,
+            mp_size=int(os.getenv("WORLD_SIZE", "1")),
+            # base_dir="./",
+            dtype=args.dtype,
+            replace_with_kernel_inject=args.inject_kernel,
+            enable_cuda_graph=args.cuda_graph,
+        )
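
For context, a minimal standalone sketch of the wrapping `deepspeed.init_inference` performs here; the keyword arguments match the call above and the DeepSpeed 0.7.x API pinned in requirements.txt, but the model and values are illustrative assumptions, not this repo's code:

import os

import deepspeed
import torch
from transformers import AutoModelForCausalLM

# WORLD_SIZE is set per rank by the deepspeed launcher (deepspeed --num_gpus N),
# so mp_size equals the tensor-parallel degree of the launch.
model = AutoModelForCausalLM.from_pretrained("gpt2")  # placeholder checkpoint
model = deepspeed.init_inference(
    model,
    mp_size=int(os.getenv("WORLD_SIZE", "1")),
    dtype=torch.float16,
    replace_with_kernel_inject=True,  # swap in fused inference kernels
)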
src/pipelines/ds_inference.py: 39 changes (0 additions, 39 deletions)

This file was deleted.

src/pipelines/hf.py: 32 changes (0 additions, 32 deletions)

This file was deleted.
