Dockerfile: 4 changes (3 additions, 1 deletion)

@@ -11,6 +11,8 @@ RUN useradd -m -u $USER -s /bin/bash $USERNAME \

 COPY --chown=$USERNAME ./requirements.txt ./
 COPY --chown=$USERNAME transformers/ ./transformers
-RUN pip install -r requirements.txt
+
+# Stock version of pip doesn't work with editable transformers.
+RUN pip install --upgrade pip --no-cache-dir && pip install -r requirements.txt --no-cache-dir
 
 COPY --chown=$USERNAME src/ ./src
Makefile: 52 changes (16 additions, 36 deletions)

@@ -6,69 +6,49 @@ style:

 batch_size := 1
 
-install-mqa-transformers:
-	git clone https://github.com/bigcode-project/transformers.git; \
-	cd transformers; \
-	git checkout mayank/multi_query; \
-	pip install .; \
-	cd ..; \
-	rm -rf transformers;
+install:
+	git submodule update --init
+	pip install -r requirements.txt

 # BLOOM AliBi
 hf-1b-bloom-fp32:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype float32 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class BLOOM --dtype float32 --batch_size ${batch_size}
[Review comment, PR author] @mayank31398 I don't see why we should have it as a module. It's a script and not installed as a package.


 hf-1b-bloom-bf16:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype bfloat16 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class BLOOM --dtype bfloat16 --batch_size ${batch_size}
 
 hf-1b-bloom-int8:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype int8 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class BLOOM --dtype int8 --batch_size ${batch_size}
 
 ds-inference-1b-bloom-fp16:
-	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class BLOOM --batch_size ${batch_size}
+	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Pipeline --model_class BLOOM --batch_size ${batch_size}
 
 # GPT2 MHA
 hf-1b-GPT2-mha-fp32:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype float32 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype float32 --batch_size ${batch_size}
 
 hf-1b-GPT2-mha-bf16:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype bfloat16 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype bfloat16 --batch_size ${batch_size}
 
 hf-1b-GPT2-mha-int8:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size}
 
 ds-inference-1b-GPT2-mha-fp16:
-	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --batch_size ${batch_size}
-
-# GPT2 MQA
-hf-1b-GPT2-mqa-fp32:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype float32 --batch_size ${batch_size}
-
-hf-1b-GPT2-mqa-bf16:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype bfloat16 --batch_size ${batch_size}
-
-hf-1b-GPT2-mqa-int8:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype int8 --batch_size ${batch_size}
-
-ds-inference-1b-GPT2-mqa-fp16:
-	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --batch_size ${batch_size}
+	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --batch_size ${batch_size}
 
 # GPT2 MQA1
 hf-1b-GPT2-mqa1-fp32:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype float32 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype float32 --batch_size ${batch_size}
 
 hf-1b-GPT2-mqa1-bf16:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype bfloat16 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype bfloat16 --batch_size ${batch_size}
 
 hf-1b-GPT2-mqa1-int8:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype int8 --batch_size ${batch_size}
-
-ds-inference-1b-GPT2-mqa1-fp16:
-	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype int8 --batch_size ${batch_size}
 
 # Input length experiments
 hf-1b-GPT2-mqa1-int8-input-length:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length}
 
 hf-1b-GPT2-mha-int8-input-length:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length}
requirements.txt: 2 changes (1 addition, 1 deletion)

@@ -1,7 +1,7 @@
 accelerate==0.15.0
 bitsandbytes
 deepspeed==0.7.7
-./transformers
+-e ./transformers
 
 # TODO: Dev only
 isort>=5.5.4
src/__init__.py: empty file added
src/main.py: 18 changes (11 additions, 7 deletions)

@@ -1,19 +1,23 @@
-import pipelines
-from utils import benchmark_end_to_end, get_arg_parser, get_args, get_dummy_batch
+from typing import List, Optional
+
+from src.pipelines import get_pipeline_class
+from src.utils.arguments import parse_args
+from src.utils.benchmark import benchmark_end_to_end
+from src.utils.input import get_dummy_batch
+from src.utils.logging import configure_logging
 
-def main() -> None:
-    # deepspeed.init_distributed("nccl")
-
-    args = get_args(get_arg_parser())
+def main(argv: Optional[List[str]] = None) -> None:
+    args = parse_args(argv=argv)
 
     inputs = get_dummy_batch(args.batch_size, args.max_input_length)
 
-    generate_kwargs = dict(max_new_tokens=args.max_new_tokens, do_sample=False)
+    generate_kwargs = {"max_new_tokens": args.max_new_tokens, "do_sample": False}
 
-    pipeline_class = getattr(pipelines, args.pipeline_class)
+    pipeline_class = get_pipeline_class(args.pipeline_class)
     benchmark_end_to_end(args, pipeline_class, inputs, generate_kwargs)
 
 
 if __name__ == "__main__":
+    configure_logging()
     main()
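
Side note on the new signature: `main(argv: Optional[List[str]] = None)` lets callers pass arguments explicitly instead of relying on `sys.argv`. A minimal sketch of a programmatic invocation (not part of the diff), assuming the repository root is on `PYTHONPATH`; the flag values mirror the hf-1b-bloom-fp32 Makefile target:

# Hypothetical driver, for illustration only: invoke the benchmark
# entry point programmatically. Flag names come from the Makefile
# targets above; everything else is an assumption.
from src.main import main

main(argv=[
    "--hidden_size", "2048",
    "--n_head", "16",
    "--n_layer", "24",
    "--pipeline_class", "HF_Pipeline",
    "--model_class", "BLOOM",
    "--dtype", "float32",
    "--batch_size", "1",
])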
src/pipelines/__init__.py: 14 changes (11 additions, 3 deletions)

@@ -1,3 +1,11 @@
-from .ds_inference import DS_Inference_Pipeline
-from .hf import HF_CPU_Pipeline, HF_GPU_Pipeline
-from .pipeline import Pipeline
+def get_pipeline_class(name):
+    if name == "HF_Pipeline":
+        from src.pipelines.transformers import HF_Pipeline
+
+        return HF_Pipeline
+    elif name == "DS_Pipeline":
+        from src.pipelines.ds import DS_Pipeline
+
+        return DS_Pipeline
+    else:
+        raise NotImplementedError(f"Unsupported pipeline class: {name}")
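
Worth noting (illustration, not part of the diff): resolving pipeline classes by name with imports inside each branch keeps the heavy dependencies lazy, so `deepspeed` is only imported when `DS_Pipeline` is actually requested. A minimal sketch of how the factory is consumed, mirroring `main.py`:

# get_pipeline_class resolves the --pipeline_class flag to a class;
# unknown names raise NotImplementedError.
from src.pipelines import get_pipeline_class

pipeline_class = get_pipeline_class("HF_Pipeline")  # lazy: deepspeed never imported
# pipeline = pipeline_class(args)  # args: argparse.Namespace from parse_args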
src/pipelines/ds.py: 25 changes (25 additions, 0 deletions)

@@ -0,0 +1,25 @@
+import os
+from argparse import Namespace
+
+import deepspeed
+import torch
+
+from src.pipelines.pipeline import Pipeline
+from src.utils.arguments import check_unused
+
+
+class DS_Pipeline(Pipeline):
+    def __init__(self, args: Namespace) -> None:
+        check_unused(args, {"device": torch.device("cuda")}, enforce=True)
+        # TODO: Works with other dtypes?
+        check_unused(args, {"dtype": torch.float16})
+        super().__init__(args)
+
+        self.model = deepspeed.init_inference(
+            self.model,
+            mp_size=int(os.getenv("WORLD_SIZE", "1")),
+            # base_dir="./",
+            dtype=args.dtype,
+            replace_with_kernel_inject=args.inject_kernel,
+            enable_cuda_graph=args.cuda_graph,
+        )
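
For context, a minimal standalone sketch of the wrapping `deepspeed.init_inference` performs here; the keyword arguments match the call above and the DeepSpeed 0.7.x API pinned in requirements.txt, but the model and values are illustrative assumptions, not this repo's code:

import os

import deepspeed
import torch
from transformers import AutoModelForCausalLM

# WORLD_SIZE is set per rank by the deepspeed launcher (deepspeed --num_gpus N),
# so mp_size equals the tensor-parallel degree of the launch.
model = AutoModelForCausalLM.from_pretrained("gpt2")  # placeholder checkpoint
model = deepspeed.init_inference(
    model,
    mp_size=int(os.getenv("WORLD_SIZE", "1")),
    dtype=torch.float16,
    replace_with_kernel_inject=True,  # swap in fused inference kernels
)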
src/pipelines/ds_inference.py: 39 changes (0 additions, 39 deletions)

This file was deleted.

src/pipelines/hf.py: 32 changes (0 additions, 32 deletions)

This file was deleted.
