7 changes: 7 additions & 0 deletions Makefile
@@ -59,3 +59,10 @@ hf-1b-GPT2-mqa1-int8:
 
 ds-inference-1b-GPT2-mqa1-fp16:
 	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --batch_size ${batch_size}
+
+# Input length experiments
+hf-1b-GPT2-mqa1-int8-input-length:
+	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length}
+
+hf-1b-GPT2-mha-int8-input-length:
+	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length}
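The two new targets mirror the existing int8 recipes but forward a `max_input_length` variable to `src/main.py`; judging by the target names, `--attention_type 3` selects the multi-query (MQA) variant and `--attention_type 1` the multi-head (MHA) baseline. A one-off invocation, as a sketch (any batch size / length pair works):

```bash
make hf-1b-GPT2-mqa1-int8-input-length batch_size=4 max_input_length=512
```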
Empty file modified: run.sh (mode 100644 → 100755)
8 changes: 8 additions & 0 deletions run_input_length.sh
@@ -0,0 +1,8 @@
+export CUDA_VISIBLE_DEVICES=0
+
+rm -rf ./tmp
+
+for max_input_length in {1900,1024,512,256,128,64,32,16,8,4}
+do
+    make $1 batch_size=4 max_input_length=$max_input_length
+done
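The script takes the make target as `$1` and sweeps it over ten input lengths at a fixed batch size of 4. Typical usage would be (a sketch, assuming the script is made executable, as the mode change to `run.sh` above suggests is the convention here):

```bash
./run_input_length.sh hf-1b-GPT2-mqa1-int8-input-length
./run_input_length.sh hf-1b-GPT2-mha-int8-input-length
```

The cap of 1900 (rather than 2048) presumably leaves headroom for the generated tokens within the model's `--n_positions 2048` context window, given the default of 100 new tokens.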
9 changes: 6 additions & 3 deletions src/main.py
@@ -1,13 +1,16 @@
 import pipelines
-from utils import benchmark_end_to_end, get_arg_parser, get_args, get_dummy_batch
-
+from utils import benchmark_end_to_end, get_arg_parser, get_args, get_dummy_batch, get_dummy_batch_tokenizer
+from transformers import AutoTokenizer
+
 
 def main() -> None:
     # deepspeed.init_distributed("nccl")
 
     args = get_args(get_arg_parser())
 
-    inputs = get_dummy_batch(args.batch_size)
+    tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+    inputs = get_dummy_batch_tokenizer(args.batch_size, tokenizer, args.max_input_length)
 
     generate_kwargs = dict(max_new_tokens=args.max_new_tokens, do_sample=False)
 
     pipeline_class = getattr(pipelines, args.pipeline_class)
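GPT-2's pretrained tokenizer ships without a padding token, and batching prompts of uneven length with `padding=True` fails without one, hence the `add_special_tokens` call before building the batch. A minimal sketch of the behavior:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
print(tokenizer.pad_token)  # None: GPT-2 has no pad token out of the box

tokenizer.add_special_tokens({"pad_token": "[PAD]"})
batch = tokenizer(
    ["a short prompt", "a somewhat longer prompt"],
    return_tensors="pt",
    padding=True,  # now legal: shorter sequences are padded with [PAD]
)
print(batch.input_ids.shape)  # both rows padded to the longer sequence's length
```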
4 changes: 3 additions & 1 deletion src/pipelines/pipeline.py
@@ -11,9 +11,10 @@ def __init__(self, args: Namespace) -> None:
         self.config, self.tokenizer, self.model_class = get_config_tokenizer_model_class(args)
         self.model = None
         self.input_device = None
+        self.max_input_length = args.max_input_length
 
     def __call__(self, text: List[str], **generate_kwargs) -> Tuple[List[str], List[int]]:
-        input_tokens = self.tokenizer(text, return_tensors="pt", padding=True)
+        input_tokens = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=self.max_input_length)
 
         for t in input_tokens:
             if torch.is_tensor(input_tokens[t]):
@@ -25,6 +26,7 @@ def __call__(self, text: List[str], **generate_kwargs) -> Tuple[List[str], List[
         output_tokens = output.sequences
 
         input_token_lengths = [x.shape[0] for x in input_tokens.input_ids]
+        print("Input token lengths: ", input_token_lengths)
         output_token_lengths = [x.shape[0] for x in output_tokens]
         num_generated_tokens = [o - i for i, o in zip(input_token_lengths, output_token_lengths)]
 
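`truncation=True` together with `max_length=self.max_input_length` guarantees the measured prompt length never exceeds the requested budget, even when a dummy sentence re-tokenizes to more tokens than were originally sampled. A minimal illustration (assumes the same GPT-2 tokenizer setup as in `src/main.py`):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "[PAD]"})

long_text = ["word " * 1000]  # far more than 128 tokens
tokens = tokenizer(long_text, return_tensors="pt", padding=True, truncation=True, max_length=128)
print(tokens.input_ids.shape)  # torch.Size([1, 128]); the prompt is clipped to max_length
```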
2 changes: 1 addition & 1 deletion src/utils/__init__.py
@@ -1,3 +1,3 @@
 from .arguments import get_arg_parser, get_args
 from .benchmark import benchmark_end_to_end
-from .dummy import get_dummy_batch
+from .dummy import get_dummy_batch, get_dummy_batch_tokenizer
1 change: 1 addition & 0 deletions src/utils/arguments.py
@@ -9,6 +9,7 @@ def get_arg_parser() -> ArgumentParser:
     parser.add_argument("--model_class", default="GPT2", type=str)
     parser.add_argument("--batch_size", default=1, type=int)
     parser.add_argument("--dtype", default="bfloat16", type=str)
+    parser.add_argument("--max_input_length", default=100, type=int)
     parser.add_argument("--max_new_tokens", default=100, type=int)
     parser.add_argument("--local_rank", type=int)
     parser.add_argument("--hidden_size", type=int)
9 changes: 8 additions & 1 deletion src/utils/dummy.py
@@ -1,7 +1,7 @@
 import copy
 import math
 from typing import List
-
+import random
 
 dummy_input_sentences = [
     "DeepSpeed is a machine learning framework",
@@ -24,3 +24,10 @@ def get_dummy_batch(batch_size: int, input_sentences: List[str] = None) -> List[
     input_sentences = input_sentences[:batch_size]
 
     return input_sentences
+
+
+def get_dummy_batch_tokenizer(batch_size: int, tokenizer, max_input_length: int) -> List[str]:
+    input_sentences = []
+    for i in range(batch_size):
+        sentence = [random.randint(0, tokenizer.vocab_size - 1) for _ in range(max_input_length)]
+        input_sentences.append(tokenizer.decode(sentence))
+    return input_sentences
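One subtlety worth noting: decoding random token ids and re-encoding the resulting string is not length-preserving under BPE, so the sentences this helper returns typically do not tokenize back to exactly `max_input_length` tokens; the `truncation`/`max_length` added in `pipeline.py` is what enforces the hard cap. A hypothetical check, run from `src/` the way `main.py` is:

```python
from transformers import AutoTokenizer
from utils import get_dummy_batch_tokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "[PAD]"})

batch = get_dummy_batch_tokenizer(4, tokenizer, 64)
print([len(tokenizer(s).input_ids) for s in batch])  # near 64, but usually not exactly 64
```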