51 changes: 51 additions & 0 deletions README.md
@@ -126,3 +126,54 @@ Latency (sec)
| 1664 | oom | 13.45 | 13.36 | oom |
| 1792 | oom | 14.65 | 14.85 | oom |
| 1920 | oom | oom | oom | oom |

## GPT2 Multi-Query Attention
```python
hidden_size = 2048
n_head = 16
n_layer = 24
total_params = 1126889472
```
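
For context, multi-query attention keeps the `n_head` query heads of standard multi-head attention but shares a single key/value head across all of them, which shrinks the per-token KV cache by a factor of `n_head` (128 vs 2048 dims per key and value with the config above) and speeds up batched generation. The snippet below is a minimal sketch of the idea, not the benchmarked Hugging Face implementation; the function and weight names are illustrative:

```python
import torch

def multi_query_attention(x, w_q, w_kv, w_o, n_head):
    """Causal multi-query attention: n_head query heads, one shared key/value head."""
    batch, seq, hidden = x.shape
    head_dim = hidden // n_head

    # queries get a full set of heads
    q = (x @ w_q).view(batch, seq, n_head, head_dim).transpose(1, 2)   # (b, n_head, s, d)
    # keys/values are projected to a single head and broadcast over all query heads
    k, v = (x @ w_kv).split(head_dim, dim=-1)                          # (b, s, d) each
    k, v = k.unsqueeze(1), v.unsqueeze(1)                              # (b, 1, s, d)

    scores = (q @ k.transpose(-2, -1)) / head_dim**0.5                 # (b, n_head, s, s)
    causal = torch.triu(torch.ones(seq, seq, dtype=torch.bool, device=x.device), diagonal=1)
    att = torch.softmax(scores.masked_fill(causal, float("-inf")), dim=-1)

    out = (att @ v).transpose(1, 2).reshape(batch, seq, hidden)        # (b, s, hidden)
    return out @ w_o

# hypothetical weights matching the config above (head_dim = 2048 / 16 = 128)
x = torch.randn(1, 8, 2048)
w_q, w_o = torch.randn(2048, 2048), torch.randn(2048, 2048)
w_kv = torch.randn(2048, 2 * 128)  # only 2 * head_dim columns instead of 2 * hidden_size
print(multi_query_attention(x, w_q, w_kv, w_o, n_head=16).shape)  # torch.Size([1, 8, 2048])
```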

![image](images/GPT2-MQA-throughput.png)
![image](images/GPT2-MQA-latency.png)

Throughput (tokens/sec | msec/token)
| batch_size | HF (fp32) | HF (bf16) | HF (int8) |
|:----------:|:---------------:|:----------------:|:----------------:|
| 1 | 72.61 \| 13.77 | 68.89 \| 14.52 | 54.68 \| 18.29 |
| 2 | 139.03 \| 7.19 | 133.32 \| 7.50 | 106.70 \| 9.37 |
| 4 | 275.54 \| 3.63 | 273.12 \| 3.66 | 213.83 \| 4.68 |
| 8 | 538.85 \| 1.86 | 556.67 \| 1.80 | 432.10 \| 2.31 |
| 16 | 1015.47 \| 0.98 | 1096.44 \| 0.91 | 846.28 \| 1.18 |
| 32 | 1863.15 \| 0.54 | 2194.91 \| 0.46 | 1663.86 \| 0.60 |
| 64 | 3009.88 \| 0.33 | 4167.02 \| 0.24 | 3192.54 \| 0.31 |
| 128 | 3399.45 \| 0.29 | 6856.43 \| 0.15 | 5928.43 \| 0.17 |
| 256 | 4208.59 \| 0.24 | 11002.50 \| 0.09 | 9938.01 \| 0.10 |
| 512 | 4559.72 \| 0.22 | 13727.93 \| 0.07 | 13850.24 \| 0.07 |
| 1024 | 4969.87 \| 0.20 | 15122.67 \| 0.07 | 15604.99 \| 0.06 |
| 2048 | 5090.85 \| 0.20 | 16014.17 \| 0.06 | 16298.18 \| 0.06 |
| 4096 | 5212.22 \| 0.19 | 16570.20 \| 0.06 | 16884.37 \| 0.06 |
| 8192 | 5268.96 \| 0.19 | 16781.00 \| 0.06 | 17088.02 \| 0.06 |
| 16384 | oom | 16874.13 \| 0.06 | 17159.74 \| 0.06 |
| 32768 | oom | oom | oom |

Latency (sec)
| batch_size | HF (fp32) | HF (bf16) | HF (int8) |
|:----------:|:---------:|:---------:|:---------:|
| 1 | 1.38 | 1.45 | 1.83 |
| 2 | 1.44 | 1.50 | 1.87 |
| 4 | 1.45 | 1.46 | 1.87 |
| 8 | 1.48 | 1.44 | 1.85 |
| 16 | 1.58 | 1.46 | 1.89 |
| 32 | 1.72 | 1.46 | 1.92 |
| 64 | 2.13 | 1.54 | 2.00 |
| 128 | 3.77 | 1.87 | 2.16 |
| 256 | 6.08 | 2.33 | 2.58 |
| 512 | 11.23 | 3.73 | 3.70 |
| 1024 | 20.60 | 6.77 | 6.56 |
| 2048 | 40.23 | 12.79 | 12.57 |
| 4096 | 78.58 | 24.72 | 24.26 |
| 8192 | 155.48 | 48.82 | 47.94 |
| 16384 | oom | 97.10 | 95.48 |
| 32768 | oom | oom | oom |
Binary file modified images/BLOOM-latency.png
Binary file modified images/BLOOM-throughput.png
Binary file modified images/GPT2-MHA-latency.png
Binary file modified images/GPT2-MHA-throughput.png
Binary file added images/GPT2-MQA-latency.png
Binary file added images/GPT2-MQA-throughput.png
5 changes: 4 additions & 1 deletion scripts/make_graph_throughput.py
@@ -46,7 +46,10 @@ def parse_line(line: str, plot: str = "throughput") -> str:

def parse_data(data: list):
    x = []
    y = [[], [], [], []]
    y = []
    for i in range(len(data[0]) - 1):
        y.append([])

    for dp in data:
        x.append(dp[0])
        for i in range(1, len(dp)):
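The change above sizes `y` from the parsed data instead of hard-coding four series, so the plotting script also handles runs with a different number of columns (for example the three HF columns in the MQA tables). A standalone sketch of the intent, with made-up rows and a guessed loop body, since the rest of `parse_data` is not shown in this diff:

```python
# hypothetical parsed rows: batch_size followed by one value per benchmark column
data = [
    [1, 72.61, 68.89, 54.68],
    [2, 139.03, 133.32, 106.70],
]

x = []
y = [[] for _ in range(len(data[0]) - 1)]  # one series per value column, however many exist

for dp in data:
    x.append(dp[0])
    for i in range(1, len(dp)):
        y[i - 1].append(dp[i])  # assumed continuation; the original body is truncated here

print(x)  # [1, 2]
print(y)  # [[72.61, 139.03], [68.89, 133.32], [54.68, 106.70]]
```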
148 changes: 148 additions & 0 deletions scripts/parse_logs.py
@@ -0,0 +1,148 @@
import argparse
import copy
import os
from typing import Optional, Tuple

from markdownTable import markdownTable
from pandas import DataFrame


def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()

    parser.add_argument("--input_dir", type=str, required=True)

    args = parser.parse_args()
    return args


def parse_line(line: str) -> Tuple[Optional[str], Optional[str]]:
    # returns (value, metric) for a recognized benchmark log line, or (None, None)
    line = line.strip()

    if line.endswith("tokens/sec"):
        line = line.split("Throughput (including tokenization) = ")[1]
        line = line.split(" tokens/sec")[0]

        return line, "throughput"
    elif line.endswith("msecs/token"):
        line = line.split("Throughput (including tokenization) = ")[1]
        line = line.split(" msecs/token")[0]

        return line, "inverse_throughput"
    elif line.startswith("Latency = ") and line.endswith("secs"):
        line = line.split("Latency = ")[1]
        line = line.split(" secs")[0]

        return line, "latency"
    elif "with batch size = " in line:
        line = line.split("with batch size = ")[1]

        return line, "batch_size"

    return None, None


def get_throughput_dataframe(results: dict, order: list) -> DataFrame:
    throughput = copy.deepcopy(results["throughput"])
    for key in results["inverse_throughput"]:
        for index, value in enumerate(results["inverse_throughput"][key]):
            # escape the pipe so "tokens/sec \| msec/token" survives inside a markdown cell
            throughput[key][index] = throughput[key][index] + " \\| " + value

    max_rows = -1
    batch_size_column = None
    for key in results["batch_size"]:
        bs = len(results["batch_size"][key])

        if bs > max_rows:
            max_rows = bs
            batch_size_column = results["batch_size"][key]

    # pad shorter columns with "oom" so every column spans all batch sizes
    for key in throughput:
        while len(throughput[key]) < max_rows:
            throughput[key].append("oom")
    throughput["batch_size"] = batch_size_column

    df = DataFrame(throughput)
    df = df.loc[:, order]

    return df


def get_latency_dataframe(results: dict, order: list) -> DataFrame:
    latency = copy.deepcopy(results["latency"])

    max_rows = -1
    batch_size_column = None
    for key in results["batch_size"]:
        bs = len(results["batch_size"][key])

        if bs > max_rows:
            max_rows = bs
            batch_size_column = results["batch_size"][key]

    # pad shorter columns with "oom" so every column spans all batch sizes
    for key in latency:
        while len(latency[key]) < max_rows:
            latency[key].append("oom")
    latency["batch_size"] = batch_size_column

    df = DataFrame(latency)
    df = df.loc[:, order]

    return df


def make_table(results: dict):
    order = ["batch_size", "HF (fp32)", "HF (bf16)", "HF (int8)"]

    kwargs = dict(
        row_sep="markdown",
        padding_width=1,
    )

    throughput = get_throughput_dataframe(results, order)
    throughput = throughput.to_dict(orient="records")
    # markdownTable wraps its output in ``` fences; keep only the table body
    throughput = markdownTable(throughput).setParams(**kwargs).getMarkdown().split("```")[1]

    latency = get_latency_dataframe(results, order)
    latency = latency.to_dict(orient="records")
    latency = markdownTable(latency).setParams(**kwargs).getMarkdown().split("```")[1]

    return throughput, latency


def main() -> None:
    args = get_args()

    input_files = os.listdir(args.input_dir)
    results = {"throughput": {}, "inverse_throughput": {}, "latency": {}, "batch_size": {}}
    # map each log file to the column name used in the README tables
    filename_column = {
        "fp32.log": "HF (fp32)",
        "bf16.log": "HF (bf16)",
        "int8.log": "HF (int8)",
        "fp16.log": "DS-inference (fp16)",
    }

    for filename in input_files:
        with open(os.path.join(args.input_dir, filename), "r") as f:
            lines = f.readlines()

        for line in lines:
            value, key = parse_line(line)

            if key is not None:
                column_name = filename_column[filename]
                if column_name not in results[key]:
                    results[key][column_name] = []
                results[key][column_name].append(value)

    throughput, latency = make_table(results)

    print("Throughput (tokens/sec | msec/token)")
    print(throughput)
    print()
    print("Latency (sec)")
    print(latency)


if __name__ == "__main__":
    main()
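
A typical invocation would point the script at a directory of benchmark logs, e.g. `python scripts/parse_logs.py --input_dir logs/gpt2-mqa` (the directory name here is made up). It expects the directory to contain files named `fp32.log`, `bf16.log`, `int8.log`, and/or `fp16.log`, and prints throughput and latency markdown tables in the same format as the README section above.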
1 change: 1 addition & 0 deletions run_batch_size.sh → scripts/run_batch_size.sh
@@ -7,6 +7,7 @@ do
make $1 batch_size=$bs
done

# split for loops
for i in {1..20}
do
bs=$(($i*128))
10 changes: 10 additions & 0 deletions scripts/run_batch_size1.sh
@@ -0,0 +1,10 @@
export CUDA_VISIBLE_DEVICES=0

rm -rf ./tmp

# split for loops
for i in {0..20}
do
bs=$((2**$i))
make $1 batch_size=$bs
done
File renamed without changes.