51 changes: 51 additions & 0 deletions README.md
@@ -126,3 +126,54 @@ Latency (sec)
| 1664 | oom | 13.45 | 13.36 | oom |
| 1792 | oom | 14.65 | 14.85 | oom |
| 1920 | oom | oom | oom | oom |

## GPT2 Multi-Query Attention
```python
hidden_size = 2048
n_head = 16
n_layer = 24
total_params = 1126889472
```
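
For context, multi-query attention keeps the `n_head` query heads of standard multi-head attention but shares a single key/value head across all of them, which shrinks the per-token KV cache by a factor of `n_head` (128 vs 2048 dims per key and value with the config above) and speeds up batched generation. The snippet below is a minimal sketch of the idea, not the benchmarked Hugging Face implementation; the function and weight names are illustrative:

```python
import torch

def multi_query_attention(x, w_q, w_kv, w_o, n_head):
    """Causal multi-query attention: n_head query heads, one shared key/value head."""
    batch, seq, hidden = x.shape
    head_dim = hidden // n_head

    # queries get a full set of heads
    q = (x @ w_q).view(batch, seq, n_head, head_dim).transpose(1, 2)   # (b, n_head, s, d)
    # keys/values are projected to a single head and broadcast over all query heads
    k, v = (x @ w_kv).split(head_dim, dim=-1)                          # (b, s, d) each
    k, v = k.unsqueeze(1), v.unsqueeze(1)                              # (b, 1, s, d)

    scores = (q @ k.transpose(-2, -1)) / head_dim**0.5                 # (b, n_head, s, s)
    causal = torch.triu(torch.ones(seq, seq, dtype=torch.bool, device=x.device), diagonal=1)
    att = torch.softmax(scores.masked_fill(causal, float("-inf")), dim=-1)

    out = (att @ v).transpose(1, 2).reshape(batch, seq, hidden)        # (b, s, hidden)
    return out @ w_o

# hypothetical weights matching the config above (head_dim = 2048 / 16 = 128)
x = torch.randn(1, 8, 2048)
w_q, w_o = torch.randn(2048, 2048), torch.randn(2048, 2048)
w_kv = torch.randn(2048, 2 * 128)  # only 2 * head_dim columns instead of 2 * hidden_size
print(multi_query_attention(x, w_q, w_kv, w_o, n_head=16).shape)  # torch.Size([1, 8, 2048])
```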

![image](images/GPT2-MQA-throughput.png)
![image](images/GPT2-MQA-latency.png)

Throughput (tokens/sec | msec/token)
| batch_size | HF (fp32) | HF (bf16) | HF (int8) |
|:----------:|:---------------:|:----------------:|:----------------:|
| 1 | 72.61 \| 13.77 | 68.89 \| 14.52 | 54.68 \| 18.29 |
| 2 | 139.03 \| 7.19 | 133.32 \| 7.50 | 106.70 \| 9.37 |
| 4 | 275.54 \| 3.63 | 273.12 \| 3.66 | 213.83 \| 4.68 |
| 8 | 538.85 \| 1.86 | 556.67 \| 1.80 | 432.10 \| 2.31 |
| 16 | 1015.47 \| 0.98 | 1096.44 \| 0.91 | 846.28 \| 1.18 |
| 32 | 1863.15 \| 0.54 | 2194.91 \| 0.46 | 1663.86 \| 0.60 |
| 64 | 3009.88 \| 0.33 | 4167.02 \| 0.24 | 3192.54 \| 0.31 |
| 128 | 3399.45 \| 0.29 | 6856.43 \| 0.15 | 5928.43 \| 0.17 |
| 256 | 4208.59 \| 0.24 | 11002.50 \| 0.09 | 9938.01 \| 0.10 |
| 512 | 4559.72 \| 0.22 | 13727.93 \| 0.07 | 13850.24 \| 0.07 |
| 1024 | 4969.87 \| 0.20 | 15122.67 \| 0.07 | 15604.99 \| 0.06 |
| 2048 | 5090.85 \| 0.20 | 16014.17 \| 0.06 | 16298.18 \| 0.06 |
| 4096 | 5212.22 \| 0.19 | 16570.20 \| 0.06 | 16884.37 \| 0.06 |
| 8192 | 5268.96 \| 0.19 | 16781.00 \| 0.06 | 17088.02 \| 0.06 |
| 16384 | oom | 16874.13 \| 0.06 | 17159.74 \| 0.06 |
| 32768 | oom | oom | oom |

Latency (sec)
| batch_size | HF (fp32) | HF (bf16) | HF (int8) |
|:----------:|:---------:|:---------:|:---------:|
| 1 | 1.38 | 1.45 | 1.83 |
| 2 | 1.44 | 1.50 | 1.87 |
| 4 | 1.45 | 1.46 | 1.87 |
| 8 | 1.48 | 1.44 | 1.85 |
| 16 | 1.58 | 1.46 | 1.89 |
| 32 | 1.72 | 1.46 | 1.92 |
| 64 | 2.13 | 1.54 | 2.00 |
| 128 | 3.77 | 1.87 | 2.16 |
| 256 | 6.08 | 2.33 | 2.58 |
| 512 | 11.23 | 3.73 | 3.70 |
| 1024 | 20.60 | 6.77 | 6.56 |
| 2048 | 40.23 | 12.79 | 12.57 |
| 4096 | 78.58 | 24.72 | 24.26 |
| 8192 | 155.48 | 48.82 | 47.94 |
| 16384 | oom | 97.10 | 95.48 |
| 32768 | oom | oom | oom |
Binary file modified images/BLOOM-latency.png
Binary file modified images/BLOOM-throughput.png
Binary file modified images/GPT2-MHA-latency.png
Binary file modified images/GPT2-MHA-throughput.png
Binary file added images/GPT2-MQA-latency.png
Binary file added images/GPT2-MQA-throughput.png
5 changes: 4 additions & 1 deletion scripts/make_graph_throughput.py
@@ -46,7 +46,10 @@ def parse_line(line: str, plot: str = "throughput") -> str:

def parse_data(data: list):
    x = []
    y = [[], [], [], []]
    y = []
    for i in range(len(data[0]) - 1):
        y.append([])

    for dp in data:
        x.append(dp[0])
        for i in range(1, len(dp)):
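The change above sizes `y` from the parsed data instead of hard-coding four series, so the plotting script also handles runs with a different number of columns (for example the three HF columns in the MQA tables). A standalone sketch of the intent, with made-up rows and a guessed loop body, since the rest of `parse_data` is not shown in this diff:

```python
# hypothetical parsed rows: batch_size followed by one value per benchmark column
data = [
    [1, 72.61, 68.89, 54.68],
    [2, 139.03, 133.32, 106.70],
]

x = []
y = [[] for _ in range(len(data[0]) - 1)]  # one series per value column, however many exist

for dp in data:
    x.append(dp[0])
    for i in range(1, len(dp)):
        y[i - 1].append(dp[i])  # assumed continuation; the original body is truncated here

print(x)  # [1, 2]
print(y)  # [[72.61, 139.03], [68.89, 133.32], [54.68, 106.70]]
```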
148 changes: 148 additions & 0 deletions scripts/parse_logs.py
@@ -0,0 +1,148 @@
import argparse
import copy
import os
from typing import Optional, Tuple

from markdownTable import markdownTable
from pandas import DataFrame


def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()

    parser.add_argument("--input_dir", type=str, required=True)

    args = parser.parse_args()
    return args


def parse_line(line: str) -> Tuple[Optional[str], Optional[str]]:
    # returns (value, metric) for a recognized benchmark log line, or (None, None)
    line = line.strip()

    if line.endswith("tokens/sec"):
        line = line.split("Throughput (including tokenization) = ")[1]
        line = line.split(" tokens/sec")[0]

        return line, "throughput"
    elif line.endswith("msecs/token"):
        line = line.split("Throughput (including tokenization) = ")[1]
        line = line.split(" msecs/token")[0]

        return line, "inverse_throughput"
    elif line.startswith("Latency = ") and line.endswith("secs"):
        line = line.split("Latency = ")[1]
        line = line.split(" secs")[0]

        return line, "latency"
    elif "with batch size = " in line:
        line = line.split("with batch size = ")[1]

        return line, "batch_size"

    return None, None


def get_throughput_dataframe(results: dict, order: list) -> DataFrame:
    throughput = copy.deepcopy(results["throughput"])
    for key in results["inverse_throughput"]:
        for index, value in enumerate(results["inverse_throughput"][key]):
            # escape the pipe so "tokens/sec \| msec/token" survives inside a markdown cell
            throughput[key][index] = throughput[key][index] + " \\| " + value

    max_rows = -1
    batch_size_column = None
    for key in results["batch_size"]:
        bs = len(results["batch_size"][key])

        if bs > max_rows:
            max_rows = bs
            batch_size_column = results["batch_size"][key]

    # pad shorter columns with "oom" so every column spans all batch sizes
    for key in throughput:
        while len(throughput[key]) < max_rows:
            throughput[key].append("oom")
    throughput["batch_size"] = batch_size_column

    df = DataFrame(throughput)
    df = df.loc[:, order]

    return df


def get_latency_dataframe(results: dict, order: list) -> DataFrame:
    latency = copy.deepcopy(results["latency"])

    max_rows = -1
    batch_size_column = None
    for key in results["batch_size"]:
        bs = len(results["batch_size"][key])

        if bs > max_rows:
            max_rows = bs
            batch_size_column = results["batch_size"][key]

    # pad shorter columns with "oom" so every column spans all batch sizes
    for key in latency:
        while len(latency[key]) < max_rows:
            latency[key].append("oom")
    latency["batch_size"] = batch_size_column

    df = DataFrame(latency)
    df = df.loc[:, order]

    return df


def make_table(results: dict):
    order = ["batch_size", "HF (fp32)", "HF (bf16)", "HF (int8)"]

    kwargs = dict(
        row_sep="markdown",
        padding_width=1,
    )

    throughput = get_throughput_dataframe(results, order)
    throughput = throughput.to_dict(orient="records")
    # markdownTable wraps its output in ``` fences; keep only the table body
    throughput = markdownTable(throughput).setParams(**kwargs).getMarkdown().split("```")[1]

    latency = get_latency_dataframe(results, order)
    latency = latency.to_dict(orient="records")
    latency = markdownTable(latency).setParams(**kwargs).getMarkdown().split("```")[1]

    return throughput, latency


def main() -> None:
    args = get_args()

    input_files = os.listdir(args.input_dir)
    results = {"throughput": {}, "inverse_throughput": {}, "latency": {}, "batch_size": {}}
    # map each log file to the column name used in the README tables
    filename_column = {
        "fp32.log": "HF (fp32)",
        "bf16.log": "HF (bf16)",
        "int8.log": "HF (int8)",
        "fp16.log": "DS-inference (fp16)",
    }

    for filename in input_files:
        with open(os.path.join(args.input_dir, filename), "r") as f:
            lines = f.readlines()

        for line in lines:
            value, key = parse_line(line)

            if key is not None:
                column_name = filename_column[filename]
                if column_name not in results[key]:
                    results[key][column_name] = []
                results[key][column_name].append(value)

    throughput, latency = make_table(results)

    print("Throughput (tokens/sec | msec/token)")
    print(throughput)
    print()
    print("Latency (sec)")
    print(latency)


if __name__ == "__main__":
    main()
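
A typical invocation would point the script at a directory of benchmark logs, e.g. `python scripts/parse_logs.py --input_dir logs/gpt2-mqa` (the directory name here is made up). It expects the directory to contain files named `fp32.log`, `bf16.log`, `int8.log`, and/or `fp16.log`, and prints throughput and latency markdown tables in the same format as the README section above.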
1 change: 1 addition & 0 deletions run_batch_size.sh → scripts/run_batch_size.sh
@@ -7,6 +7,7 @@ do
make $1 batch_size=$bs
done

# split for loops
for i in {1..20}
do
bs=$(($i*128))
10 changes: 10 additions & 0 deletions scripts/run_batch_size1.sh
@@ -0,0 +1,10 @@
export CUDA_VISIBLE_DEVICES=0

rm -rf ./tmp

# split for loops
for i in {0..20}
do
bs=$((2**$i))
make $1 batch_size=$bs
done
File renamed without changes.