Skip to content

Commit d58ccd7

Browse files
committed
Merge branch 'master' into byshiue-patch-1
2 parents d513391 + 9f8b7de commit d58ccd7

File tree

1,235 files changed

+581845
-29341
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,235 files changed

+581845
-29341
lines changed

FasterTransformer/README.md

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# FasterTransformer
22

33
This repository provides a script and recipe to run the highly optimized transformer for inference, and it is tested and maintained by NVIDIA.
4+
This repo has moved to https://github.com/NVIDIA/FasterTransformer, and we will deprecate this sub-repo in the future.
45

56
## Table Of Contents
67
- [FasterTransformer](#fastertransformer)
@@ -10,6 +11,7 @@ This repository provides a script and recipe to run the highly optimized transfo
1011
- [FasterTransformer v2](#fastertransformer-v2)
1112
- [FasterTransformer v2.1](#fastertransformer-v21)
1213
- [FasterTransformer v3.0](#fastertransformer-v30)
14+
- [FasterTransformer v3.1](#fastertransformer-v31)
1315
- [Architecture matrix](#architecture-matrix)
1416
- [Release notes](#release-notes)
1517
- [Changelog](#changelog)
@@ -33,16 +35,21 @@ FasterTransformer v2.1 optimizes some kernels of encoder and decoder, adding the
3335

3436
FasterTransformer v3.0 adds support for INT8 quantization of the cpp and TensorFlow encoder models on Turing and Ampere GPUs.
3537

38+
### FasterTransformer v3.1
39+
40+
First, FasterTransformer v3.1 adds support for INT8 quantization of the PyTorch encoder model on Turing and Ampere GPUs. Second, v3.1 improves the performance of the encoder on FP16 and INT8. Compared to v3.0, v3.1 provides up to 1.2x speedup on T4 FP16, and up to 1.7x speedup on T4 INT8. Third, v3.1 supports inference of the GPT-2 model.
41+
3642
### Architecture matrix
3743

38-
The following matrix shows the Architecture Differences between the model.
44+
The following matrix shows the architecture differences between the models.
3945

40-
| Architecure | Encoder | Encoder INT8 quantization |Decoder | Decoding with beam search | Decoding with sampling |
41-
|---------------------------|-------------------|----------------------------|--------------------|---------------------------|------------------------|
42-
|FasterTransformer v1 | Yes | No | No | No | No |
43-
|FasterTransformer v2 | Yes | No | Yes | Yes | No |
44-
|FasterTransformer v2.1 | Yes | No | Yes | Yes | Yes |
45-
|FasterTransformer v3.0 | Yes | Yes | Yes | Yes | Yes |
46+
| Architecture | Encoder | Encoder INT8 quantization | Decoder | Decoding with beam search | Decoding with sampling | GPT-2 |
47+
|---------------------------|-------------------|----------------------------|---------------------|---------------------------|------------------------|-------|
48+
| v1 | Yes | No | No | No | No | No |
49+
| v2 | Yes | No | Yes | Yes | No | No |
50+
| v2.1 | Yes | No | Yes | Yes | Yes | No |
51+
| v3.0 | Yes | Yes | Yes | Yes | Yes | No |
52+
| v3.1 | Yes | Yes | Yes | Yes | Yes | Yes |
4653

4754
## Release notes
4855

@@ -52,9 +59,25 @@ FasterTransformer v2 will be deprecated on Dec 2020.
5259

5360
FasterTransformer v2.1 will be deprecated on July 2021.
5461

62+
FasterTransformer v3.0 will be deprecated on Sep 2021.
63+
64+
FasterTransformer v3.1 will be deprecated on Dec 2021.
65+
5566
### Changelog
5667

68+
Dec 2020
69+
- **Release the FasterTransformer 3.1**
70+
71+
Nov 2020
72+
- Optimize the INT8 inference.
73+
- Support PyTorch INT8 inference.
74+
- Provide PyTorch INT8 quantization tools.
75+
- Integrate the fused multi-head attention kernel of TensorRT into FasterTransformer.
76+
- Add unit test of SQuAD.
77+
- Update the missed NGC checkpoints.
78+
5779
Sep 2020
80+
- Support GPT2
5881
- **Release the FasterTransformer 3.0**
5982
- Support INT8 quantization of encoder of cpp and TensorFlow op.
6083
- Add bert-tf-quantization tool.

FasterTransformer/v1/fastertransformer/common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
* limitations under the License.
1515
*/
1616
#pragma once
17-
17+
#include <stdexcept>
1818
#include <iostream>
1919
#include <cuda_runtime.h>
2020
#include <cuda_fp16.h>

FasterTransformer/v1/fastertransformer/trt_plugin/trt_model.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ class TRT_Transformer
6565
auto from_tensor = network->addInput(INPUT_BLOB_NAME, dtype_, nvinfer1::Dims2{seq_len_, hidden_dim_});
6666
auto mask_tensor = network->addInput(MASK_BLOB_NAME, dtype_, nvinfer1::Dims2{seq_len_, seq_len_});
6767

68-
assert(input_tensor);
68+
assert(from_tensor);
6969
assert(mask_tensor);
7070

7171
nvinfer1::ITensor* output_tensor = nullptr;

FasterTransformer/v3.0/fastertransformer/common.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,11 @@ void cublasLtMM_withAlgo(int *res, int batchCount, int m, int n, int k,
193193
res,
194194
CtransformDesc,
195195
(findAlgo == 1 ? (&algo) : NULL), NULL, 0, stream);
196+
197+
cublasLtMatmulDescDestroy(matmulDesc);
198+
cublasLtMatrixLayoutDestroy(AtransformDesc);
199+
cublasLtMatrixLayoutDestroy(BtransformDesc);
200+
cublasLtMatrixLayoutDestroy(CtransformDesc);
196201
}
197202

198203
//for int8 IO cublasLtMM with algo
@@ -281,6 +286,11 @@ void cublasLtMM_withAlgo_int8IO(int8_t *res, int batchCount, int m, int n, int k
281286
res,
282287
CtransformDesc,
283288
(findAlgo == 1 ? (&algo) : NULL), NULL, 0, stream);
289+
290+
cublasLtMatmulDescDestroy(matmulDesc);
291+
cublasLtMatrixLayoutDestroy(AtransformDesc);
292+
cublasLtMatrixLayoutDestroy(BtransformDesc);
293+
cublasLtMatrixLayoutDestroy(CtransformDesc);
284294
}
285295

286296
template <typename T>
Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
cmake_minimum_required(VERSION 3.8 FATAL_ERROR) # for PyTorch extensions, version should be greater than 3.13
15+
project(FasterTransformer LANGUAGES CXX CUDA)
16+
17+
find_package(CUDA 10.1 REQUIRED)
18+
19+
option(BUILD_TRT "Build in TensorRT mode" OFF)
20+
option(BUILD_TF "Build in TensorFlow mode" OFF)
21+
option(BUILD_THE "Build in PyTorch eager mode" OFF)
22+
option(BUILD_THS "Build in TorchScript class mode" OFF)
23+
24+
if(BUILD_THS)
25+
if(DEFINED ENV{NVIDIA_PYTORCH_VERSION})
26+
if($ENV{NVIDIA_PYTORCH_VERSION} VERSION_LESS "20.03")
27+
message(FATAL_ERROR "NVIDIA PyTorch image is too old for TorchScript mode.")
28+
endif()
29+
if($ENV{NVIDIA_PYTORCH_VERSION} VERSION_EQUAL "20.03")
30+
add_definitions(-DLEGACY_THS=1)
31+
endif()
32+
endif()
33+
endif()
34+
35+
set(CXX_STD "11" CACHE STRING "C++ standard")
36+
37+
set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})
38+
39+
set(TF_PATH "" CACHE STRING "TensorFlow path")
40+
41+
if(BUILD_TF AND NOT TF_PATH)
42+
message(FATAL_ERROR "TF_PATH must be set if BUILD_TF(=TensorFlow mode) is on.")
43+
endif()
44+
45+
set(TRT_PATH "" CACHE STRING "TensorRT path")
46+
47+
if(BUILD_TRT AND NOT TRT_PATH)
48+
message(FATAL_ERROR "TRT_PATH must be set if BUILD_TRT(=TensorRT mode) is on.")
49+
endif()
50+
51+
list(APPEND CMAKE_MODULE_PATH ${CUDA_PATH}/lib64)
52+
53+
if (${CUDA_VERSION} GREATER_EQUAL 11.0)
54+
message(STATUS "Add DCUDA11_MODE")
55+
add_definitions("-DCUDA11_MODE")
56+
endif()
57+
58+
# setting compiler flags
59+
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
60+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
61+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall")
62+
63+
if (SM STREQUAL 80 OR
64+
SM STREQUAL 86 OR
65+
SM STREQUAL 70 OR
66+
SM STREQUAL 75 OR
67+
SM STREQUAL 61 OR
68+
SM STREQUAL 60)
69+
#set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\" -rdc=true")
70+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\"")
71+
if (SM STREQUAL 70 OR SM STREQUAL 75 OR SM STREQUAL 80 OR SM STREQUAL 86)
72+
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA")
73+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA")
74+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA")
75+
endif()
76+
if(BUILD_THE OR BUILD_THS)
77+
string(SUBSTRING ${SM} 0 1 SM_MAJOR)
78+
string(SUBSTRING ${SM} 1 1 SM_MINOR)
79+
set(ENV{TORCH_CUDA_ARCH_LIST} "${SM_MAJOR}.${SM_MINOR}")
80+
endif()
81+
message("-- Assign GPU architecture (sm=${SM})")
82+
83+
else()
84+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \
85+
-gencode=arch=compute_70,code=\\\"sm_70,compute_70\\\" \
86+
-gencode=arch=compute_75,code=\\\"sm_75,compute_75\\\" \
87+
")
88+
# -rdc=true")
89+
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA")
90+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA")
91+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA")
92+
if(BUILD_THE OR BUILD_THS)
93+
set(ENV{TORCH_CUDA_ARCH_LIST} "7.0;7.5")
94+
endif()
95+
message("-- Assign GPU architecture (sm=70,75)")
96+
endif()
97+
98+
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wall -O0")
99+
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -O0")
100+
# set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall --ptxas-options=-v --resource-usage")
101+
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall")
102+
103+
set(CMAKE_CXX_STANDARD "${CXX_STD}")
104+
set(CMAKE_CXX_STANDARD_REQUIRED ON)
105+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda")
106+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
107+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++${CXX_STD}")
108+
109+
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
110+
# set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3 --ptxas-options=--verbose")
111+
set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3")
112+
113+
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
114+
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
115+
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
116+
117+
set(COMMON_HEADER_DIRS
118+
${PROJECT_SOURCE_DIR}
119+
${CUDA_PATH}/include
120+
)
121+
122+
set(COMMON_LIB_DIRS
123+
${CUDA_PATH}/lib64
124+
)
125+
126+
if(BUILD_TF)
127+
list(APPEND COMMON_HEADER_DIRS ${TF_PATH}/include)
128+
list(APPEND COMMON_LIB_DIRS ${TF_PATH})
129+
endif()
130+
131+
if(BUILD_TRT)
132+
list(APPEND COMMON_HEADER_DIRS ${TRT_PATH}/include)
133+
list(APPEND COMMON_LIB_DIRS ${TRT_PATH}/lib)
134+
endif()
135+
136+
set(PYTHON_PATH "python" CACHE STRING "Python path")
137+
if(BUILD_THS)
138+
execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import torch; print(torch.__version__,end='');"
139+
RESULT_VARIABLE _PYTHON_SUCCESS
140+
OUTPUT_VARIABLE TORCH_VERSION)
141+
if (TORCH_VERSION VERSION_LESS "1.5.0")
142+
message(FATAL_ERROR "PyTorch >= 1.5.0 is needed for TorchScript mode.")
143+
endif()
144+
endif()
145+
if(BUILD_THE OR BUILD_THS)
146+
execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import os; import torch;
147+
print(os.path.dirname(torch.__file__),end='');"
148+
RESULT_VARIABLE _PYTHON_SUCCESS
149+
OUTPUT_VARIABLE TORCH_DIR)
150+
if (NOT _PYTHON_SUCCESS MATCHES 0)
151+
message(FATAL_ERROR "Torch config Error.")
152+
endif()
153+
list(APPEND CMAKE_PREFIX_PATH ${TORCH_DIR})
154+
find_package(Torch REQUIRED)
155+
156+
execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; from distutils import sysconfig;
157+
print(sysconfig.get_python_inc());
158+
print(sysconfig.get_config_var('SO'));"
159+
RESULT_VARIABLE _PYTHON_SUCCESS
160+
OUTPUT_VARIABLE _PYTHON_VALUES)
161+
if (NOT _PYTHON_SUCCESS MATCHES 0)
162+
message(FATAL_ERROR "Python config Error.")
163+
endif()
164+
string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES})
165+
string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES})
166+
list(GET _PYTHON_VALUES 0 PY_INCLUDE_DIR)
167+
list(GET _PYTHON_VALUES 1 PY_SUFFIX)
168+
list(APPEND COMMON_HEADER_DIRS ${PY_INCLUDE_DIR})
169+
170+
execute_process(COMMAND ${PYTHON_PATH} "-c" "from torch.utils import cpp_extension; print(' '.join(cpp_extension._prepare_ldflags([],True,False)),end='');"
171+
RESULT_VARIABLE _PYTHON_SUCCESS
172+
OUTPUT_VARIABLE TORCH_LINK)
173+
if (NOT _PYTHON_SUCCESS MATCHES 0)
174+
message(FATAL_ERROR "PyTorch link config Error.")
175+
endif()
176+
endif()
177+
178+
179+
include_directories(
180+
${COMMON_HEADER_DIRS}
181+
)
182+
183+
link_directories(
184+
${COMMON_LIB_DIRS}
185+
)
186+
187+
add_subdirectory(fastertransformer)
188+
add_subdirectory(tools)
189+
add_subdirectory(sample)
190+
191+
if(BUILD_TF)
192+
add_custom_target(copy ALL COMMENT "Copying tensorflow test scripts")
193+
add_custom_command(TARGET copy
194+
POST_BUILD
195+
COMMAND cp ${PROJECT_SOURCE_DIR}/sample/tensorflow/ ${PROJECT_BINARY_DIR} -r
196+
)
197+
endif()
198+
199+
if(BUILD_THE OR BUILD_THS)
200+
add_custom_target(copy ALL COMMENT "Copying pytorch test scripts")
201+
add_custom_command(TARGET copy
202+
POST_BUILD
203+
COMMAND cp ${PROJECT_SOURCE_DIR}/sample/pytorch/ ${PROJECT_BINARY_DIR} -r
204+
COMMAND mkdir -p ${PROJECT_BINARY_DIR}/pytorch/translation/data/
205+
COMMAND cp ${PROJECT_SOURCE_DIR}/sample/tensorflow/utils/translation/test.* ${PROJECT_BINARY_DIR}/pytorch/translation/data/
206+
)
207+
endif()

0 commit comments

Comments
 (0)