arunjose696
diff --git a/‎FasterTransformer/README.md‎
Lines changed: 30 additions & 7 deletions b/‎FasterTransformer/README.md‎
Lines changed: 30 additions & 7 deletions
diff --git a/‎FasterTransformer/v1/fastertransformer/common.h‎
Lines changed: 1 addition & 1 deletion b/‎FasterTransformer/v1/fastertransformer/common.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎FasterTransformer/v1/fastertransformer/trt_plugin/trt_model.h‎
Lines changed: 1 addition & 1 deletion b/‎FasterTransformer/v1/fastertransformer/trt_plugin/trt_model.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎FasterTransformer/v3.0/fastertransformer/common.h‎
Lines changed: 10 additions & 0 deletions b/‎FasterTransformer/v3.0/fastertransformer/common.h‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎FasterTransformer/v3.1/CMakeLists.txt‎
Lines changed: 207 additions & 0 deletions b/‎FasterTransformer/v3.1/CMakeLists.txt‎
Lines changed: 207 additions & 0 deletions
@@ -1,6 +1,7 @@
 # FasterTransformer
 
 This repository provides a script and recipe to run the highly optimized transformer for inference, and it is tested and maintained by NVIDIA.
+This repo has moved to https://github.com/NVIDIA/FasterTransformer, and this sub-repo will be deprecated in the future. 
 
 ## Table Of Contents
 - [FasterTransformer](#fastertransformer)
@@ -10,6 +11,7 @@ This repository provides a script and recipe to run the highly optimized transfo
  - [FasterTransformer v2](#fastertransformer-v2)
  - [FasterTransformer v2.1](#fastertransformer-v21)
  - [FasterTransformer v3.0](#fastertransformer-v30)
+ - [FasterTransformer v3.1](#fastertransformer-v31)
  - [Architecture matrix](#architecture-matrix)
  - [Release notes](#release-notes)
  - [Changelog](#changelog)
@@ -33,16 +35,21 @@ FasterTransformer v2.1 optimizes some kernels of encoder and decoder, adding the
 
 FasterTransformer v3.0 adds support for INT8 quantization of the cpp and TensorFlow encoder model on Turing and Ampere GPUs. 
 
+### FasterTransformer v3.1
+
+First, FasterTransformer v3.1 adds support for INT8 quantization of the PyTorch encoder model on Turing and Ampere GPUs. Second, v3.1 improves the performance of the encoder on FP16 and INT8. Compared to v3.0, v3.1 provides up to 1.2x speedup on T4 FP16 and up to 1.7x speedup on T4 INT8. Third, v3.1 supports inference of the GPT-2 model.
+
 ### Architecture matrix
 
-The following matrix shows the Architecture Differences between the model.
+The following matrix shows the architecture differences between the versions.
 
-| Architecure | Encoder | Encoder INT8 quantization |Decoder | Decoding with beam search | Decoding with sampling |
-|---------------------------|-------------------|----------------------------|--------------------|---------------------------|------------------------|
-|FasterTransformer v1 | Yes | No | No | No | No |
-|FasterTransformer v2 | Yes | No | Yes | Yes | No |
-|FasterTransformer v2.1 | Yes | No | Yes | Yes | Yes |
-|FasterTransformer v3.0 | Yes | Yes | Yes | Yes | Yes |
+| Architecture | Encoder | Encoder INT8 quantization | Decoder | Decoding with beam search | Decoding with sampling | GPT-2 |
+|---------------------------|-------------------|----------------------------|---------------------|---------------------------|------------------------|-------|
+| v1 | Yes | No | No | No | No | No |
+| v2 | Yes | No | Yes | Yes | No | No |
+| v2.1 | Yes | No | Yes | Yes | Yes | No |
+| v3.0 | Yes | Yes | Yes | Yes | Yes | No |
+| v3.1 | Yes | Yes | Yes | Yes | Yes | Yes |
 
 ## Release notes
 
@@ -52,9 +59,25 @@ FasterTransformer v2 will be deprecated on Dec 2020.
 
 FasterTransformer v2.1 will be deprecated on July 2021. 
 
+FasterTransformer v3.0 will be deprecated on Sep 2021. 
+
+FasterTransformer v3.1 will be deprecated on Dec 2021. 
+
 ### Changelog
 
+Dec 2020
+- **Release the FasterTransformer 3.1**
+
+Nov 2020
+- Optimize the INT8 inference.
+- Support PyTorch INT8 inference.
+- Provide PyTorch INT8 quantization tools.
+- Integrate the fused multi-head attention kernel of TensorRT into FasterTransformer.
+- Add unit test of SQuAD. 
+- Update the missed NGC checkpoints.
+
 Sep 2020
+- Support GPT2
 - **Release the FasterTransformer 3.0**
  - Support INT8 quantization of encoder of cpp and TensorFlow op.
  - Add bert-tf-quantization tool.
 
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 #pragma once
-
+#include <stdexcept>
 #include <iostream>
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
 
@@ -65,7 +65,7 @@ class TRT_Transformer
  auto from_tensor = network->addInput(INPUT_BLOB_NAME, dtype_, nvinfer1::Dims2{seq_len_, hidden_dim_});
  auto mask_tensor = network->addInput(MASK_BLOB_NAME, dtype_, nvinfer1::Dims2{seq_len_, seq_len_});
 
- assert(input_tensor);
+ assert(from_tensor);
  assert(mask_tensor);
 
  nvinfer1::ITensor* output_tensor = nullptr;
 
@@ -193,6 +193,11 @@ void cublasLtMM_withAlgo(int *res, int batchCount, int m, int n, int k,
  res,
  CtransformDesc,
  (findAlgo == 1 ? (&algo) : NULL), NULL, 0, stream);
+
+ cublasLtMatmulDescDestroy(matmulDesc);
+ cublasLtMatrixLayoutDestroy(AtransformDesc);
+ cublasLtMatrixLayoutDestroy(BtransformDesc);
+ cublasLtMatrixLayoutDestroy(CtransformDesc);
 }
 
 //for int8 IO cublasLtMM with algo
@@ -281,6 +286,11 @@ void cublasLtMM_withAlgo_int8IO(int8_t *res, int batchCount, int m, int n, int k
  res,
  CtransformDesc,
  (findAlgo == 1 ? (&algo) : NULL), NULL, 0, stream);
+
+ cublasLtMatmulDescDestroy(matmulDesc);
+ cublasLtMatrixLayoutDestroy(AtransformDesc);
+ cublasLtMatrixLayoutDestroy(BtransformDesc);
+ cublasLtMatrixLayoutDestroy(CtransformDesc);
 }
 
 template <typename T>
 
@@ -0,0 +1,207 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+cmake_minimum_required(VERSION 3.8 FATAL_ERROR) # for PyTorch extensions, version should be greater than 3.13
+project(FasterTransformer LANGUAGES CXX CUDA)
+
+# The CUDA toolkit is mandatory; 10.1 is the version requested from find_package.
+find_package(CUDA 10.1 REQUIRED)
+
+# Optional framework integrations. Each one defaults to OFF and is switched on
+# with -D<OPTION>=ON at configure time.
+option(BUILD_TRT "Build in TensorRT mode" OFF)
+option(BUILD_TF "Build in TensorFlow mode" OFF)
+option(BUILD_THE "Build in PyTorch eager mode" OFF)
+option(BUILD_THS "Build in TorchScript class mode" OFF)
+
+# TorchScript mode: when the NVIDIA_PYTORCH_VERSION environment variable is
+# defined, refuse versions below 20.03 and compile with LEGACY_THS=1 for
+# exactly 20.03. Nothing is checked when the variable is not defined.
+if(BUILD_THS)
+ if(DEFINED ENV{NVIDIA_PYTORCH_VERSION})
+ if($ENV{NVIDIA_PYTORCH_VERSION} VERSION_LESS "20.03")
+ message(FATAL_ERROR "NVIDIA PyTorch image is too old for TorchScript mode.")
+ endif()
+ if($ENV{NVIDIA_PYTORCH_VERSION} VERSION_EQUAL "20.03")
+ add_definitions(-DLEGACY_THS=1)
+ endif()
+ endif()
+endif()
+
+# C++ standard level; reused below for CMAKE_CXX_STANDARD and nvcc's --std flag.
+set(CXX_STD "11" CACHE STRING "C++ standard")
+
+# Root of the CUDA toolkit; the include and lib64 directories are derived from it.
+set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})
+
+# TensorFlow path: empty by default, mandatory as soon as BUILD_TF is ON.
+set(TF_PATH "" CACHE STRING "TensorFlow path")
+
+if(BUILD_TF AND NOT TF_PATH)
+ message(FATAL_ERROR "TF_PATH must be set if BUILD_TF(=TensorFlow mode) is on.")
+endif()
+
+# TensorRT path: empty by default, mandatory as soon as BUILD_TRT is ON.
+set(TRT_PATH "" CACHE STRING "TensorRT path")
+
+if(BUILD_TRT AND NOT TRT_PATH)
+ message(FATAL_ERROR "TRT_PATH must be set if BUILD_TRT(=TensorRT mode) is on.")
+endif()
+
+# NOTE(review): CMAKE_MODULE_PATH is the search path for CMake modules; confirm
+# that appending the CUDA library directory to it is intentional.
+list(APPEND CMAKE_MODULE_PATH ${CUDA_PATH}/lib64)
+
+# Builds against CUDA 11.0 or newer are compiled with CUDA11_MODE defined.
+if (${CUDA_VERSION} GREATER_EQUAL 11.0)
+ message(STATUS "Add DCUDA11_MODE")
+ add_definitions("-DCUDA11_MODE")
+endif()
+
+# setting compiler flags
+# The C and C++ flags are re-assigned to themselves (no change); only the CUDA
+# flags gain host-compiler warnings through -Xcompiler -Wall.
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") 
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall")
+
+# GPU architecture selection.
+# If SM is one of 60, 61, 70, 75, 80 or 86, code is generated for that single
+# architecture. Any other value (including SM being unset) falls through to the
+# else() branch, which builds for both sm_70 and sm_75.
+if (SM STREQUAL 80 OR
+ SM STREQUAL 86 OR
+ SM STREQUAL 70 OR
+ SM STREQUAL 75 OR
+ SM STREQUAL 61 OR
+ SM STREQUAL 60)
+#set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\" -rdc=true")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\"")
+ # WMMA is defined only for SM 70, 75, 80 and 86 (not for 60/61).
+ # presumably it gates tensor-core code paths in the sources - TODO confirm
+ if (SM STREQUAL 70 OR SM STREQUAL 75 OR SM STREQUAL 80 OR SM STREQUAL 86)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA")
+ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA")
+ endif()
+# PyTorch modes: split the two-digit SM value into "<major>.<minor>" and export
+# it as TORCH_CUDA_ARCH_LIST. set(ENV{...}) only changes the environment of the
+# running CMake process, not that of the later build step.
+if(BUILD_THE OR BUILD_THS)
+ string(SUBSTRING ${SM} 0 1 SM_MAJOR)
+ string(SUBSTRING ${SM} 1 1 SM_MINOR)
+ set(ENV{TORCH_CUDA_ARCH_LIST} "${SM_MAJOR}.${SM_MINOR}")
+endif()
+message("-- Assign GPU architecture (sm=${SM})")
+
+else()
+# Default: generate code for sm_70 and sm_75, always with WMMA defined.
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \
+ -gencode=arch=compute_70,code=\\\"sm_70,compute_70\\\" \
+ -gencode=arch=compute_75,code=\\\"sm_75,compute_75\\\" \
+ ")
+# -rdc=true")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA")
+if(BUILD_THE OR BUILD_THS)
+ set(ENV{TORCH_CUDA_ARCH_LIST} "7.0;7.5")
+endif()
+message("-- Assign GPU architecture (sm=70,75)")
+endif()
+
+# Debug configuration: warnings on, optimisation off; nvcc additionally gets -G.
+set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wall -O0")
+set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -O0")
+# set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall --ptxas-options=-v --resource-usage")
+set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall")
+
+# Language standard: CXX_STD drives both the host compiler (CMAKE_CXX_STANDARD)
+# and nvcc (--std=c++<CXX_STD>). nvcc is also given --expt-extended-lambda and
+# --expt-relaxed-constexpr for every configuration.
+set(CMAKE_CXX_STANDARD "${CXX_STD}")
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++${CXX_STD}")
+
+# Release configuration: -O3 for C++ and, through -Xcompiler, for the host side
+# of CUDA compilation.
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
+# set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3 --ptxas-options=--verbose")
+set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3")
+
+# Output layout inside the build tree: archives and shared libraries go to
+# lib/, executables go to bin/.
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+
+# Include directories shared by every target: the project root and the CUDA headers.
+set(COMMON_HEADER_DIRS
+ ${PROJECT_SOURCE_DIR}
+ ${CUDA_PATH}/include
+)
+
+# Library search directories shared by every target: the CUDA lib64 directory.
+set(COMMON_LIB_DIRS
+ ${CUDA_PATH}/lib64
+)
+
+# TensorFlow mode: headers come from TF_PATH/include, libraries from TF_PATH itself.
+if(BUILD_TF)
+ list(APPEND COMMON_HEADER_DIRS ${TF_PATH}/include)
+ list(APPEND COMMON_LIB_DIRS ${TF_PATH})
+endif()
+
+# TensorRT mode: headers come from TRT_PATH/include, libraries from TRT_PATH/lib.
+if(BUILD_TRT)
+ list(APPEND COMMON_HEADER_DIRS ${TRT_PATH}/include)
+ list(APPEND COMMON_LIB_DIRS ${TRT_PATH}/lib)
+endif()
+
+# Python interpreter used for every PyTorch/Python probe in this file.
+set(PYTHON_PATH "python" CACHE STRING "Python path")
+
+# TorchScript mode requires PyTorch >= 1.5.0. The version is read by importing
+# torch in the configured interpreter.
+if(BUILD_THS)
+ execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import torch; print(torch.__version__,end='');"
+ RESULT_VARIABLE _PYTHON_SUCCESS
+ OUTPUT_VARIABLE TORCH_VERSION)
+ # Fail on a broken interpreter or a missing torch module first. Otherwise
+ # TORCH_VERSION stays empty, compares as "less than 1.5.0", and the user gets
+ # a misleading "PyTorch >= 1.5.0 is needed" message.
+ if (NOT _PYTHON_SUCCESS MATCHES 0)
+ message(FATAL_ERROR "Torch config Error.")
+ endif()
+ if (TORCH_VERSION VERSION_LESS "1.5.0")
+ message(FATAL_ERROR "PyTorch >= 1.5.0 is needed for TorchScript mode.")
+ endif()
+endif()
+# PyTorch modes (eager or TorchScript): locate the torch installation, the
+# Python include directory and extension suffix, and the PyTorch linker flags
+# by running small scripts in the interpreter given by PYTHON_PATH. Each probe
+# aborts configuration if the interpreter returns a non-zero status.
+if(BUILD_THE OR BUILD_THS)
+ # Directory containing the torch package; added to CMAKE_PREFIX_PATH so that
+ # find_package(Torch) can search it.
+ execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import os; import torch;
+print(os.path.dirname(torch.__file__),end='');"
+ RESULT_VARIABLE _PYTHON_SUCCESS
+ OUTPUT_VARIABLE TORCH_DIR)
+ if (NOT _PYTHON_SUCCESS MATCHES 0)
+ message(FATAL_ERROR "Torch config Error.")
+ endif()
+ list(APPEND CMAKE_PREFIX_PATH ${TORCH_DIR})
+ find_package(Torch REQUIRED)
+
+ # Prints two lines: the Python include directory and the 'SO' config variable.
+ # NOTE(review): distutils was removed from the standard library in Python 3.12;
+ # confirm which interpreter versions must be supported.
+ execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; from distutils import sysconfig;
+print(sysconfig.get_python_inc());
+print(sysconfig.get_config_var('SO'));"
+ RESULT_VARIABLE _PYTHON_SUCCESS
+ OUTPUT_VARIABLE _PYTHON_VALUES)
+ if (NOT _PYTHON_SUCCESS MATCHES 0)
+ message(FATAL_ERROR "Python config Error.")
+ endif()
+ # Turn the newline-separated output into a CMake list: escape any literal
+ # semicolons first, then use the newlines as list separators.
+ string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES})
+ string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES})
+ list(GET _PYTHON_VALUES 0 PY_INCLUDE_DIR)
+ list(GET _PYTHON_VALUES 1 PY_SUFFIX)
+ list(APPEND COMMON_HEADER_DIRS ${PY_INCLUDE_DIR})
+
+ # Linker flags as reported by torch.utils.cpp_extension (private helper
+ # _prepare_ldflags), joined with spaces and stored in TORCH_LINK.
+ execute_process(COMMAND ${PYTHON_PATH} "-c" "from torch.utils import cpp_extension; print(' '.join(cpp_extension._prepare_ldflags([],True,False)),end='');"
+ RESULT_VARIABLE _PYTHON_SUCCESS
+ OUTPUT_VARIABLE TORCH_LINK)
+ if (NOT _PYTHON_SUCCESS MATCHES 0)
+ message(FATAL_ERROR "PyTorch link config Error.")
+ endif()
+endif()
+
+
+# Apply the collected include directories to every target defined from here on,
+# including those in the subdirectories added below.
+include_directories(
+ ${COMMON_HEADER_DIRS}
+)
+
+# Apply the collected library search directories in the same directory-wide way.
+link_directories(
+ ${COMMON_LIB_DIRS}
+)
+
+# Sub-projects; each directory provides its own CMakeLists.txt.
+add_subdirectory(fastertransformer)
+add_subdirectory(tools)
+add_subdirectory(sample)
+
+# Copy the framework sample scripts into the build directory as part of the
+# default build. The TensorFlow and PyTorch sections attach their commands to
+# one shared `copy` target. The target is created only if it does not exist yet,
+# because declaring the same custom target twice is a configure error when
+# TensorFlow and PyTorch modes are enabled together.
+if(BUILD_TF)
+ if(NOT TARGET copy)
+ add_custom_target(copy ALL COMMENT "Copying tensorflow test scripts")
+ endif()
+ add_custom_command(TARGET copy
+ POST_BUILD
+ COMMAND cp ${PROJECT_SOURCE_DIR}/sample/tensorflow/ ${PROJECT_BINARY_DIR} -r
+ )
+endif()
+
+# PyTorch modes also stage the translation test data, taken from the TensorFlow
+# sample tree, under pytorch/translation/data/ in the build directory.
+if(BUILD_THE OR BUILD_THS)
+ if(NOT TARGET copy)
+ add_custom_target(copy ALL COMMENT "Copying pytorch test scripts")
+ endif()
+ add_custom_command(TARGET copy
+ POST_BUILD
+ COMMAND cp ${PROJECT_SOURCE_DIR}/sample/pytorch/ ${PROJECT_BINARY_DIR} -r
+ COMMAND mkdir -p ${PROJECT_BINARY_DIR}/pytorch/translation/data/
+ COMMAND cp ${PROJECT_SOURCE_DIR}/sample/tensorflow/utils/translation/test.* ${PROJECT_BINARY_DIR}/pytorch/translation/data/
+ )
+endif()