From 8ccbb376dafbc49de537302ab868782925e55a6b Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Wed, 19 Jun 2024 11:57:52 +0800 Subject: [PATCH 1/2] remove fast generation --- fast_generation/README.md | 305 -- fast_generation/perf/README.md | 250 - fast_generation/perf/bart_perf.py | 170 - fast_generation/perf/codegen_perf.py | 175 - fast_generation/perf/gpt_perf.py | 155 - fast_generation/perf/opt_perf.py | 162 - fast_generation/perf/pegasus_perf.py | 168 - fast_generation/perf/run_perf_bart.sh | 76 - fast_generation/perf/run_perf_codegen.sh | 64 - fast_generation/perf/run_perf_gpt.sh | 52 - fast_generation/perf/run_perf_opt.sh | 52 - fast_generation/perf/run_perf_pegasus.sh | 45 - fast_generation/samples/codegen_16b_sample.py | 38 - fast_generation/samples/codegen_sample.py | 37 - fast_generation/samples/gpt_mp_sample.py | 132 - fast_generation/samples/gpt_sample.py | 35 - fast_generation/samples/gptj_sample.py | 42 - fast_generation/samples/mbart_sample.py | 58 - fast_generation/samples/opt_sample.py | 45 - fast_generation/samples/pegasus_sample.py | 36 - fast_generation/samples/plato_sample.py | 62 - fast_generation/samples/plato_xl_sample.py | 162 - fast_generation/samples/t5_sample.py | 58 - fast_generation/samples/unimo_text_sample.py | 59 - paddlenlp/ops/CMakeLists.txt | 490 -- paddlenlp/ops/__init__.py | 13 - paddlenlp/ops/cmake/FindNCCL.cmake | 165 - paddlenlp/ops/cmake/external/boost.cmake | 64 - paddlenlp/ops/ext_utils.py | 367 -- paddlenlp/ops/fast_transformer/CMakeLists.txt | 14 - paddlenlp/ops/fast_transformer/__init__.py | 13 - .../sample/bart_decoding_sample.py | 132 - .../sample/bart_export_model_sample.py | 111 - .../fast_transformer/sample/bart_inference.py | 107 - .../sample/config/decoder.sample.yaml | 39 - .../sample/config/decoding.sample.yaml | 44 - .../fast_transformer/sample/decoder_sample.py | 145 - .../sample/decoding_sample.py | 99 - .../sample/encoder_decoder_sample.py | 101 - .../sample/encoder_decoding_sample.py | 128 - .../sample/gpt_export_model_sample.py | 106 - .../ops/fast_transformer/sample/gpt_sample.py | 112 - .../sample/mbart_decoding_sample.py | 138 - .../sample/mbart_export_model_sample.py | 118 - .../sample/mbart_inference.py | 108 - .../sample/plato_export_model_sample.py | 120 - .../sample/plato_inference.py | 108 - .../sample/t5_export_model_sample.py | 113 - .../fast_transformer/sample/t5_inference.py | 94 - .../sample/unimo_text_export_model_sample.py | 111 - .../sample/unimo_text_inference.py | 92 - .../ops/fast_transformer/src/CMakeLists.txt | 336 -- .../ops/fast_transformer/src/cublas_handle.cc | 28 - .../ops/fast_transformer/src/cublas_handle.h | 58 - .../ops/fast_transformer/src/demo/gpt.cc | 321 -- .../ops/fast_transformer/src/demo/helper.h | 66 - .../src/demo/transformer_e2e.cc | 281 - .../ops/fast_transformer/src/demo/utf8.h | 34 - .../fast_transformer/src/demo/utf8/checked.h | 319 -- .../ops/fast_transformer/src/demo/utf8/core.h | 387 -- .../fast_transformer/src/demo/utf8/cpp11.h | 103 - .../fast_transformer/src/demo/utf8/cpp17.h | 103 - .../src/demo/utf8/unchecked.h | 257 - .../src/fusion_bart_decoding_op.cc | 352 -- .../src/fusion_bart_decoding_op.cu | 581 --- .../src/fusion_bart_decoding_op.h | 85 - .../fast_transformer/src/fusion_decoder_op.cc | 228 - .../fast_transformer/src/fusion_decoder_op.cu | 374 -- .../fast_transformer/src/fusion_decoder_op.h | 72 - .../src/fusion_decoding_op.cc | 337 -- .../src/fusion_decoding_op.cu | 538 -- .../fast_transformer/src/fusion_decoding_op.h | 84 - .../fast_transformer/src/fusion_encoder_op.cc 
| 193 - .../fast_transformer/src/fusion_encoder_op.cu | 443 -- .../fast_transformer/src/fusion_encoder_op.h | 63 - .../src/fusion_force_decoding_op.cc | 340 -- .../src/fusion_force_decoding_op.cu | 572 --- .../src/fusion_force_decoding_op.h | 85 - .../ops/fast_transformer/src/fusion_gpt_op.cc | 223 - .../ops/fast_transformer/src/fusion_gpt_op.cu | 378 -- .../ops/fast_transformer/src/fusion_gpt_op.h | 71 - .../fast_transformer/src/fusion_gptj_op.cc | 203 - .../fast_transformer/src/fusion_gptj_op.cu | 334 -- .../ops/fast_transformer/src/fusion_gptj_op.h | 66 - .../src/fusion_mbart_decoding_op.cc | 368 -- .../src/fusion_mbart_decoding_op.cu | 596 --- .../src/fusion_mbart_decoding_op.h | 88 - .../fast_transformer/src/fusion_miro_op.cc | 427 -- .../fast_transformer/src/fusion_miro_op.cu | 710 --- .../ops/fast_transformer/src/fusion_miro_op.h | 102 - .../ops/fast_transformer/src/fusion_opt_op.cc | 227 - .../ops/fast_transformer/src/fusion_opt_op.cu | 384 -- .../ops/fast_transformer/src/fusion_opt_op.h | 71 - .../src/fusion_pegasus_decoding_op.cc | 372 -- .../src/fusion_pegasus_decoding_op.cu | 554 -- .../src/fusion_pegasus_decoding_op.h | 86 - .../src/fusion_t5_decoding_op.cc | 377 -- .../src/fusion_t5_decoding_op.cu | 635 --- .../src/fusion_t5_decoding_op.h | 91 - .../src/fusion_unified_decoding_op.cc | 417 -- .../src/fusion_unified_decoding_op.cu | 693 --- .../src/fusion_unified_decoding_op.h | 100 - .../fast_transformer/src/parallel_utils.cc | 148 - .../ops/fast_transformer/src/parallel_utils.h | 102 - .../ops/fast_transformer/src/pd_traits.h | 37 - paddlenlp/ops/fast_transformer/src/utils.cc | 25 - paddlenlp/ops/fast_transformer/src/utils.h | 21 - .../fast_transformer/transformer/__init__.py | 13 - .../fast_transformer/transformer/decoder.py | 586 --- .../fast_transformer/transformer/decoding.py | 4550 ----------------- .../fast_transformer/transformer/encoder.py | 456 -- .../transformer/fast_transformer.py | 2021 -------- .../patches/FasterTransformer/CMakeLists.txt | 418 -- .../fastertransformer/CMakeLists.txt | 27 - .../bert_encoder_transformer.h | 1123 ---- .../cuda/attention_kernels.cu | 154 - .../cuda/attention_kernels.cuh | 34 - .../fastertransformer/cuda/cuda_kernels.cu | 95 - .../fastertransformer/cuda/cuda_kernels.h | 198 - .../cuda/decoding_kernels.cu | 713 --- .../cuda/lightseq_kernels.cu | 56 - .../cuda/masked_multihead_attention.cu | 1504 ------ .../cuda/masked_multihead_attention.h | 115 - .../cuda/masked_multihead_attention_utils.h | 265 - .../cuda/online_softmax_beamsearch_kernels.cu | 1559 ------ .../fastertransformer/cuda/open_attention.h | 1137 ---- .../fastertransformer/cuda/open_decoder.cu | 646 --- .../fastertransformer/cuda/open_decoder.cuh | 123 - .../fastertransformer/cuda/topk_kernels.cu | 2643 ---------- .../fastertransformer/cuda/topk_kernels.cuh | 87 - .../cuda/transformer_decoder.cu | 643 --- .../cuda/transformer_decoding_kernels.cu | 671 --- .../cuda/transformer_kernels.cu | 985 ---- .../cuda/transformer_kernels.cuh | 48 - .../fastertransformer/decoding_beamsearch.h | 1445 ------ .../fastertransformer/decoding_sampling.h | 1319 ----- .../FasterTransformer/fastertransformer/gpt.h | 895 ---- .../fastertransformer/gptj.h | 946 ---- .../fastertransformer/open_decoder.h | 2166 -------- .../FasterTransformer/fastertransformer/opt.h | 927 ---- .../fastertransformer/standard_encoder.h | 1013 ---- .../fastertransformer/t5_beamsearch.h | 922 ---- .../fastertransformer/t5_sampling.h | 780 --- .../fastertransformer/utils/allocator.h | 120 - 
.../fastertransformer/utils/arguments.h | 210 - .../fastertransformer/utils/common.h | 231 - .../utils/common_structure.h | 80 - 147 files changed, 52160 deletions(-) delete mode 100644 fast_generation/README.md delete mode 100644 fast_generation/perf/README.md delete mode 100644 fast_generation/perf/bart_perf.py delete mode 100644 fast_generation/perf/codegen_perf.py delete mode 100644 fast_generation/perf/gpt_perf.py delete mode 100644 fast_generation/perf/opt_perf.py delete mode 100644 fast_generation/perf/pegasus_perf.py delete mode 100644 fast_generation/perf/run_perf_bart.sh delete mode 100644 fast_generation/perf/run_perf_codegen.sh delete mode 100644 fast_generation/perf/run_perf_gpt.sh delete mode 100644 fast_generation/perf/run_perf_opt.sh delete mode 100644 fast_generation/perf/run_perf_pegasus.sh delete mode 100644 fast_generation/samples/codegen_16b_sample.py delete mode 100644 fast_generation/samples/codegen_sample.py delete mode 100644 fast_generation/samples/gpt_mp_sample.py delete mode 100644 fast_generation/samples/gpt_sample.py delete mode 100644 fast_generation/samples/gptj_sample.py delete mode 100644 fast_generation/samples/mbart_sample.py delete mode 100644 fast_generation/samples/opt_sample.py delete mode 100644 fast_generation/samples/pegasus_sample.py delete mode 100644 fast_generation/samples/plato_sample.py delete mode 100644 fast_generation/samples/plato_xl_sample.py delete mode 100644 fast_generation/samples/t5_sample.py delete mode 100644 fast_generation/samples/unimo_text_sample.py delete mode 100644 paddlenlp/ops/CMakeLists.txt delete mode 100644 paddlenlp/ops/cmake/FindNCCL.cmake delete mode 100644 paddlenlp/ops/cmake/external/boost.cmake delete mode 100644 paddlenlp/ops/ext_utils.py delete mode 100644 paddlenlp/ops/fast_transformer/CMakeLists.txt delete mode 100644 paddlenlp/ops/fast_transformer/__init__.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/bart_decoding_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/bart_export_model_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/bart_inference.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/config/decoder.sample.yaml delete mode 100644 paddlenlp/ops/fast_transformer/sample/config/decoding.sample.yaml delete mode 100644 paddlenlp/ops/fast_transformer/sample/decoder_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/decoding_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/encoder_decoder_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/encoder_decoding_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/gpt_export_model_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/gpt_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/mbart_decoding_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/mbart_export_model_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/mbart_inference.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/plato_export_model_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/plato_inference.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/t5_export_model_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/t5_inference.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/unimo_text_export_model_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/unimo_text_inference.py delete mode 100644 
paddlenlp/ops/fast_transformer/src/CMakeLists.txt delete mode 100644 paddlenlp/ops/fast_transformer/src/cublas_handle.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/cublas_handle.h delete mode 100644 paddlenlp/ops/fast_transformer/src/demo/gpt.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/demo/helper.h delete mode 100644 paddlenlp/ops/fast_transformer/src/demo/transformer_e2e.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/demo/utf8.h delete mode 100644 paddlenlp/ops/fast_transformer/src/demo/utf8/checked.h delete mode 100644 paddlenlp/ops/fast_transformer/src/demo/utf8/core.h delete mode 100644 paddlenlp/ops/fast_transformer/src/demo/utf8/cpp11.h delete mode 100644 paddlenlp/ops/fast_transformer/src/demo/utf8/cpp17.h delete mode 100644 paddlenlp/ops/fast_transformer/src/demo/utf8/unchecked.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_decoder_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_decoder_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_decoder_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_decoding_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_decoding_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_decoding_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_encoder_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_encoder_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_encoder_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_gpt_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_gpt_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_gpt_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_gptj_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_gptj_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_gptj_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_miro_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_miro_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_miro_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_opt_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_opt_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_opt_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.cu delete mode 100644 
paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/parallel_utils.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/parallel_utils.h delete mode 100644 paddlenlp/ops/fast_transformer/src/pd_traits.h delete mode 100644 paddlenlp/ops/fast_transformer/src/utils.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/utils.h delete mode 100644 paddlenlp/ops/fast_transformer/transformer/__init__.py delete mode 100644 paddlenlp/ops/fast_transformer/transformer/decoder.py delete mode 100644 paddlenlp/ops/fast_transformer/transformer/decoding.py delete mode 100644 paddlenlp/ops/fast_transformer/transformer/encoder.py delete mode 100644 paddlenlp/ops/fast_transformer/transformer/fast_transformer.py delete mode 100644 paddlenlp/ops/patches/FasterTransformer/CMakeLists.txt delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/CMakeLists.txt delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/bert_encoder_transformer.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/attention_kernels.cu delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/attention_kernels.cuh delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/cuda_kernels.cu delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/cuda_kernels.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/decoding_kernels.cu delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/lightseq_kernels.cu delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/masked_multihead_attention.cu delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/masked_multihead_attention.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/masked_multihead_attention_utils.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/online_softmax_beamsearch_kernels.cu delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/open_attention.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/open_decoder.cu delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/open_decoder.cuh delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/topk_kernels.cu delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/topk_kernels.cuh delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/transformer_decoder.cu delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/transformer_decoding_kernels.cu delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/transformer_kernels.cu delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/transformer_kernels.cuh delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/decoding_beamsearch.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/decoding_sampling.h delete mode 100644 
paddlenlp/ops/patches/FasterTransformer/fastertransformer/gpt.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/gptj.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/open_decoder.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/opt.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/standard_encoder.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/t5_beamsearch.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/t5_sampling.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/utils/allocator.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/utils/arguments.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/utils/common.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/utils/common_structure.h diff --git a/fast_generation/README.md b/fast_generation/README.md deleted file mode 100644 index fe699a9c7271..000000000000 --- a/fast_generation/README.md +++ /dev/null @@ -1,305 +0,0 @@ -# FastGeneration - -FastGeneration是PaddleNLP v2.2版本加入的文本生成高性能加速功能,其支持GPT、OPT、BART、UnifiedTransformer等多种NLP生成类预训练模型,并且支持多种解码策略,可以用于机器翻译、文本续写、文本摘要、对话生成等多种NLG任务的GPU场景预测加速。 - -功能底层依托于[NV FasterTransformer](https://github.com/NVIDIA/FasterTransformer),该库针对标准的Transformer和GPT模型、beam search和sampling解码策略进行了性能优化。PaddleNLP FastGeneration在其之上进行了扩展,实现了更多模型和生成策略的优化支持,并将功能入口封装于`model.generate`函数。功能的开启和关闭通过传入`use_fast`参数进行控制(默认为关闭状态)。通过调用generate函数,用户可以简单的使用模型高性能推理功能。下图展示了FastGeneration的启动流程: - - -
-(figure: FastGeneration workflow diagram)
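The gist of that `model.generate` entry point can be sketched in a few lines (adapted from the GPT sample shown later in this README; the checkpoint name and prompt are illustrative only):

```python
import paddle
from paddlenlp.transformers import GPTChineseTokenizer, GPTLMHeadModel

# Illustrative checkpoint; the README's sample uses a Chinese GPT model.
model_name = "gpt-cpm-small-cn-distill"
tokenizer = GPTChineseTokenizer.from_pretrained(model_name)
model = GPTLMHeadModel.from_pretrained(model_name)
model.eval()

# use_fast=True asks generate() to run the FastGeneration kernels; when the
# decoding settings are not supported it falls back to the stock implementation.
input_ids = paddle.to_tensor([tokenizer("花间一壶酒,独酌无相亲。举杯邀明月,")["input_ids"]])
outputs, _ = model.generate(
    input_ids=input_ids,
    max_length=10,
    decode_strategy="greedy_search",
    use_fast=True,
)
print(tokenizer.convert_ids_to_tokens(outputs[0].tolist()))
```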
- -## Featrues - -- 全面支持生成式预训练模型。包括GPT、OPT、CodeGen、GPTJ、BART、mBART、UnifiedTransformer和UNIMO-text。 -- 支持大多数主流解码策略。包括Beam Search、Sampling、Greedy Search。以及Diverse Sibling Search、Length Penalty等子策略。 -- 解码速度快。最高可达非加速版generate函数的**18倍**。**并支持FP16混合精度计算**。 -- 易用性强。功能的入口为`model.generate`,与非加速版生成api的使用方法相同,当满足加速条件时使用jit即时编译高性能算子并用于生成,不满足则自动切换回非加速版生成api。 -- GPT、UnifiedTransformer和UNIMO-text模型支持高性能并行推理,在具备MPI和NCCL的环境中一行代码即可开启使用,允许通过多张小显存容量的 GPU 使用百亿大模型,预测速度较单卡也进一步提升。百亿模型四卡并行高性能推理速度达单卡高性能推理速度2+倍。 - -### Inference Model Support -下表为PaddleNLP FastGeneration对预训练模型和解码策略的支持情况(GPU)。 - -| Model Name | GPT2 | OPT | CodeGen| GPTJ| BART | mBART | UnifiedTransformer | -|------------------------|---------|---------| ---------| ---------|-----------------|-----------------|--------------------| -| Model Structure | Decoder | Decoder |Decoder|Decoder| Encoder-Decoder | Encoder-Decoder | Prefix-LM | -| Beam Search | ❌ | ❌ |❌|❌| ✅ | ✅ | ✅ | -| Top-K Sampling | ✅ | ✅ |✅|✅| ✅ | ✅ | ✅ | -| Top-P Sampling | ✅ | ✅ |✅|✅| ✅ | ✅ | ✅ | -| Diverse Sibling Search | ❌ | ❌ |❌|❌| ✅ | ✅ | ✅ | -| Forced Decoding | ❌ | ❌ |❌|❌| ❌ | ✅ | ❌ | -| Length Penalty | ❌ | ❌ |❌|❌| ✅ | ✅ | ✅ | -| Temperature | ✅ | ✅ |✅|✅| ✅ | ✅ | ✅ | -| Repetition Penalty | ✅ | ✅ |✅|✅| ❌ | ❌ | ❌ | - -## Performence - -FastGeneration的高性能解码相比原版generate方法加速明显,并且与竞品相比有也有极大的速度优势。以下为性能对比图: - -- **batch_size = 4, out_seq_len = 32** -- Device: Tesla V100-SXM2-16GB -- CUDA version 11.2 -- cudnn version 8 -- torch version 1.10.0+cu113 -- transformers version 4.12.5 - -### **BART** (bart-base, batch_size=4, max_length=32) - -
- -### **GPT** (gpt2, batch_size=4, max_length=32) - -
- -### **OPT** (opt, batch_size=4, max_length=32) - -
- -### **CodeGen:** -* 环境和超参 - - Platform: Tesla V100-SXM2-32GB - - CUDA 10.1 - - CUDNN 7.6.5 - - PaddlePaddle-gpu 2.3.1.post101 - - transformers==4.21.1 - - torch==1.11.0 - - Batch Size: 1 - - Input Length: 60 - - Output Length: 20 -
- -- Platform: A100-40G -
- -### **Pegasus** -* 环境和超参 - - Platform: Tesla V100-SXM2-32GB - - CUDA 10.1 - - CUDNN 7.6.5 - - PaddlePaddle-gpu 2.3.2.post101 - - transformers==4.21.1 - - torch==1.11.0 - - Batch Size: 4 - - Input Length: 60 - - Output Length: 20 - - Decode_strategy: beam search - - num_beams: 4 -
- -更详细的性能数据请参见[这里](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/fast_generation/perf) - -## Quick Start - -### 高性能推理 - -为体现FastGeneration的易用性,我们在`samples`文件夹中内置了几个典型任务示例,下面以基于GPT模型的中文文本续写任务为例: - -```sh -python samples/gpt_sample.py -``` - -如果是第一次执行,PaddleNLP会启动即时编译([JIT Compile](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/new_op/new_custom_op_cn.html#jit-compile))自动编译高性能解码算子。 - -```sh -... -2021-11-17 13:42:56,771 - INFO - execute command: cd /10.2/hub/PaddleNLP/paddlenlp/ops/extenstions && /usr/local/bin/python FasterTransformer_setup.py build -INFO:utils.cpp_extension:execute command: cd /10.2/hub/PaddleNLP/paddlenlp/ops/extenstions && /usr/local/bin/python FasterTransformer_setup.py build -grep: warning: GREP_OPTIONS is deprecated; please use an alias or script -running build -running build_ext --- The C compiler identification is GNU 8.2.0 --- The CXX compiler identification is GNU 8.2.0 --- The CUDA compiler identification is NVIDIA 10.2.89 --- Check for working C compiler: /usr/bin/cc --- Check for working C compiler: /usr/bin/cc -- works --- Detecting C compiler ABI info --- Detecting C compiler ABI info - done --- Detecting C compile features --- Detecting C compile features - done --- Check for working CXX compiler: /usr -... -``` - -编译过程通常会花费几分钟的时间编译只会进行一次,之后再次使用高性能解码就不需要重新编译了,编译完成后会继续运行,可以看到生成的结果如下: - -``` -Model input: 花间一壶酒,独酌无相亲。举杯邀明月, -Result: 对影成三人。 -``` - -打开示例代码 `samples/gpt_sample.py` ,我们可以看到如下代码: - -``` -... -model = GPTLMHeadModel.from_pretrained(model_name) -... -outputs, _ = model.generate( - input_ids=inputs_ids, max_length=10, decode_strategy='greedy_search', - use_fast=True) -... -``` - -可以看到,FastGeneration的使用方法与 `model.generate()` 相同,只需传入输入tensor和解码相关参数即可,使用非常简便。如果要使用非加速版的 `model.generate()` 方法,只需传入 `use_fast=False` 即可,示例如下: - -``` -... -outputs, _ = model.generate( - input_ids=inputs_ids, max_length=10, decode_strategy='greedy_search', use_fast=False) -... -``` - -**NOTE:** 需要注意的是,如果传入 `model.generate()` 的参数不满足高性能版本的要求。程序会做出提示并自动切换为非加速版本,例如我们在上面的例子中传入 `min_length=1` ,会得到如下提示: - -``` -... -[2021-11-17 14:21:06,132] [ WARNING] - 'min_length != 0' is not supported yet in the fast version -[2021-11-17 14:21:06,132] [ WARNING] - FastGeneration is not available, and the original version would be used instead. -... 
-``` - -关于该函数的详细介绍可以参考API文档[generate](https://paddlenlp.readthedocs.io/zh/latest/source/paddlenlp.transformers.generation_utils.html)和**Aistudio教程[文本生成任务实战:如何使用PaddleNLP实现各种解码策略](https://aistudio.baidu.com/aistudio/projectdetail/3243711?contributionType=1)。**`samples`文件夹中的其他示例的使用方法相同。 - -### 并行推理 - -FastGeneration对GPT、UnifiedTransformer和UNIMO-text模型在高性能推理的基础上还实现了模型并行功能,其中GPT支持Tensor Parallel和Layer Parallel(Pipeline Parallel)两种并行策略的组合,UnifiedTransformer和UNIMO-text支持Tensor Parallel。关于这两种并行策略的详细介绍请参考[Megatron论文](https://arxiv.org/pdf/2104.04473.pdf)。 - -并行推理当前依赖MPI([MPICH](https://www.mpich.org)、[OpenMPI](https://www.open-mpi.org)均可)和[NCCL](https://developer.nvidia.com/nccl),如需使用还请先安装依赖。在使用时,相比上面的单卡高性能加速代码中也只增加了`from_pretrained`创建加载模型之前加上`enable_ft_para()`一行。 -#### GPT 并行推理 - -GPT高性能并行推理的完整使用示例已在`gpt_mp_sample.py`中提供,按照如下方式启动即可: - -```sh -mpirun -n 4 python gpt_mp_sample.py --tensor_para_size 4 --layer_para_size 1 -``` - -其中`-n 4`指明使用的进程和GPU数,`tensor_para_size`和`tensor_para_size`分别指明Tensor Parallel和Layer Parallel各自使用的GPU数,均设置为1则进行单卡预测。另外加上`--use_fp16`以使用FP16,加上`--profile`可以进行相应设置的性能测试。其他生成相关的参数设置释义如下: -- `model_name` 指定使用的GPT模型,默认为[`gpt-cpm-larg-cn`](https://github.com/TsinghuaAI/CPM-1-Generate)。 -- `max_length` 指定生成的最大长度,默认为50。 -- `topk` 用于Top-K采样策略,采样时将只从概率最高K个token中采样,默认为1,即greedy search。 -- `topp` 用于Top-P采样策略,采样时将只从概率最高且累加概率不超过该值的token中采样,默认为1.0。 -- `temperature` 用于调整预测概率分布,默认为1.0,即保持模型原有的预测概率。 - -使用`gpt-cpm-larg-cn`(2.6B)和默认设置,在V100上4卡Tensor Parallel较单卡高性能预测速度提升约40%。 - -#### PLATO-XL 并行推理 - -PLATO-XL百亿对话预训练模型(11B UnifiedTransformer模型)高性能并行推理的完整使用示例已在`plato_xl_sample.py`中提供(当前只支持Tensor Parallel),按照如下方式启动即可: - -```shell -mpirun -n 4 python plato_xl_sample.py -``` - -参数释义基本同上。在V100上4卡Tensor Parallel高性能预测为单卡高性能预测速度的2倍。 - -## Generate Examples - -除了以上示例之外,PaddleNLP的examples中大多使用了`model.generate`的示例都可以通过调整到合适的参数使用高性能推理。具体如下: - -- [examples/dialogue/unified_transformer](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/dialogue/unified_transformer) -- [model_zoo/gpt/fast_gpt](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/gpt/fast_gpt) -- [examples/text_generation/unimo-text](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/text_generation/unimo-text) -- [examples/text_summarization/bart](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/text_summarization/bart) - -下面我们以基于 `Unified Transformer` 的任务型对话为例展示一下FastGeneration的加速效果: - -打开以上链接中Unified Transformer对应的example,找到README中对应预测的脚本。稍作修改如下: - -```sh -export CUDA_VISIBLE_DEVICES=0 - python infer.py \ - --model_name_or_path=unified_transformer-12L-cn-luge \ - --output_path=./predict.txt \ - --logging_steps=10 \ - --seed=2021 \ - --max_seq_len=512 \ - --max_knowledge_len=256 \ - --batch_size=4 \ - --min_dec_len=1 \ - --max_dec_len=64 \ - --num_return_sequences=1 \ - --decode_strategy=sampling \ - --top_k=5 \ - --faster - --device=gpu -``` - -由于这里只是展示性能,我们直接在 `model_name_or_path` 填入PaddleNLP预训练模型名称 `unified_transformer-12L-cn-luge` 。 - -可以看到,由于该任务为对话任务,我们为了防止模型生成过多安全回复(如:哈哈哈、不错等),保证生成结果具有更多的随机性,我们选择TopK-sampling作为解码策略,并让k=5。 - -打开 `infer.py` ,可以看到我们传入的脚本参数大多都提供给了 `model.generate()` 方法: - -``` -output = model.generate( - input_ids=input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=attention_mask, - seq_len=seq_len, - max_length=args.max_dec_len, - min_length=args.min_dec_len, - decode_strategy=args.decode_strategy, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - length_penalty=args.length_penalty, 
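As a concrete companion to the warning above, a call of the following shape (continuing the README's `gpt_sample.py` snippet, so `model` and `inputs_ids` are assumed to already exist) reproduces the fallback: the parameter is accepted, but decoding runs through the standard path rather than the fused kernels.

```python
# min_length != 0 was not supported by the fast version at the time, so this
# call emits the warning above and falls back to the standard generate() path.
outputs, _ = model.generate(
    input_ids=inputs_ids,
    max_length=10,
    min_length=1,
    decode_strategy='greedy_search',
    use_fast=True,
)
```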
- early_stopping=args.early_stopping, - num_return_sequences=args.num_return_sequences, - use_fp16_decoding=args.use_fp16_decoding, - use_fast=args.faster) -``` - -运行脚本,输出结果如下: - -```sh -step 10 - 1.695s/step -step 20 - 1.432s/step -step 30 - 1.435s/step -``` - -可以看到,非加速版 `generate()` 方法的预测速度为每个step耗时1.5秒左右。 - -下面我们在启动脚本中传入 `--faster` 参数,该参数会向 `generate()` 方法传入 `use_fast=True` ,启动加速模式。同时我们需要设置 `--min_dec_len=0` ,因为FastGeneration当前还不支持该参数。新的脚本启动参数如下: - -```sh -export CUDA_VISIBLE_DEVICES=0 - python infer.py \ - --model_name_or_path=unified_transformer-12L-cn-luge \ - --output_path=./predict.txt \ - --logging_steps=10 \ - --seed=2021 \ - --max_seq_len=512 \ - --max_knowledge_len=256 \ - --batch_size=4 \ - --min_dec_len=0 \ - --max_dec_len=64 \ - --num_return_sequences=1 \ - --decode_strategy=sampling \ - --top_k=5 \ - --device=gpu \ - --faster -``` - -再次运行脚本,输出结果如下(由于我们已经编译过高性能算子,所以这里不会重新编译): - -```sh -[2021-11-23 13:38:09,200] [ DEBUG] - skipping 'FastGeneration' extension (up-to-date) build -step 10 - 0.250s/step -step 20 - 0.156s/step -step 30 - 0.141s/step -``` - -可以看到,FastGeneration的预测速度为每个step耗时0.15秒左右,相比非加速版提速超过9倍。 diff --git a/fast_generation/perf/README.md b/fast_generation/perf/README.md deleted file mode 100644 index 242bf765ec11..000000000000 --- a/fast_generation/perf/README.md +++ /dev/null @@ -1,250 +0,0 @@ -# FastGeneration Performance - -以下性能数据为非加速版generate方法和FastGeneration对比数据。 - -- **测试设备:** Tesla V100-SXM2-16GB -- **Batch Size:** 4 -- **Max Length:** 32 - -## 性能数据 -*** - -CUDA 10.1, cudnn 7, gcc 82 - -torch version 1.10.0+cu102, transformers version 4.12.5 - -**BART:** - -| Model Size | Decode Strategy| FastGeneration(FP32)
(ms) | FastGeneration(FP16)<br>(ms) | HF generate<br>(ms) | Speed Up Rate<br>(Faster32/HF) | Speed Up Rate<br>(Faster16/HF) |
-|-----|----|---|---|---|---|
-|num_layers = 6<br>num_attention_heads = 12<br>hidden_size = 768<br>(bart-base)|top_k = 1|37.53|34.01|136.89|3.65|4.02
-| |top_k = 4 |39.33|34.98|146.89|3.73|4.2 |
-| |top_k = 8 |42.35|34.77|136.80|3.23|3.93|
-| |top_k = 16 |40.95|35.45|148.45|3.63|4.19|
-| |top_p = 0.4 |45.83|33.32|184.36|4.02|5.53|
-| |num_beams = 4|44.72|37.51|242.73|5.43|6.47|
-| |num_beams = 8|61.56|40.27|273.93|4.45|6.8 |
-| |num_beams = 16|82.05|46.68|433.51|5.28|9.29|
-|num_layers = 12<br>num_attention_heads = 16<br>hidden_size = 1024<br>
(bart-large)|top_k = 1|55.03|45.44|199.27|3.62|4.39| -| |top_k = 4|70.12|56.81|220.96|3.15|3.89| -| |top_k = 8|69.96|57.73|201.06|2.87|3.48| -| |top_k = 16|69.16|59.62|223.73|3.23|3.75| -| |top_p = 0.4|73.49|61.43|275.86|3.75|4.49| -| |num_beams = 4|66.44|50.71|277.61|4.18|5.47| -| |num_beams = 8|135.30|85.75|314.78|2.33|3.67| -| |num_beams = 16|168.01|100.22|441.95|2.63|4.41| - -**GPT:** - -| Model Size | Decode Strategy| FastGeneration(FP32)
(ms) | FastGeneration(FP16)<br>(ms) | HF generate<br>(ms) | Speed Up Rate<br>(Faster32/HF) | Speed Up Rate<br>(Faster16/HF) |
-|-----|----|---|---|---|---|
-|num_layers = 12<br>num_attention_heads = 12<br>hidden_size = 768<br>(gpt2)|top_k = 1|69.29|59.20|363.93|5.25|6.15|
-| |top_k = 4|68.07|60.92|391.02|5.74|6.42|
-| |top_k = 8|69.16|60.45|401.18|5.80|6.64|
-| |top_k = 16|73.59|62.40|401.55|5.46|6.44|
-| |top_p = 0.4|95.61|76.26|429.63|4.49|5.63|
-|num_layers = 24<br>num_attention_heads = 16<br>hidden_size = 1024<br>(gpt2-medium)|top_k = 1|127.04|95.13|726.83|5.72|7.64|
-| |top_k = 4|126.74|93.95|694.53|5.48|7.39|
-| |top_k = 8|128.11|94.07|743.63|5.80|7.91|
-| |top_k = 16|126.78|95.00|732.96|5.78|7.72|
-| |top_p = 0.4|143.36|105.40|756.12|5.27|7.17|
-|num_layers = 36<br>num_attention_heads = 20<br>hidden_size = 1280<br>
(gpt2-large)top_k = 1|236.80|200.37|1057.94|4.47|5.28| -| |top_k = 4|236.69|201.95|1075.17|4.54|5.32| -| |top_k = 8|237.04|202.00|1084.60|4.58|5.37| -| |top_k = 16|235.01|201.79|1110.75|4.73|5.5| -| |top_p = 0.4|270.31|205.84|1111.16|4.11|5.4| - -**OPT** - -* 模型参数 - -| Model Name | num_layers | num_attention_heads | hidden_size | -|------------|------------|---------------------|-------------| -| OPT-125m | 12 | 12 | 768 | -| OPT-350M | 24 | 16 | 1024 | - -transformer: 4.20.1 - -* 性能指标数据 - -| Model | Decoding Strategy | Faster Generation(FP32)(ms) | Faster Generation(FP16)(ms) | HF Generation(ms) | Speed Up Rate(Faster32/HF) | Speed Up Rate(Faster16/HF) | -|:--------:|:-------------------:|:-----------------------------:|:-----------------------------:|:-------------------:|:----------------------------:|:----------------------------:| -| opt-125m | top_k=1 | 58.39 | 48.82 | 290.14 | 4.97 | 5.94 | -| | top_k=4 | 58.45 | 49.05 | 283.55 | 4.85 | 5.78 | -| | top_k=8 | 59.13 | 49.32 | 284.76 | 4.82 | 5.77 | -| | top_k=16 | 60.15 | 49.54 | 299.87 | 4.99 | 6.05 | -| | top_p=0.4 | 75.78 | 60.72 | 335.70 | 4.43 | 5.53 | -| opt-350m | top_k=1 | 124.49 | 90.58 | 511.46 | 4.11 | 5.65 | -| | top_k=4 | 125.60 | 90.96 | 528.42 | 4.21 | 5.81 | -| | top_k=8 | 125.93 | 90.96 | 523.46 | 4.16 | 5.75 | -| | top_k=16 | 126.25 | 91.58 | 524.79 | 4.16 | 5.73 | -| | top_p=0.4 | 142.93 | 103.68 | 600.80 | 4.20 | 5.79 | - -*** - -CUDA 11.2, cudnn 8, gcc 82 - -torch version 1.10.0+cu113, transformers version 4.12.5 - -**BART:** - -| Model Size | Decode Strategy| FastGeneration(FP32)
(ms) | FastGeneration(FP16)<br>(ms) | HF generate<br>(ms) | Speed Up Rate<br>(Faster32/HF) | Speed Up Rate<br>(Faster16/HF) |
-|-----|----|---|---|---|---|
-|num_layers = 6<br>num_attention_heads = 12<br>hidden_size = 768<br>(bart-base)|top_k = 1|31.1|27.4|139.46|4.48|5.09
-| |top_k = 4 |32.13|29.06|149.81|4.66|5.16|
-| |top_k = 8 |31.7|28.36|154.3|4.87|5.44|
-| |top_k = 16 |32.93|28.66|145.85|4.43|5.09|
-| |top_p = 0.4 |33.35|29.01|173.18|5.19|5.97|
-| |num_beams = 4|47.55|38.02|252.71|5.31|6.65|
-| |num_beams = 8|52.19|41.39|282.3|5.41|6.82|
-| |num_beams = 16|67.18|45.82|441.59|6.57|9.64|
-|num_layers = 12<br>num_attention_heads = 16<br>hidden_size = 1024<br>
(bart-large)|top_k = 1|45.8|37.43|173.08|3.78|4.62| -| |top_k = 4|51.11|48.28|246.27|4.82|5.1| -| |top_k = 8|61.61|50.67|246.19|4.0|4.86| -| |top_k = 16|63.81|48.33|272.93|4.28|5.65| -| |top_p = 0.4|63.0|50.05|288.76|4.58|5.77| -| |num_beams = 4|65.54|48.58|273.84|4.18|5.64| -| |num_beams = 8|75.68|52.59|340.86|4.5|6.48| -| |num_beams = 16|102.87|62.25|477.97|4.65|7.68| - -**GPT:** - -| Model Size | Decode Strategy| FastGeneration(FP32)
(ms) | FastGeneration(FP16)<br>(ms) | HF generate<br>(ms) | Speed Up Rate<br>(Faster32/HF) | Speed Up Rate<br>(Faster16/HF) |
-|-----|----|---|---|---|---|
-|num_layers = 12<br>num_attention_heads = 12<br>hidden_size = 768<br>(gpt2)|top_k = 1|50.84|40.37|399.58|7.86|9.9|
-| |top_k = 4|50.38|38.81|419.55|8.33|10.81|
-| |top_k = 8|51.23|36.78|411.7|8.04|11.19|
-| |top_k = 16|51.03|38.76|408.36|8.0|10.54|
-| |top_p = 0.4|68.55|48.04|489.45|7.14|10.19|
-|num_layers = 24<br>num_attention_heads = 16<br>hidden_size = 1024<br>(gpt2-medium)|top_k = 1|111.37|79.73|753.11|6.76|9.45|
-| |top_k = 4|110.53|80.48|767.48|6.94|9.54|
-| |top_k = 8|109.87|78.92|754.99|6.87|9.57|
-| |top_k = 16|110.61|85.26|764.16|6.91|8.96|
-| |top_p = 0.4|127.51|87.72|830.24|6.51|9.46|
-|num_layers = 36<br>num_attention_heads = 20<br>hidden_size = 1280<br>
(gpt2-large)|top_k = 1|203.76|142.85|1108.26|5.44|7.76| -| |top_k = 4|204.18|139.49|1230.63|6.03|8.82| -| |top_k = 8|204.22|139.14|1238.96|6.07|8.9| -| |top_k = 16|204.11|140.04|1148.05|5.62|8.2| -| |top_p = 0.4|222.12|150.68|1248.75|5.62|8.29| - - -**OPT:** - -* 模型参数 - -| Model Name | num_layers | num_attention_heads | hidden_size | -|------------|------------|---------------------|-------------| -| OPT-125m | 12 | 12 | 768 | -| OPT-350M | 24 | 16 | 1024 | - -transformers: 4.20.1 - -* 性能结果报表 - -| Model | Decoding Strategy | Faster Generation(FP32)(ms) | Faster Generation(FP16)(ms) | HF Generation(ms) | Speed Up Rate(Faster32/HF) | Speed Up Rate(Faster16/HF) | -|:--------:|:-------------------:|:-----------------------------:|:-----------------------------:|:-------------------:|:----------------------------:|:----------------------------:| -| opt-125m | top_k=1 | 50.57 | 42.59 | 267.95 | 5.30 | 6.29 | -| | top_k=4 | 50.88 | 40.01 | 280.95 | 5.52 | 7.02 | -| | top_k=8 | 50.91 | 43.77 | 268.54 | 5.27 | 6.14 | -| | top_k=16 | 51.08 | 42.56 | 265.40 | 5.20 | 6.24 | -| | top_p=0.4 | 69.08 | 54.59 | 330.56 | 4.78 | 6.06 | -| opt-350m | top_k=1 | 110.22 | 77.82 | 507.00 | 4.60 | 6.51 | -| | top_k=4 | 110.76 | 77.93 | 479.42 | 4.33 | 6.15 | -| | top_k=8 | 142.07 | 78.86 | 513.79 | 3.62 | 6.52 | -| | top_k=16 | 110.80 | 78.19 | 488.34 | 4.41 | 6.25 | -| | top_p=0.4 | 128.33 | 92.57 | 544.18 | 4.24 | 5.88 | - -**CodeGen:** -* 环境和超参 - -- Platform: Tesla V100-SXM2-32GB -- CUDA 10.1 -- CUDNN 7.6.5 -- PaddlePaddle-gpu 2.3.1.post101 -- transformers==4.21.1 -- torch==1.11.0 -- Batch Size: 1 -- Input Length: 60 -- Output Length: 20 - -* 模型参数 - -| Model Name | num_layers | num_attention_heads | hidden_size | -|------------|------------|---------------------|-------------| -| Salesforce/codegen-350M-mono | 20 | 16 | 1024 | -| Salesforce/codegen-2B-mono | 32 | 32 | 2560 | -| Salesforce/codegen-6B-mono | 33 | 16 | 4096 | -| Salesforce/codegen-16B-mono | 34 | 24 | 6144 | - - - -* 性能结果报表 - -| Model | Decoding Strategy | Faster Generation(FP32)(ms) | Faster Generation(FP16)(ms) | HF Generation(ms) | Speed Up Rate(Faster32/HF) | Speed Up Rate(Faster16/HF) | -|:--------:|:-------------------:|:-----------------------------:|:-----------------------------:|:-------------------:|:----------------------------:|:----------------------------:| -| Salesforce/codegen-350M-mono | top_k=1 | 57.76 | 51.35 | 709.62 | 12.29 | 13.82 | -| | top_k=4 | 57.42 | 50.88 | 639.58 | 11.14 | 12.57 | -| | top_k=8 | 57.24 | 51.67 | 685.82 | 11.98 | 13.27 | -| | top_k=16 | 57.57 | 51.61 | 686.62 | 11.93 | 13.30 | -| | top_p=0.4 | 67.26 | 57.35 | 656.12 | 9.75 | 11.44 | -| Salesforce/codegen-2B-mono| top_k=1 | 319.03 | 207.41 | 1040.71 | 3.26 | 5.02 | -| | top_k=4 | 318.98 | 207.37 | 1014.32 | 3.18 | 4.89 | -| | top_k=8 | 319.66 | 207.26 | 1084.09 | 3.39 | 5.23 | -| | top_k=16 | 320.04 | 207.74 | 1040.28 | 3.25 | 5.01 | -| | top_p=0.4 | 329.07 | 213.97 | 1055.55 | 3.21 | 4.93 | -| Salesforce/codegen-6B-mono| top_k=1 | 762.91 | 411.94 | 1384.90 | 1.82 | 3.36 | -| | top_k=4 | 762.58 | 412.79 | 1378.32 | 1.81 | 3.34 | -| | top_k=8 | 763.43 | 413.32 | 1366.45 | 1.79 | 3.31 | -| | top_k=16 | 762.79 | 413.83 | 1376.69 | 1.80 | 3.33 | -| | top_p=0.4 | 771.77 | 419.16 | 1366.49 | 1.77 | 3.26 | - - -**Pegasus:** - -| Model Size | Decode Strategy| FastGeneration(FP32)
(ms) | FastGeneration(FP16)<br>(ms) | HF generate<br>(ms) | Speed Up Rate<br>(Faster32/HF) | Speed Up Rate<br>
(Faster16/HF) | -|-----|----|---|---|---|---|---| -|IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese|num_beams=2|87.41|75.47|1322.24|15.13|17.52 -| |num_beams=4 |91.55|66.47|1364.43|14.90|20.53| -| |num_beams=6 |94.55|73.25|1391.35|14.72|18.99| -| |num_beams=8 |100.48|84.82|1467.64|14.61|17.30| -|IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese|num_beams=2|120.15|94.26|1735.21|14.44|18.41| -| |num_beams=4 |126.42|99.07|1622.31|12.83|16.38| -| |num_beams=6 |142.21|99.95|1717.49|12.08|17.18| -| |num_beams=8 |158.26|104.31|1697.65|10.73|16.27| - - -## 测试方法 - -运行如下命令即可bart性能测试: - -```sh -bash run_perf_bart.sh -``` - -运行如下命令即可启动gpt性能测试: - -```sh -bash run_perf_gpt.sh -``` - -运行以上命令后,脚本会自动使用不同的模型参数进行性能测试,结果如下图所示: - -```sh -... -[2021-12-10 08:11:37,255] [ DEBUG] - skipping 'FastGeneration' extension (up-to-date) build -Namespace(decode_strategy='sampling', max_length=32, model_name_or_path='bart-base', num_beams=1, top_k=1, top_p=1.0, use_fp16_decoding=False) -Faster FP32 cost: 40.13654176145792 -PD cost: 511.413540635258 -HF cost: 138.49875444546342 -Speed up Faster FP32/PD: 12.741843671403577 -Speed up Faster FP32/HF: 3.4506897796177394 -... -... -[2021-12-10 08:13:42,858] [ DEBUG] - skipping 'FastGeneration' extension (up-to-date) build -Namespace(decode_strategy='sampling', max_length=32, model_name_or_path='bart-base', num_beams=1, top_k=1, top_p=1.0, use_fp16_decoding=True) -Faster FP16 cost: 34.004870522767305 -... -``` -可以看到,对于每组参数,脚本会先输出FP32和竞品的测试对比,再单独输出FP16的性能数据。 - -**NOTE:** 根据测试环境和机器状态的不同,以上性能测试脚本的结果可能与表中结果有所出入。 diff --git a/fast_generation/perf/bart_perf.py b/fast_generation/perf/bart_perf.py deleted file mode 100644 index 8466dafcaaef..000000000000 --- a/fast_generation/perf/bart_perf.py +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -from pprint import pprint - -import paddle -import torch -from transformers import BartForConditionalGeneration as hf_bart_model - -from paddlenlp.data import Pad -from paddlenlp.transformers import BartForConditionalGeneration, BartTokenizer - - -def prepare_input(tokenizer, sentences): - word_pad = Pad(tokenizer.pad_token_id, dtype="int64") - tokenized = tokenizer(sentences) - inputs = word_pad([i["input_ids"] for i in tokenized]) - input_ids = paddle.to_tensor(inputs) - return input_ids - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", - default="bart-base", - type=str, - choices=["bart-base", "bart-large"], - help="The model name to specify the bart to use. Can be one of ['bart-base', 'bart-large']. ", - ) - parser.add_argument( - "--decode_strategy", - default="sampling", - type=str, - choices=["greedy_search", "beam_search", "sampling"], - help="The decoding strategy. 
Can be one of ['greedy_search', 'beam_search', 'sampling']", - ) - parser.add_argument("--num_beams", default=4, type=int, help="The parameters for beam search. ") - parser.add_argument("--top_k", default=4, type=int, help="The number of candidate to procedure beam search. ") - parser.add_argument( - "--top_p", default=1.0, type=float, help="The probability threshold to procedure topp sampling. " - ) - parser.add_argument("--max_length", default=32, type=int, help="Maximum output length. ") - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ") - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - tokenizer = BartTokenizer.from_pretrained(args.model_name_or_path) - model = BartForConditionalGeneration.from_pretrained(args.model_name_or_path) - # Set evaluate mode - model.eval() - sentences = [ - "I love that girl, but does not me.", - "She is so that I can not help glance at .", - "Nothing's gonna my love for you.", - "Drop everything now. Meet me in the pouring . Kiss me on the sidewalk.", - ] - - input_ids = prepare_input(tokenizer, sentences) - - # Define model - model.eval() - - num_loop = 100 - with paddle.no_grad(): - for i in range(num_loop): - # For warmup. - if 50 == i: - # PaddlePaddle >= 2.2 - paddle.device.cuda.synchronize(place) - start = time.perf_counter() - model.generate( - input_ids=input_ids, - max_length=args.max_length, - decode_strategy=args.decode_strategy, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - early_stopping=True, - use_fast=True, - use_fp16_decoding=args.use_fp16_decoding, - ) - paddle.device.cuda.synchronize(place) - fast_cost = (time.perf_counter() - start) / 50 * 1000 - - if args.use_fp16_decoding: - pprint(args) - print("Fast FP16 cost:", fast_cost) - return - - with paddle.no_grad(): - for i in range(num_loop): - # For warmup. - if 50 == i: - # PaddlePaddle >= 2.2 - paddle.device.cuda.synchronize(place) - start = time.perf_counter() - model.generate( - input_ids=input_ids, - max_length=args.max_length, - decode_strategy=args.decode_strategy, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - early_stopping=True, - ) - paddle.device.cuda.synchronize(place) - pd_cost = (time.perf_counter() - start) / 50 * 1000 - - device = torch.device("cuda:0") - hf_model = hf_bart_model.from_pretrained("facebook/" + args.model_name_or_path) - hf_model.to(device) - hf_model.eval() - hf_input_ids = prepare_input(tokenizer, sentences) - hf_input_ids = torch.tensor(hf_input_ids.numpy()) - hf_input_ids = hf_input_ids.to(device) - - if args.decode_strategy == "sampling": - do_sample = True - else: - do_sample = False - with torch.no_grad(): - for i in range(num_loop): - # For warmup. 
- if 50 == i: - torch.cuda.synchronize() - start = time.perf_counter() - hf_model.generate( - hf_input_ids, - do_sample=do_sample, - max_length=args.max_length + 1, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - no_repeat_ngram_size=0, - length_penalty=0.0, - ) - torch.cuda.synchronize() - hf_cost = (time.perf_counter() - start) / 50 * 1000 - - pprint(args) - print("Fast FP32 cost:", fast_cost) - print("PD cost:", pd_cost) - print("HF cost:", hf_cost) - print("Speed up Fast FP32/PD:", pd_cost / fast_cost) - print("Speed up Fast FP32/HF:", hf_cost / fast_cost) - - -if __name__ == "__main__": - args = parse_args() - do_predict(args) diff --git a/fast_generation/perf/codegen_perf.py b/fast_generation/perf/codegen_perf.py deleted file mode 100644 index 1a84b4e94fab..000000000000 --- a/fast_generation/perf/codegen_perf.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -from pprint import pprint - -import numpy as np -import paddle -import pynvml - -from paddlenlp.transformers import CodeGenForCausalLM, CodeGenTokenizer - -pynvml.nvmlInit() - - -def query_by_id(gpu_id=2): - handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id) - meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle) - return meminfo.used // 1024 // 1024 - - -def perf_pd(args): - start_mem = query_by_id(args.gpu_id) - place = "gpu" - place = paddle.set_device(place) - tokenizer = CodeGenTokenizer.from_pretrained(args.model_name_or_path) - model = CodeGenForCausalLM.from_pretrained(args.model_name_or_path) - model.eval() - load_mem = query_by_id(args.gpu_id) - - input_ids_np = [ - np.random.choice(list(tokenizer.decoder.keys())[:-1], args.input_len) for _ in range(args.batch_size) - ] - input_ids = paddle.to_tensor(input_ids_np) - - num_loop = 100 - with paddle.no_grad(): - for i in range(num_loop): - # For warmup. 
- if num_loop // 2 == i: - # PaddlePaddle >= 2.2 - paddle.device.cuda.synchronize(place) - start = time.perf_counter() - model.generate( - input_ids=input_ids, - max_length=args.generate_len, - min_length=args.generate_len, - decode_strategy="sampling", - top_k=args.top_k, - top_p=args.top_p, - use_fast=args.use_faster, - use_fp16_decoding=args.use_fp16_decoding, - ) - generate_mem = query_by_id(args.gpu_id) - paddle.device.cuda.synchronize(place) - pd_cost = (time.perf_counter() - start) / (num_loop - num_loop // 2) * 1000 - return pd_cost, load_mem - start_mem, generate_mem - start_mem - - -def perf_hf(args): - import torch - from transformers import CodeGenForCausalLM as hf_codegen - from transformers import CodeGenTokenizer as hf_tokenizer - - start_mem = query_by_id(args.gpu_id) - device = torch.device("cuda") - tokenizer = hf_tokenizer.from_pretrained(args.model_name_or_path) - model = hf_codegen.from_pretrained(args.model_name_or_path) - model.to(device) - model.eval() - load_mem = query_by_id(args.gpu_id) - - input_ids_np = [np.random.choice(list(tokenizer.decoder.keys()), args.input_len) for _ in range(args.batch_size)] - input_ids = torch.tensor(input_ids_np) - input_ids = input_ids.to(device) - num_loop = 100 - with torch.no_grad(): - for i in range(num_loop): - # For warmup. - if num_loop // 2 == i: - torch.cuda.synchronize() - start = time.perf_counter() - model.generate( - input_ids, - do_sample=True, - max_length=args.generate_len + input_ids.shape[-1], - min_length=args.generate_len + input_ids.shape[-1], - top_k=args.top_k, - top_p=args.top_p, - ) - generate_mem = query_by_id(args.gpu_id) - torch.cuda.synchronize() - hf_cost = (time.perf_counter() - start) / (num_loop - num_loop // 2) * 1000 - return hf_cost, load_mem - start_mem, generate_mem - start_mem - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--perf_type", - default="pd", - type=str, - choices=["pd", "pd_faster_fp32", "pd_faster_fp16", "hf"], - help="The type of perf. ", - ) - parser.add_argument( - "--model_name_or_path", - default="Salesforce/codegen-350M-mono", - type=str, - choices=[ - "Salesforce/codegen-350M-mono", - "Salesforce/codegen-2B-mono", - "Salesforce/codegen-6B-mono", - "Salesforce/codegen-16B-mono", - ], - help="The model name to specify the bart to use. ", - ) - parser.add_argument("--top_k", default=4, type=int, help="The number of candidate to procedure topk sampling. ") - parser.add_argument( - "--top_p", default=1.0, type=float, help="The probability threshold to procedure topp sampling. " - ) - parser.add_argument("--batch_size", default=1, type=int, help="The size of input batch. ") - parser.add_argument("--input_len", default=60, type=int, help="The size of model input. ") - parser.add_argument("--generate_len", default=20, type=int, help="Length of output . ") - parser.add_argument("--gpu_id", default=2, type=int, help="The id of GPU . ") - parser.add_argument( - "--use_faster", action="store_true", help="Whether to process inference using faster codegen. " - ) - - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. 
") - args = parser.parse_args() - return args - - -def do_predict(args): - try: - if args.perf_type == "pd": - args.use_faster = False - cost, load_mem, generate_mem = perf_pd(args) - elif args.perf_type == "pd_faster_fp32": - args.use_faster = True - args.use_fp16_decoding = False - cost, load_mem, generate_mem = perf_pd(args) - elif args.perf_type == "pd_faster_fp16": - args.use_faster = True - args.use_fp16_decoding = True - paddle.set_default_dtype("float16") - cost, load_mem, generate_mem = perf_pd(args) - else: - cost, load_mem, generate_mem = perf_hf(args) - pprint(args) - print( - f"CodeGenPerfResult: cost_time: {cost} ms, load_mem: {load_mem} MB, generate_mem:{generate_mem} MB, args:{args}\n" - ) - except Exception as e: - pprint(args) - print(f"CodeGenPerfResult: ERROR: {e}, args:{args}\n") - - -if __name__ == "__main__": - args = parse_args() - do_predict(args) diff --git a/fast_generation/perf/gpt_perf.py b/fast_generation/perf/gpt_perf.py deleted file mode 100644 index 87afcba682b4..000000000000 --- a/fast_generation/perf/gpt_perf.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -from pprint import pprint - -import numpy as np -import paddle -import torch -from transformers import GPT2LMHeadModel as hf_gpt_model - -from paddlenlp.transformers import GPTLMHeadModel, GPTTokenizer - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", - default="gpt2-en", - type=str, - choices=["gpt2-en", "gpt2-medium-en", "gpt2-large-en"], - help="The model name to specify the bart to use. Can be one of ['gpt2-en', 'gpt2-medium-en', 'gpt2-large-en']. ", - ) - parser.add_argument( - "--decode_strategy", - default="sampling", - type=str, - choices=["greedy_search", "sampling"], - help="The decoding strategy. Can be one of ['greedy_search', 'sampling']", - ) - parser.add_argument("--top_k", default=4, type=int, help="The number of candidate to procedure beam search. ") - parser.add_argument("--batch_size", default=4, type=int, help="The size of input batch. ") - parser.add_argument( - "--top_p", default=1.0, type=float, help="The probability threshold to procedure topp sampling. " - ) - parser.add_argument("--max_length", default=32, type=int, help="Maximum output length. ") - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. 
") - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - tokenizer = GPTTokenizer.from_pretrained(args.model_name_or_path) - model = GPTLMHeadModel.from_pretrained(args.model_name_or_path) - # Set evaluate mode - model.eval() - bos_id = tokenizer.convert_tokens_to_ids("<|endoftext|>") - eos_id = tokenizer.convert_tokens_to_ids("<|endoftext|>") - - input_ids_np = np.array([[bos_id] for i in range(args.batch_size)]).astype("int64").reshape([args.batch_size, 1]) - input_ids = paddle.to_tensor(input_ids_np) - # Define model - num_loop = 100 - with paddle.no_grad(): - for i in range(num_loop): - # For warmup. - if 50 == i: - # PaddlePaddle >= 2.2 - paddle.device.cuda.synchronize(place) - start = time.perf_counter() - model.generate( - input_ids=input_ids, - max_length=args.max_length, - decode_strategy=args.decode_strategy, - top_k=args.top_k, - top_p=args.top_p, - bos_token_id=bos_id, - eos_token_id=eos_id, - use_fast=True, - use_fp16_decoding=args.use_fp16_decoding, - ) - paddle.device.cuda.synchronize(place) - fast_cost = (time.perf_counter() - start) / 50 * 1000 - - if args.use_fp16_decoding: - pprint(args) - print("Fast FP16 cost:", fast_cost) - return - with paddle.no_grad(): - for i in range(num_loop): - # For warmup. - if 50 == i: - # PaddlePaddle >= 2.2 - paddle.device.cuda.synchronize(place) - start = time.perf_counter() - model.generate( - input_ids=input_ids, - max_length=args.max_length, - decode_strategy=args.decode_strategy, - top_k=args.top_k, - top_p=args.top_p, - bos_token_id=bos_id, - eos_token_id=eos_id, - ) - paddle.device.cuda.synchronize(place) - pd_cost = (time.perf_counter() - start) / 50 * 1000 - - device = torch.device("cuda:0") - hf_model = hf_gpt_model.from_pretrained(args.model_name_or_path[:-3]) - hf_model.to(device) - hf_model.eval() - - hf_input_ids = torch.tensor(input_ids_np) - hf_input_ids = hf_input_ids.to(device) - - if args.decode_strategy == "sampling": - do_sample = True - else: - do_sample = False - with torch.no_grad(): - for i in range(num_loop): - # For warmup. - if 50 == i: - torch.cuda.synchronize() - start = time.perf_counter() - hf_model.generate( - hf_input_ids, - do_sample=do_sample, - max_length=args.max_length + 1, - bos_token_id=bos_id, - eos_token_id=eos_id, - pad_token_id=0, - top_k=args.top_k, - top_p=args.top_p, - ) - torch.cuda.synchronize() - hf_cost = (time.perf_counter() - start) / 50 * 1000 - - pprint(args) - print("Fast FP32 cost:", fast_cost) - print("PD cost:", pd_cost) - print("HF cost:", hf_cost) - print("Speed up Fast FP32/PD:", pd_cost / fast_cost) - print("Speed up Fast FP32/HF:", hf_cost / fast_cost) - - -if __name__ == "__main__": - args = parse_args() - do_predict(args) diff --git a/fast_generation/perf/opt_perf.py b/fast_generation/perf/opt_perf.py deleted file mode 100644 index 213881fbf947..000000000000 --- a/fast_generation/perf/opt_perf.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse - -# append project root dir to project to make it run with latest code -import sys -import time -from pprint import pprint - -import numpy as np -import paddle -import torch -from transformers.models.opt.modeling_opt import OPTForCausalLM as hf_opt_model - -from paddlenlp.transformers import GPTTokenizer, OPTForCausalLM - -sys.path.insert(0, "../../") - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", - default="facebook/opt-125m", - type=str, - choices=["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b", "facebook/opt-2.7b"], - help="The model name to specify the bart to use. Can be one of ['facebook/opt-125m', 'facebook/opt-350m', 'facebook/opt-1.3b', 'facebook/opt-2.7b']. ", - ) - parser.add_argument( - "--decode_strategy", - default="greedy_search", - type=str, - choices=["greedy_search", "sampling"], - help="The decoding strategy. Can be one of ['greedy_search', 'sampling']", - ) - parser.add_argument("--top_k", default=4, type=int, help="The number of candidate to procedure beam search. ") - parser.add_argument("--batch_size", default=4, type=int, help="The size of input batch. ") - parser.add_argument( - "--top_p", default=1.0, type=float, help="The probability threshold to procedure topp sampling. " - ) - parser.add_argument("--max_length", default=32, type=int, help="Maximum output length. ") - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ") - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - tokenizer = GPTTokenizer.from_pretrained(args.model_name_or_path) - model = OPTForCausalLM.from_pretrained(args.model_name_or_path) - # Set evaluate mode - model.eval() - bos_id = tokenizer.convert_tokens_to_ids("<|endoftext|>") - eos_id = tokenizer.convert_tokens_to_ids("<|endoftext|>") - - input_ids_np = np.array([[bos_id] for i in range(args.batch_size)]).astype("int64").reshape([args.batch_size, 1]) - input_ids = paddle.to_tensor(input_ids_np) - # Define model - num_loop = 100 - with paddle.no_grad(): - for i in range(num_loop): - # For warmup. - if 50 == i: - # PaddlePaddle >= 2.2 - paddle.device.cuda.synchronize(place) - start = time.perf_counter() - model.generate( - input_ids=input_ids, - max_length=args.max_length, - decode_strategy=args.decode_strategy, - top_k=args.top_k, - top_p=args.top_p, - bos_token_id=bos_id, - eos_token_id=eos_id, - use_fast=True, - use_fp16_decoding=args.use_fp16_decoding, - ) - paddle.device.cuda.synchronize(place) - fast_cost = (time.perf_counter() - start) / 50 * 1000 - - if args.use_fp16_decoding: - pprint(args) - print("Fast FP16 cost:", fast_cost) - return - with paddle.no_grad(): - for i in range(num_loop): - # For warmup. 
- if 50 == i: - # PaddlePaddle >= 2.2 - paddle.device.cuda.synchronize(place) - start = time.perf_counter() - model.generate( - input_ids=input_ids, - max_length=args.max_length, - decode_strategy=args.decode_strategy, - top_k=args.top_k, - top_p=args.top_p, - bos_token_id=bos_id, - eos_token_id=eos_id, - ) - paddle.device.cuda.synchronize(place) - pd_cost = (time.perf_counter() - start) / 50 * 1000 - - device = torch.device("cuda:0") - hf_model = hf_opt_model.from_pretrained(args.model_name_or_path) - - hf_model.to(device) - hf_model.eval() - - hf_input_ids = torch.tensor(input_ids_np) - hf_input_ids = hf_input_ids.to(device) - - if args.decode_strategy == "sampling": - do_sample = True - else: - do_sample = False - with torch.no_grad(): - for i in range(num_loop): - # For warmup. - if 50 == i: - torch.cuda.synchronize() - start = time.perf_counter() - hf_model.generate( - hf_input_ids, - do_sample=do_sample, - max_length=args.max_length + 1, - bos_token_id=bos_id, - eos_token_id=eos_id, - pad_token_id=0, - top_k=args.top_k, - top_p=args.top_p, - ) - torch.cuda.synchronize() - hf_cost = (time.perf_counter() - start) / 50 * 1000 - - pprint(args) - print("Fast FP32 cost:", fast_cost) - print("PD cost:", pd_cost) - print("HF cost:", hf_cost) - print("Speed up Fast FP32/PD:", pd_cost / fast_cost) - print("Speed up Fast FP32/HF:", hf_cost / fast_cost) - - -if __name__ == "__main__": - args = parse_args() - print(args.model_name_or_path) - do_predict(args) diff --git a/fast_generation/perf/pegasus_perf.py b/fast_generation/perf/pegasus_perf.py deleted file mode 100644 index fe8ba55fb8e3..000000000000 --- a/fast_generation/perf/pegasus_perf.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -from pprint import pprint - -import numpy as np -import paddle -import pynvml - -from paddlenlp.transformers import ( - PegasusChineseTokenizer, - PegasusForConditionalGeneration, -) - -pynvml.nvmlInit() - - -def query_by_id(gpu_id=2): - handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id) - meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle) - return meminfo.used // 1024 // 1024 - - -def perf_pd(args): - start_mem = query_by_id(args.gpu_id) - place = "gpu" - place = paddle.set_device(place) - tokenizer = PegasusChineseTokenizer.from_pretrained(args.model_name_or_path) - model = PegasusForConditionalGeneration.from_pretrained(args.model_name_or_path) - model.eval() - load_mem = query_by_id(args.gpu_id) - input_ids_np = [np.random.choice(range(len(tokenizer.vocab)), args.input_len) for _ in range(args.batch_size)] - input_ids = paddle.to_tensor(input_ids_np) - - num_loop = 100 - with paddle.no_grad(): - for i in range(num_loop): - # For warmup. 
- if num_loop // 2 == i: - # PaddlePaddle >= 2.2 - paddle.device.cuda.synchronize(place) - start = time.perf_counter() - model.generate( - input_ids=input_ids, - max_length=args.generate_len, - min_length=args.generate_len, - decode_strategy="beam_search", - num_beams=args.num_beams, - use_fast=args.use_faster, - use_fp16_decoding=args.use_fp16_decoding, - ) - generate_mem = query_by_id(args.gpu_id) - paddle.device.cuda.synchronize(place) - pd_cost = (time.perf_counter() - start) / (num_loop - num_loop // 2) * 1000 - return pd_cost, load_mem - start_mem, generate_mem - start_mem - - -def perf_hf(args): - import torch - from tokenizers_pegasus import PegasusTokenizer as hf_tokenizer - from transformers import PegasusForConditionalGeneration as hf_pegasus - - start_mem = query_by_id(args.gpu_id) - device = torch.device("cuda") - tokenizer = hf_tokenizer.from_pretrained(args.model_name_or_path) - model = hf_pegasus.from_pretrained(args.model_name_or_path) - model.to(device) - model.eval() - load_mem = query_by_id(args.gpu_id) - - input_ids_np = [np.random.choice(range(len(tokenizer.vocab)), args.input_len) for _ in range(args.batch_size)] - input_ids = torch.tensor(input_ids_np) - input_ids = input_ids.to(device) - num_loop = 100 - with torch.no_grad(): - for i in range(num_loop): - # For warmup. - if num_loop // 2 == i: - torch.cuda.synchronize() - start = time.perf_counter() - model.generate( - input_ids, - do_sample=False, - num_beams=args.num_beams, - max_length=args.generate_len + input_ids.shape[-1], - min_length=args.generate_len + input_ids.shape[-1], - ) - generate_mem = query_by_id(args.gpu_id) - torch.cuda.synchronize() - hf_cost = (time.perf_counter() - start) / (num_loop - num_loop // 2) * 1000 - return hf_cost, load_mem - start_mem, generate_mem - start_mem - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--perf_type", - default="pd", - type=str, - choices=["pd", "pd_faster_fp32", "pd_faster_fp16", "hf"], - help="The type of perf. ", - ) - parser.add_argument( - "--model_name_or_path", - default="IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese", - type=str, - choices=[ - "IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese", - "IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese", - ], - help="The model name to specify the pegasus to use. ", - ) - parser.add_argument("--num_beams", default=4, type=int, help="The number of beams to procedure beam search. ") - parser.add_argument("--batch_size", default=1, type=int, help="The size of input batch. ") - parser.add_argument("--input_len", default=60, type=int, help="The size of model input. ") - parser.add_argument("--generate_len", default=20, type=int, help="Length of output . ") - parser.add_argument("--gpu_id", default=2, type=int, help="The id of GPU . ") - parser.add_argument( - "--use_faster", action="store_true", help="Whether to process inference using faster pegasus. " - ) - - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. 
") - args = parser.parse_args() - return args - - -def do_predict(args): - try: - if args.perf_type == "pd": - args.use_faster = False - cost, load_mem, generate_mem = perf_pd(args) - elif args.perf_type == "pd_faster_fp32": - args.use_faster = True - args.use_fp16_decoding = False - cost, load_mem, generate_mem = perf_pd(args) - elif args.perf_type == "pd_faster_fp16": - args.use_faster = True - args.use_fp16_decoding = True - # paddle.set_default_dtype('float16') - cost, load_mem, generate_mem = perf_pd(args) - else: - cost, load_mem, generate_mem = perf_hf(args) - pprint(args) - print( - f"PegasusPerfResult: cost_time: {cost} ms, load_mem: {load_mem} MB, generate_mem:{generate_mem} MB, args:{args}\n" - ) - except Exception as e: - pprint(args) - print(f"PegasusPerfResult: ERROR: {e}, args:{args}\n") - - -if __name__ == "__main__": - args = parse_args() - do_predict(args) diff --git a/fast_generation/perf/run_perf_bart.sh b/fast_generation/perf/run_perf_bart.sh deleted file mode 100644 index fa087770cb5a..000000000000 --- a/fast_generation/perf/run_perf_bart.sh +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -export CUDA_VISIBLE_DEVICES=3 - -for model_name in bart-base bart-large; - do - for top_k in 1 4 8 16; - do - python bart_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --num_beams=1 \ - --top_k=$top_k \ - --top_p=1 \ - --max_length=32 - sleep 10s - python bart_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --num_beams=1 \ - --top_k=$top_k \ - --top_p=1 \ - --max_length=32 \ - --use_fp16_decoding - sleep 10s - done - python bart_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --num_beams=1 \ - --top_k=0 \ - --top_p=0.4 \ - --max_length=32 - sleep 10s - python bart_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --num_beams=1 \ - --top_k=0 \ - --top_p=0.4 \ - --max_length=32 \ - --use_fp16_decoding - sleep 10s - for num_beams in 4 8 16; - do - python bart_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=beam_search \ - --num_beams=$num_beams \ - --top_k=1 \ - --top_p=1 \ - --max_length=32 - sleep 10s - python bart_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=beam_search \ - --num_beams=$num_beams \ - --top_k=1 \ - --top_p=1 \ - --max_length=32 \ - --use_fp16_decoding - sleep 10s - done - done \ No newline at end of file diff --git a/fast_generation/perf/run_perf_codegen.sh b/fast_generation/perf/run_perf_codegen.sh deleted file mode 100644 index be4792096e2e..000000000000 --- a/fast_generation/perf/run_perf_codegen.sh +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -GPU_ID=1 -export CUDA_VISIBLE_DEVICES=${GPU_ID} - -for model_name in Salesforce/codegen-350M-mono Salesforce/codegen-2B-mono Salesforce/codegen-6B-mono; - do - for top_k in 1 4 8 16; - do - for input_len in 60; - do - for generate_len in 20; - do - for perf_type in pd pd_faster_fp32 pd_faster_fp16 hf; - do - echo model_name: $model_name, perf_type: $perf_type, top_k: $top_k, top_p: 1.0, input_len: $input_len, generate_len: $generate_len - python codegen_perf.py \ - --model_name_or_path=$model_name \ - --perf_type=$perf_type \ - --top_k=$top_k \ - --top_p=1.0 \ - --input_len=$input_len \ - --generate_len=$generate_len \ - --gpu_id ${GPU_ID} - sleep 3s - done - done - done - done - for top_p in 0.4; - do - for input_len in 60; - do - for generate_len in 20; - do - for perf_type in pd pd_faster_fp32 pd_faster_fp16 hf; - do - echo model_name: $model_name, perf_type: $perf_type, top_k: 0, top_p: $top_p, input_len: $input_len, generate_len: $generate_len - python codegen_perf.py \ - --model_name_or_path=$model_name \ - --perf_type=$perf_type \ - --top_k=0 \ - --top_p=$top_p \ - --input_len=$input_len \ - --generate_len=$generate_len \ - --gpu_id ${GPU_ID} - sleep 3s - done - done - done - done - done \ No newline at end of file diff --git a/fast_generation/perf/run_perf_gpt.sh b/fast_generation/perf/run_perf_gpt.sh deleted file mode 100644 index 5363b0546af6..000000000000 --- a/fast_generation/perf/run_perf_gpt.sh +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -export CUDA_VISIBLE_DEVICES=3 - -for model_name in gpt2-en gpt2-medium-en gpt2-large-en; - do - for top_k in 1 4 8 16; - do - python gpt_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --top_k=$top_k \ - --top_p=1 \ - --max_length=32 - sleep 10s - python gpt_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --top_k=$top_k \ - --top_p=1 \ - --max_length=32 \ - --use_fp16_decoding - sleep 10s - done - python gpt_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --top_k=0 \ - --top_p=0.4 \ - --max_length=32 - sleep 10s - python gpt_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --top_k=0 \ - --top_p=0.4 \ - --max_length=32 \ - --use_fp16_decoding - sleep 10s - done \ No newline at end of file diff --git a/fast_generation/perf/run_perf_opt.sh b/fast_generation/perf/run_perf_opt.sh deleted file mode 100644 index bc1d525c00ac..000000000000 --- a/fast_generation/perf/run_perf_opt.sh +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -export CUDA_VISIBLE_DEVICES=3 - -for model_name in facebook/opt-125m facebook/opt-350m; - do - for top_k in 1 4 8 16; - do - python opt_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --top_k=$top_k \ - --top_p=0.4 \ - --max_length=32 - sleep 10s - python opt_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --top_k=$top_k \ - --top_p=0.4 \ - --max_length=32 \ - --use_fp16_decoding - sleep 10s - done - python opt_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --top_k=0 \ - --top_p=0.4 \ - --max_length=32 - sleep 10s - python opt_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --top_k=0 \ - --top_p=0.4 \ - --max_length=32 \ - --use_fp16_decoding - sleep 10s - done diff --git a/fast_generation/perf/run_perf_pegasus.sh b/fast_generation/perf/run_perf_pegasus.sh deleted file mode 100644 index 264c28b22c8b..000000000000 --- a/fast_generation/perf/run_perf_pegasus.sh +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -GPU_ID=4 -export CUDA_VISIBLE_DEVICES=${GPU_ID} - -for model_name in IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese; - do - for batch_size in 1 4 8 16; - do - for num_beams in 2 4 6 8; - do - for input_len in 60; - do - for generate_len in 20; - do - for perf_type in pd_faster_fp16 pd_faster_fp32 pd hf; - do - echo model_name: $model_name, perf_type: $perf_type, batch_size:$batch_size, num_beams: $num_beams, input_len: $input_len, generate_len: $generate_len - python pegasus_perf.py \ - --model_name_or_path=$model_name \ - --perf_type=$perf_type \ - --batch_size=$batch_size \ - --num_beams=$num_beams \ - --input_len=$input_len \ - --generate_len=$generate_len \ - --gpu_id ${GPU_ID} - sleep 3s - done - done - done - done - done - done \ No newline at end of file diff --git a/fast_generation/samples/codegen_16b_sample.py b/fast_generation/samples/codegen_16b_sample.py deleted file mode 100644 index 0f556911e813..000000000000 --- a/fast_generation/samples/codegen_16b_sample.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - -from paddlenlp.transformers import CodeGenForCausalLM, CodeGenTokenizer - -# Can be load on A100-40G -paddle.set_default_dtype("float16") -model_name = "Salesforce/codegen-16B-mono" - -tokenizer = CodeGenTokenizer.from_pretrained(model_name) -model = CodeGenForCausalLM.from_pretrained(model_name) -model.eval() - -inputs = "def hello" -input_ids = tokenizer([inputs], return_tensors="pd")["input_ids"] - -# Enable FastGeneration -outputs, _ = model.generate( - input_ids=input_ids, max_length=128, decode_strategy="greedy_search", use_fp16_decoding=True, use_fast=True -) - -result = tokenizer.decode(outputs[0], truncate_before_pattern=[r"\n\n^#", "^'''", "\n\n\n"]) - -print("Model input:", inputs) -print("Result:", result) diff --git a/fast_generation/samples/codegen_sample.py b/fast_generation/samples/codegen_sample.py deleted file mode 100644 index 77cb5c7a335e..000000000000 --- a/fast_generation/samples/codegen_sample.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddlenlp.transformers import CodeGenForCausalLM, CodeGenTokenizer - -model_name = "Salesforce/codegen-350M-mono" - -tokenizer = CodeGenTokenizer.from_pretrained(model_name) -model = CodeGenForCausalLM.from_pretrained(model_name) -model.eval() - -inputs = "def hello" -input_ids = tokenizer([inputs], return_tensors="pd")["input_ids"] - -outputs, _ = model.generate( - input_ids=input_ids, max_length=128, decode_strategy="greedy_search", use_fp16_decoding=True, use_fast=True -) - -result = tokenizer.decode(outputs[0], truncate_before_pattern=[r"\n\n^#", "^'''", "\n\n\n"]) - -print("Model input:", inputs) -print("Result:", result) -# Result: _world(): -# print("Hello World") - -# hello_world() diff --git a/fast_generation/samples/gpt_mp_sample.py b/fast_generation/samples/gpt_mp_sample.py deleted file mode 100644 index 061318e74661..000000000000 --- a/fast_generation/samples/gpt_mp_sample.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -from pprint import pprint - -import paddle - -from paddlenlp.ops import enable_ft_para, get_ft_para_conf -from paddlenlp.transformers import GPTChineseTokenizer, GPTLMHeadModel, GPTTokenizer - -MODEL_CLASSES = { - "gpt-cpm-large-cn": (GPTLMHeadModel, GPTChineseTokenizer), - "gpt-cpm-small-cn-distill": (GPTLMHeadModel, GPTChineseTokenizer), - "gpt2-en": (GPTLMHeadModel, GPTTokenizer), - "gpt2-medium-en": (GPTLMHeadModel, GPTTokenizer), - "gpt2-large-en": (GPTLMHeadModel, GPTTokenizer), - "gpt2-xl-en": (GPTLMHeadModel, GPTTokenizer), -} - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name", - default="gpt-cpm-large-cn", - choices=list(MODEL_CLASSES.keys()), - help="The model name to specify which gpt to use. It can be " + ", ".join(MODEL_CLASSES.keys()), - ) - parser.add_argument("--batch_size", default=4, type=int, help="Batch size.") - parser.add_argument("--max_length", default=50, type=int, help="Maximum output length.") - parser.add_argument( - "--topk", default=1, type=int, help="The number of highest probability tokens to keep for top-k-sampling." - ) - parser.add_argument("--topp", default=1.0, type=float, help="The cumulative probability for top-p-filtering.") - parser.add_argument("--temperature", default=1.0, type=float, help="The temperature to set.") - parser.add_argument("--tensor_para_size", default=2, type=int, help="The size for tensor parallel.") - parser.add_argument("--layer_para_size", default=1, type=int, help="The size for layer parallel.") - parser.add_argument( - "--layer_para_batch_size", - default=None, - type=int, - help="The local batch size for pipeline parallel." 
"It is suggested to use `batch_size // layer_para_size`.", - ) - parser.add_argument("--use_fp16", action="store_true", help="Whether to use fp16 to predict.") - parser.add_argument("--profile", action="store_true", help="Whether to profile.") - args = parser.parse_args() - return args - - -def profile(batch_size, total_step=50, warmup_step=10, rank=0): - def _wrapper(func): - def _impl(*args, **kwargs): - for i in range(total_step): - if i == warmup_step: - paddle.device.cuda.synchronize() - start_time = time.time() - out = func(*args, **kwargs) - paddle.device.cuda.synchronize() - end_time = time.time() - if rank is None or get_ft_para_conf().rank == rank: - time_interval = end_time - start_time - num_batch = total_step - warmup_step - print("Latency: %2fs, QPS: %2f" % (time_interval / num_batch, num_batch * batch_size / time_interval)) - return out - - return _impl - - return _wrapper - - -def main(args): - if args.use_fp16: - paddle.set_default_dtype("float16") - enable_ft_para( - args.tensor_para_size, - args.layer_para_size, - args.batch_size // args.layer_para_size if args.layer_para_batch_size is None else args.layer_para_batch_size, - ) - # TODO(guosheng): Maybe device can be set in `enable_ft_para` - paddle.set_device("gpu:" + str(get_ft_para_conf().rank)) - - model_name = args.model_name - if args.profile: - MODEL_CLASSES[model_name][0].generate = profile(args.batch_size)(MODEL_CLASSES[model_name][0].generate) - tokenizer = MODEL_CLASSES[model_name][-1].from_pretrained(model_name) - model = MODEL_CLASSES[model_name][0].from_pretrained(model_name) - model.eval() - - # NOTE: When using prompt, open this and replace the text with what you want. - input = "花间一壶酒,独酌无相亲。举杯邀明月," - # input = '一时黛玉进了荣府,下了车。众嬷嬷引着,便往东转弯,' - # input = '爱因斯坦曾经说过:' - input_ids = tokenizer(input)["input_ids"] - # NOTE: When generating from the beginning, open this. - # input_ids = [tokenizer.eos_token_id] - input_ids = [input_ids] * args.batch_size - - inputs_ids = paddle.to_tensor(input_ids, dtype="int32") - outputs, _ = model.generate( - input_ids=inputs_ids, - max_length=args.max_length, - decode_strategy="sampling", - top_k=args.topk, - top_p=args.topp, - temperature=args.temperature, - use_fast=True, - ) - - # Only make the first process to output. - if get_ft_para_conf().rank == 0: - for i in range(len(outputs)): - result = tokenizer.convert_ids_to_string(outputs[i].numpy().tolist()) - print("Result:", result) - - -if __name__ == "__main__": - args = parse_args() - pprint(args) - main(args) diff --git a/fast_generation/samples/gpt_sample.py b/fast_generation/samples/gpt_sample.py deleted file mode 100644 index e0cff0bba726..000000000000 --- a/fast_generation/samples/gpt_sample.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import paddle - -from paddlenlp.transformers import GPTChineseTokenizer, GPTLMHeadModel - -model_name = "gpt-cpm-small-cn-distill" - -tokenizer = GPTChineseTokenizer.from_pretrained(model_name) -model = GPTLMHeadModel.from_pretrained(model_name) -model.eval() - -inputs = "花间一壶酒,独酌无相亲。举杯邀明月," -inputs_ids = tokenizer(inputs)["input_ids"] -inputs_ids = paddle.to_tensor(inputs_ids, dtype="int64").unsqueeze(0) - -outputs, _ = model.generate(input_ids=inputs_ids, max_length=10, decode_strategy="greedy_search", use_fast=True) - -result = tokenizer.convert_ids_to_string(outputs[0].numpy().tolist()) - -print("Model input:", inputs) -print("Result:", result) -# 对影成三人。 diff --git a/fast_generation/samples/gptj_sample.py b/fast_generation/samples/gptj_sample.py deleted file mode 100644 index 17615667dfda..000000000000 --- a/fast_generation/samples/gptj_sample.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - -from paddlenlp.transformers import GPTJForCausalLM, GPTJTokenizer - -paddle.set_default_dtype("float16") -model_name = "EleutherAI/gpt-j-6B" - -tokenizer = GPTJTokenizer.from_pretrained(model_name) -model = GPTJForCausalLM.from_pretrained(model_name) -model.eval() - -inputs = "What is PaddleNLP?" -input_ids = tokenizer([inputs], return_tensors="pd")["input_ids"] - -outputs, _ = model.generate( - input_ids=input_ids, - max_length=100, - decode_strategy="sampling", - temperature=0.8, - top_p=0.9, - use_fp16_decoding=True, - use_fast=True, -) - -result = tokenizer.decode(outputs[0]) - -print("Model input:", inputs) -print("Result:", result) diff --git a/fast_generation/samples/mbart_sample.py b/fast_generation/samples/mbart_sample.py deleted file mode 100644 index e16c4e7de176..000000000000 --- a/fast_generation/samples/mbart_sample.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import paddle - -from paddlenlp.transformers import MBart50Tokenizer, MBartForConditionalGeneration - -model_name = "mbart-large-50-many-to-many-mmt" - -tokenizer = MBart50Tokenizer.from_pretrained(model_name, src_lang="en_XX") -model = MBartForConditionalGeneration.from_pretrained(model_name) -model.eval() - - -def postprocess_response(seq, bos_idx, eos_idx): - """Post-process the decoded sequence.""" - eos_pos = len(seq) - 1 - for i, idx in enumerate(seq): - if idx == eos_idx: - eos_pos = i - break - seq = [idx for idx in seq[: eos_pos + 1] if idx != bos_idx and idx != eos_idx] - res = tokenizer.convert_ids_to_string(seq) - return res - - -bos_id = tokenizer.lang_code_to_id["zh_CN"] -eos_id = model.mbart.config["eos_token_id"] - -inputs = "PaddleNLP is a powerful NLP library with Awesome pre-trained models and easy-to-use interface, supporting wide-range of NLP tasks from research to industrial applications." -input_ids = tokenizer(inputs)["input_ids"] -input_ids = paddle.to_tensor(input_ids, dtype="int32").unsqueeze(0) - -outputs, _ = model.generate( - input_ids=input_ids, - forced_bos_token_id=bos_id, - decode_strategy="beam_search", - num_beams=4, - max_length=50, - use_fast=True, -) - -result = postprocess_response(outputs[0].numpy().tolist(), bos_id, eos_id) - -print("Model input:", inputs) - -print("Result:", result) -# PaddleNLP是一个强大的NLP库,具有超乎寻常的预训练模型和易于使用的接口,支持从研究到工业应用的广泛的NLP任务。 diff --git a/fast_generation/samples/opt_sample.py b/fast_generation/samples/opt_sample.py deleted file mode 100644 index 812fd6e01b8f..000000000000 --- a/fast_generation/samples/opt_sample.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - -from paddlenlp.transformers import GPTTokenizer, OPTForCausalLM - -model_name = "facebook/opt-350m" - -tokenizer = GPTTokenizer.from_pretrained(model_name) -model = OPTForCausalLM.from_pretrained(model_name) -model.eval() - -inputs = """a chat between a curious human and Statue of Liberty. -Human: What is your name? -Statue: I am statue of liberty. -Human: where do you live? -Statue: New york city. -Human: how long have you lived there?。""" - -inputs_ids = tokenizer([inputs])["input_ids"] -inputs_ids = paddle.to_tensor(inputs_ids, dtype="int64") - -outputs, _ = model.generate( - input_ids=inputs_ids, - max_length=20, - decode_strategy="greedy_search", - use_fast=True, -) - -result = tokenizer.convert_ids_to_string(outputs[0].numpy().tolist()) - -print("Model input:", inputs) -print("Result:", result) diff --git a/fast_generation/samples/pegasus_sample.py b/fast_generation/samples/pegasus_sample.py deleted file mode 100644 index ddbc340808b6..000000000000 --- a/fast_generation/samples/pegasus_sample.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddlenlp.transformers import ( - PegasusChineseTokenizer, - PegasusForConditionalGeneration, -) - -model = PegasusForConditionalGeneration.from_pretrained("IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese") -tokenizer = PegasusChineseTokenizer.from_pretrained("IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese") -model.eval() - -inputs = "在北京冬奥会自由式滑雪女子坡面障碍技巧决赛中,中国选手谷爱凌夺得银牌。祝贺谷爱凌!今天上午,自由式滑雪女子坡面障碍技巧决赛举行。决赛分三轮进行,取选手最佳成绩排名决出奖牌。第一跳,中国选手谷爱凌获得69.90分。在12位选手中排名第三。完成动作后,谷爱凌又扮了个鬼脸,甚是可爱。第二轮中,谷爱凌在道具区第三个障碍处失误,落地时摔倒。获得16.98分。网友:摔倒了也没关系,继续加油!在第二跳失误摔倒的情况下,谷爱凌顶住压力,第三跳稳稳发挥,流畅落地!获得86.23分!此轮比赛,共12位选手参赛,谷爱凌第10位出场。网友:看比赛时我比谷爱凌紧张,加油!" -tokenized = tokenizer(inputs, return_tensors="pd") -outputs, _ = model.generate( - input_ids=tokenized["input_ids"], - decode_strategy="beam_search", - num_beams=4, - use_fp16_decoding=True, - use_fast=True, -) -result = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=False) - -print("Model input:", inputs) -print("Result:", result) diff --git a/fast_generation/samples/plato_sample.py b/fast_generation/samples/plato_sample.py deleted file mode 100644 index ac79e60918e4..000000000000 --- a/fast_generation/samples/plato_sample.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddlenlp.transformers import ( - UnifiedTransformerLMHeadModel, - UnifiedTransformerTokenizer, -) - -model_name = "plato-mini" - -tokenizer = UnifiedTransformerTokenizer.from_pretrained(model_name) -model = UnifiedTransformerLMHeadModel.from_pretrained(model_name) -model.eval() - - -def postprocess_response(token_ids, tokenizer): - """Post-process the decoded sequence. 
Truncate from the first .""" - eos_pos = len(token_ids) - for i, tok_id in enumerate(token_ids): - if tok_id == tokenizer.sep_token_id: - eos_pos = i - break - token_ids = token_ids[:eos_pos] - tokens = tokenizer.convert_ids_to_tokens(token_ids) - tokens = tokenizer.merge_subword(tokens) - return tokens - - -inputs = "你好啊,你今年多大了" - -inputs_ids = tokenizer.dialogue_encode( - inputs, add_start_token_as_response=True, return_tensors=True, is_split_into_words=False -) - -outputs, _ = model.generate( - input_ids=inputs_ids["input_ids"], - token_type_ids=inputs_ids["token_type_ids"], - position_ids=inputs_ids["position_ids"], - attention_mask=inputs_ids["attention_mask"], - max_length=64, - decode_strategy="sampling", - top_k=5, - use_fast=True, -) - -result = postprocess_response(outputs[0].numpy(), tokenizer) -result = "".join(result) - -print("Model input:", inputs) -print("Result:", result) -# 我今年23岁了,你今年多大了? diff --git a/fast_generation/samples/plato_xl_sample.py b/fast_generation/samples/plato_xl_sample.py deleted file mode 100644 index 9c6138a9721b..000000000000 --- a/fast_generation/samples/plato_xl_sample.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import time -from distutils.util import strtobool -from pprint import pprint - -import paddle - -from paddlenlp.data import DataCollatorWithPadding -from paddlenlp.ops import enable_ft_para, get_ft_para_conf -from paddlenlp.transformers import ( - UnifiedTransformerLMHeadModel, - UnifiedTransformerTokenizer, -) - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--use_role", type=strtobool, default=True, help="Whether to use role embeddings.") - parser.add_argument( - "--position_style", - default="relative", - choices=["continuous", "relative"], - type=str, - help="The type for positional embedding. Default is relative.", - ) - parser.add_argument("--batch_size", default=1, type=int, help="Batch size.") - parser.add_argument( - "--num_return_sequences", default=1, type=int, help="The number of returned sequences for each sample." - ) - parser.add_argument("--max_out_len", default=64, type=int, help="Maximum output sequence length.") - parser.add_argument("--min_out_len", default=1, type=int, help="Minimum output sequence length.") - parser.add_argument( - "--topk", default=1, type=int, help="The number of highest probability tokens to keep for top-k-sampling." 
- ) - parser.add_argument("--topp", default=1.0, type=float, help="The cumulative probability for top-p-filtering.") - parser.add_argument("--temperature", default=1.0, type=float, help="The temperature to set.") - parser.add_argument("--use_fp16", action="store_true", help="Whether to use fp16 to predict.") - parser.add_argument("--profile", action="store_true", help="Whether to profile.") - args = parser.parse_args() - return args - - -def profile(batch_size, total_step=50, warmup_step=10, rank=0): - def _wrapper(func): - def _impl(*args, **kwargs): - for i in range(total_step): - if i == warmup_step: - paddle.device.cuda.synchronize() - start_time = time.time() - out = func(*args, **kwargs) - paddle.device.cuda.synchronize() - end_time = time.time() - if rank is None or get_ft_para_conf().rank == rank: - time_interval = end_time - start_time - num_batch = total_step - warmup_step - print("Latency: %2fs, QPS: %2f" % (time_interval / num_batch, num_batch * batch_size / time_interval)) - return out - - return _impl - - return _wrapper - - -def postprocess_response(token_ids, tokenizer): - """Post-process the decoded sequence. Truncate from the first .""" - eos_pos = len(token_ids) - for i, tok_id in enumerate(token_ids): - if tok_id == tokenizer.sep_token_id: - eos_pos = i - break - token_ids = token_ids[:eos_pos] - tokens = tokenizer.convert_ids_to_tokens(token_ids) - tokens = tokenizer.merge_subword(tokens) - response = " ".join(tokens) - return response - - -def main(args): - # For memory saving when using FastGeneration: - # If environment variable `PPFG_QKV_MEM_OPT` is set and the weights of q/k/v - # is fused, it will try to delete the original unfused weights. Note the - # rollback to original model would not be guarantee anymore when the fast - # model failed if the original weights are deleted. - os.environ["PPFG_QKV_MEM_OPT"] = "1" - if args.use_fp16: - paddle.set_default_dtype("float16") - enable_ft_para() - # TODO(guosheng): Maybe device can be set in `enable_ft_para` - paddle.set_device("gpu:" + str(get_ft_para_conf().rank)) - - if args.profile: - UnifiedTransformerLMHeadModel.generate = profile(args.batch_size)(UnifiedTransformerLMHeadModel.generate) - tokenizer = UnifiedTransformerTokenizer.from_pretrained("plato-xl") - model = UnifiedTransformerLMHeadModel.from_pretrained("plato-xl") - model.eval() - - history = [ - "hi , Mary ! What do you usually like to do in your spare time ?", - "well , I spend a lot of time watching movies .", - "what a confidence ! I always watch a lot of movies , too ." - "oh really , Frank ? 
What kind of movies do you like ?", - ] - inputs = [history] * args.batch_size - inputs = list( - map( - lambda history: tokenizer.dialogue_encode( - history=history, - add_start_token_as_response=True, - return_length=True, - return_role_ids=args.use_role, - position_style=args.position_style, - ), - inputs, - ) - ) - collator = DataCollatorWithPadding(tokenizer) - data = collator(inputs) - - outputs, _ = model.generate( - input_ids=data["input_ids"], - token_type_ids=data["token_type_ids"], - position_ids=data["position_ids"], - attention_mask=data["attention_mask"].cast("float32"), # TODO(guosheng): remove this cast - role_ids=data.get("role_ids", None), - seq_len=data["seq_len"], - max_length=args.max_out_len, - min_length=args.min_out_len, - decode_strategy="sampling", - top_k=args.topk, - top_p=args.topp, - temperature=args.temperature, - num_return_sequences=args.num_return_sequences, - use_fast=True, - use_fp16_decoding=args.use_fp16, - ) - - # Only make the first process to output. - if get_ft_para_conf().rank == 0: - for i in range(len(outputs)): - result = postprocess_response(outputs[i].numpy(), tokenizer) - print("Result:", result) - - -if __name__ == "__main__": - args = parse_args() - pprint(args) - main(args) diff --git a/fast_generation/samples/t5_sample.py b/fast_generation/samples/t5_sample.py deleted file mode 100644 index 53ad13f903c1..000000000000 --- a/fast_generation/samples/t5_sample.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse - -from paddlenlp.transformers import T5ForConditionalGeneration, T5Tokenizer - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--max_length", default=256, type=int, help="Maximum output sequence length.") - parser.add_argument("--beam_size", default=4, type=int, help="The beam size to set.") - parser.add_argument("--use_faster", action="store_true", help="Whether to use faster to predict.") - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 to predict.") - args = parser.parse_args() - return args - - -def predict(args): - model_name = "t5-base" - - model = T5ForConditionalGeneration.from_pretrained(model_name) - model.eval() - tokenizer = T5Tokenizer.from_pretrained(model_name) - - en_text = ' This image section from an infrared recording by the Spitzer telescope shows a "family portrait" of countless generations of stars: the oldest stars are seen as blue dots. 
' - input_ids = tokenizer.encode("translate English to French: " + en_text, return_tensors="pd")["input_ids"] - - output, _ = model.generate( - input_ids=input_ids, - num_beams=args.beam_size, - max_length=args.max_length, - decode_strategy="beam_search", - use_fast=True, # args.use_faster, - use_fp16_decoding=args.use_fp16_decoding, - ) - - translation = tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False) - - print("The original sentence: ", en_text) - print("The translation result: ", translation) - - -if __name__ == "__main__": - args = parse_args() - - predict(args) diff --git a/fast_generation/samples/unimo_text_sample.py b/fast_generation/samples/unimo_text_sample.py deleted file mode 100644 index 29197be47e52..000000000000 --- a/fast_generation/samples/unimo_text_sample.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddlenlp.transformers import UNIMOLMHeadModel, UNIMOTokenizer - -model_name = "unimo-text-1.0-lcsts-new" - -model = UNIMOLMHeadModel.from_pretrained(model_name) -model.eval() -tokenizer = UNIMOTokenizer.from_pretrained(model_name) - - -def postprocess_response(token_ids, tokenizer): - """Post-process the decoded sequence. Truncate from the first .""" - eos_pos = len(token_ids) - for i, tok_id in enumerate(token_ids): - if tok_id == tokenizer.mask_token_id: - eos_pos = i - break - token_ids = token_ids[:eos_pos] - tokens = tokenizer.convert_ids_to_tokens(token_ids) - tokens = tokenizer.merge_subword(tokens) - return tokens - - -inputs = "深度学习是人工智能的核心技术领域。百度飞桨作为中国首个自主研发、功能丰富、开源开放的产业级深度学习平台,将从多层次技术产品、产业AI人才培养和强大的生态资源支持三方面全面护航企业实现快速AI转型升级。" - -inputs_ids = tokenizer.gen_encode( - inputs, add_start_token_for_decoding=True, return_tensors=True, is_split_into_words=False -) - -outputs, _ = model.generate( - input_ids=inputs_ids["input_ids"], - token_type_ids=inputs_ids["token_type_ids"], - position_ids=inputs_ids["position_ids"], - attention_mask=inputs_ids["attention_mask"], - max_length=64, - decode_strategy="beam_search", - num_beams=2, - use_fast=True, -) - -result = postprocess_response(outputs[0].numpy(), tokenizer) -result = "".join(result) - -print("Model input:", inputs) -print("Result:", result) -# 百度飞桨:深度学习助力企业转型升级 diff --git a/paddlenlp/ops/CMakeLists.txt b/paddlenlp/ops/CMakeLists.txt deleted file mode 100644 index d0914969a979..000000000000 --- a/paddlenlp/ops/CMakeLists.txt +++ /dev/null @@ -1,490 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -cmake_minimum_required(VERSION 3.10 FATAL_ERROR) -project(FasterTransformer LANGUAGES C CXX CUDA) - -find_package(CUDA 10.1 REQUIRED) - -find_program(CCACHE_PROGRAM ccache) -if(CCACHE_PROGRAM) - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) -endif() - -INCLUDE(ExternalProject) - -set(CXX_STD "17" CACHE STRING "C++ standard") - -option(ON_INFER "Compiled with inference. " OFF) -option(WITH_GPU "Compiled with GPU/CPU, default use CPU." ON) -option(WITH_MKL "Compile with MKL. Only works when ON_INFER is ON." ON) -option(USE_TENSORRT "Compiled with TensorRT." OFF) -option(WITH_TRANSFORMER "Compiled with Transformer." ON) -option(WITH_GPT "Compiled with GPT." ON) -option(WITH_OPT "Compiled with OPT." ON) -option(WITH_UNIFIED "Compiled with Unified Transformer." ON) -option(WITH_T5 "Compiled with T5." ON) -option(WITH_SP "Compiled with sentencepiece. Only works when WITH_GPT and ON_INFER is ON." OFF) -option(WITH_DECODER "Compile with Transformer Decoder" ON) -option(WITH_ENCODER "Compile with Transformer Encoder" ON) -option(WITH_STATIC_LIB "Compile static lib" OFF) -option(WITH_BART "Compile with BART" ON) -option(WITH_MBART "Compile with MBART" ON) -option(WITH_PARALLEL "Compile with model parallel for GPT" OFF) -option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF) -option(WITH_GPTJ "Compile with GPTJ" ON) -option(WITH_PEGASUS "Compile with Pegasus" ON) - -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") -if(WITH_PARALLEL) - # https://cmake.org/cmake/help/latest/module/FindMPI.html#variables-for-locating-mpi - # https://github.com/Kitware/CMake/blob/master/Modules/FindMPI.cmake - find_package(MPI REQUIRED) - find_package(NCCL REQUIRED) - add_definitions(-DBUILD_GPT) - list(APPEND decoding_op_files parallel_utils.cc) -endif() - -if(NOT WITH_GPU) - message(FATAL_ERROR "Faster transformer custom op doesn't support CPU. Please add the flag -DWITH_GPU=ON to use GPU. 
") -endif() - -list(APPEND decoding_op_files cublas_handle.cc utils.cc) - -if(WITH_TRANSFORMER) - list(APPEND decoding_op_files fusion_decoding_op.cc fusion_decoding_op.cu fusion_force_decoding_op.cc fusion_force_decoding_op.cu) -endif() - -if(WITH_GPT) - list(APPEND decoding_op_files fusion_gpt_op.cc fusion_gpt_op.cu) -endif() - -if(WITH_OPT) - list(APPEND decoding_op_files fusion_opt_op.cc fusion_opt_op.cu) -endif() - -if(WITH_UNIFIED) - list(APPEND decoding_op_files fusion_unified_decoding_op.cc fusion_unified_decoding_op.cu fusion_miro_op.cc fusion_miro_op.cu) -endif() - -if(WITH_ENCODER) - list(APPEND decoding_op_files fusion_encoder_op.cc fusion_encoder_op.cu) -endif() - -if(WITH_DECODER) - list(APPEND decoding_op_files fusion_decoder_op.cc fusion_decoder_op.cu) -endif() - -if(WITH_BART) - list(APPEND decoding_op_files fusion_bart_decoding_op.cc fusion_bart_decoding_op.cu) -endif() - -if(WITH_MBART) - list(APPEND decoding_op_files fusion_mbart_decoding_op.cc fusion_mbart_decoding_op.cu) -endif() - -if(WITH_GPTJ) - list(APPEND decoding_op_files fusion_gptj_op.cc fusion_gptj_op.cu) -endif() - -if(WITH_PEGASUS) - list(APPEND decoding_op_files fusion_pegasus_decoding_op.cc fusion_pegasus_decoding_op.cu) -endif() - -if(WITH_T5) - list(APPEND decoding_op_files fusion_t5_decoding_op.cc fusion_t5_decoding_op.cu) -endif() - -if(NOT WITH_TRANSFORMER AND NOT WITH_GPT AND NOT WITH_DECODER AND NOT WITH_ENCODER AND NOT WITH_BART AND NOT WITH_MBART AND NOT WITH_GPTJ AND NOT WITH_PEGASUS AND NOT WITH_T5) - message(FATAL_ERROR "-DWITH_TRANSFORMER=ON or/and -DWITH_GPT=ON or/and -DWITH_DECODER=ON or/and -DWITH_ENCODER=ON or/and -DWITH_BART=ON or/and -DWITH_MBART=ON or/and -DWITH_GPTJ=ON or/and -DWITH_PEGASUS=ON or/and -DWITH_T5=ON must be set to use FasterTransformer. ") -endif() - -set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR}) - -list(APPEND CMAKE_MODULE_PATH ${CUDA_PATH}/lib64) - -# Setting compiler flags -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall") - -###################################################################################### -# A function for automatic detection of GPUs installed (if autodetection is enabled) -# Usage: -# detect_installed_gpus(out_variable) -function(detect_installed_gpus out_variable) - if(NOT CUDA_gpu_detect_output) - set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu) - - file(WRITE ${cufile} "" - "#include \"stdio.h\"\n" - "#include \"cuda.h\"\n" - "#include \"cuda_runtime.h\"\n" - "int main() {\n" - " int count = 0;\n" - " if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n" - " if (count == 0) return -1;\n" - " for (int device = 0; device < count; ++device) {\n" - " cudaDeviceProp prop;\n" - " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" - " printf(\"%d.%d \", prop.major, prop.minor);\n" - " }\n" - " return 0;\n" - "}\n") - - execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" - "--run" "${cufile}" - WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" - RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - - if(nvcc_res EQUAL 0) - # Only use last item of nvcc_out (the last device's compute capability). - string(REGEX REPLACE "\\." 
"" nvcc_out "${nvcc_out}") - string(REGEX MATCHALL "[0-9()]+" nvcc_out "${nvcc_out}") - list(GET nvcc_out -1 nvcc_out) - set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_installed_gpus tool" FORCE) - endif() - endif() - - if(NOT CUDA_gpu_detect_output) - message(STATUS "Automatic GPU detection failed. Building for all known architectures.") - set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE) - else() - set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE) - endif() -endfunction() - -if (NOT SM) - # TODO(guosheng): Remove it if `GetCUDAComputeCapability` is exposed by paddle. - # Currently, if `CUDA_gpu_detect_output` is not defined, use the detected arch. - detect_installed_gpus(SM) -endif() - -#[[ -if (SM STREQUAL 80 OR - SM STREQUAL 86 OR - SM STREQUAL 70 OR - SM STREQUAL 75 OR - SM STREQUAL 61 OR - SM STREQUAL 60) -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\"") - if (SM STREQUAL 70 OR SM STREQUAL 75 OR SM STREQUAL 80 OR SM STREQUAL 86) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA") - endif() -message("-- Assign GPU architecture (sm=${SM})") - -else() -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \ - -gencode=arch=compute_70,code=\\\"sm_70,compute_70\\\" \ - -gencode=arch=compute_75,code=\\\"sm_75,compute_75\\\" \ - ") - -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA") - -message("-- Assign GPU architecture (sm=70,75)") -endif() -]] - -set(SM_SETS 52 60 61 70 75 80) -set(USING_WMMA False) -set(FIND_SM False) - -foreach(SM_NUM IN LISTS SM_SETS) - string(FIND "${SM}" "${SM_NUM}" SM_POS) - if(SM_POS GREATER -1) - set(FIND_SM True) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM_NUM},code=\\\"sm_${SM_NUM},compute_${SM_NUM}\\\"") - - if (SM_NUM STREQUAL 70 OR SM_NUM STREQUAL 75 OR SM_NUM STREQUAL 80 OR SM_NUM STREQUAL 86) - set(USING_WMMA True) - endif() - - set(CMAKE_CUDA_ARCHITECTURES ${SM_NUM}) - message("-- Assign GPU architecture (sm=${SM_NUM})") - endif() -endforeach() - -if(USING_WMMA STREQUAL True) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA") - message("-- Use WMMA") -endif() - -if(NOT (FIND_SM STREQUAL True)) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \ - -gencode=arch=compute_70,code=\\\"sm_70,compute_70\\\" \ - -gencode=arch=compute_75,code=\\\"sm_75,compute_75\\\" \ - -gencode=arch=compute_80,code=\\\"sm_80,compute_80\\\" \ - ") - - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA") - if(BUILD_PYT) - set(ENV{TORCH_CUDA_ARCH_LIST} "7.0;7.5;8.0") - endif() - set(CMAKE_CUDA_ARCHITECTURES 70 75 80) - message("-- Assign GPU architecture (sm=70,75,80)") -endif() - -set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wall -O0") -set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -O0") -set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall") - -set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std=c++{CXX_STD}") -set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std=gnu++{CXX_STD}") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") 
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++${CXX_STD}") - -set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") -set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3") - -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) - -list(APPEND COMMON_HEADER_DIRS - ${PROJECT_SOURCE_DIR} - ${CUDA_PATH}/include) - -set(COMMON_LIB_DIRS - ${CUDA_PATH}/lib64 -) - -if(WITH_PARALLEL) - list(APPEND COMMON_HEADER_DIRS - ${NCCL_INCLUDE_PATH} - ${MPI_INCLUDE_PATH}) -endif() - -set(THIRD_PATH "third-party") -set(THIRD_PARTY_NAME "fastertransformer") - -include(external/boost) - -set(OPS_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/utils/allocator.h allocator_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/utils/allocator.h allocator_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/utils/common.h common_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/utils/common.h common_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/utils/common_structure.h common_structure_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/utils/common_structure.h common_structure_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/CMakeLists.txt cmakelists_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/CMakeLists.txt cmakelists_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/topk_kernels.cu topk_kernels_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/topk_kernels.cu topk_kernels_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/lightseq_kernels.cu lightseq_kernels_cu_src) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/open_decoder.cu open_decoder_cu_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/open_decoder.cu open_decoder_cu_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/open_decoder.h open_decoder_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/open_decoder.h open_decoder_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/cuda_kernels.h cuda_kernels_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/cuda_kernels.h cuda_kernels_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/cuda_kernels.cu cuda_kernels_cu_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/cuda_kernels.cu cuda_kernels_cu_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/decoding_kernels.cu decoding_kernels_cu_src) -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/transformer_decoding_kernels.cu trans_decoding_kernels_cu_src) -file(TO_NATIVE_PATH 
${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/decoding_kernels.cu decoding_kernels_cu_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/open_decoder.cuh open_decoder_cuh_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/open_decoder.cuh open_decoder_cuh_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/utils/arguments.h arguments_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/utils/arguments.h arguments_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/decoding_beamsearch.h decoding_beamsearch_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/decoding_beamsearch.h decoding_beamsearch_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/decoding_sampling.h decoding_sampling_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/decoding_sampling.h decoding_sampling_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/online_softmax_beamsearch_kernels.cu online_softmax_beamsearch_kernels_cu_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/online_softmax_beamsearch_kernels.cu online_softmax_beamsearch_kernels_cu_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/topk_kernels.cuh topk_kernels_cuh_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/topk_kernels.cuh topk_kernels_cuh_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/transformer_kernels.cu trans_kernels_cu_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/transformer_kernels.cu trans_kernels_cu_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/transformer_kernels.cuh trans_kernels_cuh_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/transformer_kernels.cuh trans_kernels_cuh_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/masked_multihead_attention.cu masked_multihead_attention_cu_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/masked_multihead_attention.cu masked_multihead_attention_cu_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/gpt.h gpt_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/gpt.h gpt_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/opt.h opt_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/opt.h opt_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/gptj.h gptj_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/gptj.h gptj_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/masked_multihead_attention_utils.h 
masked_multihead_attention_utils_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/masked_multihead_attention_utils.h masked_multihead_attention_utils_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/masked_multihead_attention.h masked_multihead_attention_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/masked_multihead_attention.h masked_multihead_attention_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/attention_kernels.cu attention_kernels_cu_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/attention_kernels.cu attention_kernels_cu_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/attention_kernels.cuh attention_kernels_cuh_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/attention_kernels.cuh attention_kernels_cuh_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/t5_beamsearch.h t5_bs_h_src) -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/t5_sampling.h t5_spl_h_src) -set(ft_dst ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/) - -# Encoder patches. -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/bert_encoder_transformer.h bert_encoder_transformer_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/bert_encoder_transformer.h bert_encoder_transformer_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/standard_encoder.h standard_encoder_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/standard_encoder.h standard_encoder_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/open_attention.h open_attention_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/open_attention.h open_attention_h_dst) - -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/open_attention.cu open_attention_cu_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/CMakeLists.txt fastertransformer_cmakelists_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/CMakeLists.txt fastertransformer_cmakelists_dst) -# Encoder patches end. 
- -# TODO(guosheng): `find` seems meeting errors missing argument to `-exec', fix it -set(MUTE_COMMAND grep -rl "printf(\"\\[WARNING\\]" ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/ | xargs -i{} sed -i "s/printf(\"\\WWARNING\\W decoding[^)]\\{1,\\})/ /" {}) -set(OPEN_ATTENTION_MUTE_COMMAND grep -rl "printf(\"\\[WARNING\\]" ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/ | xargs -i{} sed -i "s/printf(\"\\WWARNING\\W\\WOpenMultiHeadAttention\\W[^)]\\{1,\\})/ /" {}) - -set(RM_OLD_CUB_COMMAND rm -rf ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/cub) - -set(FT_PATCH_COMMAND - printf \\n\\n > blank_lines - && cp ${allocator_src} ${allocator_dst} - && cp ${common_src} ${common_dst} - && cp ${common_structure_src} ${common_structure_dst} - && cp ${cmakelists_src} ${cmakelists_dst} - && cp ${topk_kernels_src} ${topk_kernels_dst} - && cp ${decoding_beamsearch_h_src} ${decoding_beamsearch_h_dst} - && cp ${decoding_sampling_h_src} ${decoding_sampling_h_dst} - && cp ${online_softmax_beamsearch_kernels_cu_src} ${online_softmax_beamsearch_kernels_cu_dst} - && cp ${arguments_h_src} ${arguments_h_dst} - && cp ${open_decoder_h_src} ${open_decoder_h_dst} - && cp ${standard_encoder_h_src} ${standard_encoder_h_dst} - && cp ${bert_encoder_transformer_h_src} ${bert_encoder_transformer_h_dst} - && cp ${trans_kernels_cu_src} ${trans_kernels_cu_dst} - && cp ${masked_multihead_attention_cu_src} ${masked_multihead_attention_cu_dst} - && cp ${open_attention_h_src} ${open_attention_h_dst} - && cp ${fastertransformer_cmakelists_src} ${fastertransformer_cmakelists_dst} - && cp ${gpt_h_src} ${gpt_h_dst} - && cp ${opt_h_src} ${opt_h_dst} - && cp ${gptj_h_src} ${gptj_h_dst} - && cp ${masked_multihead_attention_h_src} ${masked_multihead_attention_h_dst} - && cp ${t5_bs_h_src} ${ft_dst} - && cp ${t5_spl_h_src} ${ft_dst} - && cat blank_lines ${masked_multihead_attention_utils_h_src} >> ${masked_multihead_attention_utils_h_dst} - && cat blank_lines ${attention_kernels_cu_src} >> ${attention_kernels_cu_dst} - && cat blank_lines ${attention_kernels_cuh_src} >> ${attention_kernels_cuh_dst} - && cat blank_lines ${cuda_kernels_h_src} >> ${cuda_kernels_h_dst} - && cat blank_lines ${lightseq_kernels_cu_src} >> ${topk_kernels_dst} - && cat blank_lines ${cuda_kernels_cu_src} >> ${cuda_kernels_cu_dst} - && cat blank_lines ${decoding_kernels_cu_src} >> ${decoding_kernels_cu_dst} - && cat blank_lines ${topk_kernels_cuh_src} >> ${topk_kernels_cuh_dst} - && cat blank_lines ${trans_decoding_kernels_cu_src} >> ${decoding_kernels_cu_dst} - && cat blank_lines ${open_decoder_cu_src} >> ${open_decoder_cu_dst} - && cat blank_lines ${open_decoder_cuh_src} >> ${open_decoder_cuh_dst} - && cat blank_lines ${trans_kernels_cuh_src} >> ${trans_kernels_cuh_dst} - && sed -i "s/^#define NEW_TRANSPOSE_BATCH_MAJOR 1/#define NEW_TRANSPOSE_BATCH_MAJOR 0/g" ${open_decoder_cu_dst} - && sed -i "2091,2119d" ${open_attention_cu_dst} - && rm blank_lines - && ${MUTE_COMMAND} - && ${OPEN_ATTENTION_MUTE_COMMAND} - && ${RM_OLD_CUB_COMMAND} -) - -# TODO(guosheng): Use UPDATE_COMMAND instead of PATCH_COMMAND to make cmake -# re-run always use the latest patches when the developer changes FT patch codes, -# all patches rather than the changes would re-build, any better way to do this. -# Or maybe hidden this function for simplicity. -set(FT_UPDATE_COMMAND git checkout nccl_dependent_refine && git checkout . 
&& ${FT_PATCH_COMMAND}) - -ExternalProject_Add( - extern_${THIRD_PARTY_NAME} - GIT_REPOSITORY https://gitee.com/paddlepaddle/FasterTransformer.git - GIT_TAG nccl_dependent_refine - PREFIX ${THIRD_PATH} - SOURCE_DIR ${THIRD_PATH}/source/${THIRD_PARTY_NAME} - UPDATE_COMMAND ${FT_UPDATE_COMMAND} # PATCH_COMMAND ${FT_PATCH_COMMAND} - BINARY_DIR ${THIRD_PATH}/build/${THIRD_PARTY_NAME} - INSTALL_COMMAND "" - CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DSM=${SM} -DBUILD_PD=ON -DBUILD_ENCODER=${WITH_ENCODER} -DPY_CMD=${PY_CMD} -DON_INFER=${ON_INFER} -DPADDLE_LIB=${PADDLE_LIB} -DWITH_MKL=${WITH_MKL} -DWITH_STATIC_LIB=${WITH_STATIC_LIB} -DBUILD_GPT=${WITH_PARALLEL} -DWITH_ONNXRUNTIME=${WITH_ONNXRUNTIME} -) -# -DBUILD_GPT=${WITH_GPT} -ExternalProject_Get_property(extern_${THIRD_PARTY_NAME} BINARY_DIR) -ExternalProject_Get_property(extern_${THIRD_PARTY_NAME} SOURCE_DIR) -ExternalProject_Get_property(extern_${THIRD_PARTY_NAME} SOURCE_SUBDIR) - -set(FT_INCLUDE_PATH ${SOURCE_DIR}/${SOURCE_SUBDIR}) -set(FT_LIB_PATH ${BINARY_DIR}/lib) - -include_directories( - ${FT_INCLUDE_PATH} -) - -link_directories( - ${FT_LIB_PATH} -) - -if(ON_INFER AND WITH_GPT AND WITH_SP) - ExternalProject_Add( - extern_sentencepiece - GIT_REPOSITORY https://github.com/google/sentencepiece.git - PREFIX ${THIRD_PATH} - SOURCE_DIR ${THIRD_PATH}/source/sentencepiece/ - BINARY_DIR ${THIRD_PATH}/build/sentencepiece/ - INSTALL_COMMAND "" - ) - - include_directories( - ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/sentencepiece/src/ - ) - - link_directories( - ${CMAKE_BINARY_DIR}/${THIRD_PATH}/build/sentencepiece/src/ - ) - - add_definitions(-DGPT_ON_SENTENCEPIECE) -endif() - -add_subdirectory(fast_transformer) diff --git a/paddlenlp/ops/__init__.py b/paddlenlp/ops/__init__.py index f18e6d0817ca..d98e77ece75f 100644 --- a/paddlenlp/ops/__init__.py +++ b/paddlenlp/ops/__init__.py @@ -18,16 +18,3 @@ from .distributed import * from .einsum import * -# isort: split -from .fast_transformer.transformer.decoding import * - -# isort: split -from .fast_transformer.transformer.decoder import * -from .fast_transformer.transformer.encoder import * -from .fast_transformer.transformer.fast_transformer import * - -paddle.nn.TransformerEncoderLayer._ft_forward = encoder_layer_forward # noqa F405 -paddle.nn.TransformerEncoder._ft_forward = encoder_forward # noqa F405 - -paddle.nn.TransformerEncoderLayer._ori_forward = paddle.nn.TransformerEncoderLayer.forward -paddle.nn.TransformerEncoder._ori_forward = paddle.nn.TransformerEncoder.forward diff --git a/paddlenlp/ops/cmake/FindNCCL.cmake b/paddlenlp/ops/cmake/FindNCCL.cmake deleted file mode 100644 index 7dc1fa9968f4..000000000000 --- a/paddlenlp/ops/cmake/FindNCCL.cmake +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
-# -# From PyTorch: -# -# Copyright (c) 2016- Facebook, Inc (Adam Paszke) -# Copyright (c) 2014- Facebook, Inc (Soumith Chintala) -# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) -# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) -# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) -# Copyright (c) 2011-2013 NYU (Clement Farabet) -# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) -# Copyright (c) 2006 Idiap Research Institute (Samy Bengio) -# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) -# -# From Caffe2: -# -# Copyright (c) 2016-present, Facebook Inc. All rights reserved. -# -# All contributions by Facebook: -# Copyright (c) 2016 Facebook Inc. -# -# All contributions by Google: -# Copyright (c) 2015 Google Inc. -# All rights reserved. -# -# All contributions by Yangqing Jia: -# Copyright (c) 2015 Yangqing Jia -# All rights reserved. -# -# All contributions by Kakao Brain: -# Copyright 2019-2020 Kakao Brain -# -# All contributions from Caffe: -# Copyright(c) 2013, 2014, 2015, the respective contributors -# All rights reserved. -# -# All other contributions: -# Copyright(c) 2015, 2016 the respective contributors -# All rights reserved. -# -# Caffe2 uses a copyright model similar to Caffe: each contributor holds -# copyright over their contributions to Caffe2. The project versioning records -# all such contribution and copyright details. If a contributor wants to further -# mark their specific copyright on a particular contribution, they should -# indicate their copyright solely in the commit message of the change when it is -# committed. -# -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# -# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America -# and IDIAP Research Institute nor the names of its contributors may be -# used to endorse or promote products derived from this software without -# specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. 
-#
-# Find the nccl libraries
-#
-# The following variables are optionally searched for defaults
-# NCCL_ROOT: Base directory where all NCCL components are found
-# NCCL_INCLUDE_DIR: Directory where NCCL header is found
-# NCCL_LIB_DIR: Directory where NCCL library is found
-#
-# The following are set after configuration is done:
-# NCCL_FOUND
-# NCCL_INCLUDE_DIRS
-# NCCL_LIBRARIES
-#
-# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
-# install NCCL in the same location as the CUDA toolkit.
-# See https://github.com/caffe2/caffe2/issues/1601
-
-set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder contains NVIDIA NCCL headers")
-set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder contains NVIDIA NCCL libraries")
-set(NCCL_VERSION $ENV{NCCL_VERSION} CACHE STRING "Version of NCCL to build with")
-
-if ($ENV{NCCL_ROOT_DIR})
-  message(WARNING "NCCL_ROOT_DIR is deprecated. Please set NCCL_ROOT instead.")
-endif()
-list(APPEND NCCL_ROOT $ENV{NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})
-# Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
-list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT})
-
-find_path(NCCL_INCLUDE_DIRS
-  NAMES nccl.h
-  HINTS ${NCCL_INCLUDE_DIR})
-
-if (USE_STATIC_NCCL)
-  MESSAGE(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.")
-  SET(NCCL_LIBNAME "nccl_static")
-  if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
-    set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
-  endif()
-else()
-  SET(NCCL_LIBNAME "nccl")
-  if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
-    set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
-  endif()
-endif()
-
-find_library(NCCL_LIBRARIES
-  NAMES ${NCCL_LIBNAME}
-  HINTS ${NCCL_LIB_DIR})
-
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
-
-if(NCCL_FOUND) # obtaining NCCL version and some sanity checks
-  set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
-  message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...")
-  set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
-  list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS})
-  include(CheckCXXSymbolExists)
-  check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED)
-
-  if (NCCL_VERSION_DEFINED)
-    set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc")
-    file(WRITE ${file} "
-      #include <iostream>
-      #include <nccl.h>
-      int main()
-      {
-        std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH << std::endl;
-        int x;
-        ncclGetVersion(&x);
-        return x == NCCL_VERSION_CODE;
-      }
-")
-    try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
-      RUN_OUTPUT_VARIABLE NCCL_VERSION_FROM_HEADER
-      LINK_LIBRARIES ${NCCL_LIBRARIES})
-    if (NOT NCCL_VERSION_MATCHED)
-      message(FATAL_ERROR "Found NCCL header version and library version do not match!
\ -(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}) Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.") - endif() - message(STATUS "NCCL version: ${NCCL_VERSION_FROM_HEADER}") - else() - # message(STATUS "NCCL version < 2.3.5-5") - endif () - set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES}) - - message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})") - mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES) -endif() diff --git a/paddlenlp/ops/cmake/external/boost.cmake b/paddlenlp/ops/cmake/external/boost.cmake deleted file mode 100644 index 3140c7a48f46..000000000000 --- a/paddlenlp/ops/cmake/external/boost.cmake +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -include(ExternalProject) - -set(BOOST_PROJECT "extern_boost") -# To release PaddlePaddle as a pip package, we have to follow the -# manylinux1 standard, which features as old Linux kernels and -# compilers as possible and recommends CentOS 5. Indeed, the earliest -# CentOS version that works with NVIDIA CUDA is CentOS 6. And a new -# version of boost, say, 1.66.0, doesn't build on CentOS 6. We -# checked that the devtools package of CentOS 6 installs boost 1.41.0. -# So we use 1.41.0 here. -set(BOOST_VER "1.41.0") -set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE) -set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE) - -MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}") - -set(THIRD_PARTY_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source) - -set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost) -set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}") - -set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}" CACHE PATH "boost include directory." FORCE) -set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1) -include_directories(${BOOST_INCLUDE_DIR}) - -ExternalProject_Add( - ${BOOST_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR} - URL ${BOOST_URL} - DOWNLOAD_NO_PROGRESS 1 - PREFIX ${BOOST_SOURCES_DIR} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - UPDATE_COMMAND "" - ) - -ExternalProject_Get_property(${BOOST_PROJECT} SOURCE_DIR) - -if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32) - set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c) - file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") - add_library(boost STATIC ${dummyfile}) -else() - add_library(boost INTERFACE) -endif() - -add_dependencies(boost ${BOOST_PROJECT}) -set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR}) -include_directories(${Boost_INCLUDE_DIR}) diff --git a/paddlenlp/ops/ext_utils.py b/paddlenlp/ops/ext_utils.py deleted file mode 100644 index 5891d78abdaa..000000000000 --- a/paddlenlp/ops/ext_utils.py +++ /dev/null @@ -1,367 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import functools -import hashlib -import os -import subprocess -import sys -import sysconfig -import textwrap -from pathlib import Path - -from filelock import FileLock -from paddle.utils.cpp_extension import load_op_meta_info_and_register_op -from paddle.utils.cpp_extension.cpp_extension import CUDA_HOME -from paddle.utils.cpp_extension.cpp_extension import ( - BuildExtension as PaddleBuildExtension, -) -from paddle.utils.cpp_extension.cpp_extension import CppExtension -from paddle.utils.cpp_extension.extension_utils import ( - _import_module_from_library, - _jit_compile, -) -from setuptools import Extension - -from paddlenlp.utils.env import PPNLP_HOME -from paddlenlp.utils.log import logger - -if CUDA_HOME and not os.path.exists(CUDA_HOME): - # CUDA_HOME is only None for Windows CPU version in paddle `find_cuda_home`. - # Clear it for other non-CUDA situations. - CUDA_HOME = None - -LOADED_EXT = {} - - -def file_lock(lock_file_path): - def _wrapper(func): - @functools.wraps(func) - def _impl(*args, **kwargs): - with FileLock(lock_file_path): - func(*args, **kwargs) - - return _impl - - return _wrapper - - -def _get_files(path): - """ - Helps to list all files under the given path. - """ - if os.path.isfile(path): - return [path] - all_files = [] - for root, _dirs, files in os.walk(path, followlinks=True): - for file in files: - file = os.path.join(root, file) - all_files.append(file) - return all_files - - -# copy form distutils.dep_util to avoid import distutils -def newer_group(sources, target, missing="error"): - """Return true if 'target' is out-of-date with respect to any file - listed in 'sources'. In other words, if 'target' exists and is newer - than every file in 'sources', return false; otherwise return true. - 'missing' controls what we do when a source file is missing; the - default ("error") is to blow up with an OSError from inside 'stat()'; - if it is "ignore", we silently drop any missing source files; if it is - "newer", any missing source files make us assume that 'target' is - out-of-date (this is handy in "dry-run" mode: it'll make you pretend to - carry out commands that wouldn't work because inputs are missing, but - that doesn't matter because you're not actually going to run the - commands). - """ - # If the target doesn't even exist, then it's definitely out-of-date. - if not os.path.exists(target): - return 1 - - # Otherwise we have to find out the hard way: if *any* source file - # is more recent than 'target', then 'target' is out-of-date and - # we can immediately return true. If we fall through to the end - # of the loop, then 'target' is up-to-date and we return false. 
- from stat import ST_MTIME - - target_mtime = os.stat(target)[ST_MTIME] - for source in sources: - if not os.path.exists(source): - if missing == "error": # blow up when we stat() the file - pass - elif missing == "ignore": # missing source dropped from - continue # target's dependency list - elif missing == "newer": # missing source means target is - return 1 # out-of-date - - source_mtime = os.stat(source)[ST_MTIME] - if source_mtime > target_mtime: - return 1 - else: - return 0 - - -class CMakeExtension(Extension): - def __init__(self, name, source_dir=None): - # A CMakeExtension needs a source_dir instead of a file list. - Extension.__init__(self, name, sources=[]) - if source_dir is None: - self.source_dir = str(Path(__file__).parent.resolve()) - else: - self.source_dir = os.path.abspath(os.path.expanduser(source_dir)) - self.sources = _get_files(self.source_dir) - - def build_with_command(self, ext_builder): - """ - Custom `build_ext.build_extension` in `Extension` instead of `Command`. - `ext_builder` is the instance of `build_ext` command. - """ - # refer to https://github.com/pybind/cmake_example/blob/master/setup.py - if ext_builder.compiler.compiler_type == "msvc": - raise NotImplementedError - cmake_args = getattr(self, "cmake_args", []) + [ - "-DCMAKE_BUILD_TYPE={}".format("Debug" if ext_builder.debug else "Release"), - "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={}".format(ext_builder.build_lib), - ] - build_args = [] - - # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level - # across all generators. - if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ: - # self.parallel is a Python 3 only way to set parallel jobs by hand - # using -j in the build_ext call, not supported by pip or PyPA-build. - if hasattr(ext_builder, "parallel") and ext_builder.parallel: - # CMake 3.12+ only. - build_args += ["-j{}".format(ext_builder.parallel)] - - build_args += ["-j14"] - - if not os.path.exists(ext_builder.build_temp): - os.makedirs(ext_builder.build_temp) - - # Redirect stdout/stderr to mute, especially when allowing errors - stdout = getattr(self, "_std_out_handle", None) - subprocess.check_call( - ["cmake", self.source_dir] + cmake_args, cwd=ext_builder.build_temp, stdout=stdout, stderr=stdout - ) - subprocess.check_call( - ["cmake", "--build", "."] + build_args, cwd=ext_builder.build_temp, stdout=stdout, stderr=stdout - ) - - def get_target_filename(self): - """ - The file names of libraries. Currently only support one library for - one extension. - """ - raise NotImplementedError - - def get_output_filename(self): - """ - The file names of outputs, which mostly is the same with - `get_target_filename`. - """ - return self.get_target_filename() - - -class FasterTransformerExtension(CMakeExtension): - def __init__(self, name, source_dir=None, need_parallel=False): - super(FasterTransformerExtension, self).__init__(name, source_dir) - self.sources = _get_files(os.path.join(self.source_dir, "fast_transformer", "src")) + _get_files( - os.path.join(self.source_dir, "patches", "FasterTransformer") - ) - self._std_out_handle = None - # Env variable may not work as expected, since jit compile by `load` - # would not re-built if source code is not update. - # self.sm = os.environ.get("PPNLP_GENERATE_CODE", None) - # Whether or not to use model parallel. Note that since the building use - # a new process, we shoud find a way to let it know whether to use model - # parallel. 
- self.need_parallel = need_parallel - - def build_with_command(self, ext_builder): - if CUDA_HOME is None: # GPU only - # TODO(guosheng): should we touch a dummy file or add a quick exit - # method to avoid meaningless process in `load` - logger.warning("FastGeneration is not available because CUDA can not be found.") - raise NotImplementedError - # TODO(guosheng): Multiple -std seems be passed in FastGeneration, - # which is not allowed by NVCC. Fix it later. - self.cmake_args = [f"-DPY_CMD={sys.executable}"] - # `GetCUDAComputeCapability` is not exposed yet, and detect CUDA/GPU - # version in cmake file. - # self.cmake_args += [f"-DSM={self.sm}"] if self.sm is not None else [] - self.cmake_args += "-DWITH_GPT=ON -DON_INFER=OFF -DWITH_MKL=ON -DWITH_ONNXRUNTIME=ON".split() - - self.cmake_args += ["-DCMAKE_C_COMPILER={}".format(os.popen("which gcc").read().replace("\n", ""))] - self.cmake_args += ["-DCMAKE_CXX_COMPILER={}".format(os.popen("which g++").read().replace("\n", ""))] - - self.cmake_args += ["-DPYTHON_LIBRARY={}".format(sysconfig.get_config_var("LIBDIR"))] - self.cmake_args += ["-DPYTHON_INCLUDE_DIR={}".format(sysconfig.get_config_var("INCLUDEPY"))] - - if self.need_parallel: - self.cmake_args += ["-DWITH_PARALLEL=ON"] - - try: - super(FasterTransformerExtension, self).build_with_command(ext_builder) - # FastGeneration cmake file resets `CMAKE_LIBRARY_OUTPUT_DIRECTORY` - # to `CMAKE_BINARY_DIR/lib`, thus copy the lib back to `build_ext.build_lib`. - # Maybe move this copy to CMakeList. - # `copy_tree` or `copy_file`, boost lib might be included - ext_builder.copy_tree(os.path.join(ext_builder.build_temp, "lib"), ext_builder.build_lib) - # TODO(guosheng): Maybe we should delete the build dir especially - # when it is in the dir of paddlenlp package. - # os.remove(ext_builder.build_temp) - except Exception as e: - logger.warning("FastGeneration is not available due to build errors.") - raise e - - def get_target_filename(self): - # CMake file has fixed the name of lib, maybe we can copy it as the name - # returned by `BuildExtension.get_ext_filename` after build. - return "libdecoding_op.so" - - def get_output_filename(self): - return "libdecoding_op.so" - - -class BuildExtension(PaddleBuildExtension): - """ - Support both `CppExtention` of Paddle and custom extensions of PaddleNLP. - """ - - def build_extensions(self): - custom_exts = [] # for - no_custom_exts = [] # for normal extentions paddle.utils.cpp_extension - for ext in self.extensions: - if hasattr(ext, "build_with_command"): - # custom build in Extension - ext.build_with_command(self) - custom_exts.append(ext) - else: - no_custom_exts.append(ext) - if no_custom_exts: - # Build CppExtentio/CUDAExtension with `PaddleBuildExtension` - self.extensions = no_custom_exts - super(BuildExtension, self).build_extensions() - self.extensions = custom_exts + no_custom_exts - - -EXTENSIONS = { - "FastGeneration": FasterTransformerExtension, - # NOTE: Since model parallel code is supported by definitions, to avoid - # performance degrading on non-parallel mode, we use a separated lib for - # model parallel. - "FasterTransformerParallel": FasterTransformerExtension, -} - - -def get_extension_maker(name): - # Use `paddle.utils.cpp_extension.CppExtension` as the default - # TODO(guosheng): Maybe register extension classes into `Extensions`. - return EXTENSIONS.get(name, CppExtension) - - -def _write_setup_file(name, file_path, build_dir, **kwargs): - """ - Automatically generate setup.py and write it into build directory. 
- `kwargws` is arguments for the corresponding Extension initialization. - Any type extension can be jit build. - """ - template = textwrap.dedent( - """ - from setuptools import setup - from paddlenlp.ops.ext_utils import get_extension_maker, BuildExtension - - setup( - name='{name}', - ext_modules=[ - get_extension_maker('{name}')( - name='{name}', - {kwargs_str})], - cmdclass={{'build_ext' : BuildExtension.with_options( - output_dir=r'{build_dir}') - }})""" - ).lstrip() - kwargs_str = "" - for key, value in kwargs.items(): - kwargs_str += key + "=" + (f"'{value}'" if isinstance(value, str) else str(value)) + "," - content = template.format(name=name, kwargs_str=kwargs_str, build_dir=build_dir) - - with open(file_path, "w") as f: - f.write(content) - - -@file_lock(os.path.join(PPNLP_HOME, "load_ext.lock")) -def load(name, build_dir=None, force=False, verbose=False, **kwargs): - # TODO(guosheng): Need better way to resolve unsupported such as CPU. Currently, - # raise NotImplementedError and skip `_jit_compile`. Otherwise, `_jit_compile` - # will output the error to stdout (when verbose is True) and raise `RuntimeError`, - # which is not friendly for users though no other bad effect. - if CUDA_HOME is None: - logger.warning("%s is not available because CUDA can not be found." % name) - raise NotImplementedError - if name in LOADED_EXT.keys(): - # TODO(guosheng): Maybe the key should combined with kwargs since the - # extension object is created using them. - return LOADED_EXT[name] - if build_dir is None: - # build_dir = os.path.join(PPNLP_HOME, 'extenstions') - # Maybe under package dir is better to avoid cmake source path conflict - # with different source path, like this: - # build_dir = os.path.join( - # str(Path(__file__).parent.resolve()), 'extenstions') - # However if it is under the package dir, it might make the package hard - # to uninstall. Thus we put it in PPNLP_HOME with digest of current path, - # like this: - build_dir = os.path.join( - PPNLP_HOME, "extensions", hashlib.md5(str(Path(__file__).parent.resolve()).encode("utf-8")).hexdigest() - ) - build_base_dir = os.path.abspath(os.path.expanduser(os.path.join(build_dir, name))) - if not os.path.exists(build_base_dir): - os.makedirs(build_base_dir) - - extension = get_extension_maker(name)(name, **kwargs) - # Check if 'target' is out-of-date with respect to any file to avoid rebuild - if isinstance(extension, CMakeExtension): - # `CppExtention/CUDAExtension `has version manager by `PaddleBuildExtension` - # Maybe move this to CMakeExtension later. - # TODO(guosheng): flags/args changes may also trigger build, and maybe - # need version manager like `PaddleBuildExtension`. 
- out_filename = extension.get_output_filename() - if isinstance(out_filename, str): - out_filename = [out_filename] - out_filepath = [os.path.join(build_base_dir, f) for f in out_filename] - lib_filename = extension.get_target_filename() - lib_filepath = os.path.join(build_base_dir, lib_filename) - if not force: - ext_sources = extension.sources - if all(os.path.exists(f) and not newer_group(ext_sources, f, "newer") for f in out_filepath): - logger.debug("skipping '%s' extension (up-to-date) build" % name) - ops = load_op_meta_info_and_register_op(lib_filepath) - LOADED_EXT[name] = ops - return LOADED_EXT[name] - - # write setup file and jit compile - file_path = os.path.join(build_dir, name, "{}_setup.py".format(name)) - _write_setup_file(name, file_path, build_base_dir, **kwargs) - _jit_compile(file_path, verbose) - if isinstance(extension, CMakeExtension): - # Load a shared library (if exists) only to register op. - if os.path.exists(lib_filepath): - ops = load_op_meta_info_and_register_op(lib_filepath) - LOADED_EXT[name] = ops - return LOADED_EXT[name] - else: - # Import as callable python api - return _import_module_from_library(name, build_base_dir, verbose) diff --git a/paddlenlp/ops/fast_transformer/CMakeLists.txt b/paddlenlp/ops/fast_transformer/CMakeLists.txt deleted file mode 100644 index be58f747a2fb..000000000000 --- a/paddlenlp/ops/fast_transformer/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -add_subdirectory(src) diff --git a/paddlenlp/ops/fast_transformer/__init__.py b/paddlenlp/ops/fast_transformer/__init__.py deleted file mode 100644 index 185a92b8d94d..000000000000 --- a/paddlenlp/ops/fast_transformer/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/paddlenlp/ops/fast_transformer/sample/bart_decoding_sample.py b/paddlenlp/ops/fast_transformer/sample/bart_decoding_sample.py deleted file mode 100644 index 6e3edff7c33f..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/bart_decoding_sample.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-import time
-from pprint import pprint
-
-import paddle
-
-from paddlenlp.transformers import BartForConditionalGeneration, BartTokenizer
-from paddlenlp.utils.log import logger
-
-
-def postprocess_seq(seq, bos_idx, eos_idx, output_bos=False, output_eos=False):
-    """
-    Post-process the decoded sequence.
-    """
-    eos_pos = len(seq) - 1
-    for i, idx in enumerate(seq):
-        if idx == eos_idx:
-            eos_pos = i
-            break
-    seq = [idx for idx in seq[: eos_pos + 1] if (output_bos or idx != bos_idx) and (output_eos or idx != eos_idx)]
-    return seq
-
-
-def prepare_input(tokenizer, sentences):
-    tokenized = tokenizer(sentences, padding=True)
-    input_ids = paddle.to_tensor(tokenized["input_ids"], dtype="int64")
-    return input_ids
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--model_name_or_path",
-        default="bart-base",
-        type=str,
-        help="The model name to specify the bart to use. Can be one of ['bart-base', 'bart-large',]. ",
-    )
-    parser.add_argument(
-        "--decoding_strategy",
-        default="beam_search",
-        type=str,
-        help="The decoding strategy. Can be one of [greedy_search, beam_search, sampling]",
-    )
-    parser.add_argument("--beam_size", default=5, type=int, help="The parameters for beam search. ")
-    parser.add_argument("--top_k", default=4, type=int, help="The number of candidate to procedure beam search. ")
-    parser.add_argument(
-        "--top_p", default=1.0, type=float, help="The probability threshold to procedure topp sampling. "
-    )
-    parser.add_argument("--max_length", default=20, type=int, help="Maximum output length. ")
-    parser.add_argument("--diversity_rate", default=0.0, type=float, help="The diversity of beam search. ")
-    parser.add_argument(
-        "--length_penalty", default=0.6, type=float, help="The power number in length penalty calculation"
-    )
-    parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ")
-    args = parser.parse_args()
-    return args
-
-
-def do_predict(args):
-    place = "gpu"
-    paddle.set_device(place)
-
-    tokenizer = BartTokenizer.from_pretrained(args.model_name_or_path)
-    logger.info("Loading the model parameters, please wait...")
-    model = BartForConditionalGeneration.from_pretrained(args.model_name_or_path)
-
-    # Set evaluate mode
-    model.eval()
-    sentences = [
-        "I love that girl, but <mask> does not <mask> me.",
-        "She is so <mask> that I can not help glance at <mask>.",
-        "Nothing's gonna <mask> my love for you.",
-        "Drop everything now. Meet me in the pouring <mask>. Kiss me on the sidewalk.",
-    ]
-
-    bos_id = model.bart.config["bos_token_id"]
-    eos_id = model.bart.config["eos_token_id"]
-    input_ids = prepare_input(tokenizer, sentences)
-    # Define model
-    fast_bart = model
-
-    # Set evaluate mode
-    fast_bart.eval()
-
-    with paddle.no_grad():
-        for i in range(100):
-            # For warmup.
- if 50 == i: - # PaddlePaddle >= 2.2 - paddle.device.cuda.synchronize() - start = time.perf_counter() - finished_seq, _ = fast_bart.generate( - input_ids=input_ids, - max_length=args.max_length, - decode_strategy=args.decoding_strategy, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.beam_size, - diversity_rate=args.diversity_rate, - length_penalty=args.length_penalty, - use_fp16_decoding=args.use_fp16_decoding, - use_fast=True, - ) - - paddle.device.cuda.synchronize() - logger.info("Average test time for decoding is %f ms" % ((time.perf_counter() - start) / 50 * 1000)) - - # Output - finished_seq = finished_seq.numpy() - for ins in finished_seq: - generated_ids = postprocess_seq(ins, bos_id, eos_id) - print(tokenizer.convert_ids_to_string(generated_ids)) - - -if __name__ == "__main__": - args = parse_args() - pprint(args) - - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/bart_export_model_sample.py b/paddlenlp/ops/fast_transformer/sample/bart_export_model_sample.py deleted file mode 100644 index e72e5c3ae91a..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/bart_export_model_sample.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -from pprint import pprint - -import paddle - -from paddlenlp.ops import FasterBART -from paddlenlp.transformers import BartForConditionalGeneration, BartTokenizer -from paddlenlp.utils.log import logger - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", default="bart-base", type=str, help="The model name to specify the bart to use. " - ) - parser.add_argument( - "--inference_model_dir", default="./infer_model/", type=str, help="Path to save inference model of bart. " - ) - parser.add_argument("--topk", default=4, type=int, help="The number of candidate to procedure top_k sampling. ") - parser.add_argument( - "--topp", default=1.0, type=float, help="The probability threshold to procedure top_p sampling. " - ) - parser.add_argument("--max_out_len", default=20, type=int, help="Maximum output length. ") - parser.add_argument("--temperature", default=1.0, type=float, help="The temperature to set. ") - parser.add_argument("--num_return_sequences", default=1, type=int, help="The number of returned sequences. ") - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ") - parser.add_argument( - "--decoding_strategy", - default="beam_search", - choices=["sampling", "beam_search"], - type=str, - help="The main strategy to decode. ", - ) - parser.add_argument("--num_beams", default=5, type=int, help="The number of candidate to procedure beam search. ") - parser.add_argument( - "--diversity_rate", default=0.0, type=float, help="The diversity rate to procedure beam search. " - ) - parser.add_argument("--repetition_penalty", default=1.0, type=float, help="The repetition_penalty to set. 
") - parser.add_argument("--length_penalty", default=0.0, type=float, help="The length penalty to decode. ") - parser.add_argument("--early_stopping", action="store_true", help="Whether to do early stopping. ") - - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - model = BartForConditionalGeneration.from_pretrained(args.model_name_or_path) - tokenizer = BartTokenizer.from_pretrained(args.model_name_or_path) - - # For opening faster_encoder - model.eval() - - fast_bart = FasterBART(model=model, use_fp16_decoding=args.use_fp16_decoding) - # Set evaluate mode - fast_bart.eval() - - # Convert dygraph model to static graph model - fast_bart = paddle.jit.to_static( - fast_bart, - input_spec=[ - # input_ids - paddle.static.InputSpec(shape=[None, None], dtype="int32"), - # encoder_output - None, - # seq_len - None, - args.num_beams, # num_beams. - args.topk, - args.topp, - args.decoding_strategy, - tokenizer.bos_token_id, # bos - tokenizer.eos_token_id, # eos - tokenizer.pad_token_id, # pad - tokenizer.eos_token_id, # decoder_start_token_id - args.max_out_len, # max_length - args.diversity_rate, # diversity_rate - args.length_penalty, # length_penalty - args.num_return_sequences, - args.early_stopping, - tokenizer.eos_token_id, # forced_eos_token_id - ], - ) - - # Save converted static graph model - paddle.jit.save(fast_bart, os.path.join(args.inference_model_dir, "bart")) - logger.info("BART has been saved to {}.".format(args.inference_model_dir)) - - -if __name__ == "__main__": - args = parse_args() - pprint(args) - - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/bart_inference.py b/paddlenlp/ops/fast_transformer/sample/bart_inference.py deleted file mode 100644 index 3ba744a0efde..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/bart_inference.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -from pprint import pprint - -import numpy as np -import paddle.inference as paddle_infer - -from paddlenlp.ops.ext_utils import load -from paddlenlp.transformers import BartTokenizer - - -def setup_args(): - """Setup arguments.""" - parser = argparse.ArgumentParser() - parser.add_argument( - "--inference_model_dir", default="./infer_model/", type=str, help="Path to save inference model of BART. " - ) - - args = parser.parse_args() - - return args - - -def prepare_input(tokenizer, sentences): - tokenized = tokenizer(sentences, padding=True) - input_ids = np.asarray(tokenized["input_ids"], dtype="int32") - return input_ids - - -def postprocess_seq(seq, bos_idx, eos_idx, output_bos=False, output_eos=False): - """ - Post-process the decoded sequence. 
- """ - eos_pos = len(seq) - 1 - for i, idx in enumerate(seq): - if idx == eos_idx: - eos_pos = i - break - seq = [idx for idx in seq[: eos_pos + 1] if (output_bos or idx != bos_idx) and (output_eos or idx != eos_idx)] - return seq - - -def infer(args): - model_name = "bart-base" - tokenizer = BartTokenizer.from_pretrained(model_name) - - sentences = [ - "I love that girl, but does not me.", - "She is so that I can not help glance at .", - "Nothing's gonna my love for you.", - "Drop everything now. Meet me in the pouring . Kiss me on the sidewalk.", - ] - - input_ids = prepare_input(tokenizer, sentences) - - # Load FastGeneration lib. - load("FastGeneration", verbose=True) - - config = paddle_infer.Config( - os.path.join(args.inference_model_dir, "bart.pdmodel"), - os.path.join(args.inference_model_dir, "bart.pdiparams"), - ) - - config.enable_use_gpu(100, 0) - config.disable_glog_info() - # `embedding_eltwise_layernorm_fuse_pass` failed - config.delete_pass("embedding_eltwise_layernorm_fuse_pass") - predictor = paddle_infer.create_predictor(config) - - input_names = predictor.get_input_names() - input_handle = predictor.get_input_handle(input_names[0]) - input_handle.copy_from_cpu(input_ids.astype("int32")) - - predictor.run() - - output_names = predictor.get_output_names() - output_handle = predictor.get_output_handle(output_names[0]) - output_data = output_handle.copy_to_cpu() - - for idx, sample in enumerate(output_data.transpose([1, 2, 0]).tolist()): - for beam_idx, beam in enumerate(sample): - if beam_idx >= len(sample) / 2: - break - generated_ids = postprocess_seq(beam, tokenizer.bos_token_id, tokenizer.eos_token_id) - seq = tokenizer.convert_ids_to_string(generated_ids) - print(f"{idx}-{beam_idx}: {seq}") - - -if __name__ == "__main__": - args = setup_args() - pprint(args) - - infer(args) diff --git a/paddlenlp/ops/fast_transformer/sample/config/decoder.sample.yaml b/paddlenlp/ops/fast_transformer/sample/config/decoder.sample.yaml deleted file mode 100644 index 7a9eee1bfc60..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/config/decoder.sample.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# Batch size during inference. -infer_batch_size: 8 -max_out_len: 32 - -# Hyparams for model: -# These following five vocabularies related configurations will be set -# automatically according to the passed vocabulary path and special tokens. -# Size of source word dictionary. -src_vocab_size: 38512 -# Size of target word dictionay -trg_vocab_size: 38512 -# Index for token -bos_idx: 0 -# Index for token -eos_idx: 1 -# Index for token -unk_idx: 2 -# Max length of sequences deciding the size of position encoding table. -max_length: 32 -# The dimension for word embeddings, which is also the last dimension of -# the input and output of multi-head attention, position-wise feed-forward -# networks, encoder and decoder. -d_model: 512 -# Size of the hidden layer in position-wise feed-forward networks. -d_inner_hid: 2048 -# Number of head used in multi-head attention. -n_head: 8 -# Number of sub-layers to be stacked in the encoder. -num_encoder_layers: 6 -# Number of sub-layers to be stacked in the decoder. -num_decoder_layers: 6 -# Dropout rates. -dropout: 0.1 -# The flag indicating whether to share embedding and softmax weights. -# Vocabularies in source and target should be same for weight sharing. 
-weight_sharing: True
-
-# Path of trained parameter, to make prediction
-init_from_params: base_trained_models/step_final/
diff --git a/paddlenlp/ops/fast_transformer/sample/config/decoding.sample.yaml b/paddlenlp/ops/fast_transformer/sample/config/decoding.sample.yaml
deleted file mode 100644
index b0ac5ba2e774..000000000000
--- a/paddlenlp/ops/fast_transformer/sample/config/decoding.sample.yaml
+++ /dev/null
@@ -1,44 +0,0 @@
-# Batch size during inference.
-infer_batch_size: 32
-# Hyparams for generation:
-decoding_strategy: "beam_search"
-# The parameters for beam search.
-beam_size: 4
-# The parameters for topk sampling.
-topk: 4
-# The parameters for topp sampling.
-topp: 0.0
-max_out_len: 32
-# The number of decoded sentences to output.
-n_best: 1
-
-# Hyparams for model:
-# These following five vocabularies related configurations will be set
-# automatically according to the passed vocabulary path and special tokens.
-# Size of source word dictionary.
-src_vocab_size: 30000
-# Size of target word dictionay
-trg_vocab_size: 30000
-# Index for <bos> token
-bos_idx: 0
-# Index for <eos> token
-eos_idx: 1
-# Index for <unk> token
-unk_idx: 2
-# Max length of sequences deciding the size of position encoding table.
-max_length: 32
-# The dimension for word embeddings, which is also the last dimension of
-# the input and output of multi-head attention, position-wise feed-forward
-# networks, encoder and decoder.
-d_model: 512
-# Size of the hidden layer in position-wise feed-forward networks.
-d_inner_hid: 2048
-# Number of head used in multi-head attention.
-n_head: 8
-# Number of sub-layers to be stacked in the encoder and decoder.
-n_layer: 6
-# Dropout rates.
-dropout: 0.1
-# The flag indicating whether to share embedding and softmax weights.
-# Vocabularies in source and target should be same for weight sharing.
-weight_sharing: True
diff --git a/paddlenlp/ops/fast_transformer/sample/decoder_sample.py b/paddlenlp/ops/fast_transformer/sample/decoder_sample.py
deleted file mode 100644
index 6927aa41ad60..000000000000
--- a/paddlenlp/ops/fast_transformer/sample/decoder_sample.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import os
-import time
-from pprint import pprint
-
-import paddle
-import yaml
-from attrdict import AttrDict
-
-from paddlenlp.ops import FasterDecoder
-from paddlenlp.utils.log import logger
-
-
-def get_op_cache_config(use_batch_major_op_cache, size_per_head, is_fp16):
-    x = 8 if is_fp16 else 4
-    use_batch_major_op_cache = True if use_batch_major_op_cache is True and size_per_head % x == 0 else False
-    x = x if use_batch_major_op_cache else 1
-    return use_batch_major_op_cache, x
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--config", default="./config/decoder.sample.yaml", type=str, help="Path of the config file. 
") - parser.add_argument( - "--decoder_lib", default="../../build/lib/libdecoding_op.so", type=str, help="Path of libdecoding_op.so. " - ) - parser.add_argument("--use_fp16_decoder", action="store_true", help="Whether to use fp16 decoder to predict. ") - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - paddle.set_device(place) - - use_batch_major_op_cache = True - size_per_head = args.d_model // args.n_head - use_batch_major_op_cache, x = get_op_cache_config(use_batch_major_op_cache, size_per_head, args.use_fp16_decoder) - print(f"use_batch_major_op_cache={use_batch_major_op_cache}, x={x}") - # Define model - transformer = FasterDecoder( - src_vocab_size=args.src_vocab_size, - trg_vocab_size=args.trg_vocab_size, - max_length=args.max_length + 1, - num_encoder_layers=args.num_encoder_layers, - num_decoder_layers=args.num_decoder_layers, - n_head=args.n_head, - d_model=args.d_model, - d_inner_hid=args.d_inner_hid, - dropout=args.dropout, - weight_sharing=args.weight_sharing, - bos_id=args.bos_idx, - eos_id=args.eos_idx, - max_out_len=args.max_out_len, - decoder_lib=args.decoder_lib, - use_fp16_decoder=args.use_fp16_decoder, - use_batch_major_op_cache=use_batch_major_op_cache, - ) - - # Load checkpoint. - transformer.load(os.path.join(args.init_from_params, "transformer.pdparams")) - # Set evaluate mode - transformer.eval() - - # Generate data randomly - dec_input = paddle.randn(shape=[args.infer_batch_size, 1, args.d_model], dtype="float32") - enc_output = paddle.randn(shape=[args.infer_batch_size, args.max_length, args.d_model], dtype="float32") - mem_seq_lens = paddle.full(shape=[args.infer_batch_size, 1], fill_value=args.max_length, dtype="int32") - dtype = "float32" - if args.use_fp16_decoder: - dtype = "float16" - dec_input = paddle.cast(dec_input, dtype=dtype) - enc_output = paddle.cast(enc_output, dtype=dtype) - if not use_batch_major_op_cache: - self_cache_key = paddle.zeros( - shape=[args.num_decoder_layers, 0, args.infer_batch_size, args.d_model], dtype=dtype - ) - self_cache_value = paddle.zeros( - shape=[args.num_decoder_layers, 0, args.infer_batch_size, args.d_model], dtype=dtype - ) - else: - self_cache_key = paddle.zeros( - shape=[ - args.num_decoder_layers, - args.infer_batch_size, - args.n_head, - size_per_head // x, - args.max_out_len, - x, - ], - dtype=dtype, - ) - self_cache_value = paddle.zeros( - shape=[args.num_decoder_layers, args.infer_batch_size, args.n_head, args.max_out_len, size_per_head], - dtype=dtype, - ) - mem_cache = paddle.zeros( - shape=[args.num_decoder_layers, 2, args.infer_batch_size, args.max_length, args.d_model], dtype=dtype - ) - - with paddle.no_grad(): - for i in range(100): - # For warmup. 
- if 50 == i: - start = time.time() - paddle.device.cuda.synchronize() - dec_output, self_cache_key, self_cache_value, mem_cache = transformer.decoder( - from_tensor=dec_input, - memory_tensor=enc_output, - mem_seq_len=mem_seq_lens, - self_cache_key=self_cache_key, - self_cache_value=self_cache_value, - mem_cache=mem_cache, - step=0, - memory_hidden_dim=args.d_model, - is_fuse_qkv=False, - ) - paddle.device.cuda.synchronize() - logger.info("Average test time for decoder is %f ms" % ((time.time() - start) / 50 * 1000)) - - -if __name__ == "__main__": - ARGS = parse_args() - yaml_file = ARGS.config - with open(yaml_file, "rt") as f: - args = AttrDict(yaml.safe_load(f)) - args.decoder_lib = ARGS.decoder_lib - args.use_fp16_decoder = ARGS.use_fp16_decoder - pprint(args) - - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/decoding_sample.py b/paddlenlp/ops/fast_transformer/sample/decoding_sample.py deleted file mode 100644 index 5ea3f94f1138..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/decoding_sample.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -from pprint import pprint - -import numpy as np -import paddle -import yaml -from attrdict import AttrDict - -from paddlenlp.ops import FasterTransformer -from paddlenlp.utils.log import logger - -paddle.seed(2) -np.random.seed(2) - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--config", - default="./fast_transformer/sample/config/decoding.sample.yaml", - type=str, - help="Path of the config file. ", - ) - parser.add_argument( - "--decoding_lib", default="./build/lib/libdecoding_op.so", type=str, help="Path of libdecoding_op.so. " - ) - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. 
") - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - # Define model - transformer = FasterTransformer( - src_vocab_size=args.src_vocab_size, - trg_vocab_size=args.trg_vocab_size, - max_length=args.max_length + 1, - num_encoder_layers=args.n_layer, - num_decoder_layers=args.n_layer, - n_head=args.n_head, - d_model=args.d_model, - d_inner_hid=args.d_inner_hid, - dropout=args.dropout, - weight_sharing=args.weight_sharing, - bos_id=args.bos_idx, - eos_id=args.eos_idx, - decoding_strategy=args.decoding_strategy, - beam_size=args.beam_size, - topk=args.topk, - topp=args.topp, - max_out_len=args.max_out_len, - decoding_lib=args.decoding_lib, - use_fp16_decoding=args.use_fp16_decoding, - ) - - # Set evaluate mode - transformer.eval() - - enc_output = paddle.randn([args.infer_batch_size, args.max_length, args.d_model]) - if args.use_fp16_decoding: - enc_output = paddle.cast(enc_output, "float16") - mem_seq_len = paddle.randint(1, args.max_length + 1, shape=[args.infer_batch_size], dtype="int32") - with paddle.no_grad(): - for i in range(100): - # For warmup. - if 50 == i: - start = time.time() - transformer.decoding(enc_output=enc_output, memory_seq_lens=mem_seq_len) - logger.info("Average test time for decoding is %f ms" % ((time.time() - start) / 50 * 1000)) - - -if __name__ == "__main__": - ARGS = parse_args() - yaml_file = ARGS.config - with open(yaml_file, "rt") as f: - args = AttrDict(yaml.safe_load(f)) - pprint(args) - args.decoding_lib = ARGS.decoding_lib - args.use_fp16_decoding = ARGS.use_fp16_decoding - - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/encoder_decoder_sample.py b/paddlenlp/ops/fast_transformer/sample/encoder_decoder_sample.py deleted file mode 100644 index eba7667d30ca..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/encoder_decoder_sample.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os -import time -from pprint import pprint - -import paddle -import yaml -from attrdict import AttrDict - -from paddlenlp.ops import FasterDecoder -from paddlenlp.utils.log import logger - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--config", default="./config/decoder.sample.yaml", type=str, help="Path of the config file. ") - parser.add_argument( - "--decoder_lib", default="../../build/lib/libdecoder_op.so", type=str, help="Path of libdecoder_op.so. " - ) - parser.add_argument("--use_fp16_decoder", action="store_true", help="Whether to use fp16 decoder to predict. 
") - args = parser.parse_args() - return args - - -def get_op_cache_config(use_batch_major_op_cache, size_per_head, is_fp16): - x = 8 if is_fp16 else 4 - use_batch_major_op_cache = True if use_batch_major_op_cache is True and size_per_head % x == 0 else False - x = x if use_batch_major_op_cache else 1 - return use_batch_major_op_cache, x - - -def do_predict(args): - place = "gpu" - paddle.set_device(place) - - use_batch_major_op_cache = True - size_per_head = args.d_model // args.n_head - use_batch_major_op_cache, x = get_op_cache_config(use_batch_major_op_cache, size_per_head, args.use_fp16_decoder) - - # Define model - transformer = FasterDecoder( - src_vocab_size=args.src_vocab_size, - trg_vocab_size=args.trg_vocab_size, - max_length=args.max_length + 1, - num_encoder_layers=args.num_encoder_layers, - num_decoder_layers=args.num_decoder_layers, - n_head=args.n_head, - d_model=args.d_model, - d_inner_hid=args.d_inner_hid, - dropout=args.dropout, - weight_sharing=args.weight_sharing, - bos_id=args.bos_idx, - eos_id=args.eos_idx, - max_out_len=args.max_out_len, - decoder_lib=args.decoder_lib, - use_fp16_decoder=args.use_fp16_decoder, - use_batch_major_op_cache=use_batch_major_op_cache, - ) - - # Load checkpoint. - transformer.load(os.path.join(args.init_from_params, "transformer.pdparams")) - # Set evaluate mode - transformer.eval() - - # Generate src_word randomly - src_word = paddle.randint(0, args.src_vocab_size, shape=[args.infer_batch_size, args.max_length], dtype="int64") - - with paddle.no_grad(): - for i in range(100): - # For warmup. - if 50 == i: - start = time.time() - paddle.device.cuda.synchronize() - finished_seq, finished_scores = transformer(src_word=src_word) - paddle.device.cuda.synchronize() - logger.info("Average test time for decoder is %f ms" % ((time.time() - start) / 50 * 1000)) - - -if __name__ == "__main__": - ARGS = parse_args() - yaml_file = ARGS.config - with open(yaml_file, "rt") as f: - args = AttrDict(yaml.safe_load(f)) - args.decoder_lib = ARGS.decoder_lib - args.use_fp16_decoder = ARGS.use_fp16_decoder - pprint(args) - - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/encoder_decoding_sample.py b/paddlenlp/ops/fast_transformer/sample/encoder_decoding_sample.py deleted file mode 100644 index 2c4a092465a2..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/encoder_decoding_sample.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -from pprint import pprint - -import numpy as np -import paddle -import yaml -from attrdict import AttrDict - -from paddlenlp.data import Pad -from paddlenlp.ops import FasterTransformer, enable_fast_encoder -from paddlenlp.utils.log import logger - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--config", - default="./fast_transformer/sample/config/decoding.sample.yaml", - type=str, - help="Path of the config file. 
", - ) - parser.add_argument( - "--decoding_lib", default="./build/lib/libdecoding_op.so", type=str, help="Path of libdecoding_op.so. " - ) - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ") - parser.add_argument( - "--enable_fast_encoder", - action="store_true", - help="Whether to use fast version encoder to predict. This is experimental option for now. ", - ) - parser.add_argument("--use_fp16_encoder", action="store_true", help="Whether to use fp16 encoder to predict. ") - args = parser.parse_args() - return args - - -def generate_src_word(batch_size, vocab_size, max_length, eos_idx, pad_idx): - memory_sequence_length = np.random.randint(low=1, high=max_length, size=batch_size).astype(np.int32) - data = [] - for i in range(batch_size): - data.append(np.random.randint(low=3, high=vocab_size, size=memory_sequence_length[i], dtype=np.int64)) - - word_pad = Pad(pad_idx) - src_word = word_pad([list(word) + [eos_idx] for word in data]) - - return paddle.to_tensor(src_word, dtype="int64") - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - # Define model - transformer = FasterTransformer( - src_vocab_size=args.src_vocab_size, - trg_vocab_size=args.trg_vocab_size, - max_length=args.max_length + 1, - num_encoder_layers=args.n_layer, - num_decoder_layers=args.n_layer, - n_head=args.n_head, - d_model=args.d_model, - d_inner_hid=args.d_inner_hid, - dropout=args.dropout, - weight_sharing=args.weight_sharing, - bos_id=args.bos_idx, - eos_id=args.eos_idx, - decoding_strategy=args.decoding_strategy, - beam_size=args.beam_size, - topk=args.topk, - topp=args.topp, - max_out_len=args.max_out_len, - decoding_lib=args.decoding_lib, - use_fp16_decoding=args.use_fp16_decoding, - enable_fast_encoder=args.enable_fast_encoder, - use_fp16_encoder=args.use_fp16_encoder, - ) - - # Set evaluate mode - transformer.eval() - - if args.enable_fast_encoder: - transformer = enable_fast_encoder(transformer, use_fp16=args.use_fp16_encoder) - - src_word = generate_src_word( - batch_size=args.infer_batch_size, - vocab_size=args.src_vocab_size, - max_length=args.max_length, - eos_idx=args.eos_idx, - pad_idx=args.bos_idx, - ) - - with paddle.no_grad(): - for i in range(100): - # For warmup. - if 50 == i: - paddle.device.cuda.synchronize(place) - start = time.time() - transformer(src_word=src_word) - paddle.device.cuda.synchronize(place) - logger.info("Average test time for encoder-decoding is %f ms" % ((time.time() - start) / 50 * 1000)) - - -if __name__ == "__main__": - ARGS = parse_args() - yaml_file = ARGS.config - with open(yaml_file, "rt") as f: - args = AttrDict(yaml.safe_load(f)) - args.decoding_lib = ARGS.decoding_lib - args.use_fp16_decoding = ARGS.use_fp16_decoding - args.enable_fast_encoder = ARGS.enable_fast_encoder - args.use_fp16_encoder = ARGS.use_fp16_encoder - pprint(args) - - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/gpt_export_model_sample.py b/paddlenlp/ops/fast_transformer/sample/gpt_export_model_sample.py deleted file mode 100644 index f3ab7771bf17..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/gpt_export_model_sample.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os -from pprint import pprint - -import paddle - -from paddlenlp.ops import FasterGPT -from paddlenlp.transformers import GPTChineseTokenizer, GPTLMHeadModel, GPTTokenizer -from paddlenlp.utils.log import logger - -MODEL_CLASSES = { - "gpt-cpm-large-cn": (GPTLMHeadModel, GPTChineseTokenizer), - "gpt2-medium-en": (GPTLMHeadModel, GPTTokenizer), -} - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", - default="gpt2-medium-en", - type=str, - help="The model name to specify the gpt to use. Can be one of ['gpt2-en', 'gpt2-medium-en', 'gpt-cpm-large-cn']. ", - ) - parser.add_argument( - "--decoding_lib", default="../../build/lib/libdecoding_op.so", type=str, help="Path of libdecoding_op.so. " - ) - parser.add_argument( - "--inference_model_dir", default="./infer_model/", type=str, help="Path to save inference model of gpt. " - ) - parser.add_argument("--topk", default=4, type=int, help="The number of candidate to procedure beam search. ") - parser.add_argument( - "--topp", default=0.0, type=float, help="The probability threshold to procedure topp sampling. " - ) - parser.add_argument("--max_out_len", default=32, type=int, help="Maximum output length. ") - parser.add_argument("--temperature", default=1.0, type=float, help="The temperature to set. ") - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. 
") - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - model_class, tokenizer_class = MODEL_CLASSES[args.model_name_or_path] - tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) - logger.info("Loading the model parameters, please wait...") - model = model_class.from_pretrained(args.model_name_or_path, max_predict_len=args.max_out_len) - - gpt = FasterGPT(model=model, decoding_lib=args.decoding_lib, use_fp16_decoding=args.use_fp16_decoding) - - # Set evaluate mode - gpt.eval() - - # Convert dygraph model to static graph model - gpt = paddle.jit.to_static( - gpt, - input_spec=[ - # input_ids - paddle.static.InputSpec(shape=[None, None], dtype="int32"), - # - # If it's necessarry to provide mem_seq_len and attention_mask, - # the parameters should be: - # mem_seq_len - # paddle.static.InputSpec(shape=[None, None], dtype="int32"), - # attention_mask - # paddle.static.InputSpec(shape=[None, None, None], dtype="float16" if args.use_fp16_decoding else "float32"), - # - None, # mem_seq_len - None, # attention_mask - args.topk, - args.topp, - args.max_out_len, - tokenizer.eos_token_id, - tokenizer.eos_token_id, - tokenizer.pad_token_id, - None, # forced_eos_token_id - args.temperature, - ], - ) - - # Save converted static graph model - paddle.jit.save(gpt, os.path.join(args.inference_model_dir, "gpt")) - logger.info("GPT has been saved to {}".format(args.inference_model_dir)) - - gpt.save_resources(tokenizer, args.inference_model_dir) - - -if __name__ == "__main__": - args = parse_args() - pprint(args) - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/gpt_sample.py b/paddlenlp/ops/fast_transformer/sample/gpt_sample.py deleted file mode 100644 index 7f3344fe6063..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/gpt_sample.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -from pprint import pprint - -import numpy as np -import paddle - -from paddlenlp.transformers import GPTChineseTokenizer, GPTLMHeadModel, GPTTokenizer -from paddlenlp.utils.log import logger - -MODEL_CLASSES = { - "gpt-cpm-large-cn": (GPTLMHeadModel, GPTChineseTokenizer), - "gpt2-medium-en": (GPTLMHeadModel, GPTTokenizer), -} - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", - default="gpt2-medium-en", - type=str, - help="The model name to specify the gpt to use. Can be one of ['gpt2-en', 'gpt2-medium-en', 'gpt-cpm-large-cn']. ", - ) - parser.add_argument( - "--decoding_lib", default="../build/lib/libdecoding_op.so", type=str, help="Path of libdecoding_op.so. " - ) - parser.add_argument("--batch_size", default=4, type=int, help="Batch size. ") - parser.add_argument("--topk", default=4, type=int, help="The number of candidate to procedure beam search. 
") - parser.add_argument( - "--topp", default=1.0, type=float, help="The probability threshold to procedure topp sampling. " - ) - parser.add_argument("--max_length", default=32, type=int, help="Maximum output length. ") - parser.add_argument( - "--start_token", default="<|endoftext|>", type=str, help="The start token. Defaults to <|endoftext|>. " - ) - parser.add_argument( - "--end_token", default="<|endoftext|>", type=str, help="The end token. Defaults to <|endoftext|>. " - ) - parser.add_argument("--temperature", default=1.0, type=float, help="The temperature to set. ") - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ") - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - model_class, tokenizer_class = MODEL_CLASSES[args.model_name_or_path] - tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) - logger.info("Loading the model parameters, please wait...") - model = model_class.from_pretrained(args.model_name_or_path) - model.eval() - - bos_id = tokenizer.convert_tokens_to_ids(args.start_token) - eos_id = tokenizer.convert_tokens_to_ids(args.end_token) - - # Define model - gpt = model - - # Set evaluate mode - gpt.eval() - input_ids = np.array([[bos_id] for i in range(args.batch_size * 1)]).astype("int64").reshape([args.batch_size, 1]) - input_ids = paddle.to_tensor(input_ids) - - with paddle.no_grad(): - for i in range(100): - # For warmup. - if 50 == i: - paddle.device.cuda.synchronize(place) - start = time.time() - out_seq, _ = gpt.generate( - input_ids, - top_k=args.topk, - top_p=args.topp, - max_length=args.max_length, - temperature=args.temperature, - bos_token_id=bos_id, - eos_token_id=eos_id, - decode_strategy="sampling", - use_fp16_decoding=args.use_fp16_decoding, - use_fast=True, - ) - output_sequence = out_seq.numpy() - - paddle.device.cuda.synchronize(place) - logger.info("Average test time for decoding is %f ms" % ((time.time() - start) / 50 * 1000)) - output_sequence = out_seq.numpy().tolist() - for i in range(args.batch_size): - print("========== Sample-%d ==========" % i) - print(tokenizer.convert_ids_to_string(output_sequence[i])) - - -if __name__ == "__main__": - args = parse_args() - pprint(args) - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/mbart_decoding_sample.py b/paddlenlp/ops/fast_transformer/sample/mbart_decoding_sample.py deleted file mode 100644 index 76c0d5699323..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/mbart_decoding_sample.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -import time -from pprint import pprint - -import paddle - -from paddlenlp.data import Pad -from paddlenlp.transformers import MBartForConditionalGeneration, MBartTokenizer -from paddlenlp.utils.log import logger - - -def post_process_seq(seq, bos_idx, eos_idx, output_bos=False, output_eos=False): - """ - Post-process the decoded sequence. - """ - eos_pos = len(seq) - 1 - for i, idx in enumerate(seq): - if idx == eos_idx: - eos_pos = i - break - seq = [idx for idx in seq[: eos_pos + 1] if (output_bos or idx != bos_idx) and (output_eos or idx != eos_idx)] - return seq - - -def prepare_input(tokenizer, sentences, pad_id): - word_pad = Pad(pad_id, dtype="int64") - tokenized = tokenizer(sentences, return_length=True) - inputs = word_pad(tokenized["input_ids"]) - input_ids = paddle.to_tensor(inputs) - return input_ids - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", - default="mbart-large-50-one-to-many-mmt", - type=str, - help="The model name to specify the bart to use. ", - choices=[ - "mbart-large-50-one-to-many-mmt", - "mbart-large-50-many-to-one-mmt", - "mbart-large-50-many-to-many-mmt", - "mbart-large-cc25", - "mbart-large-en-ro", - ], - ) - parser.add_argument( - "--decoding_strategy", - default="beam_search", - type=str, - help="The decoding strategy.", - choices=["greedy_search", "beam_search", "sampling"], - ) - parser.add_argument("--beam_size", default=4, type=int, help="The parameters for beam search. ") - parser.add_argument("--top_k", default=4, type=int, help="The number of candidate to procedure beam search. ") - parser.add_argument( - "--top_p", default=1.0, type=float, help="The probability threshold to procedure topp sampling. " - ) - parser.add_argument("--max_length", default=50, type=int, help="Maximum output length. ") - parser.add_argument("--diversity_rate", default=0.0, type=float, help="The diversity of beam search. ") - parser.add_argument( - "--length_penalty", default=0.0, type=float, help="The power number in length penalty calculation" - ) - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ") - parser.add_argument("--not_use_faster", action="store_false", help="Whether to use FastGeneration. ") - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - paddle.set_device(place) - - tokenizer = MBartTokenizer.from_pretrained(args.model_name_or_path, src_lang="en_XX") - logger.info("Loading the model parameters, please wait...") - model = MBartForConditionalGeneration.from_pretrained(args.model_name_or_path) - # Set evaluate mode - model.eval() - sentences = [ - "I love that girl, but she does not love me.", - "She is so beautiful that I can not help glance at her.", - "Nothing's gonna change my love for you.", - "Drop everything now. Meet me in the pouring rain. Kiss me on the sidewalk.", - ] - - eos_id = model.mbart.config["eos_token_id"] - pad_id = model.mbart.config["pad_token_id"] - input_ids = prepare_input(tokenizer, sentences, pad_id) - - with paddle.no_grad(): - for i in range(100): - # For warmup. 
- if 50 == i: - # PaddlePaddle >= 2.2 - paddle.device.cuda.synchronize() - start = time.perf_counter() - finished_seqs, _ = model.generate( - input_ids=input_ids, - forced_bos_token_id=tokenizer.lang_code_to_id["zh_CN"], - max_length=args.max_length, - decode_strategy=args.decoding_strategy, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.beam_size, - diversity_rate=args.diversity_rate, - length_penalty=args.length_penalty, - use_fast=args.not_use_faster, - ) - paddle.device.cuda.synchronize() - logger.info("Average test time for decoding is %f ms" % ((time.perf_counter() - start) / 50 * 1000)) - - # Output - finished_seqs = finished_seqs.numpy().tolist() - for idx, finished_seq in enumerate(finished_seqs): - finished_seq = finished_seq - print(f"source: {sentences[idx]}") - finished_seq = post_process_seq(finished_seq, tokenizer.lang_code_to_id["zh_CN"], eos_id) - print(f"target: {tokenizer.convert_ids_to_string(finished_seq)}\n") - - -if __name__ == "__main__": - args = parse_args() - pprint(args) - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/mbart_export_model_sample.py b/paddlenlp/ops/fast_transformer/sample/mbart_export_model_sample.py deleted file mode 100644 index 330e7aa5da78..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/mbart_export_model_sample.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -from pprint import pprint - -import paddle - -from paddlenlp.ops import FasterMBART -from paddlenlp.transformers import MBartForConditionalGeneration, MBartTokenizer -from paddlenlp.utils.log import logger - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", - default="mbart-large-50-many-to-many-mmt", - type=str, - help="The model name to specify the bart to use. ", - ) - parser.add_argument( - "--inference_model_dir", default="./infer_model/", type=str, help="Path to save inference model of bart. " - ) - parser.add_argument("--topk", default=4, type=int, help="The number of candidate to procedure top_k sampling. ") - parser.add_argument( - "--topp", default=1.0, type=float, help="The probability threshold to procedure top_p sampling. " - ) - parser.add_argument("--max_out_len", default=64, type=int, help="Maximum output length. ") - parser.add_argument("--temperature", default=1.0, type=float, help="The temperature to set. ") - parser.add_argument("--num_return_sequences", default=1, type=int, help="The number of returned sequences. ") - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ") - parser.add_argument( - "--decoding_strategy", - default="beam_search", - choices=["sampling", "beam_search"], - type=str, - help="The main strategy to decode. ", - ) - parser.add_argument("--num_beams", default=5, type=int, help="The number of candidate to procedure beam search. 
") - parser.add_argument( - "--diversity_rate", default=0.0, type=float, help="The diversity rate to procedure beam search. " - ) - parser.add_argument("--repetition_penalty", default=1.0, type=float, help="The repetition_penalty to set. ") - parser.add_argument("--length_penalty", default=0.0, type=float, help="The length penalty to decode. ") - parser.add_argument("--early_stopping", action="store_true", help="Whether to do early stopping. ") - - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - model = MBartForConditionalGeneration.from_pretrained(args.model_name_or_path) - tokenizer = MBartTokenizer.from_pretrained(args.model_name_or_path, src_lang="en_XX") - - # For opening faster_encoder - model.eval() - - fast_mbart = FasterMBART(model=model, use_fp16_decoding=args.use_fp16_decoding) - # Set evaluate mode - fast_mbart.eval() - - # Convert dygraph model to static graph model - fast_mbart = paddle.jit.to_static( - fast_mbart, - input_spec=[ - # input_ids - paddle.static.InputSpec(shape=[None, None], dtype="int32"), - # encoder_output - None, - # seq_len - None, - paddle.static.InputSpec( - shape=[None, 1], dtype="int32" - ), # forced_bos_token_id can be a Tensor or int (bos_id) - args.num_beams, # num_beams. - args.topk, # top_k - args.topp, # top_p - args.decoding_strategy, # decode_strategy - tokenizer.bos_token_id, # bos_token_id - tokenizer.eos_token_id, # eos_token_id - tokenizer.pad_token_id, # pad_token_id - model.mbart.config["decoder_start_token_id"], # decoder_start_token_id - args.max_out_len, # max_length - args.diversity_rate, # diversity_rate - args.length_penalty, # length_penalty - args.temperature, # temperature - args.num_return_sequences, # num_return_sequences - args.early_stopping, # early_stopping - tokenizer.eos_token_id, # forced_eos_token_id - ], - ) - - # Save converted static graph model - paddle.jit.save(fast_mbart, os.path.join(args.inference_model_dir, "mbart")) - logger.info("MBART has been saved to {}.".format(args.inference_model_dir)) - - -if __name__ == "__main__": - args = parse_args() - pprint(args) - - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/mbart_inference.py b/paddlenlp/ops/fast_transformer/sample/mbart_inference.py deleted file mode 100644 index fd4b4d2ad150..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/mbart_inference.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -from pprint import pprint - -import numpy as np -import paddle.inference as paddle_infer - -from paddlenlp.ops.ext_utils import load -from paddlenlp.transformers import MBartTokenizer - - -def setup_args(): - """Setup arguments.""" - parser = argparse.ArgumentParser() - parser.add_argument( - "--inference_model_dir", default="./infer_model/", type=str, help="Path to save inference model of BART. 
" - ) - parser.add_argument("--batch_size", default=1, type=int, help="Batch size. ") - - args = parser.parse_args() - - return args - - -def postprocess_response(tokenizer, seq, bos_idx, eos_idx): - """Post-process the decoded sequence.""" - eos_pos = len(seq) - 1 - for i, idx in enumerate(seq): - if idx == eos_idx: - eos_pos = i - break - seq = [idx for idx in seq[: eos_pos + 1] if idx != bos_idx and idx != eos_idx] - res = tokenizer.convert_ids_to_string(seq) - return res - - -def infer(args): - model_name = "mbart-large-50-many-to-many-mmt" - - tokenizer = MBartTokenizer.from_pretrained(model_name, src_lang="en_XX") - bos_id = tokenizer.lang_code_to_id["zh_CN"] - inputs = "PaddleNLP is a powerful NLP library with Awesome pre-trained models and easy-to-use interface, supporting wide-range of NLP tasks from research to industrial applications." - - eos_id = tokenizer.eos_token_id - - # Input ids - input_ids = tokenizer(inputs)["input_ids"] - input_ids = np.asarray(input_ids, dtype="int32").reshape(1, -1).repeat(args.batch_size, axis=0) - - # Forced bos token ids - forced_bos_token = np.ones([args.batch_size, 1], dtype="int32") * bos_id - - # Load FastGeneration lib. - load("FastGeneration", verbose=True) - - config = paddle_infer.Config( - os.path.join(args.inference_model_dir, "mbart.pdmodel"), - os.path.join(args.inference_model_dir, "mbart.pdiparams"), - ) - - config.enable_use_gpu(100, 0) - config.disable_glog_info() - predictor = paddle_infer.create_predictor(config) - - input_names = predictor.get_input_names() - - # Input ids - input_ids_handle = predictor.get_input_handle(input_names[0]) - input_ids_handle.copy_from_cpu(input_ids.astype("int32")) - - # Forced bos token ids - forced_bos_token_handle = predictor.get_input_handle(input_names[1]) - forced_bos_token_handle.copy_from_cpu(forced_bos_token.astype("int32")) - - predictor.run() - - output_names = predictor.get_output_names() - output_handle = predictor.get_output_handle(output_names[0]) - output_data = output_handle.copy_to_cpu() - - # [batch_size, num_beams * 2, sequence_length] - output_data = output_data.transpose([1, 2, 0]) - - # Only use the best sequence. - result = [postprocess_response(tokenizer, sample.tolist()[0], bos_id, eos_id) for sample in output_data] - print("Model input:", inputs) - print("Result:", "\n".join(result)) - - -if __name__ == "__main__": - args = setup_args() - pprint(args) - - infer(args) diff --git a/paddlenlp/ops/fast_transformer/sample/plato_export_model_sample.py b/paddlenlp/ops/fast_transformer/sample/plato_export_model_sample.py deleted file mode 100644 index 1b3e6a0e877e..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/plato_export_model_sample.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import os -from pprint import pprint - -import paddle - -from paddlenlp.ops import FasterUnifiedTransformer -from paddlenlp.transformers import ( - UnifiedTransformerLMHeadModel, - UnifiedTransformerTokenizer, -) -from paddlenlp.utils.log import logger - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", - default="plato-xl", - type=str, - help="The model name to specify the PLATO/UnifiedTransformer to use. ", - ) - parser.add_argument( - "--inference_model_dir", default="./infer_model/", type=str, help="Path to save inference model of gpt. " - ) - parser.add_argument("--topk", default=4, type=int, help="The number of candidate to procedure top_k sampling. ") - parser.add_argument( - "--topp", default=1.0, type=float, help="The probability threshold to procedure top_p sampling. " - ) - parser.add_argument("--max_out_len", default=64, type=int, help="Maximum output length. ") - parser.add_argument("--min_out_len", default=1, type=int, help="Minimum output length. ") - parser.add_argument("--temperature", default=1.0, type=float, help="The temperature to set. ") - parser.add_argument("--num_return_sequences", default=1, type=int, help="The number of returned sequences. ") - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ") - parser.add_argument( - "--decoding_strategy", - default="sampling", - choices=["sampling", "beam_search"], - type=str, - help="The main strategy to decode. ", - ) - parser.add_argument("--num_beams", default=4, type=int, help="The number of candidate to procedure beam search. ") - parser.add_argument( - "--diversity_rate", default=0.0, type=float, help="The diversity rate to procedure beam search. " - ) - - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - if args.use_fp16_decoding and os.getenv("PPFG_QKV_MEM_OPT", "0") == "1": - paddle.set_default_dtype("float16") - - model_name = "plato-xl" - model = UnifiedTransformerLMHeadModel.from_pretrained(model_name) - tokenizer = UnifiedTransformerTokenizer.from_pretrained(model_name) - - plato = FasterUnifiedTransformer(model=model, use_fp16_decoding=args.use_fp16_decoding) - # Set evaluate mode - plato.eval() - - # Convert dygraph model to static graph model - plato = paddle.jit.to_static( - plato, - input_spec=[ - # input_ids - paddle.static.InputSpec(shape=[None, None], dtype="int32"), - # token_type_ids - paddle.static.InputSpec(shape=[None, None], dtype="int32"), - # attention_mask - paddle.static.InputSpec(shape=[None, 1, None, None], dtype="float32"), - # seq_len - paddle.static.InputSpec(shape=[None], dtype="int32"), - # role_ids - paddle.static.InputSpec(shape=[None, None], dtype="int32"), - # position_ids - paddle.static.InputSpec(shape=[None, None], dtype="int32"), - args.max_out_len, - args.min_out_len, - args.topk, - args.topp, - args.decoding_strategy, - tokenizer.cls_token_id, # cls/bos - tokenizer.sep_token_id, # sep/eos - tokenizer.pad_token_id, # pad - args.num_beams, # num_beams. Used for beam_search. - args.diversity_rate, # diversity rate. Used for beam search. 
- args.temperature, - args.num_return_sequences, - ], - ) - - # Save converted static graph model - paddle.jit.save(plato, os.path.join(args.inference_model_dir, "plato")) - logger.info("PLATO has been saved to {}.".format(args.inference_model_dir)) - - -if __name__ == "__main__": - args = parse_args() - pprint(args) - - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/plato_inference.py b/paddlenlp/ops/fast_transformer/sample/plato_inference.py deleted file mode 100644 index 8f935b7a0d25..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/plato_inference.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -from pprint import pprint - -import numpy as np -import paddle.inference as paddle_infer - -from paddlenlp.ops.ext_utils import load -from paddlenlp.transformers import UnifiedTransformerTokenizer - - -def setup_args(): - """Setup arguments.""" - parser = argparse.ArgumentParser() - parser.add_argument( - "--inference_model_dir", default="./infer_model/", type=str, help="Path to save inference model of PLATO. " - ) - parser.add_argument("--use_role", action="store_true", help="Whether to use role embeddings. ") - parser.add_argument( - "--position_style", - default="relative", - choices=["continuous", "relative"], - type=str, - help="The type for positional embedding. Default is continuous. ", - ) - - args = parser.parse_args() - - return args - - -def postprocess_response(token_ids, tokenizer): - """Post-process the decoded sequence. Truncate from the first .""" - eos_pos = len(token_ids) - for i, tok_id in enumerate(token_ids): - if tok_id == tokenizer.sep_token_id: - eos_pos = i - break - token_ids = token_ids[:eos_pos] - tokens = tokenizer.convert_ids_to_tokens(token_ids) - tokens = tokenizer.merge_subword(tokens) - return tokens - - -def infer(args): - model_name = "plato-xl" - tokenizer = UnifiedTransformerTokenizer.from_pretrained(model_name) - - context = [ - "Hi , Becky , what's up ?", - "Not much , except that my mother-in-law is driving me up the wall .", - "What's the problem ?", - ] - - data = tokenizer.dialogue_encode( - history=context, - add_start_token_as_response=True, - return_length=True, - return_role_ids=args.use_role, - position_style=args.position_style, - ) - - # Load FastGeneration lib. 
- load("FastGeneration", verbose=True) - - config = paddle_infer.Config( - args.inference_model_dir + "plato.pdmodel", args.inference_model_dir + "plato.pdiparams" - ) - config.enable_use_gpu(100, 0) - config.disable_glog_info() - predictor = paddle_infer.create_predictor(config) - - input_handles = {} - for name in predictor.get_input_names(): - input_handles[name] = predictor.get_input_handle(name) - if name == "attention_mask": - input_handles[name].copy_from_cpu(np.expand_dims(np.asarray(data[name], dtype="float32"), axis=(0, 1))) - else: - input_handles[name].copy_from_cpu(np.asarray(data[name], dtype="int32").reshape([1, -1])) - - output_handles = [predictor.get_output_handle(name) for name in predictor.get_output_names()] - - predictor.run() - - output = [output_handle.copy_to_cpu() for output_handle in output_handles] - - for sample in output[0].transpose([1, 0]).tolist(): - print(" ".join(postprocess_response(sample, tokenizer))) - - -if __name__ == "__main__": - args = setup_args() - pprint(args) - - infer(args) diff --git a/paddlenlp/ops/fast_transformer/sample/t5_export_model_sample.py b/paddlenlp/ops/fast_transformer/sample/t5_export_model_sample.py deleted file mode 100644 index 5fdf1c99532e..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/t5_export_model_sample.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -from pprint import pprint - -import paddle - -from paddlenlp.ops import FasterT5 -from paddlenlp.transformers import T5ForConditionalGeneration, T5Tokenizer -from paddlenlp.utils.log import logger - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", default="t5-base", type=str, help="The model name to specify the bart to use. " - ) - parser.add_argument( - "--inference_model_dir", default="./infer_model/", type=str, help="Path to save inference model of bart. " - ) - parser.add_argument("--topk", default=4, type=int, help="The number of candidate to procedure top_k sampling. ") - parser.add_argument( - "--topp", default=1.0, type=float, help="The probability threshold to procedure top_p sampling. " - ) - parser.add_argument("--max_out_len", default=256, type=int, help="Maximum output length. ") - parser.add_argument("--temperature", default=1.0, type=float, help="The temperature to set. ") - parser.add_argument("--num_return_sequences", default=1, type=int, help="The number of returned sequences. ") - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ") - parser.add_argument( - "--decoding_strategy", - default="beam_search", - choices=["sampling", "beam_search"], - type=str, - help="The main strategy to decode. ", - ) - parser.add_argument("--num_beams", default=4, type=int, help="The number of candidate to procedure beam search. 
") - parser.add_argument( - "--diversity_rate", default=0.0, type=float, help="The diversity rate to procedure beam search. " - ) - parser.add_argument("--repetition_penalty", default=1.0, type=float, help="The repetition_penalty to set. ") - parser.add_argument("--length_penalty", default=0.0, type=float, help="The length penalty to decode. ") - parser.add_argument("--early_stopping", action="store_true", help="Whether to do early stopping. ") - - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - model = T5ForConditionalGeneration.from_pretrained(args.model_name_or_path) - tokenizer = T5Tokenizer.from_pretrained(args.model_name_or_path) - - # For opening faster_encoder - model.eval() - - fast_t5 = FasterT5(model=model, use_fp16_decoding=args.use_fp16_decoding) - # Set evaluate mode - fast_t5.eval() - - # Convert dygraph model to static graph model - fast_t5 = paddle.jit.to_static( - fast_t5, - input_spec=[ - # input_ids - paddle.static.InputSpec(shape=[None, None], dtype="int32"), - # encoder_output - None, - # seq_len - None, - args.max_out_len, # max_length - 0, # min_length - args.topk, # top_k - args.topp, # top_p - args.num_beams, # num_beams - args.decoding_strategy, # decode_strategy - None, # decoder_start_token_id - tokenizer.bos_token_id, # bos_token_id - tokenizer.eos_token_id, # eos_token_id - tokenizer.pad_token_id, # pad_token_id - args.diversity_rate, # diversity_rate - args.temperature, # temperature - args.num_return_sequences, # num_return_sequences - args.length_penalty, # length_penalty - args.early_stopping, # early_stopping - tokenizer.eos_token_id, # forced_eos_token_id - ], - ) - - # Save converted static graph model - paddle.jit.save(fast_t5, os.path.join(args.inference_model_dir, "t5")) - logger.info("T5 has been saved to {}.".format(args.inference_model_dir)) - - -if __name__ == "__main__": - args = parse_args() - pprint(args) - - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/t5_inference.py b/paddlenlp/ops/fast_transformer/sample/t5_inference.py deleted file mode 100644 index 585a4a9566f2..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/t5_inference.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -from pprint import pprint - -import paddle.inference as paddle_infer - -from paddlenlp.ops.ext_utils import load -from paddlenlp.transformers import T5Tokenizer - - -def setup_args(): - """Setup arguments.""" - parser = argparse.ArgumentParser() - parser.add_argument( - "--inference_model_dir", default="./infer_model/", type=str, help="Path to save inference model of T5. " - ) - parser.add_argument("--batch_size", default=1, type=int, help="Batch size. 
") - - args = parser.parse_args() - - return args - - -def postprocess_response(tokenizer, seq, bos_idx, eos_idx): - """Post-process the decoded sequence.""" - eos_pos = len(seq) - 1 - for i, idx in enumerate(seq): - if idx == eos_idx: - eos_pos = i - break - seq = [idx for idx in seq[: eos_pos + 1] if idx != bos_idx and idx != eos_idx] - res = tokenizer.convert_ids_to_string(seq) - return res - - -def infer(args): - model_name = "t5-base" - - tokenizer = T5Tokenizer.from_pretrained(model_name) - inputs = ' This image section from an infrared recording by the Spitzer telescope shows a "family portrait" of countless generations of stars: the oldest stars are seen as blue dots. ' - - # Input ids - input_ids = tokenizer.encode("translate English to French: " + inputs, return_tensors="np")["input_ids"] - - # Load FastGeneration lib. - load("FastGeneration", verbose=True) - - config = paddle_infer.Config( - os.path.join(args.inference_model_dir, "t5.pdmodel"), os.path.join(args.inference_model_dir, "t5.pdiparams") - ) - - config.enable_use_gpu(100, 0) - config.disable_glog_info() - predictor = paddle_infer.create_predictor(config) - - input_names = predictor.get_input_names() - - # Input ids - input_ids_handle = predictor.get_input_handle(input_names[0]) - input_ids_handle.copy_from_cpu(input_ids.astype("int32")) - - predictor.run() - - output_names = predictor.get_output_names() - output_handle = predictor.get_output_handle(output_names[0]) - output_data = output_handle.copy_to_cpu() - - # [batch_size, num_beams * 2, sequence_length] - output_data = output_data.transpose([1, 2, 0]) - - # Only use the best sequence. - translation = tokenizer.decode(output_data[0][0], skip_special_tokens=True, clean_up_tokenization_spaces=False) - print("Result:", translation) - - -if __name__ == "__main__": - args = setup_args() - pprint(args) - - infer(args) diff --git a/paddlenlp/ops/fast_transformer/sample/unimo_text_export_model_sample.py b/paddlenlp/ops/fast_transformer/sample/unimo_text_export_model_sample.py deleted file mode 100644 index eb3f2eeda8d8..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/unimo_text_export_model_sample.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -from pprint import pprint - -import paddle - -from paddlenlp.ops import FasterUNIMOText -from paddlenlp.transformers import UNIMOLMHeadModel, UNIMOTokenizer -from paddlenlp.utils.log import logger - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", - default="unimo-text-1.0-lcsts-new", - type=str, - help="The model name to specify the UNIMOText to use. ", - ) - parser.add_argument( - "--inference_model_dir", default="./infer_model/", type=str, help="Path to save inference model of gpt. " - ) - parser.add_argument("--topk", default=4, type=int, help="The number of candidate to procedure top_k sampling. 
") - parser.add_argument( - "--topp", default=1.0, type=float, help="The probability threshold to procedure top_p sampling. " - ) - parser.add_argument("--max_out_len", default=64, type=int, help="Maximum output length. ") - parser.add_argument("--min_out_len", default=1, type=int, help="Minimum output length. ") - parser.add_argument("--num_return_sequence", default=1, type=int, help="The number of returned sequence. ") - parser.add_argument("--temperature", default=1.0, type=float, help="The temperature to set. ") - parser.add_argument("--num_return_sequences", default=1, type=int, help="The number of returned sequences. ") - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ") - parser.add_argument( - "--decoding_strategy", - default="sampling", - choices=["sampling", "beam_search"], - type=str, - help="The main strategy to decode. ", - ) - parser.add_argument("--num_beams", default=4, type=int, help="The number of candidate to procedure beam search. ") - parser.add_argument( - "--diversity_rate", default=0.0, type=float, help="The diversity rate to procedure beam search. " - ) - - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - model_name = "unimo-text-1.0-lcsts-new" - model = UNIMOLMHeadModel.from_pretrained(model_name) - tokenizer = UNIMOTokenizer.from_pretrained(model_name) - - unimo_text = FasterUNIMOText(model=model, use_fp16_decoding=args.use_fp16_decoding) - # Set evaluate mode - unimo_text.eval() - - # Convert dygraph model to static graph model - unimo_text = paddle.jit.to_static( - unimo_text, - input_spec=[ - # input_ids - paddle.static.InputSpec(shape=[None, None], dtype="int32"), - # token_type_ids - paddle.static.InputSpec(shape=[None, None], dtype="int32"), - # attention_mask - paddle.static.InputSpec(shape=[None, 1, None, None], dtype="float32"), - # seq_len - paddle.static.InputSpec(shape=[None], dtype="int32"), - args.max_out_len, - args.min_out_len, - args.topk, - args.topp, - args.num_beams, # num_beams. Used for beam_search. - args.decoding_strategy, - tokenizer.cls_token_id, # cls/bos - tokenizer.mask_token_id, # mask/eos - tokenizer.pad_token_id, # pad - args.diversity_rate, # diversity rate. Used for beam search. - args.temperature, - args.num_return_sequences, - ], - ) - - # Save converted static graph model - paddle.jit.save(unimo_text, os.path.join(args.inference_model_dir, "unimo_text")) - logger.info("UNIMOText has been saved to {}.".format(args.inference_model_dir)) - - -if __name__ == "__main__": - args = parse_args() - pprint(args) - - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/unimo_text_inference.py b/paddlenlp/ops/fast_transformer/sample/unimo_text_inference.py deleted file mode 100644 index 02effef6af56..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/unimo_text_inference.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -from pprint import pprint - -import numpy as np -import paddle.inference as paddle_infer - -from paddlenlp.ops.ext_utils import load -from paddlenlp.transformers import UNIMOTokenizer - - -def setup_args(): - """Setup arguments.""" - parser = argparse.ArgumentParser() - parser.add_argument( - "--inference_model_dir", default="./infer_model/", type=str, help="Path to save inference model of UNIMOText. " - ) - - args = parser.parse_args() - - return args - - -def postprocess_response(token_ids, tokenizer): - """Post-process the decoded sequence. Truncate from the first .""" - eos_pos = len(token_ids) - for i, tok_id in enumerate(token_ids): - if tok_id == tokenizer.mask_token_id: - eos_pos = i - break - token_ids = token_ids[:eos_pos] - tokens = tokenizer.convert_ids_to_tokens(token_ids) - tokens = tokenizer.merge_subword(tokens) - return tokens - - -def infer(args): - model_name = "unimo-text-1.0-lcsts-new" - tokenizer = UNIMOTokenizer.from_pretrained(model_name) - - inputs = "深度学习是人工智能的核心技术领域。百度飞桨作为中国首个自主研发、功能丰富、开源开放的产业级深度学习平台,将从多层次技术产品、产业AI人才培养和强大的生态资源支持三方面全面护航企业实现快速AI转型升级。" - - data = tokenizer.gen_encode( - inputs, add_start_token_for_decoding=True, return_length=True, is_split_into_words=False - ) - - # Load FastGeneration lib. - load("FastGeneration", verbose=True) - - config = paddle_infer.Config( - args.inference_model_dir + "unimo_text.pdmodel", args.inference_model_dir + "unimo_text.pdiparams" - ) - config.enable_use_gpu(100, 0) - config.disable_glog_info() - predictor = paddle_infer.create_predictor(config) - - input_handles = {} - for name in predictor.get_input_names(): - input_handles[name] = predictor.get_input_handle(name) - if name == "attention_mask": - input_handles[name].copy_from_cpu(np.expand_dims(np.asarray(data[name], dtype="float32"), axis=(0, 1))) - else: - input_handles[name].copy_from_cpu(np.asarray(data[name], dtype="int32").reshape([1, -1])) - - output_handles = [predictor.get_output_handle(name) for name in predictor.get_output_names()] - - predictor.run() - - output = [output_handle.copy_to_cpu() for output_handle in output_handles] - - for sample in output[0].transpose([1, 0]).tolist(): - print("".join(postprocess_response(sample, tokenizer))) - - -if __name__ == "__main__": - args = setup_args() - pprint(args) - - infer(args) diff --git a/paddlenlp/ops/fast_transformer/src/CMakeLists.txt b/paddlenlp/ops/fast_transformer/src/CMakeLists.txt deleted file mode 100644 index 7db53476685b..000000000000 --- a/paddlenlp/ops/fast_transformer/src/CMakeLists.txt +++ /dev/null @@ -1,336 +0,0 @@ -if (${CUDA_VERSION} GREATER_EQUAL 11.0) - message(STATUS "Add DCUDA11_MODE") - add_definitions("-DCUDA11_MODE") -endif() - -add_definitions(-DNDEBUG) -add_definitions(-DPADDLE_CUDA) -# Default is 1 in standard c++ when using gcc8.2 -add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1) - -add_definitions(-w) - -if(ON_INFER) - add_definitions(-DPADDLE_ON_INFERENCE) - - link_directories(${COMMON_LIB_DIRS}) - - set(ft_lib_link - decoder decoding topk cuda_int8_kernels cuda_kernels online_softmax_beamsearch transformer_kernels attention_kernels encoder nccl_utils nvtx_utils - ) - - if(WITH_GPU) - add_definitions("-DPADDLE_WITH_CUDA") - endif() - - if(NOT WITH_STATIC_LIB) - add_definitions("-DPADDLE_WITH_SHARED_LIB") - else() - # PD_INFER_DECL is mainly used to set the dllimport/dllexport attribute in dynamic library mode. 
- # Set it to empty in static library mode to avoid compilation issues. - add_definitions("/DPD_INFER_DECL=") - endif() - - macro(safe_set_static_flag) - foreach(flag_var - CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE - CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) - if(${flag_var} MATCHES "/MD") - string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") - endif(${flag_var} MATCHES "/MD") - endforeach(flag_var) - endmacro() - - if(NOT DEFINED PADDLE_LIB) - message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") - endif() - if(NOT DEFINED DEMO) - message(FATAL_ERROR "please set DEMO with -DDEMO=demo_name") - endif() - - include_directories("${PADDLE_LIB}/paddle/include") - set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/") - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include") - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/include") - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") - if (WITH_ONNXRUNTIME) - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include") - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include") - endif() - - link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") - link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/lib") - link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") - link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") - link_directories("${PADDLE_LIB}/paddle/lib") - if (WITH_ONNXRUNTIME) - link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib") - link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib") - endif() - - if (WIN32) - add_definitions("/DGOOGLE_GLOG_DLL_DECL=") - option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) - if (MSVC_STATIC_CRT) - if (WITH_MKL) - set(FLAG_OPENMP "/openmp") - endif() - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") - safe_set_static_flag() - if (WITH_STATIC_LIB) - add_definitions(-DSTATIC_LIB) - endif() - endif() - else() - if(WITH_MKL) - set(FLAG_OPENMP "-fopenmp") - endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAG_OPENMP}") - endif() - - if (USE_TENSORRT AND WITH_GPU) - set(TENSORRT_ROOT "" CACHE STRING "The root directory of TensorRT library") - if("${TENSORRT_ROOT}" STREQUAL "") - message(FATAL_ERROR "The TENSORRT_ROOT is empty, you must assign it a value with CMake command. 
Such as: -DTENSORRT_ROOT=TENSORRT_ROOT_PATH ") - endif() - set(TENSORRT_INCLUDE_DIR ${TENSORRT_ROOT}/include) - set(TENSORRT_LIB_DIR ${TENSORRT_ROOT}/lib) - endif() - - if (NOT WIN32) - if (USE_TENSORRT AND WITH_GPU) - include_directories("${TENSORRT_INCLUDE_DIR}") - link_directories("${TENSORRT_LIB_DIR}") - endif() - endif(NOT WIN32) - - if(WITH_MKL) - set(MATH_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mklml") - include_directories("${MATH_LIB_PATH}/include") - if(WIN32) - set(MATH_LIB ${MATH_LIB_PATH}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX} - ${MATH_LIB_PATH}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) - else() - set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} - ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) - endif() - set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn") - if(EXISTS ${MKLDNN_PATH}) - include_directories("${MKLDNN_PATH}/include") - if(WIN32) - set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib) - else(WIN32) - set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libdnnl.so.3) - endif(WIN32) - endif() - else() - set(OPENBLAS_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}openblas") - include_directories("${OPENBLAS_LIB_PATH}/include/openblas") - if(WIN32) - set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}) - else() - set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) - endif() - endif() - - if(WITH_STATIC_LIB) - set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX}) - else() - if(WIN32) - set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX}) - else() - set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX}) - endif() - endif() - - if (WITH_ONNXRUNTIME) - set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so paddle2onnx) - endif() - - if (NOT WIN32) - set(EXTERNAL_LIB "-lrt -ldl -lpthread") - set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} - glog gflags protobuf xxhash - ${EXTERNAL_LIB}) - else() - set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} - glog gflags_static libprotobuf xxhash ${EXTERNAL_LIB}) - set(DEPS ${DEPS} shlwapi.lib) - endif(NOT WIN32) - - cuda_add_library(decoding_infer_op ${decoding_op_files} SHARED) - add_dependencies(decoding_infer_op extern_${THIRD_PARTY_NAME} boost) - - string(REPLACE "/" ";" DEMO_PATH ${DEMO}) - - list(LENGTH DEMO_PATH PATH_LEN) - MATH(EXPR PATH_LEN "${PATH_LEN}-1") - list(GET DEMO_PATH ${PATH_LEN} DEMO_NAME) - - string(REPLACE "." 
";" DEMO_NAME ${DEMO_NAME}) - list(GET DEMO_NAME 0 DEMO_NAME) - add_executable(${DEMO_NAME} ${DEMO}) - set(DEPS decoding_infer_op ${ft_lib_link} boost ${DEPS} cublas cudart) - - if(WITH_GPT AND WITH_SP) - set(DEPS ${DEPS} sentencepiece) - add_dependencies(decoding_infer_op extern_sentencepiece) - endif() - - if(WITH_PARALLEL) - set(DEPS ${DEPS} mpi nccl) - endif() - - if(WIN32) - if(USE_TENSORRT) - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX} - ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} - COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX} - ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} - ) - endif() - if(WITH_MKL) - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/mklml.dll ${CMAKE_BINARY_DIR}/Release - COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/libiomp5md.dll ${CMAKE_BINARY_DIR}/Release - COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll ${CMAKE_BINARY_DIR}/Release - ) - else() - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release - ) - endif() - if(NOT WITH_STATIC_LIB) - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_fluid.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} - ) - endif() - endif() - - target_link_libraries(${DEMO_NAME} ${DEPS}) - -else(ON_INFER) - if(NOT PY_CMD) - set(PYTHON_PATH "python" CACHE STRING "Python path") - else() - set(PYTHON_PATH ${PY_CMD} CACHE STRING "Python path") - endif() - - execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import paddle; print(paddle.__version__)" - RESULT_VARIABLE _INC_PYTHON_SUCCESS - OUTPUT_VARIABLE _INC_PYTHON_VALUES) - message(STATUS "PADDLE_VERSION: ${_INC_PYTHON_VALUES}") - - # TODO(gongenlei): support PADDLE_NEW_ALLOCATOR for ON_INFER - if(_INC_PYTHON_VALUES VERSION_GREATER_EQUAL "2.3.0") - add_definitions(-DPADDLE_NEW_ALLOCATOR) - endif() - - if(_INC_PYTHON_VALUES VERSION_GREATER_EQUAL "2.5.0" OR _INC_PYTHON_VALUES VERSION_EQUAL "0.0.0") - find_package(PythonLibs REQUIRED) - include_directories(${PYTHON_INCLUDE_DIRS}) - endif() - - execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import paddle; print(paddle.sysconfig.get_include())" - RESULT_VARIABLE _INC_PYTHON_SUCCESS - OUTPUT_VARIABLE _INC_PYTHON_VALUES) - if (NOT _INC_PYTHON_SUCCESS MATCHES 0) - message(FATAL_ERROR "Python config Error.") - endif() - string(REGEX REPLACE ";" "\\\\;" _INC_PYTHON_VALUES ${_INC_PYTHON_VALUES}) - string(REGEX REPLACE "\n" ";" _INC_PYTHON_VALUES ${_INC_PYTHON_VALUES}) - list(GET _INC_PYTHON_VALUES 0 PY_INCLUDE_DIR) - - list(APPEND COMMON_HEADER_DIRS ${PY_INCLUDE_DIR}) - list(APPEND COMMON_HEADER_DIRS ${PY_INCLUDE_DIR}/third_party) - - include_directories( - ${COMMON_HEADER_DIRS} - ) - - execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import paddle; print(paddle.sysconfig.get_lib())" - RESULT_VARIABLE _LIB_PYTHON_SUCCESS - OUTPUT_VARIABLE _LIB_PYTHON_VALUES) - if (NOT _LIB_PYTHON_SUCCESS MATCHES 0) - message(FATAL_ERROR "Python config Error.") - endif() - string(REGEX REPLACE ";" "\\\\;" _LIB_PYTHON_VALUES ${_LIB_PYTHON_VALUES}) - string(REGEX REPLACE "\n" ";" _LIB_PYTHON_VALUES ${_LIB_PYTHON_VALUES}) - list(GET _LIB_PYTHON_VALUES 0 PY_LIB_DIR) - list(APPEND 
COMMON_LIB_DIRS ${PY_LIB_DIR}) - - link_directories( - ${COMMON_LIB_DIRS} - ) - - include_directories(${PY_INCLUDE_DIR}) - include_directories(${PY_INCLUDE_DIR}/third_party) - - if(EXISTS ${PY_LIB_DIR}/libpaddle_custom_op.so) - set(lib_link - -lpaddle_custom_op - ) - endif() - - if(EXISTS ${PY_LIB_DIR}/../fluid/) - if(EXISTS ${PY_LIB_DIR}/../fluid/libpaddle.so) - set(lib_link - -lpaddle - ) - elseif(EXISTS ${PY_LIB_DIR}/../fluid/core_avx.so) - set(lib_link - -l:core_avx.so - ) - else() - set(lib_link - -l:core_noavx.so - ) - endif() - link_directories( - ${PY_LIB_DIR}/../fluid/ - ) - elseif(EXISTS ${PY_LIB_DIR}/../base/) - if(EXISTS ${PY_LIB_DIR}/../base/libpaddle.so) - set(lib_link - -lpaddle - ) - elseif(EXISTS ${PY_LIB_DIR}/../base/core_avx.so) - set(lib_link - -l:core_avx.so - ) - else() - set(lib_link - -l:core_noavx.so - ) - endif() - link_directories( - ${PY_LIB_DIR}/../base/ - ) - endif() - - set(ft_lib_link - -ldecoder -ldecoding -ltopk -lcuda_int8_kernels -lcuda_kernels -lonline_softmax_beamsearch -ltransformer_kernels -lattention_kernels -lencoder -lnccl_utils -lnvtx_utils - ) - - if(WITH_PARALLEL) - set(ft_lib_link ${ft_lib_link} -lmpi -lnccl) - endif() - - add_definitions(-DPADDLE_WITH_CUDA) - add_definitions(-DEIGEN_USE_GPU) - add_definitions(-DPADDLE_USE_DSO) - if (WITH_MKL) - add_definitions(-DPADDLE_WITH_MKLDNN) - endif() - - add_library(decoding_op SHARED ${decoding_op_files}) - add_dependencies(decoding_op extern_${THIRD_PARTY_NAME} boost) - target_link_libraries(decoding_op PRIVATE -lcublas -lcudart ${lib_link} ${ft_lib_link}) -endif() diff --git a/paddlenlp/ops/fast_transformer/src/cublas_handle.cc b/paddlenlp/ops/fast_transformer/src/cublas_handle.cc deleted file mode 100644 index dfdc9badc005..000000000000 --- a/paddlenlp/ops/fast_transformer/src/cublas_handle.cc +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "cublas_handle.h" - -CublasHandle* CublasHandle::GetInstance() { - static CublasHandle* p_handle = nullptr; - if (p_handle == nullptr) { - p_handle = new CublasHandle(); - } - return p_handle; -} - -CublasHandle::~CublasHandle() { - cublasDestroy(cublas_handle_); - cublasLtDestroy(cublaslt_handle_); -} diff --git a/paddlenlp/ops/fast_transformer/src/cublas_handle.h b/paddlenlp/ops/fast_transformer/src/cublas_handle.h deleted file mode 100644 index d636d0585dd1..000000000000 --- a/paddlenlp/ops/fast_transformer/src/cublas_handle.h +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
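The CublasHandle class implemented above and declared in the header that follows is a lazy, process-wide singleton, presumably so the fused FastGeneration ops share a single cuBLAS/cublasLt handle pair instead of each creating their own. A minimal sketch of how calling code would use it; the wrapper function and its stream parameter are assumptions, not code from these files.

    #include <cublas_v2.h>
    #include <cuda_runtime.h>

    #include "cublas_handle.h"

    // Hypothetical caller: every invocation sees the same handle; only the
    // first GetInstance() call actually creates it.
    void run_gemm_on_stream(cudaStream_t stream) {
      cublasHandle_t blas = CublasHandle::GetInstance()->cublas_handle_;
      cublasSetStream(blas, stream);  // bind the shared handle to this stream
      // ... cublasGemmEx(blas, ...) for the decoding GEMMs would follow ...
    }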
-See the License for the specific language governing permissions and -limitations under the License. */ - - -#include "fastertransformer/utils/common.h" - - -/** - * The CublasHandle class defines the `GetInstance` method that serves as an - * alternative to constructor and lets clients access the same instance of this - * class over and over. - */ -class CublasHandle { - /** - * The CublasHandle's constructor should always be private to prevent direct - * construction calls with the `new` operator. - */ -private: - CublasHandle() { - cublasCreate(&cublas_handle_); - cublasLtCreate(&cublaslt_handle_); - } - -public: - /** - * CublasHandle should not be cloneable. - */ - CublasHandle(CublasHandle& other) = delete; - - /** - * CublasHandle should not be assignable. - */ - void operator=(const CublasHandle&) = delete; - - /** - * This is the static method that controls the access to the singleton - * instance. On the first run, it creates a singleton object and places it - * into the static field. On subsequent runs, it returns the client existing - * object stored in the static field. - */ - static CublasHandle* GetInstance(); - - cublasHandle_t cublas_handle_; - cublasLtHandle_t cublaslt_handle_; - - ~CublasHandle(); -}; diff --git a/paddlenlp/ops/fast_transformer/src/demo/gpt.cc b/paddlenlp/ops/fast_transformer/src/demo/gpt.cc deleted file mode 100644 index 7dd55efb2c7a..000000000000 --- a/paddlenlp/ops/fast_transformer/src/demo/gpt.cc +++ /dev/null @@ -1,321 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef GPT_ON_SENTENCEPIECE -#include -#endif - -#include "helper.h" -#include "utf8.h" - -#include -#include -#include -#include -#include -#include - -DEFINE_int32(batch_size, 1, "Batch size to do inference. "); -DEFINE_int32(gpu_id, 0, "The gpu id to do inference. "); -DEFINE_string(model_dir, - "./infer_model/", - "The directory to the inference model. "); -DEFINE_string(vocab_file, - "./infer_model/vocab.txt", - "The path to the vocabulary file. "); -DEFINE_string(start_token, "<|endoftext|>", "The start token of GPT."); -DEFINE_string(end_token, "<|endoftext|>", "The end token of GPT."); - -using namespace paddle_infer; - -std::string model_dir = ""; -std::string vocab_file = ""; - -const int BOS_IDX = 50256; -const int EOS_IDX = 50256; -const int PAD_IDX = 50256; -const int MAX_LENGTH = 256; - -int batch_size = 1; -int gpu_id = 0; - -namespace paddle { -namespace inference { - -struct DataInput { - std::vector src_data; -}; - -struct DataResult { - std::wstring result_q; -}; - -bool get_result_tensor(const std::unique_ptr& seq_ids, - std::vector& dataresultvec, - std::unordered_map& num2word_dict, - std::unordered_map& byte_decoder) { - // NOTE: Add SentencePiece to do some postprocess on cpm model. 
- // sentencepiece::SentencePieceProcessor processor; - // max_length * batch_size - std::vector output_shape = seq_ids->shape(); - int batch_size = output_shape[1]; - int out_num = std::accumulate( - output_shape.begin(), output_shape.end(), 1, std::multiplies()); - std::vector seq_ids_out; - seq_ids_out.resize(out_num); - seq_ids->CopyToCpu(seq_ids_out.data()); - - dataresultvec.resize(batch_size); - auto max_output_length = output_shape[0]; - - for (int bsz = 0; bsz < batch_size; ++bsz) { - std::u32string tmp_result_q = U""; - for (int len = 1; len < max_output_length; ++len) { - tmp_result_q = - tmp_result_q + num2word_dict[seq_ids_out[len * batch_size + bsz]]; - } - - for (int i = 0; i < tmp_result_q.length(); ++i) { - char32_t tmp = tmp_result_q[i]; - if (byte_decoder.find(tmp) != byte_decoder.end()) { - dataresultvec[bsz].result_q = dataresultvec[bsz].result_q + - static_cast(byte_decoder[tmp]); - } else { - std::cout << "Should not reach here. " << std::endl; - exit(-1); - } - } - } - return true; -} - -std::unordered_map convert_unicode() { - char32_t c0 = U'!'; - char32_t c1 = U'~'; - char32_t c2 = U'¡'; - char32_t c3 = U'¬'; - char32_t c4 = U'®'; - char32_t c5 = U'ÿ'; - - int a0 = c0; - int a1 = c1; - int a2 = c2; - int a3 = c3; - int a4 = c4; - int a5 = c5; - - std::unordered_map ret; - int n = 0; - for (int b = 0; b < 256; ++b) { - char32_t key; - if (b < a0 || (b > a1 && b < a2) || (b < a3 && b > a4) || b > a5) { - key = static_cast(256 + n); - ret.insert(std::pair(key, b)); - n++; - } else { - key = static_cast(b); - ret.insert(std::pair(key, b)); - } - } - - return ret; -} - -class DataReader { -public: - DataReader() {} - - bool NextBatch(std::shared_ptr& predictor, - const int& batch_size, - const std::u32string& start_token, - const std::u32string& end_token, - const int& num_batches, - std::vector& source_query_vec) { - if (current_batches++ >= num_batches) { - return false; - } - - for (int i = 0; i < batch_size; ++i) { - source_query_vec.push_back(start_token); - } - - std::u32string line; - std::vector word_data; - std::vector data_input_vec; - int max_len = 0; - for (int i = 0; i < batch_size; i++) { - DataInput data_input; - data_input.src_data.push_back(word2num_dict[start_token]); - max_len = std::max(max_len, static_cast(data_input.src_data.size())); - max_len = std::min(max_len, MAX_LENGTH); - data_input_vec.push_back(data_input); - } - if (data_input_vec.empty()) { - return false; - } - return TensorMoreBatch( - predictor, data_input_vec, max_len, data_input_vec.size()); - } - - bool GetWordDict() { - std::ifstream fin(vocab_file); - std::string line; - int k = 0; - while (std::getline(fin, line)) { - std::u32string tmp = utf8::utf8to32(line); - word2num_dict[tmp] = k; - num2word_dict[k] = tmp; - k += 1; - } - - fin.close(); - - return true; - } - - int GetCurrentBatch() { return current_batches; } - - std::unordered_map word2num_dict; - std::unordered_map num2word_dict; - std::unique_ptr file; - -private: - bool TensorMoreBatch(std::shared_ptr& predictor, - std::vector& data_input_vec, - int max_len, - int batch_size) { - auto ids_name = predictor->GetInputNames(); - auto ids_t = predictor->GetInputHandle(ids_name[0]); - std::vector ids_vec; - ids_vec.resize(max_len * batch_size); - for (int i = 0; i < batch_size; ++i) { - for (int k = 0; k < max_len; ++k) { - if (k < data_input_vec[i].src_data.size()) { - ids_vec[i * max_len + k] = data_input_vec[i].src_data[k]; - } else { - ids_vec[i * max_len + k] = PAD_IDX; - } - } - } - ids_t->Reshape({batch_size, 
max_len}); - ids_t->CopyFromCpu(ids_vec.data()); - - return true; - } - - int current_batches = 0; -}; - - -template -void SummaryConfig(const paddle_infer::Config& config, - double infer_time, - int num_batches, - int num_samples) { - LOG(INFO) << "----------------------- Perf info -----------------------"; - LOG(INFO) << "batch_size: " << batch_size; - LOG(INFO) << "average_latency(ms): " << infer_time / num_samples << ", " - << "QPS: " << num_samples / (infer_time / 1000.0); -} - - -void Main(const int& batch_size, - const int& gpu_id, - const std::u32string& start_token, - const std::u32string& end_token) { - Config config; - config.SetModel(model_dir + "/gpt.pdmodel", model_dir + "/gpt.pdiparams"); - - config.EnableUseGpu(100, gpu_id); - - config.SwitchUseFeedFetchOps(false); - config.SwitchSpecifyInputNames(true); - auto predictor = CreatePredictor(config); - DataReader reader; - reader.GetWordDict(); - - double whole_time = 0; - Timer timer; - int num_batches = 100; - int warmup = 50; - std::vector source_query_vec; - auto byte_decoder = convert_unicode(); - - while (reader.NextBatch(predictor, - batch_size, - start_token, - end_token, - num_batches, - source_query_vec)) { - int crt_batch = reader.GetCurrentBatch(); - if (crt_batch >= warmup) { - timer.tic(); - } - predictor->Run(); - - if (crt_batch >= warmup) { - whole_time += timer.toc(); - } - - std::vector dataresultvec; - auto output_names = predictor->GetOutputNames(); - get_result_tensor(predictor->GetOutputHandle(output_names[0]), - dataresultvec, - reader.num2word_dict, - byte_decoder); - - for (int i = 0; i < batch_size; ++i) { - std::wcout << dataresultvec[i].result_q; - std::cout << std::endl; - } - source_query_vec.clear(); - } - std::cout << std::endl; - SummaryConfig(config, - whole_time, - num_batches - warmup, - (num_batches - warmup) * batch_size); -} -} // namespace inference -} // namespace paddle - -int main(int argc, char** argv) { - gflags::ParseCommandLineFlags(&argc, &argv, true); - - batch_size = FLAGS_batch_size; - gpu_id = FLAGS_gpu_id; - - model_dir = FLAGS_model_dir; - vocab_file = FLAGS_vocab_file; - - paddle::inference::Main(batch_size, - gpu_id, - utf8::utf8to32(FLAGS_start_token), - utf8::utf8to32(FLAGS_end_token)); - - return 0; -} diff --git a/paddlenlp/ops/fast_transformer/src/demo/helper.h b/paddlenlp/ops/fast_transformer/src/demo/helper.h deleted file mode 100644 index 046ca4c9e3ea..000000000000 --- a/paddlenlp/ops/fast_transformer/src/demo/helper.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#pragma once -#include -#include -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include "paddle_inference_api.h" - -namespace paddle { -namespace inference { -// Timer for timer -class Timer { -public: - std::chrono::high_resolution_clock::time_point start; - std::chrono::high_resolution_clock::time_point startu; - - void tic() { start = std::chrono::high_resolution_clock::now(); } - - double toc() { - startu = std::chrono::high_resolution_clock::now(); - std::chrono::duration time_span = - std::chrono::duration_cast>(startu - - start); - double used_time_ms = static_cast(time_span.count()) * 1000.0; - return used_time_ms; - } -}; - -static void split(const std::string &str, - char sep, - std::vector *pieces) { - pieces->clear(); - if (str.empty()) { - return; - } - size_t pos = 0; - size_t next = str.find(sep, pos); - while (next != std::string::npos) { - pieces->push_back(str.substr(pos, next - pos)); - pos = next + 1; - next = str.find(sep, pos); - } - if (!str.substr(pos).empty()) { - pieces->push_back(str.substr(pos)); - } -} - -} // namespace inference -} // namespace paddle diff --git a/paddlenlp/ops/fast_transformer/src/demo/transformer_e2e.cc b/paddlenlp/ops/fast_transformer/src/demo/transformer_e2e.cc deleted file mode 100644 index 2f1c802f96ed..000000000000 --- a/paddlenlp/ops/fast_transformer/src/demo/transformer_e2e.cc +++ /dev/null @@ -1,281 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "helper.h" - -#include -#include -#include -#include -#include -#include - -using namespace paddle_infer; - -DEFINE_int32(batch_size, 1, "Batch size to do inference. "); -DEFINE_int32(gpu_id, 0, "The gpu id to do inference. "); -DEFINE_string(model_dir, - "./infer_model/", - "The directory to the inference model. "); -DEFINE_string(vocab_file, - "./vocab_all.bpe.33708", - "The path to the vocabulary file. "); -DEFINE_string(data_file, - "./newstest2014.tok.bpe.33708.en", - "The path to the input data file. 
"); - -std::string model_dir = ""; -std::string vocab_file = ""; -std::string data_file = ""; - -const int EOS_IDX = 1; -const int PAD_IDX = 0; -const int MAX_LENGTH = 256; -const int N_BEST = 1; - -int batch_size = 1; -int gpu_id = 0; - -namespace paddle { -namespace inference { - -struct DataInput { - std::vector src_data; -}; - -struct DataResult { - std::string result_q; -}; - -bool get_result_tensor(const std::unique_ptr& seq_ids, - std::vector& dataresultvec, - std::unordered_map& num2word_dict) { - std::vector output_shape = seq_ids->shape(); - int batch_size = output_shape[1]; - int beam_num = output_shape[2]; - int out_num = std::accumulate( - output_shape.begin(), output_shape.end(), 1, std::multiplies()); - std::vector seq_ids_out; - seq_ids_out.resize(out_num); - seq_ids->CopyToCpu(seq_ids_out.data()); - - dataresultvec.resize(batch_size * N_BEST); - auto max_output_length = output_shape[0]; - - for (int bsz = 0; bsz < output_shape[1]; ++bsz) { - for (int k = 0; k < N_BEST; ++k) { - dataresultvec[bsz * N_BEST + k].result_q = ""; - for (int len = 0; len < max_output_length; ++len) { - if (seq_ids_out[len * batch_size * beam_num + bsz * beam_num + k] == - EOS_IDX) - break; - dataresultvec[bsz * N_BEST + k].result_q = - dataresultvec[bsz * N_BEST + k].result_q + - num2word_dict[seq_ids_out[len * batch_size * beam_num + - bsz * beam_num + k]] + - " "; - } - } - } - return true; -} - -class DataReader { -public: - explicit DataReader(const std::string& path) - : file(new std::ifstream(path)) {} - - bool NextBatch(std::shared_ptr& predictor, - const int& batch_size, - std::vector& source_query_vec) { - std::string line; - std::vector word_data; - std::vector data_input_vec; - int max_len = 0; - for (int i = 0; i < batch_size; i++) { - if (!std::getline(*file, line)) { - break; - } - DataInput data_input; - split(line, ' ', &word_data); - std::string query_str = ""; - for (int j = 0; j < word_data.size(); ++j) { - if (j >= MAX_LENGTH) { - break; - } - query_str += word_data[j]; - if (word2num_dict.find(word_data[j]) == word2num_dict.end()) { - data_input.src_data.push_back(word2num_dict[""]); - } else { - data_input.src_data.push_back(word2num_dict[word_data[j]]); - } - } - source_query_vec.push_back(query_str); - data_input.src_data.push_back(EOS_IDX); - max_len = std::max(max_len, static_cast(data_input.src_data.size())); - max_len = std::min(max_len, MAX_LENGTH); - data_input_vec.push_back(data_input); - } - if (data_input_vec.empty()) { - return false; - } - return TensorMoreBatch( - predictor, data_input_vec, max_len, data_input_vec.size()); - } - - bool GetWordDict() { - std::ifstream fin(vocab_file); - std::string line; - int k = 0; - while (std::getline(fin, line)) { - word2num_dict[line] = k; - num2word_dict[k] = line; - k += 1; - } - - fin.close(); - - return true; - } - - std::unordered_map word2num_dict; - std::unordered_map num2word_dict; - std::unique_ptr file; - -private: - bool TensorMoreBatch(std::shared_ptr& predictor, - std::vector& data_input_vec, - int max_len, - int batch_size) { - auto src_word_t = predictor->GetInputHandle("src_word"); - std::vector src_word_vec; - src_word_vec.resize(max_len * batch_size); - for (int i = 0; i < batch_size; ++i) { - for (int k = 0; k < max_len; ++k) { - if (k < data_input_vec[i].src_data.size()) { - src_word_vec[i * max_len + k] = data_input_vec[i].src_data[k]; - } else { - src_word_vec[i * max_len + k] = PAD_IDX; - } - } - } - src_word_t->Reshape({batch_size, max_len}); - src_word_t->CopyFromCpu(src_word_vec.data()); - - // 
NOTE: If the saved model supports force decoding, a nullptr must be - // given to trg_word to ensure predictor work properly when not - // using force decoding. - /* - * auto trg_word_t = predictor->GetInputHandle("trg_word"); - * trg_word_t->Reshape({0, 0}); - * trg_word_t->CopyFromCpu((int*)nullptr); - */ - - return true; - } -}; - - -template -void SummaryConfig(const paddle_infer::Config& config, - double infer_time, - int num_batches, - int num_samples) { - LOG(INFO) << "----------------------- Data info -----------------------"; - LOG(INFO) << "batch_size: " << batch_size; - LOG(INFO) << "num_of_samples: " << num_samples; - LOG(INFO) << "----------------------- Conf info -----------------------"; - LOG(INFO) << "runtime_device: " << (config.use_gpu() ? "gpu" : "cpu"); - LOG(INFO) << "ir_optim: " << (config.ir_optim() ? "true" : "false"); - LOG(INFO) << "----------------------- Perf info -----------------------"; - LOG(INFO) << "average_latency(ms): " << infer_time / num_samples << ", " - << "QPS: " << num_samples / (infer_time / 1000.0); -} - - -void Main(int batch_size, int gpu_id) { - Config config; - config.SetModel(model_dir + "/transformer.pdmodel", - model_dir + "/transformer.pdiparams"); - - config.EnableUseGpu(100, gpu_id); - - config.SwitchUseFeedFetchOps(false); - config.SwitchSpecifyInputNames(true); - // When using fp16, fc_elementwise_layernorm_fuse_pass causes a little - // different translation results with original dygraph prediction, maybe you - // can turn off the IR optimization for same results as following: - // config.SwitchIrOptim(false); - auto predictor = CreatePredictor(config); - DataReader reader(data_file); - reader.GetWordDict(); - - double whole_time = 0; - Timer timer; - int num_batches = 0; - int num_samples = 0; - std::vector source_query_vec; - std::ofstream out("predict.txt"); - - while (reader.NextBatch(predictor, batch_size, source_query_vec)) { - timer.tic(); - predictor->Run(); - std::vector dataresultvec; - auto output_names = predictor->GetOutputNames(); - get_result_tensor(predictor->GetOutputHandle(output_names[0]), - dataresultvec, - reader.num2word_dict); - - whole_time += timer.toc(); - num_batches++; - - if (out.is_open()) { - for (int i = 0; i < dataresultvec.size(); ++i) { - out << dataresultvec[i].result_q << "\n"; - } - } - num_samples += dataresultvec.size(); - - source_query_vec.clear(); - } - SummaryConfig(config, whole_time, num_batches, num_samples); - out.close(); -} -} // namespace inference -} // namespace paddle - -int main(int argc, char** argv) { - gflags::ParseCommandLineFlags(&argc, &argv, true); - - batch_size = FLAGS_batch_size; - gpu_id = FLAGS_gpu_id; - - model_dir = FLAGS_model_dir; - vocab_file = FLAGS_vocab_file; - data_file = FLAGS_data_file; - - paddle::inference::Main(batch_size, gpu_id); - - return 0; -} diff --git a/paddlenlp/ops/fast_transformer/src/demo/utf8.h b/paddlenlp/ops/fast_transformer/src/demo/utf8.h deleted file mode 100644 index 82b13f59f983..000000000000 --- a/paddlenlp/ops/fast_transformer/src/demo/utf8.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The 
copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "utf8/checked.h" -#include "utf8/unchecked.h" - -#endif // header guard diff --git a/paddlenlp/ops/fast_transformer/src/demo/utf8/checked.h b/paddlenlp/ops/fast_transformer/src/demo/utf8/checked.h deleted file mode 100644 index 512dcc2fbac8..000000000000 --- a/paddlenlp/ops/fast_transformer/src/demo/utf8/checked.h +++ /dev/null @@ -1,319 +0,0 @@ -// Copyright 2006-2016 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "core.h" -#include - -namespace utf8 -{ - // Base for the exceptions that may be thrown from the library - class exception : public ::std::exception { - }; - - // Exceptions that may be thrown from the library functions. 
- class invalid_code_point : public exception { - uint32_t cp; - public: - invalid_code_point(uint32_t codepoint) : cp(codepoint) {} - virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid code point"; } - uint32_t code_point() const {return cp;} - }; - - class invalid_utf8 : public exception { - uint8_t u8; - public: - invalid_utf8 (uint8_t u) : u8(u) {} - invalid_utf8 (char c) : u8(static_cast(c)) {} - virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-8"; } - uint8_t utf8_octet() const {return u8;} - }; - - class invalid_utf16 : public exception { - uint16_t u16; - public: - invalid_utf16 (uint16_t u) : u16(u) {} - virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-16"; } - uint16_t utf16_word() const {return u16;} - }; - - class not_enough_room : public exception { - public: - virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Not enough space"; } - }; - - /// The library API - functions intended to be called by the users - - template - octet_iterator append(uint32_t cp, octet_iterator result) - { - if (!utf8::internal::is_code_point_valid(cp)) - throw invalid_code_point(cp); - - return internal::append(cp, result); - } - - template - output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) - { - while (start != end) { - octet_iterator sequence_start = start; - internal::utf_error err_code = utf8::internal::validate_next(start, end); - switch (err_code) { - case internal::UTF8_OK : - for (octet_iterator it = sequence_start; it != start; ++it) - *out++ = *it; - break; - case internal::NOT_ENOUGH_ROOM: - out = utf8::append (replacement, out); - start = end; - break; - case internal::INVALID_LEAD: - out = utf8::append (replacement, out); - ++start; - break; - case internal::INCOMPLETE_SEQUENCE: - case internal::OVERLONG_SEQUENCE: - case internal::INVALID_CODE_POINT: - out = utf8::append (replacement, out); - ++start; - // just one replacement mark for the sequence - while (start != end && utf8::internal::is_trail(*start)) - ++start; - break; - } - } - return out; - } - - template - inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) - { - static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); - return utf8::replace_invalid(start, end, out, replacement_marker); - } - - template - uint32_t next(octet_iterator& it, octet_iterator end) - { - uint32_t cp = 0; - internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); - switch (err_code) { - case internal::UTF8_OK : - break; - case internal::NOT_ENOUGH_ROOM : - throw not_enough_room(); - case internal::INVALID_LEAD : - case internal::INCOMPLETE_SEQUENCE : - case internal::OVERLONG_SEQUENCE : - throw invalid_utf8(static_cast(*it)); - case internal::INVALID_CODE_POINT : - throw invalid_code_point(cp); - } - return cp; - } - - template - uint32_t peek_next(octet_iterator it, octet_iterator end) - { - return utf8::next(it, end); - } - - template - uint32_t prior(octet_iterator& it, octet_iterator start) - { - // can't do much if it == start - if (it == start) - throw not_enough_room(); - - octet_iterator end = it; - // Go back until we hit either a lead octet or start - while (utf8::internal::is_trail(*(--it))) - if (it == start) - throw invalid_utf8(*it); // error - no lead byte in the sequence - return utf8::peek_next(it, end); - } - - template - void advance 
(octet_iterator& it, distance_type n, octet_iterator end) - { - const distance_type zero(0); - if (n < zero) { - // backward - for (distance_type i = n; i < zero; ++i) - utf8::prior(it, end); - } else { - // forward - for (distance_type i = zero; i < n; ++i) - utf8::next(it, end); - } - } - - template - typename std::iterator_traits::difference_type - distance (octet_iterator first, octet_iterator last) - { - typename std::iterator_traits::difference_type dist; - for (dist = 0; first < last; ++dist) - utf8::next(first, last); - return dist; - } - - template - octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) - { - while (start != end) { - uint32_t cp = utf8::internal::mask16(*start++); - // Take care of surrogate pairs first - if (utf8::internal::is_lead_surrogate(cp)) { - if (start != end) { - uint32_t trail_surrogate = utf8::internal::mask16(*start++); - if (utf8::internal::is_trail_surrogate(trail_surrogate)) - cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; - else - throw invalid_utf16(static_cast(trail_surrogate)); - } - else - throw invalid_utf16(static_cast(cp)); - - } - // Lone trail surrogate - else if (utf8::internal::is_trail_surrogate(cp)) - throw invalid_utf16(static_cast(cp)); - - result = utf8::append(cp, result); - } - return result; - } - - template - u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) - { - while (start < end) { - uint32_t cp = utf8::next(start, end); - if (cp > 0xffff) { //make a surrogate pair - *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); - *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); - } - else - *result++ = static_cast(cp); - } - return result; - } - - template - octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) - { - while (start != end) - result = utf8::append(*(start++), result); - - return result; - } - - template - u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) - { - while (start < end) - (*result++) = utf8::next(start, end); - - return result; - } - - // The iterator class - template - class iterator { - octet_iterator it; - octet_iterator range_start; - octet_iterator range_end; - public: - typedef uint32_t value_type; - typedef uint32_t* pointer; - typedef uint32_t& reference; - typedef std::ptrdiff_t difference_type; - typedef std::bidirectional_iterator_tag iterator_category; - iterator () {} - explicit iterator (const octet_iterator& octet_it, - const octet_iterator& rangestart, - const octet_iterator& rangeend) : - it(octet_it), range_start(rangestart), range_end(rangeend) - { - if (it < range_start || it > range_end) - throw std::out_of_range("Invalid utf-8 iterator position"); - } - // the default "big three" are OK - octet_iterator base () const { return it; } - uint32_t operator * () const - { - octet_iterator temp = it; - return utf8::next(temp, range_end); - } - bool operator == (const iterator& rhs) const - { - if (range_start != rhs.range_start || range_end != rhs.range_end) - throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); - return (it == rhs.it); - } - bool operator != (const iterator& rhs) const - { - return !(operator == (rhs)); - } - iterator& operator ++ () - { - utf8::next(it, range_end); - return *this; - } - iterator operator ++ (int) - { - iterator temp = *this; - utf8::next(it, range_end); - return temp; - } - iterator& operator -- () - { - utf8::prior(it, 
range_start); - return *this; - } - iterator operator -- (int) - { - iterator temp = *this; - utf8::prior(it, range_start); - return temp; - } - }; // class iterator - -} // namespace utf8 - -#if UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later -#include "cpp17.h" -#elif UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later -#include "cpp11.h" -#endif // C++ 11 or later - -#endif //header guard - diff --git a/paddlenlp/ops/fast_transformer/src/demo/utf8/core.h b/paddlenlp/ops/fast_transformer/src/demo/utf8/core.h deleted file mode 100644 index 34371ee31c8c..000000000000 --- a/paddlenlp/ops/fast_transformer/src/demo/utf8/core.h +++ /dev/null @@ -1,387 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include - -// Determine the C++ standard version. -// If the user defines UTF_CPP_CPLUSPLUS, use that. -// Otherwise, trust the unreliable predefined macro __cplusplus - -#if !defined UTF_CPP_CPLUSPLUS - #define UTF_CPP_CPLUSPLUS __cplusplus -#endif - -#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later - #define UTF_CPP_OVERRIDE override - #define UTF_CPP_NOEXCEPT noexcept -#else // C++ 98/03 - #define UTF_CPP_OVERRIDE - #define UTF_CPP_NOEXCEPT throw() -#endif // C++ 11 or later - - -namespace utf8 -{ - // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers - // You may need to change them to match your system. - // These typedefs have the same names as ones from cstdint, or boost/cstdint - typedef unsigned char uint8_t; - typedef unsigned short uint16_t; - typedef unsigned int uint32_t; - -// Helper code - not intended to be directly called by the library users. 
May be changed at any time -namespace internal -{ - // Unicode constants - // Leading (high) surrogates: 0xd800 - 0xdbff - // Trailing (low) surrogates: 0xdc00 - 0xdfff - const uint16_t LEAD_SURROGATE_MIN = 0xd800u; - const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; - const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; - const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; - const uint16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10) - const uint32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN - - // Maximum valid value for a Unicode code point - const uint32_t CODE_POINT_MAX = 0x0010ffffu; - - template - inline uint8_t mask8(octet_type oc) - { - return static_cast(0xff & oc); - } - template - inline uint16_t mask16(u16_type oc) - { - return static_cast(0xffff & oc); - } - template - inline bool is_trail(octet_type oc) - { - return ((utf8::internal::mask8(oc) >> 6) == 0x2); - } - - template - inline bool is_lead_surrogate(u16 cp) - { - return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); - } - - template - inline bool is_trail_surrogate(u16 cp) - { - return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); - } - - template - inline bool is_surrogate(u16 cp) - { - return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); - } - - template - inline bool is_code_point_valid(u32 cp) - { - return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); - } - - template - inline typename std::iterator_traits::difference_type - sequence_length(octet_iterator lead_it) - { - uint8_t lead = utf8::internal::mask8(*lead_it); - if (lead < 0x80) - return 1; - else if ((lead >> 5) == 0x6) - return 2; - else if ((lead >> 4) == 0xe) - return 3; - else if ((lead >> 3) == 0x1e) - return 4; - else - return 0; - } - - template - inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length) - { - if (cp < 0x80) { - if (length != 1) - return true; - } - else if (cp < 0x800) { - if (length != 2) - return true; - } - else if (cp < 0x10000) { - if (length != 3) - return true; - } - - return false; - } - - enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; - - /// Helper for get_sequence_x - template - utf_error increase_safely(octet_iterator& it, octet_iterator end) - { - if (++it == end) - return NOT_ENOUGH_ROOM; - - if (!utf8::internal::is_trail(*it)) - return INCOMPLETE_SEQUENCE; - - return UTF8_OK; - } - - #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} - - /// get_sequence_x functions decode utf-8 sequences of the length x - template - utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - return UTF8_OK; - } - - template - utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); - - return UTF8_OK; - } - - template - utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = ((code_point << 12) & 0xffff) + 
((utf8::internal::mask8(*it) << 6) & 0xfff); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point += (*it) & 0x3f; - - return UTF8_OK; - } - - template - utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point += (utf8::internal::mask8(*it) << 6) & 0xfff; - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point += (*it) & 0x3f; - - return UTF8_OK; - } - - #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR - - template - utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - // Save the original value of it so we can go back in case of failure - // Of course, it does not make much sense with i.e. stream iterators - octet_iterator original_it = it; - - uint32_t cp = 0; - // Determine the sequence length based on the lead octet - typedef typename std::iterator_traits::difference_type octet_difference_type; - const octet_difference_type length = utf8::internal::sequence_length(it); - - // Get trail octets and calculate the code point - utf_error err = UTF8_OK; - switch (length) { - case 0: - return INVALID_LEAD; - case 1: - err = utf8::internal::get_sequence_1(it, end, cp); - break; - case 2: - err = utf8::internal::get_sequence_2(it, end, cp); - break; - case 3: - err = utf8::internal::get_sequence_3(it, end, cp); - break; - case 4: - err = utf8::internal::get_sequence_4(it, end, cp); - break; - } - - if (err == UTF8_OK) { - // Decoding succeeded. Now, security checks... - if (utf8::internal::is_code_point_valid(cp)) { - if (!utf8::internal::is_overlong_sequence(cp, length)){ - // Passed! Return here. - code_point = cp; - ++it; - return UTF8_OK; - } - else - err = OVERLONG_SEQUENCE; - } - else - err = INVALID_CODE_POINT; - } - - // Failure branch - restore the original value of the iterator - it = original_it; - return err; - } - - template - inline utf_error validate_next(octet_iterator& it, octet_iterator end) { - uint32_t ignored; - return utf8::internal::validate_next(it, end, ignored); - } - - // Internal implementation of both checked and unchecked append() function - // This function will be invoked by the overloads below, as they will know - // the octet_type. 
- template - octet_iterator append(uint32_t cp, octet_iterator result) { - if (cp < 0x80) // one octet - *(result++) = static_cast(cp); - else if (cp < 0x800) { // two octets - *(result++) = static_cast((cp >> 6) | 0xc0); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else if (cp < 0x10000) { // three octets - *(result++) = static_cast((cp >> 12) | 0xe0); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else { // four octets - *(result++) = static_cast((cp >> 18) | 0xf0); - *(result++) = static_cast(((cp >> 12) & 0x3f)| 0x80); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - return result; - } - - // One of the following overloads will be invoked from the API calls - - // A simple (but dangerous) case: the caller appends byte(s) to a char array - inline char* append(uint32_t cp, char* result) { - return append(cp, result); - } - - // Hopefully, most common case: the caller uses back_inserter - // i.e. append(cp, std::back_inserter(str)); - template - std::back_insert_iterator append - (uint32_t cp, std::back_insert_iterator result) { - return append, - typename container_type::value_type>(cp, result); - } - - // The caller uses some other kind of output operator - not covered above - // Note that in this case we are not able to determine octet_type - // so we assume it's uint_8; that can cause a conversion warning if we are wrong. - template - octet_iterator append(uint32_t cp, octet_iterator result) { - return append(cp, result); - } - -} // namespace internal - - /// The library API - functions intended to be called by the users - - // Byte order mark - const uint8_t bom[] = {0xef, 0xbb, 0xbf}; - - template - octet_iterator find_invalid(octet_iterator start, octet_iterator end) - { - octet_iterator result = start; - while (result != end) { - utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); - if (err_code != internal::UTF8_OK) - return result; - } - return result; - } - - template - inline bool is_valid(octet_iterator start, octet_iterator end) - { - return (utf8::find_invalid(start, end) == end); - } - - template - inline bool starts_with_bom (octet_iterator it, octet_iterator end) - { - return ( - ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && - ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && - ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) - ); - } -} // namespace utf8 - -#endif // header guard - - diff --git a/paddlenlp/ops/fast_transformer/src/demo/utf8/cpp11.h b/paddlenlp/ops/fast_transformer/src/demo/utf8/cpp11.h deleted file mode 100644 index 2366f12915cb..000000000000 --- a/paddlenlp/ops/fast_transformer/src/demo/utf8/cpp11.h +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2018 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies 
or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1 -#define UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1 - -#include "checked.h" -#include - -namespace utf8 -{ - - inline void append(char32_t cp, std::string& s) - { - append(uint32_t(cp), std::back_inserter(s)); - } - - inline std::string utf16to8(const std::u16string& s) - { - std::string result; - utf16to8(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::u16string utf8to16(const std::string& s) - { - std::u16string result; - utf8to16(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::string utf32to8(const std::u32string& s) - { - std::string result; - utf32to8(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::u32string utf8to32(const std::string& s) - { - std::u32string result; - utf8to32(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::size_t find_invalid(const std::string& s) - { - std::string::const_iterator invalid = find_invalid(s.begin(), s.end()); - return (invalid == s.end()) ? std::string::npos : static_cast(invalid - s.begin()); - } - - inline bool is_valid(const std::string& s) - { - return is_valid(s.begin(), s.end()); - } - - inline std::string replace_invalid(const std::string& s, char32_t replacement) - { - std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); - return result; - } - - inline std::string replace_invalid(const std::string& s) - { - std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline bool starts_with_bom(const std::string& s) - { - return starts_with_bom(s.begin(), s.end()); - } - -} // namespace utf8 - -#endif // header guard - diff --git a/paddlenlp/ops/fast_transformer/src/demo/utf8/cpp17.h b/paddlenlp/ops/fast_transformer/src/demo/utf8/cpp17.h deleted file mode 100644 index 32a77ce30750..000000000000 --- a/paddlenlp/ops/fast_transformer/src/demo/utf8/cpp17.h +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2018 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. 
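Taken together with checked.h, these std::string overloads provide the round-trip conversions the GPT demo relies on (utf8::utf8to32 on vocabulary lines). A small self-contained sketch, assuming the code is compiled as C++11 or later so this header is pulled in; the sample string is arbitrary.

    #include <iostream>
    #include <string>

    #include "utf8.h"

    int main() {
      std::string text = "Paddle \xE9\xA3\x9E\xE6\xA1\xA8";  // "Paddle 飞桨" in UTF-8
      try {
        if (utf8::is_valid(text)) {
          std::u32string cps = utf8::utf8to32(text);  // decode to code points
          std::string round = utf8::utf32to8(cps);    // encode back to UTF-8
          std::cout << (round == text) << "\n";       // prints 1 for valid input
        }
      } catch (const utf8::exception& e) {            // invalid_utf8, not_enough_room, ...
        std::cout << e.what() << "\n";
      }
    }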
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9 -#define UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9 - -#include "checked.h" -#include - -namespace utf8 -{ - - inline void append(char32_t cp, std::string& s) - { - append(uint32_t(cp), std::back_inserter(s)); - } - - inline std::string utf16to8(std::u16string_view s) - { - std::string result; - utf16to8(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::u16string utf8to16(std::string_view s) - { - std::u16string result; - utf8to16(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::string utf32to8(std::u32string_view s) - { - std::string result; - utf32to8(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::u32string utf8to32(std::string_view s) - { - std::u32string result; - utf8to32(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::size_t find_invalid(std::string_view s) - { - std::string_view::const_iterator invalid = find_invalid(s.begin(), s.end()); - return (invalid == s.end()) ? std::string_view::npos : static_cast(invalid - s.begin()); - } - - inline bool is_valid(std::string_view s) - { - return is_valid(s.begin(), s.end()); - } - - inline std::string replace_invalid(std::string_view s, char32_t replacement) - { - std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); - return result; - } - - inline std::string replace_invalid(std::string_view s) - { - std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline bool starts_with_bom(std::string_view s) - { - return starts_with_bom(s.begin(), s.end()); - } - -} // namespace utf8 - -#endif // header guard - diff --git a/paddlenlp/ops/fast_transformer/src/demo/utf8/unchecked.h b/paddlenlp/ops/fast_transformer/src/demo/utf8/unchecked.h deleted file mode 100644 index 8fe83c9ecbc7..000000000000 --- a/paddlenlp/ops/fast_transformer/src/demo/utf8/unchecked.h +++ /dev/null @@ -1,257 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "core.h" - -namespace utf8 -{ - namespace unchecked - { - template - octet_iterator append(uint32_t cp, octet_iterator result) - { - return internal::append(cp, result); - } - - template - output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) - { - while (start != end) { - octet_iterator sequence_start = start; - internal::utf_error err_code = utf8::internal::validate_next(start, end); - switch (err_code) { - case internal::UTF8_OK : - for (octet_iterator it = sequence_start; it != start; ++it) - *out++ = *it; - break; - case internal::NOT_ENOUGH_ROOM: - out = utf8::unchecked::append (replacement, out); - start = end; - break; - case internal::INVALID_LEAD: - out = utf8::unchecked::append (replacement, out); - ++start; - break; - case internal::INCOMPLETE_SEQUENCE: - case internal::OVERLONG_SEQUENCE: - case internal::INVALID_CODE_POINT: - out = utf8::unchecked::append (replacement, out); - ++start; - // just one replacement mark for the sequence - while (start != end && utf8::internal::is_trail(*start)) - ++start; - break; - } - } - return out; - } - - template - inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) - { - static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); - return utf8::unchecked::replace_invalid(start, end, out, replacement_marker); - } - - template - uint32_t next(octet_iterator& it) - { - uint32_t cp = utf8::internal::mask8(*it); - typename std::iterator_traits::difference_type length = utf8::internal::sequence_length(it); - switch (length) { - case 1: - break; - case 2: - it++; - cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); - break; - case 3: - ++it; - cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); - ++it; - cp += (*it) & 0x3f; - break; - case 4: - ++it; - cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); - ++it; - cp += (utf8::internal::mask8(*it) << 6) & 0xfff; - ++it; - cp += (*it) & 0x3f; - break; - } - ++it; - return cp; - } - - template - uint32_t peek_next(octet_iterator it) - { - return utf8::unchecked::next(it); - } - - template - uint32_t prior(octet_iterator& it) - { - while (utf8::internal::is_trail(*(--it))) ; - octet_iterator temp = it; - return utf8::unchecked::next(temp); - } - - template - void advance (octet_iterator& it, distance_type n) - { - const distance_type zero(0); - if (n < zero) { - // backward - for (distance_type i = n; i < zero; ++i) - utf8::unchecked::prior(it); - } else { - // forward - for (distance_type i = zero; i < n; ++i) - utf8::unchecked::next(it); - } - } - - template - typename std::iterator_traits::difference_type - distance (octet_iterator first, octet_iterator last) - { - typename std::iterator_traits::difference_type dist; - for (dist = 0; first < last; ++dist) - utf8::unchecked::next(first); - return 
dist; - } - - template - octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) - { - while (start != end) { - uint32_t cp = utf8::internal::mask16(*start++); - // Take care of surrogate pairs first - if (utf8::internal::is_lead_surrogate(cp)) { - uint32_t trail_surrogate = utf8::internal::mask16(*start++); - cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; - } - result = utf8::unchecked::append(cp, result); - } - return result; - } - - template - u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) - { - while (start < end) { - uint32_t cp = utf8::unchecked::next(start); - if (cp > 0xffff) { //make a surrogate pair - *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); - *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); - } - else - *result++ = static_cast(cp); - } - return result; - } - - template - octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) - { - while (start != end) - result = utf8::unchecked::append(*(start++), result); - - return result; - } - - template - u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) - { - while (start < end) - (*result++) = utf8::unchecked::next(start); - - return result; - } - - // The iterator class - template - class iterator { - octet_iterator it; - public: - typedef uint32_t value_type; - typedef uint32_t* pointer; - typedef uint32_t& reference; - typedef std::ptrdiff_t difference_type; - typedef std::bidirectional_iterator_tag iterator_category; - iterator () {} - explicit iterator (const octet_iterator& octet_it): it(octet_it) {} - // the default "big three" are OK - octet_iterator base () const { return it; } - uint32_t operator * () const - { - octet_iterator temp = it; - return utf8::unchecked::next(temp); - } - bool operator == (const iterator& rhs) const - { - return (it == rhs.it); - } - bool operator != (const iterator& rhs) const - { - return !(operator == (rhs)); - } - iterator& operator ++ () - { - ::std::advance(it, utf8::internal::sequence_length(it)); - return *this; - } - iterator operator ++ (int) - { - iterator temp = *this; - ::std::advance(it, utf8::internal::sequence_length(it)); - return temp; - } - iterator& operator -- () - { - utf8::unchecked::prior(it); - return *this; - } - iterator operator -- (int) - { - iterator temp = *this; - utf8::unchecked::prior(it); - return temp; - } - }; // class iterator - - } // namespace utf8::unchecked -} // namespace utf8 - - -#endif // header guard - diff --git a/paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.cc deleted file mode 100644 index ed182f854001..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.cc +++ /dev/null @@ -1,352 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include - -#include "fusion_bart_decoding_op.h" -#include "pd_traits.h" - - -std::vector BartDecodingForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const float& temperature, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const int64_t& min_len, - const float& beam_search_diversity_rate, - const bool& rel_len, - const float& alpha, - const bool& early_stopping) { - int batch_size = input.shape()[0]; - int max_out_len = rel_len ? max_len + input.shape()[1] : max_len; - int min_out_len = rel_len ? min_len + input.shape()[1] : min_len; - - std::vector output_dims; - std::vector parent_ids_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - batch_size /= beam_size; - output_dims = {max_out_len, batch_size, beam_size}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_out_len, batch_size, beam_size * 2}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - output_dims = {max_out_len, batch_size}; - parent_ids_dims = {1}; - } else { - PD_THROW("Not supported decoding strategy. 
"); - } - - if (input.place() == paddle::PlaceType::kGPU) { - auto output_ids = paddle::Tensor(paddle::PlaceType::kGPU, output_dims); - auto parent_ids = paddle::Tensor(paddle::PlaceType::kGPU, parent_ids_dims); - auto sequence_length = - paddle::Tensor(paddle::PlaceType::kGPU, sequence_length_dims); - - paddle::Tensor seq_len = paddle::Tensor(paddle::PlaceType::kGPU); - - if (mem_seq_len.place() != paddle::PlaceType::kGPU) { - seq_len = mem_seq_len.copy_to(paddle::PlaceType::kGPU); - } else { - seq_len = mem_seq_len; - } - - return BartDecodingCUDAForward(input, - seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - temperature, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_out_len, - min_out_len, - beam_search_diversity_rate, - alpha, - early_stopping); - } else { - PD_THROW("Not implemented place. Only GPU is supported. "); - } -} - -std::vector> BartDecodingInferShape( - const std::vector& input_shape, - const std::vector& mem_seq_len_shape, - const std::vector& word_embedding_shape, - const std::vector>& self_ln_weight_shapes, - const std::vector>& self_ln_bias_shapes, - const std::vector>& self_q_weight_shapes, - const std::vector>& self_q_bias_shapes, - const std::vector>& self_k_weight_shapes, - const std::vector>& self_k_bias_shapes, - const std::vector>& self_v_weight_shapes, - const std::vector>& self_v_bias_shapes, - const std::vector>& self_out_weight_shapes, - const std::vector>& self_out_bias_shapes, - const std::vector>& cross_ln_weight_shapes, - const std::vector>& cross_ln_bias_shapes, - const std::vector>& cross_q_weight_shapes, - const std::vector>& cross_q_bias_shapes, - const std::vector>& cross_k_weight_shapes, - const std::vector>& cross_k_bias_shapes, - const std::vector>& cross_v_weight_shapes, - const std::vector>& cross_v_bias_shapes, - const std::vector>& cross_out_weight_shapes, - const std::vector>& cross_out_bias_shapes, - const std::vector>& ffn_ln_weight_shapes, - const std::vector>& ffn_ln_bias_shapes, - const std::vector>& ffn_inter_weight_shapes, - const std::vector>& ffn_inter_bias_shapes, - const std::vector>& ffn_out_weight_shapes, - const std::vector>& ffn_out_bias_shapes, - const std::vector& decoder_ln_weight_shape, - const std::vector& decoder_ln_bias_shape, - const std::vector& embedding_weight_shape, - const std::vector& embedding_bias_shape, - const std::vector& positional_embedding_weight_shape, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const float& temperature, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const int64_t& min_len, - const float& beam_search_diversity_rate, - const bool& rel_len, - const float& alpha, - const bool& early_stopping) { - int batch_size = input_shape[0]; - - std::vector output_dims; - std::vector 
sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - if (batch_size != -1) { - batch_size /= beam_size; - } - output_dims = {max_len, batch_size, beam_size}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_len, batch_size, beam_size * 2}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - output_dims = {max_len, batch_size}; - return {output_dims, {1}, sequence_length_dims}; - } else { - PD_THROW("Not supported decoding strategy. "); - } -} - -std::vector BartDecodingInferDtype( - const paddle::DataType& input, - const paddle::DataType& mem_seq_len, - const paddle::DataType& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::DataType& decoder_ln_weight, - const paddle::DataType& decoder_ln_bias, - const paddle::DataType& embedding_weight, - const paddle::DataType& embedding_bias, - const paddle::DataType& positional_embedding_weight) { - return {paddle::DataType::INT32, - paddle::DataType::INT32, - paddle::DataType::INT32}; -} - -PD_BUILD_OP(fusion_bart_decoding) - .Inputs({"Input", - "MemSeqLen", - "WordEmbedding", - paddle::Vec("SelfLayernormWeight"), - paddle::Vec("SelfLayernormBias"), - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfQueryBias"), - paddle::Vec("SelfKeyWeight"), - paddle::Vec("SelfKeyBias"), - paddle::Vec("SelfValueWeight"), - paddle::Vec("SelfValueBias"), - paddle::Vec("SelfOutWeight"), - paddle::Vec("SelfOutBias"), - paddle::Vec("CrossLayernormWeight"), - paddle::Vec("CrossLayernormBias"), - paddle::Vec("CrossQueryWeight"), - paddle::Vec("CrossQueryBias"), - paddle::Vec("CrossKeyWeight"), - paddle::Vec("CrossKeyBias"), - paddle::Vec("CrossValueWeight"), - paddle::Vec("CrossValueBias"), - paddle::Vec("CrossOutWeight"), - paddle::Vec("CrossOutBias"), - paddle::Vec("FFNLayernormWeight"), - paddle::Vec("FFNLayernormBias"), - paddle::Vec("FFNInterWeight"), - paddle::Vec("FFNInterBias"), - paddle::Vec("FFNOutWeight"), - paddle::Vec("FFNOutBias"), - "DecoderLayernormWeight", - "DecoderLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb"}) - .Outputs({"OutputIds", "ParentIds", 
"SequenceLength"}) - .Attrs({"decoding_strategy: std::string", - "beam_size: int", - "topk: int", - "topp: float", - "temperature: float", - "n_head: int", - "size_per_head: int", - "num_layer: int", - "bos_id: int", - "eos_id: int", - "max_len: int64_t", - "min_len: int64_t", - "beam_search_diversity_rate: float", - "rel_len: bool", - "alpha: float", - "early_stopping: bool"}) - .SetKernelFn(PD_KERNEL(BartDecodingForward)) - .SetInferShapeFn(PD_INFER_SHAPE(BartDecodingInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(BartDecodingInferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.cu deleted file mode 100644 index 11d454156788..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.cu +++ /dev/null @@ -1,581 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cublas_handle.h" - -#include "fusion_bart_decoding_op.h" -#include "pd_traits.h" - -template -std::vector bart_decoding_kernel( - const paddle::Tensor& input, - const paddle::Tensor& memory_sequence_length, - const paddle::Tensor& word_emb, - const std::vector& self_layernorm_weight, - const std::vector& self_layernorm_bias, - const std::vector& self_attn_query_weight, - const std::vector& self_attn_query_bias, - const std::vector& self_attn_key_weight, - const std::vector& self_attn_key_bias, - const std::vector& self_attn_value_weight, - const std::vector& self_attn_value_bias, - const std::vector& self_attn_output_weight, - const std::vector& self_attn_output_bias, - const std::vector& cross_layernorm_weight, - const std::vector& cross_layernorm_bias, - const std::vector& cross_attn_query_weight, - const std::vector& cross_attn_query_bias, - const std::vector& cross_attn_key_weight, - const std::vector& cross_attn_key_bias, - const std::vector& cross_attn_value_weight, - const std::vector& cross_attn_value_bias, - const std::vector& cross_attn_output_weight, - const std::vector& cross_attn_output_bias, - const std::vector& ffn_layernorm_weight, - const std::vector& ffn_layernorm_bias, - const std::vector& ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - const paddle::Tensor& decoder_layernorm_weight, - const paddle::Tensor& decoder_layernorm_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& position_encoding_table, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const float& temperature, - const int& head_num_, - const int& size_per_head_, - const int& num_layer_, - const int& start_id_, - const int& end_id_, - const int64_t& max_seq_len_, - const int64_t& 
min_seq_len_, - const float& beam_search_diversity_rate_, - const float& alpha, - const bool& early_stopping, - cudaStream_t stream) { - int beam_width_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? beam_size - : 1; - int candidate_num_ = (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") - ? topk - : 1; - float probability_threshold_ = (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") - ? topp - : 0.0; - - auto input_dims = input.shape(); - int batch_size_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? input_dims[0] / beam_width_ - : input_dims[0]; - const int memory_max_seq_len = input_dims[1]; - const int memory_hidden_dim = input_dims[2]; - const int vocab_size = word_emb.shape()[0]; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - DecodingInitParam decoding_params; - decoding_params.cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - decoding_params.cublaslt_handle = - CublasHandle::GetInstance()->cublaslt_handle_; - - decoding_params.output_ids = output_ids.mutable_data(input.place()); - decoding_params.parent_ids = parent_ids.mutable_data(input.place()); - decoding_params.sequence_length = - sequence_length.mutable_data(input.place()); - - typedef DecoderTransformerTraits DecodingTraits_; - decoding_params.stream = stream; - fastertransformer::Allocator allocator_(stream); - - decoding_params.memory_tensor = - reinterpret_cast(input.data()); - decoding_params.memory_sequence_length = memory_sequence_length.data(); - - //TODO(gongenlei): Support MP & PP - TensorParallelParam tensor_parallel_param; - LayerParallelParam layer_parallel_param; - tensor_parallel_param.rank = 0; - tensor_parallel_param.world_size = 1; - tensor_parallel_param.local_head_num_ = head_num_; - tensor_parallel_param.local_hidden_units_ = memory_hidden_dim; - layer_parallel_param.rank = 0; - layer_parallel_param.world_size = 1; - layer_parallel_param.layers_per_group = num_layer_; - layer_parallel_param.local_batch_size = batch_size_; - - DecoderInitParam* params = - new DecoderInitParam[num_layer_]; - - for (int i = 0; i < num_layer_; i++) { - params[i].stream = stream; - params[i].cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - params[i].cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - - if (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - params[i].request_batch_size = batch_size_ * beam_width_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } else if (decoding_strategy == "sampling" || - decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - params[i].request_batch_size = batch_size_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } - - // self attn - params[i].self_layernorm.gamma = reinterpret_cast( - self_layernorm_weight[i].data()); - params[i].self_layernorm.beta = reinterpret_cast( - self_layernorm_bias[i].data()); - // query - params[i].self_attention.query_weight.kernel = - reinterpret_cast( - self_attn_query_weight[i].data()); - params[i].self_attention.query_weight.bias = - reinterpret_cast( - self_attn_query_bias[i].data()); - // // key - // params[i].self_attention.key_weight.kernel = - // reinterpret_cast( - // 
self_attn_key_weight[i].data()); - // params[i].self_attention.key_weight.bias = - // reinterpret_cast( - // self_attn_key_bias[i].data()); - // // value - // params[i].self_attention.value_weight.kernel = - // reinterpret_cast( - // self_attn_value_weight[i].data()); - // params[i].self_attention.value_weight.bias = - // reinterpret_cast( - // self_attn_value_bias[i].data()); - - // key - params[i].self_attention.key_weight.kernel = nullptr; - params[i].self_attention.key_weight.bias = nullptr; - // value - params[i].self_attention.value_weight.kernel = nullptr; - params[i].self_attention.value_weight.bias = nullptr; - - // out proj - params[i].self_attention.attention_output_weight.kernel = - reinterpret_cast( - self_attn_output_weight[i].data()); - params[i].self_attention.attention_output_weight.bias = - reinterpret_cast( - self_attn_output_bias[i].data()); - - // cross - params[i].cross_layernorm.gamma = reinterpret_cast( - cross_layernorm_weight[i].data()); - params[i].cross_layernorm.beta = reinterpret_cast( - cross_layernorm_bias[i].data()); - // query - params[i].cross_attention.query_weight.kernel = - reinterpret_cast( - cross_attn_query_weight[i].data()); - params[i].cross_attention.query_weight.bias = - reinterpret_cast( - cross_attn_query_bias[i].data()); - // key - params[i].cross_attention.key_weight.kernel = - reinterpret_cast( - cross_attn_key_weight[i].data()); - params[i].cross_attention.key_weight.bias = - reinterpret_cast( - cross_attn_key_bias[i].data()); - // value - params[i].cross_attention.value_weight.kernel = - reinterpret_cast( - cross_attn_value_weight[i].data()); - params[i].cross_attention.value_weight.bias = - reinterpret_cast( - cross_attn_value_bias[i].data()); - // out proj - params[i].cross_attention.attention_output_weight.kernel = - reinterpret_cast( - cross_attn_output_weight[i].data()); - params[i].cross_attention.attention_output_weight.bias = - reinterpret_cast( - cross_attn_output_bias[i].data()); - - // ffn - params[i].ffn_layernorm.gamma = reinterpret_cast( - ffn_layernorm_weight[i].data()); - params[i].ffn_layernorm.beta = reinterpret_cast( - ffn_layernorm_bias[i].data()); - // intermediate proj - params[i].ffn.intermediate_weight.kernel = - reinterpret_cast( - ffn_intermediate_weight[i].data()); - params[i].ffn.intermediate_weight.bias = reinterpret_cast( - ffn_intermediate_bias[i].data()); - // out proj - params[i].ffn.output_weight.kernel = reinterpret_cast( - ffn_output_weight[i].data()); - params[i].ffn.output_weight.bias = - reinterpret_cast(ffn_output_bias[i].data()); - } - - decoding_params.layernorm.gamma = reinterpret_cast( - decoder_layernorm_weight.data()); - decoding_params.layernorm.beta = reinterpret_cast( - decoder_layernorm_bias.data()); - // for embedding - decoding_params.embedding_table = - reinterpret_cast(word_emb.data()); - - // for weight sharing matmul - decoding_params.embedding_kernel = - reinterpret_cast(embedding_weight.data()); - // for matmul bias - decoding_params.embedding_bias = - reinterpret_cast(embedding_bias.data()); - decoding_params.position_encoding_table = reinterpret_cast( - position_encoding_table.data()); - - int finished_candidate_num_ = - ("beam_search_v3" == decoding_strategy) ? 
beam_width_ : beam_width_ * 2; - - if ("beam_search" == decoding_strategy) { - DecodingBeamsearch* decoding_beamsearch_; - decoding_beamsearch_ = new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, /*is_fuse_topk_softMax*/ - true, /*is_fuse_qkv*/ - false, /*keep_alive_beam*/ - alpha, - false, /*normalization_before*/ - 2, /*pos_offset*/ - ActivationType::GELU, - false, /*pos_bias*/ - false, /*prefix_lm*/ - -1, /*finished_candidate_num*/ - false, /*early_stopping*/ - false, /*is_mbart*/ - min_seq_len_); - - decoding_beamsearch_->set_tensor_parallel_param( - tensor_parallel_param); - decoding_beamsearch_->set_layer_parallel_param( - layer_parallel_param); - - decoding_beamsearch_->forward(params, decoding_params); - - delete decoding_beamsearch_; - } else if ("beam_search_v2" == decoding_strategy || - "beam_search_v3" == decoding_strategy) { - DecodingBeamsearch* decoding_beamsearch_; - decoding_beamsearch_ = new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, /*is_fuse_topk_softMax*/ - true, /*is_fuse_qkv*/ - true, /*keep_alive_beam*/ - alpha, - false, /*normalization_before*/ - 2, - ActivationType::GELU, - false, /*pos_bias*/ - false, /*prefix_lm*/ - finished_candidate_num_, - early_stopping, - false, /*is_mbart*/ - min_seq_len_); - - decoding_beamsearch_->set_tensor_parallel_param( - tensor_parallel_param); - decoding_beamsearch_->set_layer_parallel_param( - layer_parallel_param); - - decoding_beamsearch_->forward(params, decoding_params); - - delete decoding_beamsearch_; - } else if ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || - "sampling" == decoding_strategy) { - DecodingSampling* decoding_sampling_; - decoding_sampling_ = - new DecodingSampling(allocator_, - batch_size_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - candidate_num_, - probability_threshold_, - true, /*is_fuse_qkv*/ - false, /*normalization_before*/ - 2, /*pos_offset*/ - ActivationType::GELU, - false, /*pos_bias*/ - temperature, /*temperature*/ - 1.0, /*repeat_penalty*/ - false, /*prefix_lm*/ - false, /*is_mbart*/ - min_seq_len_); - decoding_sampling_->set_tensor_parallel_param( - tensor_parallel_param); - decoding_sampling_->set_layer_parallel_param( - layer_parallel_param); - - decoding_sampling_->forward(params, decoding_params); - - delete decoding_sampling_; - } else { - PD_THROW( - "Only beam_search, topk_sampling and topp_sampling are supported for " - "FastGeneration. 
"); - } - delete[] params; - - return {output_ids, parent_ids, sequence_length}; -} - -std::vector BartDecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const float& temperature, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const int64_t& min_len, - const float& beam_search_diversity_rate, - const float& alpha, - const bool& early_stopping) { - auto stream = input.stream(); - - cublasSetStream(CublasHandle::GetInstance()->cublas_handle_, stream); - - std::vector ret; - - switch (input.type()) { - case paddle::DataType::FLOAT16: { - ret = bart_decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - temperature, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - min_len, - beam_search_diversity_rate, - alpha, - early_stopping, - stream); - break; - } - case paddle::DataType::FLOAT32: { - ret = bart_decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - 
decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - temperature, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - min_len, - beam_search_diversity_rate, - alpha, - early_stopping, - stream); - break; - } - default: { - PD_THROW( - "NOT supported data type. " - "Only float16 and float32 are supported. "); - break; - } - } - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.h b/paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.h deleted file mode 100644 index 82219aba6ebe..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.h +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include -#include - -#include "fastertransformer/decoding_beamsearch.h" -#include "fastertransformer/decoding_sampling.h" -#include "fastertransformer/open_decoder.h" -#include "fastertransformer/utils/common.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - - -std::vector BartDecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const float& temperature, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const int64_t& min_len, - const float& beam_search_diversity_rate, - 
const float& alpha, - const bool& early_stopping); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_decoder_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_decoder_op.cc deleted file mode 100644 index e0d055bde5fe..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_decoder_op.cc +++ /dev/null @@ -1,228 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include - -#include "fusion_decoder_op.h" -#include "pd_traits.h" - - -std::vector DecoderForward( - const paddle::Tensor& from_tensor, - const paddle::Tensor& memory_tensor, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& self_ln_weight, - const paddle::Tensor& self_ln_bias, - const paddle::Tensor& self_q_weight, - const paddle::Tensor& self_q_bias, - const paddle::Tensor& self_k_weight, - const paddle::Tensor& self_k_bias, - const paddle::Tensor& self_v_weight, - const paddle::Tensor& self_v_bias, - const paddle::Tensor& self_out_weight, - const paddle::Tensor& self_out_bias, - const paddle::Tensor& cross_ln_weight, - const paddle::Tensor& cross_ln_bias, - const paddle::Tensor& cross_q_weight, - const paddle::Tensor& cross_q_bias, - const paddle::Tensor& cross_k_weight, - const paddle::Tensor& cross_k_bias, - const paddle::Tensor& cross_v_weight, - const paddle::Tensor& cross_v_bias, - const paddle::Tensor& cross_out_weight, - const paddle::Tensor& cross_out_bias, - const paddle::Tensor& ffn_ln_weight, - const paddle::Tensor& ffn_ln_bias, - const paddle::Tensor& ffn_inter_weight, - const paddle::Tensor& ffn_inter_bias, - const paddle::Tensor& ffn_out_weight, - const paddle::Tensor& ffn_out_bias, - const paddle::Tensor& old_self_cache_key, - const paddle::Tensor& old_self_cache_value, - const paddle::Tensor& old_mem_cache, - const int step, - int n_head, - int size_per_head, - int memory_hidden_dim, - bool is_fuse_qkv) { - const int batch_size = memory_tensor.shape()[0]; - std::vector output_dims; - output_dims = {batch_size, 1, n_head * size_per_head}; - - auto new_self_cache_key = old_self_cache_key; - auto new_self_cache_value = old_self_cache_value; - auto new_mem_cache = old_mem_cache; - - if (from_tensor.place() == paddle::PlaceType::kGPU) { - auto decoder_output = paddle::Tensor(from_tensor.place(), output_dims); - - paddle::Tensor _mem_seq_len = paddle::Tensor(paddle::PlaceType::kGPU); - - if (mem_seq_len.place() != paddle::PlaceType::kGPU) { - _mem_seq_len = mem_seq_len.copy_to(paddle::PlaceType::kGPU); - } else { - _mem_seq_len = mem_seq_len; - } - - return DecoderCUDAForward(from_tensor, - memory_tensor, - _mem_seq_len, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - 
ffn_out_weight, - ffn_out_bias, - old_self_cache_key, - old_self_cache_value, - old_mem_cache, - step, - decoder_output, - new_self_cache_key, - new_self_cache_value, - new_mem_cache, - n_head, - size_per_head, - memory_hidden_dim, - is_fuse_qkv); - } else { - PD_THROW("Not implemented place. Only GPU is supported. "); - } -} - -std::vector> DecoderInferShape( - const std::vector& from_tensor_shape, - const std::vector& memory_tensor_shape, - const std::vector& mem_seq_len_shape, - const std::vector& self_ln_weight_shapes, - const std::vector& self_ln_bias_shapes, - const std::vector& self_q_weight_shapes, - const std::vector& self_q_bias_shapes, - const std::vector& self_k_weight_shapes, - const std::vector& self_k_bias_shapes, - const std::vector& self_v_weight_shapes, - const std::vector& self_v_bias_shapes, - const std::vector& self_out_weight_shapes, - const std::vector& self_out_bias_shapes, - const std::vector& cross_ln_weight_shapes, - const std::vector& cross_ln_bias_shapes, - const std::vector& cross_q_weight_shapes, - const std::vector& cross_q_bias_shapes, - const std::vector& cross_k_weight_shapes, - const std::vector& cross_k_bias_shapes, - const std::vector& cross_v_weight_shapes, - const std::vector& cross_v_bias_shapes, - const std::vector& cross_out_weight_shapes, - const std::vector& cross_out_bias_shapes, - const std::vector& ffn_ln_weight_shapes, - const std::vector& ffn_ln_bias_shapes, - const std::vector& ffn_inter_weight_shapes, - const std::vector& ffn_inter_bias_shapes, - const std::vector& ffn_out_weight_shapes, - const std::vector& ffn_out_bias_shapes, - const std::vector& old_self_cache_key_shape, - const std::vector& old_self_cache_value_shape, - const std::vector& old_mem_cache_shape, - const int& step, - const int& n_head, - const int& size_per_head, - const int& memory_hidden_dim, - const bool& is_fuse_qkv) { - return {from_tensor_shape, - old_self_cache_key_shape, - old_self_cache_value_shape, - old_mem_cache_shape}; -} - -std::vector DecoderInferDtype( - const paddle::DataType& from_tensor, - const paddle::DataType& memory_tensor, - const paddle::DataType& mem_seq_len, - const paddle::DataType& self_ln_weight, - const paddle::DataType& self_ln_bias, - const paddle::DataType& self_q_weight, - const paddle::DataType& self_q_bias, - const paddle::DataType& self_k_weight, - const paddle::DataType& self_k_bias, - const paddle::DataType& self_v_weight, - const paddle::DataType& self_v_bias, - const paddle::DataType& self_out_weight, - const paddle::DataType& self_out_bias, - const paddle::DataType& cross_ln_weight, - const paddle::DataType& cross_ln_bias, - const paddle::DataType& cross_q_weight, - const paddle::DataType& cross_q_bias, - const paddle::DataType& cross_k_weight, - const paddle::DataType& cross_k_bias, - const paddle::DataType& cross_v_weight, - const paddle::DataType& cross_v_bias, - const paddle::DataType& cross_out_weight, - const paddle::DataType& cross_out_bias, - const paddle::DataType& ffn_ln_weight, - const paddle::DataType& ffn_ln_bias, - const paddle::DataType& ffn_inter_weight, - const paddle::DataType& ffn_inter_bias, - const paddle::DataType& ffn_out_weight, - const paddle::DataType& ffn_out_bias, - const paddle::DataType& old_self_cache_key, - const paddle::DataType& old_self_cache_value, - const paddle::DataType& old_mem_cache) { - return {from_tensor, old_self_cache_key, old_self_cache_value, old_mem_cache}; -} - -PD_BUILD_OP(fusion_decoder) - .Inputs( - {"FromTensor", "MemoryTensor", "MemSeqLen", - "SelfLayernormWeight", 
"SelfLayernormBias", "SelfQueryWeight", - "SelfQueryBias", "SelfKeyWeight", "SelfKeyBias", - "SelfValueWeight", "SelfValueBias", "SelfOutWeight", - "SelfOutBias", "CrossLayernormWeight", "CrossLayernormBias", - "CrossQueryWeight", "CrossQueryBias", "CrossKeyWeight", - "CrossKeyBias", "CrossValueWeight", "CrossValueBias", - "CrossOutWeight", "CrossOutBias", "FFNLayernormWeight", - "FFNLayernormBias", "FFNInterWeight", "FFNInterBias", - "FFNOutWeight", "FFNOutBias", "OldSelfCacheKey", - "OldSelfCacheValue", "OldMemCache"}) - .Outputs({"DecoderOutput", - "NewSelfCacheKey", - "NewSelfCacheValue", - "NewMemCache"}) - .Attrs({"step: int", - "n_head: int", - "size_per_head: int", - "memory_hidden_dim: int", - "is_fuse_qkv: bool"}) - .SetKernelFn(PD_KERNEL(DecoderForward)) - .SetInferShapeFn(PD_INFER_SHAPE(DecoderInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(DecoderInferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_decoder_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_decoder_op.cu deleted file mode 100644 index efe05f4be58e..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_decoder_op.cu +++ /dev/null @@ -1,374 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "fusion_decoder_op.h" -#include "pd_traits.h" - - -template -std::vector decoder_kernel( - const paddle::Tensor& from_tensor_input, - const paddle::Tensor& memory_tensor_input, - const paddle::Tensor& mem_seq_len_input, - const paddle::Tensor& self_ln_weight, - const paddle::Tensor& self_ln_bias, - const paddle::Tensor& self_q_weight, - const paddle::Tensor& self_q_bias, - const paddle::Tensor& self_k_weight, - const paddle::Tensor& self_k_bias, - const paddle::Tensor& self_v_weight, - const paddle::Tensor& self_v_bias, - const paddle::Tensor& self_out_weight, - const paddle::Tensor& self_out_bias, - const paddle::Tensor& cross_ln_weight, - const paddle::Tensor& cross_ln_bias, - const paddle::Tensor& cross_q_weight, - const paddle::Tensor& cross_q_bias, - const paddle::Tensor& cross_k_weight, - const paddle::Tensor& cross_k_bias, - const paddle::Tensor& cross_v_weight, - const paddle::Tensor& cross_v_bias, - const paddle::Tensor& cross_out_weight, - const paddle::Tensor& cross_out_bias, - const paddle::Tensor& ffn_ln_weight, - const paddle::Tensor& ffn_ln_bias, - const paddle::Tensor& ffn_inter_weight, - const paddle::Tensor& ffn_inter_bias, - const paddle::Tensor& ffn_out_weight, - const paddle::Tensor& ffn_out_bias, - const paddle::Tensor& old_self_cache_key, - const paddle::Tensor& old_self_cache_value, - const paddle::Tensor& old_mem_cache, - const int step, - paddle::Tensor& decoder_output_tensor, - paddle::Tensor& new_self_cache_key, - paddle::Tensor& new_self_cache_value, - paddle::Tensor& new_mem_cache, - int n_head, - int size_per_head, - int memory_hidden_dim, - bool is_fuse_qkv, - cublasHandle_t cublas_handle_, - cublasLtHandle_t 
cublaslt_handle_, - cudaStream_t stream) { - auto input_dims = memory_tensor_input.shape(); - const int batch_size_ = static_cast(input_dims[0]); - const int max_seq_len_ = static_cast(input_dims[1]); - const int memory_hidden_dim_ = static_cast(memory_hidden_dim); - const bool is_fuse_qkv_ = static_cast(is_fuse_qkv); - - // Detect we use batch major - bool use_batch_major = - (old_self_cache_key.shape().size() == 5) ? true : false; - // we use decoder_max_seq_len == -1 to tell the decoder we use seq major cache - // format - int decoder_max_seq_len = - (use_batch_major) ? (int)old_self_cache_value.shape()[2] : -1; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - typedef DecoderTransformerTraits DecoderTraits_; - OpenDecoder* decoder_; - decoder_ = new OpenDecoder(n_head, - size_per_head, - memory_hidden_dim_, - is_fuse_qkv_, - true, - ActivationType::RELU); - - DataType_* decoder_output = reinterpret_cast( - decoder_output_tensor.mutable_data()); - DataType_* self_cache_key_tensor = reinterpret_cast( - const_cast(old_self_cache_key.data())); - DataType_* self_cache_value_tensor = reinterpret_cast( - const_cast(old_self_cache_value.data())); - DataType_* memory_cache = reinterpret_cast( - const_cast(old_mem_cache.data())); - const DataType_* from_tensor = - reinterpret_cast(from_tensor_input.data()); - const DataType_* memory_tensor = - reinterpret_cast(memory_tensor_input.data()); - const int* memory_sequence_length = mem_seq_len_input.data(); - - DecoderInitParam params; - params.cublas_handle = cublas_handle_; - params.cublaslt_handle = cublaslt_handle_; - params.stream = stream; - params.request_max_mem_seq_len = max_seq_len_; - params.request_batch_size = batch_size_; - fastertransformer::Allocator allocator_(stream); - - params.self_layernorm.gamma = - reinterpret_cast(self_ln_weight.data()); - params.self_layernorm.beta = - reinterpret_cast(self_ln_bias.data()); - params.self_attention.query_weight.kernel = - reinterpret_cast(self_q_weight.data()); - params.self_attention.query_weight.bias = - reinterpret_cast(self_q_bias.data()); - params.self_attention.key_weight.kernel = - reinterpret_cast(self_k_weight.data()); - params.self_attention.key_weight.bias = - reinterpret_cast(self_k_bias.data()); - params.self_attention.value_weight.kernel = - reinterpret_cast(self_v_weight.data()); - params.self_attention.value_weight.bias = - reinterpret_cast(self_v_bias.data()); - params.self_attention.attention_output_weight.kernel = - reinterpret_cast(self_out_weight.data()); - params.self_attention.attention_output_weight.bias = - reinterpret_cast(self_out_bias.data()); - params.cross_layernorm.gamma = - reinterpret_cast(cross_ln_weight.data()); - params.cross_layernorm.beta = - reinterpret_cast(cross_ln_bias.data()); - params.cross_attention.query_weight.kernel = - reinterpret_cast(cross_q_weight.data()); - params.cross_attention.query_weight.bias = - reinterpret_cast(cross_q_bias.data()); - params.cross_attention.key_weight.kernel = - reinterpret_cast(cross_k_weight.data()); - params.cross_attention.key_weight.bias = - reinterpret_cast(cross_k_bias.data()); - params.cross_attention.value_weight.kernel = - reinterpret_cast(cross_v_weight.data()); - params.cross_attention.value_weight.bias = - reinterpret_cast(cross_v_bias.data()); - params.cross_attention.attention_output_weight.kernel = - reinterpret_cast(cross_out_weight.data()); - params.cross_attention.attention_output_weight.bias = - 
reinterpret_cast(cross_out_bias.data()); - params.ffn_layernorm.gamma = - reinterpret_cast(ffn_ln_weight.data()); - params.ffn_layernorm.beta = - reinterpret_cast(ffn_ln_bias.data()); - params.ffn.intermediate_weight.kernel = - reinterpret_cast(ffn_inter_weight.data()); - params.ffn.intermediate_weight.bias = - reinterpret_cast(ffn_inter_bias.data()); - params.ffn.output_weight.kernel = - reinterpret_cast(ffn_out_weight.data()); - params.ffn.output_weight.bias = - reinterpret_cast(ffn_out_bias.data()); - - const int local_step = static_cast(step) + 1; - const int hidden_units = n_head * size_per_head; - DataType_* K_cache = self_cache_key_tensor; - DataType_* V_cache = self_cache_value_tensor; - DataType_* K_mem_cache = memory_cache; - DataType_* V_mem_cache = - memory_cache + batch_size_ * max_seq_len_ * hidden_units; - decoder_->set_max_batch_size(batch_size_); - - const int decoder_buffer_size = - decoder_->getWorkspaceSize() * sizeof(DataType_); - void* buf = - allocator_.malloc(((sizeof(DataType_) == 2) ? CUBLAS_WORKSPACE_SIZE : 0) + - decoder_buffer_size); - void* cublas_workspace = nullptr; - DataType_* decoder_buffer = (DataType_*)buf; - if (sizeof(DataType_) == 2) // half - { - cublas_workspace = buf; - decoder_buffer = - (DataType_*)((char*)cublas_workspace + CUBLAS_WORKSPACE_SIZE); - } - decoder_->initialize(params, decoder_buffer, cublas_workspace); - decoder_->forward(from_tensor, - memory_tensor, - K_cache, - V_cache, - K_mem_cache, - V_mem_cache, - memory_sequence_length, - decoder_output, - local_step, - decoder_max_seq_len, - true); - allocator_.free(decoder_buffer); - delete decoder_; - return {decoder_output_tensor, - new_self_cache_key, - new_self_cache_value, - new_mem_cache}; -} - -std::vector DecoderCUDAForward( - const paddle::Tensor& from_tensor, - const paddle::Tensor& memory_tensor, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& self_ln_weight, - const paddle::Tensor& self_ln_bias, - const paddle::Tensor& self_q_weight, - const paddle::Tensor& self_q_bias, - const paddle::Tensor& self_k_weight, - const paddle::Tensor& self_k_bias, - const paddle::Tensor& self_v_weight, - const paddle::Tensor& self_v_bias, - const paddle::Tensor& self_out_weight, - const paddle::Tensor& self_out_bias, - const paddle::Tensor& cross_ln_weight, - const paddle::Tensor& cross_ln_bias, - const paddle::Tensor& cross_q_weight, - const paddle::Tensor& cross_q_bias, - const paddle::Tensor& cross_k_weight, - const paddle::Tensor& cross_k_bias, - const paddle::Tensor& cross_v_weight, - const paddle::Tensor& cross_v_bias, - const paddle::Tensor& cross_out_weight, - const paddle::Tensor& cross_out_bias, - const paddle::Tensor& ffn_ln_weight, - const paddle::Tensor& ffn_ln_bias, - const paddle::Tensor& ffn_inter_weight, - const paddle::Tensor& ffn_inter_bias, - const paddle::Tensor& ffn_out_weight, - const paddle::Tensor& ffn_out_bias, - const paddle::Tensor& old_self_cache_key, - const paddle::Tensor& old_self_cache_value, - const paddle::Tensor& old_mem_cache, - const int step, - paddle::Tensor& decoder_output, - paddle::Tensor& new_self_cache_key, - paddle::Tensor& new_self_cache_value, - paddle::Tensor& new_mem_cache, - int n_head, - int size_per_head, - int memory_hidden_dim, - bool is_fuse_qkv) { - auto stream = memory_tensor.stream(); - cublasHandle_t cublas_handle_; - cublasCreate(&cublas_handle_); - cublasLtHandle_t cublaslt_handle_; - cublasLtCreate(&cublaslt_handle_); - cublasSetStream(cublas_handle_, stream); - - std::vector ret; - - switch (memory_tensor.type()) { - 
case paddle::DataType::FLOAT16: { - ret = decoder_kernel(from_tensor, - memory_tensor, - mem_seq_len, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - old_self_cache_key, - old_self_cache_value, - old_mem_cache, - step, - decoder_output, - new_self_cache_key, - new_self_cache_value, - new_mem_cache, - n_head, - size_per_head, - memory_hidden_dim, - is_fuse_qkv, - cublas_handle_, - cublaslt_handle_, - stream); - break; - } - case paddle::DataType::FLOAT32: { - ret = decoder_kernel(from_tensor, - memory_tensor, - mem_seq_len, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - old_self_cache_key, - old_self_cache_value, - old_mem_cache, - step, - decoder_output, - new_self_cache_key, - new_self_cache_value, - new_mem_cache, - n_head, - size_per_head, - memory_hidden_dim, - is_fuse_qkv, - cublas_handle_, - cublaslt_handle_, - stream); - break; - } - default: { - PD_THROW( - "NOT supported data type. " - "Only float16 and float32 are supported. "); - break; - } - } - cublasDestroy(cublas_handle_); - cublasLtDestroy(cublaslt_handle_); - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_decoder_op.h b/paddlenlp/ops/fast_transformer/src/fusion_decoder_op.h deleted file mode 100644 index e9cc413b42dc..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_decoder_op.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#pragma once - -#include -#include - -#include "fastertransformer/open_decoder.h" -#include "fastertransformer/utils/common.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - - -std::vector DecoderCUDAForward( - const paddle::Tensor& from_tensor, - const paddle::Tensor& memory_tensor, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& self_ln_weight, - const paddle::Tensor& self_ln_bias, - const paddle::Tensor& self_q_weight, - const paddle::Tensor& self_q_bias, - const paddle::Tensor& self_k_weight, - const paddle::Tensor& self_k_bias, - const paddle::Tensor& self_v_weight, - const paddle::Tensor& self_v_bias, - const paddle::Tensor& self_out_weight, - const paddle::Tensor& self_out_bias, - const paddle::Tensor& cross_ln_weight, - const paddle::Tensor& cross_ln_bias, - const paddle::Tensor& cross_q_weight, - const paddle::Tensor& cross_q_bias, - const paddle::Tensor& cross_k_weight, - const paddle::Tensor& cross_k_bias, - const paddle::Tensor& cross_v_weight, - const paddle::Tensor& cross_v_bias, - const paddle::Tensor& cross_out_weight, - const paddle::Tensor& cross_out_bias, - const paddle::Tensor& ffn_ln_weight, - const paddle::Tensor& ffn_ln_bias, - const paddle::Tensor& ffn_inter_weight, - const paddle::Tensor& ffn_inter_bias, - const paddle::Tensor& ffn_out_weight, - const paddle::Tensor& ffn_out_bias, - const paddle::Tensor& old_self_cache_key, - const paddle::Tensor& old_self_cache_value, - const paddle::Tensor& old_mem_cache, - const int step, - paddle::Tensor& decoder_output, - paddle::Tensor& new_self_cache_key, - paddle::Tensor& new_self_cache_value, - paddle::Tensor& new_mem_cache, - int n_head, - int size_per_head, - int memory_hidden_dim, - bool is_fuse_qkv); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_decoding_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_decoding_op.cc deleted file mode 100644 index 3607f70961fb..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_decoding_op.cc +++ /dev/null @@ -1,337 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include - -#include "fusion_decoding_op.h" -#include "pd_traits.h" - - -std::vector DecodingForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const bool& rel_len, - const float& alpha) { - int batch_size = input.shape()[0]; - int max_out_len = rel_len ? max_len + input.shape()[1] : max_len; - - std::vector output_dims; - std::vector parent_ids_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - batch_size /= beam_size; - output_dims = {max_out_len, batch_size, beam_size}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "beam_search_v2") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_out_len, batch_size, beam_size * 2}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - output_dims = {max_out_len, batch_size}; - parent_ids_dims = {1}; - } else { - PD_THROW("Not supported decoding strategy. 
"); - } - - if (input.place() == paddle::PlaceType::kGPU) { - auto output_ids = paddle::Tensor(paddle::PlaceType::kGPU, output_dims); - auto parent_ids = paddle::Tensor(paddle::PlaceType::kGPU, parent_ids_dims); - auto sequence_length = - paddle::Tensor(paddle::PlaceType::kGPU, sequence_length_dims); - - paddle::Tensor seq_len = paddle::Tensor(paddle::PlaceType::kGPU); - - if (mem_seq_len.place() != paddle::PlaceType::kGPU) { - seq_len = mem_seq_len.copy_to(paddle::PlaceType::kGPU); - } else { - seq_len = mem_seq_len; - } - - return DecodingCUDAForward(input, - seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_out_len, - beam_search_diversity_rate, - alpha); - } else { - PD_THROW("Not implemented place. Only GPU is supported. "); - } -} - -std::vector> DecodingInferShape( - const std::vector& input_shape, - const std::vector& mem_seq_len_shape, - const std::vector& word_embedding_shape, - const std::vector>& self_ln_weight_shapes, - const std::vector>& self_ln_bias_shapes, - const std::vector>& self_q_weight_shapes, - const std::vector>& self_q_bias_shapes, - const std::vector>& self_k_weight_shapes, - const std::vector>& self_k_bias_shapes, - const std::vector>& self_v_weight_shapes, - const std::vector>& self_v_bias_shapes, - const std::vector>& self_out_weight_shapes, - const std::vector>& self_out_bias_shapes, - const std::vector>& cross_ln_weight_shapes, - const std::vector>& cross_ln_bias_shapes, - const std::vector>& cross_q_weight_shapes, - const std::vector>& cross_q_bias_shapes, - const std::vector>& cross_k_weight_shapes, - const std::vector>& cross_k_bias_shapes, - const std::vector>& cross_v_weight_shapes, - const std::vector>& cross_v_bias_shapes, - const std::vector>& cross_out_weight_shapes, - const std::vector>& cross_out_bias_shapes, - const std::vector>& ffn_ln_weight_shapes, - const std::vector>& ffn_ln_bias_shapes, - const std::vector>& ffn_inter_weight_shapes, - const std::vector>& ffn_inter_bias_shapes, - const std::vector>& ffn_out_weight_shapes, - const std::vector>& ffn_out_bias_shapes, - const std::vector& decoder_ln_weight_shape, - const std::vector& decoder_ln_bias_shape, - const std::vector& embedding_weight_shape, - const std::vector& embedding_bias_shape, - const std::vector& positional_embedding_weight_shape, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const bool& rel_len, - const float& alpha) { - int batch_size = input_shape[0]; - - std::vector output_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - if (batch_size != -1) { - batch_size /= beam_size; - } - output_dims = {max_len, 
batch_size, beam_size}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "beam_search_v2") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_len, batch_size, beam_size * 2}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - output_dims = {max_len, batch_size}; - return {output_dims, {1}, sequence_length_dims}; - } else { - PD_THROW("Not supported decoding strategy. "); - } -} - -std::vector DecodingInferDtype( - const paddle::DataType& input, - const paddle::DataType& mem_seq_len, - const paddle::DataType& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::DataType& decoder_ln_weight, - const paddle::DataType& decoder_ln_bias, - const paddle::DataType& embedding_weight, - const paddle::DataType& embedding_bias, - const paddle::DataType& positional_embedding_weight) { - return {paddle::DataType::INT32, - paddle::DataType::INT32, - paddle::DataType::INT32}; -} - -PD_BUILD_OP(fusion_decoding) - .Inputs({"Input", - "MemSeqLen", - "WordEmbedding", - paddle::Vec("SelfLayernormWeight"), - paddle::Vec("SelfLayernormBias"), - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfQueryBias"), - paddle::Vec("SelfKeyWeight"), - paddle::Vec("SelfKeyBias"), - paddle::Vec("SelfValueWeight"), - paddle::Vec("SelfValueBias"), - paddle::Vec("SelfOutWeight"), - paddle::Vec("SelfOutBias"), - paddle::Vec("CrossLayernormWeight"), - paddle::Vec("CrossLayernormBias"), - paddle::Vec("CrossQueryWeight"), - paddle::Vec("CrossQueryBias"), - paddle::Vec("CrossKeyWeight"), - paddle::Vec("CrossKeyBias"), - paddle::Vec("CrossValueWeight"), - paddle::Vec("CrossValueBias"), - paddle::Vec("CrossOutWeight"), - paddle::Vec("CrossOutBias"), - paddle::Vec("FFNLayernormWeight"), - paddle::Vec("FFNLayernormBias"), - paddle::Vec("FFNInterWeight"), - paddle::Vec("FFNInterBias"), - paddle::Vec("FFNOutWeight"), - paddle::Vec("FFNOutBias"), - "DecoderLayernormWeight", - "DecoderLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb"}) - .Outputs({"OutputIds", "ParentIds", "SequenceLength"}) - .Attrs({"decoding_strategy: std::string", - "beam_size: int", - "topk: int", - "topp: float", - "n_head: int", - "size_per_head: int", - "num_layer: int", - "bos_id: int", - "eos_id: int", - "max_len: 
int64_t", - "beam_search_diversity_rate: float", - "rel_len: bool", - "alpha: float"}) - .SetKernelFn(PD_KERNEL(DecodingForward)) - .SetInferShapeFn(PD_INFER_SHAPE(DecodingInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(DecodingInferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_decoding_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_decoding_op.cu deleted file mode 100644 index 3072b19709a7..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_decoding_op.cu +++ /dev/null @@ -1,538 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include -#include -#include - -#include -#include -#include -#include -#include - - -#include "fusion_decoding_op.h" -#include "pd_traits.h" - -template -std::vector decoding_kernel( - const paddle::Tensor& input, - const paddle::Tensor& memory_sequence_length, - const paddle::Tensor& word_emb, - const std::vector& self_layernorm_weight, - const std::vector& self_layernorm_bias, - const std::vector& self_attn_query_weight, - const std::vector& self_attn_query_bias, - const std::vector& self_attn_key_weight, - const std::vector& self_attn_key_bias, - const std::vector& self_attn_value_weight, - const std::vector& self_attn_value_bias, - const std::vector& self_attn_output_weight, - const std::vector& self_attn_output_bias, - const std::vector& cross_layernorm_weight, - const std::vector& cross_layernorm_bias, - const std::vector& cross_attn_query_weight, - const std::vector& cross_attn_query_bias, - const std::vector& cross_attn_key_weight, - const std::vector& cross_attn_key_bias, - const std::vector& cross_attn_value_weight, - const std::vector& cross_attn_value_bias, - const std::vector& cross_attn_output_weight, - const std::vector& cross_attn_output_bias, - const std::vector& ffn_layernorm_weight, - const std::vector& ffn_layernorm_bias, - const std::vector& ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - const paddle::Tensor& decoder_layernorm_weight, - const paddle::Tensor& decoder_layernorm_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& position_encoding_table, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& head_num_, - const int& size_per_head_, - const int& num_layer_, - const int& start_id_, - const int& end_id_, - const int64_t& max_seq_len_, - const float& beam_search_diversity_rate_, - const float& alpha, - cudaStream_t stream) { - int beam_width_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2") - ? beam_size - : 1; - int candidate_num_ = (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") - ? 
topk - : 1; - float probability_threshold_ = (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") - ? topp - : 0.0; - - auto input_dims = input.shape(); - int batch_size_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2") - ? input_dims[0] / beam_width_ - : input_dims[0]; - const int memory_max_seq_len = input_dims[1]; - const int memory_hidden_dim = input_dims[2]; - const int vocab_size = word_emb.shape()[0]; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - DecodingInitParam decoding_params; - decoding_params.cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - decoding_params.cublaslt_handle = - CublasHandle::GetInstance()->cublaslt_handle_; - - decoding_params.output_ids = output_ids.mutable_data(input.place()); - decoding_params.parent_ids = parent_ids.mutable_data(input.place()); - decoding_params.sequence_length = - sequence_length.mutable_data(input.place()); - - typedef DecoderTransformerTraits DecodingTraits_; - decoding_params.stream = stream; - fastertransformer::Allocator allocator_(stream); - - decoding_params.memory_tensor = - reinterpret_cast(input.data()); - decoding_params.memory_sequence_length = memory_sequence_length.data(); - - DecoderInitParam* params = - new DecoderInitParam[num_layer_]; - - int inner_coeff = ffn_intermediate_weight[0].shape()[1] / memory_hidden_dim; - - auto q_weight_shape = self_attn_query_weight[0].shape(); - auto k_weight_shape = self_attn_key_weight[0].shape(); - bool fuse_qkv = (q_weight_shape[1] == k_weight_shape[1]) ? false : true; - - for (int i = 0; i < num_layer_; i++) { - params[i].stream = stream; - params[i].cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - params[i].cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - - if (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2") { - params[i].request_batch_size = batch_size_ * beam_width_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } else if (decoding_strategy == "sampling" || - decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - params[i].request_batch_size = batch_size_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } - - // self attn - params[i].self_layernorm.gamma = reinterpret_cast( - self_layernorm_weight[i].data()); - params[i].self_layernorm.beta = reinterpret_cast( - self_layernorm_bias[i].data()); - // query - params[i].self_attention.query_weight.kernel = - reinterpret_cast( - self_attn_query_weight[i].data()); - params[i].self_attention.query_weight.bias = - reinterpret_cast( - self_attn_query_bias[i].data()); - // key - params[i].self_attention.key_weight.kernel = - reinterpret_cast( - self_attn_key_weight[i].data()); - params[i].self_attention.key_weight.bias = - reinterpret_cast( - self_attn_key_bias[i].data()); - // value - params[i].self_attention.value_weight.kernel = - reinterpret_cast( - self_attn_value_weight[i].data()); - params[i].self_attention.value_weight.bias = - reinterpret_cast( - self_attn_value_bias[i].data()); - // out proj - params[i].self_attention.attention_output_weight.kernel = - reinterpret_cast( - self_attn_output_weight[i].data()); - params[i].self_attention.attention_output_weight.bias = - reinterpret_cast( - self_attn_output_bias[i].data()); - - // cross - params[i].cross_layernorm.gamma = reinterpret_cast( - cross_layernorm_weight[i].data()); - params[i].cross_layernorm.beta = 
reinterpret_cast( - cross_layernorm_bias[i].data()); - // query - params[i].cross_attention.query_weight.kernel = - reinterpret_cast( - cross_attn_query_weight[i].data()); - params[i].cross_attention.query_weight.bias = - reinterpret_cast( - cross_attn_query_bias[i].data()); - // key - params[i].cross_attention.key_weight.kernel = - reinterpret_cast( - cross_attn_key_weight[i].data()); - params[i].cross_attention.key_weight.bias = - reinterpret_cast( - cross_attn_key_bias[i].data()); - // value - params[i].cross_attention.value_weight.kernel = - reinterpret_cast( - cross_attn_value_weight[i].data()); - params[i].cross_attention.value_weight.bias = - reinterpret_cast( - cross_attn_value_bias[i].data()); - // out proj - params[i].cross_attention.attention_output_weight.kernel = - reinterpret_cast( - cross_attn_output_weight[i].data()); - params[i].cross_attention.attention_output_weight.bias = - reinterpret_cast( - cross_attn_output_bias[i].data()); - - // ffn - params[i].ffn_layernorm.gamma = reinterpret_cast( - ffn_layernorm_weight[i].data()); - params[i].ffn_layernorm.beta = reinterpret_cast( - ffn_layernorm_bias[i].data()); - // intermediate proj - params[i].ffn.intermediate_weight.kernel = - reinterpret_cast( - ffn_intermediate_weight[i].data()); - params[i].ffn.intermediate_weight.bias = reinterpret_cast( - ffn_intermediate_bias[i].data()); - // out proj - params[i].ffn.output_weight.kernel = reinterpret_cast( - ffn_output_weight[i].data()); - params[i].ffn.output_weight.bias = - reinterpret_cast(ffn_output_bias[i].data()); - } - - decoding_params.layernorm.gamma = reinterpret_cast( - decoder_layernorm_weight.data()); - decoding_params.layernorm.beta = reinterpret_cast( - decoder_layernorm_bias.data()); - // for embedding - decoding_params.embedding_table = - reinterpret_cast(word_emb.data()); - - // for weight sharing matmul - decoding_params.embedding_kernel = - reinterpret_cast(embedding_weight.data()); - // for matmul bias - decoding_params.embedding_bias = - reinterpret_cast(embedding_bias.data()); - - decoding_params.position_encoding_table = reinterpret_cast( - position_encoding_table.data()); - - if ("beam_search" == decoding_strategy) { - DecodingBeamsearch* decoding_beam_search_; - decoding_beam_search_ = new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, // is_fuse_topk_softMax - fuse_qkv, - false, // keep_alive_beam - 0.6, // alpha - true, // normalization_before - 0, // pos_offset - ActivationType::RELU, // act - false, // pos_bias - false, // prefix_lm - -1, // finished_candidate_num - false, // early_stopping - false, // is_mbart - 0, // min_length - inner_coeff); - - decoding_beam_search_->forward(params, decoding_params); - - delete decoding_beam_search_; - } else if ("beam_search_v2" == decoding_strategy) { - DecodingBeamsearch* decoding_beam_search_; - decoding_beam_search_ = new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, // is_fuse_topk_softMax - fuse_qkv, - true, // keep_alive_beam - alpha, - true, // normalization_before - 0, // pos_offset - ActivationType::RELU, // act - false, // pos_bias - false, // prefix_lm - -1, // finished_candidate_num - false, // early_stopping - 
false, // is_mbart - 0, // min_length - inner_coeff); - - decoding_beam_search_->forward(params, decoding_params); - - delete decoding_beam_search_; - } else if ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || - "sampling" == decoding_strategy) { - DecodingSampling* decoding_sampling_; - decoding_sampling_ = new DecodingSampling( - allocator_, - batch_size_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - candidate_num_, - probability_threshold_, - fuse_qkv, - true, // normalization_before - 0, // pos_offset - ActivationType::RELU, // act - false, // pos_bias - 1.0, // temperature - 1.0, // repeat_penalty - false, // prefix_lm - false, // is_mbart - 0, // min_length - inner_coeff); - - decoding_sampling_->forward(params, decoding_params); - - delete decoding_sampling_; - } else { - PD_THROW( - "Only beam_search, topk_sampling and topp_sampling are supported for " - "FastGeneration. "); - } - delete[] params; - - return {output_ids, parent_ids, sequence_length}; -} - -std::vector DecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const float& alpha) { - auto stream = input.stream(); - - cublasSetStream(CublasHandle::GetInstance()->cublas_handle_, stream); - - std::vector ret; - - switch (input.type()) { - case paddle::DataType::FLOAT16: { - ret = decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - 
decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - beam_search_diversity_rate, - alpha, - stream); - break; - } - case paddle::DataType::FLOAT32: { - ret = decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - beam_search_diversity_rate, - alpha, - stream); - break; - } - default: { - PD_THROW( - "NOT supported data type. " - "Only float16 and float32 are supported. "); - break; - } - } - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_decoding_op.h b/paddlenlp/ops/fast_transformer/src/fusion_decoding_op.h deleted file mode 100644 index 419649092abe..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_decoding_op.h +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#pragma once - -#include -#include - -#include "cublas_handle.h" - -#include "fastertransformer/decoding_beamsearch.h" -#include "fastertransformer/decoding_sampling.h" -#include "fastertransformer/open_decoder.h" -#include "fastertransformer/utils/common.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - - -std::vector DecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const float& alpha); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_encoder_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_encoder_op.cc deleted file mode 100644 index a3a97b92461a..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_encoder_op.cc +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "fusion_encoder_op.h" - -std::vector EncoderForward( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const std::vector& attn_query_weight, - const std::vector& attn_query_bias, - const std::vector& attn_key_weight, - const std::vector& attn_key_bias, - const std::vector& attn_value_weight, - const std::vector& attn_value_bias, - const std::vector& attn_output_weight, - const std::vector& attn_output_bias, - const std::vector& attn_output_layernorm_weight, - const std::vector& attn_output_layernorm_bias, - const std::vector& output_layernorm_weight, - const std::vector& output_layernorm_bias, - const std::vector& ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - // const paddle::Tensor& sequence_id_offset, - // const paddle::Tensor& trt_seqlen_offset, - // const paddle::Tensor& amax_list, - const int64_t& head_num, - const int64_t& size_per_head, - const bool& use_gelu, - const bool& remove_padding, - const int64_t& int8_mode, - const int64_t& num_layer, - const int64_t& layer_idx, - const bool& allow_gemm_test, - const bool& use_trt_kernel, - const bool& normalize_before) { - if (input.place() == paddle::PlaceType::kGPU) { - auto shape = input.shape(); - std::vector encoder_out({ - paddle::Tensor(paddle::PlaceType::kGPU, shape), paddle::Tensor(paddle::PlaceType::kGPU, shape) - }); - - return EncoderCUDAForward(input, - attn_mask, - attn_query_weight, - attn_query_bias, - attn_key_weight, - attn_key_bias, - attn_value_weight, - attn_value_bias, - attn_output_weight, - attn_output_bias, - attn_output_layernorm_weight, - attn_output_layernorm_bias, - output_layernorm_weight, - output_layernorm_bias, - ffn_intermediate_weight, - ffn_intermediate_bias, - ffn_output_weight, - ffn_output_bias, - // sequence_id_offset, - // trt_seqlen_offset, - // amax_list, - encoder_out, - head_num, - size_per_head, - use_gelu, - remove_padding, - int8_mode, // no support now - num_layer, - layer_idx, - allow_gemm_test, - use_trt_kernel, - normalize_before); - } else { - PD_THROW("Not implemented place. Only GPU is supported. 
"); - } -} - -std::vector> EncoderInferShape( - const std::vector& input_shape, - const std::vector& attn_mask_shape, - const std::vector>& attn_query_weight_shape, - const std::vector>& attn_query_bias_shape, - const std::vector>& attn_key_weight_shape, - const std::vector>& attn_key_bias_shape, - const std::vector>& attn_value_weight_shape, - const std::vector>& attn_value_bias_shape, - const std::vector>& attn_output_weight_shape, - const std::vector>& attn_output_bias_shape, - const std::vector>& attn_output_layernorm_weight_shape, - const std::vector>& attn_output_layernorm_bias_shape, - const std::vector>& output_layernorm_weight_shape, - const std::vector>& output_layernorm_bias_shape, - const std::vector>& ffn_intermediate_weight_shape, - const std::vector>& ffn_intermediate_bias_shape, - const std::vector>& ffn_output_weight_shape, - const std::vector>& ffn_output_bias_shape, - // const std::vector& sequence_id_offset, - // const std::vector& trt_seqlen_offset, - // const std::vector& amax_list_shape, - const int64_t& head_num, - const int64_t& size_per_head, - const bool& use_gelu, - const bool& remove_padding, - const int64_t& int8_mode, // no support now - const int64_t& num_layer, - const int64_t& layer_idx, - const bool& allow_gemm_test, - const bool& use_trt_kernel, - const bool& normalize_before) { - return {input_shape}; -} - - -std::vector EncoderInferDtype( - const paddle::DataType& input, - const paddle::DataType& attn_mask, - const std::vector& attn_query_weight, - const std::vector& attn_query_bias, - const std::vector& attn_key_weight, - const std::vector& attn_key_bias, - const std::vector& attn_value_weight, - const std::vector& attn_value_bias, - const std::vector& attn_output_weight, - const std::vector& attn_output_bias, - const std::vector& attn_output_layernorm_weight, - const std::vector& attn_output_layernorm_bias, - const std::vector& output_layernorm_weight, - const std::vector& output_layernorm_bias, - const std::vector& ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias) { - // const paddle::DataType& sequence_id_offset, - // const paddle::DataType& trt_seqlen_offset, - // const paddle::DataType& amax_list) { - return {input}; -} - -PD_BUILD_OP(fusion_encoder) - .Inputs({ - "Input", - "SelfAttnMask", - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfQueryBias"), - paddle::Vec("SelfKeyWeight"), - paddle::Vec("SelfKeyBias"), - paddle::Vec("SelfValueWeight"), - paddle::Vec("SelfValueBias"), - paddle::Vec("SelfAttnOutputWeight"), - paddle::Vec("SelfAttnOutputBias"), - paddle::Vec("SelfAttnOutputLayernormWeight"), - paddle::Vec("SelfAttnOutputLayernormBias"), - paddle::Vec("OutputLayernormWeight"), - paddle::Vec("OutputLayernormBias"), - paddle::Vec("FFNInterWeight"), - paddle::Vec("FFNInterBias"), - paddle::Vec("FFNOutputWeight"), - paddle::Vec("FFNOutputBias"), - // "SequenceIdOffset", - // "TRTSeqLenOffset", - // "AmaxList", - }) - .Outputs({"EncoderOut"}) - .Attrs({"head_num: int64_t", - "size_per_head: int64_t", - "use_gelu: bool", - "remove_padding: bool", - "int8_mode: int64_t", - "num_layer: int64_t", - "layer_idx: int64_t", - "allow_gemm_test: bool", - "use_trt_kernel: bool", - "normalize_before: bool"}) - .SetKernelFn(PD_KERNEL(EncoderForward)) - .SetInferShapeFn(PD_INFER_SHAPE(EncoderInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(EncoderInferDtype)); \ No newline at end of file diff --git a/paddlenlp/ops/fast_transformer/src/fusion_encoder_op.cu 
b/paddlenlp/ops/fast_transformer/src/fusion_encoder_op.cu deleted file mode 100644 index 2fe897147ef8..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_encoder_op.cu +++ /dev/null @@ -1,443 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cublas_handle.h" -#include "fastertransformer/bert_encoder_transformer.h" - -#include "fastertransformer/cuda/cuda_kernels.h" -#include "fastertransformer/standard_encoder.h" -#include "fusion_encoder_op.h" -#include "pd_traits.h" - - -template -std::vector encoder_kernel( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const std::vector& attn_query_weight, - const std::vector& attn_query_bias, - const std::vector& attn_key_weight, - const std::vector& attn_key_bias, - const std::vector& attn_value_weight, - const std::vector& attn_value_bias, - const std::vector& attn_output_weight, - const std::vector& attn_output_bias, - /* - When calling BertEncoderTransformer(Post-Norm): - norm1 coresponds to BertInitParam.self_layernorm - norm2 coresponds to BertInitParam.ffn_layernorm - When calling OpenEncoder(Pre-Norm): - norm1 coresponds to EncoderInitParam.input_layernorm - norm2 coresponds to EncoderInitParam.self_layernorm - */ - const std::vector& norm1_weight, - const std::vector& norm1_bias, - const std::vector& norm2_weight, - const std::vector& norm2_bias, - const std::vector& ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - // const paddle::Tensor& sequence_id_offset, - // const paddle::Tensor& trt_seqlen_offset, - // const paddle::Tensor& amax_list, - std::vector& encoder_out, - int64_t head_num_, - int64_t size_per_head_, - bool use_gelu, - bool remove_padding, - int64_t int8_mode, // no support now - int64_t num_layer_, - int64_t layer_idx_, - bool allow_gemm_test, - bool use_trt_kernel_, - bool normalize_before, - cudaStream_t stream) { - - auto input_shape = input.shape(); - int batch_size_ = input_shape[0]; - int max_seq_len_ = input_shape[1]; - typedef PDTraits traits_; - - fastertransformer::Allocator* allocator_ = - new fastertransformer::Allocator(stream); - - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - int in_id = 0; - int layers = attn_query_weight.size(); - - if (normalize_before == false) { - typedef BertEncoderTransformerTraits - EncoderTraits_; - - // Post-Normalization - BertInitParam encoder_param; - - encoder_param.stream = stream; - encoder_param.cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - encoder_param.cublaslt_handle = - CublasHandle::GetInstance()->cublaslt_handle_; - - encoder_param.attr_mask = - reinterpret_cast(attn_mask.data()); - - BertEncoderTransformer* encoder = - new BertEncoderTransformer( - int8_mode, allow_gemm_test, use_gelu); - - encoder->allocateBuffer(allocator_, 
- batch_size_, - max_seq_len_, - max_seq_len_, - head_num_, - size_per_head_, - use_trt_kernel_); - - std::vector enc_buf({ - encoder_out[0].mutable_data(input.place()), - encoder_out[1].mutable_data(input.place())}); - - for (int layer = 0; layer < layers; ++layer) { - in_id = layer & 0x1; - - if (0 == layer) { - encoder_param.from_tensor = reinterpret_cast( - input.data()); - encoder_param.to_tensor = reinterpret_cast( - input.data()); - encoder_param.transformer_out = reinterpret_cast( - enc_buf[1 - in_id]); - } else { - encoder_param.from_tensor = reinterpret_cast( - enc_buf[in_id]); - encoder_param.to_tensor = reinterpret_cast( - enc_buf[in_id]); - encoder_param.transformer_out = reinterpret_cast( - enc_buf[1 - in_id]); - } - - // self attn - encoder_param.self_attention.query_weight.kernel = - reinterpret_cast(attn_query_weight[layer].data()); - encoder_param.self_attention.query_weight.bias = - reinterpret_cast(attn_query_bias[layer].data()); - encoder_param.self_attention.key_weight.kernel = - reinterpret_cast(attn_key_weight[layer].data()); - encoder_param.self_attention.key_weight.bias = - reinterpret_cast(attn_key_bias[layer].data()); - encoder_param.self_attention.value_weight.kernel = - reinterpret_cast(attn_value_weight[layer].data()); - encoder_param.self_attention.value_weight.bias = - reinterpret_cast(attn_value_bias[layer].data()); - encoder_param.self_attention.attention_output_weight.kernel = - reinterpret_cast(attn_output_weight[layer].data()); - encoder_param.self_attention.attention_output_weight.bias = - reinterpret_cast(attn_output_bias[layer].data()); - - // self_attn_layer_norm - encoder_param.self_layernorm.gamma = - reinterpret_cast(norm1_weight[layer].data()); - encoder_param.self_layernorm.beta = - reinterpret_cast(norm1_bias[layer].data()); - encoder_param.ffn.intermediate_weight.kernel = - reinterpret_cast( - ffn_intermediate_weight[layer].data()); - encoder_param.ffn.intermediate_weight.bias = - reinterpret_cast( - ffn_intermediate_bias[layer].data()); - - encoder_param.ffn.output_weight.kernel = - reinterpret_cast(ffn_output_weight[layer].data()); - encoder_param.ffn.output_weight.bias = - reinterpret_cast(ffn_output_bias[layer].data()); - - // ffn_layer_norm - encoder_param.ffn_layernorm.gamma = - reinterpret_cast(norm2_weight[layer].data()); - encoder_param.ffn_layernorm.beta = - reinterpret_cast(norm2_bias[layer].data()); - - int valid_word_num; - - encoder_param.sequence_id_offset = nullptr; - valid_word_num = batch_size_ * max_seq_len_; - - encoder_param.valid_word_num = valid_word_num; - - encoder_param.trt_seqlen_offset = nullptr; - encoder_param.trt_seqlen_size = batch_size_ + 1; - - encoder_param.amaxList = nullptr; - - encoder->initialize(encoder_param); - encoder->forward(); - } - - encoder->freeBuffer(); - - delete allocator_; - delete encoder; - } else { - typedef OpenEncoderTraits - OpenEncoderTraits_; - - // Pre-Normalization - EncoderInitParam encoder_param; - - encoder_param.stream = stream; - encoder_param.cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - encoder_param.cublaslt_handle = - CublasHandle::GetInstance()->cublaslt_handle_; - encoder_param.attr_mask = - reinterpret_cast(attn_mask.data()); - - OpenEncoder* encoder = - new OpenEncoder( - int8_mode, allow_gemm_test, use_gelu); - - encoder->allocateBuffer(allocator_, - batch_size_, - max_seq_len_, - max_seq_len_, - head_num_, - size_per_head_, - use_trt_kernel_); - - for (int layer = 0; layer < layers; ++layer) { - in_id = layer & 0x1; - - if (0 == layer) { - 
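        // Ping-pong buffering across layers: in_id = layer & 0x1 alternates the
        // read/write roles of the two pre-allocated encoder_out tensors. Layer 0
        // reads the op input directly; every later layer reads the previous
        // layer's output buffer and writes the other one, so the final hidden
        // states end up in encoder_out[1 - in_id].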
encoder_param.from_tensor = reinterpret_cast( - input.data()); - encoder_param.to_tensor = reinterpret_cast( - input.data()); - encoder_param.transformer_out = reinterpret_cast( - encoder_out[1 - in_id].mutable_data(input.place())); - } else { - encoder_param.from_tensor = reinterpret_cast( - encoder_out[in_id].data()); - encoder_param.to_tensor = reinterpret_cast( - encoder_out[in_id].data()); - encoder_param.transformer_out = reinterpret_cast( - encoder_out[1 - in_id].mutable_data(input.place())); - } - - // self attn - encoder_param.self_attention.query_weight.kernel = - reinterpret_cast(attn_query_weight[layer].data()); - encoder_param.self_attention.query_weight.bias = - reinterpret_cast(attn_query_bias[layer].data()); - encoder_param.self_attention.key_weight.kernel = - reinterpret_cast(attn_key_weight[layer].data()); - encoder_param.self_attention.key_weight.bias = - reinterpret_cast(attn_key_bias[layer].data()); - encoder_param.self_attention.value_weight.kernel = - reinterpret_cast(attn_value_weight[layer].data()); - encoder_param.self_attention.value_weight.bias = - reinterpret_cast(attn_value_bias[layer].data()); - encoder_param.self_attention.attention_output_weight.kernel = - reinterpret_cast(attn_output_weight[layer].data()); - encoder_param.self_attention.attention_output_weight.bias = - reinterpret_cast(attn_output_bias[layer].data()); - - // Spicific for Pre-Normalization - encoder_param.input_layernorm.gamma = - reinterpret_cast(norm1_weight[layer].data()); - encoder_param.input_layernorm.beta = - reinterpret_cast(norm1_bias[layer].data()); - - encoder_param.self_layernorm.gamma = - reinterpret_cast(norm2_weight[layer].data()); - encoder_param.self_layernorm.beta = - reinterpret_cast(norm2_bias[layer].data()); - - encoder_param.ffn.intermediate_weight.kernel = - reinterpret_cast( - ffn_intermediate_weight[layer].data()); - encoder_param.ffn.intermediate_weight.bias = - reinterpret_cast( - ffn_intermediate_bias[layer].data()); - - encoder_param.ffn.output_weight.kernel = - reinterpret_cast(ffn_output_weight[layer].data()); - encoder_param.ffn.output_weight.bias = - reinterpret_cast(ffn_output_bias[layer].data()); - - int valid_word_num; - encoder_param.sequence_id_offset = nullptr; - valid_word_num = batch_size_ * max_seq_len_; - - encoder_param.valid_word_num = valid_word_num; - - encoder_param.trt_seqlen_offset = - nullptr; // trt_seqlen_offset.data(); - encoder_param.trt_seqlen_size = batch_size_ + 1; - - encoder_param.amaxList = nullptr; - - encoder->initialize(encoder_param); - encoder->forward(); - } - - encoder->freeBuffer(); - delete allocator_; - delete encoder; - } - - return {encoder_out[1 - in_id]}; -} - -std::vector EncoderCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const std::vector& attn_query_weight, - const std::vector& attn_query_bias, - const std::vector& attn_key_weight, - const std::vector& attn_key_bias, - const std::vector& attn_value_weight, - const std::vector& attn_value_bias, - const std::vector& attn_output_weight, - const std::vector& attn_output_bias, - /* - When calling BertEncoderTransformer(Post-Norm): - norm1 coresponds to BertInitParam.self_layernorm - norm2 coresponds to BertInitParam.ffn_layernorm - When calling OpenEncoder(Pre-Norm): - norm1 coresponds to EncoderInitParam.input_layernorm - norm2 coresponds to EncoderInitParam.self_layernorm - */ - const std::vector& norm1_weight, - const std::vector& norm1_bias, - const std::vector& norm2_weight, - const std::vector& norm2_bias, - const std::vector& 
ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - // const paddle::Tensor& sequence_id_offset, - // const paddle::Tensor& trt_seqlen_offset, - // const paddle::Tensor& amax_list, - std::vector& encoder_out, - int64_t head_num, - int64_t size_per_head, - bool use_gelu, - bool remove_padding, - int64_t int8_mode, - int64_t num_layer, - int64_t layer_idx, - bool allow_gemm_test, - bool use_trt_kernel, - bool normalize_before) { - auto stream = input.stream(); - - cublasSetStream(CublasHandle::GetInstance()->cublas_handle_, stream); - - std::vector ret; - - switch (input.type()) { - case paddle::DataType::FLOAT16: { - ret = encoder_kernel(input, - attn_mask, - attn_query_weight, - attn_query_bias, - attn_key_weight, - attn_key_bias, - attn_value_weight, - attn_value_bias, - attn_output_weight, - attn_output_bias, - norm1_weight, - norm1_bias, - norm2_weight, - norm2_bias, - ffn_intermediate_weight, - ffn_intermediate_bias, - ffn_output_weight, - ffn_output_bias, - // sequence_id_offset, - // trt_seqlen_offset, - // amax_list, - encoder_out, - head_num, - size_per_head, - use_gelu, - remove_padding, - int8_mode, - num_layer, - layer_idx, - allow_gemm_test, - use_trt_kernel, - normalize_before, - stream); - - break; - } - case paddle::DataType::FLOAT32: { - ret = encoder_kernel(input, - attn_mask, - attn_query_weight, - attn_query_bias, - attn_key_weight, - attn_key_bias, - attn_value_weight, - attn_value_bias, - attn_output_weight, - attn_output_bias, - norm1_weight, - norm1_bias, - norm2_weight, - norm2_bias, - ffn_intermediate_weight, - ffn_intermediate_bias, - ffn_output_weight, - ffn_output_bias, - // sequence_id_offset, - // trt_seqlen_offset, - // amax_list, - encoder_out, - head_num, - size_per_head, - use_gelu, - remove_padding, - int8_mode, - num_layer, - layer_idx, - allow_gemm_test, - use_trt_kernel, - normalize_before, - stream); - break; - } - default: { - PD_THROW( - "NOT supported data type. " - "Only float16 and float32 are supported. "); - break; - } - } - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_encoder_op.h b/paddlenlp/ops/fast_transformer/src/fusion_encoder_op.h deleted file mode 100644 index f9427e4c0eca..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_encoder_op.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#pragma once - -#include -#include - -#include "fastertransformer/bert_encoder_transformer.h" -#include "fastertransformer/utils/common.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - - -std::vector EncoderCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const std::vector& attn_query_weight, - const std::vector& attn_query_bias, - const std::vector& attn_key_weight, - const std::vector& attn_key_bias, - const std::vector& attn_value_weight, - const std::vector& attn_value_bias, - const std::vector& attn_output_weight, - const std::vector& attn_output_bias, - const std::vector& norm1_weight, - const std::vector& norm1_bias, - const std::vector& norm2_weight, - const std::vector& norm2_bias, - const std::vector& ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - // const paddle::Tensor& sequence_id_offset, - // const paddle::Tensor& trt_seqlen_offset, - // const paddle::Tensor& amax_list, - std::vector& encoder_out, - int64_t head_num_, - int64_t size_per_head_, - bool use_gelu, - bool remove_padding, - int64_t int8_mode, // no support now - int64_t num_layer_, - int64_t layer_idx_, - bool allow_gemm_test, - bool use_trt_kernel_, - bool normalize_before); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.cc deleted file mode 100644 index 50892e56199b..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.cc +++ /dev/null @@ -1,340 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include - -#include "fusion_force_decoding_op.h" -#include "pd_traits.h" - - -std::vector DecodingForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& trg_word, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const bool& rel_len, - const float& alpha) { - int batch_size = input.shape()[0]; - int max_out_len = rel_len ? max_len + input.shape()[1] : max_len; - - std::vector output_dims; - std::vector parent_ids_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - batch_size /= beam_size; - output_dims = {max_out_len, batch_size, beam_size}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "beam_search_v2") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_out_len, batch_size, beam_size * 2}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "sampling") { - output_dims = {max_out_len, batch_size}; - parent_ids_dims = {1}; - } else { - PD_THROW("Not supported decoding strategy. 
"); - } - - if (input.place() == paddle::PlaceType::kGPU) { - auto output_ids = paddle::Tensor(paddle::PlaceType::kGPU, output_dims); - auto parent_ids = paddle::Tensor(paddle::PlaceType::kGPU, parent_ids_dims); - auto sequence_length = - paddle::Tensor(paddle::PlaceType::kGPU, sequence_length_dims); - - paddle::Tensor seq_len = paddle::Tensor(paddle::PlaceType::kGPU); - - if (mem_seq_len.place() != paddle::PlaceType::kGPU) { - seq_len = mem_seq_len.copy_to(paddle::PlaceType::kGPU); - } else { - seq_len = mem_seq_len; - } - - return DecodingCUDAForward(input, - seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - trg_word, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_out_len, - beam_search_diversity_rate, - alpha); - } else { - PD_THROW("Not implemented place. Only GPU is supported. "); - } -} - -std::vector> DecodingInferShape( - const std::vector& input_shape, - const std::vector& mem_seq_len_shape, - const std::vector& word_embedding_shape, - const std::vector>& self_ln_weight_shapes, - const std::vector>& self_ln_bias_shapes, - const std::vector>& self_q_weight_shapes, - const std::vector>& self_q_bias_shapes, - const std::vector>& self_k_weight_shapes, - const std::vector>& self_k_bias_shapes, - const std::vector>& self_v_weight_shapes, - const std::vector>& self_v_bias_shapes, - const std::vector>& self_out_weight_shapes, - const std::vector>& self_out_bias_shapes, - const std::vector>& cross_ln_weight_shapes, - const std::vector>& cross_ln_bias_shapes, - const std::vector>& cross_q_weight_shapes, - const std::vector>& cross_q_bias_shapes, - const std::vector>& cross_k_weight_shapes, - const std::vector>& cross_k_bias_shapes, - const std::vector>& cross_v_weight_shapes, - const std::vector>& cross_v_bias_shapes, - const std::vector>& cross_out_weight_shapes, - const std::vector>& cross_out_bias_shapes, - const std::vector>& ffn_ln_weight_shapes, - const std::vector>& ffn_ln_bias_shapes, - const std::vector>& ffn_inter_weight_shapes, - const std::vector>& ffn_inter_bias_shapes, - const std::vector>& ffn_out_weight_shapes, - const std::vector>& ffn_out_bias_shapes, - const std::vector& decoder_ln_weight_shape, - const std::vector& decoder_ln_bias_shape, - const std::vector& embedding_weight_shape, - const std::vector& embedding_bias_shape, - const std::vector& positional_embedding_weight_shape, - const std::vector& trg_word_shape, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const bool& rel_len, - const float& alpha) { - int batch_size = input_shape[0]; - - std::vector output_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - if (batch_size != -1) { - 
batch_size /= beam_size; - } - output_dims = {max_len, batch_size, beam_size}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "beam_search_v2") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_len, batch_size, beam_size * 2}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "sampling") { - output_dims = {max_len, batch_size}; - return {output_dims, {1}, sequence_length_dims}; - } else { - PD_THROW("Not supported decoding strategy. "); - } -} - -std::vector DecodingInferDtype( - const paddle::DataType& input, - const paddle::DataType& mem_seq_len, - const paddle::DataType& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::DataType& decoder_ln_weight, - const paddle::DataType& decoder_ln_bias, - const paddle::DataType& embedding_weight, - const paddle::DataType& embedding_bias, - const paddle::DataType& positional_embedding_weight, - const paddle::DataType& trg_word) { - return {paddle::DataType::INT32, - paddle::DataType::INT32, - paddle::DataType::INT32}; -} - -PD_BUILD_OP(fusion_force_decoding) - .Inputs({"Input", - "MemSeqLen", - "WordEmbedding", - paddle::Vec("SelfLayernormWeight"), - paddle::Vec("SelfLayernormBias"), - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfQueryBias"), - paddle::Vec("SelfKeyWeight"), - paddle::Vec("SelfKeyBias"), - paddle::Vec("SelfValueWeight"), - paddle::Vec("SelfValueBias"), - paddle::Vec("SelfOutWeight"), - paddle::Vec("SelfOutBias"), - paddle::Vec("CrossLayernormWeight"), - paddle::Vec("CrossLayernormBias"), - paddle::Vec("CrossQueryWeight"), - paddle::Vec("CrossQueryBias"), - paddle::Vec("CrossKeyWeight"), - paddle::Vec("CrossKeyBias"), - paddle::Vec("CrossValueWeight"), - paddle::Vec("CrossValueBias"), - paddle::Vec("CrossOutWeight"), - paddle::Vec("CrossOutBias"), - paddle::Vec("FFNLayernormWeight"), - paddle::Vec("FFNLayernormBias"), - paddle::Vec("FFNInterWeight"), - paddle::Vec("FFNInterBias"), - paddle::Vec("FFNOutWeight"), - paddle::Vec("FFNOutBias"), - "DecoderLayernormWeight", - "DecoderLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb", - "TrgWord"}) - .Outputs({"OutputIds", "ParentIds", "SequenceLength"}) - .Attrs({"decoding_strategy: std::string", - "beam_size: int", - "topk: int", - "topp: float", - "n_head: int", - "size_per_head: int", - 
"num_layer: int", - "bos_id: int", - "eos_id: int", - "max_len: int64_t", - "beam_search_diversity_rate: float", - "rel_len: bool", - "alpha: float"}) - .SetKernelFn(PD_KERNEL(DecodingForward)) - .SetInferShapeFn(PD_INFER_SHAPE(DecodingInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(DecodingInferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.cu deleted file mode 100644 index ae269e34ae42..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.cu +++ /dev/null @@ -1,572 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include -#include -#include - -#include -#include -#include -#include -#include - - -#include "fusion_force_decoding_op.h" -#include "pd_traits.h" - - -__global__ void get_trg_length(const int* trg_word, - int* trg_length, - const int seq_len, - const int pad_id) { - int bid = threadIdx.x; - - int cnt_nonpads = 0; - for (int i = 0; i < seq_len; ++i) { - if (pad_id != trg_word[bid * seq_len + i]) { - cnt_nonpads++; - } else { - break; - } - } - trg_length[bid] = cnt_nonpads; -} - -template -std::vector decoding_kernel( - const paddle::Tensor& input, - const paddle::Tensor& memory_sequence_length, - const paddle::Tensor& word_emb, - const std::vector& self_layernorm_weight, - const std::vector& self_layernorm_bias, - const std::vector& self_attn_query_weight, - const std::vector& self_attn_query_bias, - const std::vector& self_attn_key_weight, - const std::vector& self_attn_key_bias, - const std::vector& self_attn_value_weight, - const std::vector& self_attn_value_bias, - const std::vector& self_attn_output_weight, - const std::vector& self_attn_output_bias, - const std::vector& cross_layernorm_weight, - const std::vector& cross_layernorm_bias, - const std::vector& cross_attn_query_weight, - const std::vector& cross_attn_query_bias, - const std::vector& cross_attn_key_weight, - const std::vector& cross_attn_key_bias, - const std::vector& cross_attn_value_weight, - const std::vector& cross_attn_value_bias, - const std::vector& cross_attn_output_weight, - const std::vector& cross_attn_output_bias, - const std::vector& ffn_layernorm_weight, - const std::vector& ffn_layernorm_bias, - const std::vector& ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - const paddle::Tensor& decoder_layernorm_weight, - const paddle::Tensor& decoder_layernorm_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& position_encoding_table, - const paddle::Tensor& trg_word, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int head_num_, - const int size_per_head_, - const int num_layer_, - const int start_id_, - 
const int end_id_, - const int64_t max_seq_len_, - const float beam_search_diversity_rate_, - const float alpha, - cudaStream_t stream) { - int beam_width_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2") - ? beam_size - : 1; - int candidate_num_ = (decoding_strategy == "sampling") ? topk : 1; - float probability_threshold_ = (decoding_strategy == "sampling") ? topp : 0.0; - - auto input_dims = input.shape(); - int batch_size_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2") - ? input_dims[0] / beam_width_ - : input_dims[0]; - const int memory_max_seq_len = input_dims[1]; - const int memory_hidden_dim = input_dims[2]; - const int vocab_size = word_emb.shape()[0]; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - DecodingInitParam decoding_params; - decoding_params.cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - decoding_params.cublaslt_handle = - CublasHandle::GetInstance()->cublaslt_handle_; - - decoding_params.output_ids = output_ids.mutable_data(input.place()); - decoding_params.parent_ids = parent_ids.mutable_data(input.place()); - decoding_params.sequence_length = - sequence_length.mutable_data(input.place()); - - typedef DecoderTransformerTraits DecodingTraits_; - decoding_params.stream = stream; - fastertransformer::Allocator allocator_(stream); - - decoding_params.memory_tensor = - reinterpret_cast(input.data()); - decoding_params.memory_sequence_length = memory_sequence_length.data(); - - auto trg_word_shape = trg_word.shape(); - int trg_max_len = - (trg_word_shape.size() == 2) ? static_cast(trg_word_shape[1]) : 0; - - paddle::Tensor trg_length = - (trg_word_shape.size() == 2 && trg_word_shape[0] != 0) - ? paddle::Tensor(paddle::PlaceType::kGPU, {trg_word_shape[0]}) - : paddle::Tensor(paddle::PlaceType::kGPU, {1}); - auto trg_length_ptr = trg_length.mutable_data(input.place()); - - if (trg_word_shape.size() == 2 && trg_word_shape[0] != 0) { - decoding_params.trg_word = trg_word.data(); - - get_trg_length<<<1, trg_word_shape[0], 0, stream>>>( - decoding_params.trg_word, trg_length_ptr, trg_max_len, start_id_); - decoding_params.trg_length = trg_length_ptr; - } - - DecoderInitParam* params = - new DecoderInitParam[num_layer_]; - - int inner_coeff = ffn_intermediate_weight[0].shape()[1] / memory_hidden_dim; - - auto q_weight_shape = self_attn_query_weight[0].shape(); - auto k_weight_shape = self_attn_key_weight[0].shape(); - bool fuse_qkv = (q_weight_shape[1] == k_weight_shape[1]) ? 
false : true; - - for (int i = 0; i < num_layer_; i++) { - params[i].stream = stream; - params[i].cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - params[i].cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - - if (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2") { - params[i].request_batch_size = batch_size_ * beam_width_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } else if (decoding_strategy == "sampling" || - decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - params[i].request_batch_size = batch_size_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } - - // self attn - params[i].self_layernorm.gamma = reinterpret_cast( - self_layernorm_weight[i].data()); - params[i].self_layernorm.beta = reinterpret_cast( - self_layernorm_bias[i].data()); - // query - params[i].self_attention.query_weight.kernel = - reinterpret_cast( - self_attn_query_weight[i].data()); - params[i].self_attention.query_weight.bias = - reinterpret_cast( - self_attn_query_bias[i].data()); - // key - params[i].self_attention.key_weight.kernel = - reinterpret_cast( - self_attn_key_weight[i].data()); - params[i].self_attention.key_weight.bias = - reinterpret_cast( - self_attn_key_bias[i].data()); - // value - params[i].self_attention.value_weight.kernel = - reinterpret_cast( - self_attn_value_weight[i].data()); - params[i].self_attention.value_weight.bias = - reinterpret_cast( - self_attn_value_bias[i].data()); - // out proj - params[i].self_attention.attention_output_weight.kernel = - reinterpret_cast( - self_attn_output_weight[i].data()); - params[i].self_attention.attention_output_weight.bias = - reinterpret_cast( - self_attn_output_bias[i].data()); - - // cross - params[i].cross_layernorm.gamma = reinterpret_cast( - cross_layernorm_weight[i].data()); - params[i].cross_layernorm.beta = reinterpret_cast( - cross_layernorm_bias[i].data()); - // query - params[i].cross_attention.query_weight.kernel = - reinterpret_cast( - cross_attn_query_weight[i].data()); - params[i].cross_attention.query_weight.bias = - reinterpret_cast( - cross_attn_query_bias[i].data()); - // key - params[i].cross_attention.key_weight.kernel = - reinterpret_cast( - cross_attn_key_weight[i].data()); - params[i].cross_attention.key_weight.bias = - reinterpret_cast( - cross_attn_key_bias[i].data()); - // value - params[i].cross_attention.value_weight.kernel = - reinterpret_cast( - cross_attn_value_weight[i].data()); - params[i].cross_attention.value_weight.bias = - reinterpret_cast( - cross_attn_value_bias[i].data()); - // out proj - params[i].cross_attention.attention_output_weight.kernel = - reinterpret_cast( - cross_attn_output_weight[i].data()); - params[i].cross_attention.attention_output_weight.bias = - reinterpret_cast( - cross_attn_output_bias[i].data()); - - // ffn - params[i].ffn_layernorm.gamma = reinterpret_cast( - ffn_layernorm_weight[i].data()); - params[i].ffn_layernorm.beta = reinterpret_cast( - ffn_layernorm_bias[i].data()); - // intermediate proj - params[i].ffn.intermediate_weight.kernel = - reinterpret_cast( - ffn_intermediate_weight[i].data()); - params[i].ffn.intermediate_weight.bias = reinterpret_cast( - ffn_intermediate_bias[i].data()); - // out proj - params[i].ffn.output_weight.kernel = reinterpret_cast( - ffn_output_weight[i].data()); - params[i].ffn.output_weight.bias = - reinterpret_cast(ffn_output_bias[i].data()); - } - - decoding_params.layernorm.gamma = reinterpret_cast( - 
decoder_layernorm_weight.data()); - decoding_params.layernorm.beta = reinterpret_cast( - decoder_layernorm_bias.data()); - // for embedding - decoding_params.embedding_table = - reinterpret_cast(word_emb.data()); - - // for weight sharing matmul - decoding_params.embedding_kernel = - reinterpret_cast(embedding_weight.data()); - // for matmul bias - decoding_params.embedding_bias = - reinterpret_cast(embedding_bias.data()); - - decoding_params.position_encoding_table = reinterpret_cast( - position_encoding_table.data()); - - if ("beam_search" == decoding_strategy) { - DecodingBeamsearch* decoding_beam_search_; - decoding_beam_search_ = new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, // is_fuse_topk_softMax - fuse_qkv, - false, // keep_alive_beam - 0.6, // alpha - true, // normalization_before - 0, // pos_offset - ActivationType::RELU, // act - false, // pos_bias - false, // prefix_lm - -1, // finished_candidate_num - false, // early_stopping - false, // is_mbart - 0, // min_length - inner_coeff); - - decoding_beam_search_->forward(params, decoding_params); - - delete decoding_beam_search_; - } else if ("beam_search_v2" == decoding_strategy) { - DecodingBeamsearch* decoding_beam_search_; - decoding_beam_search_ = new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, // is_fuse_topk_softMax - fuse_qkv, // is_fuse_qkv - true, // keep_alive_beam - alpha, - true, // normalization_before - 0, // pos_offset - ActivationType::RELU, // act - false, // pos_bias - false, // prefix_lm - -1, // finished_candidate_num - false, // early_stopping - false, // is_mbart - 0, // min_length - inner_coeff); - - decoding_beam_search_->forward(params, decoding_params); - - delete decoding_beam_search_; - } else if ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || - "sampling" == decoding_strategy) { - DecodingSampling* decoding_sampling_; - decoding_sampling_ = new DecodingSampling( - allocator_, - batch_size_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - candidate_num_, - probability_threshold_, - fuse_qkv, - true, // normalization_before - 0, // pos_offset - ActivationType::RELU, // act - false, // pos_bias - 1.0, // temperature - 1.0, // repeat_penalty - false, // prefix_lm - false, // is_mbart - 0, // min_length - inner_coeff); - - decoding_sampling_->forward(params, decoding_params); - - delete decoding_sampling_; - } else { - PD_THROW( - "Only beam_search, beam_search_v2 and sampling are supported for " - "FastGeneration. 
"); - } - delete[] params; - - return {output_ids, parent_ids, sequence_length}; -} - -std::vector DecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& trg_word, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const int64_t max_len, - const float beam_search_diversity_rate, - const float alpha) { - auto stream = input.stream(); - cublasSetStream(CublasHandle::GetInstance()->cublas_handle_, stream); - - std::vector ret; - - switch (input.type()) { - case paddle::DataType::FLOAT16: { - ret = decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - trg_word, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - beam_search_diversity_rate, - alpha, - stream); - break; - } - case paddle::DataType::FLOAT32: { - ret = decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - trg_word, - output_ids, 
- parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - beam_search_diversity_rate, - alpha, - stream); - break; - } - default: { - PD_THROW( - "NOT supported data type. " - "Only float16 and float32 are supported. "); - break; - } - } - - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.h b/paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.h deleted file mode 100644 index 99eef25111d6..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.h +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include -#include - -#include "cublas_handle.h" - -#include "fastertransformer/decoding_beamsearch.h" -#include "fastertransformer/decoding_sampling.h" -#include "fastertransformer/open_decoder.h" -#include "fastertransformer/utils/common.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - - -std::vector DecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& trg_word, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const int64_t max_len, - const float beam_search_diversity_rate, - const float alpha); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_gpt_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_gpt_op.cc deleted 
file mode 100644 index a32867451e9d..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_gpt_op.cc +++ /dev/null @@ -1,223 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "fusion_gpt_op.h" -#include "pd_traits.h" - - -std::vector GPT2Forward( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const paddle::Tensor& start_length, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& emb_weight, - const int& topk, - const float& topp, - const int& max_len, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - const bool& use_fp16 = false, - const int& tensor_para_size = 1, - const int& layer_para_size = 1, - const int& layer_para_batch_size = 1) { - int batch_size = input.shape()[0]; - int start_len = input.shape()[1]; - int total_len = max_len + start_len; - std::vector output_dims({total_len, batch_size}); - auto output_ids = paddle::Tensor(input.place(), output_dims); - - if (word_embedding.place() == paddle::PlaceType::kGPU) { - return GPT2CUDAForward(input, - attn_mask, - start_length, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - positional_embedding_weight, - emb_weight, - output_ids, - topk, - topp, - total_len, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - use_fp16, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - } else { - PD_THROW("Not implemented place. Only GPU is supported. 
"); - } -} - -std::vector> GPT2InferShape( - const std::vector& input_shape, - const std::vector& attn_mask_shape, - const std::vector& start_length, - const std::vector& word_embedding_shape, - const std::vector>& self_ln_weight_shapes, - const std::vector>& self_ln_bias_shapes, - const std::vector>& self_q_weight_shapes, - const std::vector>& self_q_bias_shapes, - const std::vector>& self_k_weight_shapes, - const std::vector>& self_k_bias_shapes, - const std::vector>& self_v_weight_shapes, - const std::vector>& self_v_bias_shapes, - const std::vector>& self_out_weight_shapes, - const std::vector>& self_out_bias_shapes, - const std::vector>& ffn_ln_weight_shapes, - const std::vector>& ffn_ln_bias_shapes, - const std::vector>& ffn_inter_weight_shapes, - const std::vector>& ffn_inter_bias_shapes, - const std::vector>& ffn_out_weight_shapes, - const std::vector>& ffn_out_bias_shapes, - const std::vector& decoder_ln_weight_shape, - const std::vector& decoder_ln_bias_shape, - const std::vector& positional_embedding_weight_shape, - const std::vector& emb_weight_shape, - const int& topk, - const float& topp, - const int& max_len, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - const bool& use_fp16 = false, - const int& tensor_para_size = 1, - const int& layer_para_size = 1, - const int& layer_para_batch_size = 1) { - int64_t batch_size = input_shape[0]; - int64_t start_len = input_shape[1]; - std::vector output_dims({max_len + start_len, batch_size}); - return {output_dims}; -} - -std::vector GPT2InferDtype( - const paddle::DataType& input_dtype, - const paddle::DataType& attn_mask_dtype, - const paddle::DataType& start_length_dtype, - const paddle::DataType& word_embedding_dtype, - const std::vector& self_ln_weight_dtype, - const std::vector& self_ln_bias_dtype, - const std::vector& self_q_weight_dtype, - const std::vector& self_q_bias_dtype, - const std::vector& self_k_weight_dtype, - const std::vector& self_k_bias_dtype, - const std::vector& self_v_weight_dtype, - const std::vector& self_v_bias_dtype, - const std::vector& self_out_weight_dtype, - const std::vector& self_out_bias_dtype, - const std::vector& ffn_ln_weight_dtype, - const std::vector& ffn_ln_bias_dtype, - const std::vector& ffn_inter_weight_dtype, - const std::vector& ffn_inter_bias_dtype, - const std::vector& ffn_out_weight_dtype, - const std::vector& ffn_out_bias_dtype, - const paddle::DataType& decoder_ln_weight_dtype, - const paddle::DataType& decoder_ln_bias_dtype, - const paddle::DataType& positional_embedding_weight_dtype, - const paddle::DataType& emb_weight_dtype) { - return {paddle::DataType::INT32}; -} - -PD_BUILD_OP(fusion_gpt) - .Inputs({"Input", - "AttentionMask", - "StartLength", - "WordEmbedding", - paddle::Vec("SelfLayernormWeight"), - paddle::Vec("SelfLayernormBias"), - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfQueryBias"), - paddle::Vec("SelfKeyWeight"), - paddle::Vec("SelfKeyBias"), - paddle::Vec("SelfValueWeight"), - paddle::Vec("SelfValueBias"), - paddle::Vec("SelfOutWeight"), - paddle::Vec("SelfOutBias"), - paddle::Vec("FFNLayernormWeight"), - paddle::Vec("FFNLayernormBias"), - paddle::Vec("FFNInterWeight"), - paddle::Vec("FFNInterBias"), - paddle::Vec("FFNOutWeight"), - paddle::Vec("FFNOutBias"), - "DecoderLayernormWeight", - "DecoderLayernormBias", - "PositionEncEmb", - "EmbWeight"}) - .Outputs({"OutputIds"}) - .Attrs({"topk: int", - "topp: float", - "max_len: int", - "n_head: int", - "size_per_head: 
int", - "num_layer: int", - "bos_id: int", - "eos_id: int", - "temperature: float", - "use_fp16: bool", - "tensor_para_size: int", - "layer_para_size: int", - "layer_para_batch_size: int"}) - .SetKernelFn(PD_KERNEL(GPT2Forward)) - .SetInferShapeFn(PD_INFER_SHAPE(GPT2InferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(GPT2InferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_gpt_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_gpt_op.cu deleted file mode 100644 index 29bc57747f04..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_gpt_op.cu +++ /dev/null @@ -1,378 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// TODO(guosheng): `HOST` conflict exists in float.h of paddle and mpi.h of mpi -#include "fusion_gpt_op.h" -#include "pd_traits.h" -#ifdef HOST -#undef HOST -#endif - -#include "fastertransformer/gpt.h" -#include "fastertransformer/utils/common.h" - -#ifdef BUILD_GPT // consistent with FasterTransformer -#include "parallel_utils.h" -#endif - -template -std::vector gpt2_kernel( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const paddle::Tensor& start_length, - const paddle::Tensor& word_emb, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& emb_weight, - paddle::Tensor& output_ids, - const int& topk, - const float& topp, - const int& max_len, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - cublasHandle_t cublas_handle_, - cublasLtHandle_t cublaslt_handle_, - cudaStream_t stream, - const int& tensor_para_size = 1, - const int& layer_para_size = 1, - const int& layer_para_batch_size = 1) { - auto input_dims = input.shape(); - int batch_size_ = input_dims[0]; - int start_len = input_dims[1]; - const int vocab_size = word_emb.shape()[0]; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - DecodingInitParam decoding_params; - decoding_params.cublas_handle = cublas_handle_; - decoding_params.cublaslt_handle = cublaslt_handle_; - - decoding_params.output_ids = output_ids.mutable_data(word_emb.place()); - - typedef 
DecoderTransformerTraits DecodingTraits_; - decoding_params.stream = stream; - fastertransformer::Allocator allocator_(stream); - - const int hidden_unit = size_per_head * n_head; - -#ifdef BUILD_GPT - auto* model_para_desc = ModelParaDescFactory::CreateModelParaDesc( - n_head, - size_per_head, - num_layer, - tensor_para_size, - layer_para_size, - layer_para_batch_size, - const_cast(word_emb.data())); - auto& tensor_parallel_param = model_para_desc->tensor_parallel_param; - auto& layer_parallel_param = model_para_desc->layer_parallel_param; - auto seed = model_para_desc->dist(model_para_desc->gen); -#else - TensorParallelParam tensor_parallel_param; - LayerParallelParam layer_parallel_param; - tensor_parallel_param.rank = 0; - tensor_parallel_param.world_size = 1; - tensor_parallel_param.local_head_num_ = n_head; - tensor_parallel_param.local_hidden_units_ = hidden_unit; - - layer_parallel_param.rank = 0; - layer_parallel_param.world_size = 1; - layer_parallel_param.layers_per_group = num_layer; - layer_parallel_param.local_batch_size = batch_size_; - int seed = -1; -#endif - - DecodingGpt* gpt_decoding; - - decoding_params.request_batch_size = batch_size_; - decoding_params.max_input_len = start_len; - decoding_params.request_input_len = start_len; - decoding_params.request_output_len = max_len - start_len; - - decoding_params.d_start_ids = const_cast(input.data()); - decoding_params.d_attn_mask = - reinterpret_cast(const_cast(attn_mask.data())); - decoding_params.d_start_lengths = start_length.data(); - - gpt_decoding = - new DecodingGpt(allocator_, - batch_size_, - max_len, - n_head, - size_per_head, - vocab_size, - num_layer, - bos_id, - eos_id, - topk, - topp, - temperature, - tensor_para_size, - layer_para_size, - true, /*is_fuse_QKV*/ - 1.0, /*repetition_penalty*/ - seed); - - gpt_decoding->set_tensor_parallel_param(tensor_parallel_param); - gpt_decoding->set_layer_parallel_param(layer_parallel_param); - - DecoderInitParam* params = - new DecoderInitParam[num_layer]; - - for (int i = 0; i < self_ln_weight.size(); ++i) { - // Allow python passing weights of all layers or only passing the - // corresponding layers to save memory. - int layer_idx = self_ln_weight.size() != num_layer - ? layer_parallel_param.rank * - layer_parallel_param.layers_per_group + - i - : i; - - params[layer_idx].stream = stream; - params[layer_idx].cublas_handle = cublas_handle_; - params[layer_idx].cublaslt_handle = cublaslt_handle_; - - params[layer_idx].request_batch_size = batch_size_; - params[layer_idx].request_max_mem_seq_len = start_len; - - params[layer_idx].self_layernorm.gamma = - reinterpret_cast(self_ln_weight[i].data()); - params[layer_idx].self_layernorm.beta = - reinterpret_cast(self_ln_bias[i].data()); - - params[layer_idx].self_attention.query_weight.kernel = - reinterpret_cast(self_q_weight[i].data()); - params[layer_idx].self_attention.query_weight.bias = - reinterpret_cast(self_q_bias[i].data()); - // For `is_fuse_QKV == true`, ignore weight and bias of key and value to - // remove requirements on python passing weights to save memory. 
- // params[layer_idx].self_attention.key_weight.kernel = - // reinterpret_cast(self_k_weight[i].data()); - // params[layer_idx].self_attention.key_weight.bias = - // reinterpret_cast(self_k_bias[i].data()); - // params[layer_idx].self_attention.value_weight.kernel = - // reinterpret_cast(self_v_weight[i].data()); - // params[layer_idx].self_attention.value_weight.bias = - // reinterpret_cast(self_v_bias[i].data()); - - params[layer_idx].self_attention.attention_output_weight.kernel = - reinterpret_cast(self_out_weight[i].data()); - params[layer_idx].self_attention.attention_output_weight.bias = - reinterpret_cast(self_out_bias[i].data()); - - params[layer_idx].ffn_layernorm.gamma = - reinterpret_cast(ffn_ln_weight[i].data()); - params[layer_idx].ffn_layernorm.beta = - reinterpret_cast(ffn_ln_bias[i].data()); - - params[layer_idx].ffn.intermediate_weight.kernel = - reinterpret_cast(ffn_inter_weight[i].data()); - params[layer_idx].ffn.intermediate_weight.bias = - reinterpret_cast(ffn_inter_bias[i].data()); - params[layer_idx].ffn.output_weight.kernel = - reinterpret_cast(ffn_out_weight[i].data()); - params[layer_idx].ffn.output_weight.bias = - reinterpret_cast(ffn_out_bias[i].data()); - } - - decoding_params.layernorm.gamma = - reinterpret_cast(decoder_ln_weight.data()); - decoding_params.layernorm.beta = - reinterpret_cast(decoder_ln_bias.data()); - decoding_params.embedding_table = - reinterpret_cast(word_emb.data()); - decoding_params.embedding_kernel = - reinterpret_cast(emb_weight.data()); - decoding_params.position_encoding_table = reinterpret_cast( - positional_embedding_weight.data()); - - gpt_decoding->forward_context(params, decoding_params); - gpt_decoding->forward(params, decoding_params); - - delete gpt_decoding; - delete[] params; - - return {output_ids}; -} - -std::vector GPT2CUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const paddle::Tensor& start_length, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& emb_weight, - paddle::Tensor& output_ids, - const int& topk, - const float& topp, - const int& max_len, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - const bool& use_fp16 = false, - const int& tensor_para_size = 1, - const int& layer_para_size = 1, - const int& layer_para_batch_size = 1) { - auto stream = word_embedding.stream(); - // TODO(guosheng): use the global cublas handle - cublasHandle_t cublas_handle_; - cublasCreate(&cublas_handle_); - cublasLtHandle_t cublaslt_handle_; - cublasLtCreate(&cublaslt_handle_); - cublasSetStream(cublas_handle_, stream); - - std::vector ret; - - if (use_fp16) { - ret = gpt2_kernel(input, - attn_mask, - start_length, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - 
self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - positional_embedding_weight, - emb_weight, - output_ids, - topk, - topp, - max_len, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - cublas_handle_, - cublaslt_handle_, - stream, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - } else { - ret = gpt2_kernel(input, - attn_mask, - start_length, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - positional_embedding_weight, - emb_weight, - output_ids, - topk, - topp, - max_len, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - cublas_handle_, - cublaslt_handle_, - stream, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - } - - cublasDestroy(cublas_handle_); - cublasLtDestroy(cublaslt_handle_); - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_gpt_op.h b/paddlenlp/ops/fast_transformer/src/fusion_gpt_op.h deleted file mode 100644 index 75394d5a8ee2..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_gpt_op.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-// #include "fastertransformer/gpt.h"
-// #include "fastertransformer/open_decoder.h"
-// #include "fastertransformer/utils/common.h"
-
-#ifdef PADDLE_ON_INFERENCE
-#include "paddle/extension.h"
-#include "paddle_inference_api.h"
-#include "paddle/common/exception.h"
-#else
-#include "paddle/extension.h"
-#endif
-
-
-std::vector<paddle::Tensor> GPT2CUDAForward(
-    const paddle::Tensor& input,
-    const paddle::Tensor& attn_mask,
-    const paddle::Tensor& start_length,
-    const paddle::Tensor& word_embedding,
-    const std::vector<paddle::Tensor>& self_ln_weight,
-    const std::vector<paddle::Tensor>& self_ln_bias,
-    const std::vector<paddle::Tensor>& self_q_weight,
-    const std::vector<paddle::Tensor>& self_q_bias,
-    const std::vector<paddle::Tensor>& self_k_weight,
-    const std::vector<paddle::Tensor>& self_k_bias,
-    const std::vector<paddle::Tensor>& self_v_weight,
-    const std::vector<paddle::Tensor>& self_v_bias,
-    const std::vector<paddle::Tensor>& self_out_weight,
-    const std::vector<paddle::Tensor>& self_out_bias,
-    const std::vector<paddle::Tensor>& ffn_ln_weight,
-    const std::vector<paddle::Tensor>& ffn_ln_bias,
-    const std::vector<paddle::Tensor>& ffn_inter_weight,
-    const std::vector<paddle::Tensor>& ffn_inter_bias,
-    const std::vector<paddle::Tensor>& ffn_out_weight,
-    const std::vector<paddle::Tensor>& ffn_out_bias,
-    const paddle::Tensor& decoder_ln_weight,
-    const paddle::Tensor& decoder_ln_bias,
-    const paddle::Tensor& positional_embedding_weight,
-    const paddle::Tensor& emb_weight,
-    paddle::Tensor& output_ids,
-    const int& topk,
-    const float& topp,
-    const int& max_len,
-    const int& n_head,
-    const int& size_per_head,
-    const int& num_layer,
-    const int& bos_id,
-    const int& eos_id,
-    const float& temperature,
-    const bool& use_fp16,
-    const int& tensor_para_size,
-    const int& layer_para_size,
-    const int& layer_para_batch_size);
diff --git a/paddlenlp/ops/fast_transformer/src/fusion_gptj_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_gptj_op.cc
deleted file mode 100644
index 81d5411eb4be..000000000000
--- a/paddlenlp/ops/fast_transformer/src/fusion_gptj_op.cc
+++ /dev/null
@@ -1,203 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include <vector>
-
-#include "fusion_gptj_op.h"
-#include "pd_traits.h"
-
-
-std::vector<paddle::Tensor> GPTJForward(
-    const paddle::Tensor& input,
-    const paddle::Tensor& attn_mask,
-    const paddle::Tensor& start_length,
-    const paddle::Tensor& word_embedding,
-    const std::vector<paddle::Tensor>& self_ln_weight,
-    const std::vector<paddle::Tensor>& self_ln_bias,
-    const std::vector<paddle::Tensor>& self_q_weight,
-    const std::vector<paddle::Tensor>& self_out_weight,
-    const std::vector<paddle::Tensor>& ffn_inter_weight,
-    const std::vector<paddle::Tensor>& ffn_inter_bias,
-    const std::vector<paddle::Tensor>& ffn_out_weight,
-    const std::vector<paddle::Tensor>& ffn_out_bias,
-    const paddle::Tensor& decoder_ln_weight,
-    const paddle::Tensor& decoder_ln_bias,
-    const paddle::Tensor& emb_weight,
-    const paddle::Tensor& emb_bias,
-    const int topk,
-    const float topp,
-    const int max_len,
-    const int n_head,
-    const int size_per_head,
-    const int num_layer,
-    const int bos_id,
-    const int eos_id,
-    const float temperature,
-    const int rotary_embedding_dim,
-    const float repetition_penalty,
-    const int min_length,
-    const bool use_fp16 = false,
-    const int tensor_para_size = 1,
-    const int layer_para_size = 1,
-    const int layer_para_batch_size = 1) {
-  int batch_size = input.shape()[0];
-  int start_len = input.shape()[1];
-  int total_len = max_len + start_len;
-  std::vector<int64_t> output_dims({total_len, batch_size});
-
-#ifdef PADDLE_NEW_ALLOCATOR
-  // For PaddlePaddle>=2.3.0
-  auto output_ids = paddle::empty(output_dims, paddle::DataType::INT32, input.place());
-  auto gpu_place = paddle::GPUPlace();
-#else
-  auto output_ids = paddle::Tensor(input.place(), output_dims);
-  auto gpu_place = paddle::PlaceType::kGPU;
-#endif
-
-  if (word_embedding.place() == gpu_place) {
-    return GPTJCUDAForward(input,
-                           attn_mask,
-                           start_length,
-                           word_embedding,
-                           self_ln_weight,
-                           self_ln_bias,
-                           self_q_weight,
-                           self_out_weight,
-                           ffn_inter_weight,
-                           ffn_inter_bias,
-                           ffn_out_weight,
-                           ffn_out_bias,
-                           decoder_ln_weight,
-                           decoder_ln_bias,
-                           emb_weight,
-                           emb_bias,
-                           output_ids,
-                           topk,
-                           topp,
-                           total_len,
-                           n_head,
-                           size_per_head,
-                           num_layer,
-                           bos_id,
-                           eos_id,
-                           temperature,
-                           rotary_embedding_dim,
-                           repetition_penalty,
-                           min_length,
-                           use_fp16,
-                           tensor_para_size,
-                           layer_para_size,
-                           layer_para_batch_size);
-  } else {
-    PD_THROW("Not implemented place. Only GPU is supported. 
"); - } -} - -std::vector> GPTJInferShape( - const std::vector& input_shape, - const std::vector& attn_mask_shape, - const std::vector& start_length, - const std::vector& word_embedding_shape, - const std::vector>& self_ln_weight_shapes, - const std::vector>& self_ln_bias_shapes, - const std::vector>& self_q_weight_shapes, - const std::vector>& self_out_weight_shapes, - const std::vector>& ffn_inter_weight_shapes, - const std::vector>& ffn_inter_bias_shapes, - const std::vector>& ffn_out_weight_shapes, - const std::vector>& ffn_out_bias_shapes, - const std::vector& decoder_ln_weight_shape, - const std::vector& decoder_ln_bias_shape, - const std::vector& emb_weight_shape, - const std::vector& emb_bias_shape, - const int topk, - const float topp, - const int max_len, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const float temperature, - const int rotary_embedding_dim, - const float repetition_penalty, - const int min_length, - const bool use_fp16 = false, - const int tensor_para_size = 1, - const int layer_para_size = 1, - const int layer_para_batch_size = 1) { - int64_t batch_size = input_shape[0]; - int64_t start_len = input_shape[1]; - std::vector output_dims({max_len + start_len, batch_size}); - return {output_dims}; -} - -std::vector GPTJInferDtype( - const paddle::DataType& input_dtype, - const paddle::DataType& attn_mask_dtype, - const paddle::DataType& start_length_dtype, - const paddle::DataType& word_embedding_dtype, - const std::vector& self_ln_weight_dtype, - const std::vector& self_ln_bias_dtype, - const std::vector& self_q_weight_dtype, - const std::vector& self_out_weight_dtype, - const std::vector& ffn_inter_weight_dtype, - const std::vector& ffn_inter_bias_dtype, - const std::vector& ffn_out_weight_dtype, - const std::vector& ffn_out_bias_dtype, - const paddle::DataType& decoder_ln_weight_dtype, - const paddle::DataType& decoder_ln_bias_dtype, - const paddle::DataType& emb_weight_dtype, - const paddle::DataType& emb_bias_dtype) { - return {paddle::DataType::INT32}; -} - -PD_BUILD_OP(fusion_gptj) - .Inputs({"Input", - "AttentionMask", - "StartLength", - "WordEmbedding", - paddle::Vec("SelfLayernormWeight"), - paddle::Vec("SelfLayernormBias"), - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfOutWeight"), - paddle::Vec("FFNInterWeight"), - paddle::Vec("FFNInterBias"), - paddle::Vec("FFNOutWeight"), - paddle::Vec("FFNOutBias"), - "DecoderLayernormWeight", - "DecoderLayernormBias", - "EmbWeight", - "EmbBias"}) - .Outputs({"OutputIds"}) - .Attrs({"topk: int", - "topp: float", - "max_len: int", - "n_head: int", - "size_per_head: int", - "num_layer: int", - "bos_id: int", - "eos_id: int", - "temperature: float", - "rotary_embedding_dim: int", - "repetition_penalty: float", - "min_length: int", - "use_fp16: bool", - "tensor_para_size: int", - "layer_para_size: int", - "layer_para_batch_size: int"}) - .SetKernelFn(PD_KERNEL(GPTJForward)) - .SetInferShapeFn(PD_INFER_SHAPE(GPTJInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(GPTJInferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_gptj_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_gptj_op.cu deleted file mode 100644 index 25e28210796b..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_gptj_op.cu +++ /dev/null @@ -1,334 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// Use the global cublas handle -#include "cublas_handle.h" - -// TODO(guosheng): `HOST` conflict exists in float.h of paddle and mpi.h of mpi -#include "fusion_gptj_op.h" -#include "pd_traits.h" -#ifdef HOST -#undef HOST -#endif - -#include "fastertransformer/utils/common.h" - -#ifdef BUILD_GPT // consistent with FasterTransformer -#include "parallel_utils.h" -#endif - -template -std::vector gptj_kernel( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const paddle::Tensor& start_length, - const paddle::Tensor& word_emb, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_out_weight, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& emb_weight, - const paddle::Tensor& emb_bias, - paddle::Tensor& output_ids, - const int topk, - const float topp, - const int max_len, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const float temperature, - const int rotary_embedding_dim, - const float repetition_penalty, - const int min_length, - cudaStream_t stream, - const int tensor_para_size = 1, - const int layer_para_size = 1, - const int layer_para_batch_size = 1) { - auto input_dims = input.shape(); - int batch_size_ = input_dims[0]; - int start_len = input_dims[1]; - const int vocab_size = emb_bias.shape()[0]; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - DecodingInitParam decoding_params; - decoding_params.cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - decoding_params.cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - -#ifdef PADDLE_NEW_ALLOCATOR - // For PaddlePaddle>=2.3.0 - decoding_params.output_ids = output_ids.data(); -#else - decoding_params.output_ids = output_ids.mutable_data(word_emb.place()); -#endif - - typedef DecoderTransformerTraits DecodingTraits_; - decoding_params.stream = stream; - fastertransformer::Allocator allocator_(stream); - - const int hidden_unit = size_per_head * n_head; - -#ifdef BUILD_GPT - auto* model_para_desc = ModelParaDescFactory::CreateModelParaDesc( - n_head, - size_per_head, - num_layer, - tensor_para_size, - layer_para_size, - layer_para_batch_size, - const_cast(word_emb.data())); - auto& tensor_parallel_param = model_para_desc->tensor_parallel_param; - auto& layer_parallel_param = model_para_desc->layer_parallel_param; - auto seed = model_para_desc->dist(model_para_desc->gen); -#else - TensorParallelParam tensor_parallel_param; - LayerParallelParam layer_parallel_param; - tensor_parallel_param.rank = 0; - tensor_parallel_param.world_size = 1; - tensor_parallel_param.local_head_num_ = n_head; - tensor_parallel_param.local_hidden_units_ = hidden_unit; - - layer_parallel_param.rank = 0; - 
layer_parallel_param.world_size = 1; - layer_parallel_param.layers_per_group = num_layer; - layer_parallel_param.local_batch_size = batch_size_; - int seed = -1; -#endif - - DecodingGptJ* gptj_decoding; - - decoding_params.request_batch_size = batch_size_; - decoding_params.max_input_len = start_len; - decoding_params.request_input_len = start_len; - decoding_params.request_output_len = max_len - start_len; - - decoding_params.d_start_ids = const_cast(input.data()); - decoding_params.d_attn_mask = - reinterpret_cast(const_cast(attn_mask.data())); - decoding_params.d_start_lengths = start_length.data(); - - gptj_decoding = - new DecodingGptJ(allocator_, - batch_size_, - max_len, - n_head, - size_per_head, - vocab_size, - num_layer, - bos_id, - eos_id, - topk, - topp, - temperature, - tensor_para_size, - layer_para_size, - true, /*is_fuse_QKV*/ - repetition_penalty, /*repetition_penalty*/ - seed, - rotary_embedding_dim, - min_length); - - gptj_decoding->set_tensor_parallel_param(tensor_parallel_param); - gptj_decoding->set_layer_parallel_param(layer_parallel_param); - - DecoderInitParam* params = - new DecoderInitParam[num_layer]; - - for (int i = 0; i < self_ln_weight.size(); ++i) { - // Allow python passing weights of all layers or only passing the - // corresponding layers to save memory. - int layer_idx = self_ln_weight.size() != num_layer - ? layer_parallel_param.rank * - layer_parallel_param.layers_per_group + - i - : i; - - params[layer_idx].stream = stream; - params[layer_idx].cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - params[layer_idx].cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - - params[layer_idx].request_batch_size = batch_size_; - params[layer_idx].request_max_mem_seq_len = start_len; - - params[layer_idx].self_layernorm.gamma = - reinterpret_cast(self_ln_weight[i].data()); - params[layer_idx].self_layernorm.beta = - reinterpret_cast(self_ln_bias[i].data()); - - params[layer_idx].self_attention.query_weight.kernel = - reinterpret_cast(self_q_weight[i].data()); - params[layer_idx].self_attention.query_weight.bias = nullptr; - - params[layer_idx].self_attention.attention_output_weight.kernel = - reinterpret_cast(self_out_weight[i].data()); - params[layer_idx].self_attention.attention_output_weight.bias = nullptr; - - params[layer_idx].ffn.intermediate_weight.kernel = - reinterpret_cast(ffn_inter_weight[i].data()); - params[layer_idx].ffn.intermediate_weight.bias = - reinterpret_cast(ffn_inter_bias[i].data()); - params[layer_idx].ffn.output_weight.kernel = - reinterpret_cast(ffn_out_weight[i].data()); - params[layer_idx].ffn.output_weight.bias = - reinterpret_cast(ffn_out_bias[i].data()); - } - - decoding_params.layernorm.gamma = - reinterpret_cast(decoder_ln_weight.data()); - decoding_params.layernorm.beta = - reinterpret_cast(decoder_ln_bias.data()); - decoding_params.embedding_table = - reinterpret_cast(word_emb.data()); - decoding_params.embedding_kernel = - reinterpret_cast(emb_weight.data()); - decoding_params.embedding_bias = - reinterpret_cast(emb_bias.data()); - - gptj_decoding->forward_context(params, decoding_params); - gptj_decoding->forward(params, decoding_params); - - delete gptj_decoding; - delete[] params; - - return {output_ids}; -} - -std::vector GPTJCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const paddle::Tensor& start_length, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const 
std::vector& self_out_weight, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& emb_weight, - const paddle::Tensor& emb_bias, - paddle::Tensor& output_ids, - const int topk, - const float topp, - const int max_len, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const float temperature, - const int rotary_embedding_dim, - const float repetition_penalty, - const int min_length, - const bool use_fp16 = false, - const int tensor_para_size = 1, - const int layer_para_size = 1, - const int layer_para_batch_size = 1) { - - auto stream = word_embedding.stream(); - cublasSetStream(CublasHandle::GetInstance()->cublas_handle_, stream); - - if (use_fp16) { - return gptj_kernel(input, - attn_mask, - start_length, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_out_weight, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - emb_weight, - emb_bias, - output_ids, - topk, - topp, - max_len, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - rotary_embedding_dim, - repetition_penalty, - min_length, - stream, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - } else { - return gptj_kernel(input, - attn_mask, - start_length, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_out_weight, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - emb_weight, - emb_bias, - output_ids, - topk, - topp, - max_len, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - rotary_embedding_dim, - repetition_penalty, - min_length, - stream, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - } -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_gptj_op.h b/paddlenlp/ops/fast_transformer/src/fusion_gptj_op.h deleted file mode 100644 index 48c70553fd52..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_gptj_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include - -#include "fastertransformer/gptj.h" -#include "fastertransformer/open_decoder.h" -#include "fastertransformer/utils/common.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - - -std::vector GPTJCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const paddle::Tensor& start_length, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_out_weight, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& emb_weight, - const paddle::Tensor& emb_bias, - paddle::Tensor& output_ids, - const int topk, - const float topp, - const int max_len, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const float temperature, - const int rotary_embedding_dim, - const float repetition_penalty, - const int min_length, - const bool use_fp16, - const int tensor_para_size, - const int layer_para_size, - const int layer_para_batch_size); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.cc deleted file mode 100644 index 30ecb154c1f5..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.cc +++ /dev/null @@ -1,368 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include - -#include "fusion_mbart_decoding_op.h" -#include "pd_traits.h" - - -std::vector MBartDecodingForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& mbart_ln_weight, - const paddle::Tensor& mbart_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& trg_word, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const bool& rel_len, - const float& alpha, - const bool& early_stopping, - const std::string& hidden_act) { - int batch_size = input.shape()[0]; - int max_out_len = rel_len ? max_len + input.shape()[1] : max_len; - - std::vector output_dims; - std::vector parent_ids_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - batch_size /= beam_size; - output_dims = {max_out_len, batch_size, beam_size}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_out_len, batch_size, beam_size * 2}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - output_dims = {max_out_len, batch_size}; - parent_ids_dims = {1}; - } else { - PD_THROW("Not supported decoding strategy. 
"); - } - - if (input.place() == paddle::PlaceType::kGPU) { - auto output_ids = paddle::Tensor(paddle::PlaceType::kGPU, output_dims); - auto parent_ids = paddle::Tensor(paddle::PlaceType::kGPU, parent_ids_dims); - auto sequence_length = - paddle::Tensor(paddle::PlaceType::kGPU, sequence_length_dims); - - paddle::Tensor seq_len = paddle::Tensor(paddle::PlaceType::kGPU); - - if (mem_seq_len.place() != paddle::PlaceType::kGPU) { - seq_len = mem_seq_len.copy_to(paddle::PlaceType::kGPU); - } else { - seq_len = mem_seq_len; - } - - return MBartDecodingCUDAForward(input, - seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - mbart_ln_weight, - mbart_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - trg_word, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - max_out_len, - beam_search_diversity_rate, - alpha, - early_stopping, - hidden_act); - } else { - PD_THROW("Not implemented place. Only GPU is supported. "); - } -} - -std::vector> MBartDecodingInferShape( - const std::vector& input_shape, - const std::vector& mem_seq_len_shape, - const std::vector& word_embedding_shape, - const std::vector>& self_ln_weight_shapes, - const std::vector>& self_ln_bias_shapes, - const std::vector>& self_q_weight_shapes, - const std::vector>& self_q_bias_shapes, - const std::vector>& self_k_weight_shapes, - const std::vector>& self_k_bias_shapes, - const std::vector>& self_v_weight_shapes, - const std::vector>& self_v_bias_shapes, - const std::vector>& self_out_weight_shapes, - const std::vector>& self_out_bias_shapes, - const std::vector>& cross_ln_weight_shapes, - const std::vector>& cross_ln_bias_shapes, - const std::vector>& cross_q_weight_shapes, - const std::vector>& cross_q_bias_shapes, - const std::vector>& cross_k_weight_shapes, - const std::vector>& cross_k_bias_shapes, - const std::vector>& cross_v_weight_shapes, - const std::vector>& cross_v_bias_shapes, - const std::vector>& cross_out_weight_shapes, - const std::vector>& cross_out_bias_shapes, - const std::vector>& ffn_ln_weight_shapes, - const std::vector>& ffn_ln_bias_shapes, - const std::vector>& ffn_inter_weight_shapes, - const std::vector>& ffn_inter_bias_shapes, - const std::vector>& ffn_out_weight_shapes, - const std::vector>& ffn_out_bias_shapes, - const std::vector& decoder_ln_weight_shape, - const std::vector& decoder_ln_bias_shape, - const std::vector& mbart_ln_weight_shape, - const std::vector& mbart_ln_bias_shape, - const std::vector& embedding_weight_shape, - const std::vector& embedding_bias_shape, - const std::vector& positional_embedding_weight_shape, - const std::vector& trg_word_shape, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const bool& rel_len, - const 
float& alpha, - const bool& early_stopping, - const std::string& hidden_act) { - int batch_size = input_shape[0]; - - std::vector output_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - if (batch_size != -1) { - batch_size /= beam_size; - } - output_dims = {max_len, batch_size, beam_size}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_len, batch_size, beam_size * 2}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - output_dims = {max_len, batch_size}; - return {output_dims, {1}, sequence_length_dims}; - } else { - PD_THROW("Not supported decoding strategy. "); - } -} - -std::vector MBartDecodingInferDtype( - const paddle::DataType& input, - const paddle::DataType& mem_seq_len, - const paddle::DataType& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::DataType& decoder_ln_weight, - const paddle::DataType& decoder_ln_bias, - const paddle::DataType& mbart_ln_weight, - const paddle::DataType& mbart_ln_bias, - const paddle::DataType& embedding_weight, - const paddle::DataType& embedding_bias, - const paddle::DataType& positional_embedding_weight, - const paddle::DataType& trg_word) { - return {paddle::DataType::INT32, - paddle::DataType::INT32, - paddle::DataType::INT32}; -} - -PD_BUILD_OP(fusion_mbart_decoding) - .Inputs({"Input", - "MemSeqLen", - "WordEmbedding", - paddle::Vec("SelfLayernormWeight"), - paddle::Vec("SelfLayernormBias"), - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfQueryBias"), - paddle::Vec("SelfKeyWeight"), - paddle::Vec("SelfKeyBias"), - paddle::Vec("SelfValueWeight"), - paddle::Vec("SelfValueBias"), - paddle::Vec("SelfOutWeight"), - paddle::Vec("SelfOutBias"), - paddle::Vec("CrossLayernormWeight"), - paddle::Vec("CrossLayernormBias"), - paddle::Vec("CrossQueryWeight"), - paddle::Vec("CrossQueryBias"), - paddle::Vec("CrossKeyWeight"), - paddle::Vec("CrossKeyBias"), - paddle::Vec("CrossValueWeight"), - paddle::Vec("CrossValueBias"), - paddle::Vec("CrossOutWeight"), - paddle::Vec("CrossOutBias"), - paddle::Vec("FFNLayernormWeight"), - paddle::Vec("FFNLayernormBias"), 
- paddle::Vec("FFNInterWeight"), - paddle::Vec("FFNInterBias"), - paddle::Vec("FFNOutWeight"), - paddle::Vec("FFNOutBias"), - "DecoderLayernormWeight", - "DecoderLayernormBias", - "MBARTLayernormWeight", - "MBARTLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb", - "TrgWord"}) - .Outputs({"OutputIds", "ParentIds", "SequenceLength"}) - .Attrs({ - "decoding_strategy: std::string", - "beam_size: int", - "topk: int", - "topp: float", - "n_head: int", - "size_per_head: int", - "num_layer: int", - "bos_id: int", - "eos_id: int", - "temperature: float", - "max_len: int64_t", - "beam_search_diversity_rate: float", - "rel_len: bool", - "alpha: float", - "early_stopping: bool", - "hidden_act: std::string", - }) - .SetKernelFn(PD_KERNEL(MBartDecodingForward)) - .SetInferShapeFn(PD_INFER_SHAPE(MBartDecodingInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(MBartDecodingInferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.cu deleted file mode 100644 index 08da17ce7d7f..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.cu +++ /dev/null @@ -1,596 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cublas_handle.h" - -#include "fusion_mbart_decoding_op.h" -#include "pd_traits.h" - - -__global__ void get_trg_length_mbart(const int* trg_word, - int* trg_length, - const int seq_len, - const int pad_id) { - int bid = threadIdx.x; - - int cnt_nonpads = 0; - for (int i = 0; i < seq_len; ++i) { - if (pad_id != trg_word[bid * seq_len + i]) { - cnt_nonpads++; - } else { - break; - } - } - trg_length[bid] = cnt_nonpads; -} - -template -std::vector mbart_decoding_kernel( - const paddle::Tensor& input, - const paddle::Tensor& memory_sequence_length, - const paddle::Tensor& word_emb, - const std::vector& self_layernorm_weight, - const std::vector& self_layernorm_bias, - const std::vector& self_attn_query_weight, - const std::vector& self_attn_query_bias, - const std::vector& self_attn_key_weight, - const std::vector& self_attn_key_bias, - const std::vector& self_attn_value_weight, - const std::vector& self_attn_value_bias, - const std::vector& self_attn_output_weight, - const std::vector& self_attn_output_bias, - const std::vector& cross_layernorm_weight, - const std::vector& cross_layernorm_bias, - const std::vector& cross_attn_query_weight, - const std::vector& cross_attn_query_bias, - const std::vector& cross_attn_key_weight, - const std::vector& cross_attn_key_bias, - const std::vector& cross_attn_value_weight, - const std::vector& cross_attn_value_bias, - const std::vector& cross_attn_output_weight, - const std::vector& cross_attn_output_bias, - const std::vector& ffn_layernorm_weight, - const std::vector& ffn_layernorm_bias, - const std::vector& ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - const paddle::Tensor& decoder_layernorm_weight, - const paddle::Tensor& decoder_layernorm_bias, - const paddle::Tensor& mbart_layernorm_weight, - const paddle::Tensor& mbart_layernorm_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& position_encoding_table, - const paddle::Tensor& trg_word, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& head_num_, - const int& size_per_head_, - const int& num_layer_, - const int& start_id_, - const int& end_id_, - const float& temperature, - const int64_t& max_seq_len_, - const float& beam_search_diversity_rate_, - const float& alpha, - const bool& early_stopping, - const std::string& hidden_act, - cudaStream_t stream) { - int beam_width_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? beam_size - : 1; - int candidate_num_ = (decoding_strategy == "sampling") ? topk : 1; - float probability_threshold_ = (decoding_strategy == "sampling") ? topp : 0.0; - - auto input_dims = input.shape(); - int batch_size_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? 
input_dims[0] / beam_width_ - : input_dims[0]; - const int memory_max_seq_len = input_dims[1]; - const int memory_hidden_dim = input_dims[2]; - const int vocab_size = word_emb.shape()[0]; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - DecodingInitParam decoding_params; - decoding_params.cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - decoding_params.cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - - decoding_params.output_ids = output_ids.mutable_data(input.place()); - decoding_params.parent_ids = parent_ids.mutable_data(input.place()); - decoding_params.sequence_length = - sequence_length.mutable_data(input.place()); - - typedef DecoderTransformerTraits DecodingTraits_; - decoding_params.stream = stream; - fastertransformer::Allocator allocator_(stream); - - decoding_params.memory_tensor = - reinterpret_cast(input.data()); - decoding_params.memory_sequence_length = memory_sequence_length.data(); - - auto trg_word_shape = trg_word.shape(); - int trg_max_len = - (trg_word_shape.size() == 2) ? static_cast(trg_word_shape[1]) : 0; - - paddle::Tensor trg_length = - (trg_word_shape.size() == 2 && trg_word_shape[0] != 0) - ? paddle::Tensor(paddle::PlaceType::kGPU, {trg_word_shape[0]}) - : paddle::Tensor(paddle::PlaceType::kGPU, {1}); - auto trg_length_ptr = trg_length.mutable_data(input.place()); - - if (trg_word_shape.size() == 2 && trg_word_shape[0] != 0) { - decoding_params.trg_word = trg_word.data(); - - get_trg_length_mbart<<<1, trg_word_shape[0], 0, stream>>>( - decoding_params.trg_word, trg_length_ptr, trg_max_len, start_id_); - decoding_params.trg_length = trg_length_ptr; - } - - DecoderInitParam* params = - new DecoderInitParam[num_layer_]; - - for (int i = 0; i < num_layer_; i++) { - params[i].stream = stream; - params[i].cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - params[i].cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - - if (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - params[i].request_batch_size = batch_size_ * beam_width_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } else if (decoding_strategy == "sampling" || - decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - params[i].request_batch_size = batch_size_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } - - // self attn - params[i].self_layernorm.gamma = reinterpret_cast( - self_layernorm_weight[i].data()); - params[i].self_layernorm.beta = reinterpret_cast( - self_layernorm_bias[i].data()); - // query - params[i].self_attention.query_weight.kernel = - reinterpret_cast( - self_attn_query_weight[i].data()); - params[i].self_attention.query_weight.bias = - reinterpret_cast( - self_attn_query_bias[i].data()); - // key - params[i].self_attention.key_weight.kernel = - reinterpret_cast( - self_attn_key_weight[i].data()); - params[i].self_attention.key_weight.bias = - reinterpret_cast( - self_attn_key_bias[i].data()); - // value - params[i].self_attention.value_weight.kernel = - reinterpret_cast( - self_attn_value_weight[i].data()); - params[i].self_attention.value_weight.bias = - reinterpret_cast( - self_attn_value_bias[i].data()); - // out proj - params[i].self_attention.attention_output_weight.kernel = - reinterpret_cast( - self_attn_output_weight[i].data()); - params[i].self_attention.attention_output_weight.bias = - reinterpret_cast( - 
self_attn_output_bias[i].data()); - - // cross - params[i].cross_layernorm.gamma = reinterpret_cast( - cross_layernorm_weight[i].data()); - params[i].cross_layernorm.beta = reinterpret_cast( - cross_layernorm_bias[i].data()); - // query - params[i].cross_attention.query_weight.kernel = - reinterpret_cast( - cross_attn_query_weight[i].data()); - params[i].cross_attention.query_weight.bias = - reinterpret_cast( - cross_attn_query_bias[i].data()); - // key - params[i].cross_attention.key_weight.kernel = - reinterpret_cast( - cross_attn_key_weight[i].data()); - params[i].cross_attention.key_weight.bias = - reinterpret_cast( - cross_attn_key_bias[i].data()); - // value - params[i].cross_attention.value_weight.kernel = - reinterpret_cast( - cross_attn_value_weight[i].data()); - params[i].cross_attention.value_weight.bias = - reinterpret_cast( - cross_attn_value_bias[i].data()); - // out proj - params[i].cross_attention.attention_output_weight.kernel = - reinterpret_cast( - cross_attn_output_weight[i].data()); - params[i].cross_attention.attention_output_weight.bias = - reinterpret_cast( - cross_attn_output_bias[i].data()); - - // ffn - params[i].ffn_layernorm.gamma = reinterpret_cast( - ffn_layernorm_weight[i].data()); - params[i].ffn_layernorm.beta = reinterpret_cast( - ffn_layernorm_bias[i].data()); - // intermediate proj - params[i].ffn.intermediate_weight.kernel = - reinterpret_cast( - ffn_intermediate_weight[i].data()); - params[i].ffn.intermediate_weight.bias = reinterpret_cast( - ffn_intermediate_bias[i].data()); - // out proj - params[i].ffn.output_weight.kernel = reinterpret_cast( - ffn_output_weight[i].data()); - params[i].ffn.output_weight.bias = - reinterpret_cast(ffn_output_bias[i].data()); - } - - decoding_params.layernorm.gamma = reinterpret_cast( - decoder_layernorm_weight.data()); - decoding_params.layernorm.beta = reinterpret_cast( - decoder_layernorm_bias.data()); - - // for mbart embedding layernorm - decoding_params.mbart_layernorm.gamma = reinterpret_cast( - mbart_layernorm_weight.data()); - decoding_params.mbart_layernorm.beta = - reinterpret_cast(mbart_layernorm_bias.data()); - - // for embedding - decoding_params.embedding_table = - reinterpret_cast(word_emb.data()); - - // for weight sharing matmul - decoding_params.embedding_kernel = - reinterpret_cast(embedding_weight.data()); - // for matmul bias - decoding_params.embedding_bias = - reinterpret_cast(embedding_bias.data()); - - decoding_params.position_encoding_table = reinterpret_cast( - position_encoding_table.data()); - - int finished_candidate_num_ = - ("beam_search_v3" == decoding_strategy) ? beam_width_ : beam_width_ * 2; - - ActivationType activate = - (hidden_act == "gelu") ? 
ActivationType::GELU : ActivationType::RELU; - - if ("beam_search" == decoding_strategy) { - DecodingBeamsearch* decoding_beam_search_; - decoding_beam_search_ = new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, /*is_fuse_topk_softMax*/ - false, /*is_fuse_qkv*/ - false, /*keep_alive_beam*/ - alpha, /*alpha not used for this case*/ - true, - 2, /*pos_offset BART and MBART only for now*/ - activate, - false, // pos_bias - false /*prefix_lm*/, - -1, /*finished_candidate_num*/ - false, /*early_stopping*/ - true /*is_mbart */); - - decoding_beam_search_->forward(params, decoding_params); - - delete decoding_beam_search_; - } else if ("beam_search_v2" == decoding_strategy || - "beam_search_v3" == decoding_strategy) { - DecodingBeamsearch* decoding_beam_search_; - decoding_beam_search_ = new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, // is_fuse_topk_softMax - false, // is_fuse_qkv - true, // keep_alive_beam - alpha, - true, - 2, /*pos_offset BART and MBART only for now*/ - activate, - false, // pos_bias - false /*prefix_lm*/, - finished_candidate_num_, /*finished_candidate_num*/ - early_stopping, /*early_stopping*/ - true /*is_mbart */); - - decoding_beam_search_->forward(params, decoding_params); - - delete decoding_beam_search_; - } else if ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || - "sampling" == decoding_strategy) { - DecodingSampling* decoding_sampling_; - decoding_sampling_ = new DecodingSampling( - allocator_, - batch_size_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - candidate_num_, - probability_threshold_, - false, /*is_fuse_qkv*/ - true, - 2, /*pos_offset BART and MBART only for now*/ - activate, - false, // pos_bias - temperature, // temperature - 1.0, // repeat_penalty - false, // prefix_lm - true /*is_mbart */); - - decoding_sampling_->forward(params, decoding_params); - - delete decoding_sampling_; - } else { - PD_THROW( - "Only beam_search, beam_search_v2 and sampling are supported for " - "FastGeneration. 
"); - } - delete[] params; - - return {output_ids, parent_ids, sequence_length}; -} - -std::vector MBartDecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& mbart_ln_weight, - const paddle::Tensor& mbart_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& trg_word, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const float& alpha, - const bool& early_stopping, - const std::string& hidden_act) { - auto stream = input.stream(); - cublasSetStream(CublasHandle::GetInstance()->cublas_handle_, stream); - - std::vector ret; - - switch (input.type()) { - case paddle::DataType::FLOAT16: { - ret = mbart_decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - mbart_ln_weight, - mbart_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - trg_word, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - max_len, - beam_search_diversity_rate, - alpha, - early_stopping, - hidden_act, - stream); - break; - } - case paddle::DataType::FLOAT32: { - ret = mbart_decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - 
cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - mbart_ln_weight, - mbart_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - trg_word, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - max_len, - beam_search_diversity_rate, - alpha, - early_stopping, - hidden_act, - stream); - break; - } - default: { - PD_THROW( - "NOT supported data type. " - "Only float16 and float32 are supported. "); - break; - } - } - - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.h b/paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.h deleted file mode 100644 index cf21beea10f0..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.h +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include -#include - -#include "fastertransformer/decoding_beamsearch.h" -#include "fastertransformer/decoding_sampling.h" -#include "fastertransformer/open_decoder.h" -#include "fastertransformer/utils/common.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - - -std::vector MBartDecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& mbart_ln_weight, - const paddle::Tensor& mbart_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& trg_word, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& 
sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& head_num_, - const int& size_per_head_, - const int& num_layer_, - const int& start_id_, - const int& end_id_, - const float& temperature, - const int64_t& max_seq_len_, - const float& beam_search_diversity_rate_, - const float& alpha, - const bool& early_stopping, - const std::string& hidden_act); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_miro_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_miro_op.cc deleted file mode 100644 index 0f5a1a2221c8..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_miro_op.cc +++ /dev/null @@ -1,427 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include - -#include "fusion_miro_op.h" -#include "pd_traits.h" - - -std::vector MIROForward( - const paddle::Tensor& input_ids, - const paddle::Tensor& attn_mask, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& type_id, - const paddle::Tensor& decoder_type_id, - const paddle::Tensor& logits_mask, - const paddle::Tensor& word_embedding, - const paddle::Tensor& pre_decoder_ln_weight, - const paddle::Tensor& pre_decoder_ln_bias, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& trans_weight, - const paddle::Tensor& trans_bias, - const paddle::Tensor& lm_ln_weight, - const paddle::Tensor& lm_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& type_embedding_weight, - const paddle::Tensor& role_id, - const paddle::Tensor& decoder_role_id, - const paddle::Tensor& role_embedding_table, - const paddle::Tensor& position_ids, - const paddle::Tensor& decoder_position_ids, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const int& unk_id, - const int& mask_id, - const float& temperature, - const float& len_penalty, - const bool& normalize_before, - const bool& pos_bias, - const std::string& hidden_act, - const bool& rel_len, - const bool& early_stopping, - const int& min_length, - const int& tensor_para_size, - const int& 
layer_para_size, - const int& layer_para_batch_size) { - int batch_size = input_ids.shape()[0]; - int max_out_len = rel_len ? max_len + input_ids.shape()[1] : max_len; - - std::vector output_ids_dims; - std::vector output_scores_dims; - std::vector parent_ids_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - if (batch_size != -1) { - batch_size /= beam_size; - } - output_ids_dims = {max_out_len, batch_size, beam_size}; - output_scores_dims = {batch_size, beam_size}; - parent_ids_dims = output_ids_dims; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_ids_dims = {max_out_len, batch_size, beam_size * 2}; - output_scores_dims = {batch_size, beam_size * 2}; - parent_ids_dims = output_ids_dims; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling" || - decoding_strategy == "sampling") { - output_ids_dims = {max_out_len, batch_size}; - output_scores_dims = {batch_size}; - parent_ids_dims = {1}; - } else { - PD_THROW("Not supported decoding strategy. "); - } - auto output_ids = paddle::Tensor(input_ids.place(), output_ids_dims); - auto parent_ids = paddle::Tensor(input_ids.place(), parent_ids_dims); - auto sequence_length = - paddle::Tensor(input_ids.place(), sequence_length_dims); - auto output_scores = paddle::Tensor(input_ids.place(), output_scores_dims); - - if (input_ids.place() == paddle::PlaceType::kGPU) { - auto mem_seq_length = paddle::Tensor(paddle::PlaceType::kGPU); - - if (mem_seq_len.place() != paddle::PlaceType::kGPU) { - mem_seq_length = mem_seq_len.copy_to(paddle::PlaceType::kGPU); - } else { - mem_seq_length = mem_seq_len; - } - - return MIROCUDAForward(input_ids, - attn_mask, - mem_seq_length, - type_id, - decoder_type_id, - logits_mask, - word_embedding, - pre_decoder_ln_weight, - pre_decoder_ln_bias, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - trans_weight, - trans_bias, - lm_ln_weight, - lm_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - type_embedding_weight, - role_id, - decoder_role_id, - role_embedding_table, - position_ids, - decoder_position_ids, - output_ids, - parent_ids, - sequence_length, - output_scores, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_out_len, - beam_search_diversity_rate, - unk_id, - mask_id, - temperature, - len_penalty, - normalize_before, - pos_bias, - hidden_act, - early_stopping, - min_length, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - } else { - PD_THROW("Not implemented place. Only GPU is supported. 
"); - } -} - -std::vector> MIROInferShape( - const std::vector& input_ids_shape, - const std::vector& attn_mask_shape, - const std::vector& mem_seq_len_shape, - const std::vector& logits_mask_shape, - const std::vector& type_id_shape, - const std::vector& decoder_type_id_shape, - const std::vector& word_embedding_shape, - const std::vector& pre_decoder_ln_weight_shape, - const std::vector& pre_decoder_ln_bias_shape, - const std::vector>& self_ln_weight_shapes, - const std::vector>& self_ln_bias_shapes, - const std::vector>& self_q_weight_shapes, - const std::vector>& self_q_bias_shapes, - const std::vector>& self_k_weight_shapes, - const std::vector>& self_k_bias_shapes, - const std::vector>& self_v_weight_shapes, - const std::vector>& self_v_bias_shapes, - const std::vector>& self_out_weight_shapes, - const std::vector>& self_out_bias_shapes, - const std::vector>& ffn_ln_weight_shapes, - const std::vector>& ffn_ln_bias_shapes, - const std::vector>& ffn_inter_weight_shapes, - const std::vector>& ffn_inter_bias_shapes, - const std::vector>& ffn_out_weight_shapes, - const std::vector>& ffn_out_bias_shapes, - const std::vector& decoder_ln_weight_shape, - const std::vector& decoder_ln_bias_shape, - const std::vector& trans_weight_shape, - const std::vector& trans_bias_shape, - const std::vector& lm_ln_weight_shape, - const std::vector& lm_ln_bias_shape, - const std::vector& embedding_weight_shape, - const std::vector& embedding_bias_shape, - const std::vector& positional_embedding_weight_shape, - const std::vector& type_embedding_weight_shape, - const std::vector& role_id_shape, - const std::vector& decoder_role_id_shape, - const std::vector& role_embedding_table_shape, - const std::vector& position_ids_shape, - const std::vector& decoder_position_ids_shape, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const int& unk_id, - const int& mask_id, - const float& temperature, - const float& len_penalty, - const bool& normalize_before, - const bool& pos_bias, - const std::string& hidden_act, - const bool& rel_len, - const bool& early_stopping, - const int& min_length, - const int& tensor_para_size = 1, - const int& layer_para_size = 1, - const int& layer_para_batch_size = 1) { - int batch_size = input_ids_shape[0]; - - std::vector output_ids_dims; - std::vector output_scores_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - if (batch_size != -1) { - batch_size /= beam_size; - } - output_ids_dims = {max_len, batch_size, beam_size}; - output_scores_dims = {batch_size, beam_size}; - return {output_ids_dims, output_ids_dims, sequence_length_dims, output_scores_dims}; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. 
- if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_ids_dims = {max_len, batch_size, beam_size * 2}; - output_scores_dims = {batch_size, beam_size * 2}; - return {output_ids_dims, output_ids_dims, sequence_length_dims, output_scores_dims}; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling" || - decoding_strategy == "sampling") { - output_ids_dims = {max_len, batch_size}; - output_scores_dims = {batch_size}; - return {output_ids_dims, {1}, sequence_length_dims, output_scores_dims}; - } else { - PD_THROW("Not supported decoding strategy. "); - } -} - -std::vector MIROInferDtype( - const paddle::DataType& input_ids, - const paddle::DataType& attn_mask, - const paddle::DataType& mem_seq_len, - const paddle::DataType& logits_mask, - const paddle::DataType& type_id, - const paddle::DataType& decoder_type_id, - const paddle::DataType& word_embedding, - const paddle::DataType& pre_decoder_ln_weight, - const paddle::DataType& pre_decoder_ln_bias, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::DataType& decoder_ln_weight, - const paddle::DataType& decoder_ln_bias, - const paddle::DataType& trans_weight, - const paddle::DataType& trans_bias, - const paddle::DataType& lm_ln_weight, - const paddle::DataType& lm_ln_bias, - const paddle::DataType& embedding_weight, - const paddle::DataType& embedding_bias, - const paddle::DataType& positional_embedding_weight, - const paddle::DataType& type_embedding_weight, - const paddle::DataType& role_id, - const paddle::DataType& decoder_role_id, - const paddle::DataType& role_embedding_table, - const paddle::DataType& position_ids, - const paddle::DataType& decoder_position_ids) { - return {paddle::DataType::INT32, - paddle::DataType::INT32, - paddle::DataType::INT32, - paddle::DataType::FLOAT32}; -} - -PD_BUILD_OP(fusion_miro) - .Inputs({"InputIds", - "AttnMask", - "MemSeqLen", - "TypeIds", - "DecTypeIds", - "LogitsMask", - "WordEmbedding", - "PreDecoderLayernormWeight", - "PreDecoderLayernormBias", - paddle::Vec("SelfLayernormWeight"), - paddle::Vec("SelfLayernormBias"), - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfQueryBias"), - paddle::Vec("SelfKeyWeight"), - paddle::Vec("SelfKeyBias"), - paddle::Vec("SelfValueWeight"), - paddle::Vec("SelfValueBias"), - paddle::Vec("SelfOutWeight"), - paddle::Vec("SelfOutBias"), - paddle::Vec("FFNLayernormWeight"), - paddle::Vec("FFNLayernormBias"), - paddle::Vec("FFNInterWeight"), - paddle::Vec("FFNInterBias"), - paddle::Vec("FFNOutWeight"), - paddle::Vec("FFNOutBias"), - "DecoderLayernormWeight", - "DecoderLayernormBias", - "TransWeight", - "TransBias", - "LMLayernormWeight", - "LMLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb", - "TypeEmb", - "RoleIds", - "DecRoleIds", - "RoleEmbedding", - "PositionIds", - "DecPositionIds"}) - .Outputs({"OutputIds", "ParentIds", "SequenceLength", "OutputScores"}) - 
.Attrs({"decoding_strategy: std::string", - "beam_size: int", - "topk: int", - "topp: float", - "n_head: int", - "size_per_head: int", - "num_layer: int", - "bos_id: int", - "eos_id: int", - "max_len: int64_t", - "beam_search_diversity_rate: float", - "unk_id: int", - "mask_id: int", - "temperature: float", - "len_penalty: float", - "normalize_before: bool", - "pos_bias: bool", - "hidden_act: std::string", - "rel_len: bool", - "early_stopping: bool", - "min_length: int", - "tensor_para_size: int", - "layer_para_size: int", - "layer_para_batch_size: int"}) - .SetKernelFn(PD_KERNEL(MIROForward)) - .SetInferShapeFn(PD_INFER_SHAPE(MIROInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(MIROInferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_miro_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_miro_op.cu deleted file mode 100644 index db3d57d7d423..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_miro_op.cu +++ /dev/null @@ -1,710 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// TODO(guosheng): `HOST` conflict exists in float.h of paddle and mpi.h of mpi -#include "fusion_miro_op.h" -#include "pd_traits.h" -#ifdef HOST -#undef HOST -#endif - -#include "fastertransformer/decoding_beamsearch.h" -#include "fastertransformer/decoding_sampling.h" -#include "fastertransformer/utils/common.h" -#include "fastertransformer/utils/arguments.h" - -#ifdef BUILD_GPT // consistent with FasterTransformer -#include "parallel_utils.h" -#endif - - -template -std::vector miro_decoding_kernel( - const paddle::Tensor& input_ids, - const paddle::Tensor& attn_mask, - const paddle::Tensor& memory_sequence_length, - const paddle::Tensor& type_id, - const paddle::Tensor& decoder_type_id, - const paddle::Tensor& logits_mask, - const paddle::Tensor& word_emb, - const paddle::Tensor& pre_decoder_layernorm_weight, - const paddle::Tensor& pre_decoder_layernorm_bias, - const std::vector& self_layernorm_weight, - const std::vector& self_layernorm_bias, - const std::vector& self_attn_query_weight, - const std::vector& self_attn_query_bias, - const std::vector& self_attn_key_weight, - const std::vector& self_attn_key_bias, - const std::vector& self_attn_value_weight, - const std::vector& self_attn_value_bias, - const std::vector& self_attn_output_weight, - const std::vector& self_attn_output_bias, - const std::vector& ffn_layernorm_weight, - const std::vector& ffn_layernorm_bias, - const std::vector& ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - const paddle::Tensor& decoder_layernorm_weight, - const paddle::Tensor& decoder_layernorm_bias, - const paddle::Tensor& trans_weight, - const paddle::Tensor& trans_bias, - const paddle::Tensor& lm_layernorm_weight, - const paddle::Tensor& lm_layernorm_bias, - const paddle::Tensor& embedding_weight, - const 
paddle::Tensor& embedding_bias, - const paddle::Tensor& position_encoding_table, - const paddle::Tensor& type_embedding_weight, - const paddle::Tensor& role_id, - const paddle::Tensor& decoder_role_id, - const paddle::Tensor& role_embedding_table, - const paddle::Tensor& position_ids, - const paddle::Tensor& decoder_position_ids, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - paddle::Tensor& output_scores, - const std::string& decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int head_num_, - const int size_per_head_, - const int num_layer_, - const int start_id_, - const int end_id_, - const int64_t max_seq_len_, - const float beam_search_diversity_rate_, - const int unk_id, - const int mask_id, - const float temperature, - const float len_penalty, - const bool normalize_before, - const bool pos_bias, - const std::string& hidden_act, - const bool early_stopping, - const int min_length, - cudaStream_t stream, - const int tensor_para_size = 1, - const int layer_para_size = 1, - const int layer_para_batch_size = 1) { - int beam_width_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? beam_size - : 1; - int candidate_num_ = - ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || "sampling" == decoding_strategy) - ? topk - : 1; - float probability_threshold_ = - ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || "sampling" == decoding_strategy) - ? topp - : 0.0; - - auto input_ids_dims = input_ids.shape(); - int batch_size_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? 
input_ids_dims[0] / beam_width_ - : input_ids_dims[0]; - const int memory_max_seq_len = input_ids_dims[1]; - const int memory_hidden_dim = head_num_ * size_per_head_; - const int vocab_size = word_emb.shape()[0]; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - DecodingInitParam decoding_params; - decoding_params.cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - decoding_params.cublaslt_handle = - CublasHandle::GetInstance()->cublaslt_handle_; - - decoding_params.output_ids = output_ids.mutable_data(input_ids.place()); - decoding_params.parent_ids = parent_ids.mutable_data(input_ids.place()); - decoding_params.sequence_length = - sequence_length.mutable_data(input_ids.place()); - decoding_params.output_scores = output_scores.mutable_data(input_ids.place()); - - typedef DecoderTransformerTraits DecodingTraits_; - decoding_params.stream = stream; - fastertransformer::Allocator allocator_(stream); - - decoding_params.d_start_ids = const_cast(input_ids.data()); - decoding_params.d_attn_mask = - reinterpret_cast(const_cast(attn_mask.data())); - decoding_params.d_start_lengths = memory_sequence_length.data(); - - decoding_params.memory_sequence_length = memory_sequence_length.data(); - decoding_params.type_id = type_id.data(); - decoding_params.decoder_type_id = decoder_type_id.data(); - - if (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - decoding_params.request_batch_size = batch_size_ * beam_width_; - } else if (decoding_strategy == "sampling" || - decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - decoding_params.request_batch_size = batch_size_; - } - decoding_params.max_input_len = memory_max_seq_len; - decoding_params.request_input_len = memory_max_seq_len; - decoding_params.request_output_len = max_seq_len_; - -#ifdef BUILD_GPT - auto* model_para_desc = ModelParaDescFactory::CreateModelParaDesc( - head_num_, - size_per_head_, - num_layer_, - tensor_para_size, - layer_para_size, - layer_para_batch_size, - const_cast(word_emb.data())); - auto& tensor_parallel_param = model_para_desc->tensor_parallel_param; - auto& layer_parallel_param = model_para_desc->layer_parallel_param; - auto seed = model_para_desc->dist(model_para_desc->gen); -#else - TensorParallelParam tensor_parallel_param; - LayerParallelParam layer_parallel_param; - tensor_parallel_param.rank = 0; - tensor_parallel_param.world_size = 1; - tensor_parallel_param.local_head_num_ = head_num_; - tensor_parallel_param.local_hidden_units_ = memory_hidden_dim; - - layer_parallel_param.rank = 0; - layer_parallel_param.world_size = 1; - layer_parallel_param.layers_per_group = num_layer_; - layer_parallel_param.local_batch_size = batch_size_; - int seed = -1; -#endif - - DecoderInitParam* params = - new DecoderInitParam[num_layer_]; - - // Allow python passing partial weights for model parallel. - int inner_coeff = - (memory_hidden_dim == self_attn_output_weight[0].shape()[0]) - ? ffn_intermediate_weight[0].shape()[1] / memory_hidden_dim - : (ffn_intermediate_weight[0].shape()[1] * tensor_para_size / - memory_hidden_dim); - - for (int i = 0; i < self_layernorm_weight.size(); i++) { - // Allow python passing weights of all layers or only passing the - // corresponding layers to save memory. - int layer_idx = self_layernorm_weight.size() != num_layer_ - ? 
layer_parallel_param.rank * - layer_parallel_param.layers_per_group + - i - : i; - params[layer_idx].stream = stream; - params[layer_idx].cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - params[layer_idx].cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - - if (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - params[layer_idx].request_batch_size = batch_size_ * beam_width_; - params[layer_idx].request_max_mem_seq_len = memory_max_seq_len; - } else if (decoding_strategy == "sampling" || - decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - params[layer_idx].request_batch_size = batch_size_; - params[layer_idx].request_max_mem_seq_len = memory_max_seq_len; - } - - // self attn - params[layer_idx].self_layernorm.gamma = reinterpret_cast( - self_layernorm_weight[i].data()); - params[layer_idx].self_layernorm.beta = reinterpret_cast( - self_layernorm_bias[i].data()); - // query - params[layer_idx].self_attention.query_weight.kernel = - reinterpret_cast( - self_attn_query_weight[i].data()); - params[layer_idx].self_attention.query_weight.bias = - reinterpret_cast( - self_attn_query_bias[i].data()); - // key - params[layer_idx].self_attention.key_weight.kernel = - reinterpret_cast( - self_attn_key_weight[i].data()); - params[layer_idx].self_attention.key_weight.bias = - reinterpret_cast( - self_attn_key_bias[i].data()); - // value - params[layer_idx].self_attention.value_weight.kernel = - reinterpret_cast( - self_attn_value_weight[i].data()); - params[layer_idx].self_attention.value_weight.bias = - reinterpret_cast( - self_attn_value_bias[i].data()); - // out proj - params[layer_idx].self_attention.attention_output_weight.kernel = - reinterpret_cast( - self_attn_output_weight[i].data()); - - params[layer_idx].self_attention.attention_output_weight.bias = - reinterpret_cast( - self_attn_output_bias[i].data()); - - // ffn - params[layer_idx].ffn_layernorm.gamma = reinterpret_cast( - ffn_layernorm_weight[i].data()); - params[layer_idx].ffn_layernorm.beta = reinterpret_cast( - ffn_layernorm_bias[i].data()); - // intermediate proj - params[layer_idx].ffn.intermediate_weight.kernel = - reinterpret_cast( - ffn_intermediate_weight[i].data()); - params[layer_idx].ffn.intermediate_weight.bias = reinterpret_cast( - ffn_intermediate_bias[i].data()); - // out proj - params[layer_idx].ffn.output_weight.kernel = reinterpret_cast( - ffn_output_weight[i].data()); - params[layer_idx].ffn.output_weight.bias = - reinterpret_cast(ffn_output_bias[i].data()); - } - - decoding_params.pre_layernorm.gamma = reinterpret_cast( - pre_decoder_layernorm_weight.data()); - decoding_params.pre_layernorm.beta = reinterpret_cast( - pre_decoder_layernorm_bias.data()); - - decoding_params.layernorm.gamma = reinterpret_cast( - decoder_layernorm_weight.data()); - decoding_params.layernorm.beta = reinterpret_cast( - decoder_layernorm_bias.data()); - decoding_params.trans_kernel = - reinterpret_cast(trans_weight.data()); - decoding_params.trans_bias = - reinterpret_cast(trans_bias.data()); - - decoding_params.lm_layernorm.gamma = - reinterpret_cast(lm_layernorm_weight.data()); - decoding_params.lm_layernorm.beta = - reinterpret_cast(lm_layernorm_bias.data()); - - // For embedding - decoding_params.embedding_table = - reinterpret_cast(word_emb.data()); - // For weight sharing matmul - decoding_params.embedding_kernel = - reinterpret_cast(embedding_weight.data()); - // For matmul bias - 
decoding_params.embedding_bias = - reinterpret_cast(embedding_bias.data()); - decoding_params.position_encoding_table = reinterpret_cast( - position_encoding_table.data()); - - // For masking some id during gen. - decoding_params.logits_mask = - reinterpret_cast(logits_mask.data()); - - decoding_params.type_table = - reinterpret_cast(type_embedding_weight.data()); - - // For role embedding. - auto role_id_shape = role_id.shape(); - if (role_id_shape.size() > 0 && numel(role_id_shape) > 0) { - decoding_params.role_id = role_id.data(); - decoding_params.decoder_role_id = decoder_role_id.data(); - decoding_params.role_embedding_table = - reinterpret_cast(role_embedding_table.data()); - } - - auto position_id_shape = position_ids.shape(); - if (position_id_shape.size() > 0 && numel(position_id_shape) > 0) { - decoding_params.position_ids = position_ids.data(); - decoding_params.decoder_position_ids = decoder_position_ids.data(); - } - - ActivationType activate = - (hidden_act == "gelu") ? ActivationType::GELU : ActivationType::RELU; - - int finished_candidate_num_ = - ("beam_search_v3" == decoding_strategy) ? beam_width_ : beam_width_ * 2; - - if ("beam_search" == decoding_strategy) { - DecodingBeamsearch* miro_beam_search_; - - miro_beam_search_ = - new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, /*is_fuse_topk_softMax*/ - true, /*is_fuse_qkv*/ - false, /*keep_alive_beam*/ - len_penalty, /*alpha not used for this case*/ - normalize_before, - 0, /*pos_offset BART only for now*/ - activate, - pos_bias, - true, /*prefix_lm*/ - -1, /*finished_candidate_num*/ - false, /*early_stopping*/ - false, /*is_mbart*/ - min_length, - inner_coeff, - true); /*is_miro*/ - miro_beam_search_->set_tensor_parallel_param( - tensor_parallel_param); - miro_beam_search_->set_layer_parallel_param( - layer_parallel_param); - miro_beam_search_->forward_context(params, decoding_params); - miro_beam_search_->forward(params, decoding_params); - - delete miro_beam_search_; - } else if ("beam_search_v2" == decoding_strategy || - "beam_search_v3" == decoding_strategy) { - DecodingBeamsearch* miro_beam_search_; - - miro_beam_search_ = - new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, /*is_fuse_topk_softMax*/ - true, /*is_fuse_qkv*/ - true, /*keep_alive_beam*/ - len_penalty, - normalize_before, - 0, /*pos_offset BART only for now*/ - activate, - pos_bias, - true, /*prefix_lm*/ - finished_candidate_num_, - early_stopping, - false, /*is_mbart*/ - min_length, - inner_coeff, - true); /*is_miro*/ - miro_beam_search_->forward_context(params, decoding_params); - miro_beam_search_->forward(params, decoding_params); - - delete miro_beam_search_; - } else if ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || - "sampling" == decoding_strategy) { - DecodingSampling* miro_sampling_; - - miro_sampling_ = new DecodingSampling( - allocator_, - batch_size_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - candidate_num_, - probability_threshold_, - true, /*is_fuse_qkv*/ - normalize_before, - 0, /*pos_offset BART only for now*/ 
- activate, - pos_bias, - temperature, - 1.0, /*repeat_penalty*/ - true, /*prefix_lm*/ - false, /*is_mbart*/ - min_length, - inner_coeff, - seed, - tensor_para_size, - layer_para_size, - true); /*is_miro*/ - miro_sampling_->set_tensor_parallel_param( - tensor_parallel_param); - miro_sampling_->set_layer_parallel_param(layer_parallel_param); - miro_sampling_->forward_context(params, decoding_params); - miro_sampling_->forward(params, decoding_params); - - delete miro_sampling_; - } else { - PD_THROW( - "Only beam_search, beam_search_v2, topk_sampling and topp_sampling are " - "supported for " - "FasterTransformer. "); - } - delete[] params; - - return {output_ids, parent_ids, sequence_length, output_scores}; -} - -std::vector MIROCUDAForward( - const paddle::Tensor& input_ids, - const paddle::Tensor& attn_mask, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& type_id, - const paddle::Tensor& decoder_type_id, - const paddle::Tensor& logits_mask, - const paddle::Tensor& word_embedding, - const paddle::Tensor& pre_decoder_ln_weight, - const paddle::Tensor& pre_decoder_ln_bias, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& trans_weight, - const paddle::Tensor& trans_bias, - const paddle::Tensor& lm_ln_weight, - const paddle::Tensor& lm_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& type_embedding_weight, - const paddle::Tensor& role_id, - const paddle::Tensor& decoder_role_id, - const paddle::Tensor& role_embedding_table, - const paddle::Tensor& position_ids, - const paddle::Tensor& decoder_position_ids, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - paddle::Tensor& output_scores, - const std::string& decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const int64_t max_len, - const float beam_search_diversity_rate, - const int unk_id, - const int mask_id, - const float temperature, - const float len_penalty, - const bool normalize_before, - const bool pos_bias, - const std::string& hidden_act, - const bool early_stopping, - const int min_length, - const int tensor_para_size = 1, - const int layer_para_size = 1, - const int layer_para_batch_size = 1) { - auto stream = input_ids.stream(); - - cublasSetStream(CublasHandle::GetInstance()->cublas_handle_, stream); - - std::vector ret; - - switch (self_ln_weight[0].type()) { - case paddle::DataType::FLOAT16: { - ret = miro_decoding_kernel( - input_ids, - attn_mask, - mem_seq_len, - type_id, - decoder_type_id, - logits_mask, - word_embedding, - pre_decoder_ln_weight, - pre_decoder_ln_bias, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - 
self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - trans_weight, - trans_bias, - lm_ln_weight, - lm_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - type_embedding_weight, - role_id, - decoder_role_id, - role_embedding_table, - position_ids, - decoder_position_ids, - output_ids, - parent_ids, - sequence_length, - output_scores, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - beam_search_diversity_rate, - unk_id, - mask_id, - temperature, - len_penalty, - normalize_before, - pos_bias, - hidden_act, - early_stopping, - min_length, - stream, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - break; - } - case paddle::DataType::FLOAT32: { - ret = miro_decoding_kernel( - input_ids, - attn_mask, - mem_seq_len, - type_id, - decoder_type_id, - logits_mask, - word_embedding, - pre_decoder_ln_weight, - pre_decoder_ln_bias, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - trans_weight, - trans_bias, - lm_ln_weight, - lm_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - type_embedding_weight, - role_id, - decoder_role_id, - role_embedding_table, - position_ids, - decoder_position_ids, - output_ids, - parent_ids, - sequence_length, - output_scores, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - beam_search_diversity_rate, - unk_id, - mask_id, - temperature, - len_penalty, - normalize_before, - pos_bias, - hidden_act, - early_stopping, - min_length, - stream, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - break; - } - default: { - PD_THROW( - "NOT supported data type. " - "Only float16 and float32 are supported. "); - break; - } - } - - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_miro_op.h b/paddlenlp/ops/fast_transformer/src/fusion_miro_op.h deleted file mode 100644 index c8213cb1dcad..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_miro_op.h +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#pragma once - -#include -#include - -// #include "fastertransformer/decoding_beamsearch.h" -// #include "fastertransformer/decoding_sampling.h" -// #include "fastertransformer/open_decoder.h" -// #include "fastertransformer/utils/common.h" -#include "cublas_handle.h" -#include "utils.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - - -std::vector MIROCUDAForward( - const paddle::Tensor& input_ids, - const paddle::Tensor& attn_mask, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& type_id, - const paddle::Tensor& decoder_type_id, - const paddle::Tensor& logits_mask, - const paddle::Tensor& word_embedding, - const paddle::Tensor& pre_decoder_ln_weight, - const paddle::Tensor& pre_decoder_ln_bias, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& trans_weight, - const paddle::Tensor& trans_bias, - const paddle::Tensor& lm_ln_weight, - const paddle::Tensor& lm_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& type_embedding_weight, - const paddle::Tensor& role_id, - const paddle::Tensor& decoder_role_id, - const paddle::Tensor& role_embedding_table, - const paddle::Tensor& position_ids, - const paddle::Tensor& decoder_position_ids, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - paddle::Tensor& output_scores, - const std::string& decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const int64_t max_len, - const float beam_search_diversity_rate, - const int unk_id, - const int mask_id, - const float temperature, - const float len_penalty, - const bool normalize_before, - const bool pos_bias, - const std::string& hidden_act, - const bool early_stopping, - const int min_length, - const int tensor_para_size, - const int layer_para_size, - const int layer_para_batch_size); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_opt_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_opt_op.cc deleted file mode 100644 index 21d4f1b2bad0..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_opt_op.cc +++ /dev/null @@ -1,227 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "fusion_opt_op.h" -#include "pd_traits.h" - - -std::vector OPTForward( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const paddle::Tensor& start_length, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& emb_weight, - const bool& normalize_before, - const int& topk, - const float& topp, - const int& max_len, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - const bool& use_fp16 = false, - const int& tensor_para_size = 1, - const int& layer_para_size = 1, - const int& layer_para_batch_size = 1) { - int batch_size = input.shape()[0]; - int start_len = input.shape()[1]; - int total_len = max_len + start_len; - std::vector output_dims({total_len, batch_size}); - auto output_ids = paddle::Tensor(input.place(), output_dims); - - if (word_embedding.place() == paddle::PlaceType::kGPU) { - return OPTCUDAForward(input, - attn_mask, - start_length, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - positional_embedding_weight, - emb_weight, - output_ids, - normalize_before, - topk, - topp, - total_len, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - use_fp16, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - } else { - PD_THROW("Not implemented place. Only GPU is supported. 
"); - } -} - -std::vector> OPTInferShape( - const std::vector& input_shape, - const std::vector& attn_mask_shape, - const std::vector& start_length, - const std::vector& word_embedding_shape, - const std::vector>& self_ln_weight_shapes, - const std::vector>& self_ln_bias_shapes, - const std::vector>& self_q_weight_shapes, - const std::vector>& self_q_bias_shapes, - const std::vector>& self_k_weight_shapes, - const std::vector>& self_k_bias_shapes, - const std::vector>& self_v_weight_shapes, - const std::vector>& self_v_bias_shapes, - const std::vector>& self_out_weight_shapes, - const std::vector>& self_out_bias_shapes, - const std::vector>& ffn_ln_weight_shapes, - const std::vector>& ffn_ln_bias_shapes, - const std::vector>& ffn_inter_weight_shapes, - const std::vector>& ffn_inter_bias_shapes, - const std::vector>& ffn_out_weight_shapes, - const std::vector>& ffn_out_bias_shapes, - const std::vector& decoder_ln_weight_shape, - const std::vector& decoder_ln_bias_shape, - const std::vector& positional_embedding_weight_shape, - const std::vector& emb_weight_shape, - const bool& normalize_before, - const int& topk, - const float& topp, - const int& max_len, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - const bool& use_fp16 = false, - const int& tensor_para_size = 1, - const int& layer_para_size = 1, - const int& layer_para_batch_size = 1) { - int64_t batch_size = input_shape[0]; - int64_t start_len = input_shape[1]; - std::vector output_dims({max_len + start_len, batch_size}); - return {output_dims}; -} - -std::vector OPTInferDtype( - const paddle::DataType& input_dtype, - const paddle::DataType& attn_mask_dtype, - const paddle::DataType& start_length_dtype, - const paddle::DataType& word_embedding_dtype, - const std::vector& self_ln_weight_dtype, - const std::vector& self_ln_bias_dtype, - const std::vector& self_q_weight_dtype, - const std::vector& self_q_bias_dtype, - const std::vector& self_k_weight_dtype, - const std::vector& self_k_bias_dtype, - const std::vector& self_v_weight_dtype, - const std::vector& self_v_bias_dtype, - const std::vector& self_out_weight_dtype, - const std::vector& self_out_bias_dtype, - const std::vector& ffn_ln_weight_dtype, - const std::vector& ffn_ln_bias_dtype, - const std::vector& ffn_inter_weight_dtype, - const std::vector& ffn_inter_bias_dtype, - const std::vector& ffn_out_weight_dtype, - const std::vector& ffn_out_bias_dtype, - const paddle::DataType& decoder_ln_weight_dtype, - const paddle::DataType& decoder_ln_bias_dtype, - const paddle::DataType& positional_embedding_weight_dtype, - const paddle::DataType& emb_weight_dtype) { - return {paddle::DataType::INT32}; -} - -PD_BUILD_OP(fusion_opt) - .Inputs({"Input", - "AttentionMask", - "StartLength", - "WordEmbedding", - paddle::Vec("SelfLayernormWeight"), - paddle::Vec("SelfLayernormBias"), - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfQueryBias"), - paddle::Vec("SelfKeyWeight"), - paddle::Vec("SelfKeyBias"), - paddle::Vec("SelfValueWeight"), - paddle::Vec("SelfValueBias"), - paddle::Vec("SelfOutWeight"), - paddle::Vec("SelfOutBias"), - paddle::Vec("FFNLayernormWeight"), - paddle::Vec("FFNLayernormBias"), - paddle::Vec("FFNInterWeight"), - paddle::Vec("FFNInterBias"), - paddle::Vec("FFNOutWeight"), - paddle::Vec("FFNOutBias"), - "DecoderLayernormWeight", - "DecoderLayernormBias", - "PositionEncEmb", - "EmbWeight"}) - .Outputs({"OutputIds"}) - .Attrs({"normalize_before: bool", - "topk: int", - "topp: 
float", - "max_len: int", - "n_head: int", - "size_per_head: int", - "num_layer: int", - "bos_id: int", - "eos_id: int", - "temperature: float", - "use_fp16: bool", - "tensor_para_size: int", - "layer_para_size: int", - "layer_para_batch_size: int"}) - .SetKernelFn(PD_KERNEL(OPTForward)) - .SetInferShapeFn(PD_INFER_SHAPE(OPTInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(OPTInferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_opt_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_opt_op.cu deleted file mode 100644 index 6af9f9a381ba..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_opt_op.cu +++ /dev/null @@ -1,384 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// TODO(guosheng): `HOST` conflict exists in float.h of paddle and mpi.h of mpi -#include "fusion_opt_op.h" -#include "pd_traits.h" -#ifdef HOST -#undef HOST -#endif - -#include "fastertransformer/opt.h" -#include "fastertransformer/utils/common.h" - -#ifdef BUILD_GPT // consistent with FasterTransformer -#include "parallel_utils.h" -#endif - -template -std::vector opt_kernel( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const paddle::Tensor& start_length, - const paddle::Tensor& word_emb, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& emb_weight, - paddle::Tensor& output_ids, - const bool& normalize_before, - const int& topk, - const float& topp, - const int& max_len, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - cublasHandle_t cublas_handle_, - cublasLtHandle_t cublaslt_handle_, - cudaStream_t stream, - const int& tensor_para_size = 1, - const int& layer_para_size = 1, - const int& layer_para_batch_size = 1) { - auto input_dims = input.shape(); - int batch_size_ = input_dims[0]; - int start_len = input_dims[1]; - const int vocab_size = word_emb.shape()[0]; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - DecodingInitParam decoding_params; - decoding_params.cublas_handle = cublas_handle_; - decoding_params.cublaslt_handle = cublaslt_handle_; - - 
decoding_params.output_ids = output_ids.mutable_data(word_emb.place()); - - typedef DecoderTransformerTraits DecodingTraits_; - decoding_params.stream = stream; - fastertransformer::Allocator allocator_(stream); - - const int hidden_unit = size_per_head * n_head; - -#ifdef BUILD_GPT - auto* model_para_desc = ModelParaDescFactory::CreateModelParaDesc( - n_head, - size_per_head, - num_layer, - tensor_para_size, - layer_para_size, - layer_para_batch_size, - const_cast(word_emb.data())); - auto& tensor_parallel_param = model_para_desc->tensor_parallel_param; - auto& layer_parallel_param = model_para_desc->layer_parallel_param; - auto seed = model_para_desc->dist(model_para_desc->gen); -#else - TensorParallelParam tensor_parallel_param; - LayerParallelParam layer_parallel_param; - tensor_parallel_param.rank = 0; - tensor_parallel_param.world_size = 1; - tensor_parallel_param.local_head_num_ = n_head; - tensor_parallel_param.local_hidden_units_ = hidden_unit; - - layer_parallel_param.rank = 0; - layer_parallel_param.world_size = 1; - layer_parallel_param.layers_per_group = num_layer; - layer_parallel_param.local_batch_size = batch_size_; - int seed = -1; -#endif - - DecodingOpt* opt_decoding; - - decoding_params.request_batch_size = batch_size_; - decoding_params.max_input_len = start_len; - decoding_params.request_input_len = start_len; - decoding_params.request_output_len = max_len - start_len; - - decoding_params.d_start_ids = const_cast(input.data()); - - decoding_params.d_attn_mask = - reinterpret_cast(const_cast(attn_mask.data())); - decoding_params.d_start_lengths = start_length.data(); - - opt_decoding = - new DecodingOpt(allocator_, - batch_size_, - max_len, - n_head, - size_per_head, - vocab_size, - num_layer, - bos_id, - eos_id, - topk, - topp, - temperature, - tensor_para_size, - layer_para_size, - true, /*is_fuse_QKV*/ - normalize_before, - 1.0, /*repetition_penalty*/ - seed); - - opt_decoding->set_tensor_parallel_param(tensor_parallel_param); - opt_decoding->set_layer_parallel_param(layer_parallel_param); - - DecoderInitParam* params = - new DecoderInitParam[num_layer]; - - for (int i = 0; i < self_ln_weight.size(); ++i) { - // Allow python passing weights of all layers or only passing the - // corresponding layers to save memory. - int layer_idx = self_ln_weight.size() != num_layer - ? layer_parallel_param.rank * - layer_parallel_param.layers_per_group + - i - : i; - - params[layer_idx].stream = stream; - params[layer_idx].cublas_handle = cublas_handle_; - params[layer_idx].cublaslt_handle = cublaslt_handle_; - - params[layer_idx].request_batch_size = batch_size_; - params[layer_idx].request_max_mem_seq_len = start_len; - - params[layer_idx].self_layernorm.gamma = - reinterpret_cast(self_ln_weight[i].data()); - params[layer_idx].self_layernorm.beta = - reinterpret_cast(self_ln_bias[i].data()); - - params[layer_idx].self_attention.query_weight.kernel = - reinterpret_cast(self_q_weight[i].data()); - params[layer_idx].self_attention.query_weight.bias = - reinterpret_cast(self_q_bias[i].data()); - // For `is_fuse_QKV == true`, ignore weight and bias of key and value to - // remove requirements on python passing weights to save memory. 
- // params[layer_idx].self_attention.key_weight.kernel = - // reinterpret_cast(self_k_weight[i].data()); - // params[layer_idx].self_attention.key_weight.bias = - // reinterpret_cast(self_k_bias[i].data()); - // params[layer_idx].self_attention.value_weight.kernel = - // reinterpret_cast(self_v_weight[i].data()); - // params[layer_idx].self_attention.value_weight.bias = - // reinterpret_cast(self_v_bias[i].data()); - - params[layer_idx].self_attention.attention_output_weight.kernel = - reinterpret_cast(self_out_weight[i].data()); - params[layer_idx].self_attention.attention_output_weight.bias = - reinterpret_cast(self_out_bias[i].data()); - - params[layer_idx].ffn_layernorm.gamma = - reinterpret_cast(ffn_ln_weight[i].data()); - params[layer_idx].ffn_layernorm.beta = - reinterpret_cast(ffn_ln_bias[i].data()); - - params[layer_idx].ffn.intermediate_weight.kernel = - reinterpret_cast(ffn_inter_weight[i].data()); - params[layer_idx].ffn.intermediate_weight.bias = - reinterpret_cast(ffn_inter_bias[i].data()); - params[layer_idx].ffn.output_weight.kernel = - reinterpret_cast(ffn_out_weight[i].data()); - params[layer_idx].ffn.output_weight.bias = - reinterpret_cast(ffn_out_bias[i].data()); - } - - decoding_params.layernorm.gamma = - reinterpret_cast(decoder_ln_weight.data()); - decoding_params.layernorm.beta = - reinterpret_cast(decoder_ln_bias.data()); - decoding_params.embedding_table = - reinterpret_cast(word_emb.data()); - decoding_params.embedding_kernel = - reinterpret_cast(emb_weight.data()); - decoding_params.position_encoding_table = reinterpret_cast( - positional_embedding_weight.data()); - - opt_decoding->forward_context(params, decoding_params); - opt_decoding->forward(params, decoding_params); - - delete opt_decoding; - delete[] params; - - return {output_ids}; -} - -std::vector OPTCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const paddle::Tensor& start_length, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& emb_weight, - paddle::Tensor& output_ids, - const bool& normalize_before, - const int& topk, - const float& topp, - const int& max_len, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - const bool& use_fp16 = false, - const int& tensor_para_size = 1, - const int& layer_para_size = 1, - const int& layer_para_batch_size = 1) { - auto stream = word_embedding.stream(); - // TODO(guosheng): use the global cublas handle - cublasHandle_t cublas_handle_; - cublasCreate(&cublas_handle_); - cublasLtHandle_t cublaslt_handle_; - cublasLtCreate(&cublaslt_handle_); - cublasSetStream(cublas_handle_, stream); - - std::vector ret; - - if (use_fp16) { - ret = opt_kernel(input, - attn_mask, - start_length, - word_embedding, - self_ln_weight, - 
self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - positional_embedding_weight, - emb_weight, - output_ids, - normalize_before, - topk, - topp, - max_len, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - cublas_handle_, - cublaslt_handle_, - stream, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - } else { - ret = opt_kernel(input, - attn_mask, - start_length, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - positional_embedding_weight, - emb_weight, - output_ids, - normalize_before, - topk, - topp, - max_len, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - cublas_handle_, - cublaslt_handle_, - stream, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - } - - cublasDestroy(cublas_handle_); - cublasLtDestroy(cublaslt_handle_); - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_opt_op.h b/paddlenlp/ops/fast_transformer/src/fusion_opt_op.h deleted file mode 100644 index 0519df524010..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_opt_op.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include - -// #include "fastertransformer/gpt.h" -// #include "fastertransformer/open_decoder.h" -// #include "fastertransformer/utils/common.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - -std::vector OPTCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const paddle::Tensor& start_length, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& emb_weight, - paddle::Tensor& output_ids, - const bool& normalize_before, - const int& topk, - const float& topp, - const int& max_len, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - const bool& use_fp16, - const int& tensor_para_size, - const int& layer_para_size, - const int& layer_para_batch_size); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.cc deleted file mode 100644 index c98fd9f744a7..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.cc +++ /dev/null @@ -1,372 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include - -#include "fusion_pegasus_decoding_op.h" -#include "pd_traits.h" - - -std::vector PegasusDecodingForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const std::string decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const float temperature, - const int64_t max_len, - const int64_t min_len, - const float beam_search_diversity_rate, - const bool rel_len, - const float alpha, - const bool early_stopping, - const std::string hidden_act) { - int batch_size = input.shape()[0]; - int max_out_len = rel_len ? max_len + input.shape()[1] : max_len; - - std::vector output_dims; - std::vector parent_ids_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - batch_size /= beam_size; - output_dims = {max_out_len, batch_size, beam_size}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_out_len, batch_size, beam_size * 2}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - output_dims = {max_out_len, batch_size}; - parent_ids_dims = {1}; - } else { - PD_THROW("Not supported decoding strategy. 
"); - } - -#ifdef PADDLE_NEW_ALLOCATOR - // For PaddlePaddle>=2.3.0 - if (input.place() == paddle::GPUPlace()) { - auto output_ids = paddle::empty(output_dims, paddle::DataType::INT32, input.place()); - auto parent_ids = paddle::empty(parent_ids_dims, paddle::DataType::INT32, input.place()); - auto sequence_length = paddle::empty(sequence_length_dims, paddle::DataType::INT32, input.place()); - - paddle::Tensor seq_len = paddle::empty(mem_seq_len.shape(), mem_seq_len.dtype(), input.place()); - - if (mem_seq_len.place() != paddle::GPUPlace()) { - seq_len = mem_seq_len.copy_to(paddle::GPUPlace()); - } else { - seq_len = mem_seq_len; - } -#else - if (input.place() == paddle::PlaceType::kGPU) { - auto output_ids = paddle::Tensor(paddle::PlaceType::kGPU, output_dims); - auto parent_ids = paddle::Tensor(paddle::PlaceType::kGPU, parent_ids_dims); - auto sequence_length = - paddle::Tensor(paddle::PlaceType::kGPU, sequence_length_dims); - - paddle::Tensor seq_len = paddle::Tensor(paddle::PlaceType::kGPU); - - if (mem_seq_len.place() != paddle::PlaceType::kGPU) { - seq_len = mem_seq_len.copy_to(paddle::PlaceType::kGPU); - } else { - seq_len = mem_seq_len; - } -#endif - return PegasusDecodingCUDAForward(input, - seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - max_out_len, - min_len, - beam_search_diversity_rate, - alpha, - early_stopping, - hidden_act); - } else { - PD_THROW("Not implemented place. Only GPU is supported. 
"); - } -} - -std::vector> PegasusDecodingInferShape( - const std::vector& input_shape, - const std::vector& mem_seq_len_shape, - const std::vector& word_embedding_shape, - const std::vector>& self_ln_weight_shapes, - const std::vector>& self_ln_bias_shapes, - const std::vector>& self_q_weight_shapes, - const std::vector>& self_q_bias_shapes, - const std::vector>& self_k_weight_shapes, - const std::vector>& self_k_bias_shapes, - const std::vector>& self_v_weight_shapes, - const std::vector>& self_v_bias_shapes, - const std::vector>& self_out_weight_shapes, - const std::vector>& self_out_bias_shapes, - const std::vector>& cross_ln_weight_shapes, - const std::vector>& cross_ln_bias_shapes, - const std::vector>& cross_q_weight_shapes, - const std::vector>& cross_q_bias_shapes, - const std::vector>& cross_k_weight_shapes, - const std::vector>& cross_k_bias_shapes, - const std::vector>& cross_v_weight_shapes, - const std::vector>& cross_v_bias_shapes, - const std::vector>& cross_out_weight_shapes, - const std::vector>& cross_out_bias_shapes, - const std::vector>& ffn_ln_weight_shapes, - const std::vector>& ffn_ln_bias_shapes, - const std::vector>& ffn_inter_weight_shapes, - const std::vector>& ffn_inter_bias_shapes, - const std::vector>& ffn_out_weight_shapes, - const std::vector>& ffn_out_bias_shapes, - const std::vector& decoder_ln_weight_shape, - const std::vector& decoder_ln_bias_shape, - const std::vector& embedding_weight_shape, - const std::vector& embedding_bias_shape, - const std::vector& positional_embedding_weight_shape, - const std::string decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const float temperature, - const int64_t max_len, - const int64_t min_len, - const float beam_search_diversity_rate, - const bool rel_len, - const float alpha, - const bool early_stopping, - const std::string hidden_act) { - int batch_size = input_shape[0]; - - std::vector output_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - if (batch_size != -1) { - batch_size /= beam_size; - } - output_dims = {max_len, batch_size, beam_size}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_len, batch_size, beam_size * 2}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - output_dims = {max_len, batch_size}; - return {output_dims, {1}, sequence_length_dims}; - } else { - PD_THROW("Not supported decoding strategy. 
"); - } -} - -std::vector PegasusDecodingInferDtype( - const paddle::DataType& input, - const paddle::DataType& mem_seq_len, - const paddle::DataType& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::DataType& decoder_ln_weight, - const paddle::DataType& decoder_ln_bias, - const paddle::DataType& embedding_weight, - const paddle::DataType& embedding_bias, - const paddle::DataType& positional_embedding_weight) { - return {paddle::DataType::INT32, - paddle::DataType::INT32, - paddle::DataType::INT32}; -} - -PD_BUILD_OP(fusion_pegasus_decoding) - .Inputs({"Input", - "MemSeqLen", - "WordEmbedding", - paddle::Vec("SelfLayernormWeight"), - paddle::Vec("SelfLayernormBias"), - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfQueryBias"), - paddle::Vec("SelfKeyWeight"), - paddle::Vec("SelfKeyBias"), - paddle::Vec("SelfValueWeight"), - paddle::Vec("SelfValueBias"), - paddle::Vec("SelfOutWeight"), - paddle::Vec("SelfOutBias"), - paddle::Vec("CrossLayernormWeight"), - paddle::Vec("CrossLayernormBias"), - paddle::Vec("CrossQueryWeight"), - paddle::Vec("CrossQueryBias"), - paddle::Vec("CrossKeyWeight"), - paddle::Vec("CrossKeyBias"), - paddle::Vec("CrossValueWeight"), - paddle::Vec("CrossValueBias"), - paddle::Vec("CrossOutWeight"), - paddle::Vec("CrossOutBias"), - paddle::Vec("FFNLayernormWeight"), - paddle::Vec("FFNLayernormBias"), - paddle::Vec("FFNInterWeight"), - paddle::Vec("FFNInterBias"), - paddle::Vec("FFNOutWeight"), - paddle::Vec("FFNOutBias"), - "DecoderLayernormWeight", - "DecoderLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb"}) - .Outputs({"OutputIds", "ParentIds", "SequenceLength"}) - .Attrs({ - "decoding_strategy: std::string", - "beam_size: int", - "topk: int", - "topp: float", - "n_head: int", - "size_per_head: int", - "num_layer: int", - "bos_id: int", - "eos_id: int", - "temperature: float", - "max_len: int64_t", - "min_len: int64_t", - "beam_search_diversity_rate: float", - "rel_len: bool", - "alpha: float", - "early_stopping: bool", - "hidden_act: std::string", - }) - .SetKernelFn(PD_KERNEL(PegasusDecodingForward)) - .SetInferShapeFn(PD_INFER_SHAPE(PegasusDecodingInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(PegasusDecodingInferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.cu deleted file mode 100644 index 753f70bf2ae0..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.cu +++ /dev/null @@ -1,554 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cublas_handle.h" - -#include "fusion_pegasus_decoding_op.h" -#include "pd_traits.h" - -template -std::vector pegasus_decoding_kernel( - const paddle::Tensor& input, - const paddle::Tensor& memory_sequence_length, - const paddle::Tensor& word_emb, - const std::vector& self_layernorm_weight, - const std::vector& self_layernorm_bias, - const std::vector& self_attn_query_weight, - const std::vector& self_attn_query_bias, - const std::vector& self_attn_key_weight, - const std::vector& self_attn_key_bias, - const std::vector& self_attn_value_weight, - const std::vector& self_attn_value_bias, - const std::vector& self_attn_output_weight, - const std::vector& self_attn_output_bias, - const std::vector& cross_layernorm_weight, - const std::vector& cross_layernorm_bias, - const std::vector& cross_attn_query_weight, - const std::vector& cross_attn_query_bias, - const std::vector& cross_attn_key_weight, - const std::vector& cross_attn_key_bias, - const std::vector& cross_attn_value_weight, - const std::vector& cross_attn_value_bias, - const std::vector& cross_attn_output_weight, - const std::vector& cross_attn_output_bias, - const std::vector& ffn_layernorm_weight, - const std::vector& ffn_layernorm_bias, - const std::vector& ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - const paddle::Tensor& decoder_layernorm_weight, - const paddle::Tensor& decoder_layernorm_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& position_encoding_table, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int head_num_, - const int size_per_head_, - const int num_layer_, - const int start_id_, - const int end_id_, - const float temperature, - const int64_t max_seq_len_, - const int64_t min_seq_len_, - const float beam_search_diversity_rate_, - const float alpha, - const bool early_stopping, - const std::string& hidden_act, - cudaStream_t stream) { - int beam_width_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? beam_size - : 1; - int candidate_num_ = (decoding_strategy == "sampling") ? topk : 1; - float probability_threshold_ = (decoding_strategy == "sampling") ? topp : 0.0; - - auto input_dims = input.shape(); - int batch_size_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? 
input_dims[0] / beam_width_ - : input_dims[0]; - const int memory_max_seq_len = input_dims[1]; - const int memory_hidden_dim = input_dims[2]; - const int vocab_size = word_emb.shape()[0]; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - DecodingInitParam decoding_params; - decoding_params.cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - decoding_params.cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - -#ifdef PADDLE_NEW_ALLOCATOR - // For PaddlePaddle>=2.3.0 - decoding_params.output_ids = output_ids.data(); - decoding_params.parent_ids = parent_ids.data(); - decoding_params.sequence_length = sequence_length.data(); -#else - decoding_params.output_ids = output_ids.mutable_data(input.place()); - decoding_params.parent_ids = parent_ids.mutable_data(input.place()); - decoding_params.sequence_length = sequence_length.mutable_data(input.place()); -#endif - typedef DecoderTransformerTraits DecodingTraits_; - decoding_params.stream = stream; - fastertransformer::Allocator allocator_(stream); - - decoding_params.memory_tensor = - reinterpret_cast(input.data()); - decoding_params.memory_sequence_length = memory_sequence_length.data(); - - DecoderInitParam* params = - new DecoderInitParam[num_layer_]; - - for (int i = 0; i < num_layer_; i++) { - params[i].stream = stream; - params[i].cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - params[i].cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - - if (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - params[i].request_batch_size = batch_size_ * beam_width_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } else if (decoding_strategy == "sampling" || - decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - params[i].request_batch_size = batch_size_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } - - // self attn - params[i].self_layernorm.gamma = reinterpret_cast( - self_layernorm_weight[i].data()); - params[i].self_layernorm.beta = reinterpret_cast( - self_layernorm_bias[i].data()); - // query - params[i].self_attention.query_weight.kernel = - reinterpret_cast( - self_attn_query_weight[i].data()); - params[i].self_attention.query_weight.bias = - reinterpret_cast( - self_attn_query_bias[i].data()); - // key - params[i].self_attention.key_weight.kernel = - reinterpret_cast( - self_attn_key_weight[i].data()); - params[i].self_attention.key_weight.bias = - reinterpret_cast( - self_attn_key_bias[i].data()); - // value - params[i].self_attention.value_weight.kernel = - reinterpret_cast( - self_attn_value_weight[i].data()); - params[i].self_attention.value_weight.bias = - reinterpret_cast( - self_attn_value_bias[i].data()); - // out proj - params[i].self_attention.attention_output_weight.kernel = - reinterpret_cast( - self_attn_output_weight[i].data()); - params[i].self_attention.attention_output_weight.bias = - reinterpret_cast( - self_attn_output_bias[i].data()); - - // cross - params[i].cross_layernorm.gamma = reinterpret_cast( - cross_layernorm_weight[i].data()); - params[i].cross_layernorm.beta = reinterpret_cast( - cross_layernorm_bias[i].data()); - // query - params[i].cross_attention.query_weight.kernel = - reinterpret_cast( - cross_attn_query_weight[i].data()); - params[i].cross_attention.query_weight.bias = - reinterpret_cast( - cross_attn_query_bias[i].data()); - // key - 
params[i].cross_attention.key_weight.kernel = - reinterpret_cast( - cross_attn_key_weight[i].data()); - params[i].cross_attention.key_weight.bias = - reinterpret_cast( - cross_attn_key_bias[i].data()); - // value - params[i].cross_attention.value_weight.kernel = - reinterpret_cast( - cross_attn_value_weight[i].data()); - params[i].cross_attention.value_weight.bias = - reinterpret_cast( - cross_attn_value_bias[i].data()); - // out proj - params[i].cross_attention.attention_output_weight.kernel = - reinterpret_cast( - cross_attn_output_weight[i].data()); - params[i].cross_attention.attention_output_weight.bias = - reinterpret_cast( - cross_attn_output_bias[i].data()); - - // ffn - params[i].ffn_layernorm.gamma = reinterpret_cast( - ffn_layernorm_weight[i].data()); - params[i].ffn_layernorm.beta = reinterpret_cast( - ffn_layernorm_bias[i].data()); - // intermediate proj - params[i].ffn.intermediate_weight.kernel = - reinterpret_cast( - ffn_intermediate_weight[i].data()); - params[i].ffn.intermediate_weight.bias = reinterpret_cast( - ffn_intermediate_bias[i].data()); - // out proj - params[i].ffn.output_weight.kernel = reinterpret_cast( - ffn_output_weight[i].data()); - params[i].ffn.output_weight.bias = - reinterpret_cast(ffn_output_bias[i].data()); - } - - decoding_params.layernorm.gamma = reinterpret_cast( - decoder_layernorm_weight.data()); - decoding_params.layernorm.beta = reinterpret_cast( - decoder_layernorm_bias.data()); - - // for embedding - decoding_params.embedding_table = - reinterpret_cast(word_emb.data()); - - // for weight sharing matmul - decoding_params.embedding_kernel = - reinterpret_cast(embedding_weight.data()); - // for matmul bias - decoding_params.embedding_bias = - reinterpret_cast(embedding_bias.data()); - - decoding_params.position_encoding_table = reinterpret_cast( - position_encoding_table.data()); - - int finished_candidate_num_ = - ("beam_search_v3" == decoding_strategy) ? beam_width_ : beam_width_ * 2; - - ActivationType activate = - (hidden_act == "gelu") ? 
ActivationType::GELU : ActivationType::RELU; - - if ("beam_search" == decoding_strategy) { - DecodingBeamsearch* decoding_beam_search_; - decoding_beam_search_ = new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, /*is_fuse_topk_softMax*/ - true, /*is_fuse_qkv*/ - false, /*keep_alive_beam*/ - alpha, /*alpha not used for this case*/ - true, - 0, /*pos_offset BART and MBART only for now*/ - activate, - false, // pos_bias - false /*prefix_lm*/, - -1, /*finished_candidate_num*/ - false, /*early_stopping*/ - false, /*is_mbart */ - min_seq_len_ /*min_length*/); - - decoding_beam_search_->forward(params, decoding_params); - - delete decoding_beam_search_; - } else if ("beam_search_v2" == decoding_strategy || - "beam_search_v3" == decoding_strategy) { - DecodingBeamsearch* decoding_beam_search_; - decoding_beam_search_ = new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, // is_fuse_topk_softMax - true, // is_fuse_qkv - true, // keep_alive_beam - alpha, - true, // normalize_before - 0, /*pos_offset BART and MBART only for now*/ - activate, - false, // pos_bias - false /*prefix_lm*/, - finished_candidate_num_, /*finished_candidate_num*/ - early_stopping, /*early_stopping*/ - false, /*is_mbart */ - min_seq_len_ /*min_length*/); - - decoding_beam_search_->forward(params, decoding_params); - - delete decoding_beam_search_; - } else if ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || - "sampling" == decoding_strategy) { - DecodingSampling* decoding_sampling_; - decoding_sampling_ = new DecodingSampling( - allocator_, - batch_size_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - candidate_num_, - probability_threshold_, - true, /*is_fuse_qkv*/ - true, // normalize_before - 0, /*pos_offset BART and MBART only for now*/ - activate, - false, // pos_bias - temperature, // temperature - 1.0, // repeat_penalty - false, // prefix_lm - false, /*is_mbart */ - min_seq_len_ /*min_length*/); - - decoding_sampling_->forward(params, decoding_params); - - delete decoding_sampling_; - } else { - PD_THROW( - "Only beam_search, beam_search_v2 and sampling are supported for " - "FastGeneration. 
"); - } - delete[] params; - - return {output_ids, parent_ids, sequence_length}; -} - -std::vector PegasusDecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const float temperature, - const int64_t max_len, - const int64_t min_len, - const float beam_search_diversity_rate, - const float alpha, - const bool early_stopping, - const std::string hidden_act) { - auto stream = input.stream(); - cublasSetStream(CublasHandle::GetInstance()->cublas_handle_, stream); - - std::vector ret; - - switch (input.type()) { - case paddle::DataType::FLOAT16: { - ret = pegasus_decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - max_len, - min_len, - beam_search_diversity_rate, - alpha, - early_stopping, - hidden_act, - stream); - break; - } - case paddle::DataType::FLOAT32: { - ret = pegasus_decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - 
ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - max_len, - min_len, - beam_search_diversity_rate, - alpha, - early_stopping, - hidden_act, - stream); - break; - } - default: { - PD_THROW( - "NOT supported data type. " - "Only float16 and float32 are supported. "); - break; - } - } - - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.h b/paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.h deleted file mode 100644 index 43ccad5a23c7..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.h +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include -#include - -#include "fastertransformer/decoding_beamsearch.h" -#include "fastertransformer/decoding_sampling.h" -#include "fastertransformer/open_decoder.h" -#include "fastertransformer/utils/common.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - - -std::vector PegasusDecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int head_num, - const int size_per_head, - const int num_layer, - const int start_id, - const int end_id, - const float temperature, - const int64_t max_len, - const int64_t 
min_len, - const float beam_search_diversity_rate, - const float alpha, - const bool early_stopping, - const std::string hidden_act); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.cc deleted file mode 100644 index 840b23b03929..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.cc +++ /dev/null @@ -1,377 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "fusion_t5_decoding_op.h" - -#include -#include - -#include "pd_traits.h" - - -std::vector T5DecodingForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight_0, - const std::vector& ffn_inter_bias_0, - const std::vector& ffn_inter_weight_1, - const std::vector& ffn_inter_bias_1, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& self_relative_attention_bias_weight, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const bool& rel_len, - const float& alpha, - const float& temperature, - const bool& early_stopping, - const int& max_distance, - const int& num_buckets, - const bool& tie_word_embeddings, - const std::string& act) { - int batch_size = input.shape()[0]; - int max_out_len = rel_len ? 
max_len + input.shape()[1] : max_len; - - std::vector output_dims; - std::vector parent_ids_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - batch_size /= beam_size; - output_dims = {max_out_len, batch_size, beam_size}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_out_len, batch_size, beam_size * 2}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling" || - decoding_strategy == "sampling") { - output_dims = {max_out_len, batch_size}; - parent_ids_dims = {1}; - } else { - PD_THROW("Not supported decoding strategy. "); - } - - if (input.place() == paddle::PlaceType::kGPU) { - auto output_ids = paddle::Tensor(paddle::PlaceType::kGPU, output_dims); - auto parent_ids = paddle::Tensor(paddle::PlaceType::kGPU, parent_ids_dims); - auto sequence_length = - paddle::Tensor(paddle::PlaceType::kGPU, sequence_length_dims); - - paddle::Tensor seq_len = paddle::Tensor(paddle::PlaceType::kGPU); - - if (mem_seq_len.place() != paddle::PlaceType::kGPU) { - seq_len = mem_seq_len.copy_to(paddle::PlaceType::kGPU); - } else { - seq_len = mem_seq_len; - } - - return T5DecodingCUDAForward(input, - seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight_0, - ffn_inter_bias_0, - ffn_inter_weight_1, - ffn_inter_bias_1, - ffn_out_weight, - ffn_out_bias, - self_relative_attention_bias_weight, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_out_len, - beam_search_diversity_rate, - alpha, - temperature, - early_stopping, - max_distance, - num_buckets, - tie_word_embeddings, - act); - } else { - PD_THROW("Not implemented place. Only GPU is supported. 
"); - } -} - -std::vector> T5DecodingInferShape( - const std::vector& input_shape, - const std::vector& mem_seq_len_shape, - const std::vector& word_embedding_shape, - const std::vector>& self_ln_weight_shapes, - const std::vector>& self_ln_bias_shapes, - const std::vector>& self_q_weight_shapes, - const std::vector>& self_q_bias_shapes, - const std::vector>& self_k_weight_shapes, - const std::vector>& self_k_bias_shapes, - const std::vector>& self_v_weight_shapes, - const std::vector>& self_v_bias_shapes, - const std::vector>& self_out_weight_shapes, - const std::vector>& self_out_bias_shapes, - const std::vector>& cross_ln_weight_shapes, - const std::vector>& cross_ln_bias_shapes, - const std::vector>& cross_q_weight_shapes, - const std::vector>& cross_q_bias_shapes, - const std::vector>& cross_k_weight_shapes, - const std::vector>& cross_k_bias_shapes, - const std::vector>& cross_v_weight_shapes, - const std::vector>& cross_v_bias_shapes, - const std::vector>& cross_out_weight_shapes, - const std::vector>& cross_out_bias_shapes, - const std::vector>& ffn_ln_weight_shapes, - const std::vector>& ffn_ln_bias_shapes, - const std::vector>& ffn_inter_weight_0_shapes, - const std::vector>& ffn_inter_bias_0_shapes, - const std::vector>& ffn_inter_weight_1_shapes, - const std::vector>& ffn_inter_bias_1_shapes, - const std::vector>& ffn_out_weight_shapes, - const std::vector>& ffn_out_bias_shapes, - const std::vector& self_relative_attention_bias_weight_shapes, - const std::vector& decoder_ln_weight_shape, - const std::vector& decoder_ln_bias_shape, - const std::vector& embedding_weight_shape, - const std::vector& embedding_bias_shape, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const bool& rel_len, - const float& alpha, - const float& temperature, - const bool& early_stopping, - const int& max_distance, - const int& num_buckets, - const bool& tie_word_embeddings, - const std::string& act) { - int batch_size = input_shape[0]; - - std::vector output_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - if (batch_size != -1) { - batch_size /= beam_size; - } - output_dims = {max_len, batch_size, beam_size}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_len, batch_size, beam_size * 2}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling" || - decoding_strategy == "sampling") { - output_dims = {max_len, batch_size}; - return {output_dims, {1}, sequence_length_dims}; - } else { - PD_THROW("Not supported decoding strategy. 
"); - } -} - -std::vector T5DecodingInferDtype( - const paddle::DataType& input, - const paddle::DataType& mem_seq_len, - const paddle::DataType& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight_0, - const std::vector& ffn_inter_bias_0, - const std::vector& ffn_inter_weight_1, - const std::vector& ffn_inter_bias_1, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::DataType& self_relative_attention_bias_weight, - const paddle::DataType& decoder_ln_weight, - const paddle::DataType& decoder_ln_bias, - const paddle::DataType& embedding_weight, - const paddle::DataType& embedding_bias) { - return {paddle::DataType::INT32, - paddle::DataType::INT32, - paddle::DataType::INT32}; -} - - -PD_BUILD_OP(fusion_t5_decoding) - .Inputs({"Input", - "MemSeqLen", - "WordEmbedding", - paddle::Vec("SelfLayernormWeight"), - paddle::Vec("SelfLayernormBias"), - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfQueryBias"), - paddle::Vec("SelfKeyWeight"), - paddle::Vec("SelfKeyBias"), - paddle::Vec("SelfValueWeight"), - paddle::Vec("SelfValueBias"), - paddle::Vec("SelfOutWeight"), - paddle::Vec("SelfOutBias"), - paddle::Vec("CrossLayernormWeight"), - paddle::Vec("CrossLayernormBias"), - paddle::Vec("CrossQueryWeight"), - paddle::Vec("CrossQueryBias"), - paddle::Vec("CrossKeyWeight"), - paddle::Vec("CrossKeyBias"), - paddle::Vec("CrossValueWeight"), - paddle::Vec("CrossValueBias"), - paddle::Vec("CrossOutWeight"), - paddle::Vec("CrossOutBias"), - paddle::Vec("FFNLayernormWeight"), - paddle::Vec("FFNLayernormBias"), - paddle::Vec("FFNInterWeight0"), - paddle::Vec("FFNInterBias0"), - paddle::Vec("FFNInterWeight1"), - paddle::Vec("FFNInterBias1"), - paddle::Vec("FFNOutWeight"), - paddle::Vec("FFNOutBias"), - "SelfRelativeAttentionBiasWeight", - "DecoderLayernormWeight", - "DecoderLayernormBias", - "EmbWeight", - "EmbBias"}) - .Outputs({"OutputIds", "ParentIds", "SequenceLength"}) - .Attrs({"decoding_strategy: std::string", - "beam_size: int", - "topk: int", - "topp: float", - "n_head: int", - "size_per_head: int", - "num_layer: int", - "bos_id: int", - "eos_id: int", - "max_len: int64_t", - "beam_search_diversity_rate: float", - "rel_len: bool", - "alpha: float", - "temperature: float", - "early_stopping: bool", - "max_distance: int", - "num_buckets: int", - "tie_word_embeddings: bool", - "act: std::string"}) - .SetKernelFn(PD_KERNEL(T5DecodingForward)) - .SetInferShapeFn(PD_INFER_SHAPE(T5DecodingInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(T5DecodingInferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.cu deleted file mode 100644 index 5fd211f5fd22..000000000000 --- 
a/paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.cu +++ /dev/null @@ -1,635 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include -#include -#include - -#include -#include -#include -#include -#include - - -#include "fusion_t5_decoding_op.h" -#include "pd_traits.h" - -template -std::vector t5_decoding_kernel( - const paddle::Tensor& input, - const paddle::Tensor& memory_sequence_length, - const paddle::Tensor& word_emb, - const std::vector& self_layernorm_weight, - const std::vector& self_layernorm_bias, - const std::vector& self_attn_query_weight, - const std::vector& self_attn_query_bias, - const std::vector& self_attn_key_weight, - const std::vector& self_attn_key_bias, - const std::vector& self_attn_value_weight, - const std::vector& self_attn_value_bias, - const std::vector& self_attn_output_weight, - const std::vector& self_attn_output_bias, - const std::vector& cross_layernorm_weight, - const std::vector& cross_layernorm_bias, - const std::vector& cross_attn_query_weight, - const std::vector& cross_attn_query_bias, - const std::vector& cross_attn_key_weight, - const std::vector& cross_attn_key_bias, - const std::vector& cross_attn_value_weight, - const std::vector& cross_attn_value_bias, - const std::vector& cross_attn_output_weight, - const std::vector& cross_attn_output_bias, - const std::vector& ffn_layernorm_weight, - const std::vector& ffn_layernorm_bias, - const std::vector& ffn_intermediate_weight_0, - const std::vector& ffn_intermediate_bias_0, - const std::vector& ffn_intermediate_weight_1, - const std::vector& ffn_intermediate_bias_1, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - const paddle::Tensor& self_relative_attention_bias_weight, - const paddle::Tensor& decoder_layernorm_weight, - const paddle::Tensor& decoder_layernorm_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& head_num_, - const int& size_per_head_, - const int& num_layer_, - const int& start_id_, - const int& end_id_, - const int64_t& max_seq_len_, - const float& beam_search_diversity_rate_, - const float& alpha, - const float& temperature, - const bool& early_stopping, - const int& max_distance, - const int& num_buckets, - const bool& tie_word_embeddings, - const std::string& act, - cudaStream_t stream) { - int beam_width_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? beam_size - : 1; - int candidate_num_ = - (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling" || decoding_strategy == "sampling") - ? 
topk - : 1; - float probability_threshold_ = - (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling" || decoding_strategy == "sampling") - ? topp - : 0.0; - - auto input_dims = input.shape(); - int batch_size_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? input_dims[0] / beam_width_ - : input_dims[0]; - const int memory_max_seq_len = input_dims[1]; - const int memory_hidden_dim = input_dims[2]; - const int vocab_size = word_emb.shape()[0]; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - DecodingInitParam decoding_params; - decoding_params.cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - decoding_params.cublaslt_handle = - CublasHandle::GetInstance()->cublaslt_handle_; - - decoding_params.output_ids = output_ids.mutable_data(input.place()); - decoding_params.parent_ids = parent_ids.mutable_data(input.place()); - decoding_params.sequence_length = - sequence_length.mutable_data(input.place()); - - typedef DecoderTransformerTraits DecodingTraits_; - decoding_params.stream = stream; - fastertransformer::Allocator allocator_(stream); - - decoding_params.memory_tensor = - reinterpret_cast(input.data()); - decoding_params.memory_sequence_length = memory_sequence_length.data(); - - DecoderInitParam* params = - new DecoderInitParam[num_layer_]; - - int inner_coeff = ffn_intermediate_weight_0[0].shape()[1] / memory_hidden_dim; - int inner_size = ffn_intermediate_weight_0[0].shape()[1]; - - auto q_weight_shape = self_attn_query_weight[0].shape(); - auto k_weight_shape = self_attn_key_weight[0].shape(); - - if (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - decoding_params.request_batch_size = batch_size_ * beam_width_; - } else if (decoding_strategy == "sampling" || - decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - decoding_params.request_batch_size = batch_size_; - } - - bool use_gated = false; - - for (int i = 0; i < num_layer_; i++) { - params[i].stream = stream; - params[i].cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - params[i].cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - - if (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - params[i].request_batch_size = batch_size_ * beam_width_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } else if (decoding_strategy == "sampling" || - decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - params[i].request_batch_size = batch_size_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } - - // self attn - params[i].self_layernorm.gamma = reinterpret_cast( - self_layernorm_weight[i].data()); - - if (self_layernorm_bias[i].shape()[0] != 1) { - params[i].self_layernorm.beta = reinterpret_cast( - self_layernorm_bias[i].data()); - } else { - params[i].self_layernorm.beta = nullptr; - } - - // query - params[i].self_attention.query_weight.kernel = - reinterpret_cast( - self_attn_query_weight[i].data()); - params[i].self_attention.query_weight.bias = - reinterpret_cast( - self_attn_query_bias[i].data()); - // key - params[i].self_attention.key_weight.kernel = - reinterpret_cast( - self_attn_key_weight[i].data()); - params[i].self_attention.key_weight.bias = - reinterpret_cast( - 
self_attn_key_bias[i].data()); - // value - params[i].self_attention.value_weight.kernel = - reinterpret_cast( - self_attn_value_weight[i].data()); - params[i].self_attention.value_weight.bias = - reinterpret_cast( - self_attn_value_bias[i].data()); - // out proj - params[i].self_attention.attention_output_weight.kernel = - reinterpret_cast( - self_attn_output_weight[i].data()); - params[i].self_attention.attention_output_weight.bias = - reinterpret_cast( - self_attn_output_bias[i].data()); - - // cross - params[i].cross_layernorm.gamma = reinterpret_cast( - cross_layernorm_weight[i].data()); - if (cross_layernorm_bias[i].shape()[0] != 1) { - params[i].cross_layernorm.beta = reinterpret_cast( - cross_layernorm_bias[i].data()); - } else { - params[i].cross_layernorm.beta = nullptr; - } - // query - params[i].cross_attention.query_weight.kernel = - reinterpret_cast( - cross_attn_query_weight[i].data()); - params[i].cross_attention.query_weight.bias = - reinterpret_cast( - cross_attn_query_bias[i].data()); - // key - params[i].cross_attention.key_weight.kernel = - reinterpret_cast( - cross_attn_key_weight[i].data()); - params[i].cross_attention.key_weight.bias = - reinterpret_cast( - cross_attn_key_bias[i].data()); - // value - params[i].cross_attention.value_weight.kernel = - reinterpret_cast( - cross_attn_value_weight[i].data()); - params[i].cross_attention.value_weight.bias = - reinterpret_cast( - cross_attn_value_bias[i].data()); - // out proj - params[i].cross_attention.attention_output_weight.kernel = - reinterpret_cast( - cross_attn_output_weight[i].data()); - params[i].cross_attention.attention_output_weight.bias = - reinterpret_cast( - cross_attn_output_bias[i].data()); - - // ffn - params[i].ffn_layernorm.gamma = reinterpret_cast( - ffn_layernorm_weight[i].data()); - if (ffn_layernorm_bias[i].shape()[0] != 1) { - params[i].ffn_layernorm.beta = reinterpret_cast( - ffn_layernorm_bias[i].data()); - } else { - params[i].ffn_layernorm.beta = nullptr; - } - // intermediate proj - params[i].ffn.intermediate_weight.kernel = - reinterpret_cast( - ffn_intermediate_weight_0[i].data()); - params[i].ffn.intermediate_weight.bias = reinterpret_cast( - ffn_intermediate_bias_0[i].data()); - - if (ffn_intermediate_weight_1[i].shape()[0] != 1) { - use_gated = true; - params[i].ffn.intermediate_weight_1.kernel = - reinterpret_cast( - ffn_intermediate_weight_1[i].data()); - params[i].ffn.intermediate_weight_1.bias = reinterpret_cast( - ffn_intermediate_bias_1[i].data()); - } else { - params[i].ffn.intermediate_weight_1.kernel = nullptr; - params[i].ffn.intermediate_weight_1.bias = nullptr; - } - - // out proj - params[i].ffn.output_weight.kernel = reinterpret_cast( - ffn_output_weight[i].data()); - params[i].ffn.output_weight.bias = - reinterpret_cast(ffn_output_bias[i].data()); - } - - // relative bias - decoding_params.self_relative_attention_bias_weight = - reinterpret_cast( - self_relative_attention_bias_weight.data()); - - decoding_params.layernorm.gamma = reinterpret_cast( - decoder_layernorm_weight.data()); - if (decoder_layernorm_bias.shape()[0] != 1) { - decoding_params.layernorm.beta = reinterpret_cast( - decoder_layernorm_bias.data()); - } else { - decoding_params.layernorm.beta = nullptr; - } - // for embedding - decoding_params.embedding_table = - reinterpret_cast(word_emb.data()); - - // for weight sharing matmul - decoding_params.embedding_kernel = - reinterpret_cast(embedding_weight.data()); - // for matmul bias - decoding_params.embedding_bias = - 
reinterpret_cast(embedding_bias.data()); - - int finished_candidate_num_ = - ("beam_search_v3" == decoding_strategy) ? beam_width_ : beam_width_ * 2; - - ActivationType activate = - (act == "gelu") ? ActivationType::GELU : ActivationType::RELU; - - if ("beam_search" == decoding_strategy) { - T5DecodingBeamsearch* decoding_beam_search_; - decoding_beam_search_ = new T5DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, // is_fuse_topk_softMax - true, // fuse_qkv - false, // keep_alive_beam - 0.6, // alpha - true, // normalization_before - activate, - -1, // finished_candidate_num - false, // early_stopping - 0, // min_length - inner_coeff, - inner_size, - num_buckets, - max_distance, - tie_word_embeddings, - use_gated); - - decoding_beam_search_->forward(params, decoding_params); - - delete decoding_beam_search_; - } else if ("beam_search_v2" == decoding_strategy || - "beam_search_v3" == decoding_strategy) { - T5DecodingBeamsearch* decoding_beam_search_; - decoding_beam_search_ = new T5DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, // is_fuse_topk_softMax - true, // fuse_qkv - true, // keep_alive_beam - alpha, - true, // normalization_before - activate, - finished_candidate_num_, - early_stopping, - 0, // min_length - inner_coeff, - inner_size, - num_buckets, - max_distance, - tie_word_embeddings, - use_gated); - - decoding_beam_search_->forward(params, decoding_params); - - delete decoding_beam_search_; - } else if ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || - "sampling" == decoding_strategy) { - - T5DecodingSampling* decoding_sampling_; - decoding_sampling_ = new T5DecodingSampling( - allocator_, - batch_size_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - candidate_num_, - probability_threshold_, - true, // fuse_qkv - true, // normalization_before - activate, - 1.0, // temperature - 1.0, // repeat_penalty - 0, // min_length - inner_coeff, - inner_size, - -1, // seed - 1, // tensor_para_size - 1, // layer_para_size - num_buckets, - max_distance, - tie_word_embeddings, - use_gated); - - decoding_sampling_->forward(params, decoding_params); - - delete decoding_sampling_; - - } else { - PD_THROW( - "Only beam_search, topk_sampling and topp_sampling are supported for " - "FastGeneration. 
"); - } - delete[] params; - - return {output_ids, parent_ids, sequence_length}; -} - -std::vector T5DecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight_0, - const std::vector& ffn_inter_bias_0, - const std::vector& ffn_inter_weight_1, - const std::vector& ffn_inter_bias_1, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& self_relative_attention_bias_weight, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const float& alpha, - const float& temperature, - const bool& early_stopping, - const int& max_distance, - const int& num_buckets, - const bool& tie_word_embeddings, - const std::string& act) { - auto stream = input.stream(); - - cublasSetStream(CublasHandle::GetInstance()->cublas_handle_, stream); - - std::vector ret; - - switch (input.type()) { - case paddle::DataType::FLOAT16: { - ret = t5_decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight_0, - ffn_inter_bias_0, - ffn_inter_weight_1, - ffn_inter_bias_1, - ffn_out_weight, - ffn_out_bias, - self_relative_attention_bias_weight, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - beam_search_diversity_rate, - alpha, - temperature, - early_stopping, - max_distance, - num_buckets, - tie_word_embeddings, - act, - stream); - break; - } - case paddle::DataType::FLOAT32: { - ret = t5_decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - 
cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight_0, - ffn_inter_bias_0, - ffn_inter_weight_1, - ffn_inter_bias_1, - ffn_out_weight, - ffn_out_bias, - self_relative_attention_bias_weight, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - beam_search_diversity_rate, - alpha, - temperature, - early_stopping, - max_distance, - num_buckets, - tie_word_embeddings, - act, - stream); - break; - } - default: { - PD_THROW( - "NOT supported data type. " - "Only float16 and float32 are supported. "); - break; - } - } - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.h b/paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.h deleted file mode 100644 index 1fe581d4879c..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.h +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include -#include - -#include "cublas_handle.h" -#include "fastertransformer/open_decoder.h" -#include "fastertransformer/t5_beamsearch.h" -#include "fastertransformer/t5_sampling.h" -#include "fastertransformer/utils/common.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - - -std::vector T5DecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight_0, - const std::vector& ffn_inter_bias_0, - const std::vector& ffn_inter_weight_1, - const std::vector& ffn_inter_bias_1, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& self_relative_attention_bias_weight, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& 
embedding_weight, - const paddle::Tensor& embedding_bias, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const float& alpha, - const float& temperature, - const bool& early_stopping, - const int& max_distance, - const int& num_buckets, - const bool& tie_word_embeddings, - const std::string& act); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.cc deleted file mode 100644 index 6053ad48a9b1..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.cc +++ /dev/null @@ -1,417 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include - -#include "fusion_unified_decoding_op.h" -#include "pd_traits.h" - - -std::vector UnifiedDecodingForward( - const paddle::Tensor& input_ids, - const paddle::Tensor& attn_mask, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& type_id, - const paddle::Tensor& decoder_type_id, - const paddle::Tensor& logits_mask, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& trans_weight, - const paddle::Tensor& trans_bias, - const paddle::Tensor& lm_ln_weight, - const paddle::Tensor& lm_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& type_embedding_weight, - const paddle::Tensor& role_id, - const paddle::Tensor& decoder_role_id, - const paddle::Tensor& role_embedding_table, - const paddle::Tensor& position_ids, - const paddle::Tensor& decoder_position_ids, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const int& unk_id, - const int& mask_id, - const float& temperature, - const float& len_penalty, - const bool& normalize_before, - const 
bool& pos_bias, - const std::string& hidden_act, - const bool& rel_len, - const bool& early_stopping, - const int& min_length, - const int& tensor_para_size, - const int& layer_para_size, - const int& layer_para_batch_size) { - int batch_size = input_ids.shape()[0]; - int max_out_len = rel_len ? max_len + input_ids.shape()[1] : max_len; - - std::vector output_ids_dims; - std::vector output_scores_dims; - std::vector parent_ids_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - if (batch_size != -1) { - batch_size /= beam_size; - } - output_ids_dims = {max_out_len, batch_size, beam_size}; - output_scores_dims = {batch_size, beam_size}; - parent_ids_dims = output_ids_dims; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_ids_dims = {max_out_len, batch_size, beam_size * 2}; - output_scores_dims = {batch_size, beam_size * 2}; - parent_ids_dims = output_ids_dims; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling" || - decoding_strategy == "sampling") { - output_ids_dims = {max_out_len, batch_size}; - output_scores_dims = {batch_size}; - parent_ids_dims = {1}; - } else { - PD_THROW("Not supported decoding strategy. "); - } - auto output_ids = paddle::Tensor(input_ids.place(), output_ids_dims); - auto parent_ids = paddle::Tensor(input_ids.place(), parent_ids_dims); - auto sequence_length = - paddle::Tensor(input_ids.place(), sequence_length_dims); - auto output_scores = paddle::Tensor(input_ids.place(), output_scores_dims); - - if (input_ids.place() == paddle::PlaceType::kGPU) { - auto mem_seq_length = paddle::Tensor(paddle::PlaceType::kGPU); - - if (mem_seq_len.place() != paddle::PlaceType::kGPU) { - mem_seq_length = mem_seq_len.copy_to(paddle::PlaceType::kGPU); - } else { - mem_seq_length = mem_seq_len; - } - - return UnifiedDecodingCUDAForward(input_ids, - attn_mask, - mem_seq_length, - type_id, - decoder_type_id, - logits_mask, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - trans_weight, - trans_bias, - lm_ln_weight, - lm_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - type_embedding_weight, - role_id, - decoder_role_id, - role_embedding_table, - position_ids, - decoder_position_ids, - output_ids, - parent_ids, - sequence_length, - output_scores, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_out_len, - beam_search_diversity_rate, - unk_id, - mask_id, - temperature, - len_penalty, - normalize_before, - pos_bias, - hidden_act, - early_stopping, - min_length, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - } else { - PD_THROW("Not implemented place. Only GPU is supported. 
"); - } -} - -std::vector> UnifiedDecodingInferShape( - const std::vector& input_ids_shape, - const std::vector& attn_mask_shape, - const std::vector& mem_seq_len_shape, - const std::vector& logits_mask_shape, - const std::vector& type_id_shape, - const std::vector& decoder_type_id_shape, - const std::vector& word_embedding_shape, - const std::vector>& self_ln_weight_shapes, - const std::vector>& self_ln_bias_shapes, - const std::vector>& self_q_weight_shapes, - const std::vector>& self_q_bias_shapes, - const std::vector>& self_k_weight_shapes, - const std::vector>& self_k_bias_shapes, - const std::vector>& self_v_weight_shapes, - const std::vector>& self_v_bias_shapes, - const std::vector>& self_out_weight_shapes, - const std::vector>& self_out_bias_shapes, - const std::vector>& ffn_ln_weight_shapes, - const std::vector>& ffn_ln_bias_shapes, - const std::vector>& ffn_inter_weight_shapes, - const std::vector>& ffn_inter_bias_shapes, - const std::vector>& ffn_out_weight_shapes, - const std::vector>& ffn_out_bias_shapes, - const std::vector& decoder_ln_weight_shape, - const std::vector& decoder_ln_bias_shape, - const std::vector& trans_weight_shape, - const std::vector& trans_bias_shape, - const std::vector& lm_ln_weight_shape, - const std::vector& lm_ln_bias_shape, - const std::vector& embedding_weight_shape, - const std::vector& embedding_bias_shape, - const std::vector& positional_embedding_weight_shape, - const std::vector& type_embedding_weight_shape, - const std::vector& role_id_shape, - const std::vector& decoder_role_id_shape, - const std::vector& role_embedding_table_shape, - const std::vector& position_ids_shape, - const std::vector& decoder_position_ids_shape, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const int& unk_id, - const int& mask_id, - const float& temperature, - const float& len_penalty, - const bool& normalize_before, - const bool& pos_bias, - const std::string& hidden_act, - const bool& rel_len, - const bool& early_stopping, - const int& min_length, - const int& tensor_para_size = 1, - const int& layer_para_size = 1, - const int& layer_para_batch_size = 1) { - int batch_size = input_ids_shape[0]; - - std::vector output_ids_dims; - std::vector output_scores_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - if (batch_size != -1) { - batch_size /= beam_size; - } - output_ids_dims = {max_len, batch_size, beam_size}; - output_scores_dims = {batch_size, beam_size}; - return {output_ids_dims, output_ids_dims, sequence_length_dims, output_scores_dims}; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. 
- if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_ids_dims = {max_len, batch_size, beam_size * 2}; - output_scores_dims = {batch_size, beam_size * 2}; - return {output_ids_dims, output_ids_dims, sequence_length_dims, output_scores_dims}; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling" || - decoding_strategy == "sampling") { - output_ids_dims = {max_len, batch_size}; - output_scores_dims = {batch_size}; - return {output_ids_dims, {1}, sequence_length_dims, output_scores_dims}; - } else { - PD_THROW("Not supported decoding strategy. "); - } -} - -std::vector UnifiedDecodingInferDtype( - const paddle::DataType& input_ids, - const paddle::DataType& attn_mask, - const paddle::DataType& mem_seq_len, - const paddle::DataType& logits_mask, - const paddle::DataType& type_id, - const paddle::DataType& decoder_type_id, - const paddle::DataType& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::DataType& decoder_ln_weight, - const paddle::DataType& decoder_ln_bias, - const paddle::DataType& trans_weight, - const paddle::DataType& trans_bias, - const paddle::DataType& lm_ln_weight, - const paddle::DataType& lm_ln_bias, - const paddle::DataType& embedding_weight, - const paddle::DataType& embedding_bias, - const paddle::DataType& positional_embedding_weight, - const paddle::DataType& type_embedding_weight, - const paddle::DataType& role_id, - const paddle::DataType& decoder_role_id, - const paddle::DataType& role_embedding_table, - const paddle::DataType& position_ids, - const paddle::DataType& decoder_position_ids) { - return {paddle::DataType::INT32, - paddle::DataType::INT32, - paddle::DataType::INT32, - paddle::DataType::FLOAT32}; -} - -PD_BUILD_OP(fusion_unified_decoding) - .Inputs({"InputIds", - "AttnMask", - "MemSeqLen", - "TypeIds", - "DecTypeIds", - "LogitsMask", - "WordEmbedding", - paddle::Vec("SelfLayernormWeight"), - paddle::Vec("SelfLayernormBias"), - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfQueryBias"), - paddle::Vec("SelfKeyWeight"), - paddle::Vec("SelfKeyBias"), - paddle::Vec("SelfValueWeight"), - paddle::Vec("SelfValueBias"), - paddle::Vec("SelfOutWeight"), - paddle::Vec("SelfOutBias"), - paddle::Vec("FFNLayernormWeight"), - paddle::Vec("FFNLayernormBias"), - paddle::Vec("FFNInterWeight"), - paddle::Vec("FFNInterBias"), - paddle::Vec("FFNOutWeight"), - paddle::Vec("FFNOutBias"), - "DecoderLayernormWeight", - "DecoderLayernormBias", - "TransWeight", - "TransBias", - "LMLayernormWeight", - "LMLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb", - "TypeEmb", - "RoleIds", - "DecRoleIds", - "RoleEmbedding", - "PositionIds", - "DecPositionIds"}) - .Outputs({"OutputIds", "ParentIds", "SequenceLength", "OutputScores"}) - .Attrs({"decoding_strategy: std::string", - "beam_size: int", - "topk: int", - "topp: float", - "n_head: int", - "size_per_head: int", - "num_layer: int", - 
"bos_id: int", - "eos_id: int", - "max_len: int64_t", - "beam_search_diversity_rate: float", - "unk_id: int", - "mask_id: int", - "temperature: float", - "len_penalty: float", - "normalize_before: bool", - "pos_bias: bool", - "hidden_act: std::string", - "rel_len: bool", - "early_stopping: bool", - "min_length: int", - "tensor_para_size: int", - "layer_para_size: int", - "layer_para_batch_size: int"}) - .SetKernelFn(PD_KERNEL(UnifiedDecodingForward)) - .SetInferShapeFn(PD_INFER_SHAPE(UnifiedDecodingInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(UnifiedDecodingInferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.cu deleted file mode 100644 index 2df429cc9ee9..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.cu +++ /dev/null @@ -1,693 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// TODO(guosheng): `HOST` conflict exists in float.h of paddle and mpi.h of mpi -#include "fusion_unified_decoding_op.h" -#include "pd_traits.h" -#ifdef HOST -#undef HOST -#endif - -#include "fastertransformer/decoding_beamsearch.h" -#include "fastertransformer/decoding_sampling.h" -#include "fastertransformer/utils/common.h" - -#ifdef BUILD_GPT // consistent with FasterTransformer -#include "parallel_utils.h" -#endif - - -template -std::vector unified_decoding_kernel( - const paddle::Tensor& input_ids, - const paddle::Tensor& attn_mask, - const paddle::Tensor& memory_sequence_length, - const paddle::Tensor& type_id, - const paddle::Tensor& decoder_type_id, - const paddle::Tensor& logits_mask, - const paddle::Tensor& word_emb, - const std::vector& self_layernorm_weight, - const std::vector& self_layernorm_bias, - const std::vector& self_attn_query_weight, - const std::vector& self_attn_query_bias, - const std::vector& self_attn_key_weight, - const std::vector& self_attn_key_bias, - const std::vector& self_attn_value_weight, - const std::vector& self_attn_value_bias, - const std::vector& self_attn_output_weight, - const std::vector& self_attn_output_bias, - const std::vector& ffn_layernorm_weight, - const std::vector& ffn_layernorm_bias, - const std::vector& ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - const paddle::Tensor& decoder_layernorm_weight, - const paddle::Tensor& decoder_layernorm_bias, - const paddle::Tensor& trans_weight, - const paddle::Tensor& trans_bias, - const paddle::Tensor& lm_layernorm_weight, - const paddle::Tensor& lm_layernorm_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& position_encoding_table, - const paddle::Tensor& type_embedding_weight, - const paddle::Tensor& role_id, - const paddle::Tensor& decoder_role_id, - const paddle::Tensor& 
role_embedding_table, - const paddle::Tensor& position_ids, - const paddle::Tensor& decoder_position_ids, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - paddle::Tensor& output_scores, - const std::string& decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int head_num_, - const int size_per_head_, - const int num_layer_, - const int start_id_, - const int end_id_, - const int64_t max_seq_len_, - const float beam_search_diversity_rate_, - const int unk_id, - const int mask_id, - const float temperature, - const float len_penalty, - const bool normalize_before, - const bool pos_bias, - const std::string& hidden_act, - const bool early_stopping, - const int min_length, - cudaStream_t stream, - const int tensor_para_size = 1, - const int layer_para_size = 1, - const int layer_para_batch_size = 1) { - int beam_width_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? beam_size - : 1; - int candidate_num_ = - ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || "sampling" == decoding_strategy) - ? topk - : 1; - float probability_threshold_ = - ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || "sampling" == decoding_strategy) - ? topp - : 0.0; - - auto input_ids_dims = input_ids.shape(); - int batch_size_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? input_ids_dims[0] / beam_width_ - : input_ids_dims[0]; - const int memory_max_seq_len = input_ids_dims[1]; - const int memory_hidden_dim = head_num_ * size_per_head_; - const int vocab_size = word_emb.shape()[0]; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - DecodingInitParam decoding_params; - decoding_params.cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - decoding_params.cublaslt_handle = - CublasHandle::GetInstance()->cublaslt_handle_; - - decoding_params.output_ids = output_ids.mutable_data(input_ids.place()); - decoding_params.parent_ids = parent_ids.mutable_data(input_ids.place()); - decoding_params.sequence_length = - sequence_length.mutable_data(input_ids.place()); - decoding_params.output_scores = output_scores.mutable_data(input_ids.place()); - - typedef DecoderTransformerTraits DecodingTraits_; - decoding_params.stream = stream; - fastertransformer::Allocator allocator_(stream); - - decoding_params.d_start_ids = const_cast(input_ids.data()); - decoding_params.d_attn_mask = - reinterpret_cast(const_cast(attn_mask.data())); - decoding_params.d_start_lengths = memory_sequence_length.data(); - - decoding_params.memory_sequence_length = memory_sequence_length.data(); - decoding_params.type_id = type_id.data(); - decoding_params.decoder_type_id = decoder_type_id.data(); - - if (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - decoding_params.request_batch_size = batch_size_ * beam_width_; - } else if (decoding_strategy == "sampling" || - decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - decoding_params.request_batch_size = batch_size_; - } - decoding_params.max_input_len = memory_max_seq_len; - decoding_params.request_input_len = memory_max_seq_len; - decoding_params.request_output_len = max_seq_len_; - -#ifdef BUILD_GPT - auto* 
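The opening of the kernel above folds the op attributes into the FasterTransformer-style parameters (beam width, top-k candidate count, top-p threshold, and the effective batch sizes). A hedged Python sketch of that mapping, with illustrative names:

```python
def decoding_runtime_params(decoding_strategy, beam_size, topk, topp, input_batch):
    """Sketch of the attribute-to-parameter mapping in unified_decoding_kernel."""
    is_beam = decoding_strategy in ("beam_search", "beam_search_v2", "beam_search_v3")
    is_sampling = decoding_strategy in ("topk_sampling", "topp_sampling", "sampling")

    beam_width = beam_size if is_beam else 1
    candidate_num = topk if is_sampling else 1             # top-k cut-off
    probability_threshold = topp if is_sampling else 0.0   # top-p cut-off

    # InputIds arrives already tiled by beam for beam search, so the logical
    # batch size is recovered by dividing, while the request batch size handed
    # to the decoder keeps the tiled layout.
    batch_size = input_batch // beam_width if is_beam else input_batch
    request_batch_size = batch_size * beam_width if is_beam else batch_size
    return beam_width, candidate_num, probability_threshold, batch_size, request_batch_size


print(decoding_runtime_params("topp_sampling", beam_size=4, topk=0, topp=0.9, input_batch=8))
# (1, 0, 0.9, 8, 8)
```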
model_para_desc = ModelParaDescFactory::CreateModelParaDesc( - head_num_, - size_per_head_, - num_layer_, - tensor_para_size, - layer_para_size, - layer_para_batch_size, - const_cast(word_emb.data())); - auto& tensor_parallel_param = model_para_desc->tensor_parallel_param; - auto& layer_parallel_param = model_para_desc->layer_parallel_param; - auto seed = model_para_desc->dist(model_para_desc->gen); -#else - TensorParallelParam tensor_parallel_param; - LayerParallelParam layer_parallel_param; - tensor_parallel_param.rank = 0; - tensor_parallel_param.world_size = 1; - tensor_parallel_param.local_head_num_ = head_num_; - tensor_parallel_param.local_hidden_units_ = memory_hidden_dim; - - layer_parallel_param.rank = 0; - layer_parallel_param.world_size = 1; - layer_parallel_param.layers_per_group = num_layer_; - layer_parallel_param.local_batch_size = batch_size_; - int seed = -1; -#endif - - DecoderInitParam* params = - new DecoderInitParam[num_layer_]; - - // Allow python passing partial weights for model parallel. - int inner_coeff = - (memory_hidden_dim == self_attn_output_weight[0].shape()[0]) - ? ffn_intermediate_weight[0].shape()[1] / memory_hidden_dim - : (ffn_intermediate_weight[0].shape()[1] * tensor_para_size / - memory_hidden_dim); - - for (int i = 0; i < self_layernorm_weight.size(); i++) { - // Allow python passing weights of all layers or only passing the - // corresponding layers to save memory. - int layer_idx = self_layernorm_weight.size() != num_layer_ - ? layer_parallel_param.rank * - layer_parallel_param.layers_per_group + - i - : i; - params[layer_idx].stream = stream; - params[layer_idx].cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - params[layer_idx].cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - - if (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - params[layer_idx].request_batch_size = batch_size_ * beam_width_; - params[layer_idx].request_max_mem_seq_len = memory_max_seq_len; - } else if (decoding_strategy == "sampling" || - decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - params[layer_idx].request_batch_size = batch_size_; - params[layer_idx].request_max_mem_seq_len = memory_max_seq_len; - } - - // self attn - params[layer_idx].self_layernorm.gamma = reinterpret_cast( - self_layernorm_weight[i].data()); - params[layer_idx].self_layernorm.beta = reinterpret_cast( - self_layernorm_bias[i].data()); - // query - params[layer_idx].self_attention.query_weight.kernel = - reinterpret_cast( - self_attn_query_weight[i].data()); - params[layer_idx].self_attention.query_weight.bias = - reinterpret_cast( - self_attn_query_bias[i].data()); - // key - params[layer_idx].self_attention.key_weight.kernel = - reinterpret_cast( - self_attn_key_weight[i].data()); - params[layer_idx].self_attention.key_weight.bias = - reinterpret_cast( - self_attn_key_bias[i].data()); - // value - params[layer_idx].self_attention.value_weight.kernel = - reinterpret_cast( - self_attn_value_weight[i].data()); - params[layer_idx].self_attention.value_weight.bias = - reinterpret_cast( - self_attn_value_bias[i].data()); - // out proj - params[layer_idx].self_attention.attention_output_weight.kernel = - reinterpret_cast( - self_attn_output_weight[i].data()); - - params[layer_idx].self_attention.attention_output_weight.bias = - reinterpret_cast( - self_attn_output_bias[i].data()); - - // ffn - params[layer_idx].ffn_layernorm.gamma = reinterpret_cast( - 
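Two small pieces of arithmetic in the per-layer weight loop above are easy to miss: the layer-index remapping that lets Python pass either all layers' weights or only the local pipeline stage's slice, and `inner_coeff`, the FFN expansion ratio corrected for tensor-parallel weight shards. A hedged sketch of both (function names are illustrative):

```python
def global_layer_index(num_passed_layers, num_layer, layer_para_rank, layers_per_group, i):
    """If only the local slice of weights was passed, offset the loop index by
    this pipeline stage's first layer; otherwise local index == global index."""
    if num_passed_layers != num_layer:
        return layer_para_rank * layers_per_group + i
    return i


def ffn_inner_coeff(ffn_inter_cols, hidden_dim, attn_out_rows, tensor_para_size):
    """FFN expansion ratio; when the weights were already split for tensor
    parallelism, multiply back by the tensor-parallel degree."""
    if attn_out_rows == hidden_dim:                      # unsplit weights
        return ffn_inter_cols // hidden_dim
    return ffn_inter_cols * tensor_para_size // hidden_dim  # per-rank shard


# A 2-stage pipeline with 12 layers and 6 layers passed per stage:
print(global_layer_index(6, 12, layer_para_rank=1, layers_per_group=6, i=2))  # 8
print(ffn_inner_coeff(4096, 1024, attn_out_rows=1024, tensor_para_size=2))    # 4
```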
ffn_layernorm_weight[i].data()); - params[layer_idx].ffn_layernorm.beta = reinterpret_cast( - ffn_layernorm_bias[i].data()); - // intermediate proj - params[layer_idx].ffn.intermediate_weight.kernel = - reinterpret_cast( - ffn_intermediate_weight[i].data()); - params[layer_idx].ffn.intermediate_weight.bias = reinterpret_cast( - ffn_intermediate_bias[i].data()); - // out proj - params[layer_idx].ffn.output_weight.kernel = reinterpret_cast( - ffn_output_weight[i].data()); - params[layer_idx].ffn.output_weight.bias = - reinterpret_cast(ffn_output_bias[i].data()); - } - - decoding_params.layernorm.gamma = reinterpret_cast( - decoder_layernorm_weight.data()); - decoding_params.layernorm.beta = reinterpret_cast( - decoder_layernorm_bias.data()); - decoding_params.trans_kernel = - reinterpret_cast(trans_weight.data()); - decoding_params.trans_bias = - reinterpret_cast(trans_bias.data()); - - decoding_params.lm_layernorm.gamma = - reinterpret_cast(lm_layernorm_weight.data()); - decoding_params.lm_layernorm.beta = - reinterpret_cast(lm_layernorm_bias.data()); - - // For embedding - decoding_params.embedding_table = - reinterpret_cast(word_emb.data()); - // For weight sharing matmul - decoding_params.embedding_kernel = - reinterpret_cast(embedding_weight.data()); - // For matmul bias - decoding_params.embedding_bias = - reinterpret_cast(embedding_bias.data()); - decoding_params.position_encoding_table = reinterpret_cast( - position_encoding_table.data()); - - // For masking some id during gen. - decoding_params.logits_mask = - reinterpret_cast(logits_mask.data()); - - decoding_params.type_table = - reinterpret_cast(type_embedding_weight.data()); - - // For role embedding. - auto role_id_shape = role_id.shape(); - if (role_id_shape.size() > 0 && numel(role_id_shape) > 0) { - decoding_params.role_id = role_id.data(); - decoding_params.decoder_role_id = decoder_role_id.data(); - decoding_params.role_embedding_table = - reinterpret_cast(role_embedding_table.data()); - } - - auto position_id_shape = position_ids.shape(); - if (position_id_shape.size() > 0 && numel(position_id_shape) > 0) { - decoding_params.position_ids = position_ids.data(); - decoding_params.decoder_position_ids = decoder_position_ids.data(); - } - - ActivationType activate = - (hidden_act == "gelu") ? ActivationType::GELU : ActivationType::RELU; - - int finished_candidate_num_ = - ("beam_search_v3" == decoding_strategy) ? 
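The role-id and position-id inputs just above are treated as optional: they are only wired into `decoding_params` when the passed tensor actually has elements, as decided by a `numel` check over its shape. A tiny hedged restatement of that convention:

```python
import math


def is_provided(shape):
    """The "optional tensor" convention used for RoleIds/PositionIds above: an
    input counts as provided only if its shape is non-empty and it holds at
    least one element (illustrative helper, not part of the removed op)."""
    return len(shape) > 0 and math.prod(shape) > 0


print(is_provided([2, 16]))  # True  -> role/position ids are wired into decoding_params
print(is_provided([0]))      # False -> the corresponding fields stay unset
print(is_provided([]))       # False
```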
beam_width_ : beam_width_ * 2; - - if ("beam_search" == decoding_strategy) { - DecodingBeamsearch* unified_decoding_beam_search_; - - unified_decoding_beam_search_ = - new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, /*is_fuse_topk_softMax*/ - true, /*is_fuse_qkv*/ - false, /*keep_alive_beam*/ - len_penalty, /*alpha not used for this case*/ - normalize_before, - 0, /*pos_offset BART only for now*/ - activate, - pos_bias, - true, /*prefix_lm*/ - -1, /*finished_candidate_num*/ - false, /*early_stopping*/ - false, /*is_mbart*/ - min_length, - inner_coeff); - unified_decoding_beam_search_->set_tensor_parallel_param( - tensor_parallel_param); - unified_decoding_beam_search_->set_layer_parallel_param( - layer_parallel_param); - unified_decoding_beam_search_->forward_context(params, decoding_params); - unified_decoding_beam_search_->forward(params, decoding_params); - - delete unified_decoding_beam_search_; - } else if ("beam_search_v2" == decoding_strategy || - "beam_search_v3" == decoding_strategy) { - DecodingBeamsearch* unified_decoding_beam_search_; - - unified_decoding_beam_search_ = - new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, /*is_fuse_topk_softMax*/ - true, /*is_fuse_qkv*/ - true, /*keep_alive_beam*/ - len_penalty, - normalize_before, - 0, /*pos_offset BART only for now*/ - activate, - pos_bias, - true, /*prefix_lm*/ - finished_candidate_num_, - early_stopping, - false, /*is_mbart*/ - min_length, - inner_coeff); - unified_decoding_beam_search_->forward_context(params, decoding_params); - unified_decoding_beam_search_->forward(params, decoding_params); - - delete unified_decoding_beam_search_; - } else if ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || - "sampling" == decoding_strategy) { - DecodingSampling* unified_decoding_sampling_; - - unified_decoding_sampling_ = new DecodingSampling( - allocator_, - batch_size_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - candidate_num_, - probability_threshold_, - true, /*is_fuse_qkv*/ - normalize_before, - 0, /*pos_offset BART only for now*/ - activate, - pos_bias, - temperature, - 1.0, /*repeat_penalty*/ - true, /*prefix_lm*/ - false, /*is_mbart*/ - min_length, - inner_coeff, - seed, - tensor_para_size, - layer_para_size); - unified_decoding_sampling_->set_tensor_parallel_param( - tensor_parallel_param); - unified_decoding_sampling_->set_layer_parallel_param(layer_parallel_param); - unified_decoding_sampling_->forward_context(params, decoding_params); - unified_decoding_sampling_->forward(params, decoding_params); - - delete unified_decoding_sampling_; - } else { - PD_THROW( - "Only beam_search, beam_search_v2, topk_sampling and topp_sampling are " - "supported for " - "FastGeneration. 
"); - } - delete[] params; - - return {output_ids, parent_ids, sequence_length, output_scores}; -} - -std::vector UnifiedDecodingCUDAForward( - const paddle::Tensor& input_ids, - const paddle::Tensor& attn_mask, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& type_id, - const paddle::Tensor& decoder_type_id, - const paddle::Tensor& logits_mask, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& trans_weight, - const paddle::Tensor& trans_bias, - const paddle::Tensor& lm_ln_weight, - const paddle::Tensor& lm_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& type_embedding_weight, - const paddle::Tensor& role_id, - const paddle::Tensor& decoder_role_id, - const paddle::Tensor& role_embedding_table, - const paddle::Tensor& position_ids, - const paddle::Tensor& decoder_position_ids, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - paddle::Tensor& output_scores, - const std::string& decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const int64_t max_len, - const float beam_search_diversity_rate, - const int unk_id, - const int mask_id, - const float temperature, - const float len_penalty, - const bool normalize_before, - const bool pos_bias, - const std::string& hidden_act, - const bool early_stopping, - const int min_length, - const int tensor_para_size = 1, - const int layer_para_size = 1, - const int layer_para_batch_size = 1) { - auto stream = input_ids.stream(); - - cublasSetStream(CublasHandle::GetInstance()->cublas_handle_, stream); - - std::vector ret; - - switch (self_ln_weight[0].type()) { - case paddle::DataType::FLOAT16: { - ret = unified_decoding_kernel( - input_ids, - attn_mask, - mem_seq_len, - type_id, - decoder_type_id, - logits_mask, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - trans_weight, - trans_bias, - lm_ln_weight, - lm_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - type_embedding_weight, - role_id, - decoder_role_id, - role_embedding_table, - position_ids, - decoder_position_ids, - output_ids, - parent_ids, - sequence_length, - output_scores, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - beam_search_diversity_rate, - unk_id, - mask_id, - temperature, - len_penalty, - normalize_before, - pos_bias, - hidden_act, 
- early_stopping, - min_length, - stream, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - break; - } - case paddle::DataType::FLOAT32: { - ret = unified_decoding_kernel( - input_ids, - attn_mask, - mem_seq_len, - type_id, - decoder_type_id, - logits_mask, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - trans_weight, - trans_bias, - lm_ln_weight, - lm_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - type_embedding_weight, - role_id, - decoder_role_id, - role_embedding_table, - position_ids, - decoder_position_ids, - output_ids, - parent_ids, - sequence_length, - output_scores, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - beam_search_diversity_rate, - unk_id, - mask_id, - temperature, - len_penalty, - normalize_before, - pos_bias, - hidden_act, - early_stopping, - min_length, - stream, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - break; - } - default: { - PD_THROW( - "NOT supported data type. " - "Only float16 and float32 are supported. "); - break; - } - } - - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.h b/paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.h deleted file mode 100644 index 071636b6029c..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.h +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
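The removed `UnifiedDecodingCUDAForward` wrapper above is essentially one dtype switch: the dtype of the first layer-norm weight picks the float16 or float32 kernel instantiation, and anything else is rejected. A hedged sketch of that dispatch (the callables are placeholders for the two template instantiations):

```python
def pick_kernel(weight_dtype, fp16_kernel, fp32_kernel):
    """Sketch of the dtype switch in UnifiedDecodingCUDAForward."""
    kernels = {"float16": fp16_kernel, "float32": fp32_kernel}
    if weight_dtype not in kernels:
        raise TypeError("NOT supported data type. Only float16 and float32 are supported.")
    return kernels[weight_dtype]


run = pick_kernel("float16", fp16_kernel=lambda: "fp16 path", fp32_kernel=lambda: "fp32 path")
print(run())  # fp16 path
```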
*/ -#pragma once - -#include -#include - -// #include "fastertransformer/decoding_beamsearch.h" -// #include "fastertransformer/decoding_sampling.h" -// #include "fastertransformer/open_decoder.h" -// #include "fastertransformer/utils/common.h" -#include "cublas_handle.h" -#include "utils.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - - -std::vector UnifiedDecodingCUDAForward( - const paddle::Tensor& input_ids, - const paddle::Tensor& attn_mask, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& type_id, - const paddle::Tensor& decoder_type_id, - const paddle::Tensor& logits_mask, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& trans_weight, - const paddle::Tensor& trans_bias, - const paddle::Tensor& lm_ln_weight, - const paddle::Tensor& lm_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& type_embedding_weight, - const paddle::Tensor& role_id, - const paddle::Tensor& decoder_role_id, - const paddle::Tensor& role_embedding_table, - const paddle::Tensor& position_ids, - const paddle::Tensor& decoder_position_ids, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - paddle::Tensor& output_scores, - const std::string& decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const int64_t max_len, - const float beam_search_diversity_rate, - const int unk_id, - const int mask_id, - const float temperature, - const float len_penalty, - const bool normalize_before, - const bool pos_bias, - const std::string& hidden_act, - const bool early_stopping, - const int min_length, - const int tensor_para_size, - const int layer_para_size, - const int layer_para_batch_size); diff --git a/paddlenlp/ops/fast_transformer/src/parallel_utils.cc b/paddlenlp/ops/fast_transformer/src/parallel_utils.cc deleted file mode 100644 index 730c3a398649..000000000000 --- a/paddlenlp/ops/fast_transformer/src/parallel_utils.cc +++ /dev/null @@ -1,148 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "parallel_utils.h" - -static std::mutex mpi_global_mutex; -static std::once_flag once_flag_init_mpi; - -void MPIExit() { - std::unique_lock global_lock(mpi_global_mutex); - MPICHECK(MPI_Finalize()); -} - -void InitMPIOnce() { - // Initialize MPI environment - std::call_once(once_flag_init_mpi, []() { - MPICHECK(MPI_Init(nullptr, nullptr)); - if (std::atexit(MPIExit)) { - throw std::runtime_error("Fail to register the MPI exit handler"); - } - }); -} - -void InitNCCLComm(ncclUniqueId& tensor_para_nccl_uid, - ncclUniqueId& layer_para_nccl_uid, - ncclComm_t& tensor_para_nccl_comm, - ncclComm_t& layer_para_nccl_comm, - int rank, - int tensor_para_size, - int layer_para_size, - int tensor_para_rank, - int layer_para_rank) { - // assume gpu_num = n * k, - // tensor parallelism group size is n - // layer parallelism group size is k - - if (tensor_para_rank == 0) { - // get the uid of each tensor parallelism group - // here, 0, 1, ..., n-1 are in group 0, - // n, ..., 2n - 1 are in group 1. - NCCLCHECK(ncclGetUniqueId(&tensor_para_nccl_uid)); - for (int i = 1; i < tensor_para_size; i++) { - printf("[INFO] rank %d sends tensor_para_nccl_uid to rank %d \n", - rank, - rank + i); - MPICHECK(MPI_Send(&tensor_para_nccl_uid, - sizeof(tensor_para_nccl_uid), - MPI_BYTE, - rank + i, - 0, - MPI_COMM_WORLD)); - } - } else { - MPI_Status status; - printf("[INFO] rank %d receives tensor_para_nccl_uid from rank %d \n", - rank, - rank - tensor_para_rank); - MPICHECK(MPI_Recv(&tensor_para_nccl_uid, - sizeof(tensor_para_nccl_uid), - MPI_BYTE, - rank - tensor_para_rank, - 0, - MPI_COMM_WORLD, - &status)); - } - - if (layer_para_rank == 0) { - // get the uid of each layer parallelism group - // 0, k, 2k, are in group 0 - // 1, k+1, 2k+1 are in group 1 - NCCLCHECK(ncclGetUniqueId(&layer_para_nccl_uid)); - for (int i = 1; i < layer_para_size; i++) { - printf("[INFO] rank %d sends layer_para_nccl_uid to rank %d \n", - rank, - rank + i * tensor_para_size); - MPICHECK(MPI_Send(&layer_para_nccl_uid, - sizeof(layer_para_nccl_uid), - MPI_BYTE, - rank + i * tensor_para_size, - 0, - MPI_COMM_WORLD)); - } - } else { - MPI_Status status; - printf("[INFO] rank %d receives layer_para_nccl_uid from rank %d \n", - rank, - rank % tensor_para_size); - MPICHECK(MPI_Recv(&layer_para_nccl_uid, - sizeof(layer_para_nccl_uid), - MPI_BYTE, - rank % tensor_para_size, - 0, - MPI_COMM_WORLD, - &status)); - } - - NCCLCHECK(ncclCommInitRank(&tensor_para_nccl_comm, - tensor_para_size, - tensor_para_nccl_uid, - tensor_para_rank)); - NCCLCHECK(ncclCommInitRank(&layer_para_nccl_comm, - layer_para_size, - layer_para_nccl_uid, - layer_para_rank)); -} - -// Make model parallel settings init only once for one model by using a global -// dict mapping parameters representing different models to corresponding -// settings. Note: `paddle::Tensor` for custom_op is re-created every step and -// we use pointers as keys. Maybe using weakref as keys is better. 
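The `InitNCCLComm` routine removed above assumes the rank layout described in its comments: with `gpu_num = n * k`, ranks `0..n-1` form tensor-parallel group 0, `n..2n-1` group 1, and ranks sharing the same `rank % n` form a layer-parallel (pipeline) group; the group leader broadcasts the NCCL unique id to the other members over MPI. A small sketch of that layout and of the per-rank coordinates (illustrative names only):

```python
def parallel_groups(world_size, tensor_para_size):
    """Rank layout assumed by InitNCCLComm: contiguous tensor-parallel groups,
    strided layer-parallel groups."""
    assert world_size % tensor_para_size == 0
    layer_para_size = world_size // tensor_para_size
    tensor_groups = [list(range(g * tensor_para_size, (g + 1) * tensor_para_size))
                     for g in range(layer_para_size)]
    layer_groups = [list(range(r, world_size, tensor_para_size))
                    for r in range(tensor_para_size)]
    return tensor_groups, layer_groups


def group_ranks(rank, tensor_para_size):
    """Per-rank coordinates used when exchanging the NCCL unique ids."""
    return rank % tensor_para_size, rank // tensor_para_size  # (tensor, layer)


tg, lg = parallel_groups(world_size=8, tensor_para_size=4)
print(tg)                 # [[0, 1, 2, 3], [4, 5, 6, 7]]
print(lg)                 # [[0, 4], [1, 5], [2, 6], [3, 7]]
print(group_ranks(6, 4))  # (2, 1)
```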
-static std::unordered_map> - model_para_infos; - -ModelParaDesc* ModelParaDescFactory::CreateModelParaDesc( - int head_num, - int size_per_head, - int layer_num, - int tensor_para_size, - int layer_para_size, - int layer_para_batch_size, - void* param_ptr = nullptr) { - InitMPIOnce(); - auto it = model_para_infos.find(param_ptr); - if (it != model_para_infos.end()) { - return it->second.get(); - } else { - model_para_infos.emplace(param_ptr, - std::unique_ptr( - new ModelParaDesc(head_num, - size_per_head, - layer_num, - tensor_para_size, - layer_para_size, - layer_para_batch_size))); - return model_para_infos[param_ptr].get(); - } -} diff --git a/paddlenlp/ops/fast_transformer/src/parallel_utils.h b/paddlenlp/ops/fast_transformer/src/parallel_utils.h deleted file mode 100644 index 461e79702d18..000000000000 --- a/paddlenlp/ops/fast_transformer/src/parallel_utils.h +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include - -#include "fastertransformer/utils/nccl_utils.h" - - -void MPIExit(); - -void InitMPIOnce(); - -void InitNCCLComm(ncclUniqueId& tensor_para_nccl_uid, - ncclUniqueId& layer_para_nccl_uid, - ncclComm_t& tensor_para_nccl_comm, - ncclComm_t& layer_para_nccl_comm, - int rank, - int tensor_para_size, - int layer_para_size, - int tensor_para_rank, - int layer_para_rank); - -struct ModelParaDesc { - TensorParallelParam tensor_parallel_param; - LayerParallelParam layer_parallel_param; - ncclComm_t tensor_para_nccl_comm, layer_para_nccl_comm; - std::mt19937_64 gen; - std::uniform_int_distribution<> dist{0, std::numeric_limits::max()}; - - ModelParaDesc(int head_num, - int size_per_head, - int layer_num, - int tensor_para_size, - int layer_para_size, - int layer_para_batch_size) { - int rank; - MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &rank)); - const int local_head_num = head_num / tensor_para_size; - const int local_hidden_units = local_head_num * size_per_head; - const int layers_per_group = layer_num / layer_para_size; - assert(layer_num % layer_para_size == 0); - const int tensor_para_rank = rank % tensor_para_size; - const int layer_para_rank = rank / tensor_para_size; - ncclUniqueId tensor_para_nccl_uid, layer_para_nccl_uid; - InitNCCLComm(tensor_para_nccl_uid, - layer_para_nccl_uid, - tensor_para_nccl_comm, - layer_para_nccl_comm, - rank, - tensor_para_size, - layer_para_size, - tensor_para_rank, - layer_para_rank); - tensor_parallel_param.rank = tensor_para_rank; - tensor_parallel_param.world_size = tensor_para_size; - tensor_parallel_param.local_head_num_ = local_head_num; - tensor_parallel_param.local_hidden_units_ = local_hidden_units; - tensor_parallel_param.nccl_comm = tensor_para_nccl_comm; - layer_parallel_param.rank = layer_para_rank; - layer_parallel_param.world_size = layer_para_size; - layer_parallel_param.layers_per_group = layers_per_group; - layer_parallel_param.local_batch_size = layer_para_batch_size; - 
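The factory removed above caches the expensive MPI/NCCL setup per model, keyed by the raw pointer of one of its parameters, so repeated forward calls reuse the same descriptor. A hedged Python sketch of the same create-once pattern (`make_desc` stands in for the ModelParaDesc constructor):

```python
_model_para_infos = {}  # keyed by the word-embedding parameter's identity


def create_model_para_desc(param, make_desc):
    """Sketch of ModelParaDescFactory::CreateModelParaDesc: set up MPI/NCCL
    once per model and hand back the cached descriptor afterwards."""
    key = id(param)  # the C++ code uses the parameter's data pointer as the key
    if key not in _model_para_infos:
        _model_para_infos[key] = make_desc()
    return _model_para_infos[key]


word_emb = object()
desc_a = create_model_para_desc(word_emb, make_desc=lambda: {"nccl": "initialized"})
desc_b = create_model_para_desc(word_emb, make_desc=lambda: {"nccl": "would re-init"})
print(desc_a is desc_b)  # True: the second call reuses the cached setup
```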
layer_parallel_param.nccl_comm = layer_para_nccl_comm; - // fix the seed to prevent the seed of different gpu are differnet in Tensor - // Parallel - size_t meta_seed = - *(reinterpret_cast(tensor_para_nccl_uid.internal)); - gen = std::mt19937_64(meta_seed); - } - - ~ModelParaDesc() { - if (tensor_para_nccl_comm) ncclCommDestroy(tensor_para_nccl_comm); - if (layer_para_nccl_comm) ncclCommDestroy(layer_para_nccl_comm); - } -}; - -struct ModelParaDescFactory { - static ModelParaDesc* CreateModelParaDesc(int head_num, - int size_per_head, - int layer_num, - int tensor_para_size, - int layer_para_size, - int layer_para_batch_size, - void* param_ptr); -}; diff --git a/paddlenlp/ops/fast_transformer/src/pd_traits.h b/paddlenlp/ops/fast_transformer/src/pd_traits.h deleted file mode 100644 index 0a7a1e26dd90..000000000000 --- a/paddlenlp/ops/fast_transformer/src/pd_traits.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include "fastertransformer/utils/common.h" - -using namespace fastertransformer; - -template -class PDTraits; - -template <> -class PDTraits { -public: - typedef float DataType; - typedef float data_t; - static const OperationType OpType = OperationType::FP32; -}; - -template <> -class PDTraits { -public: - typedef half DataType; - typedef paddle::float16 data_t; - static const OperationType OpType = OperationType::FP16; -}; diff --git a/paddlenlp/ops/fast_transformer/src/utils.cc b/paddlenlp/ops/fast_transformer/src/utils.cc deleted file mode 100644 index fe9652422806..000000000000 --- a/paddlenlp/ops/fast_transformer/src/utils.cc +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "utils.h" - - -const int64_t numel(const std::vector& tensor_shape) { - int size = tensor_shape.size(); - int64_t n = 1; - for (int i = 0; i < size; ++i) { - n *= tensor_shape[i]; - } - return n; -} diff --git a/paddlenlp/ops/fast_transformer/src/utils.h b/paddlenlp/ops/fast_transformer/src/utils.h deleted file mode 100644 index b4731958ab7e..000000000000 --- a/paddlenlp/ops/fast_transformer/src/utils.h +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
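Two tiny pieces from the removed support files above, restated as a hedged Python sketch: the float32/float16 trait mapping that `pd_traits.h` encodes (each Paddle dtype fixes the device compute type and the FasterTransformer OperationType), and the `numel()` element-count helper from `utils.cc`:

```python
import math

# Sketch of the dtype traits: Paddle dtype -> (device compute type, OperationType).
PD_TRAITS = {
    "float32": ("float", "FP32"),
    "float16": ("half", "FP16"),
}


def numel(tensor_shape):
    """Python equivalent of the numel() helper in utils.cc."""
    return math.prod(tensor_shape)  # 1 for an empty shape, matching the C++ loop


print(PD_TRAITS["float16"])         # ('half', 'FP16')
print(numel([3, 4, 5]), numel([]))  # 60 1
```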
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - - -const int64_t numel(const std::vector& tensor_shape); diff --git a/paddlenlp/ops/fast_transformer/transformer/__init__.py b/paddlenlp/ops/fast_transformer/transformer/__init__.py deleted file mode 100644 index 185a92b8d94d..000000000000 --- a/paddlenlp/ops/fast_transformer/transformer/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/paddlenlp/ops/fast_transformer/transformer/decoder.py b/paddlenlp/ops/fast_transformer/transformer/decoder.py deleted file mode 100644 index 82b0f2339aec..000000000000 --- a/paddlenlp/ops/fast_transformer/transformer/decoder.py +++ /dev/null @@ -1,586 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import os - -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from paddlenlp.ops import transfer_param -from paddlenlp.ops.ext_utils import LOADED_EXT, load -from paddlenlp.transformers import ( - PositionalEmbedding, - WordEmbedding, - position_encoding_init, -) -from paddlenlp.utils.log import logger - -from .decoding import run_custom - - -def infer_transformer_decoder( - from_tensor, - memory_tensor, - mem_seq_len, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - old_self_cache_key, - old_self_cache_value, - old_mem_cache, - step, - n_head, - size_per_head, - memory_hidden_dim, - is_fuse_qkv=False, -): - inputs_names = [ - "FromTensor", - "MemoryTensor", - "MemSeqLen", - "SelfLayernormWeight", - "SelfLayernormBias", - "SelfQueryWeight", - "SelfQueryBias", - "SelfKeyWeight", - "SelfKeyBias", - "SelfValueWeight", - "SelfValueBias", - "SelfOutWeight", - "SelfOutBias", - "CrossLayernormWeight", - "CrossLayernormBias", - "CrossQueryWeight", - "CrossQueryBias", - "CrossKeyWeight", - "CrossKeyBias", - "CrossValueWeight", - "CrossValueBias", - "CrossOutWeight", - "CrossOutBias", - "FFNLayernormWeight", - "FFNLayernormBias", - "FFNInterWeight", - "FFNInterBias", - "FFNOutWeight", - "FFNOutBias", - "OldSelfCacheKey", - "OldSelfCacheValue", - ] - - inputs_var = [ - from_tensor, - memory_tensor, - mem_seq_len, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - old_self_cache_key, - old_self_cache_value, - old_mem_cache, - ] - - attrs_names = ["step", "n_head", "size_per_head", "memory_hidden_dim", "is_fuse_qkv"] - - attrs_val = [step, n_head, size_per_head, memory_hidden_dim, is_fuse_qkv] - - outputs_names = ["DecoderOutput", "NewSelfCacheKey", "NewSelfCacheValue", "NewMemCache"] - - outputs_dtype = [memory_tensor.dtype] * len(outputs_names) - - return run_custom("fusion_decoder", inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype) - - -def get_op_cache_config(use_batch_major_op_cache, size_per_head, is_fp16): - x = 8 if is_fp16 else 4 - use_batch_major_op_cache = True if use_batch_major_op_cache is True and size_per_head % x == 0 else False - x = x if use_batch_major_op_cache else 1 - return use_batch_major_op_cache, x - - -class InferTransformerDecoder(nn.Layer): - """ - FasterTransformer decoder block. - - Args: - decoder (`TransformerDecoder`): - Transformer decoder block. - n_head (`int`): - The number of head used in multi-head attention. - size_per_head (`int`): - The size of per head used in multi-head attention. - decoder_lib (`str`): - The path to decoder_lib. Default to None. - use_fp16_decoder (`bool`): - Whether to use fp16 for decoder. Default to False. 
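The `get_op_cache_config` helper removed above decides whether the batch-major KV cache layout can be used: it requires `size_per_head` to be divisible by 8 in fp16 or 4 in fp32 (presumably the number of elements in a 16-byte vector), and otherwise falls back with `x = 1`. A quick standalone check of that rule, restating the function:

```python
def get_op_cache_config(use_batch_major_op_cache, size_per_head, is_fp16):
    # Restated from the removed decoder.py for illustration.
    x = 8 if is_fp16 else 4
    use_batch_major_op_cache = use_batch_major_op_cache and size_per_head % x == 0
    x = x if use_batch_major_op_cache else 1
    return use_batch_major_op_cache, x


for is_fp16, size_per_head in [(True, 64), (True, 60), (False, 64)]:
    print(is_fp16, size_per_head, "->", get_op_cache_config(True, size_per_head, is_fp16))
# True 64 -> (True, 8)
# True 60 -> (False, 1)
# False 64 -> (True, 4)
```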
- """ - - def __init__( - self, decoder, n_head, size_per_head, decoder_lib=None, use_fp16_decoder=False, use_batch_major_op_cache=False - ): - - if decoder_lib is not None and os.path.isfile(decoder_lib): - # Maybe it has been loadad by `ext_utils.load` - if "FastGeneration" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(decoder_lib) - LOADED_EXT["FastGeneration"] = ops - else: - if decoder_lib is not None: - logger.warning("The specified decoder_lib does not exist, and it will be built automatically.") - load("FastGeneration", verbose=True) - - super(InferTransformerDecoder, self).__init__() - self.n_head = n_head - self.size_per_head = size_per_head - self.use_batch_major_op_cache = use_batch_major_op_cache - - if use_fp16_decoder: - for idx, mod in enumerate(decoder.layers): - mod.norm1.weight = transfer_param(mod.norm1.weight) - mod.norm1.bias = transfer_param(mod.norm1.bias, is_bias=True) - mod.self_attn.q_proj.weight = transfer_param(mod.self_attn.q_proj.weight) - mod.self_attn.q_proj.bias = transfer_param(mod.self_attn.q_proj.bias, is_bias=True) - mod.self_attn.k_proj.weight = transfer_param(mod.self_attn.k_proj.weight) - mod.self_attn.k_proj.bias = transfer_param(mod.self_attn.k_proj.bias, is_bias=True) - mod.self_attn.v_proj.weight = transfer_param(mod.self_attn.v_proj.weight) - mod.self_attn.v_proj.bias = transfer_param(mod.self_attn.v_proj.bias, is_bias=True) - mod.self_attn.out_proj.weight = transfer_param(mod.self_attn.out_proj.weight) - mod.self_attn.out_proj.bias = transfer_param(mod.self_attn.out_proj.bias, is_bias=True) - - mod.norm2.weight = transfer_param(mod.norm2.weight) - mod.norm2.bias = transfer_param(mod.norm2.bias, is_bias=True) - mod.cross_attn.q_proj.weight = transfer_param(mod.cross_attn.q_proj.weight) - mod.cross_attn.q_proj.bias = transfer_param(mod.cross_attn.q_proj.bias, is_bias=True) - mod.cross_attn.k_proj.weight = transfer_param(mod.cross_attn.k_proj.weight) - mod.cross_attn.k_proj.bias = transfer_param(mod.cross_attn.k_proj.bias, is_bias=True) - mod.cross_attn.v_proj.weight = transfer_param(mod.cross_attn.v_proj.weight) - mod.cross_attn.v_proj.bias = transfer_param(mod.cross_attn.v_proj.bias, is_bias=True) - mod.cross_attn.out_proj.weight = transfer_param(mod.cross_attn.out_proj.weight) - mod.cross_attn.out_proj.bias = transfer_param(mod.cross_attn.out_proj.bias, is_bias=True) - - mod.norm3.weight = transfer_param(mod.norm3.weight) - mod.norm3.bias = transfer_param(mod.norm3.bias, is_bias=True) - mod.linear1.weight = transfer_param(mod.linear1.weight) - mod.linear1.bias = transfer_param(mod.linear1.bias, is_bias=True) - mod.linear2.weight = transfer_param(mod.linear2.weight) - mod.linear2.bias = transfer_param(mod.linear2.bias, is_bias=True) - - self.weights = [] - for idx, mod in enumerate(decoder.layers): - layer_weight = [] - layer_weight.append(mod.norm1.weight) - layer_weight.append(mod.norm1.bias) - layer_weight.append(mod.self_attn.q_proj.weight) - layer_weight.append(mod.self_attn.q_proj.bias) - layer_weight.append(mod.self_attn.k_proj.weight) - layer_weight.append(mod.self_attn.k_proj.bias) - layer_weight.append(mod.self_attn.v_proj.weight) - layer_weight.append(mod.self_attn.v_proj.bias) - layer_weight.append(mod.self_attn.out_proj.weight) - layer_weight.append(mod.self_attn.out_proj.bias) - layer_weight.append(mod.norm2.weight) - layer_weight.append(mod.norm2.bias) - layer_weight.append(mod.cross_attn.q_proj.weight) - layer_weight.append(mod.cross_attn.q_proj.bias) - 
layer_weight.append(mod.cross_attn.k_proj.weight) - layer_weight.append(mod.cross_attn.k_proj.bias) - layer_weight.append(mod.cross_attn.v_proj.weight) - layer_weight.append(mod.cross_attn.v_proj.bias) - layer_weight.append(mod.cross_attn.out_proj.weight) - layer_weight.append(mod.cross_attn.out_proj.bias) - layer_weight.append(mod.norm3.weight) - layer_weight.append(mod.norm3.bias) - layer_weight.append(mod.linear1.weight) - layer_weight.append(mod.linear1.bias) - layer_weight.append(mod.linear2.weight) - layer_weight.append(mod.linear2.bias) - self.weights.append(layer_weight) - - def forward( - self, - from_tensor, - memory_tensor, - mem_seq_len, - self_cache_key, - self_cache_value, - mem_cache, - step, - memory_hidden_dim, - is_fuse_qkv, - ): - decoder_output = from_tensor - self_caches_key = [] - self_caches_value = [] - mem_caches = [] - if not self.use_batch_major_op_cache: - self_cache_key = paddle.concat( - [ - self_cache_key, - paddle.zeros( - shape=[len(self.weights), 1, memory_tensor.shape[0], self.n_head * self.size_per_head], - dtype=self_cache_key.dtype, - ), - ], - axis=1, - ) - self_cache_value = paddle.concat( - [ - self_cache_value, - paddle.zeros( - shape=[len(self.weights), 1, memory_tensor.shape[0], self.n_head * self.size_per_head], - dtype=self_cache_value.dtype, - ), - ], - axis=1, - ) - for idx in range(len(self.weights)): - weight = self.weights[idx] - decoder_output, new_self_cache_key, new_self_cache_value, new_mem_cache = infer_transformer_decoder( - from_tensor=decoder_output, - memory_tensor=memory_tensor, - mem_seq_len=mem_seq_len, - self_ln_weight=weight[0], - self_ln_bias=weight[1], - self_q_weight=weight[2], - self_q_bias=weight[3], - self_k_weight=weight[4], - self_k_bias=weight[5], - self_v_weight=weight[6], - self_v_bias=weight[7], - self_out_weight=weight[8], - self_out_bias=weight[9], - cross_ln_weight=weight[10], - cross_ln_bias=weight[11], - cross_q_weight=weight[12], - cross_q_bias=weight[13], - cross_k_weight=weight[14], - cross_k_bias=weight[15], - cross_v_weight=weight[16], - cross_v_bias=weight[17], - cross_out_weight=weight[18], - cross_out_bias=weight[19], - ffn_ln_weight=weight[20], - ffn_ln_bias=weight[21], - ffn_inter_weight=weight[22], - ffn_inter_bias=weight[23], - ffn_out_weight=weight[24], - ffn_out_bias=weight[25], - old_self_cache_key=self_cache_key[idx], - old_self_cache_value=self_cache_value[idx], - old_mem_cache=mem_cache[idx], - step=step, - n_head=self.n_head, - size_per_head=self.size_per_head, - memory_hidden_dim=memory_hidden_dim, - is_fuse_qkv=is_fuse_qkv, - ) - self_caches_key.append(new_self_cache_key) - self_caches_value.append(new_self_cache_value) - mem_caches.append(new_mem_cache) - - self_cache_key = paddle.stack(self_caches_key, axis=0) - self_cache_value = paddle.stack(self_caches_value, axis=0) - mem_cache = paddle.stack(mem_caches, axis=0) - return decoder_output, self_cache_key, self_cache_value, mem_cache - - -class FasterDecoder(nn.Layer): - """ - FasterTransformer decoder for auto-regressive generation. - - Args: - src_vocab_size (`int`): - The size of source vocabulary. - trg_vocab_size (`int`): - The size of target vocabulary. - max_length (`int`): - The maximum length of input sequences. - num_encoder_layers (`int`): - The number of sub-layers to be stacked in the encoder. - num_decoder_layers (`int`): - The number of sub-layers to be stacked in the decoder. - n_head (`int`): - The number of head used in multi-head attention. 
- d_model (`int`): - The dimension for word embeddings, which is also the last dimension of - the input and output of multi-head attention, position-wise feed-forward - networks, encoder and decoder. - d_inner_hid (`int`): - Size of the hidden layer in position-wise feed-forward networks. - dropout (`float`): - Dropout rates. Used for pre-process, activation and inside attention. - weight_sharing (`bool`): - Whether to use weight sharing. - bos_id (`int`, optional): - The start token id and also is used as padding id. Defaults to 0. - eos_id (`int`, optional): - The end token id. Defaults to 1. - max_out_len (int, optional): - The maximum output length. Defaults to 256. - decoder_lib (`str`): - The path to decoder_lib. Default to None. - use_fp16_decoder (`bool`): - Whether to use fp16 for decoder. Default to False. - """ - - def __init__( - self, - src_vocab_size, - trg_vocab_size, - max_length, - num_encoder_layers, - num_decoder_layers, - n_head, - d_model, - d_inner_hid, - dropout, - weight_sharing, - bos_id=0, - eos_id=1, - max_out_len=256, - decoder_lib=None, - use_fp16_decoder=False, - use_batch_major_op_cache=False, - ): - super().__init__() - self.trg_vocab_size = trg_vocab_size - self.n_head = n_head - self.emb_dim = d_model - self.bos_id = bos_id - self.eos_id = eos_id - self.dropout = dropout - self.max_out_len = max_out_len - self.max_length = max_length - self.use_fp16_decoder = use_fp16_decoder - self.num_decoder_layers = num_decoder_layers - self.d_model = d_model - self.size_per_head = d_model // n_head - self.use_batch_major_op_cache, self.x = get_op_cache_config( - use_batch_major_op_cache, self.size_per_head, use_fp16_decoder - ) - - self.src_word_embedding = WordEmbedding(vocab_size=src_vocab_size, emb_dim=d_model, bos_id=self.bos_id) - # print(self.src_word_embedding.word_embedding.weight) - self.src_pos_embedding = PositionalEmbedding(emb_dim=d_model, max_length=max_length) - if weight_sharing: - assert ( - src_vocab_size == trg_vocab_size - ), "Vocabularies in source and target should be same for weight sharing." 
- self.trg_word_embedding = self.src_word_embedding - self.trg_pos_embedding = self.src_pos_embedding - else: - self.trg_word_embedding = WordEmbedding(vocab_size=trg_vocab_size, emb_dim=d_model, bos_id=self.bos_id) - self.trg_pos_embedding = PositionalEmbedding(emb_dim=d_model, max_length=max_length) - - self.transformer = paddle.nn.Transformer( - d_model=d_model, - nhead=n_head, - num_encoder_layers=num_encoder_layers, - num_decoder_layers=num_decoder_layers, - dim_feedforward=d_inner_hid, - dropout=dropout, - activation="relu", - normalize_before=True, - ) - - self.decoder = InferTransformerDecoder( - decoder=self.transformer.decoder, - n_head=n_head, - size_per_head=self.size_per_head, - decoder_lib=decoder_lib, - use_fp16_decoder=use_fp16_decoder, - use_batch_major_op_cache=self.use_batch_major_op_cache, - ) - - if weight_sharing: - self.linear = lambda x: paddle.matmul( - x=x, y=self.trg_word_embedding.word_embedding.weight, transpose_y=True - ) - else: - self.linear = nn.Linear(in_features=d_model, out_features=trg_vocab_size, bias_attr=False) - - def forward(self, src_word): - src_max_len = src_word.shape[-1] - mem_seq_lens = paddle.sum( - paddle.cast(src_word != self.bos_id, dtype="int32"), axis=-1, keepdim=True, dtype="int32" - ) - - src_slf_attn_bias = ( - paddle.cast(src_word == self.bos_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9 - ) - - src_slf_attn_bias.stop_gradient = True - - src_pos = paddle.cast(src_word != self.bos_id, dtype="int64") * paddle.arange(start=0, end=src_max_len) - - src_emb = self.src_word_embedding(src_word) - - src_pos_emb = self.src_pos_embedding(src_pos) - src_emb = src_emb + src_pos_emb - enc_input = F.dropout(src_emb, p=self.dropout, training=self.training) if self.dropout else src_emb - enc_output = self.transformer.encoder(enc_input, src_mask=src_slf_attn_bias) - - batch_size, _, memory_hidden_dim = enc_output.shape - end_token_tensor = paddle.full(shape=[batch_size, 1], fill_value=self.eos_id, dtype="int64") - - predict_ids = [] - log_probs = paddle.full(shape=[batch_size, 1], fill_value=0, dtype="float32") - trg_word = paddle.full(shape=[batch_size, 1], fill_value=self.bos_id, dtype="int64") - - if self.use_fp16_decoder: - enc_output = paddle.cast(enc_output, "float16") - - # Init cache - if not self.use_batch_major_op_cache: - self_cache_key = paddle.zeros( - shape=[self.num_decoder_layers, 0, batch_size, self.d_model], dtype=enc_output.dtype - ) - self_cache_value = paddle.zeros( - shape=[self.num_decoder_layers, 0, batch_size, self.d_model], dtype=enc_output.dtype - ) - else: - self_cache_key = paddle.zeros( - shape=[ - self.num_decoder_layers, - batch_size, - self.n_head, - self.size_per_head // self.x, - self.max_out_len, - self.x, - ], - dtype=enc_output.dtype, - ) - self_cache_value = paddle.zeros( - shape=[self.num_decoder_layers, batch_size, self.n_head, self.max_out_len, self.size_per_head], - dtype=enc_output.dtype, - ) - mem_cache = paddle.zeros( - shape=[self.num_decoder_layers, 2, batch_size, src_max_len, self.d_model], dtype=enc_output.dtype - ) - for i in range(self.max_out_len): - trg_pos = paddle.full(shape=trg_word.shape, fill_value=i, dtype="int64") - trg_emb = self.trg_word_embedding(trg_word) - trg_pos_emb = self.trg_pos_embedding(trg_pos) - trg_emb = trg_emb + trg_pos_emb - dec_input = F.dropout(trg_emb, p=self.dropout, training=self.training) if self.dropout else trg_emb - - # TODO(gongenlei): do cast in op - if self.use_fp16_decoder: - dec_input = paddle.cast(dec_input, "float16") - dec_output, 
self_cache_key, self_cache_value, mem_cache = self.decoder( - from_tensor=dec_input, - memory_tensor=enc_output, - mem_seq_len=mem_seq_lens, - self_cache_key=self_cache_key, - self_cache_value=self_cache_value, - mem_cache=mem_cache, - step=i, - memory_hidden_dim=memory_hidden_dim, - is_fuse_qkv=False, - ) - - if self.use_fp16_decoder: - dec_output = paddle.cast(dec_output, "float32") - - dec_output = paddle.reshape(dec_output, shape=[-1, dec_output.shape[-1]]) - - logits = self.linear(dec_output) - step_log_probs = paddle.log(F.softmax(logits, axis=-1)) - log_probs = paddle.add(x=step_log_probs, y=log_probs) - scores = log_probs - topk_scores, topk_indices = paddle.topk(x=scores, k=1) - - finished = paddle.equal(topk_indices, end_token_tensor) - trg_word = topk_indices - log_probs = topk_scores - - predict_ids.append(topk_indices) - - # TODO(gongenlei): support static graph - if paddle.all(finished).numpy(): - break - - predict_ids = paddle.stack(predict_ids, axis=0) - finished_seq = paddle.transpose(predict_ids, [1, 2, 0]) - finished_scores = topk_scores - - return finished_seq, finished_scores - - def load(self, init_from_params): - # Load the trained model - assert init_from_params, "Please set init_from_params to load the infer model." - - model_dict = paddle.load(init_from_params, return_numpy=True) - - # To set weight[padding_idx] to 0. - model_dict["trg_word_embedding.word_embedding.weight"][self.bos_id] = [0] * self.d_model - - # To avoid a longer length than training, reset the size of position - # encoding to max_length - model_dict["encoder.pos_encoder.weight"] = position_encoding_init(self.max_length, self.d_model) - model_dict["decoder.pos_encoder.weight"] = position_encoding_init(self.max_length, self.d_model) - - if self.use_fp16_decoder: - for item in self.state_dict(): - if "decoder.layers" in item: - model_dict[item] = np.float16(model_dict[item]) - - self.load_dict(model_dict) diff --git a/paddlenlp/ops/fast_transformer/transformer/decoding.py b/paddlenlp/ops/fast_transformer/transformer/decoding.py deleted file mode 100644 index 28b30faebc2b..000000000000 --- a/paddlenlp/ops/fast_transformer/transformer/decoding.py +++ /dev/null @@ -1,4550 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
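The step loop in `FasterDecoder.forward` above is a greedy (top-1) decode: embed the previously chosen token, run one fused decoder step, take the best next token from the log-softmax, accumulate the score, and stop once every sequence has emitted EOS. A simplified, hedged NumPy sketch of that control flow (`step_logits_fn` stands in for the fused decoder plus output projection; it is not part of the removed code, and the scoring is condensed relative to the original):

```python
import numpy as np


def greedy_decode(step_logits_fn, batch_size, bos_id, eos_id, max_out_len):
    """Top-1 decoding loop, mirroring the shape of FasterDecoder.forward."""
    tokens = np.full((batch_size, 1), bos_id, dtype=np.int64)
    scores = np.zeros((batch_size, 1), dtype=np.float32)
    predict_ids = []
    for step in range(max_out_len):
        logits = step_logits_fn(tokens, step)                               # [batch, vocab]
        log_probs = logits - np.log(np.exp(logits).sum(-1, keepdims=True))  # log-softmax
        tokens = log_probs.argmax(-1)[:, None]                              # top-1, like paddle.topk(k=1)
        scores = scores + np.take_along_axis(log_probs, tokens, axis=-1)
        predict_ids.append(tokens)
        if np.all(tokens == eos_id):                                        # early exit when all finished
            break
    return np.concatenate(predict_ids, axis=-1), scores


def toy_model(tokens, step):
    # Always prefers token 3, then switches to EOS (id 1) from step 2 onwards.
    row = np.array([0.0, 5.0 if step >= 2 else 0.0, 0.0, 3.0], dtype=np.float32)
    return np.tile(row, (tokens.shape[0], 1))


seq, scores = greedy_decode(toy_model, batch_size=2, bos_id=0, eos_id=1, max_out_len=8)
print(seq)  # [[3 3 1]
            #  [3 3 1]]
```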
-import functools -import os -from collections import defaultdict -from functools import partial - -import numpy as np -import paddle -import paddle.nn as nn -from paddle.common_ops_import import LayerHelper -from paddle.framework import core - -import paddlenlp -from paddlenlp.ops.ext_utils import LOADED_EXT, load -from paddlenlp.transformers import OPTForCausalLM -from paddlenlp.transformers.t5.modeling import T5DenseGatedGeluDense, T5DenseReluDense -from paddlenlp.transformers.utils import fn_args_to_dict -from paddlenlp.utils.log import logger - - -def run_custom(op_name, inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype=None): - ret = [] - - if paddle.in_dynamic_mode(): - new_inputs_var = [] - for k, v in zip(inputs_names, inputs_var): - if not k.endswith("@VECTOR") and isinstance(v, (list, tuple)) and len(v) == 1: - new_inputs_var.append(v[0]) - else: - new_inputs_var.append(v) - outs = core.eager._run_custom_op(op_name, *new_inputs_var, *attrs_val) - return outs[0] if len(outs) == 1 else outs - else: - inputs = dict(zip(inputs_names, inputs_var)) - attrs = dict(zip(attrs_names, attrs_val)) - outputs = {} - - helper = LayerHelper(op_name, **locals()) - - for i, name in enumerate(outputs_names): - outputs[name] = helper.create_variable(dtype=outputs_dtype[i]) - ret.append(outputs[name]) - - helper.append_op(type=op_name, inputs=inputs, outputs=outputs, attrs=attrs) - - return ret - - -def infer_transformer_decoding( - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - linear_weight, - linear_bias, - pos_emb, - _decoding_strategy, - _beam_size, - _topk, - _topp, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _diversity_rate, - _rel_len, - _alpha, -): - - inputs_names = [ - "Input", - "MemSeqLen", - "WordEmbedding", - "SelfLayernormWeight@VECTOR", - "SelfLayernormBias@VECTOR", - "SelfQueryWeight@VECTOR", - "SelfQueryBias@VECTOR", - "SelfKeyWeight@VECTOR", - "SelfKeyBias@VECTOR", - "SelfValueWeight@VECTOR", - "SelfValueBias@VECTOR", - "SelfOutWeight@VECTOR", - "SelfOutBias@VECTOR", - "CrossLayernormWeight@VECTOR", - "CrossLayernormBias@VECTOR", - "CrossQueryWeight@VECTOR", - "CrossQueryBias@VECTOR", - "CrossKeyWeight@VECTOR", - "CrossKeyBias@VECTOR", - "CrossValueWeight@VECTOR", - "CrossValueBias@VECTOR", - "CrossOutWeight@VECTOR", - "CrossOutBias@VECTOR", - "FFNLayernormWeight@VECTOR", - "FFNLayernormBias@VECTOR", - "FFNInterWeight@VECTOR", - "FFNInterBias@VECTOR", - "FFNOutWeight@VECTOR", - "FFNOutBias@VECTOR", - "DecoderLayernormWeight", - "DecoderLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb", - ] - - inputs_var = [ - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - 
ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - linear_weight, - linear_bias, - pos_emb, - ] - - attrs_names = [ - "decoding_strategy", - "beam_size", - "topk", - "topp", - "n_head", - "size_per_head", - "num_layer", - "bos_id", - "eos_id", - "max_len", - "beam_search_diversity_rate", - "rel_len", - "alpha", - ] - - attrs_val = [ - _decoding_strategy, - _beam_size, - _topk, - _topp, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _diversity_rate, - _rel_len, - _alpha, - ] - - outputs_names = ["OutputIds", "ParentIds", "SequenceLength"] - - outputs_dtype = ["int32"] * len(outputs_names) - - return run_custom( - "fusion_decoding", inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype - ) - - -def infer_force_decoding( - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - linear_weight, - linear_bias, - pos_emb, - trg_word, - _decoding_strategy, - _beam_size, - _topk, - _topp, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _diversity_rate, - _rel_len, - _alpha, -): - - inputs_names = [ - "Input", - "MemSeqLen", - "WordEmbedding", - "SelfLayernormWeight@VECTOR", - "SelfLayernormBias@VECTOR", - "SelfQueryWeight@VECTOR", - "SelfQueryBias@VECTOR", - "SelfKeyWeight@VECTOR", - "SelfKeyBias@VECTOR", - "SelfValueWeight@VECTOR", - "SelfValueBias@VECTOR", - "SelfOutWeight@VECTOR", - "SelfOutBias@VECTOR", - "CrossLayernormWeight@VECTOR", - "CrossLayernormBias@VECTOR", - "CrossQueryWeight@VECTOR", - "CrossQueryBias@VECTOR", - "CrossKeyWeight@VECTOR", - "CrossKeyBias@VECTOR", - "CrossValueWeight@VECTOR", - "CrossValueBias@VECTOR", - "CrossOutWeight@VECTOR", - "CrossOutBias@VECTOR", - "FFNLayernormWeight@VECTOR", - "FFNLayernormBias@VECTOR", - "FFNInterWeight@VECTOR", - "FFNInterBias@VECTOR", - "FFNOutWeight@VECTOR", - "FFNOutBias@VECTOR", - "DecoderLayernormWeight", - "DecoderLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb", - # The input of custom op must be given. - # Dispensable() and Intermediate() are not supported. - "TrgWord", - ] - - inputs_var = [ - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - linear_weight, - linear_bias, - pos_emb, - # The input of custom op must be given. - # Dispensable() and Intermediate() are not supported. 
- trg_word, - ] - - attrs_names = [ - "decoding_strategy", - "beam_size", - "topk", - "topp", - "n_head", - "size_per_head", - "num_layer", - "bos_id", - "eos_id", - "max_len", - "beam_search_diversity_rate", - "rel_len", - "alpha", - ] - - attrs_val = [ - _decoding_strategy, - _beam_size, - _topk, - _topp, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _diversity_rate, - _rel_len, - _alpha, - ] - - outputs_names = ["OutputIds", "ParentIds", "SequenceLength"] - - outputs_dtype = ["int32"] * len(outputs_names) - - return run_custom( - "fusion_force_decoding", inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype - ) - - -def infer_opt_decoding( - input, - attn_mask, - mem_seq_len, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - pos_emb, - linear_weight, - normalize_before, - topk, - topp, - max_out_len, - head_num, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - use_fp16_decoding, -): - helper = LayerHelper("fusion_opt", **locals()) - - inputs = { - "Input": input, - "AttentionMask": attn_mask, - "StartLength": mem_seq_len, - "WordEmbedding": word_emb, - "SelfLayernormWeight@VECTOR": slf_ln_weight, - "SelfLayernormBias@VECTOR": slf_ln_bias, - "SelfQueryWeight@VECTOR": slf_q_weight, - "SelfQueryBias@VECTOR": slf_q_bias, - "SelfKeyWeight@VECTOR": slf_k_weight, - "SelfKeyBias@VECTOR": slf_k_bias, - "SelfValueWeight@VECTOR": slf_v_weight, - "SelfValueBias@VECTOR": slf_v_bias, - "SelfOutWeight@VECTOR": slf_out_weight, - "SelfOutBias@VECTOR": slf_out_bias, - "FFNLayernormWeight@VECTOR": ffn_ln_weight, - "FFNLayernormBias@VECTOR": ffn_ln_bias, - "FFNInterWeight@VECTOR": ffn_inter_weight, - "FFNInterBias@VECTOR": ffn_inter_bias, - "FFNOutWeight@VECTOR": ffn_out_weight, - "FFNOutBias@VECTOR": ffn_out_bias, - "DecoderLayernormWeight": decoder_ln_weight, - "DecoderLayernormBias": decoder_ln_bias, - "PositionEncEmb": pos_emb, - "EmbWeight": linear_weight, - } - tensor_para_size = get_ft_para_conf().tensor_para_size - layer_para_size = get_ft_para_conf().layer_para_size - layer_para_batch_size = get_ft_para_conf().layer_para_batch_size - attrs = { - "normalize_before": normalize_before, - "topk": topk, - "topp": topp, - "max_len": max_out_len, - "n_head": head_num, - "size_per_head": size_per_head, - "num_layer": num_layer, - "bos_id": bos_id, - "eos_id": eos_id, - "temperature": temperature, - "use_fp16": use_fp16_decoding, - "tensor_para_size": tensor_para_size, - "layer_para_size": layer_para_size, - "layer_para_batch_size": layer_para_batch_size, - } - - output_ids = helper.create_variable(dtype="int32") - outputs = {"OutputIds": output_ids} - - helper.append_op(type="fusion_opt", inputs=inputs, outputs=outputs, attrs=attrs) - - return output_ids - - -def infer_gpt_decoding( - input, - attn_mask, - mem_seq_len, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - pos_emb, - linear_weight, - topk, - topp, - max_out_len, - head_num, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - 
use_fp16_decoding, -): - - tensor_para_size = get_ft_para_conf().tensor_para_size - layer_para_size = get_ft_para_conf().layer_para_size - layer_para_batch_size = get_ft_para_conf().layer_para_batch_size - - inputs_names = [ - "Input", - "AttentionMask", - "StartLength", - "WordEmbedding", - "SelfLayernormWeight@VECTOR", - "SelfLayernormBias@VECTOR", - "SelfQueryWeight@VECTOR", - "SelfQueryBias@VECTOR", - "SelfKeyWeight@VECTOR", - "SelfKeyBias@VECTOR", - "SelfValueWeight@VECTOR", - "SelfValueBias@VECTOR", - "SelfOutWeight@VECTOR", - "SelfOutBias@VECTOR", - "FFNLayernormWeight@VECTOR", - "FFNLayernormBias@VECTOR", - "FFNInterWeight@VECTOR", - "FFNInterBias@VECTOR", - "FFNOutWeight@VECTOR", - "FFNOutBias@VECTOR", - "DecoderLayernormWeight", - "DecoderLayernormBias", - "PositionEncEmb", - "EmbWeight", - ] - - inputs_var = [ - input, - attn_mask, - mem_seq_len, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - pos_emb, - linear_weight, - ] - - attrs_names = [ - "topk", - "topp", - "max_len", - "n_head", - "size_per_head", - "num_layer", - "bos_id", - "eos_id", - "temperature", - "use_fp16", - "tensor_para_size", - "layer_para_size", - "layer_para_batch_size", - ] - - attrs_val = [ - topk, - topp, - max_out_len, - head_num, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - use_fp16_decoding, - tensor_para_size, - layer_para_size, - layer_para_batch_size, - ] - - outputs_names = ["OutputIds"] - - outputs_dtype = ["int32"] - - return run_custom("fusion_gpt", inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype) - - -def infer_unified_decoding( - input_ids, - attn_mask, - memory_seq_lens, - type_id, - decoder_type_id, - logits_mask, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - trans_weight, - trans_bias, - lm_ln_weight, - lm_ln_bias, - linear_weight, - linear_bias, - pos_emb, - type_emb, - role_id, - decoder_role_id, - role_emb, - position_id, - decoder_position_id, - _decoding_strategy, - _beam_size, - _topk, - _topp, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _diversity_rate, - _unk_id, - _mask_id, - _temperature, - _len_penalty, - _normalize_before, - _pos_bias, - _hidden_act, - _rel_len, - _early_stopping, - _min_length, -): - - tensor_para_size = get_ft_para_conf().tensor_para_size - layer_para_size = get_ft_para_conf().layer_para_size - layer_para_batch_size = get_ft_para_conf().layer_para_batch_size - - inputs_names = [ - "InputIds", - "AttnMask", - "MemSeqLen", - "TypeIds", - "DecTypeIds", - "LogitsMask", - "WordEmbedding", - "SelfLayernormWeight@VECTOR", - "SelfLayernormBias@VECTOR", - "SelfQueryWeight@VECTOR", - "SelfQueryBias@VECTOR", - "SelfKeyWeight@VECTOR", - "SelfKeyBias@VECTOR", - "SelfValueWeight@VECTOR", - "SelfValueBias@VECTOR", - "SelfOutWeight@VECTOR", - "SelfOutBias@VECTOR", - "FFNLayernormWeight@VECTOR", - "FFNLayernormBias@VECTOR", - "FFNInterWeight@VECTOR", - "FFNInterBias@VECTOR", - "FFNOutWeight@VECTOR", - "FFNOutBias@VECTOR", - "DecoderLayernormWeight", - 
"DecoderLayernormBias", - "TransWeight", - "TransBias", - "LMLayernormWeight", - "LMLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb", - "TypeEmb", - "RoleIds", - "DecRoleIds", - "RoleEmbedding", - "PositionIds", - "DecPositionIds", - ] - - inputs_var = [ - input_ids, - attn_mask, - memory_seq_lens, - type_id, - decoder_type_id, - logits_mask, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - trans_weight, - trans_bias, - lm_ln_weight, - lm_ln_bias, - linear_weight, - linear_bias, - pos_emb, - type_emb, - role_id, - decoder_role_id, - role_emb, - position_id, - decoder_position_id, - ] - - attrs_names = [ - "decoding_strategy", - "beam_size", - "topk", - "topp", - "n_head", - "size_per_head", - "num_layer", - "bos_id", - "eos_id", - "max_len", - "beam_search_diversity_rate", - "unk_id", - "mask_id", - "temperature", - "len_penalty", - "normalize_before", - "pos_bias", - "hidden_act", - "rel_len", - "early_stopping", - "min_length", - "tensor_para_size", - "layer_para_size", - "layer_para_batch_size", - ] - - attrs_val = [ - _decoding_strategy, - _beam_size, - _topk, - float(_topp), - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _diversity_rate, - _unk_id, - _mask_id, - _temperature, - _len_penalty, - _normalize_before, - _pos_bias, - _hidden_act, - _rel_len, - _early_stopping, - _min_length, - tensor_para_size, - layer_para_size, - layer_para_batch_size, - ] - - outputs_names = ["OutputIds", "ParentIds", "SequenceLength", "OutputScores"] - - outputs_dtype = ["int32", "int32", "int32", "float32"] - - return run_custom( - "fusion_unified_decoding", inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype - ) - - -def infer_miro_decoding( - input_ids, - attn_mask, - memory_seq_lens, - type_id, - decoder_type_id, - logits_mask, - word_emb, - pre_decoder_ln_weight, - pre_decoder_ln_bias, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - trans_weight, - trans_bias, - lm_ln_weight, - lm_ln_bias, - linear_weight, - linear_bias, - pos_emb, - type_emb, - role_id, - decoder_role_id, - role_emb, - position_id, - decoder_position_id, - _decoding_strategy, - _beam_size, - _topk, - _topp, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _diversity_rate, - _unk_id, - _mask_id, - _temperature, - _len_penalty, - _normalize_before, - _pos_bias, - _hidden_act, - _rel_len, - _early_stopping, - _min_length, -): - - tensor_para_size = get_ft_para_conf().tensor_para_size - layer_para_size = get_ft_para_conf().layer_para_size - layer_para_batch_size = get_ft_para_conf().layer_para_batch_size - - inputs_names = [ - "InputIds", - "AttnMask", - "MemSeqLen", - "TypeIds", - "DecTypeIds", - "LogitsMask", - "WordEmbedding", - "PreDecoderLayernormWeight", - "PreDecoderLayernormBias", - "SelfLayernormWeight@VECTOR", - "SelfLayernormBias@VECTOR", - "SelfQueryWeight@VECTOR", - "SelfQueryBias@VECTOR", - "SelfKeyWeight@VECTOR", - "SelfKeyBias@VECTOR", - "SelfValueWeight@VECTOR", - "SelfValueBias@VECTOR", - 
"SelfOutWeight@VECTOR", - "SelfOutBias@VECTOR", - "FFNLayernormWeight@VECTOR", - "FFNLayernormBias@VECTOR", - "FFNInterWeight@VECTOR", - "FFNInterBias@VECTOR", - "FFNOutWeight@VECTOR", - "FFNOutBias@VECTOR", - "DecoderLayernormWeight", - "DecoderLayernormBias", - "TransWeight", - "TransBias", - "LMLayernormWeight", - "LMLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb", - "TypeEmb", - "RoleIds", - "DecRoleIds", - "RoleEmbedding", - "PositionIds", - "DecPositionIds", - ] - - inputs_var = [ - input_ids, - attn_mask, - memory_seq_lens, - type_id, - decoder_type_id, - logits_mask, - word_emb, - pre_decoder_ln_weight, - pre_decoder_ln_bias, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - trans_weight, - trans_bias, - lm_ln_weight, - lm_ln_bias, - linear_weight, - linear_bias, - pos_emb, - type_emb, - role_id, - decoder_role_id, - role_emb, - position_id, - decoder_position_id, - ] - - attrs_names = [ - "decoding_strategy", - "beam_size", - "topk", - "topp", - "n_head", - "size_per_head", - "num_layer", - "bos_id", - "eos_id", - "max_len", - "beam_search_diversity_rate", - "unk_id", - "mask_id", - "temperature", - "len_penalty", - "normalize_before", - "pos_bias", - "hidden_act", - "rel_len", - "early_stopping", - "min_length", - "tensor_para_size", - "layer_para_size", - "layer_para_batch_size", - ] - - attrs_val = [ - _decoding_strategy, - _beam_size, - _topk, - float(_topp), - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _diversity_rate, - _unk_id, - _mask_id, - _temperature, - _len_penalty, - _normalize_before, - _pos_bias, - _hidden_act, - _rel_len, - _early_stopping, - _min_length, - tensor_para_size, - layer_para_size, - layer_para_batch_size, - ] - - outputs_names = ["OutputIds", "ParentIds", "SequenceLength", "OutputScores"] - - outputs_dtype = ["int32", "int32", "int32", "float32"] - - return run_custom("fusion_miro", inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype) - - -def infer_bart_decoding( - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - linear_weight, - linear_bias, - pos_emb, - _decoding_strategy, - _beam_size, - _topk, - _topp, - _temperature, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _min_out_len, - _diversity_rate, - _rel_len, - _alpha, - _early_stopping, -): - - inputs_names = [ - "Input", - "MemSeqLen", - "WordEmbedding", - "SelfLayernormWeight@VECTOR", - "SelfLayernormBias@VECTOR", - "SelfQueryWeight@VECTOR", - "SelfQueryBias@VECTOR", - "SelfKeyWeight@VECTOR", - "SelfKeyBias@VECTOR", - "SelfValueWeight@VECTOR", - "SelfValueBias@VECTOR", - "SelfOutWeight@VECTOR", - "SelfOutBias@VECTOR", - "CrossLayernormWeight@VECTOR", - "CrossLayernormBias@VECTOR", - "CrossQueryWeight@VECTOR", - "CrossQueryBias@VECTOR", - "CrossKeyWeight@VECTOR", - 
"CrossKeyBias@VECTOR", - "CrossValueWeight@VECTOR", - "CrossValueBias@VECTOR", - "CrossOutWeight@VECTOR", - "CrossOutBias@VECTOR", - "FFNLayernormWeight@VECTOR", - "FFNLayernormBias@VECTOR", - "FFNInterWeight@VECTOR", - "FFNInterBias@VECTOR", - "FFNOutWeight@VECTOR", - "FFNOutBias@VECTOR", - "DecoderLayernormWeight", - "DecoderLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb", - ] - - inputs_var = [ - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - linear_weight, - linear_bias, - pos_emb, - ] - - attrs_names = [ - "decoding_strategy", - "beam_size", - "topk", - "topp", - "temperature", - "n_head", - "size_per_head", - "num_layer", - "bos_id", - "eos_id", - "max_len", - "min_len", - "beam_search_diversity_rate", - "rel_len", - "alpha", - "early_stopping", - ] - - attrs_val = [ - _decoding_strategy, - _beam_size, - _topk, - _topp, - _temperature, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _min_out_len, - _diversity_rate, - _rel_len, - _alpha, - _early_stopping, - ] - - outputs_names = ["OutputIds", "ParentIds", "SequenceLength"] - - outputs_dtype = ["int32"] * len(outputs_names) - - return run_custom( - "fusion_bart_decoding", inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype - ) - - -def infer_mbart_decoding( - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - mbart_ln_weight, - mbart_ln_bias, - linear_weight, - linear_bias, - pos_emb, - trg_word, - _decoding_strategy, - _beam_size, - _topk, - _topp, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _diversity_rate, - _rel_len, - _alpha, - _temperature, - _early_stopping, - _hidden_act, -): - - inputs_names = [ - "Input", - "MemSeqLen", - "WordEmbedding", - "SelfLayernormWeight@VECTOR", - "SelfLayernormBias@VECTOR", - "SelfQueryWeight@VECTOR", - "SelfQueryBias@VECTOR", - "SelfKeyWeight@VECTOR", - "SelfKeyBias@VECTOR", - "SelfValueWeight@VECTOR", - "SelfValueBias@VECTOR", - "SelfOutWeight@VECTOR", - "SelfOutBias@VECTOR", - "CrossLayernormWeight@VECTOR", - "CrossLayernormBias@VECTOR", - "CrossQueryWeight@VECTOR", - "CrossQueryBias@VECTOR", - "CrossKeyWeight@VECTOR", - "CrossKeyBias@VECTOR", - "CrossValueWeight@VECTOR", - "CrossValueBias@VECTOR", - "CrossOutWeight@VECTOR", - "CrossOutBias@VECTOR", - "FFNLayernormWeight@VECTOR", - "FFNLayernormBias@VECTOR", - "FFNInterWeight@VECTOR", - "FFNInterBias@VECTOR", - "FFNOutWeight@VECTOR", - "FFNOutBias@VECTOR", - "DecoderLayernormWeight", - "DecoderLayernormBias", - "MBARTLayernormWeight", - "MBARTLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb", - # 
The input of custom op must be given. - # Dispensable() and Intermediate() are not supported. - "TrgWord", - ] - - inputs_var = [ - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - mbart_ln_weight, - mbart_ln_bias, - linear_weight, - linear_bias, - pos_emb, - # The input of custom op must be given. - # Dispensable() and Intermediate() are not supported. - trg_word, - ] - - attrs_names = [ - "decoding_strategy", - "beam_size", - "topk", - "topp", - "n_head", - "size_per_head", - "num_layer", - "bos_id", - "eos_id", - "temperature", - "max_len", - "beam_search_diversity_rate", - "rel_len", - "alpha", - "early_stopping", - "hidden_act", - ] - - attrs_val = [ - _decoding_strategy, - _beam_size, - _topk, - _topp, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _temperature, - _max_out_len, - _diversity_rate, - _rel_len, - _alpha, - _early_stopping, - _hidden_act, - ] - - outputs_names = ["OutputIds", "ParentIds", "SequenceLength"] - - outputs_dtype = ["int32"] * len(outputs_names) - - return run_custom( - "fusion_mbart_decoding", inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype - ) - - -def infer_gptj_decoding( - input, - attn_mask, - mem_seq_len, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_out_weight, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - linear_weight, - linear_bias, - topk, - topp, - max_out_len, - head_num, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - rotary_embedding_dim, - repetition_penalty, - min_length, - use_fp16_decoding, -): - tensor_para_size = get_ft_para_conf().tensor_para_size - layer_para_size = get_ft_para_conf().layer_para_size - layer_para_batch_size = get_ft_para_conf().layer_para_batch_size - - inputs = { - "Input": input, - "AttentionMask": attn_mask, - "StartLength": mem_seq_len, - "WordEmbedding": word_emb, - "SelfLayernormWeight@VECTOR": slf_ln_weight, - "SelfLayernormBias@VECTOR": slf_ln_bias, - "SelfQueryWeight@VECTOR": slf_q_weight, - "SelfOutWeight@VECTOR": slf_out_weight, - "FFNInterWeight@VECTOR": ffn_inter_weight, - "FFNInterBias@VECTOR": ffn_inter_bias, - "FFNOutWeight@VECTOR": ffn_out_weight, - "FFNOutBias@VECTOR": ffn_out_bias, - "DecoderLayernormWeight": decoder_ln_weight, - "DecoderLayernormBias": decoder_ln_bias, - "EmbWeight": linear_weight, - "EmbBias": linear_bias, - } - - attrs = { - "topk": topk, - "topp": topp, - "max_len": max_out_len, - "n_head": head_num, - "size_per_head": size_per_head, - "num_layer": num_layer, - "bos_id": bos_id, - "eos_id": eos_id, - "temperature": temperature, - "rotary_embedding_dim": rotary_embedding_dim, - "repetition_penalty": repetition_penalty, - "min_length": min_length, - "use_fp16": use_fp16_decoding, - "tensor_para_size": tensor_para_size, - "layer_para_size": layer_para_size, - "layer_para_batch_size": layer_para_batch_size, - } - - outputs_names = ["OutputIds"] - outputs_dtype = ["int32"] - - return run_custom( - op_name="fusion_gptj", - inputs_names=inputs.keys(), - 
inputs_var=inputs.values(), - attrs_names=attrs.keys(), - attrs_val=attrs.values(), - outputs_names=outputs_names, - outputs_dtype=outputs_dtype, - ) - - -def infer_pegasus_decoding( - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - linear_weight, - linear_bias, - pos_emb, - _decoding_strategy, - _beam_size, - _topk, - _topp, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _min_out_len, - _diversity_rate, - _rel_len, - _alpha, - _temperature, - _early_stopping, - _hidden_act, -): - - inputs_names = [ - "Input", - "MemSeqLen", - "WordEmbedding", - "SelfLayernormWeight@VECTOR", - "SelfLayernormBias@VECTOR", - "SelfQueryWeight@VECTOR", - "SelfQueryBias@VECTOR", - "SelfKeyWeight@VECTOR", - "SelfKeyBias@VECTOR", - "SelfValueWeight@VECTOR", - "SelfValueBias@VECTOR", - "SelfOutWeight@VECTOR", - "SelfOutBias@VECTOR", - "CrossLayernormWeight@VECTOR", - "CrossLayernormBias@VECTOR", - "CrossQueryWeight@VECTOR", - "CrossQueryBias@VECTOR", - "CrossKeyWeight@VECTOR", - "CrossKeyBias@VECTOR", - "CrossValueWeight@VECTOR", - "CrossValueBias@VECTOR", - "CrossOutWeight@VECTOR", - "CrossOutBias@VECTOR", - "FFNLayernormWeight@VECTOR", - "FFNLayernormBias@VECTOR", - "FFNInterWeight@VECTOR", - "FFNInterBias@VECTOR", - "FFNOutWeight@VECTOR", - "FFNOutBias@VECTOR", - "DecoderLayernormWeight", - "DecoderLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb", - # The input of custom op must be given. - # Dispensable() and Intermediate() are not supported. - ] - - inputs_var = [ - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - linear_weight, - linear_bias, - pos_emb, - # The input of custom op must be given. - # Dispensable() and Intermediate() are not supported. 
- ] - - attrs_names = [ - "decoding_strategy", - "beam_size", - "topk", - "topp", - "n_head", - "size_per_head", - "num_layer", - "bos_id", - "eos_id", - "temperature", - "max_len", - "min_len", - "beam_search_diversity_rate", - "rel_len", - "alpha", - "early_stopping", - "hidden_act", - "emb_scale", - ] - - attrs_val = [ - _decoding_strategy, - _beam_size, - _topk, - _topp, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _temperature, - _max_out_len, - _min_out_len, - _diversity_rate, - _rel_len, - _alpha, - _early_stopping, - _hidden_act, - ] - - outputs_names = ["OutputIds", "ParentIds", "SequenceLength"] - - outputs_dtype = ["int32"] * len(outputs_names) - - return run_custom( - "fusion_pegasus_decoding", inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype - ) - - -def infer_t5_decoding( - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight_0, - ffn_inter_bias_0, - ffn_inter_weight_1, - ffn_inter_bias_1, - ffn_out_weight, - ffn_out_bias, - relative_attention_bias_weight, - decoder_ln_weight, - decoder_ln_bias, - linear_weight, - linear_bias, - decoding_strategy, - beam_size, - top_k, - top_p, - head_num, - size_per_head, - num_decoder_layers, - start_id, - end_id, - max_out_len, - diversity_rate, - rel_len, - alpha, - temperature, - early_stopping, - max_distance, - relative_attention_num_buckets, - tie_word_embeddings, - act, -): - - inputs_names = [ - "Input", - "MemSeqLen", - "WordEmbedding", - "SelfLayernormWeight@VECTOR", - "SelfLayernormBias@VECTOR", - "SelfQueryWeight@VECTOR", - "SelfQueryBias@VECTOR", - "SelfKeyWeight@VECTOR", - "SelfKeyBias@VECTOR", - "SelfValueWeight@VECTOR", - "SelfValueBias@VECTOR", - "SelfOutWeight@VECTOR", - "SelfOutBias@VECTOR", - "CrossLayernormWeight@VECTOR", - "CrossLayernormBias@VECTOR", - "CrossQueryWeight@VECTOR", - "CrossQueryBias@VECTOR", - "CrossKeyWeight@VECTOR", - "CrossKeyBias@VECTOR", - "CrossValueWeight@VECTOR", - "CrossValueBias@VECTOR", - "CrossOutWeight@VECTOR", - "CrossOutBias@VECTOR", - "FFNLayernormWeight@VECTOR", - "FFNLayernormBias@VECTOR", - "FFNInterWeight0@VECTOR", - "FFNInterBias0@VECTOR", - "FFNInterWeight1@VECTOR", - "FFNInterBias1@VECTOR", - "FFNOutWeight@VECTOR", - "FFNOutBias@VECTOR", - "SelfRelativeAttentionBiasWeight", - "DecoderLayernormWeight", - "DecoderLayernormBias", - "EmbWeight", - "EmbBias", - ] - - inputs_var = [ - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight_0, - ffn_inter_bias_0, - ffn_inter_weight_1, - ffn_inter_bias_1, - ffn_out_weight, - ffn_out_bias, - relative_attention_bias_weight, - decoder_ln_weight, - decoder_ln_bias, - linear_weight, - linear_bias, - ] - - attrs_names = [ - "decoding_strategy", - "beam_size", - "topk", - "topp", - "n_head", - "size_per_head", - "num_layer", - "bos_id", - "eos_id", - "max_len", - 
"beam_search_diversity_rate", - "rel_len", - "alpha", - "temperature", - "early_stopping", - "max_distance", - "num_buckets", - "tie_word_embeddings", - "act", - ] - - attrs_val = [ - decoding_strategy, - beam_size, - top_k, - top_p, - head_num, - size_per_head, - num_decoder_layers, - start_id, - end_id, - max_out_len, - diversity_rate, - rel_len, - alpha, - temperature, - early_stopping, - max_distance, - relative_attention_num_buckets, - tie_word_embeddings, - act, - ] - - outputs_names = ["OutputIds", "ParentIds", "SequenceLength"] - - outputs_dtype = ["int32"] * len(outputs_names) - - return run_custom( - "fusion_t5_decoding", inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype - ) - - -def finalize( - beam_size, - output_ids, - parent_ids, - out_seq_lens, - forced_eos_token_id=None, - max_seq_len=None, - decoding_strategy="beam_search", -): - if max_seq_len is None: - max_seq_len = paddle.max(out_seq_lens) - ids = paddle.slice(output_ids, [0], [0], [max_seq_len]) - if decoding_strategy.startswith("beam_search"): - parent_ids = paddle.slice(parent_ids, [0], [0], [max_seq_len]) % ( - beam_size * 2 if decoding_strategy.endswith("_v2") or decoding_strategy.endswith("_v3") else beam_size - ) - ids = paddle.nn.functional.gather_tree(ids, parent_ids) - if forced_eos_token_id is not None: - ids[-1, :, :] = forced_eos_token_id - else: - if forced_eos_token_id is not None: - ids[-1, :] = forced_eos_token_id - return ids - - -def transfer_param(p, is_bias=False, dtype="float16", restore_data=False): - param_shape = p.shape - # Allow CPU/GPU and float16/float32 transfer - # NOTE: str(p.place) differs between paddle develop and 2.2 - if str(p.dtype)[-len(dtype) :] == dtype and ("gpu" in str(p.place).lower() or "cuda" in str(p.place).lower()): - return p - if restore_data: - if paddle.in_dynamic_mode(): - param_data = p.numpy() - # Creating parameters with Assign initializer is too slow. Maybe we - # can cast to fp16 directly and get a tensor, while we do it more - # elaborately to get a ParamBase. Also note `VarBase.set_value` - # enforce the same dtype and can not be used directly. 
- new_p = type(p)(shape=param_shape, dtype=dtype, is_bias=is_bias) - new_p.value().get_tensor().set(param_data.astype(dtype), paddle.framework._current_expected_place()) - return new_p - else: - param_data = np.array(paddle.static.global_scope().find_var(p.name).get_tensor()) - return paddle.create_parameter( - shape=param_shape, - dtype=dtype, - is_bias=is_bias, - default_initializer=paddle.nn.initializer.Assign(param_data) if restore_data else None, - ) - - -def _convert_qkv(q_proj, k_proj, v_proj, attr="weight", use_numpy=True, del_param=False, dummy_tensor=None): - ft_para_conf = get_ft_para_conf() - # TODO(guosheng): maybe static graph need this - # p = fast_model.create_parameter( - # shape=[q.shape[0], q.shape[1] + k.shape[1] + v.shape[1]], - # dtype=q.dtype, - # is_bias=is_bias) - q = getattr(q_proj, attr) - k = getattr(k_proj, attr) - v = getattr(v_proj, attr) - if use_numpy: - q = q.numpy() - if del_param: - if attr == "weight": - del q_proj.weight - else: - del q_proj.bias - k = k.numpy() - if del_param: - if attr == "weight": - del k_proj.weight - else: - del k_proj.bias - v = v.numpy() - if del_param: - if attr == "weight": - del v_proj.weight - else: - del v_proj.bias - else: - if del_param: - for i in [q_proj, k_proj, v_proj]: - if attr == "weight": - del i.weight - else: - del i.bias - q = ft_para_conf.slice_weight(q, 1) - k = ft_para_conf.slice_weight(k, 1) - v = ft_para_conf.slice_weight(v, 1) - if del_param: - # NOTE: dygraph_to_static/convert_call_func.py would log the converted - # function. For linear layer, if we delete the params, log would fail. - # And the log requires weight to be a 2D tensor. - # NOTE: Assignment to parameter 'weight' should be of type - # Parameter or None, thus delete before in case of tensor. - setattr(q_proj, attr, dummy_tensor) - setattr(k_proj, attr, dummy_tensor) - setattr(v_proj, attr, dummy_tensor) - if use_numpy: - p = paddle.to_tensor(np.concatenate([q, k, v], axis=-1)) - else: - p = paddle.concat([q, k, v], axis=-1) - return p - - -def convert_params(fast_model, model, fuse_qkv=1, use_fp16=False, restore_data=False): - r""" - Convert parameters included in Transformer layer (`nn.TransformerEncoder` - and `gpt.modeling.TransformerDecoder`) from original models to the format - of faster models. - - Args: - fast_model (Layer): The faster model object. - model (Layer): The Transformer layer. It can be an instance of - `nn.TransformerEncoder` or `gpt.modeling.TransformerDecoder` - currently, and `nn.TransformerDecoder` would be supported soon. - fuse_qkv (int): 0 for nofuse, 1 for fuse, 2 for fuse and delete the - unfused parameters. If environment variable `PPFG_QKV_MEM_OPT` is - set and the weights of q/k/v is fused, it will try to delete the - original unfused weights. Note the rollback to original model would - not be guarantee anymore when the faster model failed if the original - weights are deleted. Default to 1. - use_fp16 (bool): Whether to use float16. Maybe we should use the default - dtype as the highest priority later. Default to `False`. - restore_data (bool): If `False`, need to reload the weight values. It - should be `True` for weight loaded models. Default to `False`. - - Returns: - defaultdict: Each value is a list including converted parameters in all - layers. For other parameters not included in Transformer module to - be converted, such as embeddings, you can achieve it by using the - returned dict `params` though `params['word_emb'].append()` directly - which would do CPU/GPU and fp32/fp16 transfer automatically. 
- """ - if fuse_qkv == 1: - fuse_qkv = 2 if os.getenv("PPFG_QKV_MEM_OPT", "0") == "1" else 1 - ft_para_conf = get_ft_para_conf() - - class _list(list): - def append(self, item): - def attr_handle_func(x): - return x - - if isinstance(item[0], nn.Layer): - # Axis is used for tensor slice in tensor parallel. - # Use None to make no slice on the tensor. - if len(item) == 2: - layer, attr = item - axis = None - else: - layer, attr, axis = item - param = getattr(layer, attr) - if axis is not None and isinstance(layer, nn.Linear): - param = ft_para_conf.slice_weight(param, axis) - param = transfer_param( - param, - is_bias=attr.endswith("bias"), - dtype="float16" if use_fp16 else "float32", - restore_data=restore_data, - ) - # NOTE: Assignment to parameter 'weight' should be of type - # Parameter or None, thus delete first in case of param is - # a tensor. - # TODO(guosheng): Make slice_weight use `output_param=True` - # and remove delattr. Currently, if `param` is Tensor rather - # than Parameter, it would not be in state_dict. - delattr(layer, attr) - setattr(layer, attr, param) - else: - # NOTE: Compared with if branch, there is no layer attribute - # refered to the transfered param, thus we should set it as - # the layer attribute to be able to convert to static graph. - # Additionally, we suppose no need to process tensor parallel - # here since the param passed in might have been processed. - if len(item) == 2: - param, is_bias = item - attr_handle = attr_handle_func - else: - param, is_bias, attr_handle = item - param = transfer_param( - param, is_bias=is_bias, dtype="float16" if use_fp16 else "float32", restore_data=restore_data - ) - attr_handle(param) - return super().append(param) - - params = defaultdict(_list) - - def _convert(module): - if isinstance( - module, - ( - nn.TransformerEncoder, - nn.TransformerDecoder, - paddlenlp.transformers.gpt.modeling.TransformerDecoder, - paddlenlp.transformers.opt.modeling.TransformerDecoder, - ), - ): - num_layer = len(module.layers) - for i, layer in enumerate(module.layers): - if not ft_para_conf.is_load(i, num_layer): - continue - # fuse_qkv: 0 for nofuse, 1 for fuse, - # 2 for fuse and delete the unfused - if fuse_qkv == 0: - params["slf_q_weight"].append((layer.self_attn.q_proj, "weight", 1)) - params["slf_q_bias"].append((layer.self_attn.q_proj, "bias", 1)) - params["slf_k_weight"].append((layer.self_attn.k_proj, "weight", 1)) - params["slf_k_bias"].append((layer.self_attn.k_proj, "bias", 1)) - params["slf_v_weight"].append((layer.self_attn.v_proj, "weight", 1)) - params["slf_v_bias"].append((layer.self_attn.v_proj, "bias", 1)) - - else: - # TODO(guosheng): Tensor with size 0 might be failed in - # paddle develop, thus use tensor with size 1 instead - # temporarily. Besides, we use 2D tensor since jit log - # requires that on linear weight. While size 0 seems all - # right in jit.to_static/jit.save. - dummy_tensor = paddle.zeros([1, 1]) - w = _convert_qkv( - layer.self_attn.q_proj, - layer.self_attn.k_proj, - layer.self_attn.v_proj, - attr="weight", - use_numpy=fuse_qkv == 2, - del_param=fuse_qkv == 2, - dummy_tensor=dummy_tensor, - ) - b = _convert_qkv( - layer.self_attn.q_proj, - layer.self_attn.k_proj, - layer.self_attn.v_proj, - attr="bias", - use_numpy=fuse_qkv == 2, - del_param=fuse_qkv == 2, - dummy_tensor=dummy_tensor, - ) - params["slf_q_weight"].append((w, False)) - params["slf_q_bias"].append((b, True)) - # NOTE: Use `params["slf_q_weight"][-1]` rather than `w`, - # since the appended tensor might be a new transfered tensor. 
- # Besides, to allow convert_params be called more than once, - # we find a attr name not existing to avoid overwriting the - # existing attr. - attr = "slf_q_weight_" + str(i) - while hasattr(fast_model, attr): - attr += "_" - setattr(fast_model, attr, params["slf_q_weight"][-1]) - attr = "slf_q_bias_" + str(i) - while hasattr(fast_model, attr): - attr += "_" - setattr(fast_model, attr, params["slf_q_bias"][-1]) - for key in [f"slf_{m}_{n}" for m in ("k", "v") for n in ("weight", "bias")]: - params[key].append((dummy_tensor, True if key.endswith("bias") else False)) - attr = key + "_" + str(i) - while hasattr(fast_model, attr): - attr += "_" - setattr(fast_model, attr, params[key][-1]) - if hasattr(layer, "cross_attn"): - # nn.TransformerDecoder - params["cross_q_weight"].append((layer.cross_attn.q_proj, "weight", 1)) - params["cross_q_bias"].append((layer.cross_attn.q_proj, "bias", 1)) - params["cross_k_weight"].append((layer.cross_attn.k_proj, "weight", 1)) - params["cross_k_bias"].append((layer.cross_attn.k_proj, "bias", 1)) - params["cross_v_weight"].append((layer.cross_attn.v_proj, "weight", 1)) - params["cross_v_bias"].append((layer.cross_attn.v_proj, "bias", 1)) - params["cross_out_weight"].append((layer.cross_attn.out_proj, "weight", 0)) - params["cross_out_bias"].append((layer.cross_attn.out_proj, "bias", 0)) - - params["slf_out_weight"].append((layer.self_attn.out_proj, "weight", 0)) - params["slf_out_bias"].append((layer.self_attn.out_proj, "bias")) - params["slf_ln_weight"].append((layer.norm1, "weight")) - params["slf_ln_bias"].append((layer.norm1, "bias")) - # Slice tensor when append according to axis(1 or 0) if parallel - # is enable. - params["ffn_inter_weight"].append((layer.linear1, "weight", 1)) - params["ffn_inter_bias"].append((layer.linear1, "bias", 1)) - params["ffn_out_weight"].append((layer.linear2, "weight", 0)) - params["ffn_out_bias"].append((layer.linear2, "bias")) - if hasattr(layer, "norm3"): - # nn.TransformerDecoder - params["cross_ln_weight"].append((layer.norm2, "weight")) - params["cross_ln_bias"].append((layer.norm2, "bias")) - params["ffn_ln_weight"].append((layer.norm3, "weight")) - params["ffn_ln_bias"].append((layer.norm3, "bias")) - else: - params["ffn_ln_weight"].append((layer.norm2, "weight")) - params["ffn_ln_bias"].append((layer.norm2, "bias")) - - if getattr(module, "norm", None) is not None: - params["decoder_ln_weight"].append((module.norm, "weight")) - params["decoder_ln_bias"].append((module.norm, "bias")) - elif isinstance(module, (paddlenlp.transformers.t5.modeling.T5Stack)) and module.is_decoder: - num_layer = len(module.block) - for i, block in enumerate(module.block): - if not ft_para_conf.is_load(i, num_layer): - continue - # fuse_qkv: 0 for nofuse, 1 for fuse, - # 2 for fuse and delete the unfused - if fuse_qkv == 0: - params["slf_q_weight"].append((block.layer[0].SelfAttention.q, "weight", 1)) - if getattr(block.layer[0].SelfAttention.q, "bias", None) is not None: - params["slf_q_bias"].append((block.layer[0].SelfAttention.q, "bias", 1)) - - params["slf_k_weight"].append((block.layer[0].SelfAttention.k, "weight", 1)) - if getattr(block.layer[0].SelfAttention.k, "bias", None) is not None: - params["slf_k_bias"].append((block.layer[0].SelfAttention.k, "bias", 1)) - - params["slf_v_weight"].append((block.layer[0].SelfAttention.v, "weight", 1)) - if getattr(block.layer[0].SelfAttention.v, "bias", None) is not None: - params["slf_k_bias"].append((block.layer[0].SelfAttention.v, "bias", 1)) - - else: - dummy_tensor = paddle.zeros([1, 
1]) - w = _convert_qkv( - block.layer[0].SelfAttention.q, - block.layer[0].SelfAttention.k, - block.layer[0].SelfAttention.v, - attr="weight", - use_numpy=(fuse_qkv == 2), - del_param=(fuse_qkv == 2), - dummy_tensor=dummy_tensor, - ) - params["slf_q_weight"].append((w, False)) - - if ( - getattr(block.layer[0].SelfAttention.q, "bias", None) is not None - and getattr(block.layer[0].SelfAttention.k, "bias", None) is not None - and getattr(block.layer[0].SelfAttention.v, "bias", None) is not None - ): - b = _convert_qkv( - block.layer[0].SelfAttention.q, - block.layer[0].SelfAttention.k, - block.layer[0].SelfAttention.v, - attr="bias", - use_numpy=(fuse_qkv == 2), - del_param=(fuse_qkv == 2), - dummy_tensor=dummy_tensor, - ) - params["slf_q_bias"].append((b, True)) - - # NOTE: Use `params["slf_q_weight"][-1]` rather than `w`, - # since the appended tensor might be a new transfered tensor. - # Besides, to allow convert_params be called more than once, - # we find a attr name not existing to avoid overwriting the - # existing attr. - attr = "slf_q_weight_" + str(i) - while hasattr(fast_model, attr): - attr += "_" - setattr(fast_model, attr, params["slf_q_weight"][-1]) - - param_type = "weight" - if "slf_q_bias" in params.keys(): - attr = "slf_q_bias_" + str(i) - while hasattr(fast_model, attr): - attr += "_" - setattr(fast_model, attr, params["slf_q_bias"][-1]) - param_type.append("bias") - - for key in [f"slf_{m}_{n}" for m in ("k", "v") for n in param_type]: - params[key].append((dummy_tensor, True if key.endswith("bias") else False)) - attr = key + "_" + str(i) - while hasattr(fast_model, attr): - attr += "_" - setattr(fast_model, attr, params[key][-1]) - - ffn_index = 1 - if len(block.layer) == 3: - ffn_index = 2 - - params["cross_q_weight"].append((block.layer[1].EncDecAttention.q, "weight", 1)) - if getattr(block.layer[1].EncDecAttention.q, "bias", None) is not None: - params["cross_q_bias"].append((block.layer[1].EncDecAttention.q, "bias", 1)) - - params["cross_k_weight"].append((block.layer[1].EncDecAttention.k, "weight", 1)) - if getattr(block.layer[1].EncDecAttention.k, "bias", None) is not None: - params["cross_k_bias"].append((block.layer[1].EncDecAttention.k, "bias", 1)) - - params["cross_v_weight"].append((block.layer[1].EncDecAttention.v, "weight", 1)) - if getattr(block.layer[1].EncDecAttention.v, "bias", None) is not None: - params["cross_v_bias"].append((block.layer[1].EncDecAttention.v, "bias", 1)) - - params["cross_out_weight"].append((block.layer[1].EncDecAttention.o, "weight", 0)) - if getattr(block.layer[1].EncDecAttention.o, "bias", None) is not None: - params["cross_out_bias"].append((block.layer[1].EncDecAttention.o, "bias", 0)) - - params["cross_ln_weight"].append((block.layer[1].layer_norm, "weight", 0)) - if getattr(block.layer[1].layer_norm, "bias", None) is not None: - params["cross_ln_bias"].append((block.layer[1].layer_norm, "bias", 0)) - - if hasattr(block.layer[ffn_index], "DenseReluDense"): - if isinstance(block.layer[ffn_index].DenseReluDense, (T5DenseReluDense)): - params["ffn_inter_weight_0"].append((block.layer[ffn_index].DenseReluDense.wi, "weight", 1)) - if getattr(block.layer[ffn_index].DenseReluDense.wi, "bias", None) is not None: - params["ffn_inter_bias_0"].append((block.layer[ffn_index].DenseReluDense.wi, "bias", 1)) - - params["ffn_out_weight"].append((block.layer[ffn_index].DenseReluDense.wo, "weight", 0)) - if getattr(block.layer[ffn_index].DenseReluDense.wo, "bias", None) is not None: - 
params["ffn_out_bias"].append((block.layer[ffn_index].DenseReluDense.wo, "bias")) - elif isinstance(block.layer[ffn_index].DenseReluDense, (T5DenseGatedGeluDense)): - params["ffn_inter_weight_0"].append((block.layer[ffn_index].DenseReluDense.wi_0, "weight", 1)) - if getattr(block.layer[ffn_index].DenseReluDense.wi_0, "bias", None) is not None: - params["ffn_inter_bias_0"].append((block.layer[ffn_index].DenseReluDense.wi_0, "bias", 1)) - - params["ffn_inter_weight_1"].append((block.layer[ffn_index].DenseReluDense.wi_1, "weight", 1)) - if getattr(block.layer[ffn_index].DenseReluDense.wi_1, "bias", None) is not None: - params["ffn_inter_bias_1"].append((block.layer[ffn_index].DenseReluDense.wi_1, "bias", 1)) - - params["ffn_out_weight"].append((block.layer[ffn_index].DenseReluDense.wo, "weight", 0)) - if getattr(block.layer[ffn_index].DenseReluDense.wo, "bias", None) is not None: - params["ffn_out_bias"].append((block.layer[ffn_index].DenseReluDense.wo, "bias")) - else: - raise NotImplementedError("Faster only support T5DenseReluDense and T5DenseGatedGeluDense. ") - - params["ffn_ln_weight"].append((block.layer[ffn_index].layer_norm, "weight")) - if getattr(block.layer[ffn_index].layer_norm, "bias", None) is not None: - params["ffn_ln_bias"].append((block.layer[ffn_index].layer_norm, "bias")) - - params["slf_out_weight"].append((block.layer[0].SelfAttention.o, "weight", 0)) - if getattr(block.layer[0].SelfAttention.o, "bias", None) is not None: - params["slf_out_bias"].append((block.layer[0].SelfAttention.o, "bias")) - - params["slf_ln_weight"].append((block.layer[0].layer_norm, "weight")) - if getattr(block.layer[0].layer_norm, "bias", None) is not None: - params["slf_ln_bias"].append((block.layer[0].layer_norm, "bias")) - - if getattr(module, "norm", None) is not None: - params["decoder_ln_weight"].append((module.final_layer_norm, "weight")) - if getattr(module.final_layer_norm, "bias", None) is not None: - params["decoder_ln_bias"].append((module.final_layer_norm, "bias")) - - model.apply(_convert) - return params - - -class InferBase(nn.Layer): - def __init__(self, use_fp16_decoding): - super(InferBase, self).__init__() - self._use_fp16_decoding = use_fp16_decoding - - def default_bias(self, weight, index, is_null=False): - if is_null: - size = 1 - elif isinstance(weight, (list, tuple)): - size = weight[0].shape[index] - else: - size = weight.shape[index] - - if not hasattr(self, "default_bias_" + str(size)): - setattr( - self, - "default_bias_" + str(size), - paddle.zeros(shape=[size], dtype="float16" if self._use_fp16_decoding else "float32"), - ) - - if isinstance(weight, (list, tuple)): - return [getattr(self, "default_bias_" + str(size))] * len(weight) - else: - return [getattr(self, "default_bias_" + str(size))] - - -class InferTransformerDecoding(nn.Layer): - def __init__( - self, - decoder, - word_embedding, - positional_embedding, - linear, - num_decoder_layers, - n_head, - d_model, - bos_id=0, - eos_id=1, - decoding_strategy="beam_search", - beam_size=4, - topk=1, - topp=0.0, - max_out_len=256, - diversity_rate=0.0, - decoding_lib=None, - use_fp16_decoding=False, - rel_len=False, - alpha=0.6, - ): - # if decoding_lib is None: - # raise ValueError( - # "The args decoding_lib must be set to use FastGeneration. 
") - # elif not os.path.exists(decoding_lib): - # raise ValueError("The path to decoding lib is not exist.") - if decoding_lib is not None and os.path.isfile(decoding_lib): - # Maybe it has been loadad by `ext_utils.load` - if "FastGeneration" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(decoding_lib) - LOADED_EXT["FastGeneration"] = ops - else: - if decoding_lib is not None: - logger.warning("The specified decoding_lib does not exist, and it will be built automatically.") - load("FastGeneration", verbose=True) - - size_per_head = d_model / n_head - # fuse_qkv can only support size_per_head of [32, 64, 128]. - if size_per_head in [32, 64, 128]: - self._fuse_qkv = True - else: - self._fuse_qkv = False - - super(InferTransformerDecoding, self).__init__() - for arg, value in locals().items(): - if arg not in ["self", "decoder", "word_embedding", "positional_embedding", "linear"]: - setattr(self, "_" + arg, value) - # process weights - if use_fp16_decoding: - for mod in decoder.layers: - mod.norm1.weight = transfer_param(mod.norm1.weight) - mod.norm1.bias = transfer_param(mod.norm1.bias, is_bias=True) - mod.self_attn.q_proj.weight = transfer_param(mod.self_attn.q_proj.weight) - mod.self_attn.q_proj.bias = transfer_param(mod.self_attn.q_proj.bias, is_bias=True) - mod.self_attn.k_proj.weight = transfer_param(mod.self_attn.k_proj.weight) - mod.self_attn.k_proj.bias = transfer_param(mod.self_attn.k_proj.bias, is_bias=True) - mod.self_attn.v_proj.weight = transfer_param(mod.self_attn.v_proj.weight) - mod.self_attn.v_proj.bias = transfer_param(mod.self_attn.v_proj.bias, is_bias=True) - mod.self_attn.out_proj.weight = transfer_param(mod.self_attn.out_proj.weight) - mod.self_attn.out_proj.bias = transfer_param(mod.self_attn.out_proj.bias, is_bias=True) - - mod.norm2.weight = transfer_param(mod.norm2.weight) - mod.norm2.bias = transfer_param(mod.norm2.bias, is_bias=True) - mod.cross_attn.q_proj.weight = transfer_param(mod.cross_attn.q_proj.weight) - mod.cross_attn.q_proj.bias = transfer_param(mod.cross_attn.q_proj.bias, is_bias=True) - mod.cross_attn.k_proj.weight = transfer_param(mod.cross_attn.k_proj.weight) - mod.cross_attn.k_proj.bias = transfer_param(mod.cross_attn.k_proj.bias, is_bias=True) - mod.cross_attn.v_proj.weight = transfer_param(mod.cross_attn.v_proj.weight) - mod.cross_attn.v_proj.bias = transfer_param(mod.cross_attn.v_proj.bias, is_bias=True) - mod.cross_attn.out_proj.weight = transfer_param(mod.cross_attn.out_proj.weight) - mod.cross_attn.out_proj.bias = transfer_param(mod.cross_attn.out_proj.bias, is_bias=True) - - mod.norm3.weight = transfer_param(mod.norm3.weight) - mod.norm3.bias = transfer_param(mod.norm3.bias, is_bias=True) - mod.linear1.weight = transfer_param(mod.linear1.weight) - mod.linear1.bias = transfer_param(mod.linear1.bias, is_bias=True) - mod.linear2.weight = transfer_param(mod.linear2.weight) - mod.linear2.bias = transfer_param(mod.linear2.bias, is_bias=True) - - decoder.norm.weight = transfer_param(decoder.norm.weight) - decoder.norm.bias = transfer_param(decoder.norm.bias, is_bias=True) - - linear.weight = transfer_param(linear.weight) - linear.bias = transfer_param(linear.bias, is_bias=True) - - positional_embedding.weight = transfer_param(positional_embedding.weight) - word_embedding.weight = transfer_param(word_embedding.weight) - - self.slf_ln_weight = [] - self.slf_ln_bias = [] - self.slf_q_weight = [] - self.slf_q_bias = [] - self.slf_k_weight = [] - self.slf_k_bias = [] - self.slf_v_weight = [] - 
self.slf_v_bias = [] - self.slf_out_weight = [] - self.slf_out_bias = [] - - self.cross_ln_weight = [] - self.cross_ln_bias = [] - self.cross_q_weight = [] - self.cross_q_bias = [] - self.cross_k_weight = [] - self.cross_k_bias = [] - self.cross_v_weight = [] - self.cross_v_bias = [] - self.cross_out_weight = [] - self.cross_out_bias = [] - - self.ffn_ln_weight = [] - self.ffn_ln_bias = [] - self.ffn_inter_weight = [] - self.ffn_inter_bias = [] - self.ffn_out_weight = [] - self.ffn_out_bias = [] - - for i, mod in enumerate(decoder.layers): - self.slf_ln_weight.append(mod.norm1.weight) - self.slf_ln_bias.append(mod.norm1.bias) - - if self._fuse_qkv: - q_weight_shape = mod.self_attn.q_proj.weight.shape - k_weight_shape = mod.self_attn.k_proj.weight.shape - v_weight_shape = mod.self_attn.v_proj.weight.shape - - q_weights = self.create_parameter( - shape=[q_weight_shape[0], q_weight_shape[1] + k_weight_shape[1] + v_weight_shape[1]], - dtype="float16" if use_fp16_decoding else "float32", - ) - setattr(self, "slf_q_weight_" + str(i), q_weights) - self.slf_q_weight.append(getattr(self, "slf_q_weight_" + str(i))) - - q_bias_shape = mod.self_attn.q_proj.bias.shape - k_bias_shape = mod.self_attn.k_proj.bias.shape - v_bias_shape = mod.self_attn.v_proj.bias.shape - - q_biases = self.create_parameter( - shape=[q_bias_shape[0] + k_bias_shape[0] + v_bias_shape[0]], - dtype="float16" if use_fp16_decoding else "float32", - is_bias=True, - ) - setattr(self, "slf_q_bias_" + str(i), q_biases) - self.slf_q_bias.append(getattr(self, "slf_q_bias_" + str(i))) - else: - self.slf_q_weight.append(mod.self_attn.q_proj.weight) - self.slf_q_bias.append(mod.self_attn.q_proj.bias) - - self.slf_k_weight.append(mod.self_attn.k_proj.weight) - self.slf_k_bias.append(mod.self_attn.k_proj.bias) - self.slf_v_weight.append(mod.self_attn.v_proj.weight) - self.slf_v_bias.append(mod.self_attn.v_proj.bias) - self.slf_out_weight.append(mod.self_attn.out_proj.weight) - self.slf_out_bias.append(mod.self_attn.out_proj.bias) - - self.cross_ln_weight.append(mod.norm2.weight) - self.cross_ln_bias.append(mod.norm2.bias) - self.cross_q_weight.append(mod.cross_attn.q_proj.weight) - self.cross_q_bias.append(mod.cross_attn.q_proj.bias) - self.cross_k_weight.append(mod.cross_attn.k_proj.weight) - self.cross_k_bias.append(mod.cross_attn.k_proj.bias) - self.cross_v_weight.append(mod.cross_attn.v_proj.weight) - self.cross_v_bias.append(mod.cross_attn.v_proj.bias) - self.cross_out_weight.append(mod.cross_attn.out_proj.weight) - self.cross_out_bias.append(mod.cross_attn.out_proj.bias) - - self.ffn_ln_weight.append(mod.norm3.weight) - self.ffn_ln_bias.append(mod.norm3.bias) - self.ffn_inter_weight.append(mod.linear1.weight) - self.ffn_inter_bias.append(mod.linear1.bias) - self.ffn_out_weight.append(mod.linear2.weight) - self.ffn_out_bias.append(mod.linear2.bias) - - self.decoder_ln_weight = [decoder.norm.weight] - self.decoder_ln_bias = [decoder.norm.bias] - - self.pos_emb = [positional_embedding.weight] - self.word_emb = [word_embedding.weight] - - self.linear_weight = [linear.weight] - self.linear_bias = [linear.bias] - - def forward(self, enc_output, memory_seq_lens, trg_word=None): - def parse_function(func_name): - return partial( - func_name, - word_emb=self.word_emb, - slf_ln_weight=self.slf_ln_weight, - slf_ln_bias=self.slf_ln_bias, - slf_q_weight=self.slf_q_weight, - slf_q_bias=self.slf_q_bias, - slf_k_weight=self.slf_k_weight, - slf_k_bias=self.slf_k_bias, - slf_v_weight=self.slf_v_weight, - slf_v_bias=self.slf_v_bias, - 
slf_out_weight=self.slf_out_weight, - slf_out_bias=self.slf_out_bias, - cross_ln_weight=self.cross_ln_weight, - cross_ln_bias=self.cross_ln_bias, - cross_q_weight=self.cross_q_weight, - cross_q_bias=self.cross_q_bias, - cross_k_weight=self.cross_k_weight, - cross_k_bias=self.cross_k_bias, - cross_v_weight=self.cross_v_weight, - cross_v_bias=self.cross_v_bias, - cross_out_weight=self.cross_out_weight, - cross_out_bias=self.cross_out_bias, - ffn_ln_weight=self.ffn_ln_weight, - ffn_ln_bias=self.ffn_ln_bias, - ffn_inter_weight=self.ffn_inter_weight, - ffn_inter_bias=self.ffn_inter_bias, - ffn_out_weight=self.ffn_out_weight, - ffn_out_bias=self.ffn_out_bias, - decoder_ln_weight=self.decoder_ln_weight, - decoder_ln_bias=self.decoder_ln_bias, - linear_weight=self.linear_weight, - linear_bias=self.linear_bias, - pos_emb=self.pos_emb, - _decoding_strategy=self._decoding_strategy, - _beam_size=self._beam_size, - _topk=self._topk, - _topp=self._topp, - _n_head=self._n_head, - _size_per_head=int(self._d_model / self._n_head), - _n_layer=self._num_decoder_layers, - _bos_id=self._bos_id, - _eos_id=self._eos_id, - _max_out_len=self._max_out_len, - _diversity_rate=self._diversity_rate, - _rel_len=self._rel_len, - _alpha=self._alpha, - ) - - if self._decoding_strategy.startswith("beam_search"): - # TODO: Due to paddle.tile bug in static graph, tile_beam_merge_with_batch - # cannot work properly. These comments should be opened after PaddlePaddle v2.2.2. - if paddle.__version__ <= "2.1.3": - enc_output = nn.decode.BeamSearchDecoder.tile_beam_merge_with_batch(enc_output, self._beam_size) - memory_seq_lens = nn.decode.BeamSearchDecoder.tile_beam_merge_with_batch( - memory_seq_lens, self._beam_size - ) - else: - enc_output_shape = enc_output.shape - batch_size = enc_output_shape[0] - max_seq_len = enc_output_shape[1] - enc_output = enc_output.unsqueeze([1]) - memory_seq_lens = memory_seq_lens.unsqueeze([1]) - enc_output = paddle.expand( - enc_output, shape=[batch_size, self._beam_size, max_seq_len, self._d_model] - ).reshape([batch_size * self._beam_size, max_seq_len, self._d_model]) - memory_seq_lens = paddle.expand(memory_seq_lens, shape=[batch_size, self._beam_size]).reshape( - [batch_size * self._beam_size] - ) - - if trg_word is None: - output_ids, parent_ids, sequence_length = parse_function(infer_transformer_decoding)( - enc_output=[enc_output], memory_seq_lens=[memory_seq_lens] - ) - else: - output_ids, parent_ids, sequence_length = parse_function(infer_force_decoding)( - enc_output=[enc_output], memory_seq_lens=[memory_seq_lens], trg_word=[trg_word] - ) - - ids = finalize( - self._beam_size, output_ids, parent_ids, sequence_length, decoding_strategy=self._decoding_strategy - ) - - return ids - - -# Patch for parallel inference to save memory -class FTParaConf(object): - r""" - Configurations for model parallel in FastGeneration. Currently only - support GPT. Please refer to `Megatron `__ - for details. - - Args: - tensor_para_size (int, optional): The size for tensor parallel. If it is - 1, tensor parallel would not be used. Default to 1. - layer_para_size (int, optional): The size for layer parallel. If it is - 1, layer parallel would not be used. Default to 1. - layer_para_batch_size (int, optional): The local batch size for pipeline - parallel. It is suggested to use `batch_size // layer_para_size`. - Default to 1. 
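The settings described above imply a fixed mapping from a process's flat rank to its tensor-parallel and layer-parallel group, matching the arithmetic in `__init__` below. A minimal standalone sketch of that mapping (plain Python, illustrative values only):

def para_ranks(rank, tensor_para_size, layer_para_size):
    # world_size must equal tensor_para_size * layer_para_size (asserted in __init__ below).
    tensor_para_rank = rank % tensor_para_size   # position inside the tensor-parallel group
    layer_para_rank = rank // tensor_para_size   # which layer-parallel (pipeline) group this rank belongs to
    return tensor_para_rank, layer_para_rank

# e.g. tensor_para_size=2, layer_para_size=2:
#   rank 0 -> (0, 0), rank 1 -> (1, 0), rank 2 -> (0, 1), rank 3 -> (1, 1)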
- """ - - def __init__(self, tensor_para_size=None, layer_para_size=None, layer_para_batch_size=1): - self.world_size = self._env2int( - [ # MPICH, OpenMPI, IMPI - "MPI_LOCALNRANKS", - "OMPI_COMM_WORLD_SIZE", - "PMI_SIZE", - "MV2_COMM_WORLD_SIZE", - "WORLD_SIZE", - ], - 1, - ) - self.rank = self._env2int( - [ # MPICH, OpenMPI, IMPI - "MPI_LOCALRANKID", - "OMPI_COMM_WORLD_RANK", - "PMI_RANK", - "MV2_COMM_WORLD_RANK", - "RANK", - ], - 0, - ) - if layer_para_size is None: - layer_para_size = 1 - if tensor_para_size is None: - tensor_para_size = self.world_size // layer_para_size - self.no_para = tensor_para_size == 1 and layer_para_size == 1 - self.tensor_para_size = tensor_para_size - self.layer_para_size = layer_para_size - self.layer_para_batch_size = layer_para_batch_size - - assert ( - self.world_size == tensor_para_size * layer_para_size - ), "tensor_para_size * layer_para_size must be equal to world_size." - self.tensor_para_rank = self.rank % self.tensor_para_size - self.layer_para_rank = self.rank // self.tensor_para_size - self.is_partial_model = False - - @staticmethod - def _env2int(env_list, default=-1): - for e in env_list: - val = int(os.environ.get(e, -1)) - if val >= 0: - return val - return default - - def is_last_group(self): - r""" - For layer parallel, only the process corresponding to the last layer - group can get the predict results. It is used to check whether this is - the process corresponding to the last layer group. - """ - return self.layer_para_rank == self.layer_para_size - 1 - - def is_load(self, i, num_layer): - r""" - Whether or not the given transformer layer of should be loaded to the - current parallel model. For layer parallel, there is no need not to load - other layer groups. - - Args: - i (int): The index of Transformer layer. - num_layer (int): The number of Transformer layers. - - Returns: - bool: Indicate whether or not the given transformer layer of should - be loaded to the current parallel model. - """ - if self.no_para: - return True - # Take into account model only including partial weights. - if self.is_partial_model: - return True - layers_per_device = num_layer // self.layer_para_size - return (i >= layers_per_device * self.layer_para_rank) and i < layers_per_device * (self.layer_para_rank + 1) - - def slice_weight(self, weight, axis, phase=1, out_param=False): - r""" - Get the weight slice for tensor parallel. - - Args: - weight (Tensor or ndarray): The weight or bias to be sliced. - axis (int): The axis to perform slice. - phase (int, optional): 0 is used for creating partial model when - initializing and `from_pretrained`. While 1 is used in converting - parameters to FastGeneration. No slice would be performed if - it is 1, since parameters have been sliced in `phase=0`. - out_param (bool, optional): If true, `weight` should be a Parameter - and force the output to be a Parameter. - - Returns: - Tensor or ndarray: The sliced weight. - """ - # weight can be parameter/tensor/ndarray - if self.no_para: - return weight - # Take into account model only including partial weights. - if self.is_partial_model: - if phase == 1: - # 0 for init - # 1 for convert param to FT - # TODO(guosheng): Maybe we can remove slice_weight in converting - # parameters to FT if we have sliced parameters at phase 0, while - # we allow to use non-partial model when converting parameters - # to FT currently. 
- return weight - if len(weight.shape) == 1: - axis = 0 - local_size = weight.shape[axis] // self.tensor_para_size - start_offset = self.tensor_para_rank * local_size - end_offset = start_offset + local_size - if len(weight.shape) == 1: - w_slice = weight[start_offset:end_offset] - else: - w_slice = weight[:, start_offset:end_offset] if axis == 1 else weight[start_offset:end_offset, :] - if out_param: - # Assume weight is also a Parameter. - w = type(weight)(shape=w_slice.shape, dtype=weight.dtype, is_bias=len(weight.shape) == 1) - # NOTE: `VarBase.set_value` would use `w.numpy()` while w is not - # initialized and can not be used directly. - # TODO(guosheng): If `w.place `can be used here, use `w.place` to - # avoid w.place and _current_expected_place are different. - w.value().get_tensor().set(w_slice, paddle.framework._current_expected_place()) - return w - else: - return w_slice - - def set_partial_model(self, is_partial_model): - r""" - This is used to set whether or not the current model has complete - parameters. - - Args: - is_partial_model (bool): It is used to set whether or not the - current model has complete parameters. - """ - self.is_partial_model = is_partial_model - - def fit_partial_model(self, model, state_to_load): - r""" - Slice every values included in `state_to_load` according to the shape - of corresponding parameters in `model`. This is used in `from_pratrained` - to get sliced parameter values. - - Args: - model (PretrainedModel): The model to use. - state_to_load (dict): The state dict including complete parameter - values of model. - - Returns: - dict: The state dict contains adjusted values. - """ - if self.no_para or not self.is_partial_model: - return state_to_load - - def fit_param(p, v): - if p.shape[0] != v.shape[0]: - return _ft_para_conf.slice_weight(v, axis=0, phase=0) - if len(p.shape) == 2 and p.shape[1] != v.shape[1]: - return _ft_para_conf.slice_weight(v, axis=1, phase=0) - return v - - for k, v in model.state_dict().items(): - if k in state_to_load: - state_to_load[k] = fit_param(v, state_to_load[k]) - return state_to_load - - -# TODO(guosheng): Maybe use context-manager to allow multiple models. -_ft_para_conf = FTParaConf() - - -def get_ft_para_conf(): - r""" - Get settings for model parallel. - - Returns: - FTParaConf: The settings for model parallel. - """ - return _ft_para_conf - - -def enable_ft_para(tensor_para_size=None, layer_para_size=None, layer_para_batch_size=1): - r""" - Enable model parallel with the given settings in FastGeneration. Currently only - support GPT. Please refer to `Megatron `__ - for details. - - Args: - tensor_para_size (int, optional): The size for tensor parallel. If it is - 1, tensor parallel would not be used. When it is None, tensor parallel - size would be set as `world_size / layer_para_size`. Default to None. - layer_para_size (int, optional): The size for layer parallel. If it is - 1, layer parallel would not be used. When it is None, it would be set - as 1. Default to None. - layer_para_batch_size (int, optional): The local batch size for pipeline - parallel. It is suggested to use `batch_size // layer_para_size`. - Default to 1. - """ - global _ft_para_conf - _ft_para_conf = FTParaConf(tensor_para_size, layer_para_size, layer_para_batch_size) - if _ft_para_conf.no_para: - return - - def reset_param(layer, attr, axis): - param = getattr(layer, attr) - # NOTE: Assignment to parameter 'weight' should be of type Parameter or - # None. 
Additionaly, we cannot delattr and setattr which would remove - # the param from layer._parameters and state_dict, thus cannot fit_partial_model - param = _ft_para_conf.slice_weight(param, axis, phase=0, out_param=True) - setattr(layer, attr, param) - - def layer_init_wrapper(func): - @functools.wraps(func) - def _impl(self, *args, **kwargs): - init_dict = fn_args_to_dict(func, *((self,) + args), **kwargs) - init_dict.pop("self") - assert ( - init_dict["nhead"] % _ft_para_conf.tensor_para_size == 0 - ), "The number of heads(%d) cannot be evenly divisible by `tensor_para_size`(%d)." % ( - init_dict["nhead"], - _ft_para_conf.tensor_para_size, - ) - func(self, *args, **kwargs) - # Reset parameters with corresponding slice. - for x, attr in [(m, n) for m in ("q", "k", "v") for n in ("weight", "bias")]: - reset_param(getattr(self.self_attn, x + "_proj"), attr, 1) - reset_param(self.self_attn.out_proj, "weight", 0) - reset_param(self.linear1, "weight", 1) - reset_param(self.linear1, "bias", 1) - reset_param(self.linear2, "weight", 0) - - return _impl - - def block_init_wrapper(func): - @functools.wraps(func) - def _impl(self, *args, **kwargs): - init_dict = fn_args_to_dict(func, *((self,) + args), **kwargs) - init_dict.pop("self") - num_layers = init_dict["num_hidden_layers"] - init_dict["num_hidden_layers"] //= _ft_para_conf.layer_para_size - func(self, **init_dict) - self.num_layers = num_layers - self.config["num_hidden_layers"] = num_layers - - return _impl - - def block_state_wrapper(func): - # TODO(guosheng): Uset state hook instead of block_state_wrapper. - # self.register_state_dict_hook(reidx_state_layer) - @functools.wraps(func) - def _impl(self, *args, **kwargs): - state_dict = func(self, *args, **kwargs) - arg_dict = fn_args_to_dict(func, *((self,) + args), **kwargs) - structured_name_prefix = arg_dict["structured_name_prefix"] - - def reidx_state_layer(state_dict): - prefix = structured_name_prefix + "decoder.layers." - prefix_len = len(prefix) - for name, param in list(state_dict.items()): - if name.startswith(prefix): - layer_idx_len = 0 - for i in name[prefix_len:]: - if i == ".": - break - else: - layer_idx_len += 1 - layer_idx = int(name[prefix_len : prefix_len + layer_idx_len]) - new_name = ( - name[:prefix_len] - + str(_ft_para_conf.layer_para_rank * len(self.decoder.layers) + layer_idx) - + name[prefix_len + layer_idx_len :] - ) - state_dict[new_name] = state_dict.pop(name) - - reidx_state_layer(state_dict) - return state_dict - - return _impl - - # GPT - layer_init_fn = paddlenlp.transformers.gpt.modeling.TransformerDecoderLayer.__init__ - paddlenlp.transformers.gpt.modeling.TransformerDecoderLayer.__init__ = layer_init_wrapper(layer_init_fn) - # Note that Transformer block in GPT is not created in TransformerDecoder - # but in GPTModel. - block_init_fn = paddlenlp.transformers.gpt.modeling.GPTModel.__init__ - paddlenlp.transformers.gpt.modeling.GPTModel.__init__ = block_init_wrapper(block_init_fn) - block_state_fn = paddlenlp.transformers.gpt.modeling.GPTModel.state_dict - paddlenlp.transformers.gpt.modeling.GPTModel.state_dict = block_state_wrapper(block_state_fn) - # PLATO - paddle.nn.TransformerEncoderLayer.__init__ = layer_init_wrapper(paddle.nn.TransformerEncoderLayer.__init__) - _ft_para_conf.set_partial_model(True) - # TODO(guosheng): Should we set device here, sometimes we want to create - # models on CPU first to save memory. 
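`reset_param` above relies on `slice_weight` to give every tensor-parallel rank its own contiguous shard of each projection. A rough NumPy analogue of that slicing (illustrative only; the real code operates on Paddle parameters):

import numpy as np

def slice_weight(weight, axis, tensor_para_size, tensor_para_rank):
    # 1-D biases are always sliced along axis 0, mirroring the Paddle implementation.
    if weight.ndim == 1:
        axis = 0
    local_size = weight.shape[axis] // tensor_para_size
    start = tensor_para_rank * local_size
    end = start + local_size
    return weight[:, start:end] if (weight.ndim > 1 and axis == 1) else weight[start:end]

# Per reset_param above: the q/k/v projections and linear1 weights are split along
# axis 1 (columns), while out_proj and linear2 weights are split along axis 0 (rows).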
- # paddle.set_device("gpu:" + str(_ft_para_conf.rank)) - # yield - - -class InferOptDecoding(nn.Layer): - """extract infer model parameters and feed it into the cuda decoder""" - - def __init__(self, model: OPTForCausalLM, decoding_lib=None, use_fp16_decoding=False): - if decoding_lib is not None and os.path.isfile(decoding_lib): - if "FastGeneration" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(decoding_lib) - LOADED_EXT["FastGeneration"] = ops - else: - if decoding_lib is not None: - logger.warning("The specified decoding_lib does not exist, and it will be built automatically.") - load( - "FastGeneration" if get_ft_para_conf().no_para else "FasterTransformerParallel", - verbose=True, - need_parallel=not get_ft_para_conf().no_para, - ) - - super(InferOptDecoding, self).__init__() - - self.use_fp16_decoding = use_fp16_decoding - self.model = model - self.head_num = self.model.opt.config["num_attention_heads"] - self.size_per_head = int(self.model.opt.config["hidden_size"] / self.head_num) - self.num_layer = self.model.opt.config["num_hidden_layers"] - self.inner_size = self.model.opt.config["intermediate_size"] - - params = convert_params(self, model, fuse_qkv=1, use_fp16=use_fp16_decoding, restore_data=True) - - if self.model.opt.embeddings.project_in is not None: - self.word_emb = paddle.matmul( - self.model.opt.embeddings.word_embeddings.weight, self.model.opt.embeddings.project_in.weight - ) - # set the linear_weight - self.linear_weight = paddle.matmul( - self.model.opt.embeddings.word_embeddings.weight, self.model.opt.decoder.project_out.weight.T - ) - else: - self.word_emb = self.model.opt.embeddings.word_embeddings.weight - self.linear_weight = self.model.opt.embeddings.word_embeddings.weight - - # reset the offset in position embedding - position_embedding = self.model.opt.embeddings.position_embeddings - self.pos_emb = paddle.concat([position_embedding.weight[2:], position_embedding.weight[:2]]) - - # if there is no final layer norm, pass empty tensor to fusion opt op - final_layer_norm = self.model.opt.decoder.final_layer_norm - if final_layer_norm is None: - self.decoder_ln_weight = paddle.empty(shape=[0]) - self.decoder_ln_bias = paddle.empty(shape=[0]) - else: - self.decoder_ln_weight = final_layer_norm.weight - self.decoder_ln_bias = final_layer_norm.bias - - self.normalize_before = self.model.decoder.final_layer_norm is not None - - for k, v in params.items(): - setattr(self, k, v) - - # check the dtype of embedding - dtype = "float16" if use_fp16_decoding else "float32" - self.word_emb = transfer_param(self.word_emb, dtype=dtype, is_bias=False, restore_data=True) - self.linear_weight = transfer_param(self.linear_weight, dtype=dtype, is_bias=False, restore_data=True) - self.pos_emb = transfer_param(self.pos_emb, dtype=dtype, is_bias=False, restore_data=True) - self.decoder_ln_weight = transfer_param(self.decoder_ln_weight, dtype=dtype, is_bias=False, restore_data=True) - self.decoder_ln_bias = transfer_param(self.decoder_ln_bias, dtype=dtype, is_bias=True, restore_data=True) - - def forward( - self, - input_ids, - mem_seq_len, - attention_mask=None, - topk=4, - topp=0.0, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - forced_eos_token_id=None, - max_out_len=256, - temperature=1, - ): - if attention_mask is None: - batch_size = input_ids.shape[0] - attention_mask = paddle.tril( - paddle.ones( - [batch_size, mem_seq_len, mem_seq_len], dtype="float16" if self.use_fp16_decoding else "float32" - ) - ) - elif 
self.use_fp16_decoding and attention_mask.dtype == paddle.float32: - attention_mask = paddle.cast(attention_mask, dtype="float16") - - output_ids = infer_opt_decoding( - input=[input_ids], - attn_mask=[attention_mask], - mem_seq_len=[mem_seq_len], - word_emb=self.word_emb, - slf_ln_weight=self.slf_ln_weight, - slf_ln_bias=self.slf_ln_bias, - slf_q_weight=self.slf_q_weight, - slf_q_bias=self.slf_q_bias, - slf_k_weight=self.slf_k_weight, - slf_k_bias=self.slf_k_bias, - slf_v_weight=self.slf_v_weight, - slf_v_bias=self.slf_v_bias, - slf_out_weight=self.slf_out_weight, - slf_out_bias=self.slf_out_bias, - ffn_ln_weight=self.ffn_ln_weight, - ffn_ln_bias=self.ffn_ln_bias, - ffn_inter_weight=self.ffn_inter_weight, - ffn_inter_bias=self.ffn_inter_bias, - ffn_out_weight=self.ffn_out_weight, - ffn_out_bias=self.ffn_out_bias, - decoder_ln_weight=self.decoder_ln_weight, - decoder_ln_bias=self.decoder_ln_bias, - pos_emb=self.pos_emb, - linear_weight=self.linear_weight, - normalize_before=self.normalize_before, - topk=topk, - topp=topp, - max_out_len=max_out_len, - head_num=self.head_num, - size_per_head=self.size_per_head, - num_layer=self.num_layer, - bos_id=bos_token_id, - eos_id=eos_token_id, - temperature=temperature, - use_fp16_decoding=self.use_fp16_decoding, - ) - - output_ids = output_ids[input_ids.shape[-1] :, :] - if forced_eos_token_id is not None: - output_ids[:, -1] = forced_eos_token_id - return output_ids - - -class InferGptDecoding(nn.Layer): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False): - if decoding_lib is not None and os.path.isfile(decoding_lib): - if "FastGeneration" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(decoding_lib) - LOADED_EXT["FastGeneration"] = ops - else: - if decoding_lib is not None: - logger.warning("The specified decoding_lib does not exist, and it will be built automatically.") - load( - "FastGeneration" if get_ft_para_conf().no_para else "FasterTransformerParallel", - verbose=True, - need_parallel=not get_ft_para_conf().no_para, - ) - - super(InferGptDecoding, self).__init__() - - self.use_fp16_decoding = use_fp16_decoding - self.model = model - self.head_num = self.model.gpt.config["num_attention_heads"] - self.size_per_head = int(self.model.gpt.config["hidden_size"] / self.head_num) - self.num_layer = self.model.gpt.config["num_hidden_layers"] - self.inner_size = self.model.gpt.config["intermediate_size"] - - params = convert_params(self, model, fuse_qkv=1, use_fp16=use_fp16_decoding, restore_data=True) - params["word_emb"].append((self.model.gpt.embeddings.word_embeddings, "weight")) - params["pos_emb"].append((self.model.gpt.embeddings.position_embeddings, "weight")) - - # if model share word_embeddings weight - if id(self.model.gpt.embeddings.word_embeddings) == id(self.model.lm_head.weight): - params["linear_weight"].append((self.model.gpt.embeddings.word_embeddings, "weight")) - else: - params["linear_weight"].append((self.model.lm_head.weight, False, partial(setattr, self, "weight"))) - - for k, v in params.items(): - setattr(self, k, v) - - def forward( - self, - input_ids, - mem_seq_len, - attention_mask=None, - topk=4, - topp=0.0, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - forced_eos_token_id=None, - max_out_len=256, - temperature=1, - ): - if attention_mask is None: - batch_size = input_ids.shape[0] - attention_mask = paddle.tril( - paddle.ones( - [batch_size, paddle.max(mem_seq_len), paddle.max(mem_seq_len)], - dtype="float16" if self.use_fp16_decoding 
else "float32", - ) - ) - elif self.use_fp16_decoding and attention_mask.dtype == paddle.float32: - attention_mask = paddle.cast(attention_mask, dtype="float16") - - (output_ids,) = infer_gpt_decoding( - input=[input_ids], - attn_mask=[attention_mask], - mem_seq_len=[mem_seq_len], - word_emb=self.word_emb, - slf_ln_weight=self.slf_ln_weight, - slf_ln_bias=self.slf_ln_bias, - slf_q_weight=self.slf_q_weight, - slf_q_bias=self.slf_q_bias, - slf_k_weight=self.slf_k_weight, - slf_k_bias=self.slf_k_bias, - slf_v_weight=self.slf_v_weight, - slf_v_bias=self.slf_v_bias, - slf_out_weight=self.slf_out_weight, - slf_out_bias=self.slf_out_bias, - ffn_ln_weight=self.ffn_ln_weight, - ffn_ln_bias=self.ffn_ln_bias, - ffn_inter_weight=self.ffn_inter_weight, - ffn_inter_bias=self.ffn_inter_bias, - ffn_out_weight=self.ffn_out_weight, - ffn_out_bias=self.ffn_out_bias, - decoder_ln_weight=self.decoder_ln_weight, - decoder_ln_bias=self.decoder_ln_bias, - pos_emb=self.pos_emb, - linear_weight=self.linear_weight, - topk=topk, - topp=topp, - max_out_len=max_out_len, - head_num=self.head_num, - size_per_head=self.size_per_head, - num_layer=self.num_layer, - bos_id=bos_token_id, - eos_id=eos_token_id, - temperature=temperature, - use_fp16_decoding=self.use_fp16_decoding, - ) - - output_ids = output_ids[input_ids.shape[-1] :, :] - if forced_eos_token_id is not None: - output_ids[:, -1] = forced_eos_token_id - return output_ids - - -class InferUnifiedDecoding(nn.Layer): - def __init__( - self, - model, - decoding_lib=None, - use_fp16_decoding=False, - logits_mask=None, - n_head=8, - hidden_dims=512, - size_per_head=64, - n_layer=6, - unk_id=0, - mask_id=30000, - normalize_before=True, - hidden_act="gelu", - ): - if decoding_lib is not None and os.path.isfile(decoding_lib): - # Maybe it has been loadad by `ext_utils.load` - if "FastGeneration" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(decoding_lib) - LOADED_EXT["FastGeneration"] = ops - else: - if decoding_lib is not None: - logger.warning("The specified decoding_lib does not exist, and it will be built automatically.") - load( - "FastGeneration" if get_ft_para_conf().no_para else "FasterTransformerParallel", - verbose=True, - need_parallel=not get_ft_para_conf().no_para, - ) - - super(InferUnifiedDecoding, self).__init__() - for arg, value in locals().items(): - if arg not in ["self"]: - setattr(self, "_" + arg, value) - - params = convert_params(self, model, fuse_qkv=1, use_fp16=use_fp16_decoding, restore_data=True) - params["word_emb"].append((model.embeddings.word_embeddings, "weight")) - params["pos_emb"].append((model.embeddings.position_embeddings, "weight")) - params["type_emb"].append((model.embeddings.token_type_embeddings, "weight")) - if getattr(model.embeddings, "role_embeddings", None) is not None: - params["role_emb"].append((model.embeddings.role_embeddings, "weight")) - else: - # inputs of custom op cannot be None - params["role_emb"].append((paddle.zeros(shape=[1]), False, partial(setattr, self, "default_role_emb"))) - if not self._normalize_before: - # pre-norm params has been converted in `convert_params`, and this - # is only for post-norm such as UNIMO. 
- params["decoder_ln_weight"].append((model.encoder_norm, "weight")) - params["decoder_ln_bias"].append((model.encoder_norm, "bias")) - params["trans_weight"].append((model.lm_head.transform, "weight")) - params["trans_bias"].append((model.lm_head.transform, "bias")) - params["lm_ln_weight"].append((model.lm_head.layer_norm, "weight")) - params["lm_ln_bias"].append((model.lm_head.layer_norm, "bias")) - # NOTE: newly created tensors should be layer attribute refered to be - # able to convert to static graph. - params["linear_weight"].append((model.lm_head.decoder_weight.t(), False, partial(setattr, self, "dec_weight"))) - params["linear_bias"].append( - (paddle.assign(model.lm_head.decoder_bias), True, partial(setattr, self, "dec_bias")) - ) - for k, v in params.items(): - setattr(self, k, v) - - def forward( - self, - input_ids, - attn_mask, - memory_seq_lens, - type_id, - decoder_type_id, - role_id=None, - decoder_role_id=None, - position_id=None, - decoder_position_id=None, - beam_size=4, - topk=4, - topp=0.0, - decoding_strategy="greedy_search", - max_out_len=256, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - forced_eos_token_id=None, - temperature=1.0, - length_penalty=1.0, - diversity_rate=0.0, - pos_bias=True, - rel_len=False, - early_stopping=False, - min_length=0, - ): - if role_id is None: - role_id = paddle.zeros(shape=[0], dtype="int32") - decoder_role_id = paddle.zeros(shape=[0], dtype="int32") - if position_id is None: - position_id = paddle.zeros(shape=[0], dtype="int32") - decoder_position_id = paddle.zeros(shape=[0], dtype="int32") - - if decoding_strategy == "greedy_search": - decoding_strategy = "topk_sampling" - topk = 1 - topp = 0.0 - elif decoding_strategy in ["sampling", "topk_sampling", "topp_sampling"]: - if topp == 1 and topk > 0: - decoding_strategy = "topk_sampling" - topp = 0.0 - elif topp > 0 and topk == 0: - decoding_strategy = "topp_sampling" - else: - raise AttributeError( - "Only topk sampling or topp sampling are supported. " - "Topk sampling and topp sampling cannot be both applied in the fast version." 
- ) - elif decoding_strategy.startswith("beam_search"): - decoding_strategy = "beam_search_v3" - - output_ids, parent_ids, sequence_length, output_scores = infer_unified_decoding( - input_ids=[input_ids], - attn_mask=[attn_mask], - memory_seq_lens=[memory_seq_lens], - type_id=[type_id], - decoder_type_id=[decoder_type_id], - logits_mask=[self._logits_mask], - word_emb=self.word_emb, - slf_ln_weight=self.slf_ln_weight, - slf_ln_bias=self.slf_ln_bias, - slf_q_weight=self.slf_q_weight, - slf_q_bias=self.slf_q_bias, - slf_k_weight=self.slf_k_weight, - slf_k_bias=self.slf_k_bias, - slf_v_weight=self.slf_v_weight, - slf_v_bias=self.slf_v_bias, - slf_out_weight=self.slf_out_weight, - slf_out_bias=self.slf_out_bias, - ffn_ln_weight=self.ffn_ln_weight, - ffn_ln_bias=self.ffn_ln_bias, - ffn_inter_weight=self.ffn_inter_weight, - ffn_inter_bias=self.ffn_inter_bias, - ffn_out_weight=self.ffn_out_weight, - ffn_out_bias=self.ffn_out_bias, - decoder_ln_weight=self.decoder_ln_weight, - decoder_ln_bias=self.decoder_ln_bias, - trans_weight=self.trans_weight, - trans_bias=self.trans_bias, - lm_ln_weight=self.lm_ln_weight, - lm_ln_bias=self.lm_ln_bias, - linear_weight=self.linear_weight, - linear_bias=self.linear_bias, - pos_emb=self.pos_emb, - type_emb=self.type_emb, - role_id=[role_id], - decoder_role_id=[decoder_role_id], - role_emb=self.role_emb, - position_id=[position_id], - decoder_position_id=[decoder_position_id], - _decoding_strategy=decoding_strategy, - _beam_size=beam_size, - _topk=topk, - _topp=topp, - _n_head=self._n_head, - _size_per_head=self._size_per_head, - _n_layer=self._n_layer, - _bos_id=bos_token_id, - _eos_id=eos_token_id, - _max_out_len=max_out_len, - _diversity_rate=-diversity_rate, - _unk_id=self._unk_id, - _mask_id=self._mask_id, - _temperature=temperature, - _len_penalty=length_penalty, - _normalize_before=self._normalize_before, - _pos_bias=pos_bias, - _hidden_act=self._hidden_act, - _rel_len=rel_len, - _early_stopping=early_stopping, - _min_length=min_length, - ) - ids = finalize( - beam_size, - output_ids, - parent_ids, - sequence_length, - forced_eos_token_id=forced_eos_token_id, - decoding_strategy=decoding_strategy, - ) - return ids, output_scores - - -class InferMIRODecoding(nn.Layer): - def __init__( - self, - model, - decoding_lib=None, - use_fp16_decoding=False, - logits_mask=None, - n_head=8, - hidden_dims=512, - size_per_head=64, - n_layer=6, - unk_id=0, - mask_id=30000, - normalize_before=True, - hidden_act="relu", - ): - - if decoding_lib is not None and os.path.isfile(decoding_lib): - # Maybe it has been loadad by `ext_utils.load` - if "FasterTransformer" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(decoding_lib) - LOADED_EXT["FasterTransformer"] = ops - else: - if decoding_lib is not None: - logger.warning("The specified decoding_lib does not exist, and it will be built automatically.") - load( - "FasterTransformer" if get_ft_para_conf().no_para else "FasterTransformerParallel", - verbose=True, - need_parallel=not get_ft_para_conf().no_para, - ) - - super(InferMIRODecoding, self).__init__() - for arg, value in locals().items(): - if arg not in ["self"]: - setattr(self, "_" + arg, value) - - params = convert_params(self, model, fuse_qkv=1, use_fp16=use_fp16_decoding, restore_data=True) - params["word_emb"].append((model.embeddings.word_embeddings, "weight")) - params["pos_emb"].append((model.embeddings.position_embeddings, "weight")) - params["type_emb"].append((model.embeddings.token_type_embeddings, "weight")) - if 
getattr(model.embeddings, "role_embeddings", None) is not None: - params["role_emb"].append((model.embeddings.role_embeddings, "weight")) - else: - # inputs of custom op cannot be None - params["role_emb"].append((paddle.zeros(shape=[1]), False, partial(setattr, self, "default_role_emb"))) - # if not self._normalize_before: - # # pre-norm params has been converted in `convert_params`, and this - # # is only for post-norm such as UNIMO. - # params["decoder_ln_weight"].append((model.encoder_norm, "weight")) - # params["decoder_ln_bias"].append((model.encoder_norm, "bias")) - params["pre_decoder_ln_weight"].append((model.encoder_norm, "weight")) - params["pre_decoder_ln_bias"].append((model.encoder_norm, "bias")) - - params["trans_weight"].append((model.lm_head.transform, "weight")) - params["trans_bias"].append((model.lm_head.transform, "bias")) - params["lm_ln_weight"].append((model.lm_head.layer_norm, "weight")) - params["lm_ln_bias"].append((model.lm_head.layer_norm, "bias")) - # NOTE: newly created tensors should be layer attribute refered to be - # able to convert to static graph. - params["linear_weight"].append((model.lm_head.decoder_weight.t(), False, partial(setattr, self, "dec_weight"))) - params["linear_bias"].append( - (paddle.assign(model.lm_head.decoder_bias), True, partial(setattr, self, "dec_bias")) - ) - for k, v in params.items(): - setattr(self, k, v) - - def forward( - self, - input_ids, - attn_mask, - memory_seq_lens, - type_id, - decoder_type_id, - role_id=None, - decoder_role_id=None, - position_id=None, - decoder_position_id=None, - beam_size=4, - topk=4, - topp=0.0, - decoding_strategy="greedy_search", - max_out_len=256, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - forced_eos_token_id=None, - temperature=1.0, - length_penalty=1.0, - diversity_rate=0.0, - pos_bias=True, - rel_len=False, - early_stopping=False, - min_length=0, - ): - if role_id is None: - role_id = paddle.zeros(shape=[0], dtype="int32") - decoder_role_id = paddle.zeros(shape=[0], dtype="int32") - if position_id is None: - position_id = paddle.zeros(shape=[0], dtype="int32") - decoder_position_id = paddle.zeros(shape=[0], dtype="int32") - - if decoding_strategy == "greedy_search": - decoding_strategy = "topk_sampling" - topk = 1 - topp = 0.0 - elif decoding_strategy in ["sampling", "topk_sampling", "topp_sampling"]: - if topp == 1 and topk > 0: - decoding_strategy = "topk_sampling" - topp = 0.0 - elif topp > 0 and topk == 0: - decoding_strategy = "topp_sampling" - else: - raise AttributeError( - "Only topk sampling or topp sampling are supported. " - "Topk sampling and topp sampling cannot be both applied in the faster version." 
- ) - elif decoding_strategy.startswith("beam_search"): - decoding_strategy = "beam_search_v3" - - output_ids, parent_ids, sequence_length, output_scores = infer_miro_decoding( - input_ids=[input_ids], - attn_mask=[attn_mask], - memory_seq_lens=[memory_seq_lens], - type_id=[type_id], - decoder_type_id=[decoder_type_id], - logits_mask=[self._logits_mask], - word_emb=self.word_emb, - pre_decoder_ln_weight=self.pre_decoder_ln_weight, - pre_decoder_ln_bias=self.pre_decoder_ln_bias, - slf_ln_weight=self.slf_ln_weight, - slf_ln_bias=self.slf_ln_bias, - slf_q_weight=self.slf_q_weight, - slf_q_bias=self.slf_q_bias, - slf_k_weight=self.slf_k_weight, - slf_k_bias=self.slf_k_bias, - slf_v_weight=self.slf_v_weight, - slf_v_bias=self.slf_v_bias, - slf_out_weight=self.slf_out_weight, - slf_out_bias=self.slf_out_bias, - ffn_ln_weight=self.ffn_ln_weight, - ffn_ln_bias=self.ffn_ln_bias, - ffn_inter_weight=self.ffn_inter_weight, - ffn_inter_bias=self.ffn_inter_bias, - ffn_out_weight=self.ffn_out_weight, - ffn_out_bias=self.ffn_out_bias, - decoder_ln_weight=self.decoder_ln_weight, - decoder_ln_bias=self.decoder_ln_bias, - trans_weight=self.trans_weight, - trans_bias=self.trans_bias, - lm_ln_weight=self.lm_ln_weight, - lm_ln_bias=self.lm_ln_bias, - linear_weight=self.linear_weight, - linear_bias=self.linear_bias, - pos_emb=self.pos_emb, - type_emb=self.type_emb, - role_id=[role_id], - decoder_role_id=[decoder_role_id], - role_emb=self.role_emb, - position_id=[position_id], - decoder_position_id=[decoder_position_id], - _decoding_strategy=decoding_strategy, - _beam_size=beam_size, - _topk=topk, - _topp=topp, - _n_head=self._n_head, - _size_per_head=self._size_per_head, - _n_layer=self._n_layer, - _bos_id=bos_token_id, - _eos_id=eos_token_id, - _max_out_len=max_out_len, - _diversity_rate=-diversity_rate, - _unk_id=self._unk_id, - _mask_id=self._mask_id, - _temperature=temperature, - _len_penalty=length_penalty, - _normalize_before=self._normalize_before, - _pos_bias=pos_bias, - _hidden_act=self._hidden_act, - _rel_len=rel_len, - _early_stopping=early_stopping, - _min_length=min_length, - ) - - ids = finalize( - beam_size, - output_ids, - parent_ids, - sequence_length, - forced_eos_token_id=forced_eos_token_id, - decoding_strategy=decoding_strategy, - ) - - return ids, output_scores - - -class InferBartDecoding(nn.Layer): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False): - if decoding_lib is not None and os.path.isfile(decoding_lib): - # Maybe it has been loadad by `ext_utils.load` - if "FastGeneration" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(decoding_lib) - LOADED_EXT["FastGeneration"] = ops - else: - if decoding_lib is not None: - logger.warning("The specified decoding_lib does not exist, and it will be built automatically.") - load("FastGeneration", verbose=True) - - super(InferBartDecoding, self).__init__() - for arg, value in locals().items(): - if arg not in ["self", "model", "word_embedding", "positional_embedding", "linear"]: - setattr(self, "_" + arg, value) - self._num_decoder_layers = model.bart.config["decoder_layers"] - self._n_head = model.bart.config["decoder_attention_heads"] - self._d_model = model.bart.config["d_model"] - - params = convert_params(self, model.get_decoder(), fuse_qkv=2, use_fp16=use_fp16_decoding, restore_data=True) - params["decoder_ln_weight"].append((model.decoder.decoder_layernorm_embedding, "weight")) - params["decoder_ln_bias"].append((model.decoder.decoder_layernorm_embedding, "bias")) - 
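`convert_params` here is called with fuse_qkv=2, and InferTransformerDecoding above allocates a single fused parameter for q/k/v. The fusion itself amounts to concatenating the three projections along the output dimension so one GEMM yields Q, K and V together; a rough NumPy sketch:

import numpy as np

def fuse_qkv_weights(q_w, k_w, v_w):
    # q_w, k_w, v_w: [hidden, hidden] each -> fused weight: [hidden, 3 * hidden]
    return np.concatenate([q_w, k_w, v_w], axis=-1)

def fuse_qkv_biases(q_b, k_b, v_b):
    # biases: [hidden] each -> fused bias: [3 * hidden]
    return np.concatenate([q_b, k_b, v_b], axis=0)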
params["word_emb"].append((model.decoder.embed_tokens, "weight")) - params["pos_emb"].append((model.decoder.decoder_embed_positions, "weight")) - params["linear_weight"].append((model.lm_head_weight.t(), False, partial(setattr, self, "lm_head_weight_"))) - params["linear_bias"].append((model.final_logits_bias, True, partial(setattr, self, "lm_head_bias_"))) - for k, v in params.items(): - setattr(self, k, v) - - def forward( - self, - enc_output, - memory_seq_lens, - beam_size=4, - top_k=1, - top_p=0.0, - temperature=1.0, - decoding_strategy="beam_search_v3", - max_out_len=256, - min_out_len=256, - diversity_rate=0.0, - rel_len=False, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - forced_eos_token_id=None, - alpha=0.6, - early_stopping=False, - ): - # beam_search/beam_search_v2/beam_search_v3 should be corrected to beam_search_v3. - if decoding_strategy.startswith("beam_search"): - decoding_strategy = "beam_search_v3" - elif decoding_strategy == "greedy_search": - decoding_strategy = "topk_sampling" - top_k = 1 - top_p = 0.0 - elif decoding_strategy in ["sampling", "topk_sampling", "topp_sampling"]: - if top_p == 1 and top_k > 0: - decoding_strategy = "topk_sampling" - top_p = 0.0 - elif top_p > 0 and top_k == 0: - decoding_strategy = "topp_sampling" - else: - raise AttributeError( - "Only topk sampling or topp sampling are supported. " - "Topk sampling and topp sampling cannot be both applied in the fast version. " - ) - - output_ids, parent_ids, sequence_length = infer_bart_decoding( - [enc_output], - [memory_seq_lens], - self.word_emb, - self.slf_ln_weight, - self.slf_ln_bias, - self.slf_q_weight, - self.slf_q_bias, - self.slf_k_weight, - self.slf_k_bias, - self.slf_v_weight, - self.slf_v_bias, - self.slf_out_weight, - self.slf_out_bias, - self.cross_ln_weight, - self.cross_ln_bias, - self.cross_q_weight, - self.cross_q_bias, - self.cross_k_weight, - self.cross_k_bias, - self.cross_v_weight, - self.cross_v_bias, - self.cross_out_weight, - self.cross_out_bias, - self.ffn_ln_weight, - self.ffn_ln_bias, - self.ffn_inter_weight, - self.ffn_inter_bias, - self.ffn_out_weight, - self.ffn_out_bias, - self.decoder_ln_weight, - self.decoder_ln_bias, - self.linear_weight, - self.linear_bias, - self.pos_emb, - decoding_strategy, - beam_size, - top_k, - top_p, - temperature, - self._n_head, - int(self._d_model / self._n_head), - self._num_decoder_layers, - bos_token_id, - eos_token_id, - max_out_len, - min_out_len, - -diversity_rate, - rel_len, - alpha, - early_stopping, - ) - - ids = finalize( - beam_size, - output_ids, - parent_ids, - sequence_length, - forced_eos_token_id=forced_eos_token_id, - decoding_strategy=decoding_strategy, - ) - return ids - - -class InferMBartDecoding(nn.Layer): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False, hidden_act="gelu"): - if decoding_lib is not None and os.path.isfile(decoding_lib): - # Maybe it has been loadad by `ext_utils.load` - if "FastGeneration" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(decoding_lib) - LOADED_EXT["FastGeneration"] = ops - else: - if decoding_lib is not None: - logger.warning("The specified decoding_lib does not exist, and it will be built automatically.") - load("FastGeneration", verbose=True) - - super(InferMBartDecoding, self).__init__() - for arg, value in locals().items(): - if arg not in ["self", "model", "word_embedding", "positional_embedding", "linear"]: - setattr(self, "_" + arg, value) - self._num_decoder_layers = 
model.mbart.config["decoder_layers"] - self._n_head = model.mbart.config["decoder_attention_heads"] - self._d_model = model.mbart.config["d_model"] - - # process weights - if use_fp16_decoding: - for mod in model.mbart.decoder.decoder.layers: - mod.norm1.weight = transfer_param(mod.norm1.weight, restore_data=True) - mod.norm1.bias = transfer_param(mod.norm1.bias, is_bias=True, restore_data=True) - mod.self_attn.q_proj.weight = transfer_param(mod.self_attn.q_proj.weight, restore_data=True) - mod.self_attn.q_proj.bias = transfer_param(mod.self_attn.q_proj.bias, is_bias=True, restore_data=True) - mod.self_attn.k_proj.weight = transfer_param(mod.self_attn.k_proj.weight, restore_data=True) - mod.self_attn.k_proj.bias = transfer_param(mod.self_attn.k_proj.bias, is_bias=True, restore_data=True) - mod.self_attn.v_proj.weight = transfer_param(mod.self_attn.v_proj.weight, restore_data=True) - mod.self_attn.v_proj.bias = transfer_param(mod.self_attn.v_proj.bias, is_bias=True, restore_data=True) - mod.self_attn.out_proj.weight = transfer_param(mod.self_attn.out_proj.weight, restore_data=True) - mod.self_attn.out_proj.bias = transfer_param( - mod.self_attn.out_proj.bias, is_bias=True, restore_data=True - ) - - mod.norm2.weight = transfer_param(mod.norm2.weight, restore_data=True) - mod.norm2.bias = transfer_param(mod.norm2.bias, is_bias=True, restore_data=True) - mod.cross_attn.q_proj.weight = transfer_param(mod.cross_attn.q_proj.weight, restore_data=True) - mod.cross_attn.q_proj.bias = transfer_param( - mod.cross_attn.q_proj.bias, is_bias=True, restore_data=True - ) - mod.cross_attn.k_proj.weight = transfer_param(mod.cross_attn.k_proj.weight, restore_data=True) - mod.cross_attn.k_proj.bias = transfer_param( - mod.cross_attn.k_proj.bias, is_bias=True, restore_data=True - ) - mod.cross_attn.v_proj.weight = transfer_param(mod.cross_attn.v_proj.weight, restore_data=True) - mod.cross_attn.v_proj.bias = transfer_param( - mod.cross_attn.v_proj.bias, is_bias=True, restore_data=True - ) - mod.cross_attn.out_proj.weight = transfer_param(mod.cross_attn.out_proj.weight, restore_data=True) - mod.cross_attn.out_proj.bias = transfer_param( - mod.cross_attn.out_proj.bias, is_bias=True, restore_data=True - ) - - mod.norm3.weight = transfer_param(mod.norm3.weight, restore_data=True) - mod.norm3.bias = transfer_param(mod.norm3.bias, is_bias=True, restore_data=True) - mod.linear1.weight = transfer_param(mod.linear1.weight, restore_data=True) - mod.linear1.bias = transfer_param(mod.linear1.bias, is_bias=True, restore_data=True) - mod.linear2.weight = transfer_param(mod.linear2.weight, restore_data=True) - mod.linear2.bias = transfer_param(mod.linear2.bias, is_bias=True, restore_data=True) - - model.decoder.decoder_layernorm_embedding.weight = transfer_param( - model.decoder.decoder_layernorm_embedding.weight, restore_data=True - ) - model.decoder.decoder_layernorm_embedding.bias = transfer_param( - model.decoder.decoder_layernorm_embedding.bias, is_bias=True, restore_data=True - ) - - model.decoder.decoder.norm.weight = transfer_param(model.decoder.decoder.norm.weight, restore_data=True) - model.decoder.decoder.norm.bias = transfer_param( - model.decoder.decoder.norm.bias, is_bias=True, restore_data=True - ) - - model.lm_head_weight = transfer_param(model.lm_head_weight, restore_data=True) - model.final_logits_bias = transfer_param(model.final_logits_bias, is_bias=True, restore_data=True) - - model.decoder.decoder_embed_positions.weight = transfer_param( - model.decoder.decoder_embed_positions.weight, restore_data=True - 
) - model.decoder.embed_tokens.weight = transfer_param(model.decoder.embed_tokens.weight, restore_data=True) - - self.slf_ln_weight = [] - self.slf_ln_bias = [] - self.slf_q_weight = [] - self.slf_q_bias = [] - self.slf_k_weight = [] - self.slf_k_bias = [] - self.slf_v_weight = [] - self.slf_v_bias = [] - self.slf_out_weight = [] - self.slf_out_bias = [] - - self.cross_ln_weight = [] - self.cross_ln_bias = [] - self.cross_q_weight = [] - self.cross_q_bias = [] - self.cross_k_weight = [] - self.cross_k_bias = [] - self.cross_v_weight = [] - self.cross_v_bias = [] - self.cross_out_weight = [] - self.cross_out_bias = [] - - self.ffn_ln_weight = [] - self.ffn_ln_bias = [] - self.ffn_inter_weight = [] - self.ffn_inter_bias = [] - self.ffn_out_weight = [] - self.ffn_out_bias = [] - - for mod in model.mbart.decoder.decoder.layers: - self.slf_ln_weight.append(mod.norm1.weight) - self.slf_ln_bias.append(mod.norm1.bias) - self.slf_q_weight.append(mod.self_attn.q_proj.weight) - self.slf_q_bias.append(mod.self_attn.q_proj.bias) - self.slf_k_weight.append(mod.self_attn.k_proj.weight) - self.slf_k_bias.append(mod.self_attn.k_proj.bias) - self.slf_v_weight.append(mod.self_attn.v_proj.weight) - self.slf_v_bias.append(mod.self_attn.v_proj.bias) - self.slf_out_weight.append(mod.self_attn.out_proj.weight) - self.slf_out_bias.append(mod.self_attn.out_proj.bias) - - self.cross_ln_weight.append(mod.norm2.weight) - self.cross_ln_bias.append(mod.norm2.bias) - self.cross_q_weight.append(mod.cross_attn.q_proj.weight) - self.cross_q_bias.append(mod.cross_attn.q_proj.bias) - self.cross_k_weight.append(mod.cross_attn.k_proj.weight) - self.cross_k_bias.append(mod.cross_attn.k_proj.bias) - self.cross_v_weight.append(mod.cross_attn.v_proj.weight) - self.cross_v_bias.append(mod.cross_attn.v_proj.bias) - self.cross_out_weight.append(mod.cross_attn.out_proj.weight) - self.cross_out_bias.append(mod.cross_attn.out_proj.bias) - - self.ffn_ln_weight.append(mod.norm3.weight) - self.ffn_ln_bias.append(mod.norm3.bias) - self.ffn_inter_weight.append(mod.linear1.weight) - self.ffn_inter_bias.append(mod.linear1.bias) - self.ffn_out_weight.append(mod.linear2.weight) - self.ffn_out_bias.append(mod.linear2.bias) - - self.decoder_ln_weight = [model.decoder.decoder.norm.weight] - self.decoder_ln_bias = [model.decoder.decoder.norm.bias] - - self.mbart_ln_weight = [model.decoder.decoder_layernorm_embedding.weight] - self.mbart_ln_bias = [model.decoder.decoder_layernorm_embedding.bias] - - self.pos_emb = [model.decoder.decoder_embed_positions.weight] - self.word_emb = [model.decoder.embed_tokens.weight] - - setattr(self, "lm_head_weight_", model.lm_head_weight.t()) - self.linear_weight = [getattr(self, "lm_head_weight_")] - self.linear_bias = [model.final_logits_bias] - - def forward( - self, - enc_output, - memory_seq_lens, - trg_word=None, - beam_size=4, - top_k=1, - top_p=0.0, - decoding_strategy="beam_search_v3", - max_out_len=256, - diversity_rate=0.0, - rel_len=False, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - alpha=0.6, - temperature=1.0, - early_stopping=False, - ): - # Beam_search/beam_search_v2/beam_search_v3 should be corrected to beam_search_v3. 
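The same strategy normalization recurs in the forward method of each of these wrappers (UnifiedTransformer, MIRO, BART, MBart, Pegasus): any beam_search* variant collapses to beam_search_v3, greedy search becomes top-k sampling with k=1, and top-k / top-p sampling are mutually exclusive. A compact sketch of that rule:

def normalize_strategy(decoding_strategy, top_k, top_p):
    if decoding_strategy.startswith("beam_search"):
        return "beam_search_v3", top_k, top_p
    if decoding_strategy == "greedy_search":
        return "topk_sampling", 1, 0.0
    if decoding_strategy in ["sampling", "topk_sampling", "topp_sampling"]:
        if top_p == 1 and top_k > 0:
            return "topk_sampling", top_k, 0.0
        if top_p > 0 and top_k == 0:
            return "topp_sampling", top_k, top_p
        raise AttributeError("Only top-k sampling or top-p sampling is supported, not both at once.")
    return decoding_strategy, top_k, top_p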
- if decoding_strategy.startswith("beam_search"): - decoding_strategy = "beam_search_v3" - elif decoding_strategy == "greedy_search": - decoding_strategy = "topk_sampling" - top_k = 1 - top_p = 0.0 - elif decoding_strategy in ["sampling", "topk_sampling", "topp_sampling"]: - if top_p == 1 and top_k > 0: - decoding_strategy = "topk_sampling" - top_p = 0.0 - elif top_p > 0 and top_k == 0: - decoding_strategy = "topp_sampling" - else: - raise AttributeError( - "Only topk sampling or topp sampling are supported. " - "Topk sampling and topp sampling cannot be both applied in the fast version. " - ) - output_ids, parent_ids, sequence_length = infer_mbart_decoding( - [enc_output], - [memory_seq_lens], - self.word_emb, - self.slf_ln_weight, - self.slf_ln_bias, - self.slf_q_weight, - self.slf_q_bias, - self.slf_k_weight, - self.slf_k_bias, - self.slf_v_weight, - self.slf_v_bias, - self.slf_out_weight, - self.slf_out_bias, - self.cross_ln_weight, - self.cross_ln_bias, - self.cross_q_weight, - self.cross_q_bias, - self.cross_k_weight, - self.cross_k_bias, - self.cross_v_weight, - self.cross_v_bias, - self.cross_out_weight, - self.cross_out_bias, - self.ffn_ln_weight, - self.ffn_ln_bias, - self.ffn_inter_weight, - self.ffn_inter_bias, - self.ffn_out_weight, - self.ffn_out_bias, - self.decoder_ln_weight, - self.decoder_ln_bias, - self.mbart_ln_weight, - self.mbart_ln_bias, - self.linear_weight, - self.linear_bias, - self.pos_emb, - trg_word, - decoding_strategy, - beam_size, - top_k, - top_p, - self._n_head, - int(self._d_model / self._n_head), - self._num_decoder_layers, - bos_token_id, - eos_token_id, - max_out_len, - -diversity_rate, - rel_len, - alpha, - temperature, - early_stopping, - self._hidden_act, - ) - - ids = finalize(beam_size, output_ids, parent_ids, sequence_length, decoding_strategy=decoding_strategy) - return ids - - -def convert_gptj_params(fast_model, model, fuse_qkv=1, use_fp16=False, restore_data=False, permutation=None): - r""" - Convert parameters included in Transformer layer from original models - to the format of faster models. - - Args: - fast_model (Layer): The faster model object. - model (Layer): The Transformer layer. - fuse_qkv (int): 0 for nofuse, 1 for fuse, 2 for fuse and delete the - unfused parameters. If environment variable `PPFG_QKV_MEM_OPT` is - set and the weights of q/k/v is fused, it will try to delete the - original unfused weights. Note the rollback to original model would - not be guarantee anymore when the faster model failed if the original - weights are deleted. Default to 1. - use_fp16 (bool): Whether to use float16. Maybe we should use the default - dtype as the highest priority later. Default to `False`. - restore_data (bool): If `False`, need to reload the weight values. It - should be `True` for weight loaded models. Default to `False`. - - Returns: - defaultdict: Each value is a list including converted parameters in all - layers. For other parameters not included in Transformer module to - be converted, such as embeddings, you can achieve it by using the - returned dict `params` though `params['word_emb'].append()` directly - which would do CPU/GPU and fp32/fp16 transfer automatically. - """ - if fuse_qkv == 1: - fuse_qkv = 2 if os.getenv("PPFG_QKV_MEM_OPT", "0") == "1" else 1 - ft_para_conf = get_ft_para_conf() - - class _list(list): - def append(self, item): - def attr_handle_func(x): - return x - - if isinstance(item[0], nn.Layer): - # Axis is used for tensor slice in tensor parallel. - # Use None to make no slice on the tensor. 
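The fuse_qkv argument documented above has three effective values, and value 1 is promoted to 2 when the PPFG_QKV_MEM_OPT environment variable is set, trading the ability to roll back to the original weights for lower memory use. A minimal sketch of that switch:

import os

def resolve_fuse_qkv(fuse_qkv=1):
    # 0: keep q/k/v separate, 1: fuse, 2: fuse and delete the unfused copies.
    if fuse_qkv == 1 and os.getenv("PPFG_QKV_MEM_OPT", "0") == "1":
        return 2
    return fuse_qkv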
- if len(item) == 2: - layer, attr = item - axis = None - else: - layer, attr, axis = item - param = getattr(layer, attr) - if axis is not None and isinstance(layer, nn.Linear): - param = ft_para_conf.slice_weight(param, axis) - param = transfer_param( - param, - is_bias=attr.endswith("bias"), - dtype="float16" if use_fp16 else "float32", - restore_data=restore_data, - ) - # NOTE: Assignment to parameter 'weight' should be of type - # Parameter or None, thus delete first in case of param is - # a tensor. - # TODO(guosheng): Make slice_weight use `output_param=True` - # and remove delattr. Currently, if `param` is Tensor rather - # than Parameter, it would not be in state_dict. - delattr(layer, attr) - setattr(layer, attr, param) - else: - # NOTE: Compared with if branch, there is no layer attribute - # refered to the transfered param, thus we should set it as - # the layer attribute to be able to convert to static graph. - # Additionally, we suppose no need to process tensor parallel - # here since the param passed in might have been processed. - if len(item) == 2: - param, is_bias = item - attr_handle = attr_handle_func - else: - param, is_bias, attr_handle = item - param = transfer_param( - param, is_bias=is_bias, dtype="float16" if use_fp16 else "float32", restore_data=restore_data - ) - attr_handle(param) - return super().append(param) - - params = defaultdict(_list) - - def _convert(module): - num_layer = len(module) - for i, layer in enumerate(module): - if not ft_para_conf.is_load(i, num_layer): - continue - # TODO(guosheng): Tensor with size 0 might be failed in - # paddle develop, thus use tensor with size 1 instead - # temporarily. Besides, we use 2D tensor since jit log - # requires that on linear weight. While size 0 seems all - # right in jit.to_static/jit.save. - dummy_tensor = paddle.zeros([1, 1]) - if permutation is not None: - qkv = layer.attn.qkv_proj.weight.numpy() - qkv = qkv[:, permutation] - if fuse_qkv == 2: - del layer.attn.qkv_proj.weight - setattr(layer.attn.qkv_proj, "weight", dummy_tensor) - w = paddle.to_tensor(qkv) - else: - w = _convert_qkv( - layer.attn.q_proj, - layer.attn.k_proj, - layer.attn.v_proj, - attr="weight", - use_numpy=fuse_qkv == 2, - del_param=fuse_qkv == 2, - dummy_tensor=dummy_tensor, - ) - params["slf_q_weight"].append((w, False)) - # NOTE: Use `params["slf_q_weight"][-1]` rather than `w`, - # since the appended tensor might be a new transfered tensor. - # Besides, to allow convert_params be called more than once, - # we find a attr name not existing to avoid overwriting the - # existing attr. - attr = "slf_q_weight_" + str(i) - while hasattr(fast_model, attr): - attr += "_" - setattr(fast_model, attr, params["slf_q_weight"][-1]) - - params["slf_out_weight"].append((layer.attn.out_proj, "weight", 0)) - params["slf_ln_weight"].append((layer.ln_1, "weight")) - params["slf_ln_bias"].append((layer.ln_1, "bias")) - # Slice tensor when append according to axis(1 or 0) if parallel - # is enable. 
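For CodeGen-style checkpoints, InferGptJDecoding below builds a column permutation so the interleaved blocks of the fused qkv projection are reordered into the layout the fused op expects; the slice qkv[:, permutation] above then applies it. A small sketch of the index construction, with a hypothetically tiny width just for illustration:

import numpy as np

n_embd = 12                      # hypothetical; real models take this from the config
local_dim = n_embd // 4
base_permutation = [0, 3, 6, 9, 2, 5, 8, 11, 1, 4, 7, 10]
permutation = np.concatenate([np.arange(i * local_dim, (i + 1) * local_dim) for i in base_permutation])
# permutation has 3 * n_embd entries, one per column of the fused qkv weight.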
- params["ffn_inter_weight"].append((layer.mlp.fc_in, "weight", 1)) - params["ffn_inter_bias"].append((layer.mlp.fc_in, "bias", 1)) - params["ffn_out_weight"].append((layer.mlp.fc_out, "weight", 0)) - params["ffn_out_bias"].append((layer.mlp.fc_out, "bias")) - - _convert(model) - return params - - -class InferGptJDecoding(nn.Layer): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False, transpose_qkv=False): - if decoding_lib is not None and os.path.isfile(decoding_lib): - if "FastGeneration" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(decoding_lib) - LOADED_EXT["FastGeneration"] = ops - else: - if decoding_lib is not None: - logger.warning("The specified decoding_lib does not exist, and it will be built automatically.") - load( - "FastGeneration" if get_ft_para_conf().no_para else "FasterTransformerParallel", - verbose=True, - need_parallel=not get_ft_para_conf().no_para, - ) - - super(InferGptJDecoding, self).__init__() - - self.use_fp16_decoding = use_fp16_decoding - self.model = model - self.head_num = self.model.transformer.config["n_head"] - self.size_per_head = int(self.model.transformer.config["n_embd"] / self.head_num) - self.num_layer = self.model.transformer.config["n_layer"] - self.rotary_embedding_dim = self.model.transformer.config["rotary_dim"] - logger.info("Converting model weights, it will cost a few seconds.....") - permutation = None - if transpose_qkv: - # GPTJ is different with CodeGen in attention project layer. - local_dim = self.model.transformer.config["n_embd"] // 4 - base_permutation = [0, 3, 6, 9, 2, 5, 8, 11, 1, 4, 7, 10] - permutation = np.concatenate([np.arange(i * local_dim, (i + 1) * local_dim) for i in base_permutation]) - params = convert_gptj_params( - self, - model.transformer.h, - fuse_qkv=2, - use_fp16=use_fp16_decoding, - restore_data=True, - permutation=permutation, - ) - - params["word_emb"].append((self.model.transformer.wte, "weight")) - params["decoder_ln_weight"].append((self.model.transformer.ln_f, "weight")) - params["decoder_ln_bias"].append((self.model.transformer.ln_f, "bias")) - params["linear_weight"].append((self.model.lm_head.weight.t(), partial(setattr, self, "linear_weight_out"))) - params["linear_bias"].append((self.model.lm_head, "bias")) - - for k, v in params.items(): - setattr(self, k, v) - logger.info("Already converted model weights.") - - def forward( - self, - input_ids, - mem_seq_len, - attention_mask=None, - topk=4, - topp=0.0, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - forced_eos_token_id=None, - max_out_len=256, - temperature=1, - repetition_penalty=1.0, - min_length=0, - ): - if attention_mask is None: - batch_size, input_length = input_ids.shape - attention_mask = paddle.unsqueeze((input_ids != pad_token_id).astype("float32"), axis=[1]) - causal_mask = paddle.tril(paddle.ones([batch_size, input_length, input_length], dtype="float32")) - attention_mask = paddle.logical_and(attention_mask, causal_mask) - if not self.use_fp16_decoding: - attention_mask = paddle.cast(attention_mask, dtype="float32") - else: - attention_mask = paddle.cast(attention_mask, dtype="float16") - - if self.use_fp16_decoding and attention_mask.dtype == paddle.float32: - attention_mask = paddle.cast(attention_mask, dtype="float16") - - (output_ids,) = infer_gptj_decoding( - input=[input_ids], - attn_mask=[attention_mask], - mem_seq_len=[mem_seq_len], - word_emb=self.word_emb, - slf_ln_weight=self.slf_ln_weight, - slf_ln_bias=self.slf_ln_bias, - 
slf_q_weight=self.slf_q_weight, - slf_out_weight=self.slf_out_weight, - ffn_inter_weight=self.ffn_inter_weight, - ffn_inter_bias=self.ffn_inter_bias, - ffn_out_weight=self.ffn_out_weight, - ffn_out_bias=self.ffn_out_bias, - decoder_ln_weight=self.decoder_ln_weight, - decoder_ln_bias=self.decoder_ln_bias, - linear_weight=self.linear_weight, - linear_bias=self.linear_bias, - topk=topk, - topp=topp, - max_out_len=max_out_len, - head_num=self.head_num, - size_per_head=self.size_per_head, - num_layer=self.num_layer, - bos_id=bos_token_id, - eos_id=eos_token_id, - temperature=temperature, - rotary_embedding_dim=self.rotary_embedding_dim, - repetition_penalty=repetition_penalty, - min_length=min_length, - use_fp16_decoding=self.use_fp16_decoding, - ) - - output_ids = output_ids[input_ids.shape[-1] :, :] - if forced_eos_token_id is not None: - output_ids[:, -1] = forced_eos_token_id - return output_ids - - -class InferPegasusDecoding(nn.Layer): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False, hidden_act="gelu"): - if decoding_lib is not None and os.path.isfile(decoding_lib): - # Maybe it has been loadad by `ext_utils.load` - if "FastGeneration" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(decoding_lib) - LOADED_EXT["FastGeneration"] = ops - else: - if decoding_lib is not None: - logger.warning("The specified decoding_lib does not exist, and it will be built automatically.") - load("FastGeneration", verbose=True) - - super(InferPegasusDecoding, self).__init__() - self._hidden_act = hidden_act - self._num_decoder_layers = model.pegasus.config["num_decoder_layers"] - self._n_head = model.pegasus.config["decoder_attention_heads"] - self._d_model = model.pegasus.config["d_model"] - - params = convert_params(self, model.decoder.decoder, fuse_qkv=2, use_fp16=use_fp16_decoding, restore_data=True) - - self.decoder_ln_weight = [ - transfer_param( - model.decoder.decoder_layernorm.weight, - is_bias=False, - dtype="float16" if use_fp16_decoding else "float32", - restore_data=True, - ) - ] - self.decoder_ln_bias = [ - transfer_param( - model.decoder.decoder_layernorm.bias, - is_bias=True, - dtype="float16" if use_fp16_decoding else "float32", - restore_data=True, - ) - ] - - self.pos_emb = [ - transfer_param( - model.decoder.decoder_embed_positions.weight, - is_bias=False, - dtype="float16" if use_fp16_decoding else "float32", - restore_data=True, - ) - ] - self.word_emb = [ - transfer_param( - model.decoder.embed_tokens.weight, - is_bias=False, - dtype="float16" if use_fp16_decoding else "float32", - restore_data=True, - ) - ] - setattr( - self, - "lm_head_weight_", - transfer_param( - model.lm_head_weight.t(), - is_bias=False, - dtype="float16" if use_fp16_decoding else "float32", - restore_data=True, - ), - ) - self.linear_weight = [getattr(self, "lm_head_weight_")] - self.linear_bias = [ - transfer_param( - model.final_logits_bias, - is_bias=True, - dtype="float16" if use_fp16_decoding else "float32", - restore_data=True, - ) - ] - for k, v in params.items(): - setattr(self, k, v) - - def forward( - self, - enc_output, - memory_seq_lens, - beam_size=4, - top_k=1, - top_p=0.0, - decoding_strategy="beam_search_v3", - max_out_len=256, - min_out_len=256, - diversity_rate=0.0, - rel_len=False, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - alpha=0.6, - temperature=1.0, - early_stopping=False, - forced_eos_token_id=None, - ): - # Beam_search/beam_search_v2/beam_search_v3 should be corrected to beam_search_v3. 
- if decoding_strategy.startswith("beam_search"): - decoding_strategy = "beam_search_v3" - elif decoding_strategy == "greedy_search": - decoding_strategy = "topk_sampling" - top_k = 1 - top_p = 0.0 - elif decoding_strategy in ["sampling", "topk_sampling", "topp_sampling"]: - if top_p == 1 and top_k > 0: - decoding_strategy = "topk_sampling" - top_p = 0.0 - elif top_p > 0 and top_k == 0: - decoding_strategy = "topp_sampling" - else: - raise AttributeError( - "Only topk sampling or topp sampling are supported. " - "Topk sampling and topp sampling cannot be both applied in the fast version. " - ) - output_ids, parent_ids, sequence_length = infer_pegasus_decoding( - [enc_output], - [memory_seq_lens], - self.word_emb, - self.slf_ln_weight, - self.slf_ln_bias, - self.slf_q_weight, - self.slf_q_bias, - self.slf_k_weight, - self.slf_k_bias, - self.slf_v_weight, - self.slf_v_bias, - self.slf_out_weight, - self.slf_out_bias, - self.cross_ln_weight, - self.cross_ln_bias, - self.cross_q_weight, - self.cross_q_bias, - self.cross_k_weight, - self.cross_k_bias, - self.cross_v_weight, - self.cross_v_bias, - self.cross_out_weight, - self.cross_out_bias, - self.ffn_ln_weight, - self.ffn_ln_bias, - self.ffn_inter_weight, - self.ffn_inter_bias, - self.ffn_out_weight, - self.ffn_out_bias, - self.decoder_ln_weight, - self.decoder_ln_bias, - self.linear_weight, - self.linear_bias, - self.pos_emb, - decoding_strategy, - beam_size, - top_k, - top_p, - self._n_head, - int(self._d_model / self._n_head), - self._num_decoder_layers, - bos_token_id, - eos_token_id, - max_out_len, - min_out_len, - diversity_rate, - rel_len, - alpha, - temperature, - early_stopping, - self._hidden_act, - ) - - ids = finalize( - beam_size, - output_ids, - parent_ids, - sequence_length, - forced_eos_token_id=forced_eos_token_id, - decoding_strategy=decoding_strategy, - ) - return ids - - -class InferT5Decoding(InferBase): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False): - - if decoding_lib is not None and os.path.isfile(decoding_lib): - # Maybe it has been loadad by `ext_utils.load` - if "FastGeneration" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(decoding_lib) - LOADED_EXT["FastGeneration"] = ops - else: - if decoding_lib is not None: - logger.warning("The specified decoding_lib does not exist, and it will be built automatically.") - load("FastGeneration", verbose=True) - - super(InferT5Decoding, self).__init__(use_fp16_decoding) - for arg, value in locals().items(): - if arg not in ["self", "model"]: - setattr(self, "_" + arg, value) - - self._num_decoder_layers = model.config.num_decoder_layers - self._n_head = model.config.num_heads - self._d_model = model.config.d_model - self._relative_attention_num_buckets = model.config.relative_attention_num_buckets - self.tie_word_embeddings = model.config.tie_word_embeddings - self.act = model.config.feed_forward_proj - - if "gelu" in self.act: - self.act = "gelu" - elif "relu" in self.act: - self.act = "relu" - else: - raise ValueError("Only gelu and relu are available in Faster. ") - - # NOTE: using config when support. 
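The branchy strategy handling above (repeated for both the Pegasus and T5 wrappers) reduces to a small mapping. The sketch below restates it as a standalone function for readability; it is an illustration inferred from the removed code, not part of the library API:

def normalize_strategy(strategy, top_k, top_p):
    # All beam-search variants are routed to the v3 kernel.
    if strategy.startswith("beam_search"):
        return "beam_search_v3", top_k, top_p
    # Greedy search is expressed as top-k sampling with k = 1.
    if strategy == "greedy_search":
        return "topk_sampling", 1, 0.0
    if strategy in ("sampling", "topk_sampling", "topp_sampling"):
        if top_p == 1 and top_k > 0:
            return "topk_sampling", top_k, 0.0
        if top_p > 0 and top_k == 0:
            return "topp_sampling", top_k, top_p
        raise AttributeError(
            "Top-k sampling and top-p sampling cannot both be applied in the fast version."
        )
    return strategy, top_k, top_p

assert normalize_strategy("beam_search_v2", 4, 0.0)[0] == "beam_search_v3"
assert normalize_strategy("greedy_search", 4, 0.9) == ("topk_sampling", 1, 0.0)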
- self._max_distance = 128 - - params = convert_params(self, model.t5.decoder, fuse_qkv=2, use_fp16=use_fp16_decoding, restore_data=True) - - self.decoder_ln_weight = [ - transfer_param( - model.t5.decoder.final_layer_norm.weight, - is_bias=False, - dtype="float16" if use_fp16_decoding else "float32", - restore_data=True, - ) - ] - - self.word_emb = [ - transfer_param( - model.t5.decoder.embed_tokens.weight, - is_bias=False, - dtype="float16" if use_fp16_decoding else "float32", - restore_data=True, - ) - ] - - if self.tie_word_embeddings: - setattr( - self, - "lm_head_weight_", - transfer_param( - model.t5.decoder.embed_tokens.weight.t(), - is_bias=False, - dtype="float16" if use_fp16_decoding else "float32", - restore_data=True, - ), - ) - else: - setattr( - self, - "lm_head_weight_", - transfer_param( - paddle.assign(model.lm_head.weight), - is_bias=False, - dtype="float16" if use_fp16_decoding else "float32", - restore_data=True, - ), - ) - - self.linear_weight = [getattr(self, "lm_head_weight_")] - self.linear_bias = self.default_bias(self.linear_weight, 1) - - setattr( - self, - "relative_attn_bias_w", - transfer_param( - model.t5.decoder.block[0].layer[0].SelfAttention.relative_attention_bias.weight, - is_bias=False, - dtype="float16" if use_fp16_decoding else "float32", - restore_data=True, - ), - ) - self.relative_attention_bias_weight = [getattr(self, "relative_attn_bias_w")] - for k, v in params.items(): - setattr(self, k, v) - - self.zeros_t = paddle.zeros(shape=[1, 1], dtype="float16" if use_fp16_decoding else "float32") - if getattr(self, "slf_k_weight", None) is None: - self.slf_k_weight = [self.zeros_t] * model.t5.config["num_decoder_layers"] - if getattr(self, "slf_v_weight", None) is None: - self.slf_v_weight = [self.zeros_t] * model.t5.config["num_decoder_layers"] - - def forward( - self, - enc_output, - memory_seq_lens, - beam_size=4, - top_k=1, - top_p=0.0, - decoding_strategy="beam_search_v3", - max_out_len=256, - diversity_rate=0.0, - rel_len=False, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - alpha=0.6, - temperature=1.0, - early_stopping=False, - ): - # Beam_search/beam_search_v2/beam_search_v3 should be corrected to beam_search_v3. - if decoding_strategy.startswith("beam_search"): - decoding_strategy = "beam_search_v3" - elif decoding_strategy == "greedy_search": - decoding_strategy = "topk_sampling" - top_k = 1 - top_p = 0.0 - elif decoding_strategy in ["sampling", "topk_sampling", "topp_sampling"]: - if top_p == 1 and top_k > 0: - decoding_strategy = "topk_sampling" - top_p = 0.0 - elif top_p > 0 and top_k == 0: - decoding_strategy = "topp_sampling" - else: - raise AttributeError( - "Only topk sampling or topp sampling are supported. " - "Topk sampling and topp sampling cannot be both applied in the fast version. 
" - ) - - output_ids, parent_ids, sequence_length = infer_t5_decoding( - enc_output=[enc_output], - memory_seq_lens=[memory_seq_lens], - word_emb=self.word_emb, - slf_ln_weight=self.slf_ln_weight, - slf_ln_bias=getattr(self, "slf_ln_bias", self.default_bias(self.slf_ln_weight, 0, True)), - slf_q_weight=self.slf_q_weight, - slf_q_bias=getattr(self, "slf_q_bias", self.default_bias(self.slf_q_weight, 1)), - slf_k_weight=self.slf_k_weight, - slf_k_bias=getattr(self, "slf_k_bias", self.default_bias(self.slf_k_weight, 1)), - slf_v_weight=self.slf_v_weight, - slf_v_bias=getattr(self, "slf_v_bias", self.default_bias(self.slf_v_weight, 1)), - slf_out_weight=self.slf_out_weight, - slf_out_bias=getattr(self, "slf_out_bias", self.default_bias(self.slf_out_weight, 1)), - relative_attention_bias_weight=self.relative_attention_bias_weight, - cross_ln_weight=self.cross_ln_weight, - cross_ln_bias=getattr(self, "cross_ln_bias", self.default_bias(self.cross_ln_weight, 0, True)), - cross_q_weight=self.cross_q_weight, - cross_q_bias=getattr(self, "cross_q_bias", self.default_bias(self.cross_q_weight, 1)), - cross_k_weight=self.cross_k_weight, - cross_k_bias=getattr(self, "cross_k_bias", self.default_bias(self.cross_k_weight, 1)), - cross_v_weight=self.cross_v_weight, - cross_v_bias=getattr(self, "cross_v_bias", self.default_bias(self.cross_v_weight, 1)), - cross_out_weight=self.cross_out_weight, - cross_out_bias=getattr(self, "cross_out_bias", self.default_bias(self.cross_out_weight, 1)), - ffn_ln_weight=self.ffn_ln_weight, - ffn_ln_bias=getattr(self, "ffn_ln_bias", self.default_bias(self.ffn_ln_weight, 0, True)), - ffn_inter_weight_0=self.ffn_inter_weight_0, - ffn_inter_bias_0=getattr(self, "ffn_inter_bias_0", self.default_bias(self.ffn_inter_weight_0, 1)), - ffn_inter_weight_1=getattr( - self, "ffn_inter_weight_1", self.default_bias(self.ffn_inter_weight_0, 1, True) - ), - ffn_inter_bias_1=getattr(self, "ffn_inter_bias_1", self.default_bias(self.ffn_inter_weight_1, 1)) - if hasattr(self, "ffn_inter_weight_1") - else getattr(self, "ffn_inter_bias_1", self.default_bias(self.ffn_inter_weight_0, 1, True)), - ffn_out_weight=self.ffn_out_weight, - ffn_out_bias=getattr(self, "ffn_out_bias", self.default_bias(self.ffn_out_weight, 1)), - decoder_ln_weight=self.decoder_ln_weight, - decoder_ln_bias=getattr(self, "decoder_ln_bias", self.default_bias(self.decoder_ln_weight, 0, True)), - linear_weight=self.linear_weight, - linear_bias=getattr(self, "linear_bias", self.default_bias(self.linear_weight, 1)), - decoding_strategy=decoding_strategy, - beam_size=beam_size, - top_k=top_k, - top_p=top_p, - head_num=self._n_head, - size_per_head=int(self._d_model / self._n_head), - num_decoder_layers=self._num_decoder_layers, - start_id=bos_token_id, - end_id=eos_token_id, - max_out_len=max_out_len, - diversity_rate=-diversity_rate, - rel_len=rel_len, - alpha=alpha, - temperature=temperature, - early_stopping=early_stopping, - max_distance=self._max_distance, - relative_attention_num_buckets=self._relative_attention_num_buckets, - tie_word_embeddings=self.tie_word_embeddings, - act=self.act, - ) - - ids = finalize(beam_size, output_ids, parent_ids, sequence_length, decoding_strategy=decoding_strategy) - - return ids diff --git a/paddlenlp/ops/fast_transformer/transformer/encoder.py b/paddlenlp/ops/fast_transformer/transformer/encoder.py deleted file mode 100644 index da14723dbb54..000000000000 --- a/paddlenlp/ops/fast_transformer/transformer/encoder.py +++ /dev/null @@ -1,456 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -from paddle.nn import TransformerEncoder, TransformerEncoderLayer - -from paddlenlp.ops.ext_utils import LOADED_EXT, load -from paddlenlp.ops.fast_transformer.transformer.decoding import transfer_param -from paddlenlp.utils.log import logger - -from .decoding import run_custom - - -def infer_transformer_encoder( - input, - attn_mask, - q_weight, - q_bias, - k_weight, - k_bias, - v_weight, - v_bias, - attn_out_weight, - attn_out_bias, - norm1_weight, - norm1_bias, - norm2_weight, - norm2_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - # sequence_id_offset, - # trt_seqlen_offset, - # amax_list, - n_head, - size_per_head, - n_layer=12, - use_gelu=True, - remove_padding=False, - int8_mode=0, - layer_idx=0, - allow_gemm_test=False, - use_trt_kernel=False, - normalize_before=False, -): - """ - Fusion Encoder API intergrating Encoder inference in FastGeneration. It - accepts the weight and bias of TransformerEncoder and some other parameters - for inference. - """ - inputs_names = [ - "Input", - "SelfAttnMask", - "SelfQueryWeight@VECTOR", - "SelfQueryBias@VECTOR", - "SelfKeyWeight@VECTOR", - "SelfKeyBias@VECTOR", - "SelfValueWeight@VECTOR", - "SelfValueBias@VECTOR", - "SelfAttnOutputWeight@VECTOR", - "SelfAttnOutputBias@VECTOR", - "SelfAttnOutputLayernormWeight@VECTOR", - "SelfAttnOutputLayernormBias@VECTOR", - "OutputLayernormWeight@VECTOR", - "OutputLayernormBias@VECTOR", - "FFNInterWeight@VECTOR", - "FFNInterBias@VECTOR", - "FFNOutputWeight@VECTOR", - "FFNOutputBias@VECTOR", - # 'SequenceIdOffset', - # "TRTSeqLenOffset", - # 'AmaxList' - ] - - inputs_var = [ - input, - attn_mask, - q_weight, - q_bias, - k_weight, - k_bias, - v_weight, - v_bias, - attn_out_weight, - attn_out_bias, - norm1_weight, - norm1_bias, - norm2_weight, - norm2_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - # 'SequenceIdOffset': sequence_id_offset, - # "TRTSeqLenOffset": trt_seqlen_offset, - # 'AmaxList': amax_list - ] - - attrs_names = [ - "head_num", - "size_per_head", - "use_gelu", - "remove_padding", - "int8_mode", - "num_layer", - "layer_idx", - "allow_gemm_test", - "use_trt_kernel", - "normalize_before", - ] - - attrs_val = [ - n_head, - size_per_head, - use_gelu, - remove_padding, - int8_mode, - n_layer, - layer_idx, - allow_gemm_test, - use_trt_kernel, - normalize_before, - ] - - outputs_names = ["EncoderOut"] - - outputs_dtype = [input[0].dtype] - - return run_custom("fusion_encoder", inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype) - - -def encoder_layer_forward(self, src, src_mask, cache=None, sequence_id_offset=None, trt_seq_len=None): - """ - Redefines `forward` function of `paddle.nn.TransformerEncoderLayer` for - integrating FastGeneration for inference. - - The original `forward` function would not be replaced unless - `enable_fast_encoder` is called by objects of its base class. 
After - replacing, objects of `paddle.nn.TransformerEncoderLayer` also have the - same member variables as before. - - After inference, `disable_fast_encoder` could be called to restore the - `forward` function of `paddle.nn.TransformerEncoder` and - `paddle.nn.TransformerEncoderLayer`. - - Args: - src (Tensor): - The input of Transformer encoder layer. It is a tensor with shape - `[batch_size, sequence_length, d_model]`. The data type should be - float32 or float64. - src_mask (Tensor, optional): - A tensor used in multi-head attention to prevents attention to some - unwanted positions, usually the paddings or the subsequent - positions. It is a tensor with shape `[batch_size, 1, 1, sequence_length]`. - When the data type is bool, the unwanted positions have `False` - values and the others have `True` values. When the data type is int, - the unwanted positions have 0 values and the others have 1 values. - When the data type is float, the unwanted positions have `-INF` - values and the others have 0 values. It can be None when nothing - wanted or needed to be prevented attention to. Defaults to None. - - Returns: - src(Tensor|tuple): - It is a tensor that has the same shape and data type as `enc_input`, - representing the output of Transformer encoder layer. Or a tuple if - `cache` is not None, except for encoder layer output, the tuple - includes the new cache which is same as input `cache` argument but - `incremental_cache` has an incremental length. See - `paddle.nn.MultiHeadAttention.gen_cache` and - `paddle.nn.MultiHeadAttention.forward` for more details. - """ - if cache is not None: - raise NotImplementedError("cache in encoder is not supported now") - - src = infer_transformer_encoder( - input=[src], - attn_mask=[src_mask], - q_weight=[self.self_attn.q_proj.weight], - q_bias=[self.self_attn.q_proj.bias], - k_weight=[self.self_attn.k_proj.weight], - k_bias=[self.self_attn.k_proj.bias], - v_weight=[self.self_attn.v_proj.weight], - v_bias=[self.self_attn.v_proj.bias], - attn_out_weight=[self.self_attn.out_proj.weight], - attn_out_bias=[self.self_attn.out_proj.bias], - norm1_weight=[self.norm1.weight], - norm1_bias=[self.norm1.bias], - norm2_weight=[self.norm2.weight], - norm2_bias=[self.norm2.bias], - ffn_inter_weight=[self.linear1.weight], - ffn_inter_bias=[self.linear1.bias], - ffn_out_weight=[self.linear2.weight], - ffn_out_bias=[self.linear2.bias], - # sequence_id_offset=paddle.to_tensor([]), - # trt_seqlen_offset=paddle.to_tensor([]), - # amax_list=paddle.to_tensor([]), # int8 mode is not supported. - n_head=self._config["nhead"], - size_per_head=self._config["d_model"] // self._config["nhead"], - use_gelu=self._config["activation"] == "gelu", - normalize_before=self._config["normalize_before"] is True, - ) - - return src - - -def encoder_forward(self, src, src_mask=None, cache=None): - """ - Redefines `forward` function of `paddle.nn.TransformerEncoder` for - integrating FastGeneration for inference. - - The original `forward` function would not be replaced unless - `enable_fast_encoder` is called by objects of its base class. After - replacing, objects of `paddle.nn.TransformerEncoder` also have the same - member variables as before. - - After inference, `disable_fast_encoder` could be called to restore the - `forward` function of `paddle.nn.TransformerEncoder` and - `paddle.nn.TransformerEncoderLayer`. - - Args: - src (Tensor): - The input of Transformer encoder. It is a tensor - with shape `[batch_size, sequence_length, d_model]`. The data - type should be float32 or float16. 
- src_mask (Tensor, optional): - A tensor used in multi-head attention to prevents attention to - some unwanted positions, usually the paddings or the subsequent - positions. It is a tensor with shape `[batch_size, 1, 1, sequence_length]`. - The data type must be float, the unwanted positions have `-INF` values or other non-zeros - and the wanted positions must be 0.0. - Returns: - output (Tensor|tuple): - It is a tensor that has the same shape and data type as `src`, - representing the output of Transformer encoder. Or a tuple if - `cache` is not None, except for encoder output, the tuple includes - the new cache which is same as input `cache` argument but - `incremental_cache` in it has an incremental length. See - `paddle.nn.MultiHeadAttention.gen_cache` and - `paddle.nn.MultiHeadAttention.forward` for more details. - """ - if cache is not None: - raise NotImplementedError("cache in encoder is not supported now") - - if src_mask.dtype == paddle.float16: - src_mask = paddle.cast(src_mask, dtype="float32") - src_mask = src_mask == 0.0 - if src_mask.dtype != src.dtype: - src_mask = paddle.cast(src_mask, src.dtype) - - if len(src_mask.shape) == 4: - # transpose_src_mask: [batch_size, 1, sequence_length, 1] - transpose_src_mask = paddle.transpose(src_mask, perm=[0, 1, 3, 2]) - # src_mask: [batch_size, 1, sequence_length, sequence_length] - src_mask = src_mask * transpose_src_mask - - if getattr(self, "q_weight", None) is None: - self.q_weight = [] - self.q_bias = [] - self.k_weight = [] - self.k_bias = [] - self.v_weight = [] - self.v_bias = [] - self.attn_out_weight = [] - self.attn_out_bias = [] - self.norm1_weight = [] - self.norm1_bias = [] - self.norm2_weight = [] - self.norm2_bias = [] - self.ffn_inter_weight = [] - self.ffn_inter_bias = [] - self.ffn_out_weight = [] - self.ffn_out_bias = [] - for layer in self.layers: - self.q_weight.append(layer.self_attn.q_proj.weight) - self.q_bias.append(layer.self_attn.q_proj.bias) - self.k_weight.append(layer.self_attn.k_proj.weight) - self.k_bias.append(layer.self_attn.k_proj.bias) - self.v_weight.append(layer.self_attn.v_proj.weight) - self.v_bias.append(layer.self_attn.v_proj.bias) - self.attn_out_weight.append(layer.self_attn.out_proj.weight) - self.attn_out_bias.append(layer.self_attn.out_proj.bias) - self.norm1_weight.append(layer.norm1.weight) - self.norm1_bias.append(layer.norm1.bias) - self.norm2_weight.append(layer.norm2.weight) - self.norm2_bias.append(layer.norm2.bias) - self.ffn_inter_weight.append(layer.linear1.weight) - self.ffn_inter_bias.append(layer.linear1.bias) - self.ffn_out_weight.append(layer.linear2.weight) - self.ffn_out_bias.append(layer.linear2.bias) - - output = infer_transformer_encoder( - input=[src], - attn_mask=[src_mask], - q_weight=self.q_weight, - q_bias=self.q_bias, - k_weight=self.k_weight, - k_bias=self.k_bias, - v_weight=self.v_weight, - v_bias=self.v_bias, - attn_out_weight=self.attn_out_weight, - attn_out_bias=self.attn_out_bias, - norm1_weight=self.norm1_weight, - norm1_bias=self.norm1_bias, - norm2_weight=self.norm2_weight, - norm2_bias=self.norm2_bias, - ffn_inter_weight=self.ffn_inter_weight, - ffn_inter_bias=self.ffn_inter_bias, - ffn_out_weight=self.ffn_out_weight, - ffn_out_bias=self.ffn_out_bias, - # sequence_id_offset=paddle.to_tensor([]), - # trt_seqlen_offset=paddle.to_tensor([]), - # amax_list=paddle.to_tensor([]), # int8 mode is not supported. 
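`encoder_forward` above turns the additive attention mask (0 for wanted positions, a large negative value for padding) into a 0/1 mask and, for 4D inputs, expands `[batch, 1, 1, seq_len]` into `[batch, 1, seq_len, seq_len]` by multiplying with its own transpose. A small numpy sketch of the same arithmetic:

import numpy as np

additive_mask = np.array([[[[0.0, 0.0, -1e9]]]], dtype="float32")   # [1, 1, 1, 3]
binary_mask = (additive_mask == 0.0).astype("float32")              # 1 = attend, 0 = masked
transposed = np.transpose(binary_mask, (0, 1, 3, 2))                # [1, 1, 3, 1]
full_mask = binary_mask * transposed                                 # [1, 1, 3, 3]

assert full_mask.shape == (1, 1, 3, 3)
assert full_mask[0, 0, 0, 2] == 0.0   # nothing may attend to the padded position
assert full_mask[0, 0, 2, 0] == 0.0   # and the padded position attends to nothing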
- n_head=self.layers[0]._config["nhead"], - size_per_head=self.layers[0]._config["d_model"] // self.layers[0]._config["nhead"], - use_gelu=self.layers[0]._config["activation"] == "gelu", - normalize_before=self.layers[0]._config["normalize_before"] is True, - ) - - if self.norm is not None: - output = self.norm(output) - return output - - -def enable_fast_encoder(self, use_fp16=False, encoder_lib=None): - """ - Compiles fusion encoder operator intergrated FastGeneration using the - method of JIT(Just-In-Time) and replaces the `forward` function of - `paddle.nn.TransformerEncoder` and `paddle.nn.TransformerEncoderLayer` - objects inherited from `self` to support inference using FastGeneration. - - Examples: - - .. code-block:: python - - from paddlenlp.ops import enable_fast_encoder, disable_fast_encoder - - model.eval() - model = enable_fast_encoder(model) - enc_out = model(src, src_mask) - model = disable_fast_encoder(model) - """ - - def init_func(layer): - if isinstance(layer, TransformerEncoderLayer): - is_usable = True - if layer._config["bias_attr"] is False: - logger.warning( - "`False` for paddle.nn.TransformerEncoder's" - " parameter `bias_attr` is not supported in " - "FastGeneration by now. The original forward" - " will be involved." - ) - is_usable = False - if layer._config["activation"] not in ("relu", "gelu"): - logger.warning("Only 'relu' or 'gelu' is supported by now. " "The original forward will be involved.") - is_usable = False - if is_usable: - layer.forward = layer._ft_forward - elif isinstance(layer, TransformerEncoder): - layer.forward = layer._ft_forward - if use_fp16: - convert_to_fp16(layer) - - if not self.training: - try: - # Pass decoding lib to prevent re-building encoder. - # Todo: check weather decoding lib have contained encoder or not. - if encoder_lib is not None: - if "FastGeneration" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(encoder_lib) - LOADED_EXT["FastGeneration"] = ops - else: - load("FastGeneration", verbose=True) - except Exception: - logger.warning("Exception occurs when using FasterEncoder. " "The original forward will be involved. ") - return self - for layer in self.children(): - layer.apply(init_func) - return self - - -def disable_fast_encoder(self): - """ - Restores the original `forward` function of `paddle.nn.TransformerEncoder` - and `paddle.nn.TransformerEncoderLayer` objects inherited from `self`. - - Examples: - - .. code-block:: python - - from paddlenlp.ops import enable_fast_encoder, disable_fast_encoder - - model.eval() - model = enable_fast_encoder(model) - enc_out = model(src, src_mask) - model = disable_fast_encoder(model) - """ - - def init_func(layer): - if isinstance(layer, (TransformerEncoderLayer, TransformerEncoder)): - layer.forward = layer._ori_forward - - for layer in self.children(): - layer.apply(init_func) - return self - - -def convert_to_fp16(transformer_encoder): - """Convert paddle.nn.TransformerEncoder's parameter from float32 to float16 - - Args: - transformer_encoder (obeject, paddle.nn.TransformerEncoder): - The object to be converted to float16 inplaced, it must be an isinstance - of paddle.nn.TransformerEncoder. 
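`enable_fast_encoder` and `disable_fast_encoder` above work by swapping the bound `forward` of each encoder layer rather than by subclassing. A dependency-free sketch of that pattern (the `_ft_forward`/`_ori_forward` attribute names follow the removed code, while the toy class and helper functions are purely illustrative):

class ToyEncoderLayer:
    def forward(self, x):
        return ("original", x)

    def _ft_forward(self, x):
        return ("fast", x)

def enable_fast(layer):
    layer._ori_forward = layer.forward   # keep the original bound method around
    layer.forward = layer._ft_forward    # route calls to the fused implementation
    return layer

def disable_fast(layer):
    layer.forward = layer._ori_forward   # restore the original behaviour
    return layer

layer = enable_fast(ToyEncoderLayer())
assert layer.forward(1)[0] == "fast"
layer = disable_fast(layer)
assert layer.forward(1)[0] == "original"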
- """ - if not isinstance(transformer_encoder, paddle.nn.TransformerEncoder): - logger.warning( - "transformer_encoder is not isinstance of paddle.nn.TransformerEncoder, return itself with no parameters convertion.".format - ) - return transformer_encoder - else: - encoder_layers = transformer_encoder.layers - - for mod in encoder_layers: - mod.norm1.weight = transfer_param(mod.norm1.weight, restore_data=True) - mod.norm1.bias = transfer_param(mod.norm1.bias, is_bias=True, restore_data=True) - mod.norm2.weight = transfer_param(mod.norm2.weight, restore_data=True) - mod.norm2.bias = transfer_param(mod.norm2.bias, is_bias=True, restore_data=True) - - mod.linear1.weight = transfer_param(mod.linear1.weight, restore_data=True) - mod.linear1.bias = transfer_param(mod.linear1.bias, is_bias=True, restore_data=True) - - mod.self_attn.q_proj.weight = transfer_param(mod.self_attn.q_proj.weight, restore_data=True) - mod.self_attn.q_proj.bias = transfer_param(mod.self_attn.q_proj.bias, is_bias=True, restore_data=True) - mod.self_attn.k_proj.weight = transfer_param(mod.self_attn.k_proj.weight, restore_data=True) - mod.self_attn.k_proj.bias = transfer_param(mod.self_attn.k_proj.bias, is_bias=True, restore_data=True) - mod.self_attn.v_proj.weight = transfer_param(mod.self_attn.v_proj.weight, restore_data=True) - mod.self_attn.v_proj.bias = transfer_param(mod.self_attn.v_proj.bias, is_bias=True, restore_data=True) - mod.self_attn.out_proj.weight = transfer_param(mod.self_attn.out_proj.weight, restore_data=True) - mod.self_attn.out_proj.bias = transfer_param(mod.self_attn.out_proj.bias, is_bias=True, restore_data=True) - - mod.linear2.weight = transfer_param(mod.linear2.weight, restore_data=True) - mod.linear2.bias = transfer_param(mod.linear2.bias, is_bias=True, restore_data=True) - logger.info("Convert transformer_encoder's parameters from float32 to float16 succeessfully.") diff --git a/paddlenlp/ops/fast_transformer/transformer/fast_transformer.py b/paddlenlp/ops/fast_transformer/transformer/fast_transformer.py deleted file mode 100644 index b7b87c47a4c2..000000000000 --- a/paddlenlp/ops/fast_transformer/transformer/fast_transformer.py +++ /dev/null @@ -1,2021 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import os -import shutil - -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from paddlenlp.ops import ( - InferBartDecoding, - InferGptDecoding, - InferGptJDecoding, - InferMBartDecoding, - InferMIRODecoding, - InferOptDecoding, - InferPegasusDecoding, - InferT5Decoding, - InferTransformerDecoding, - InferUnifiedDecoding, -) -from paddlenlp.transformers import ( - BartPretrainedModel, - CodeGenPreTrainedModel, - GPTChineseTokenizer, - GPTJPretrainedModel, - GPTPretrainedModel, - GPTTokenizer, - InferTransformerModel, - MBartPretrainedModel, - OPTPretrainedModel, - PegasusPretrainedModel, - PositionalEmbedding, - T5PretrainedModel, - TransformerModel, - UnifiedTransformerPretrainedModel, - UNIMOPretrainedModel, - WordEmbedding, - position_encoding_init, -) -from paddlenlp.utils.log import logger - -from .encoder import enable_fast_encoder - - -class FasterTransformer(TransformerModel): - """ - FasterTransformer is a fast version for generation with the Transformer - model. It uses a custom op based on and enhancing NV FasterTransformer to - do fast generation. - - Args: - src_vocab_size (int): - The size of source vocabulary. - trg_vocab_size (int): - The size of target vocabulary. - max_length (int): - The maximum length of input sequences. - num_encoder_layers (int): - The number of sub-layers to be stacked in the encoder. - num_decoder_layers (int): - The number of sub-layers to be stacked in the decoder. - n_head (int): - The number of head used in multi-head attention. - d_model (int): - The dimension for word embeddings, which is also the last dimension of - the input and output of multi-head attention, position-wise feed-forward - networks, encoder and decoder. - d_inner_hid (int): - Size of the hidden layer in position-wise feed-forward networks. - dropout (float): - Dropout rates. Used for pre-process, activation and inside attention. - weight_sharing (bool): - Whether to use weight sharing. - attn_dropout (float): - The dropout probability used in MHA to drop some attention target. - If None, use the value of dropout. Defaults to None. - act_dropout (float): - The dropout probability used after FFN activition. If None, use - the value of dropout. Defaults to None. - bos_id (int, optional): - The start token id and also is used as padding id. Defaults to 0. - eos_id (int, optional): - The end token id. Defaults to 1. - pad_id (int, optional): - The pad token id. Defaults to None. If it's None, the bos_id will be used as pad_id. - decoding_strategy (str, optional): - Indicating the strategy of decoding. It can be 'beam_search', 'beam_search_v2', - 'topk_sampling' and 'topp_sampling'. For beam search strategies, - 'v2' would select the top `beam_size * 2` beams and process the top - `beam_size` alive and finish beams in them separately, while 'v1' - would only select the top `beam_size` beams and mix up the alive and - finish beams. 'v2' always searchs more and get better results, since - the alive beams would always be `beam_size` while the number of alive - beams in `v1` might decrease when meeting the end token. However, - 'v2' always generates longer results thus might do more calculation - and be slower. - beam_size (int, optional): - The beam width for beam search. Defaults to 4. - topk (int, optional): - The number of highest probability tokens to keep for top-k sampling. - Defaults to 4. - topp (float, optional): - The most probable tokens whose cumulative probability is not less than - `topp` are kept for top-p sampling. 
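The `topp` argument documented above keeps the smallest set of most-probable tokens whose cumulative probability reaches the threshold. A numpy sketch of that selection rule (illustration only, not the removed CUDA sampling kernel):

import numpy as np

def nucleus_candidates(probs, p):
    order = np.argsort(probs)[::-1]              # most probable first
    cumulative = np.cumsum(probs[order])
    cutoff = np.searchsorted(cumulative, p) + 1  # smallest prefix reaching p
    return order[:cutoff]

probs = np.array([0.5, 0.3, 0.1, 0.1])
assert set(nucleus_candidates(probs, p=0.7)) == {0, 1}   # 0.5 + 0.3 already covers 0.7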
Defaults to 4. - max_out_len (int, optional): - The maximum output length. Defaults to 256. - diversity_rate (float, optional): - Refer to `A Simple, Fast Diverse Decoding Algorithm for Neural Generation `_ - for details. Bigger `diversity_rate` would lead to more diversity. - if `diversity_rate == 0` is equivalent to naive BeamSearch. Default - to 0 if not set. - use_fp16_decoding(bool, optional): - Whether to use fp16 for decoding. - enable_fast_encoder(bool, optional): - Whether to use the fast version of encoder. This is experimental option for now. - Defaults to False. - use_fp16_encoder(bool, optional): - Whether to use fp16 for encoder. Only works when enable_fast_encoder is True. - Defaults to False. - rel_len(bool, optional): - Indicating whether `max_out_len` in is the length relative to that - of source text. Only works in `v2` temporarily. It is suggest to set - a small `max_out_len` and use `rel_len=True`. Default to False if - not set. - alpha(float, optional): - The power number in length penalty calculation. Only works in `v2` - temporarily. Refer to `GNMT `_. - Default to 0.6 if not set. - """ - - def __init__( - self, - src_vocab_size, - trg_vocab_size, - max_length, - num_encoder_layers, - num_decoder_layers, - n_head, - d_model, - d_inner_hid, - dropout, - weight_sharing, - attn_dropout=None, - act_dropout=None, - bos_id=0, - eos_id=1, - pad_id=None, - decoding_strategy="beam_search", - beam_size=4, - topk=1, - topp=0.0, - max_out_len=256, - diversity_rate=0.0, - decoding_lib=None, - use_fp16_decoding=False, - enable_fast_encoder=False, - use_fp16_encoder=False, - rel_len=False, - alpha=0.6, - ): - # if decoding_lib is None: - # raise ValueError( - # "The args decoding_lib must be set to use FasterTransformer. ") - # elif not os.path.exists(decoding_lib): - # raise ValueError("The path to decoding lib is not exist.") - - args = dict(locals()) - args.pop("self") - args.pop("__class__", None) - self.decoding_strategy = args.pop("decoding_strategy") - self.beam_size = args.pop("beam_size") - self.topk = args.pop("topk") - self.topp = args.pop("topp") - self.max_out_len = args.pop("max_out_len") - self.diversity_rate = args.pop("diversity_rate") - self.decoding_lib = args.pop("decoding_lib") - self.use_fp16_decoding = args.pop("use_fp16_decoding") - self.enable_fast_encoder = args.pop("enable_fast_encoder") - self.use_fp16_encoder = args.pop("use_fp16_encoder") - self.rel_len = args.pop("rel_len") - self.alpha = args.pop("alpha") - self.dropout = dropout - self.weight_sharing = weight_sharing - self.trg_vocab_size = trg_vocab_size - self.d_model = d_model - self.bos_id = bos_id - self.pad_id = pad_id if pad_id is not None else self.bos_id - self.max_length = max_length - super(FasterTransformer, self).__init__(**args) - - if self.enable_fast_encoder: - logger.warning("enable_fast_encoder is an experimental option and subject to change.") - elif self.use_fp16_encoder: - self.use_fp16_encoder = False - - self.decoding_linear = nn.Linear(in_features=d_model, out_features=trg_vocab_size) - - if weight_sharing: - self.trg_word_embedding = WordEmbedding(vocab_size=trg_vocab_size, emb_dim=d_model, bos_id=self.bos_id) - self.trg_pos_embedding = PositionalEmbedding(emb_dim=d_model, max_length=max_length) - - self.decoding = InferTransformerDecoding( - decoder=self.transformer.decoder, - word_embedding=self.trg_word_embedding.word_embedding, - positional_embedding=self.trg_pos_embedding.pos_encoder, - linear=self.decoding_linear, - num_decoder_layers=num_decoder_layers, - 
n_head=n_head, - d_model=d_model, - bos_id=bos_id, - eos_id=eos_id, - decoding_strategy=decoding_strategy, - beam_size=beam_size, - topk=topk, - topp=topp, - max_out_len=max_out_len, - diversity_rate=self.diversity_rate, - decoding_lib=self.decoding_lib, - use_fp16_decoding=self.use_fp16_decoding, - rel_len=self.rel_len, - alpha=self.alpha, - ) - - def forward(self, src_word, trg_word=None): - src_max_len = src_word.shape[-1] - src_slf_attn_bias = ( - paddle.cast(src_word == self.pad_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9 - ) - src_pos = paddle.cast(src_word != self.pad_id, dtype=src_word.dtype) * paddle.arange(start=0, end=src_max_len) - - # Run encoder - src_emb = self.src_word_embedding(src_word) - src_pos_emb = self.src_pos_embedding(src_pos) - src_emb = src_emb + src_pos_emb - enc_input = F.dropout(src_emb, p=self.dropout, training=False) if self.dropout else src_emb - - if self.enable_fast_encoder and self.use_fp16_encoder: - enc_input = paddle.cast(enc_input, dtype="float16") - - enc_output = self.transformer.encoder(enc_input, src_slf_attn_bias) - - if self.use_fp16_decoding and enc_output.dtype != paddle.float16: - enc_output = paddle.cast(enc_output, dtype="float16") - elif not self.use_fp16_decoding and enc_output.dtype != paddle.float32: - enc_output = paddle.cast(enc_output, dtype="float32") - - mem_seq_lens = paddle.sum(paddle.cast(src_word != self.pad_id, dtype="int32"), dtype="int32", axis=1) - ids = self.decoding(enc_output, mem_seq_lens, trg_word=trg_word) - - return ids - - def load(self, init_from_params=None, state_dict=None): - # Load the trained model - if init_from_params is None and state_dict is None: - raise ValueError("Either init_from_params or state_dict must be given to load the infer model. ") - - if state_dict is None: - state_dict = paddle.load(init_from_params, return_numpy=True) - else: - for state in state_dict: - # NOTE: This API only used in dygraph, so paddle.Tensor is enough. - if isinstance(state_dict[state], paddle.Tensor): - state_dict[state] = state_dict[state].numpy() - - # To set weight[padding_idx] to 0. - state_dict["trg_word_embedding.word_embedding.weight"][self.bos_id] = [0] * self.d_model - - # Dealing with weight sharing. - if self.weight_sharing: - state_dict["decoding_linear.weight"] = np.transpose(state_dict["trg_word_embedding.word_embedding.weight"]) - else: - state_dict["decoding_linear.weight"] = state_dict["linear.weight"] - - if self.decoding._fuse_qkv: - for item in self.state_dict(): - if "decoder" in item and "self_attn.q_proj" in item: - num_layer = item.split(".")[3] - param_type = item.split(".")[-1] - - state_dict["decoding.slf_q_" + param_type + "_" + num_layer] = np.concatenate( - ( - state_dict[item], - state_dict["transformer.decoder.layers." + num_layer + ".self_attn.k_proj." + param_type], - state_dict["transformer.decoder.layers." + num_layer + ".self_attn.v_proj." 
+ param_type], - ), - axis=-1, - ) - - if self.use_fp16_decoding: - for item in self.state_dict(): - if "decoder" in item or "decoding.slf" in item: - state_dict[item] = np.float16(state_dict[item]) - state_dict["decoding_linear.weight"] = np.float16(state_dict["decoding_linear.weight"]) - state_dict["trg_word_embedding.word_embedding.weight"] = np.float16( - state_dict["trg_word_embedding.word_embedding.weight"] - ) - state_dict["trg_pos_embedding.pos_encoder.weight"] = np.float16( - state_dict["trg_pos_embedding.pos_encoder.weight"] - ) - state_dict["decoding_linear.bias"] = np.zeros([self.trg_vocab_size], dtype="float16") - - self.load_dict(state_dict) - - if self.enable_fast_encoder: - self = enable_fast_encoder(self, use_fp16=self.use_fp16_encoder) - - def export_params(self, init_from_params, place): - """ - This method is used for load static graph from dygraph checkpoint - or export inference model using static graph. - Do NOT support faster encoder. - - Args: - init_from_params (string): - The path to dygraph checkpoint. - place (paddle.Place): - The place to execute static graph. - - Example: - .. code-block:: - paddle.enable_static() - place = "gpu" - place = paddle.set_device(place) - reader.adapt_vocab_size(args) - - test_program = paddle.static.Program() - startup_program = paddle.static.Program() - with paddle.static.program_guard(test_program, startup_program): - src_word = paddle.static.data( - name="src_word", shape=[None, None], dtype="int64") - - # Define model - transformer = FasterTransformer( - src_vocab_size=args.src_vocab_size, - trg_vocab_size=args.trg_vocab_size, - max_length=args.max_length + 1, - num_encoder_layers=args.n_layer, - num_decoder_layers=args.n_layer, - n_head=args.n_head, - d_model=args.d_model, - d_inner_hid=args.d_inner_hid, - dropout=args.dropout, - weight_sharing=args.weight_sharing, - bos_id=args.bos_idx, - eos_id=args.eos_idx, - decoding_strategy=args.decoding_strategy, - beam_size=args.beam_size, - max_out_len=args.max_out_len, - decoding_lib=args.decoding_lib, - use_fp16_decoding=args.use_fp16_decoding, - rel_len=args.use_rel_len, - alpha=args.alpha) - - finished_seq = transformer(src_word=src_word) - - test_program = test_program.clone(for_test=True) - - exe = paddle.static.Executor(place) - exe.run(startup_program) - - # Load checkpoint. - transformer.export_params( - init_from_params=os.path.join(args.init_from_params, - "transformer.pdparams"), - place=place) - - paddle.static.save_inference_model( - os.path.join(args.inference_model_dir, "transformer"), - feed_vars=src_word, - fetch_vars=finished_seq, - executor=exe, - program=test_program) - """ - # Load the trained model - assert init_from_params, "Please set init_from_params to load the infer model." - - model_dict = paddle.load(init_from_params, return_numpy=True) - - # To set weight[padding_idx] to 0. - model_dict["trg_word_embedding.word_embedding.weight"][self.bos_id] = [0] * self.d_model - - # Dealing with weight sharing. 
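`load` above (and `export_params` just below) rewrite the checkpoint in two ways before handing it to the custom op: with weight sharing, the target-side word embedding is transposed to become the output projection, and per decoder layer the separate Q/K/V projection weights are concatenated along the last axis into one fused key such as `decoding.slf_q_weight_0`. A compact numpy sketch of those two rewrites with toy shapes:

import numpy as np

d_model, vocab, n_layers = 4, 10, 2
state = {"trg_word_embedding.word_embedding.weight": np.random.rand(vocab, d_model).astype("float32")}
for i in range(n_layers):
    for proj in ("q", "k", "v"):
        state[f"transformer.decoder.layers.{i}.self_attn.{proj}_proj.weight"] = np.random.rand(
            d_model, d_model
        ).astype("float32")

# (1) weight sharing: embedding [vocab, d_model] -> output projection [d_model, vocab]
state["decoding_linear.weight"] = np.transpose(state["trg_word_embedding.word_embedding.weight"])

# (2) fused QKV per layer: three [d_model, d_model] weights -> one [d_model, 3 * d_model]
for i in range(n_layers):
    state[f"decoding.slf_q_weight_{i}"] = np.concatenate(
        [state[f"transformer.decoder.layers.{i}.self_attn.{p}_proj.weight"] for p in ("q", "k", "v")],
        axis=-1,
    )
    assert state[f"decoding.slf_q_weight_{i}"].shape == (d_model, 3 * d_model)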
- if self.weight_sharing: - model_dict["decoding_linear.weight"] = np.transpose(model_dict["trg_word_embedding.word_embedding.weight"]) - else: - model_dict["decoding_linear.weight"] = model_dict["linear.weight"] - - # To avoid a longer length than training, reset the size of position - # encoding to max_length - model_dict["encoder.pos_encoder.weight"] = position_encoding_init(self.max_length, self.d_model) - model_dict["decoder.pos_encoder.weight"] = position_encoding_init(self.max_length, self.d_model) - - if self.decoding._fuse_qkv: - for item in self.state_dict(): - if "decoder" in item and "self_attn.q_proj" in item: - num_layer = item.split(".")[3] - param_type = item.split(".")[-1] - - model_dict["decoding.slf_q_" + param_type + "_" + num_layer] = np.concatenate( - ( - model_dict[item], - model_dict["transformer.decoder.layers." + num_layer + ".self_attn.k_proj." + param_type], - model_dict["transformer.decoder.layers." + num_layer + ".self_attn.v_proj." + param_type], - ), - axis=-1, - ) - - if self.use_fp16_decoding: - for item in self.state_dict(): - if "decoder" in item or "decoding.slf" in item: - model_dict[item] = np.float16(model_dict[item]) - model_dict["decoding_linear.weight"] = np.float16(model_dict["decoding_linear.weight"]) - model_dict["trg_word_embedding.word_embedding.weight"] = np.float16( - model_dict["trg_word_embedding.word_embedding.weight"] - ) - model_dict["trg_pos_embedding.pos_encoder.weight"] = np.float16( - model_dict["trg_pos_embedding.pos_encoder.weight"] - ) - model_dict["decoding_linear.bias"] = np.zeros([self.trg_vocab_size], dtype="float16") - - for item in self.state_dict(): - param = self - attr_list = item.split(".") - for attr in attr_list: - param = getattr(param, attr) - param_name = param.name - var = paddle.static.global_scope().find_var(param_name).get_tensor() - var.set(model_dict[item], place) - - -class TransformerGenerator(paddle.nn.Layer): - """ - The Transformer model for auto-regressive generation with beam search. It wraps - `FasterTransformer` and `InferTransformerModel`, and automatically chioces using - `FasterTransformer` (with jit building) or the slower verison `InferTransformerModel`. - - Args: - src_vocab_size (int): - The size of source vocabulary. - trg_vocab_size (int): - The size of target vocabulary. - max_length (int): - The maximum length of input sequences. - num_encoder_layers (int): - The number of sub-layers to be stacked in the encoder. - num_decoder_layers (int): - The number of sub-layers to be stacked in the decoder. - n_head (int): - The number of head used in multi-head attention. - d_model (int): - The dimension for word embeddings, which is also the last dimension of - the input and output of multi-head attention, position-wise feed-forward - networks, encoder and decoder. - d_inner_hid (int): - Size of the hidden layer in position-wise feed-forward networks. - dropout (float): - Dropout rates. Used for pre-process, activation and inside attention. - weight_sharing (bool): - Whether to use weight sharing. - bos_id (int, optional): - The start token id and also is used as padding id. Defaults to 0. - eos_id (int, optional): - The end token id. Defaults to 1. - beam_size (int, optional): - The beam width for beam search. Defaults to 4. - max_out_len (int, optional): - The maximum output length. Defaults to 256. - activation (str, optional): - The activation used in FFN. Defaults to "relu". - normalize_before (bool, optional): - Whether to apply pre-normalization. Defaults to True. 
- kwargs: - The key word arguments can be `output_time_major`, `use_ft`, `use_fp16_decoding`, - `rel_len`, `alpha`: - - - `output_time_major(bool, optional)`: Indicate the data layout of predicted - Tensor. If `False`, the data layout would be batch major with shape - `[batch_size, seq_len, beam_size]`. If `True`, the data layout would - be time major with shape `[seq_len, batch_size, beam_size]`. Default - to `False`. - - - `use_ft(bool, optional)`: Whether to use FastGeneration - for decoding. Default to True if not set. - - - `use_fp16_decoding(bool, optional)`: Whether to use fp16 - for decoding. Only works when using FastGeneration. - - - `beam_search_version(str, optional)`: Indicating the strategy of - beam search. It can be 'v1' or 'v2'. 'v2' would select the top - `beam_size * 2` beams and process the top `beam_size` alive and - finish beams in them separately, while 'v1' would only select the - top `beam_size` beams and mix up the alive and finish beams. 'v2' always - searchs more and get better results, since the alive beams would - always be `beam_size` while the number of alive beams in `v1` might - decrease when meeting the end token. However, 'v2' always generates - longer results thus might do more calculation and be slower. - - - `rel_len(bool, optional)`: Indicating whether `max_out_len` in is - the length relative to that of source text. Only works in `v2` temporarily. - It is suggest to set a small `max_out_len` and use `rel_len=True`. - Default to False if not set. - - - `alpha(float, optional)`: The power number in length penalty - calculation. Refer to `GNMT `_. - Only works in `v2` temporarily. Default to 0.6 if not set. - - - diversity_rate(float, optional): Refer to `A Simple, Fast Diverse - Decoding Algorithm for Neural Generation `_ - for details. Bigger `diversity_rate` would lead to more diversity. - if `diversity_rate == 0` is equivalent to naive BeamSearch. Default - to 0 if not set. **NOTE**: Only works when using FastGeneration - temporarily. - """ - - def __init__( - self, - src_vocab_size, - trg_vocab_size, - max_length, - num_encoder_layers, - num_decoder_layers, - n_head, - d_model, - d_inner_hid, - dropout, - weight_sharing, - bos_id=0, - eos_id=1, - pad_id=None, - beam_size=4, - max_out_len=256, - activation="relu", - normalize_before=True, - **kwargs - ): - logger.warning("TransformerGenerator is an experimental API and subject to change.") - # `kwargs` can include output_time_major, use_fp16_decoding, topk, topp. - # The later three arguments can only work when using FastGeneration, - # and expose topk, topp later. - super(TransformerGenerator, self).__init__() - self.d_model = d_model - self.max_length = max_length - self.output_time_major = kwargs.pop("output_time_major", True) - # Only works for FastGeneration. - # TODO: original version supports diversity rate. - diversity_rate = kwargs.pop("diversity_rate", 0.0) - use_fp16_decoding = kwargs.pop("use_fp16_decoding", False) - use_ft = kwargs.pop("use_ft", True) - beam_search_version = kwargs.pop("beam_search_version", "v1") - rel_len = kwargs.pop("rel_len", False) - alpha = kwargs.pop("alpha", 0.6) - - # TODO: Faster version needs to update attr to support custom - # activation and normalize_before which are both aupport in cpp codes. 
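The `alpha` option above is the exponent of the GNMT-style length penalty. The formula from the GNMT paper (Wu et al., 2016) is sketched below; the exact constant handling inside the removed decoding kernels is not reproduced here:

def gnmt_length_penalty(length, alpha=0.6):
    # lp(Y) = ((5 + |Y|) / 6) ** alpha
    return ((5.0 + length) / 6.0) ** alpha

# Beam scores are divided by the penalty, so with alpha > 0 longer hypotheses
# are penalized less per token and the bias toward short outputs is reduced.
scores = [-2.0, -2.5]
lengths = [4, 8]
normalized = [s / gnmt_length_penalty(n) for s, n in zip(scores, lengths)]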
- if use_ft and activation == "relu" and normalize_before: - try: - decoding_strategy = "beam_search_v2" if beam_search_version == "v2" else "beam_search" - self.transformer = FasterTransformer( - src_vocab_size=src_vocab_size, - trg_vocab_size=trg_vocab_size, - max_length=max_length, - num_encoder_layers=num_encoder_layers, - num_decoder_layers=num_decoder_layers, - n_head=n_head, - d_model=d_model, - d_inner_hid=d_inner_hid, - dropout=dropout, - weight_sharing=weight_sharing, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - beam_size=beam_size, - max_out_len=max_out_len, - diversity_rate=diversity_rate, - decoding_strategy=decoding_strategy, - use_fp16_decoding=use_fp16_decoding, - rel_len=rel_len, - alpha=alpha, - ) - except Exception: - logger.warning( - "Exception occurs when using FastGeneration. " "The original forward will be involved. " - ) - if diversity_rate != 0: - logger.warning( - "diversity_rate would not work since it is only " "supported by FastGeneration temporarily." - ) - self.transformer = InferTransformerModel( - src_vocab_size=src_vocab_size, - trg_vocab_size=trg_vocab_size, - max_length=max_length, - num_encoder_layers=num_encoder_layers, - num_decoder_layers=num_decoder_layers, - n_head=n_head, - d_model=d_model, - d_inner_hid=d_inner_hid, - dropout=dropout, - weight_sharing=weight_sharing, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - beam_size=beam_size, - max_out_len=max_out_len, - output_time_major=self.output_time_major, - beam_search_version=beam_search_version, - activation=activation, - normalize_before=normalize_before, - rel_len=rel_len, - alpha=alpha, - ) - else: - if diversity_rate != 0: - logger.warning( - "diversity_rate would not work since it is only " "supported by FastGeneration temporarily." - ) - self.transformer = InferTransformerModel( - src_vocab_size=src_vocab_size, - trg_vocab_size=trg_vocab_size, - max_length=max_length, - num_encoder_layers=num_encoder_layers, - num_decoder_layers=num_decoder_layers, - n_head=n_head, - d_model=d_model, - d_inner_hid=d_inner_hid, - dropout=dropout, - weight_sharing=weight_sharing, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - beam_size=beam_size, - max_out_len=max_out_len, - output_time_major=self.output_time_major, - beam_search_version=beam_search_version, - activation=activation, - normalize_before=normalize_before, - rel_len=rel_len, - alpha=alpha, - ) - - def forward(self, src_word, trg_word=None): - r""" - Performs decoding for transformer model. - - Args: - src_word (Tensor): - The ids of source sequence words. It is a tensor with shape - `[batch_size, source_sequence_length]` and its data type can be - int or int64. - trg_word (Tensor): - The ids of target sequence words. Normally, it should NOT be - given. If it's given, force decoding with previous output token - will be trigger. Defaults to None. - - Returns: - Tensor: - An int64 tensor shaped indicating the predicted ids. Its shape is - `[batch_size, seq_len, beam_size]` or `[seq_len, batch_size, beam_size]` - according to `output_time_major`. While, when using FastGeneration - and beam search v2, the beam dimension would be doubled to include - both the top `beam_size` alive and finish beams, thus the tensor - shape is `[batch_size, seq_len, beam_size * 2]` or `[seq_len, batch_size, beam_size * 2]`. - - Example: - .. 
code-block:: - - import paddle - from paddlenlp.ops import TransformerGenerator - - transformer = TransformerGenerator( - src_vocab_size=30000, - trg_vocab_size=30000, - max_length=256, - num_encoder_layers=6, - num_decoder_layers=6, - n_head=8, - d_model=512, - d_inner_hid=2048, - dropout=0.1, - weight_sharing=True, - bos_id=0, - eos_id=1, - beam_size=4, - max_out_len=256) - - batch_size = 5 - seq_len = 10 - transformer( - src_word=paddle.randint(low=3, high=30000, shape=[batch_size, seq_len])) - """ - out = self.transformer(src_word, trg_word=trg_word) - # TODO(guosheng): FasterTransformer has an output with layout - # `[seq_len, batch_size, beam_size]`. While the output layout of - # original one is `[batch_size, seq_len, beam_size]`. Maybe we need - # unify them later. - if not self.output_time_major and isinstance(self.transformer, FasterTransformer): - out = paddle.transpose(out, [1, 0, 2]) - return out - - def load(self, path=None, state_dict=None): - if path is None and state_dict is None: - raise ValueError("Either path or state_dict must be given to load the infer model. ") - - if isinstance(self.transformer, FasterTransformer): - self.transformer.load(path, state_dict) - else: - if state_dict is None: - state_dict = paddle.load(path) - self.transformer.load_dict(state_dict) - - -class FasterOPT(OPTPretrainedModel): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False): - super(FasterOPT, self).__init__(model.config) - self._model = model - self.use_fp16_decoding = use_fp16_decoding - self.decoding = InferOptDecoding(model=model, decoding_lib=decoding_lib, use_fp16_decoding=use_fp16_decoding) - - def forward( - self, - input_ids, - seq_len=None, - attention_mask=None, - top_k=4, - top_p=0.0, - max_length=256, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - forced_eos_token_id=None, - temperature=0, - decode_strategy="sample", - num_return_sequences=1, - **model_kwargs - ): - if input_ids.dtype == paddle.int64: - input_ids = paddle.cast(input_ids, "int32") - - # change top_p to zero if not using top_p sampling for FT - if decode_strategy == "greedy_search": - top_p = 0.0 - top_k = 1 - if top_p == 1.0: - top_p = 0.0 - if seq_len is None: - seq_len = paddle.sum(paddle.cast(input_ids != pad_token_id, dtype="int32"), axis=-1, dtype="int32") - - if bos_token_id == pad_token_id and paddle.sum(paddle.any(input_ids == pad_token_id), dtype="int64") > 0: - seq_len = seq_len + 1 - - if num_return_sequences > 1: - input_ids, model_kwargs = self.expand_inputs_for_generation( - input_ids, expand_size=num_return_sequences, seq_len=seq_len, attention_mask=attention_mask - ) - seq_len = model_kwargs["seq_len"] - attention_mask = model_kwargs.get("attention_mask", None) - - return self.decoding( - input_ids, - mem_seq_len=seq_len, - attention_mask=attention_mask, - topk=top_k, - topp=top_p, - max_out_len=max_length, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - forced_eos_token_id=forced_eos_token_id, - temperature=temperature, - ) - - def export_params(self, state_to_load, place): - for item in state_to_load: - param_data = np.array(state_to_load[item]) - if self.use_fp16_decoding: - param_data = np.float16(param_data) - - param = self - attr_list = item.split(".") - attr_list = ["decoding", "model"] + attr_list - for attr in attr_list: - param = getattr(param, attr) - param_name = param.name - var = paddle.static.global_scope().find_var(param_name).get_tensor() - var.set(param_data, place) - - def save_resources(self, 
tokenizer, path): - vocab_file = os.path.join(path, "vocab.txt") - if isinstance(tokenizer, GPTTokenizer): - with open(vocab_file, "w", encoding="utf-8") as f: - for token in tokenizer.encoder: - f.write(token + "\n") - merges_file = os.path.join(path, "merges.txt") - shutil.copyfile(tokenizer._merges_file, merges_file) - elif isinstance(tokenizer, GPTChineseTokenizer): - tokenizer.save_resources(path) - - generate = forward - - -class FasterGPT(GPTPretrainedModel): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False): - super(FasterGPT, self).__init__(model.config) - self._model = model - self.use_fp16_decoding = use_fp16_decoding - self.decoding = InferGptDecoding(model=model, decoding_lib=decoding_lib, use_fp16_decoding=use_fp16_decoding) - - def forward( - self, - input_ids, - seq_len=None, - attention_mask=None, - top_k=4, - top_p=0.0, - max_length=256, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - forced_eos_token_id=None, - temperature=0, - decode_strategy="sample", - num_return_sequences=1, - **model_kwargs - ): - if input_ids.dtype == paddle.int64: - input_ids = paddle.cast(input_ids, "int32") - - # change top_p to zero if not using top_p sampling for FT - if decode_strategy == "greedy_search": - top_p = 0.0 - top_k = 1 - if top_p == 1.0: - top_p = 0.0 - if seq_len is None: - seq_len = paddle.sum(paddle.cast(input_ids != pad_token_id, dtype="int32"), axis=-1, dtype="int32") - - if bos_token_id == pad_token_id and paddle.sum(paddle.any(input_ids == pad_token_id), dtype="int64") > 0: - seq_len = seq_len + 1 - - if num_return_sequences > 1: - input_ids, model_kwargs = self.expand_inputs_for_generation( - input_ids, expand_size=num_return_sequences, seq_len=seq_len, attention_mask=attention_mask - ) - seq_len = model_kwargs["seq_len"] - attention_mask = model_kwargs.get("attention_mask", None) - - return self.decoding( - input_ids, - mem_seq_len=seq_len, - attention_mask=attention_mask, - topk=top_k, - topp=top_p, - max_out_len=max_length, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - forced_eos_token_id=forced_eos_token_id, - temperature=temperature, - ) - - def export_params(self, state_to_load, place): - for item in state_to_load: - param_data = np.array(state_to_load[item]) - if self.use_fp16_decoding: - param_data = np.float16(param_data) - - param = self - attr_list = item.split(".") - attr_list = ["decoding", "model"] + attr_list - for attr in attr_list: - param = getattr(param, attr) - param_name = param.name - var = paddle.static.global_scope().find_var(param_name).get_tensor() - var.set(param_data, place) - - def save_resources(self, tokenizer, path): - vocab_file = os.path.join(path, "vocab.txt") - if isinstance(tokenizer, GPTTokenizer): - with open(vocab_file, "w", encoding="utf-8") as f: - for token in tokenizer.encoder: - f.write(token + "\n") - merges_file = os.path.join(path, "merges.txt") - shutil.copyfile(tokenizer._merges_file, merges_file) - elif isinstance(tokenizer, GPTChineseTokenizer): - tokenizer.save_resources(path) - - generate = forward - - -class FasterUnifiedTransformer(UnifiedTransformerPretrainedModel): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False): - super(FasterUnifiedTransformer, self).__init__(model.config) - self._model = model - self._use_fp16_decoding = use_fp16_decoding - self.vocab_size = model.lm_head.decoder_bias.shape[0] - self.unk_token_id = self._model.config.unk_token_id - self.mask_token_id = self._model.config.mask_token_id - 
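When `seq_len` is not provided, `FasterGPT.forward` above (and `FasterOPT.forward` before it) derive it by counting non-pad tokens per sample, adding one back when the bos and pad ids coincide so the leading bos token is not mistaken for padding. A numpy sketch of that bookkeeping:

import numpy as np

pad_token_id = bos_token_id = 0
input_ids = np.array([[0, 11, 12, 0, 0],
                      [0, 21, 22, 23, 0]])

seq_len = (input_ids != pad_token_id).sum(axis=-1).astype("int32")
if bos_token_id == pad_token_id and (input_ids == pad_token_id).any():
    seq_len = seq_len + 1

assert list(seq_len) == [3, 4]   # the leading bos (id 0) is counted exactly once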
self.bos_token_id = self._model.config.bos_token_id - self.pad_token_id = self._model.config.pad_token_id - self.logits_mask = self.generate_logits_mask(use_fp16_decoding) - self._n_head = self._model.config.num_attention_heads - self._hidden_dims = self._model.config.hidden_size - self._normalize_before = self._model.config.normalize_before - self._size_per_head = self._hidden_dims // self._n_head - self._n_layer = self._model.config.num_hidden_layers - self._hidden_act = self._model.config.hidden_act - - self.decoding = InferUnifiedDecoding( - model=self._model, - decoding_lib=decoding_lib, - use_fp16_decoding=use_fp16_decoding, - logits_mask=self.logits_mask, - n_head=self._n_head, - hidden_dims=self._hidden_dims, - size_per_head=self._size_per_head, - n_layer=self._n_layer, - unk_id=self.unk_token_id, - mask_id=self.mask_token_id, - normalize_before=self._normalize_before, - hidden_act=self._hidden_act, - ) - - def prepare_inputs_for_generation( - self, input_ids, token_type_ids, attention_mask, seq_len, position_ids=None, role_ids=None, **kwargs - ): - input_ids = input_ids[:, :-1] - if input_ids.dtype == paddle.int64: - input_ids = paddle.cast(input_ids, dtype="int32") - - if token_type_ids.dtype == paddle.int64: - token_type_ids = paddle.cast(token_type_ids, dtype="int32") - decoder_type_ids = token_type_ids[:, -1:] - token_type_ids = token_type_ids[:, :-1] - - # TODO(guosheng): attention_mask of UnifiedTransformer uses 0/-INF - # and is 4D. While now we want to use 1/0 to unify all models and - # tokenizers. - attention_mask = attention_mask[:, :, :-1, :-1] if attention_mask.ndim == 4 else attention_mask[:, :-1, :-1] - attention_mask = paddle.cast(attention_mask == 0, dtype="float16" if self._use_fp16_decoding else "float32") - - seq_len = seq_len - 1 - if seq_len.dtype == paddle.int64: - seq_len = paddle.cast(seq_len, dtype="int32") - - if position_ids is not None: - if position_ids.dtype == paddle.int64: - position_ids = paddle.cast(position_ids, dtype="int32") - decoder_position_ids = position_ids[:, -1:] - position_ids = position_ids[:, :-1] - else: - decoder_position_ids = None - - field_values = {} - if role_ids is not None: - if role_ids.dtype == paddle.int64: - role_ids = paddle.cast(role_ids, dtype="int32") - decoder_role_ids = role_ids[:, -1:] - role_ids = role_ids[:, :-1] - else: - decoder_role_ids = None - - field_values["input_ids"] = input_ids - field_values["token_type_ids"] = token_type_ids - field_values["attention_mask"] = attention_mask - field_values["seq_len"] = seq_len - field_values["decoder_type_ids"] = decoder_type_ids - field_values["position_ids"] = position_ids - field_values["decoder_position_ids"] = decoder_position_ids - field_values["role_ids"] = role_ids - field_values["decoder_role_ids"] = decoder_role_ids - - return field_values - - def generate_logits_mask(self, use_fp16_decoding): - # pre-process distribution - logits_mask = np.zeros(shape=[self.vocab_size], dtype=np.float32) - - if use_fp16_decoding: - logits_mask[self.unk_token_id] = -1e4 - logits_mask[self.bos_token_id] = -1e4 - logits_mask[self.pad_token_id] = -1e4 - else: - logits_mask[self.unk_token_id] = -1e9 - logits_mask[self.bos_token_id] = -1e9 - logits_mask[self.pad_token_id] = -1e9 - - logits_mask_t = paddle.assign(logits_mask) - if use_fp16_decoding: - return paddle.cast(logits_mask_t, dtype="float16") - else: - return logits_mask_t - - def forward( - self, - input_ids, - token_type_ids, - attention_mask, - seq_len=None, - role_ids=None, - position_ids=None, - max_length=128, - 
min_length=0, - top_k=4, - top_p=0.0, - decode_strategy="sampling", - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - num_beams=4, - diversity_rate=0.0, - temperature=1.0, - num_return_sequences=1, - length_penalty=0.6, - early_stopping=False, - forced_eos_token_id=None, - **model_kwargs - ): - - if seq_len is None: - assert input_ids is not None, "You have to specify either input_ids when generating seq_len." - seq_len = paddle.sum( - paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, keepdim=True, dtype="int32" - ) - if decode_strategy.startswith("beam_search"): - input_ids, model_kwargs = self.expand_inputs_for_generation( - input_ids, - expand_size=num_beams, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=attention_mask, - seq_len=seq_len, - role_ids=role_ids, - ) - elif decode_strategy == "sampling": - input_ids, model_kwargs = self.expand_inputs_for_generation( - input_ids, - expand_size=num_return_sequences, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=attention_mask, - seq_len=seq_len, - role_ids=role_ids, - ) - elif decode_strategy == "greedy_search": - model_kwargs = { - "token_type_ids": token_type_ids, - "position_ids": position_ids, - "attention_mask": attention_mask, - "seq_len": seq_len, - "role_ids": role_ids, - } - else: - raise ValueError("Only greedy search, beam search and sampling are supported. ") - - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - seq_len = model_inputs.pop("seq_len") - decoder_type_ids = model_inputs.pop("decoder_type_ids") - role_ids = model_inputs.pop("role_ids", None) - decoder_role_ids = model_inputs.pop("decoder_role_ids", None) - position_ids = model_inputs.pop("position_ids", None) - decoder_position_ids = model_inputs.pop("decoder_position_ids", None) - - return self.decoding( - input_ids=model_inputs["input_ids"], - attn_mask=model_inputs["attention_mask"], - memory_seq_lens=seq_len, - type_id=model_inputs["token_type_ids"], - decoder_type_id=decoder_type_ids, - role_id=role_ids, - decoder_role_id=decoder_role_ids, - position_id=position_ids, - decoder_position_id=decoder_position_ids, - beam_size=num_beams, - diversity_rate=diversity_rate, - topk=top_k, - topp=top_p, - decoding_strategy=decode_strategy, - max_out_len=max_length, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - temperature=temperature, - length_penalty=length_penalty, - pos_bias=True, - forced_eos_token_id=forced_eos_token_id, - early_stopping=early_stopping, - min_length=min_length, - ) - - generate = forward - - -class FasterUNIMOText(UNIMOPretrainedModel): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False, **kwargs): - super(FasterUNIMOText, self).__init__(model.config) - self._model = model - self._use_fp16_decoding = use_fp16_decoding - self.unk_token_id = self._model.config.unk_token_id - self.mask_token_id = self._model.config.mask_token_id - self.bos_token_id = self._model.config.bos_token_id - self.pad_token_id = self._model.config.pad_token_id - self.vocab_size = model.lm_head.decoder_bias.shape[0] - - self.logits_mask = self.generate_logits_mask(use_fp16_decoding) - self._n_head = self._model.config.num_attention_heads - self._hidden_dims = self._model.config.hidden_size - self._normalize_before = self._model.config.normalize_before - self._size_per_head = self._hidden_dims // self._n_head - self._n_layer = self._model.config.num_hidden_layers - self._hidden_act = 
self._model.config.hidden_act - self.trans_out = kwargs.get("trans_out", False) - - self.decoding = InferUnifiedDecoding( - model=self._model, - decoding_lib=decoding_lib, - use_fp16_decoding=use_fp16_decoding, - logits_mask=self.logits_mask, - n_head=self._n_head, - hidden_dims=self._hidden_dims, - size_per_head=self._size_per_head, - n_layer=self._n_layer, - unk_id=self.unk_token_id, - mask_id=self.mask_token_id, - normalize_before=self._normalize_before, - hidden_act=self._hidden_act, - ) - - def prepare_inputs_for_generation(self, input_ids, token_type_ids, attention_mask, **kwargs): - input_ids = input_ids[:, :-1] - if input_ids.dtype == paddle.int64: - input_ids = paddle.cast(input_ids, dtype="int32") - - if token_type_ids.dtype == paddle.int64: - token_type_ids = paddle.cast(token_type_ids, dtype="int32") - decoder_type_ids = token_type_ids[:, -1:] - token_type_ids = token_type_ids[:, :-1] - - attention_mask = attention_mask[:, :, :-1, :-1] - attention_mask = paddle.cast(attention_mask == 0, dtype="float16" if self._use_fp16_decoding else "float32") - - seq_len = kwargs.get("seq_len") - 1 - if seq_len.dtype == paddle.int64: - seq_len = paddle.cast(seq_len, dtype="int32") - - return { - "input_ids": input_ids, - "token_type_ids": token_type_ids, - "attention_mask": attention_mask, - "seq_len": seq_len, - "decoder_type_ids": decoder_type_ids, - } - - def generate_logits_mask(self, use_fp16_decoding): - # pre-process distribution - logits_mask = np.zeros(shape=[self.vocab_size], dtype=np.float32) - - if use_fp16_decoding: - logits_mask[self.unk_token_id] = -1e4 - logits_mask[self.bos_token_id] = -1e4 - logits_mask[self.pad_token_id] = -1e4 - else: - logits_mask[self.unk_token_id] = -1e9 - logits_mask[self.bos_token_id] = -1e9 - logits_mask[self.pad_token_id] = -1e9 - - logits_mask_t = paddle.assign(logits_mask) - if use_fp16_decoding: - return paddle.cast(logits_mask_t, dtype="float16") - else: - return logits_mask_t - - def forward( - self, - input_ids, - token_type_ids, - attention_mask, - seq_len=None, - max_length=128, - min_length=0, - top_k=4, - top_p=0.0, - num_beams=4, - decode_strategy="sampling", - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - diversity_rate=0.0, - temperature=1.0, - num_return_sequences=1, - length_penalty=0.6, - early_stopping=False, - forced_eos_token_id=None, - position_ids=None, - **model_kwargs - ): - - if seq_len is None: - assert input_ids is not None, "You have to specify either input_ids when generating seq_len." - seq_len = paddle.sum(paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, dtype="int32") - if decode_strategy.startswith("beam_search"): - input_ids, model_kwargs = self.expand_inputs_for_generation( - input_ids, - expand_size=num_beams, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=attention_mask, - seq_len=seq_len, - ) - elif decode_strategy == "sampling": - input_ids, model_kwargs = self.expand_inputs_for_generation( - input_ids, - expand_size=num_return_sequences, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=attention_mask, - seq_len=seq_len, - ) - elif decode_strategy == "greedy_search": - model_kwargs = { - "token_type_ids": token_type_ids, - "position_ids": position_ids, - "attention_mask": attention_mask, - "seq_len": seq_len, - } - else: - raise ValueError("Only greedy search, beam search and sampling are supported. 
") - - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - seq_len = model_inputs.pop("seq_len") - decoder_type_ids = model_inputs.pop("decoder_type_ids") - - ids, output_scores = self.decoding( - input_ids=model_inputs["input_ids"], - attn_mask=model_inputs["attention_mask"], - memory_seq_lens=seq_len, - type_id=model_inputs["token_type_ids"], - decoder_type_id=decoder_type_ids, - beam_size=num_beams, - diversity_rate=diversity_rate, - topk=top_k, - topp=top_p, - decoding_strategy=decode_strategy, - max_out_len=max_length, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - temperature=temperature, - length_penalty=length_penalty, - forced_eos_token_id=forced_eos_token_id, - pos_bias=False, - early_stopping=early_stopping, - min_length=min_length, - ) - if self.trans_out: - if decode_strategy.startswith("beam_search"): - ids = ids.transpose([1, 2, 0]) - else: - ids = ids.transpose([1, 0]) - return ids, output_scores - - generate = forward - - -class FasterMIRO(UNIMOPretrainedModel): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False, **kwargs): - super(FasterMIRO, self).__init__(model.config) - self._model = model - self._use_fp16_decoding = use_fp16_decoding - self.unk_token_id = self._model.config.unk_token_id - self.mask_token_id = self._model.config.mask_token_id - self.bos_token_id = self._model.config.bos_token_id - self.pad_token_id = self._model.config.pad_token_id - self.vocab_size = model.lm_head.decoder_bias.shape[0] - - self.logits_mask = self.generate_logits_mask(use_fp16_decoding) - self._n_head = self._model.config.num_attention_heads - self._hidden_dims = self._model.config.hidden_size - self._normalize_before = self._model.config.normalize_before - self._size_per_head = self._hidden_dims // self._n_head - self._n_layer = self._model.config.num_hidden_layers - self._hidden_act = self._model.config.hidden_act - self.trans_out = kwargs.get("trans_out", False) - - self.decoding = InferMIRODecoding( - model=self._model, - decoding_lib=decoding_lib, - use_fp16_decoding=use_fp16_decoding, - logits_mask=self.logits_mask, - n_head=self._n_head, - hidden_dims=self._hidden_dims, - size_per_head=self._size_per_head, - n_layer=self._n_layer, - unk_id=self.unk_token_id, - mask_id=self.mask_token_id, - normalize_before=self._normalize_before, - hidden_act=self._hidden_act, - ) - - def prepare_inputs_for_generation(self, input_ids, token_type_ids, attention_mask, **kwargs): - input_ids = input_ids[:, :-1] - if input_ids.dtype == paddle.int64: - input_ids = paddle.cast(input_ids, dtype="int32") - - if token_type_ids.dtype == paddle.int64: - token_type_ids = paddle.cast(token_type_ids, dtype="int32") - decoder_type_ids = token_type_ids[:, -1:] - token_type_ids = token_type_ids[:, :-1] - - attention_mask = attention_mask[:, :, :-1, :-1] - attention_mask = paddle.cast(attention_mask == 0, dtype="float16" if self._use_fp16_decoding else "float32") - - seq_len = kwargs.get("seq_len") - 1 - if seq_len.dtype == paddle.int64: - seq_len = paddle.cast(seq_len, dtype="int32") - - return { - "input_ids": input_ids, - "token_type_ids": token_type_ids, - "attention_mask": attention_mask, - "seq_len": seq_len, - "decoder_type_ids": decoder_type_ids, - } - - def generate_logits_mask(self, use_fp16_decoding): - # pre-process distribution - logits_mask = np.zeros(shape=[self.vocab_size], dtype=np.float32) - - if use_fp16_decoding: - logits_mask[self.unk_token_id] = -1e4 - logits_mask[self.bos_token_id] = -1e4 - 
logits_mask[self.pad_token_id] = -1e4 - else: - logits_mask[self.unk_token_id] = -1e9 - logits_mask[self.bos_token_id] = -1e9 - logits_mask[self.pad_token_id] = -1e9 - - logits_mask_t = paddle.assign(logits_mask) - if use_fp16_decoding: - return paddle.cast(logits_mask_t, dtype="float16") - else: - return logits_mask_t - - def forward( - self, - input_ids, - token_type_ids, - attention_mask, - seq_len=None, - max_length=128, - min_length=0, - top_k=4, - top_p=0.0, - num_beams=4, - decode_strategy="sampling", - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - diversity_rate=0.0, - temperature=1.0, - num_return_sequences=1, - length_penalty=0.6, - early_stopping=False, - forced_eos_token_id=None, - position_ids=None, - **model_kwargs - ): - - if seq_len is None: - assert input_ids is not None, "You have to specify either input_ids when generating seq_len." - seq_len = paddle.sum(paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, dtype="int32") - if decode_strategy.startswith("beam_search"): - input_ids, model_kwargs = self.expand_inputs_for_generation( - input_ids, - expand_size=num_beams, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=attention_mask, - seq_len=seq_len, - ) - elif decode_strategy == "sampling": - input_ids, model_kwargs = self.expand_inputs_for_generation( - input_ids, - expand_size=num_return_sequences, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=attention_mask, - seq_len=seq_len, - ) - elif decode_strategy == "greedy_search": - model_kwargs = { - "token_type_ids": token_type_ids, - "position_ids": position_ids, - "attention_mask": attention_mask, - "seq_len": seq_len, - } - else: - raise ValueError("Only greedy search, beam search and sampling are supported. 
") - - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - seq_len = model_inputs.pop("seq_len") - decoder_type_ids = model_inputs.pop("decoder_type_ids") - - ids, output_scores = self.decoding( - input_ids=model_inputs["input_ids"], - attn_mask=model_inputs["attention_mask"], - memory_seq_lens=seq_len, - type_id=model_inputs["token_type_ids"], - decoder_type_id=decoder_type_ids, - beam_size=num_beams, - diversity_rate=diversity_rate, - topk=top_k, - topp=top_p, - decoding_strategy=decode_strategy, - max_out_len=max_length, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - temperature=temperature, - length_penalty=length_penalty, - forced_eos_token_id=forced_eos_token_id, - pos_bias=False, - early_stopping=early_stopping, - min_length=min_length, - ) - if self.trans_out: - if decode_strategy.startswith("beam_search"): - ids = ids.transpose([1, 2, 0]) - else: - ids = ids.transpose([1, 0]) - return ids, output_scores - - generate = forward - - -class FasterBART(BartPretrainedModel): - enable_faster_encoder_func = enable_fast_encoder - - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False, enable_fast_encoder=True): - super(FasterBART, self).__init__(model.config) - self.use_fp16_decoding = use_fp16_decoding - self._model = model - if use_fp16_decoding: - weight_attr = paddle.ParamAttr(initializer=nn.initializer.Assign(model.bart.encoder.embed_tokens.weight)) - model.bart.encoder.embed_tokens = nn.Embedding( - *model.bart.encoder.embed_tokens.weight.shape, weight_attr=weight_attr - ) - self.encoder = model.bart.get_encoder() - self.decoder = model.bart.get_decoder() - self.pad_token_id = model.bart.config["pad_token_id"] - self.enable_fast_encoder = enable_fast_encoder - - self.decoding = InferBartDecoding( - model=self._model, decoding_lib=decoding_lib, use_fp16_decoding=use_fp16_decoding - ) - if self.enable_fast_encoder: - # Must use `enable_fast_encoder` in `__init__` when dygraph to static graph. - self.encoder = FasterBART.enable_faster_encoder_func(self.encoder) - - def get_encoder(self): - return self.encoder - - def get_decoder(self): - return self.decoder - - def forward( - self, - input_ids=None, - encoder_output=None, - seq_len=None, - num_beams=4, - top_k=1, - top_p=0.0, - temperature=1.0, - decode_strategy="beam_search", - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - decoder_start_token_id=None, - min_length=0, - max_length=20, - diversity_rate=0.0, - length_penalty=0.6, - num_return_sequences=1, - early_stopping=False, - forced_eos_token_id=None, - **model_kwargs - ): - - if encoder_output is None: - assert input_ids is not None, "You have to specify either input_ids or encoder_output." - encoder_output = self.prepare_encoder_decoder_kwargs_for_generation(input_ids, model_kwargs)[ - "encoder_output" - ] - if seq_len is None: - assert input_ids is not None, "You have to specify either input_ids when generating seq_len." 
- seq_len = paddle.sum( - paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, keepdim=True, dtype="int32" - ) - if self.use_fp16_decoding: - encoder_output = paddle.cast(encoder_output, "float16") - if decode_strategy.startswith("beam_search") and num_beams > 1: - encoder_output, expanded_kwargs = self.expand_inputs_for_generation( - encoder_output, expand_size=num_beams, seq_len=seq_len - ) - seq_len = expanded_kwargs["seq_len"] - elif decode_strategy == "sampling" and num_return_sequences > 1: - encoder_output, expanded_kwargs = self.expand_inputs_for_generation( - encoder_output, expand_size=num_return_sequences, seq_len=seq_len - ) - seq_len = expanded_kwargs["seq_len"] - if decoder_start_token_id is not None: - bos_token_id = decoder_start_token_id - - return self.decoding( - enc_output=encoder_output, - memory_seq_lens=seq_len, - beam_size=num_beams, - top_k=top_k, - decoding_strategy=decode_strategy, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - top_p=top_p, - max_out_len=max_length, - min_out_len=min_length, - temperature=temperature, - diversity_rate=diversity_rate, - alpha=length_penalty, - early_stopping=early_stopping, - forced_eos_token_id=forced_eos_token_id, - ) - - generate = forward - - -class FasterMBART(MBartPretrainedModel): - enable_faster_encoder_func = enable_fast_encoder - - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False, enable_fast_encoder=False): - super(FasterMBART, self).__init__(model.config) - self.use_fp16_decoding = use_fp16_decoding - self._model = model - if use_fp16_decoding: - weight_attr = paddle.ParamAttr(initializer=nn.initializer.Assign(model.mbart.encoder.embed_tokens.weight)) - model.mbart.encoder.embed_tokens = nn.Embedding( - *model.mbart.encoder.embed_tokens.weight.shape, weight_attr=weight_attr - ) - self.encoder = model.mbart.get_encoder() - self.decoder = model.mbart.get_decoder() - self.pad_token_id = model.mbart.config["pad_token_id"] - self.enable_fast_encoder = enable_fast_encoder - - self.decoding = InferMBartDecoding( - model=self._model, - decoding_lib=decoding_lib, - use_fp16_decoding=use_fp16_decoding, - hidden_act=model.mbart.config["activation_function"], - ) - - if self.enable_fast_encoder: - # Must use `enable_fast_encoder` in `__init__` when dygraph to static graph. 
- self.encoder = FasterMBART.enable_faster_encoder_func(self.encoder) - - def get_encoder(self): - return self.encoder - - def get_decoder(self): - return self.decoder - - def forward( - self, - input_ids=None, - encoder_output=None, - seq_len=None, - forced_bos_token_id=None, - num_beams=4, - top_k=1, - top_p=0.0, - decode_strategy="beam_search_v3", - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - decoder_start_token_id=None, - max_length=256, - diversity_rate=0.0, - length_penalty=0.6, - temperature=1.0, - num_return_sequences=1, - early_stopping=False, - forced_eos_token_id=None, - **model_kwargs - ): - - bos_token_id = bos_token_id if bos_token_id is not None else getattr(self._model, "bos_token_id", None) - eos_token_id = eos_token_id if eos_token_id is not None else getattr(self._model, "eos_token_id", None) - pad_token_id = pad_token_id if pad_token_id is not None else getattr(self._model, "pad_token_id", None) - decoder_start_token_id = ( - decoder_start_token_id - if decoder_start_token_id is not None - else getattr(self._model, "decoder_start_token_id", None) - ) - - # (gongenlei) Not enable_fast_encoder temporarily - if encoder_output is None: - assert input_ids is not None, "You have to specify either input_ids or encoder_output." - encoder_output = self.prepare_encoder_decoder_kwargs_for_generation(input_ids, model_kwargs)[ - "encoder_output" - ] - batch_size = encoder_output.shape[0] - if seq_len is None: - assert input_ids is not None, "You have to specify either input_ids when generating seq_len." - seq_len = paddle.sum(paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, dtype="int32") - if self.use_fp16_decoding: - encoder_output = paddle.cast(encoder_output, "float16") - if decode_strategy.startswith("beam_search") and num_beams > 1: - encoder_output, expanded_kwargs = self.expand_inputs_for_generation( - encoder_output, expand_size=num_beams, seq_len=seq_len - ) - seq_len = expanded_kwargs["seq_len"] - elif decode_strategy == "sampling" and num_return_sequences > 1: - encoder_output, expanded_kwargs = self.expand_inputs_for_generation( - encoder_output, expand_size=num_return_sequences, seq_len=seq_len - ) - seq_len = expanded_kwargs["seq_len"] - if decoder_start_token_id is not None: - bos_token_id = decoder_start_token_id - - if not isinstance(forced_bos_token_id, type(input_ids)): - if forced_bos_token_id is not None: - if decode_strategy == "sampling": - forced_bos_token_id = paddle.full( - [batch_size * num_return_sequences, 1], forced_bos_token_id, dtype="int32" - ) - else: - forced_bos_token_id = paddle.full([batch_size, 1], forced_bos_token_id, dtype="int32") - else: - forced_bos_token_id = paddle.zeros([0]) - elif decode_strategy == "sampling": - num_samples = encoder_output.shape[0] - forced_bos_token_id = paddle.expand(forced_bos_token_id, shape=[num_samples, 1]) - - return self.decoding( - enc_output=encoder_output, - memory_seq_lens=seq_len, - beam_size=num_beams, - trg_word=forced_bos_token_id, - top_k=top_k, - top_p=top_p, - decoding_strategy=decode_strategy, - diversity_rate=diversity_rate, - max_out_len=max_length, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - alpha=length_penalty, - temperature=temperature, - early_stopping=early_stopping, - ) - - generate = forward - - -class FasterGPTJ(GPTJPretrainedModel): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False): - super(FasterGPTJ, self).__init__(model.config) - self._model = model - self.use_fp16_decoding = 
use_fp16_decoding - self.decoding = InferGptJDecoding(model=model, decoding_lib=decoding_lib, use_fp16_decoding=use_fp16_decoding) - - def forward( - self, - input_ids, - seq_len=None, - attention_mask=None, - top_k=4, - top_p=0.0, - min_length=0, - max_length=256, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - forced_eos_token_id=None, - temperature=0, - repetition_penalty=1.0, - decode_strategy="sampling", - num_return_sequences=1, - **model_kwargs - ): - if input_ids.dtype == paddle.int64: - input_ids = paddle.cast(input_ids, "int32") - - # change top_p to zero if not using top_p sampling for FT - if decode_strategy == "greedy_search": - top_p = 0.0 - top_k = 1 - if top_p == 1.0: - top_p = 0.0 - if seq_len is None: - seq_len = paddle.sum(paddle.cast(input_ids != pad_token_id, dtype="int32"), axis=-1, dtype="int32") - - if num_return_sequences > 1: - input_ids, model_kwargs = self.expand_inputs_for_generation( - input_ids, expand_size=num_return_sequences, seq_len=seq_len, attention_mask=attention_mask - ) - seq_len = model_kwargs["seq_len"] - attention_mask = model_kwargs.get("attention_mask", None) - - return self.decoding( - input_ids, - mem_seq_len=seq_len, - attention_mask=attention_mask, - topk=top_k, - topp=top_p, - max_out_len=max_length, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - forced_eos_token_id=forced_eos_token_id, - temperature=temperature, - repetition_penalty=repetition_penalty, - min_length=min_length, - ) - - generate = forward - - -class FasterCodeGen(CodeGenPreTrainedModel): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False): - super(FasterCodeGen, self).__init__(model.config) - self._model = model - self.use_fp16_decoding = use_fp16_decoding - self.decoding = InferGptJDecoding( - model=model, decoding_lib=decoding_lib, use_fp16_decoding=use_fp16_decoding, transpose_qkv=True - ) - - def forward( - self, - input_ids, - seq_len=None, - attention_mask=None, - top_k=4, - top_p=0.0, - min_length=0, - max_length=256, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - forced_eos_token_id=None, - temperature=0, - repetition_penalty=1.0, - decode_strategy="sampling", - num_return_sequences=1, - **model_kwargs - ): - if input_ids.dtype == paddle.int64: - input_ids = paddle.cast(input_ids, "int32") - - # change top_p to zero if not using top_p sampling for FT - if decode_strategy == "greedy_search": - top_p = 0.0 - top_k = 1 - if top_p == 1.0: - top_p = 0.0 - if seq_len is None: - seq_len = paddle.sum(paddle.cast(input_ids != pad_token_id, dtype="int32"), axis=-1, dtype="int32") - - if num_return_sequences > 1: - input_ids, model_kwargs = self.expand_inputs_for_generation( - input_ids, expand_size=num_return_sequences, seq_len=seq_len, attention_mask=attention_mask - ) - seq_len = model_kwargs["seq_len"] - attention_mask = model_kwargs.get("attention_mask", None) - - return self.decoding( - input_ids, - mem_seq_len=seq_len, - attention_mask=attention_mask, - topk=top_k, - topp=top_p, - max_out_len=max_length, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - forced_eos_token_id=forced_eos_token_id, - temperature=temperature, - repetition_penalty=repetition_penalty, - min_length=min_length, - ) - - generate = forward - - -class FasterPegasus(PegasusPretrainedModel): - enable_faster_encoder_func = enable_fast_encoder - - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False, enable_fast_encoder=False, **kwargs): - 
super(FasterPegasus, self).__init__(model.config) - self.use_fp16_decoding = use_fp16_decoding - self._model = model - self.encoder = model.get_encoder() - self.decoder = model.get_decoder() - self.pad_token_id = model.pegasus.config["pad_token_id"] - self.enable_fast_encoder = enable_fast_encoder - self.trans_out = kwargs.get("trans_out", False) - - self.decoding = InferPegasusDecoding( - model=self._model, - decoding_lib=decoding_lib, - use_fp16_decoding=use_fp16_decoding, - hidden_act=model.pegasus.config["activation_function"], - ) - - # TODO(gongenlei): Support faster_encoder - # if self.enable_fast_encoder: - # # Must use `enable_fast_encoder` in `__init__` when dygraph to static graph. - # self.encoder = FasterPegasus.enable_faster_encoder_func(self.encoder) - - def get_encoder(self): - return self.encoder - - def get_decoder(self): - return self.decoder - - def forward( - self, - input_ids=None, - encoder_output=None, - seq_len=None, - min_length=0, - max_length=256, - num_beams=4, - decode_strategy="beam_search_v3", - decoder_start_token_id=None, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - diversity_rate=0.0, - length_penalty=0.6, - top_k=1, - top_p=0.0, - temperature=1.0, - num_return_sequences=1, - early_stopping=False, - forced_bos_token_id=None, - forced_eos_token_id=None, - **model_kwargs - ): - - bos_token_id = bos_token_id if bos_token_id is not None else getattr(self._model, "bos_token_id", None) - eos_token_id = eos_token_id if eos_token_id is not None else getattr(self._model, "eos_token_id", None) - pad_token_id = pad_token_id if pad_token_id is not None else getattr(self._model, "pad_token_id", None) - decoder_start_token_id = ( - decoder_start_token_id - if decoder_start_token_id is not None - else getattr(self._model, "decoder_start_token_id", None) - ) - - if encoder_output is None: - assert input_ids is not None, "You have to specify either input_ids or encoder_output." - encoder_output = self.prepare_encoder_decoder_kwargs_for_generation(input_ids, model_kwargs)[ - "encoder_output" - ] - - if seq_len is None: - assert input_ids is not None, "You have to specify either input_ids when generating seq_len." 
- seq_len = paddle.sum(paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, dtype="int32") - if self.use_fp16_decoding: - encoder_output = paddle.cast(encoder_output, "float16") - if decode_strategy.startswith("beam_search") and num_beams > 1: - encoder_output, expanded_kwargs = self.expand_inputs_for_generation( - encoder_output, expand_size=num_beams, seq_len=seq_len - ) - seq_len = expanded_kwargs["seq_len"] - elif decode_strategy == "sampling" and num_return_sequences > 1: - encoder_output, expanded_kwargs = self.expand_inputs_for_generation( - encoder_output, expand_size=num_return_sequences, seq_len=seq_len - ) - seq_len = expanded_kwargs["seq_len"] - if decoder_start_token_id is not None: - bos_token_id = decoder_start_token_id - - ids = self.decoding( - enc_output=encoder_output, - memory_seq_lens=seq_len, - beam_size=num_beams, - top_k=top_k, - top_p=top_p, - decoding_strategy=decode_strategy, - max_out_len=max_length, - min_out_len=min_length, - diversity_rate=diversity_rate, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - alpha=length_penalty, - temperature=temperature, - early_stopping=early_stopping, - forced_eos_token_id=forced_eos_token_id, - ) - - if self.trans_out: - if decode_strategy.startswith("beam_search"): - ids = ids.transpose([1, 2, 0]) - else: - ids = ids.transpose([1, 0]) - - return ids - - generate = forward - - -class FasterT5(T5PretrainedModel): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False): - super(FasterT5, self).__init__(model.config) - self.use_fp16_decoding = use_fp16_decoding - self._model = model - if use_fp16_decoding: - weight_attr = paddle.ParamAttr(initializer=nn.initializer.Assign(model.encoder.embed_tokens.weight)) - model.encoder.embed_tokens = nn.Embedding( - *model.encoder.embed_tokens.weight.shape, weight_attr=weight_attr - ) - self.encoder = model.t5.get_encoder() - self.decoder = model.t5.get_decoder() - self.pad_token_id = model.t5.config["pad_token_id"] - - self.decoding = InferT5Decoding( - model=self._model, decoding_lib=decoding_lib, use_fp16_decoding=use_fp16_decoding - ) - - def get_encoder(self): - return self.encoder - - def get_decoder(self): - return self.decoder - - def forward( - self, - input_ids=None, - encoder_output=None, - seq_len=None, - max_length=128, - min_length=0, - top_k=4, - top_p=0.0, - num_beams=4, - decode_strategy="sampling", - decoder_start_token_id=None, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - diversity_rate=0.0, - temperature=1.0, - num_return_sequences=1, - length_penalty=0.6, - early_stopping=False, - forced_eos_token_id=None, - **model_kwargs - ): - - bos_token_id = bos_token_id if bos_token_id is not None else getattr(self._model, "bos_token_id", None) - eos_token_id = eos_token_id if eos_token_id is not None else getattr(self._model, "eos_token_id", None) - pad_token_id = pad_token_id if pad_token_id is not None else getattr(self._model, "pad_token_id", None) - - if encoder_output is None: - assert input_ids is not None, "You have to specify either input_ids or encoder_output." - encoder_output = self.prepare_encoder_decoder_kwargs_for_generation(input_ids, model_kwargs)[ - "encoder_output" - ] - - if isinstance(encoder_output, (list, tuple)): - encoder_output = encoder_output[0] - - if seq_len is None: - assert input_ids is not None, "You have to specify either input_ids when generating seq_len." 
- seq_len = paddle.sum(paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, dtype="int32") - if self.use_fp16_decoding: - encoder_output = paddle.cast(encoder_output, "float16") - if decode_strategy.startswith("beam_search") and num_beams > 1: - encoder_output, expanded_kwargs = self.expand_inputs_for_generation( - encoder_output, expand_size=num_beams, seq_len=seq_len - ) - seq_len = expanded_kwargs["seq_len"] - elif decode_strategy == "sampling" and num_return_sequences > 1: - encoder_output, expanded_kwargs = self.expand_inputs_for_generation( - encoder_output, expand_size=num_return_sequences, seq_len=seq_len - ) - seq_len = expanded_kwargs["seq_len"] - if decoder_start_token_id is not None: - bos_token_id = decoder_start_token_id - - return self.decoding( - enc_output=encoder_output, - memory_seq_lens=seq_len, - beam_size=num_beams, - top_k=top_k, - top_p=top_p, - decoding_strategy=decode_strategy, - max_out_len=max_length, - diversity_rate=diversity_rate, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - alpha=length_penalty, - temperature=temperature, - early_stopping=early_stopping, - ) - - generate = forward diff --git a/paddlenlp/ops/patches/FasterTransformer/CMakeLists.txt b/paddlenlp/ops/patches/FasterTransformer/CMakeLists.txt deleted file mode 100644 index 9e4e460d265a..000000000000 --- a/paddlenlp/ops/patches/FasterTransformer/CMakeLists.txt +++ /dev/null @@ -1,418 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-cmake_minimum_required(VERSION 3.8 FATAL_ERROR) -project(FasterTransformer LANGUAGES CXX CUDA) - -find_package(CUDA 10.1 REQUIRED) - -find_program(CCACHE_PROGRAM ccache) -if(CCACHE_PROGRAM) - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) -endif() - -option(BUILD_PD "Build in PaddlePaddle mode" ON) -option(BUILD_GPT "Build project with gpt" ON) -option(BUILD_ENCODER "Build project with encoder" ON) - -if(BUILD_ENCODER) - add_definitions(-DBUILD_ENCODER) -endif() - -if(BUILD_GPT) - message(STATUS "Add DBUILD_GPT, requires MPI and NCCL") - add_definitions("-DBUILD_GPT") - set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) - find_package(MPI REQUIRED) - find_package(NCCL REQUIRED) - #if(${NCCL_VERSION} LESS 2.7) - # message(FATAL_ERROR "NCCL_VERSION ${NCCL_VERSION} is less than 2.7") - #endif() - set(CMAKE_MODULE_PATH "") # prevent the bugs for pytorch building -endif() - -set(CXX_STD "17" CACHE STRING "C++ standard") - -set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR}) - -list(APPEND CMAKE_MODULE_PATH ${CUDA_PATH}/lib64) - -if (${CUDA_VERSION} GREATER_EQUAL 11.0) - message(STATUS "Add DCUDA11_MODE") - add_definitions("-DCUDA11_MODE") -endif() - -# profiling -option(USE_NVTX "Whether or not to use nvtx" OFF) -if(USE_NVTX) - message(STATUS "NVTX is enabled.") - add_definitions("-DUSE_NVTX") -endif() - -# setting compiler flags -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall -ldl") - -# if (SM STREQUAL 80 OR -# SM STREQUAL 86 OR -# SM STREQUAL 70 OR -# SM STREQUAL 75 OR -# SM STREQUAL 61 OR -# SM STREQUAL 60) -# #set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\" -rdc=true") -# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\"") -# if (SM STREQUAL 70 OR SM STREQUAL 75 OR SM STREQUAL 80 OR SM STREQUAL 86) -# set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA") -# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA") -# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA") -# endif() -# message("-- Assign GPU architecture (sm=${SM})") - -# else() -# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \ -# -gencode=arch=compute_70,code=\\\"sm_70,compute_70\\\" \ -# -gencode=arch=compute_75,code=\\\"sm_75,compute_75\\\" \ -# ") -# # -rdc=true") -# set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA") -# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA") -# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA") -# message("-- Assign GPU architecture (sm=70,75)") -# endif() - -set(SM_SETS 52 60 61 70 75 80) -set(USING_WMMA False) -set(FIND_SM False) - -foreach(SM_NUM IN LISTS SM_SETS) - string(FIND "${SM}" "${SM_NUM}" SM_POS) - if(SM_POS GREATER -1) - if(FIND_SM STREQUAL False) - set(ENV{TORCH_CUDA_ARCH_LIST} "") - endif() - set(FIND_SM True) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM_NUM},code=\\\"sm_${SM_NUM},compute_${SM_NUM}\\\"") - - if (SM_NUM STREQUAL 70 OR SM_NUM STREQUAL 75 OR SM_NUM STREQUAL 80 OR SM_NUM STREQUAL 86) - set(USING_WMMA True) - endif() - - set(CMAKE_CUDA_ARCHITECTURES ${SM_NUM}) - message("-- Assign GPU architecture (sm=${SM_NUM})") - endif() -endforeach() - -if(USING_WMMA STREQUAL True) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA") - message("-- Use WMMA") -endif() - -if(NOT (FIND_SM STREQUAL True)) - 
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \ - -gencode=arch=compute_70,code=\\\"sm_70,compute_70\\\" \ - -gencode=arch=compute_75,code=\\\"sm_75,compute_75\\\" \ - -gencode=arch=compute_80,code=\\\"sm_80,compute_80\\\" \ - ") - # -rdc=true") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA") - - set(CMAKE_CUDA_ARCHITECTURES 70 75 80) - message("-- Assign GPU architecture (sm=70,75,80)") -endif() - -set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wall -O0") -set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -O0") -# set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall --ptxas-options=-v --resource-usage") -set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall") - -set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std=c++{CXX_STD}") -set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std=gnu++{CXX_STD}") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++${CXX_STD}") - -set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") -# set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3 --ptxas-options=--verbose") -set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3") - -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) - -set(COMMON_HEADER_DIRS - ${PROJECT_SOURCE_DIR} - ${CUDA_PATH}/include -) - -set(COMMON_LIB_DIRS - ${CUDA_PATH}/lib64 -) - -if(NOT PY_CMD) - set(PYTHON_PATH "python" CACHE STRING "Python path") -else() - set(PYTHON_PATH ${PY_CMD} CACHE STRING "Python path") -endif() - -add_definitions(-w) - -if(BUILD_PD) - add_definitions(-DPADDLE_WITH_CUDA) - - if(ON_INFER) - add_definitions(-DPADDLE_ON_INFERENCE) - - link_directories(${COMMON_LIB_DIRS}) - - if(NOT WITH_STATIC_LIB) - add_definitions("-DPADDLE_WITH_SHARED_LIB") - else() - # PD_INFER_DECL is mainly used to set the dllimport/dllexport attribute in dynamic library mode. - # Set it to empty in static library mode to avoid compilation issues. 
- add_definitions("/DPD_INFER_DECL=") - endif() - - macro(safe_set_static_flag) - foreach(flag_var - CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE - CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) - if(${flag_var} MATCHES "/MD") - string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") - endif(${flag_var} MATCHES "/MD") - endforeach(flag_var) - endmacro() - - if(NOT DEFINED PADDLE_LIB) - message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") - endif() - - include_directories("${PADDLE_LIB}/paddle/include/") - set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/") - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include") - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/include") - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") - if (WITH_ONNXRUNTIME) - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include") - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include") - endif() - - link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") - link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/lib") - link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") - link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") - link_directories("${PADDLE_LIB}/paddle/lib") - if (WITH_ONNXRUNTIME) - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib") - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib") - endif() - - if(WITH_MKL) - set(FLAG_OPENMP "-fopenmp") - endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAG_OPENMP}") - - if (USE_TENSORRT AND WITH_GPU) - set(TENSORRT_ROOT "" CACHE STRING "The root directory of TensorRT library") - if("${TENSORRT_ROOT}" STREQUAL "") - message(FATAL_ERROR "The TENSORRT_ROOT is empty, you must assign it a value with CMake command. 
Such as: -DTENSORRT_ROOT=TENSORRT_ROOT_PATH ") - endif() - set(TENSORRT_INCLUDE_DIR ${TENSORRT_ROOT}/include) - set(TENSORRT_LIB_DIR ${TENSORRT_ROOT}/lib) - endif() - - if (USE_TENSORRT AND WITH_GPU) - include_directories("${TENSORRT_INCLUDE_DIR}") - link_directories("${TENSORRT_LIB_DIR}") - endif() - - if(WITH_MKL) - set(MATH_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mklml") - include_directories("${MATH_LIB_PATH}/include") - set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn") - if(EXISTS ${MKLDNN_PATH}) - include_directories("${MKLDNN_PATH}/include") - set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) - endif() - else() - set(OPENBLAS_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}openblas") - include_directories("${OPENBLAS_LIB_PATH}/include/openblas") - endif() - - else() - execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import paddle; print(paddle.sysconfig.get_include())" - RESULT_VARIABLE _INC_PYTHON_SUCCESS - OUTPUT_VARIABLE _INC_PYTHON_VALUES) - if (NOT _INC_PYTHON_SUCCESS MATCHES 0) - message(FATAL_ERROR "Python config Error.") - endif() - string(REGEX REPLACE ";" "\\\\;" _INC_PYTHON_VALUES ${_INC_PYTHON_VALUES}) - string(REGEX REPLACE "\n" ";" _INC_PYTHON_VALUES ${_INC_PYTHON_VALUES}) - list(GET _INC_PYTHON_VALUES 0 PY_INCLUDE_DIR) - - list(APPEND COMMON_HEADER_DIRS ${PY_INCLUDE_DIR}) - list(APPEND COMMON_HEADER_DIRS ${PY_INCLUDE_DIR}/third_party) - - execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import paddle; print(paddle.sysconfig.get_lib())" - RESULT_VARIABLE _LIB_PYTHON_SUCCESS - OUTPUT_VARIABLE _LIB_PYTHON_VALUES) - if (NOT _LIB_PYTHON_SUCCESS MATCHES 0) - message(FATAL_ERROR "Python config Error.") - endif() - string(REGEX REPLACE ";" "\\\\;" _LIB_PYTHON_VALUES ${_LIB_PYTHON_VALUES}) - string(REGEX REPLACE "\n" ";" _LIB_PYTHON_VALUES ${_LIB_PYTHON_VALUES}) - list(GET _LIB_PYTHON_VALUES 0 PY_LIB_DIR) - list(APPEND COMMON_LIB_DIRS ${PY_LIB_DIR}) - - include_directories(${PY_INCLUDE_DIR}) - include_directories(${PY_INCLUDE_DIR}\third_party) - - endif() -endif() - -if(BUILD_GPT) - list(APPEND COMMON_HEADER_DIRS ${NCCL_INCLUDE_DIRS}) - get_filename_component(NCCL_LIB_DIRS ${NCCL_LIBRARIES} DIRECTORY) - list(APPEND COMMON_LIB_DIRS ${NCCL_LIB_DIRS}) -endif() - -list(APPEND COMMON_HEADER_DIRS ${MPI_INCLUDE_PATH}) - -include_directories( - ${COMMON_HEADER_DIRS} -) - -list(APPEND COMMON_LIB_DIRS /usr/local/mpi/lib) - -link_directories( - ${COMMON_LIB_DIRS} -) - -add_subdirectory(fastertransformer) -add_subdirectory(tools) -# add_subdirectory(sample) - -######################################## - -if(BUILD_GPT) -# Following feature requires cmake 3.15 -# TODO Remove this part or modify such that we can run it under cmake 3.10 -cmake_minimum_required(VERSION 3.15 FATAL_ERROR) -add_library(transformer-static STATIC - $ - $ - $ - $ - $ - $ - # trt_fused_multi_head_attention, gpt_triton_backend have been removed to - # resolve encoder ON_INFER compiling issue. 
- # $ - $ - $ - $ - $ - $ - $ - $) -set_property(TARGET transformer-static PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET transformer-static PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(transformer-static PUBLIC -lcublas -lcudart -lcurand -lnccl -lmpi nvtx_utils) - -add_library(transformer-shared SHARED - $ - $ - $ - $ - $ - $ - # $ - $ - $ - $ - $ - $ - $ - $) - # $) -## add_library(transformer-shared SHARED $) -set_target_properties(transformer-shared PROPERTIES POSITION_INDEPENDENT_CODE ON) -set_target_properties(transformer-shared PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) -set_target_properties(transformer-shared PROPERTIES LINKER_LANGUAGE CXX) -target_link_libraries(transformer-shared PUBLIC ${NCCL_LIBRARIES} ${MPI_LIBRARIES} -lcublas -lcublasLt -lcudart -lcurand ) - -include(GNUInstallDirs) -set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/FasterTransformer) - -include(CMakePackageConfigHelpers) -configure_package_config_file( - ${CMAKE_CURRENT_LIST_DIR}/cmake/FasterTransformerConfig.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/FasterTransformerConfig.cmake - INSTALL_DESTINATION ${INSTALL_CONFIGDIR} -) - -install( - FILES - ${CMAKE_CURRENT_BINARY_DIR}/FasterTransformerConfig.cmake - DESTINATION ${INSTALL_CONFIGDIR} -) - -install( - TARGETS - transformer-shared - EXPORT - transformer-shared-targets - LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer - ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer -) - -install( - EXPORT - transformer-shared-targets - FILE - FasterTransformerTargets.cmake - DESTINATION - ${INSTALL_CONFIGDIR} -) - -file(GLOB_RECURSE HEADER_FILES "*.h" "*.hpp" "*.cuh") -foreach ( file ${HEADER_FILES} ) - file( RELATIVE_PATH rfile ${CMAKE_CURRENT_SOURCE_DIR} ${file} ) - get_filename_component( dir ${rfile} DIRECTORY ) - install( FILES ${file} DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer/include/${dir} ) -endforeach() - - -################################################################################ -add_executable(gpt sample/cpp/gpt_sample.cc ) -target_link_libraries(gpt PUBLIC -lcublas -lcublasLt -lcudart -lcurand -lnccl -lmpi transformer-static) -# target_link_libraries(gpt PUBLIC -lcublas -lcublasLt -lcudart -lcurand -lnccl -lmpi decoder decoding) -export( - EXPORT - transformer-shared-targets - FILE - ${CMAKE_CURRENT_BINARY_DIR}/FasterTransformerTargets.cmake - NAMESPACE - TritonCore:: -) - -export(PACKAGE FasterTransformer) - -endif() # BUILD_GPT diff --git a/paddlenlp/ops/patches/FasterTransformer/fastertransformer/CMakeLists.txt b/paddlenlp/ops/patches/FasterTransformer/fastertransformer/CMakeLists.txt deleted file mode 100644 index 12fbd83615b5..000000000000 --- a/paddlenlp/ops/patches/FasterTransformer/fastertransformer/CMakeLists.txt +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-cmake_minimum_required(VERSION 3.8) -add_subdirectory(cuda) -add_subdirectory(utils) -add_subdirectory(gemm_test) -if(BUILD_TF) - add_subdirectory(tf_op) -endif() - -if(BUILD_PYT) - add_subdirectory(th_op) -endif() - -# add_subdirectory(trt_fused_multihead_attention) -# add_subdirectory(triton_backend) diff --git a/paddlenlp/ops/patches/FasterTransformer/fastertransformer/bert_encoder_transformer.h b/paddlenlp/ops/patches/FasterTransformer/fastertransformer/bert_encoder_transformer.h deleted file mode 100644 index 8bd5738f00ee..000000000000 --- a/paddlenlp/ops/patches/FasterTransformer/fastertransformer/bert_encoder_transformer.h +++ /dev/null @@ -1,1123 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * BERT Encoder transformer - **/ - -#pragma once - -#include -#include "fastertransformer/cuda/cuda_int8_kernels.h" -#include "fastertransformer/cuda/cuda_kernels.h" -#include "fastertransformer/cuda/open_attention.h" -#include "fastertransformer/gemm_test/encoder_gemm_func.h" -#include "fastertransformer/gemm_test/encoder_igemm_func.h" -#include "fastertransformer/utils/allocator.h" -#include "fastertransformer/utils/common_structure.h" -#include "fastertransformer/utils/functions.h" - -namespace fastertransformer { - -template -class BertInitParam { -public: - const T *from_tensor = nullptr; - const T *to_tensor = nullptr; - - AttentionWeight self_attention; - const T *attr_mask = nullptr; - LayerNormWeight self_layernorm; - - FFNWeight ffn; - LayerNormWeight ffn_layernorm; - - T *transformer_out; - cublasHandle_t cublas_handle = nullptr; - cublasLtHandle_t cublaslt_handle = nullptr; - cudaStream_t stream = 0; - - const int *sequence_id_offset = nullptr; - int valid_word_num = -1; - int layer_idx = 0; - int layer_num = 12; - - // Part 1: - // First 80 are for activation amaxs. 
For each activation amax, there are 4 - // values: amax, amax/127.0f, amax/127.0f/127.0f, 127.0f/amax -- input_amax - // 0-3 , Q_aftergemm_amax 4-7, Qbias_amax 8-11, K_aftergemm_amax 12-15, - // Kbias_amax 16-19, V_aftergemm_amax 20-23, Vbias_amax 24-27, bmm1_amax - // 28-31, Softmax_amax 32-35, bmm2_amax 36-39, Proj_aftergemm_scale 40-43, - // ProjBiasNorm_amax 44-47, FC1_aftergemm_amax 48-51, F1Bias_amax 52-55, - // FC2_aftergemm_amax 56-59, F2BiasNorm_amax 60-63, reserve 64-79 - // Part 2: - // Kernel amaxs, for each kernel amax list, there are output_channel values : - // query_weight_amax_list, key_weight_amax_list, value_weight_amax_list, - // proj_weight_amax_list, FC1_weight_amax_list, FC2_weight_amax_list - // Part 3: - // Int8 gemm deQFactor list (8 values): Q_deQ_scale, K_deQ_scale, - // V_deQ_scale, bmm1_deQ_scale, bmm2_deQ_scale, FC0_deQ_scale, FC1_deQ_scale, - // FC2_deQ_scale - // Part 4: - // Amax used in trt fused mha kernel (3 values) : QKVbias_amax, Softmax_amax, - // bmm2_amax - const float *amaxList = nullptr; - const int *trt_seqlen_offset = nullptr; - int trt_seqlen_size = -1; -}; - -template class MultiHeadAttention_> -class BertEncoderTransformerTraits; - -template