From 8ccbb376dafbc49de537302ab868782925e55a6b Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Wed, 19 Jun 2024 11:57:52 +0800 Subject: [PATCH 1/2] remove fast generation --- fast_generation/README.md | 305 -- fast_generation/perf/README.md | 250 - fast_generation/perf/bart_perf.py | 170 - fast_generation/perf/codegen_perf.py | 175 - fast_generation/perf/gpt_perf.py | 155 - fast_generation/perf/opt_perf.py | 162 - fast_generation/perf/pegasus_perf.py | 168 - fast_generation/perf/run_perf_bart.sh | 76 - fast_generation/perf/run_perf_codegen.sh | 64 - fast_generation/perf/run_perf_gpt.sh | 52 - fast_generation/perf/run_perf_opt.sh | 52 - fast_generation/perf/run_perf_pegasus.sh | 45 - fast_generation/samples/codegen_16b_sample.py | 38 - fast_generation/samples/codegen_sample.py | 37 - fast_generation/samples/gpt_mp_sample.py | 132 - fast_generation/samples/gpt_sample.py | 35 - fast_generation/samples/gptj_sample.py | 42 - fast_generation/samples/mbart_sample.py | 58 - fast_generation/samples/opt_sample.py | 45 - fast_generation/samples/pegasus_sample.py | 36 - fast_generation/samples/plato_sample.py | 62 - fast_generation/samples/plato_xl_sample.py | 162 - fast_generation/samples/t5_sample.py | 58 - fast_generation/samples/unimo_text_sample.py | 59 - paddlenlp/ops/CMakeLists.txt | 490 -- paddlenlp/ops/__init__.py | 13 - paddlenlp/ops/cmake/FindNCCL.cmake | 165 - paddlenlp/ops/cmake/external/boost.cmake | 64 - paddlenlp/ops/ext_utils.py | 367 -- paddlenlp/ops/fast_transformer/CMakeLists.txt | 14 - paddlenlp/ops/fast_transformer/__init__.py | 13 - .../sample/bart_decoding_sample.py | 132 - .../sample/bart_export_model_sample.py | 111 - .../fast_transformer/sample/bart_inference.py | 107 - .../sample/config/decoder.sample.yaml | 39 - .../sample/config/decoding.sample.yaml | 44 - .../fast_transformer/sample/decoder_sample.py | 145 - .../sample/decoding_sample.py | 99 - .../sample/encoder_decoder_sample.py | 101 - .../sample/encoder_decoding_sample.py | 128 - .../sample/gpt_export_model_sample.py | 106 - .../ops/fast_transformer/sample/gpt_sample.py | 112 - .../sample/mbart_decoding_sample.py | 138 - .../sample/mbart_export_model_sample.py | 118 - .../sample/mbart_inference.py | 108 - .../sample/plato_export_model_sample.py | 120 - .../sample/plato_inference.py | 108 - .../sample/t5_export_model_sample.py | 113 - .../fast_transformer/sample/t5_inference.py | 94 - .../sample/unimo_text_export_model_sample.py | 111 - .../sample/unimo_text_inference.py | 92 - .../ops/fast_transformer/src/CMakeLists.txt | 336 -- .../ops/fast_transformer/src/cublas_handle.cc | 28 - .../ops/fast_transformer/src/cublas_handle.h | 58 - .../ops/fast_transformer/src/demo/gpt.cc | 321 -- .../ops/fast_transformer/src/demo/helper.h | 66 - .../src/demo/transformer_e2e.cc | 281 - .../ops/fast_transformer/src/demo/utf8.h | 34 - .../fast_transformer/src/demo/utf8/checked.h | 319 -- .../ops/fast_transformer/src/demo/utf8/core.h | 387 -- .../fast_transformer/src/demo/utf8/cpp11.h | 103 - .../fast_transformer/src/demo/utf8/cpp17.h | 103 - .../src/demo/utf8/unchecked.h | 257 - .../src/fusion_bart_decoding_op.cc | 352 -- .../src/fusion_bart_decoding_op.cu | 581 --- .../src/fusion_bart_decoding_op.h | 85 - .../fast_transformer/src/fusion_decoder_op.cc | 228 - .../fast_transformer/src/fusion_decoder_op.cu | 374 -- .../fast_transformer/src/fusion_decoder_op.h | 72 - .../src/fusion_decoding_op.cc | 337 -- .../src/fusion_decoding_op.cu | 538 -- .../fast_transformer/src/fusion_decoding_op.h | 84 - .../fast_transformer/src/fusion_encoder_op.cc 
| 193 - .../fast_transformer/src/fusion_encoder_op.cu | 443 -- .../fast_transformer/src/fusion_encoder_op.h | 63 - .../src/fusion_force_decoding_op.cc | 340 -- .../src/fusion_force_decoding_op.cu | 572 --- .../src/fusion_force_decoding_op.h | 85 - .../ops/fast_transformer/src/fusion_gpt_op.cc | 223 - .../ops/fast_transformer/src/fusion_gpt_op.cu | 378 -- .../ops/fast_transformer/src/fusion_gpt_op.h | 71 - .../fast_transformer/src/fusion_gptj_op.cc | 203 - .../fast_transformer/src/fusion_gptj_op.cu | 334 -- .../ops/fast_transformer/src/fusion_gptj_op.h | 66 - .../src/fusion_mbart_decoding_op.cc | 368 -- .../src/fusion_mbart_decoding_op.cu | 596 --- .../src/fusion_mbart_decoding_op.h | 88 - .../fast_transformer/src/fusion_miro_op.cc | 427 -- .../fast_transformer/src/fusion_miro_op.cu | 710 --- .../ops/fast_transformer/src/fusion_miro_op.h | 102 - .../ops/fast_transformer/src/fusion_opt_op.cc | 227 - .../ops/fast_transformer/src/fusion_opt_op.cu | 384 -- .../ops/fast_transformer/src/fusion_opt_op.h | 71 - .../src/fusion_pegasus_decoding_op.cc | 372 -- .../src/fusion_pegasus_decoding_op.cu | 554 -- .../src/fusion_pegasus_decoding_op.h | 86 - .../src/fusion_t5_decoding_op.cc | 377 -- .../src/fusion_t5_decoding_op.cu | 635 --- .../src/fusion_t5_decoding_op.h | 91 - .../src/fusion_unified_decoding_op.cc | 417 -- .../src/fusion_unified_decoding_op.cu | 693 --- .../src/fusion_unified_decoding_op.h | 100 - .../fast_transformer/src/parallel_utils.cc | 148 - .../ops/fast_transformer/src/parallel_utils.h | 102 - .../ops/fast_transformer/src/pd_traits.h | 37 - paddlenlp/ops/fast_transformer/src/utils.cc | 25 - paddlenlp/ops/fast_transformer/src/utils.h | 21 - .../fast_transformer/transformer/__init__.py | 13 - .../fast_transformer/transformer/decoder.py | 586 --- .../fast_transformer/transformer/decoding.py | 4550 ----------------- .../fast_transformer/transformer/encoder.py | 456 -- .../transformer/fast_transformer.py | 2021 -------- .../patches/FasterTransformer/CMakeLists.txt | 418 -- .../fastertransformer/CMakeLists.txt | 27 - .../bert_encoder_transformer.h | 1123 ---- .../cuda/attention_kernels.cu | 154 - .../cuda/attention_kernels.cuh | 34 - .../fastertransformer/cuda/cuda_kernels.cu | 95 - .../fastertransformer/cuda/cuda_kernels.h | 198 - .../cuda/decoding_kernels.cu | 713 --- .../cuda/lightseq_kernels.cu | 56 - .../cuda/masked_multihead_attention.cu | 1504 ------ .../cuda/masked_multihead_attention.h | 115 - .../cuda/masked_multihead_attention_utils.h | 265 - .../cuda/online_softmax_beamsearch_kernels.cu | 1559 ------ .../fastertransformer/cuda/open_attention.h | 1137 ---- .../fastertransformer/cuda/open_decoder.cu | 646 --- .../fastertransformer/cuda/open_decoder.cuh | 123 - .../fastertransformer/cuda/topk_kernels.cu | 2643 ---------- .../fastertransformer/cuda/topk_kernels.cuh | 87 - .../cuda/transformer_decoder.cu | 643 --- .../cuda/transformer_decoding_kernels.cu | 671 --- .../cuda/transformer_kernels.cu | 985 ---- .../cuda/transformer_kernels.cuh | 48 - .../fastertransformer/decoding_beamsearch.h | 1445 ------ .../fastertransformer/decoding_sampling.h | 1319 ----- .../FasterTransformer/fastertransformer/gpt.h | 895 ---- .../fastertransformer/gptj.h | 946 ---- .../fastertransformer/open_decoder.h | 2166 -------- .../FasterTransformer/fastertransformer/opt.h | 927 ---- .../fastertransformer/standard_encoder.h | 1013 ---- .../fastertransformer/t5_beamsearch.h | 922 ---- .../fastertransformer/t5_sampling.h | 780 --- .../fastertransformer/utils/allocator.h | 120 - 
.../fastertransformer/utils/arguments.h | 210 - .../fastertransformer/utils/common.h | 231 - .../utils/common_structure.h | 80 - 147 files changed, 52160 deletions(-) delete mode 100644 fast_generation/README.md delete mode 100644 fast_generation/perf/README.md delete mode 100644 fast_generation/perf/bart_perf.py delete mode 100644 fast_generation/perf/codegen_perf.py delete mode 100644 fast_generation/perf/gpt_perf.py delete mode 100644 fast_generation/perf/opt_perf.py delete mode 100644 fast_generation/perf/pegasus_perf.py delete mode 100644 fast_generation/perf/run_perf_bart.sh delete mode 100644 fast_generation/perf/run_perf_codegen.sh delete mode 100644 fast_generation/perf/run_perf_gpt.sh delete mode 100644 fast_generation/perf/run_perf_opt.sh delete mode 100644 fast_generation/perf/run_perf_pegasus.sh delete mode 100644 fast_generation/samples/codegen_16b_sample.py delete mode 100644 fast_generation/samples/codegen_sample.py delete mode 100644 fast_generation/samples/gpt_mp_sample.py delete mode 100644 fast_generation/samples/gpt_sample.py delete mode 100644 fast_generation/samples/gptj_sample.py delete mode 100644 fast_generation/samples/mbart_sample.py delete mode 100644 fast_generation/samples/opt_sample.py delete mode 100644 fast_generation/samples/pegasus_sample.py delete mode 100644 fast_generation/samples/plato_sample.py delete mode 100644 fast_generation/samples/plato_xl_sample.py delete mode 100644 fast_generation/samples/t5_sample.py delete mode 100644 fast_generation/samples/unimo_text_sample.py delete mode 100644 paddlenlp/ops/CMakeLists.txt delete mode 100644 paddlenlp/ops/cmake/FindNCCL.cmake delete mode 100644 paddlenlp/ops/cmake/external/boost.cmake delete mode 100644 paddlenlp/ops/ext_utils.py delete mode 100644 paddlenlp/ops/fast_transformer/CMakeLists.txt delete mode 100644 paddlenlp/ops/fast_transformer/__init__.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/bart_decoding_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/bart_export_model_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/bart_inference.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/config/decoder.sample.yaml delete mode 100644 paddlenlp/ops/fast_transformer/sample/config/decoding.sample.yaml delete mode 100644 paddlenlp/ops/fast_transformer/sample/decoder_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/decoding_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/encoder_decoder_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/encoder_decoding_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/gpt_export_model_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/gpt_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/mbart_decoding_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/mbart_export_model_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/mbart_inference.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/plato_export_model_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/plato_inference.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/t5_export_model_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/t5_inference.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/unimo_text_export_model_sample.py delete mode 100644 paddlenlp/ops/fast_transformer/sample/unimo_text_inference.py delete mode 100644 
paddlenlp/ops/fast_transformer/src/CMakeLists.txt delete mode 100644 paddlenlp/ops/fast_transformer/src/cublas_handle.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/cublas_handle.h delete mode 100644 paddlenlp/ops/fast_transformer/src/demo/gpt.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/demo/helper.h delete mode 100644 paddlenlp/ops/fast_transformer/src/demo/transformer_e2e.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/demo/utf8.h delete mode 100644 paddlenlp/ops/fast_transformer/src/demo/utf8/checked.h delete mode 100644 paddlenlp/ops/fast_transformer/src/demo/utf8/core.h delete mode 100644 paddlenlp/ops/fast_transformer/src/demo/utf8/cpp11.h delete mode 100644 paddlenlp/ops/fast_transformer/src/demo/utf8/cpp17.h delete mode 100644 paddlenlp/ops/fast_transformer/src/demo/utf8/unchecked.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_decoder_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_decoder_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_decoder_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_decoding_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_decoding_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_decoding_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_encoder_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_encoder_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_encoder_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_gpt_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_gpt_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_gpt_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_gptj_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_gptj_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_gptj_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_miro_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_miro_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_miro_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_opt_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_opt_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_opt_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.cu delete mode 100644 
paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.cu delete mode 100644 paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.h delete mode 100644 paddlenlp/ops/fast_transformer/src/parallel_utils.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/parallel_utils.h delete mode 100644 paddlenlp/ops/fast_transformer/src/pd_traits.h delete mode 100644 paddlenlp/ops/fast_transformer/src/utils.cc delete mode 100644 paddlenlp/ops/fast_transformer/src/utils.h delete mode 100644 paddlenlp/ops/fast_transformer/transformer/__init__.py delete mode 100644 paddlenlp/ops/fast_transformer/transformer/decoder.py delete mode 100644 paddlenlp/ops/fast_transformer/transformer/decoding.py delete mode 100644 paddlenlp/ops/fast_transformer/transformer/encoder.py delete mode 100644 paddlenlp/ops/fast_transformer/transformer/fast_transformer.py delete mode 100644 paddlenlp/ops/patches/FasterTransformer/CMakeLists.txt delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/CMakeLists.txt delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/bert_encoder_transformer.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/attention_kernels.cu delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/attention_kernels.cuh delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/cuda_kernels.cu delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/cuda_kernels.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/decoding_kernels.cu delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/lightseq_kernels.cu delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/masked_multihead_attention.cu delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/masked_multihead_attention.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/masked_multihead_attention_utils.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/online_softmax_beamsearch_kernels.cu delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/open_attention.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/open_decoder.cu delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/open_decoder.cuh delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/topk_kernels.cu delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/topk_kernels.cuh delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/transformer_decoder.cu delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/transformer_decoding_kernels.cu delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/transformer_kernels.cu delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/cuda/transformer_kernels.cuh delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/decoding_beamsearch.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/decoding_sampling.h delete mode 100644 
paddlenlp/ops/patches/FasterTransformer/fastertransformer/gpt.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/gptj.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/open_decoder.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/opt.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/standard_encoder.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/t5_beamsearch.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/t5_sampling.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/utils/allocator.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/utils/arguments.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/utils/common.h delete mode 100644 paddlenlp/ops/patches/FasterTransformer/fastertransformer/utils/common_structure.h diff --git a/fast_generation/README.md b/fast_generation/README.md deleted file mode 100644 index fe699a9c7271..000000000000 --- a/fast_generation/README.md +++ /dev/null @@ -1,305 +0,0 @@ -# FastGeneration - -FastGeneration是PaddleNLP v2.2版本加入的文本生成高性能加速功能,其支持GPT、OPT、BART、UnifiedTransformer等多种NLP生成类预训练模型,并且支持多种解码策略,可以用于机器翻译、文本续写、文本摘要、对话生成等多种NLG任务的GPU场景预测加速。 - -功能底层依托于[NV FasterTransformer](https://github.com/NVIDIA/FasterTransformer),该库针对标准的Transformer和GPT模型、beam search和sampling解码策略进行了性能优化。PaddleNLP FastGeneration在其之上进行了扩展,实现了更多模型和生成策略的优化支持,并将功能入口封装于`model.generate`函数。功能的开启和关闭通过传入`use_fast`参数进行控制(默认为关闭状态)。通过调用generate函数,用户可以简单的使用模型高性能推理功能。下图展示了FastGeneration的启动流程: - - -
-(figure: FastGeneration workflow diagram)
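The gist of that `model.generate` entry point can be sketched in a few lines (adapted from the GPT sample shown later in this README; the checkpoint name and prompt are illustrative only):

```python
import paddle
from paddlenlp.transformers import GPTChineseTokenizer, GPTLMHeadModel

# Illustrative checkpoint; the README's sample uses a Chinese GPT model.
model_name = "gpt-cpm-small-cn-distill"
tokenizer = GPTChineseTokenizer.from_pretrained(model_name)
model = GPTLMHeadModel.from_pretrained(model_name)
model.eval()

# use_fast=True asks generate() to run the FastGeneration kernels; when the
# decoding settings are not supported it falls back to the stock implementation.
input_ids = paddle.to_tensor([tokenizer("花间一壶酒,独酌无相亲。举杯邀明月,")["input_ids"]])
outputs, _ = model.generate(
    input_ids=input_ids,
    max_length=10,
    decode_strategy="greedy_search",
    use_fast=True,
)
print(tokenizer.convert_ids_to_tokens(outputs[0].tolist()))
```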
- -## Featrues - -- 全面支持生成式预训练模型。包括GPT、OPT、CodeGen、GPTJ、BART、mBART、UnifiedTransformer和UNIMO-text。 -- 支持大多数主流解码策略。包括Beam Search、Sampling、Greedy Search。以及Diverse Sibling Search、Length Penalty等子策略。 -- 解码速度快。最高可达非加速版generate函数的**18倍**。**并支持FP16混合精度计算**。 -- 易用性强。功能的入口为`model.generate`,与非加速版生成api的使用方法相同,当满足加速条件时使用jit即时编译高性能算子并用于生成,不满足则自动切换回非加速版生成api。 -- GPT、UnifiedTransformer和UNIMO-text模型支持高性能并行推理,在具备MPI和NCCL的环境中一行代码即可开启使用,允许通过多张小显存容量的 GPU 使用百亿大模型,预测速度较单卡也进一步提升。百亿模型四卡并行高性能推理速度达单卡高性能推理速度2+倍。 - -### Inference Model Support -下表为PaddleNLP FastGeneration对预训练模型和解码策略的支持情况(GPU)。 - -| Model Name | GPT2 | OPT | CodeGen| GPTJ| BART | mBART | UnifiedTransformer | -|------------------------|---------|---------| ---------| ---------|-----------------|-----------------|--------------------| -| Model Structure | Decoder | Decoder |Decoder|Decoder| Encoder-Decoder | Encoder-Decoder | Prefix-LM | -| Beam Search | ❌ | ❌ |❌|❌| ✅ | ✅ | ✅ | -| Top-K Sampling | ✅ | ✅ |✅|✅| ✅ | ✅ | ✅ | -| Top-P Sampling | ✅ | ✅ |✅|✅| ✅ | ✅ | ✅ | -| Diverse Sibling Search | ❌ | ❌ |❌|❌| ✅ | ✅ | ✅ | -| Forced Decoding | ❌ | ❌ |❌|❌| ❌ | ✅ | ❌ | -| Length Penalty | ❌ | ❌ |❌|❌| ✅ | ✅ | ✅ | -| Temperature | ✅ | ✅ |✅|✅| ✅ | ✅ | ✅ | -| Repetition Penalty | ✅ | ✅ |✅|✅| ❌ | ❌ | ❌ | - -## Performence - -FastGeneration的高性能解码相比原版generate方法加速明显,并且与竞品相比有也有极大的速度优势。以下为性能对比图: - -- **batch_size = 4, out_seq_len = 32** -- Device: Tesla V100-SXM2-16GB -- CUDA version 11.2 -- cudnn version 8 -- torch version 1.10.0+cu113 -- transformers version 4.12.5 - -### **BART** (bart-base, batch_size=4, max_length=32) - -
- -### **GPT** (gpt2, batch_size=4, max_length=32) - -
- -### **OPT** (opt, batch_size=4, max_length=32) - -
- -### **CodeGen:** -* 环境和超参 - - Platform: Tesla V100-SXM2-32GB - - CUDA 10.1 - - CUDNN 7.6.5 - - PaddlePaddle-gpu 2.3.1.post101 - - transformers==4.21.1 - - torch==1.11.0 - - Batch Size: 1 - - Input Length: 60 - - Output Length: 20 -
- -- Platform: A100-40G -
- -### **Pegasus** -* 环境和超参 - - Platform: Tesla V100-SXM2-32GB - - CUDA 10.1 - - CUDNN 7.6.5 - - PaddlePaddle-gpu 2.3.2.post101 - - transformers==4.21.1 - - torch==1.11.0 - - Batch Size: 4 - - Input Length: 60 - - Output Length: 20 - - Decode_strategy: beam search - - num_beams: 4 -
- -更详细的性能数据请参见[这里](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/fast_generation/perf) - -## Quick Start - -### 高性能推理 - -为体现FastGeneration的易用性,我们在`samples`文件夹中内置了几个典型任务示例,下面以基于GPT模型的中文文本续写任务为例: - -```sh -python samples/gpt_sample.py -``` - -如果是第一次执行,PaddleNLP会启动即时编译([JIT Compile](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/new_op/new_custom_op_cn.html#jit-compile))自动编译高性能解码算子。 - -```sh -... -2021-11-17 13:42:56,771 - INFO - execute command: cd /10.2/hub/PaddleNLP/paddlenlp/ops/extenstions && /usr/local/bin/python FasterTransformer_setup.py build -INFO:utils.cpp_extension:execute command: cd /10.2/hub/PaddleNLP/paddlenlp/ops/extenstions && /usr/local/bin/python FasterTransformer_setup.py build -grep: warning: GREP_OPTIONS is deprecated; please use an alias or script -running build -running build_ext --- The C compiler identification is GNU 8.2.0 --- The CXX compiler identification is GNU 8.2.0 --- The CUDA compiler identification is NVIDIA 10.2.89 --- Check for working C compiler: /usr/bin/cc --- Check for working C compiler: /usr/bin/cc -- works --- Detecting C compiler ABI info --- Detecting C compiler ABI info - done --- Detecting C compile features --- Detecting C compile features - done --- Check for working CXX compiler: /usr -... -``` - -编译过程通常会花费几分钟的时间编译只会进行一次,之后再次使用高性能解码就不需要重新编译了,编译完成后会继续运行,可以看到生成的结果如下: - -``` -Model input: 花间一壶酒,独酌无相亲。举杯邀明月, -Result: 对影成三人。 -``` - -打开示例代码 `samples/gpt_sample.py` ,我们可以看到如下代码: - -``` -... -model = GPTLMHeadModel.from_pretrained(model_name) -... -outputs, _ = model.generate( - input_ids=inputs_ids, max_length=10, decode_strategy='greedy_search', - use_fast=True) -... -``` - -可以看到,FastGeneration的使用方法与 `model.generate()` 相同,只需传入输入tensor和解码相关参数即可,使用非常简便。如果要使用非加速版的 `model.generate()` 方法,只需传入 `use_fast=False` 即可,示例如下: - -``` -... -outputs, _ = model.generate( - input_ids=inputs_ids, max_length=10, decode_strategy='greedy_search', use_fast=False) -... -``` - -**NOTE:** 需要注意的是,如果传入 `model.generate()` 的参数不满足高性能版本的要求。程序会做出提示并自动切换为非加速版本,例如我们在上面的例子中传入 `min_length=1` ,会得到如下提示: - -``` -... -[2021-11-17 14:21:06,132] [ WARNING] - 'min_length != 0' is not supported yet in the fast version -[2021-11-17 14:21:06,132] [ WARNING] - FastGeneration is not available, and the original version would be used instead. -... 
-``` - -关于该函数的详细介绍可以参考API文档[generate](https://paddlenlp.readthedocs.io/zh/latest/source/paddlenlp.transformers.generation_utils.html)和**Aistudio教程[文本生成任务实战:如何使用PaddleNLP实现各种解码策略](https://aistudio.baidu.com/aistudio/projectdetail/3243711?contributionType=1)。**`samples`文件夹中的其他示例的使用方法相同。 - -### 并行推理 - -FastGeneration对GPT、UnifiedTransformer和UNIMO-text模型在高性能推理的基础上还实现了模型并行功能,其中GPT支持Tensor Parallel和Layer Parallel(Pipeline Parallel)两种并行策略的组合,UnifiedTransformer和UNIMO-text支持Tensor Parallel。关于这两种并行策略的详细介绍请参考[Megatron论文](https://arxiv.org/pdf/2104.04473.pdf)。 - -并行推理当前依赖MPI([MPICH](https://www.mpich.org)、[OpenMPI](https://www.open-mpi.org)均可)和[NCCL](https://developer.nvidia.com/nccl),如需使用还请先安装依赖。在使用时,相比上面的单卡高性能加速代码中也只增加了`from_pretrained`创建加载模型之前加上`enable_ft_para()`一行。 -#### GPT 并行推理 - -GPT高性能并行推理的完整使用示例已在`gpt_mp_sample.py`中提供,按照如下方式启动即可: - -```sh -mpirun -n 4 python gpt_mp_sample.py --tensor_para_size 4 --layer_para_size 1 -``` - -其中`-n 4`指明使用的进程和GPU数,`tensor_para_size`和`tensor_para_size`分别指明Tensor Parallel和Layer Parallel各自使用的GPU数,均设置为1则进行单卡预测。另外加上`--use_fp16`以使用FP16,加上`--profile`可以进行相应设置的性能测试。其他生成相关的参数设置释义如下: -- `model_name` 指定使用的GPT模型,默认为[`gpt-cpm-larg-cn`](https://github.com/TsinghuaAI/CPM-1-Generate)。 -- `max_length` 指定生成的最大长度,默认为50。 -- `topk` 用于Top-K采样策略,采样时将只从概率最高K个token中采样,默认为1,即greedy search。 -- `topp` 用于Top-P采样策略,采样时将只从概率最高且累加概率不超过该值的token中采样,默认为1.0。 -- `temperature` 用于调整预测概率分布,默认为1.0,即保持模型原有的预测概率。 - -使用`gpt-cpm-larg-cn`(2.6B)和默认设置,在V100上4卡Tensor Parallel较单卡高性能预测速度提升约40%。 - -#### PLATO-XL 并行推理 - -PLATO-XL百亿对话预训练模型(11B UnifiedTransformer模型)高性能并行推理的完整使用示例已在`plato_xl_sample.py`中提供(当前只支持Tensor Parallel),按照如下方式启动即可: - -```shell -mpirun -n 4 python plato_xl_sample.py -``` - -参数释义基本同上。在V100上4卡Tensor Parallel高性能预测为单卡高性能预测速度的2倍。 - -## Generate Examples - -除了以上示例之外,PaddleNLP的examples中大多使用了`model.generate`的示例都可以通过调整到合适的参数使用高性能推理。具体如下: - -- [examples/dialogue/unified_transformer](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/dialogue/unified_transformer) -- [model_zoo/gpt/fast_gpt](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/gpt/fast_gpt) -- [examples/text_generation/unimo-text](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/text_generation/unimo-text) -- [examples/text_summarization/bart](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/text_summarization/bart) - -下面我们以基于 `Unified Transformer` 的任务型对话为例展示一下FastGeneration的加速效果: - -打开以上链接中Unified Transformer对应的example,找到README中对应预测的脚本。稍作修改如下: - -```sh -export CUDA_VISIBLE_DEVICES=0 - python infer.py \ - --model_name_or_path=unified_transformer-12L-cn-luge \ - --output_path=./predict.txt \ - --logging_steps=10 \ - --seed=2021 \ - --max_seq_len=512 \ - --max_knowledge_len=256 \ - --batch_size=4 \ - --min_dec_len=1 \ - --max_dec_len=64 \ - --num_return_sequences=1 \ - --decode_strategy=sampling \ - --top_k=5 \ - --faster - --device=gpu -``` - -由于这里只是展示性能,我们直接在 `model_name_or_path` 填入PaddleNLP预训练模型名称 `unified_transformer-12L-cn-luge` 。 - -可以看到,由于该任务为对话任务,我们为了防止模型生成过多安全回复(如:哈哈哈、不错等),保证生成结果具有更多的随机性,我们选择TopK-sampling作为解码策略,并让k=5。 - -打开 `infer.py` ,可以看到我们传入的脚本参数大多都提供给了 `model.generate()` 方法: - -``` -output = model.generate( - input_ids=input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=attention_mask, - seq_len=seq_len, - max_length=args.max_dec_len, - min_length=args.min_dec_len, - decode_strategy=args.decode_strategy, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - length_penalty=args.length_penalty, 
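As a concrete companion to the warning above, a call of the following shape (continuing the README's `gpt_sample.py` snippet, so `model` and `inputs_ids` are assumed to already exist) reproduces the fallback: the parameter is accepted, but decoding runs through the standard path rather than the fused kernels.

```python
# min_length != 0 was not supported by the fast version at the time, so this
# call emits the warning above and falls back to the standard generate() path.
outputs, _ = model.generate(
    input_ids=inputs_ids,
    max_length=10,
    min_length=1,
    decode_strategy='greedy_search',
    use_fast=True,
)
```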
- early_stopping=args.early_stopping, - num_return_sequences=args.num_return_sequences, - use_fp16_decoding=args.use_fp16_decoding, - use_fast=args.faster) -``` - -运行脚本,输出结果如下: - -```sh -step 10 - 1.695s/step -step 20 - 1.432s/step -step 30 - 1.435s/step -``` - -可以看到,非加速版 `generate()` 方法的预测速度为每个step耗时1.5秒左右。 - -下面我们在启动脚本中传入 `--faster` 参数,该参数会向 `generate()` 方法传入 `use_fast=True` ,启动加速模式。同时我们需要设置 `--min_dec_len=0` ,因为FastGeneration当前还不支持该参数。新的脚本启动参数如下: - -```sh -export CUDA_VISIBLE_DEVICES=0 - python infer.py \ - --model_name_or_path=unified_transformer-12L-cn-luge \ - --output_path=./predict.txt \ - --logging_steps=10 \ - --seed=2021 \ - --max_seq_len=512 \ - --max_knowledge_len=256 \ - --batch_size=4 \ - --min_dec_len=0 \ - --max_dec_len=64 \ - --num_return_sequences=1 \ - --decode_strategy=sampling \ - --top_k=5 \ - --device=gpu \ - --faster -``` - -再次运行脚本,输出结果如下(由于我们已经编译过高性能算子,所以这里不会重新编译): - -```sh -[2021-11-23 13:38:09,200] [ DEBUG] - skipping 'FastGeneration' extension (up-to-date) build -step 10 - 0.250s/step -step 20 - 0.156s/step -step 30 - 0.141s/step -``` - -可以看到,FastGeneration的预测速度为每个step耗时0.15秒左右,相比非加速版提速超过9倍。 diff --git a/fast_generation/perf/README.md b/fast_generation/perf/README.md deleted file mode 100644 index 242bf765ec11..000000000000 --- a/fast_generation/perf/README.md +++ /dev/null @@ -1,250 +0,0 @@ -# FastGeneration Performance - -以下性能数据为非加速版generate方法和FastGeneration对比数据。 - -- **测试设备:** Tesla V100-SXM2-16GB -- **Batch Size:** 4 -- **Max Length:** 32 - -## 性能数据 -*** - -CUDA 10.1, cudnn 7, gcc 82 - -torch version 1.10.0+cu102, transformers version 4.12.5 - -**BART:** - -| Model Size | Decode Strategy| FastGeneration(FP32)
(ms) | FastGeneration(FP16)<br>(ms) | HF generate<br>(ms) | Speed Up Rate<br>(Faster32/HF) | Speed Up Rate<br>(Faster16/HF) |
-|-----|----|---|---|---|---|
-|num_layers = 6<br>num_attention_heads = 12<br>hidden_size = 768<br>(bart-base)|top_k = 1|37.53|34.01|136.89|3.65|4.02
-| |top_k = 4 |39.33|34.98|146.89|3.73|4.2 |
-| |top_k = 8 |42.35|34.77|136.80|3.23|3.93|
-| |top_k = 16 |40.95|35.45|148.45|3.63|4.19|
-| |top_p = 0.4 |45.83|33.32|184.36|4.02|5.53|
-| |num_beams = 4|44.72|37.51|242.73|5.43|6.47|
-| |num_beams = 8|61.56|40.27|273.93|4.45|6.8 |
-| |num_beams = 16|82.05|46.68|433.51|5.28|9.29|
-|num_layers = 12<br>num_attention_heads = 16<br>hidden_size = 1024<br>
(bart-large)|top_k = 1|55.03|45.44|199.27|3.62|4.39| -| |top_k = 4|70.12|56.81|220.96|3.15|3.89| -| |top_k = 8|69.96|57.73|201.06|2.87|3.48| -| |top_k = 16|69.16|59.62|223.73|3.23|3.75| -| |top_p = 0.4|73.49|61.43|275.86|3.75|4.49| -| |num_beams = 4|66.44|50.71|277.61|4.18|5.47| -| |num_beams = 8|135.30|85.75|314.78|2.33|3.67| -| |num_beams = 16|168.01|100.22|441.95|2.63|4.41| - -**GPT:** - -| Model Size | Decode Strategy| FastGeneration(FP32)
(ms) | FastGeneration(FP16)<br>(ms) | HF generate<br>(ms) | Speed Up Rate<br>(Faster32/HF) | Speed Up Rate<br>(Faster16/HF) |
-|-----|----|---|---|---|---|
-|num_layers = 12<br>num_attention_heads = 12<br>hidden_size = 768<br>(gpt2)|top_k = 1|69.29|59.20|363.93|5.25|6.15|
-| |top_k = 4|68.07|60.92|391.02|5.74|6.42|
-| |top_k = 8|69.16|60.45|401.18|5.80|6.64|
-| |top_k = 16|73.59|62.40|401.55|5.46|6.44|
-| |top_p = 0.4|95.61|76.26|429.63|4.49|5.63|
-|num_layers = 24<br>num_attention_heads = 16<br>hidden_size = 1024<br>(gpt2-medium)|top_k = 1|127.04|95.13|726.83|5.72|7.64|
-| |top_k = 4|126.74|93.95|694.53|5.48|7.39|
-| |top_k = 8|128.11|94.07|743.63|5.80|7.91|
-| |top_k = 16|126.78|95.00|732.96|5.78|7.72|
-| |top_p = 0.4|143.36|105.40|756.12|5.27|7.17|
-|num_layers = 36<br>num_attention_heads = 20<br>hidden_size = 1280<br>
(gpt2-large)top_k = 1|236.80|200.37|1057.94|4.47|5.28| -| |top_k = 4|236.69|201.95|1075.17|4.54|5.32| -| |top_k = 8|237.04|202.00|1084.60|4.58|5.37| -| |top_k = 16|235.01|201.79|1110.75|4.73|5.5| -| |top_p = 0.4|270.31|205.84|1111.16|4.11|5.4| - -**OPT** - -* 模型参数 - -| Model Name | num_layers | num_attention_heads | hidden_size | -|------------|------------|---------------------|-------------| -| OPT-125m | 12 | 12 | 768 | -| OPT-350M | 24 | 16 | 1024 | - -transformer: 4.20.1 - -* 性能指标数据 - -| Model | Decoding Strategy | Faster Generation(FP32)(ms) | Faster Generation(FP16)(ms) | HF Generation(ms) | Speed Up Rate(Faster32/HF) | Speed Up Rate(Faster16/HF) | -|:--------:|:-------------------:|:-----------------------------:|:-----------------------------:|:-------------------:|:----------------------------:|:----------------------------:| -| opt-125m | top_k=1 | 58.39 | 48.82 | 290.14 | 4.97 | 5.94 | -| | top_k=4 | 58.45 | 49.05 | 283.55 | 4.85 | 5.78 | -| | top_k=8 | 59.13 | 49.32 | 284.76 | 4.82 | 5.77 | -| | top_k=16 | 60.15 | 49.54 | 299.87 | 4.99 | 6.05 | -| | top_p=0.4 | 75.78 | 60.72 | 335.70 | 4.43 | 5.53 | -| opt-350m | top_k=1 | 124.49 | 90.58 | 511.46 | 4.11 | 5.65 | -| | top_k=4 | 125.60 | 90.96 | 528.42 | 4.21 | 5.81 | -| | top_k=8 | 125.93 | 90.96 | 523.46 | 4.16 | 5.75 | -| | top_k=16 | 126.25 | 91.58 | 524.79 | 4.16 | 5.73 | -| | top_p=0.4 | 142.93 | 103.68 | 600.80 | 4.20 | 5.79 | - -*** - -CUDA 11.2, cudnn 8, gcc 82 - -torch version 1.10.0+cu113, transformers version 4.12.5 - -**BART:** - -| Model Size | Decode Strategy| FastGeneration(FP32)
(ms) | FastGeneration(FP16)<br>(ms) | HF generate<br>(ms) | Speed Up Rate<br>(Faster32/HF) | Speed Up Rate<br>(Faster16/HF) |
-|-----|----|---|---|---|---|
-|num_layers = 6<br>num_attention_heads = 12<br>hidden_size = 768<br>(bart-base)|top_k = 1|31.1|27.4|139.46|4.48|5.09
-| |top_k = 4 |32.13|29.06|149.81|4.66|5.16|
-| |top_k = 8 |31.7|28.36|154.3|4.87|5.44|
-| |top_k = 16 |32.93|28.66|145.85|4.43|5.09|
-| |top_p = 0.4 |33.35|29.01|173.18|5.19|5.97|
-| |num_beams = 4|47.55|38.02|252.71|5.31|6.65|
-| |num_beams = 8|52.19|41.39|282.3|5.41|6.82|
-| |num_beams = 16|67.18|45.82|441.59|6.57|9.64|
-|num_layers = 12<br>num_attention_heads = 16<br>hidden_size = 1024<br>
(bart-large)|top_k = 1|45.8|37.43|173.08|3.78|4.62| -| |top_k = 4|51.11|48.28|246.27|4.82|5.1| -| |top_k = 8|61.61|50.67|246.19|4.0|4.86| -| |top_k = 16|63.81|48.33|272.93|4.28|5.65| -| |top_p = 0.4|63.0|50.05|288.76|4.58|5.77| -| |num_beams = 4|65.54|48.58|273.84|4.18|5.64| -| |num_beams = 8|75.68|52.59|340.86|4.5|6.48| -| |num_beams = 16|102.87|62.25|477.97|4.65|7.68| - -**GPT:** - -| Model Size | Decode Strategy| FastGeneration(FP32)
(ms) | FastGeneration(FP16)<br>(ms) | HF generate<br>(ms) | Speed Up Rate<br>(Faster32/HF) | Speed Up Rate<br>(Faster16/HF) |
-|-----|----|---|---|---|---|
-|num_layers = 12<br>num_attention_heads = 12<br>hidden_size = 768<br>(gpt2)|top_k = 1|50.84|40.37|399.58|7.86|9.9|
-| |top_k = 4|50.38|38.81|419.55|8.33|10.81|
-| |top_k = 8|51.23|36.78|411.7|8.04|11.19|
-| |top_k = 16|51.03|38.76|408.36|8.0|10.54|
-| |top_p = 0.4|68.55|48.04|489.45|7.14|10.19|
-|num_layers = 24<br>num_attention_heads = 16<br>hidden_size = 1024<br>(gpt2-medium)|top_k = 1|111.37|79.73|753.11|6.76|9.45|
-| |top_k = 4|110.53|80.48|767.48|6.94|9.54|
-| |top_k = 8|109.87|78.92|754.99|6.87|9.57|
-| |top_k = 16|110.61|85.26|764.16|6.91|8.96|
-| |top_p = 0.4|127.51|87.72|830.24|6.51|9.46|
-|num_layers = 36<br>num_attention_heads = 20<br>hidden_size = 1280<br>
(gpt2-large)|top_k = 1|203.76|142.85|1108.26|5.44|7.76| -| |top_k = 4|204.18|139.49|1230.63|6.03|8.82| -| |top_k = 8|204.22|139.14|1238.96|6.07|8.9| -| |top_k = 16|204.11|140.04|1148.05|5.62|8.2| -| |top_p = 0.4|222.12|150.68|1248.75|5.62|8.29| - - -**OPT:** - -* 模型参数 - -| Model Name | num_layers | num_attention_heads | hidden_size | -|------------|------------|---------------------|-------------| -| OPT-125m | 12 | 12 | 768 | -| OPT-350M | 24 | 16 | 1024 | - -transformers: 4.20.1 - -* 性能结果报表 - -| Model | Decoding Strategy | Faster Generation(FP32)(ms) | Faster Generation(FP16)(ms) | HF Generation(ms) | Speed Up Rate(Faster32/HF) | Speed Up Rate(Faster16/HF) | -|:--------:|:-------------------:|:-----------------------------:|:-----------------------------:|:-------------------:|:----------------------------:|:----------------------------:| -| opt-125m | top_k=1 | 50.57 | 42.59 | 267.95 | 5.30 | 6.29 | -| | top_k=4 | 50.88 | 40.01 | 280.95 | 5.52 | 7.02 | -| | top_k=8 | 50.91 | 43.77 | 268.54 | 5.27 | 6.14 | -| | top_k=16 | 51.08 | 42.56 | 265.40 | 5.20 | 6.24 | -| | top_p=0.4 | 69.08 | 54.59 | 330.56 | 4.78 | 6.06 | -| opt-350m | top_k=1 | 110.22 | 77.82 | 507.00 | 4.60 | 6.51 | -| | top_k=4 | 110.76 | 77.93 | 479.42 | 4.33 | 6.15 | -| | top_k=8 | 142.07 | 78.86 | 513.79 | 3.62 | 6.52 | -| | top_k=16 | 110.80 | 78.19 | 488.34 | 4.41 | 6.25 | -| | top_p=0.4 | 128.33 | 92.57 | 544.18 | 4.24 | 5.88 | - -**CodeGen:** -* 环境和超参 - -- Platform: Tesla V100-SXM2-32GB -- CUDA 10.1 -- CUDNN 7.6.5 -- PaddlePaddle-gpu 2.3.1.post101 -- transformers==4.21.1 -- torch==1.11.0 -- Batch Size: 1 -- Input Length: 60 -- Output Length: 20 - -* 模型参数 - -| Model Name | num_layers | num_attention_heads | hidden_size | -|------------|------------|---------------------|-------------| -| Salesforce/codegen-350M-mono | 20 | 16 | 1024 | -| Salesforce/codegen-2B-mono | 32 | 32 | 2560 | -| Salesforce/codegen-6B-mono | 33 | 16 | 4096 | -| Salesforce/codegen-16B-mono | 34 | 24 | 6144 | - - - -* 性能结果报表 - -| Model | Decoding Strategy | Faster Generation(FP32)(ms) | Faster Generation(FP16)(ms) | HF Generation(ms) | Speed Up Rate(Faster32/HF) | Speed Up Rate(Faster16/HF) | -|:--------:|:-------------------:|:-----------------------------:|:-----------------------------:|:-------------------:|:----------------------------:|:----------------------------:| -| Salesforce/codegen-350M-mono | top_k=1 | 57.76 | 51.35 | 709.62 | 12.29 | 13.82 | -| | top_k=4 | 57.42 | 50.88 | 639.58 | 11.14 | 12.57 | -| | top_k=8 | 57.24 | 51.67 | 685.82 | 11.98 | 13.27 | -| | top_k=16 | 57.57 | 51.61 | 686.62 | 11.93 | 13.30 | -| | top_p=0.4 | 67.26 | 57.35 | 656.12 | 9.75 | 11.44 | -| Salesforce/codegen-2B-mono| top_k=1 | 319.03 | 207.41 | 1040.71 | 3.26 | 5.02 | -| | top_k=4 | 318.98 | 207.37 | 1014.32 | 3.18 | 4.89 | -| | top_k=8 | 319.66 | 207.26 | 1084.09 | 3.39 | 5.23 | -| | top_k=16 | 320.04 | 207.74 | 1040.28 | 3.25 | 5.01 | -| | top_p=0.4 | 329.07 | 213.97 | 1055.55 | 3.21 | 4.93 | -| Salesforce/codegen-6B-mono| top_k=1 | 762.91 | 411.94 | 1384.90 | 1.82 | 3.36 | -| | top_k=4 | 762.58 | 412.79 | 1378.32 | 1.81 | 3.34 | -| | top_k=8 | 763.43 | 413.32 | 1366.45 | 1.79 | 3.31 | -| | top_k=16 | 762.79 | 413.83 | 1376.69 | 1.80 | 3.33 | -| | top_p=0.4 | 771.77 | 419.16 | 1366.49 | 1.77 | 3.26 | - - -**Pegasus:** - -| Model Size | Decode Strategy| FastGeneration(FP32)
(ms) | FastGeneration(FP16)<br>(ms) | HF generate<br>(ms) | Speed Up Rate<br>(Faster32/HF) | Speed Up Rate<br>
(Faster16/HF) | -|-----|----|---|---|---|---|---| -|IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese|num_beams=2|87.41|75.47|1322.24|15.13|17.52 -| |num_beams=4 |91.55|66.47|1364.43|14.90|20.53| -| |num_beams=6 |94.55|73.25|1391.35|14.72|18.99| -| |num_beams=8 |100.48|84.82|1467.64|14.61|17.30| -|IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese|num_beams=2|120.15|94.26|1735.21|14.44|18.41| -| |num_beams=4 |126.42|99.07|1622.31|12.83|16.38| -| |num_beams=6 |142.21|99.95|1717.49|12.08|17.18| -| |num_beams=8 |158.26|104.31|1697.65|10.73|16.27| - - -## 测试方法 - -运行如下命令即可bart性能测试: - -```sh -bash run_perf_bart.sh -``` - -运行如下命令即可启动gpt性能测试: - -```sh -bash run_perf_gpt.sh -``` - -运行以上命令后,脚本会自动使用不同的模型参数进行性能测试,结果如下图所示: - -```sh -... -[2021-12-10 08:11:37,255] [ DEBUG] - skipping 'FastGeneration' extension (up-to-date) build -Namespace(decode_strategy='sampling', max_length=32, model_name_or_path='bart-base', num_beams=1, top_k=1, top_p=1.0, use_fp16_decoding=False) -Faster FP32 cost: 40.13654176145792 -PD cost: 511.413540635258 -HF cost: 138.49875444546342 -Speed up Faster FP32/PD: 12.741843671403577 -Speed up Faster FP32/HF: 3.4506897796177394 -... -... -[2021-12-10 08:13:42,858] [ DEBUG] - skipping 'FastGeneration' extension (up-to-date) build -Namespace(decode_strategy='sampling', max_length=32, model_name_or_path='bart-base', num_beams=1, top_k=1, top_p=1.0, use_fp16_decoding=True) -Faster FP16 cost: 34.004870522767305 -... -``` -可以看到,对于每组参数,脚本会先输出FP32和竞品的测试对比,再单独输出FP16的性能数据。 - -**NOTE:** 根据测试环境和机器状态的不同,以上性能测试脚本的结果可能与表中结果有所出入。 diff --git a/fast_generation/perf/bart_perf.py b/fast_generation/perf/bart_perf.py deleted file mode 100644 index 8466dafcaaef..000000000000 --- a/fast_generation/perf/bart_perf.py +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -from pprint import pprint - -import paddle -import torch -from transformers import BartForConditionalGeneration as hf_bart_model - -from paddlenlp.data import Pad -from paddlenlp.transformers import BartForConditionalGeneration, BartTokenizer - - -def prepare_input(tokenizer, sentences): - word_pad = Pad(tokenizer.pad_token_id, dtype="int64") - tokenized = tokenizer(sentences) - inputs = word_pad([i["input_ids"] for i in tokenized]) - input_ids = paddle.to_tensor(inputs) - return input_ids - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", - default="bart-base", - type=str, - choices=["bart-base", "bart-large"], - help="The model name to specify the bart to use. Can be one of ['bart-base', 'bart-large']. ", - ) - parser.add_argument( - "--decode_strategy", - default="sampling", - type=str, - choices=["greedy_search", "beam_search", "sampling"], - help="The decoding strategy. 
Can be one of ['greedy_search', 'beam_search', 'sampling']", - ) - parser.add_argument("--num_beams", default=4, type=int, help="The parameters for beam search. ") - parser.add_argument("--top_k", default=4, type=int, help="The number of candidate to procedure beam search. ") - parser.add_argument( - "--top_p", default=1.0, type=float, help="The probability threshold to procedure topp sampling. " - ) - parser.add_argument("--max_length", default=32, type=int, help="Maximum output length. ") - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ") - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - tokenizer = BartTokenizer.from_pretrained(args.model_name_or_path) - model = BartForConditionalGeneration.from_pretrained(args.model_name_or_path) - # Set evaluate mode - model.eval() - sentences = [ - "I love that girl, but does not me.", - "She is so that I can not help glance at .", - "Nothing's gonna my love for you.", - "Drop everything now. Meet me in the pouring . Kiss me on the sidewalk.", - ] - - input_ids = prepare_input(tokenizer, sentences) - - # Define model - model.eval() - - num_loop = 100 - with paddle.no_grad(): - for i in range(num_loop): - # For warmup. - if 50 == i: - # PaddlePaddle >= 2.2 - paddle.device.cuda.synchronize(place) - start = time.perf_counter() - model.generate( - input_ids=input_ids, - max_length=args.max_length, - decode_strategy=args.decode_strategy, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - early_stopping=True, - use_fast=True, - use_fp16_decoding=args.use_fp16_decoding, - ) - paddle.device.cuda.synchronize(place) - fast_cost = (time.perf_counter() - start) / 50 * 1000 - - if args.use_fp16_decoding: - pprint(args) - print("Fast FP16 cost:", fast_cost) - return - - with paddle.no_grad(): - for i in range(num_loop): - # For warmup. - if 50 == i: - # PaddlePaddle >= 2.2 - paddle.device.cuda.synchronize(place) - start = time.perf_counter() - model.generate( - input_ids=input_ids, - max_length=args.max_length, - decode_strategy=args.decode_strategy, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - early_stopping=True, - ) - paddle.device.cuda.synchronize(place) - pd_cost = (time.perf_counter() - start) / 50 * 1000 - - device = torch.device("cuda:0") - hf_model = hf_bart_model.from_pretrained("facebook/" + args.model_name_or_path) - hf_model.to(device) - hf_model.eval() - hf_input_ids = prepare_input(tokenizer, sentences) - hf_input_ids = torch.tensor(hf_input_ids.numpy()) - hf_input_ids = hf_input_ids.to(device) - - if args.decode_strategy == "sampling": - do_sample = True - else: - do_sample = False - with torch.no_grad(): - for i in range(num_loop): - # For warmup. 
- if 50 == i: - torch.cuda.synchronize() - start = time.perf_counter() - hf_model.generate( - hf_input_ids, - do_sample=do_sample, - max_length=args.max_length + 1, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - no_repeat_ngram_size=0, - length_penalty=0.0, - ) - torch.cuda.synchronize() - hf_cost = (time.perf_counter() - start) / 50 * 1000 - - pprint(args) - print("Fast FP32 cost:", fast_cost) - print("PD cost:", pd_cost) - print("HF cost:", hf_cost) - print("Speed up Fast FP32/PD:", pd_cost / fast_cost) - print("Speed up Fast FP32/HF:", hf_cost / fast_cost) - - -if __name__ == "__main__": - args = parse_args() - do_predict(args) diff --git a/fast_generation/perf/codegen_perf.py b/fast_generation/perf/codegen_perf.py deleted file mode 100644 index 1a84b4e94fab..000000000000 --- a/fast_generation/perf/codegen_perf.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -from pprint import pprint - -import numpy as np -import paddle -import pynvml - -from paddlenlp.transformers import CodeGenForCausalLM, CodeGenTokenizer - -pynvml.nvmlInit() - - -def query_by_id(gpu_id=2): - handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id) - meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle) - return meminfo.used // 1024 // 1024 - - -def perf_pd(args): - start_mem = query_by_id(args.gpu_id) - place = "gpu" - place = paddle.set_device(place) - tokenizer = CodeGenTokenizer.from_pretrained(args.model_name_or_path) - model = CodeGenForCausalLM.from_pretrained(args.model_name_or_path) - model.eval() - load_mem = query_by_id(args.gpu_id) - - input_ids_np = [ - np.random.choice(list(tokenizer.decoder.keys())[:-1], args.input_len) for _ in range(args.batch_size) - ] - input_ids = paddle.to_tensor(input_ids_np) - - num_loop = 100 - with paddle.no_grad(): - for i in range(num_loop): - # For warmup. 
- if num_loop // 2 == i: - # PaddlePaddle >= 2.2 - paddle.device.cuda.synchronize(place) - start = time.perf_counter() - model.generate( - input_ids=input_ids, - max_length=args.generate_len, - min_length=args.generate_len, - decode_strategy="sampling", - top_k=args.top_k, - top_p=args.top_p, - use_fast=args.use_faster, - use_fp16_decoding=args.use_fp16_decoding, - ) - generate_mem = query_by_id(args.gpu_id) - paddle.device.cuda.synchronize(place) - pd_cost = (time.perf_counter() - start) / (num_loop - num_loop // 2) * 1000 - return pd_cost, load_mem - start_mem, generate_mem - start_mem - - -def perf_hf(args): - import torch - from transformers import CodeGenForCausalLM as hf_codegen - from transformers import CodeGenTokenizer as hf_tokenizer - - start_mem = query_by_id(args.gpu_id) - device = torch.device("cuda") - tokenizer = hf_tokenizer.from_pretrained(args.model_name_or_path) - model = hf_codegen.from_pretrained(args.model_name_or_path) - model.to(device) - model.eval() - load_mem = query_by_id(args.gpu_id) - - input_ids_np = [np.random.choice(list(tokenizer.decoder.keys()), args.input_len) for _ in range(args.batch_size)] - input_ids = torch.tensor(input_ids_np) - input_ids = input_ids.to(device) - num_loop = 100 - with torch.no_grad(): - for i in range(num_loop): - # For warmup. - if num_loop // 2 == i: - torch.cuda.synchronize() - start = time.perf_counter() - model.generate( - input_ids, - do_sample=True, - max_length=args.generate_len + input_ids.shape[-1], - min_length=args.generate_len + input_ids.shape[-1], - top_k=args.top_k, - top_p=args.top_p, - ) - generate_mem = query_by_id(args.gpu_id) - torch.cuda.synchronize() - hf_cost = (time.perf_counter() - start) / (num_loop - num_loop // 2) * 1000 - return hf_cost, load_mem - start_mem, generate_mem - start_mem - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--perf_type", - default="pd", - type=str, - choices=["pd", "pd_faster_fp32", "pd_faster_fp16", "hf"], - help="The type of perf. ", - ) - parser.add_argument( - "--model_name_or_path", - default="Salesforce/codegen-350M-mono", - type=str, - choices=[ - "Salesforce/codegen-350M-mono", - "Salesforce/codegen-2B-mono", - "Salesforce/codegen-6B-mono", - "Salesforce/codegen-16B-mono", - ], - help="The model name to specify the bart to use. ", - ) - parser.add_argument("--top_k", default=4, type=int, help="The number of candidate to procedure topk sampling. ") - parser.add_argument( - "--top_p", default=1.0, type=float, help="The probability threshold to procedure topp sampling. " - ) - parser.add_argument("--batch_size", default=1, type=int, help="The size of input batch. ") - parser.add_argument("--input_len", default=60, type=int, help="The size of model input. ") - parser.add_argument("--generate_len", default=20, type=int, help="Length of output . ") - parser.add_argument("--gpu_id", default=2, type=int, help="The id of GPU . ") - parser.add_argument( - "--use_faster", action="store_true", help="Whether to process inference using faster codegen. " - ) - - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. 
") - args = parser.parse_args() - return args - - -def do_predict(args): - try: - if args.perf_type == "pd": - args.use_faster = False - cost, load_mem, generate_mem = perf_pd(args) - elif args.perf_type == "pd_faster_fp32": - args.use_faster = True - args.use_fp16_decoding = False - cost, load_mem, generate_mem = perf_pd(args) - elif args.perf_type == "pd_faster_fp16": - args.use_faster = True - args.use_fp16_decoding = True - paddle.set_default_dtype("float16") - cost, load_mem, generate_mem = perf_pd(args) - else: - cost, load_mem, generate_mem = perf_hf(args) - pprint(args) - print( - f"CodeGenPerfResult: cost_time: {cost} ms, load_mem: {load_mem} MB, generate_mem:{generate_mem} MB, args:{args}\n" - ) - except Exception as e: - pprint(args) - print(f"CodeGenPerfResult: ERROR: {e}, args:{args}\n") - - -if __name__ == "__main__": - args = parse_args() - do_predict(args) diff --git a/fast_generation/perf/gpt_perf.py b/fast_generation/perf/gpt_perf.py deleted file mode 100644 index 87afcba682b4..000000000000 --- a/fast_generation/perf/gpt_perf.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -from pprint import pprint - -import numpy as np -import paddle -import torch -from transformers import GPT2LMHeadModel as hf_gpt_model - -from paddlenlp.transformers import GPTLMHeadModel, GPTTokenizer - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", - default="gpt2-en", - type=str, - choices=["gpt2-en", "gpt2-medium-en", "gpt2-large-en"], - help="The model name to specify the bart to use. Can be one of ['gpt2-en', 'gpt2-medium-en', 'gpt2-large-en']. ", - ) - parser.add_argument( - "--decode_strategy", - default="sampling", - type=str, - choices=["greedy_search", "sampling"], - help="The decoding strategy. Can be one of ['greedy_search', 'sampling']", - ) - parser.add_argument("--top_k", default=4, type=int, help="The number of candidate to procedure beam search. ") - parser.add_argument("--batch_size", default=4, type=int, help="The size of input batch. ") - parser.add_argument( - "--top_p", default=1.0, type=float, help="The probability threshold to procedure topp sampling. " - ) - parser.add_argument("--max_length", default=32, type=int, help="Maximum output length. ") - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. 
") - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - tokenizer = GPTTokenizer.from_pretrained(args.model_name_or_path) - model = GPTLMHeadModel.from_pretrained(args.model_name_or_path) - # Set evaluate mode - model.eval() - bos_id = tokenizer.convert_tokens_to_ids("<|endoftext|>") - eos_id = tokenizer.convert_tokens_to_ids("<|endoftext|>") - - input_ids_np = np.array([[bos_id] for i in range(args.batch_size)]).astype("int64").reshape([args.batch_size, 1]) - input_ids = paddle.to_tensor(input_ids_np) - # Define model - num_loop = 100 - with paddle.no_grad(): - for i in range(num_loop): - # For warmup. - if 50 == i: - # PaddlePaddle >= 2.2 - paddle.device.cuda.synchronize(place) - start = time.perf_counter() - model.generate( - input_ids=input_ids, - max_length=args.max_length, - decode_strategy=args.decode_strategy, - top_k=args.top_k, - top_p=args.top_p, - bos_token_id=bos_id, - eos_token_id=eos_id, - use_fast=True, - use_fp16_decoding=args.use_fp16_decoding, - ) - paddle.device.cuda.synchronize(place) - fast_cost = (time.perf_counter() - start) / 50 * 1000 - - if args.use_fp16_decoding: - pprint(args) - print("Fast FP16 cost:", fast_cost) - return - with paddle.no_grad(): - for i in range(num_loop): - # For warmup. - if 50 == i: - # PaddlePaddle >= 2.2 - paddle.device.cuda.synchronize(place) - start = time.perf_counter() - model.generate( - input_ids=input_ids, - max_length=args.max_length, - decode_strategy=args.decode_strategy, - top_k=args.top_k, - top_p=args.top_p, - bos_token_id=bos_id, - eos_token_id=eos_id, - ) - paddle.device.cuda.synchronize(place) - pd_cost = (time.perf_counter() - start) / 50 * 1000 - - device = torch.device("cuda:0") - hf_model = hf_gpt_model.from_pretrained(args.model_name_or_path[:-3]) - hf_model.to(device) - hf_model.eval() - - hf_input_ids = torch.tensor(input_ids_np) - hf_input_ids = hf_input_ids.to(device) - - if args.decode_strategy == "sampling": - do_sample = True - else: - do_sample = False - with torch.no_grad(): - for i in range(num_loop): - # For warmup. - if 50 == i: - torch.cuda.synchronize() - start = time.perf_counter() - hf_model.generate( - hf_input_ids, - do_sample=do_sample, - max_length=args.max_length + 1, - bos_token_id=bos_id, - eos_token_id=eos_id, - pad_token_id=0, - top_k=args.top_k, - top_p=args.top_p, - ) - torch.cuda.synchronize() - hf_cost = (time.perf_counter() - start) / 50 * 1000 - - pprint(args) - print("Fast FP32 cost:", fast_cost) - print("PD cost:", pd_cost) - print("HF cost:", hf_cost) - print("Speed up Fast FP32/PD:", pd_cost / fast_cost) - print("Speed up Fast FP32/HF:", hf_cost / fast_cost) - - -if __name__ == "__main__": - args = parse_args() - do_predict(args) diff --git a/fast_generation/perf/opt_perf.py b/fast_generation/perf/opt_perf.py deleted file mode 100644 index 213881fbf947..000000000000 --- a/fast_generation/perf/opt_perf.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse - -# append project root dir to project to make it run with latest code -import sys -import time -from pprint import pprint - -import numpy as np -import paddle -import torch -from transformers.models.opt.modeling_opt import OPTForCausalLM as hf_opt_model - -from paddlenlp.transformers import GPTTokenizer, OPTForCausalLM - -sys.path.insert(0, "../../") - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", - default="facebook/opt-125m", - type=str, - choices=["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b", "facebook/opt-2.7b"], - help="The model name to specify the bart to use. Can be one of ['facebook/opt-125m', 'facebook/opt-350m', 'facebook/opt-1.3b', 'facebook/opt-2.7b']. ", - ) - parser.add_argument( - "--decode_strategy", - default="greedy_search", - type=str, - choices=["greedy_search", "sampling"], - help="The decoding strategy. Can be one of ['greedy_search', 'sampling']", - ) - parser.add_argument("--top_k", default=4, type=int, help="The number of candidate to procedure beam search. ") - parser.add_argument("--batch_size", default=4, type=int, help="The size of input batch. ") - parser.add_argument( - "--top_p", default=1.0, type=float, help="The probability threshold to procedure topp sampling. " - ) - parser.add_argument("--max_length", default=32, type=int, help="Maximum output length. ") - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ") - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - tokenizer = GPTTokenizer.from_pretrained(args.model_name_or_path) - model = OPTForCausalLM.from_pretrained(args.model_name_or_path) - # Set evaluate mode - model.eval() - bos_id = tokenizer.convert_tokens_to_ids("<|endoftext|>") - eos_id = tokenizer.convert_tokens_to_ids("<|endoftext|>") - - input_ids_np = np.array([[bos_id] for i in range(args.batch_size)]).astype("int64").reshape([args.batch_size, 1]) - input_ids = paddle.to_tensor(input_ids_np) - # Define model - num_loop = 100 - with paddle.no_grad(): - for i in range(num_loop): - # For warmup. - if 50 == i: - # PaddlePaddle >= 2.2 - paddle.device.cuda.synchronize(place) - start = time.perf_counter() - model.generate( - input_ids=input_ids, - max_length=args.max_length, - decode_strategy=args.decode_strategy, - top_k=args.top_k, - top_p=args.top_p, - bos_token_id=bos_id, - eos_token_id=eos_id, - use_fast=True, - use_fp16_decoding=args.use_fp16_decoding, - ) - paddle.device.cuda.synchronize(place) - fast_cost = (time.perf_counter() - start) / 50 * 1000 - - if args.use_fp16_decoding: - pprint(args) - print("Fast FP16 cost:", fast_cost) - return - with paddle.no_grad(): - for i in range(num_loop): - # For warmup. 
- if 50 == i: - # PaddlePaddle >= 2.2 - paddle.device.cuda.synchronize(place) - start = time.perf_counter() - model.generate( - input_ids=input_ids, - max_length=args.max_length, - decode_strategy=args.decode_strategy, - top_k=args.top_k, - top_p=args.top_p, - bos_token_id=bos_id, - eos_token_id=eos_id, - ) - paddle.device.cuda.synchronize(place) - pd_cost = (time.perf_counter() - start) / 50 * 1000 - - device = torch.device("cuda:0") - hf_model = hf_opt_model.from_pretrained(args.model_name_or_path) - - hf_model.to(device) - hf_model.eval() - - hf_input_ids = torch.tensor(input_ids_np) - hf_input_ids = hf_input_ids.to(device) - - if args.decode_strategy == "sampling": - do_sample = True - else: - do_sample = False - with torch.no_grad(): - for i in range(num_loop): - # For warmup. - if 50 == i: - torch.cuda.synchronize() - start = time.perf_counter() - hf_model.generate( - hf_input_ids, - do_sample=do_sample, - max_length=args.max_length + 1, - bos_token_id=bos_id, - eos_token_id=eos_id, - pad_token_id=0, - top_k=args.top_k, - top_p=args.top_p, - ) - torch.cuda.synchronize() - hf_cost = (time.perf_counter() - start) / 50 * 1000 - - pprint(args) - print("Fast FP32 cost:", fast_cost) - print("PD cost:", pd_cost) - print("HF cost:", hf_cost) - print("Speed up Fast FP32/PD:", pd_cost / fast_cost) - print("Speed up Fast FP32/HF:", hf_cost / fast_cost) - - -if __name__ == "__main__": - args = parse_args() - print(args.model_name_or_path) - do_predict(args) diff --git a/fast_generation/perf/pegasus_perf.py b/fast_generation/perf/pegasus_perf.py deleted file mode 100644 index fe8ba55fb8e3..000000000000 --- a/fast_generation/perf/pegasus_perf.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -from pprint import pprint - -import numpy as np -import paddle -import pynvml - -from paddlenlp.transformers import ( - PegasusChineseTokenizer, - PegasusForConditionalGeneration, -) - -pynvml.nvmlInit() - - -def query_by_id(gpu_id=2): - handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id) - meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle) - return meminfo.used // 1024 // 1024 - - -def perf_pd(args): - start_mem = query_by_id(args.gpu_id) - place = "gpu" - place = paddle.set_device(place) - tokenizer = PegasusChineseTokenizer.from_pretrained(args.model_name_or_path) - model = PegasusForConditionalGeneration.from_pretrained(args.model_name_or_path) - model.eval() - load_mem = query_by_id(args.gpu_id) - input_ids_np = [np.random.choice(range(len(tokenizer.vocab)), args.input_len) for _ in range(args.batch_size)] - input_ids = paddle.to_tensor(input_ids_np) - - num_loop = 100 - with paddle.no_grad(): - for i in range(num_loop): - # For warmup. 
- if num_loop // 2 == i: - # PaddlePaddle >= 2.2 - paddle.device.cuda.synchronize(place) - start = time.perf_counter() - model.generate( - input_ids=input_ids, - max_length=args.generate_len, - min_length=args.generate_len, - decode_strategy="beam_search", - num_beams=args.num_beams, - use_fast=args.use_faster, - use_fp16_decoding=args.use_fp16_decoding, - ) - generate_mem = query_by_id(args.gpu_id) - paddle.device.cuda.synchronize(place) - pd_cost = (time.perf_counter() - start) / (num_loop - num_loop // 2) * 1000 - return pd_cost, load_mem - start_mem, generate_mem - start_mem - - -def perf_hf(args): - import torch - from tokenizers_pegasus import PegasusTokenizer as hf_tokenizer - from transformers import PegasusForConditionalGeneration as hf_pegasus - - start_mem = query_by_id(args.gpu_id) - device = torch.device("cuda") - tokenizer = hf_tokenizer.from_pretrained(args.model_name_or_path) - model = hf_pegasus.from_pretrained(args.model_name_or_path) - model.to(device) - model.eval() - load_mem = query_by_id(args.gpu_id) - - input_ids_np = [np.random.choice(range(len(tokenizer.vocab)), args.input_len) for _ in range(args.batch_size)] - input_ids = torch.tensor(input_ids_np) - input_ids = input_ids.to(device) - num_loop = 100 - with torch.no_grad(): - for i in range(num_loop): - # For warmup. - if num_loop // 2 == i: - torch.cuda.synchronize() - start = time.perf_counter() - model.generate( - input_ids, - do_sample=False, - num_beams=args.num_beams, - max_length=args.generate_len + input_ids.shape[-1], - min_length=args.generate_len + input_ids.shape[-1], - ) - generate_mem = query_by_id(args.gpu_id) - torch.cuda.synchronize() - hf_cost = (time.perf_counter() - start) / (num_loop - num_loop // 2) * 1000 - return hf_cost, load_mem - start_mem, generate_mem - start_mem - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--perf_type", - default="pd", - type=str, - choices=["pd", "pd_faster_fp32", "pd_faster_fp16", "hf"], - help="The type of perf. ", - ) - parser.add_argument( - "--model_name_or_path", - default="IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese", - type=str, - choices=[ - "IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese", - "IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese", - ], - help="The model name to specify the pegasus to use. ", - ) - parser.add_argument("--num_beams", default=4, type=int, help="The number of beams to procedure beam search. ") - parser.add_argument("--batch_size", default=1, type=int, help="The size of input batch. ") - parser.add_argument("--input_len", default=60, type=int, help="The size of model input. ") - parser.add_argument("--generate_len", default=20, type=int, help="Length of output . ") - parser.add_argument("--gpu_id", default=2, type=int, help="The id of GPU . ") - parser.add_argument( - "--use_faster", action="store_true", help="Whether to process inference using faster pegasus. " - ) - - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. 
") - args = parser.parse_args() - return args - - -def do_predict(args): - try: - if args.perf_type == "pd": - args.use_faster = False - cost, load_mem, generate_mem = perf_pd(args) - elif args.perf_type == "pd_faster_fp32": - args.use_faster = True - args.use_fp16_decoding = False - cost, load_mem, generate_mem = perf_pd(args) - elif args.perf_type == "pd_faster_fp16": - args.use_faster = True - args.use_fp16_decoding = True - # paddle.set_default_dtype('float16') - cost, load_mem, generate_mem = perf_pd(args) - else: - cost, load_mem, generate_mem = perf_hf(args) - pprint(args) - print( - f"PegasusPerfResult: cost_time: {cost} ms, load_mem: {load_mem} MB, generate_mem:{generate_mem} MB, args:{args}\n" - ) - except Exception as e: - pprint(args) - print(f"PegasusPerfResult: ERROR: {e}, args:{args}\n") - - -if __name__ == "__main__": - args = parse_args() - do_predict(args) diff --git a/fast_generation/perf/run_perf_bart.sh b/fast_generation/perf/run_perf_bart.sh deleted file mode 100644 index fa087770cb5a..000000000000 --- a/fast_generation/perf/run_perf_bart.sh +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -export CUDA_VISIBLE_DEVICES=3 - -for model_name in bart-base bart-large; - do - for top_k in 1 4 8 16; - do - python bart_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --num_beams=1 \ - --top_k=$top_k \ - --top_p=1 \ - --max_length=32 - sleep 10s - python bart_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --num_beams=1 \ - --top_k=$top_k \ - --top_p=1 \ - --max_length=32 \ - --use_fp16_decoding - sleep 10s - done - python bart_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --num_beams=1 \ - --top_k=0 \ - --top_p=0.4 \ - --max_length=32 - sleep 10s - python bart_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --num_beams=1 \ - --top_k=0 \ - --top_p=0.4 \ - --max_length=32 \ - --use_fp16_decoding - sleep 10s - for num_beams in 4 8 16; - do - python bart_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=beam_search \ - --num_beams=$num_beams \ - --top_k=1 \ - --top_p=1 \ - --max_length=32 - sleep 10s - python bart_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=beam_search \ - --num_beams=$num_beams \ - --top_k=1 \ - --top_p=1 \ - --max_length=32 \ - --use_fp16_decoding - sleep 10s - done - done \ No newline at end of file diff --git a/fast_generation/perf/run_perf_codegen.sh b/fast_generation/perf/run_perf_codegen.sh deleted file mode 100644 index be4792096e2e..000000000000 --- a/fast_generation/perf/run_perf_codegen.sh +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -GPU_ID=1 -export CUDA_VISIBLE_DEVICES=${GPU_ID} - -for model_name in Salesforce/codegen-350M-mono Salesforce/codegen-2B-mono Salesforce/codegen-6B-mono; - do - for top_k in 1 4 8 16; - do - for input_len in 60; - do - for generate_len in 20; - do - for perf_type in pd pd_faster_fp32 pd_faster_fp16 hf; - do - echo model_name: $model_name, perf_type: $perf_type, top_k: $top_k, top_p: 1.0, input_len: $input_len, generate_len: $generate_len - python codegen_perf.py \ - --model_name_or_path=$model_name \ - --perf_type=$perf_type \ - --top_k=$top_k \ - --top_p=1.0 \ - --input_len=$input_len \ - --generate_len=$generate_len \ - --gpu_id ${GPU_ID} - sleep 3s - done - done - done - done - for top_p in 0.4; - do - for input_len in 60; - do - for generate_len in 20; - do - for perf_type in pd pd_faster_fp32 pd_faster_fp16 hf; - do - echo model_name: $model_name, perf_type: $perf_type, top_k: 0, top_p: $top_p, input_len: $input_len, generate_len: $generate_len - python codegen_perf.py \ - --model_name_or_path=$model_name \ - --perf_type=$perf_type \ - --top_k=0 \ - --top_p=$top_p \ - --input_len=$input_len \ - --generate_len=$generate_len \ - --gpu_id ${GPU_ID} - sleep 3s - done - done - done - done - done \ No newline at end of file diff --git a/fast_generation/perf/run_perf_gpt.sh b/fast_generation/perf/run_perf_gpt.sh deleted file mode 100644 index 5363b0546af6..000000000000 --- a/fast_generation/perf/run_perf_gpt.sh +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -export CUDA_VISIBLE_DEVICES=3 - -for model_name in gpt2-en gpt2-medium-en gpt2-large-en; - do - for top_k in 1 4 8 16; - do - python gpt_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --top_k=$top_k \ - --top_p=1 \ - --max_length=32 - sleep 10s - python gpt_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --top_k=$top_k \ - --top_p=1 \ - --max_length=32 \ - --use_fp16_decoding - sleep 10s - done - python gpt_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --top_k=0 \ - --top_p=0.4 \ - --max_length=32 - sleep 10s - python gpt_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --top_k=0 \ - --top_p=0.4 \ - --max_length=32 \ - --use_fp16_decoding - sleep 10s - done \ No newline at end of file diff --git a/fast_generation/perf/run_perf_opt.sh b/fast_generation/perf/run_perf_opt.sh deleted file mode 100644 index bc1d525c00ac..000000000000 --- a/fast_generation/perf/run_perf_opt.sh +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -export CUDA_VISIBLE_DEVICES=3 - -for model_name in facebook/opt-125m facebook/opt-350m; - do - for top_k in 1 4 8 16; - do - python opt_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --top_k=$top_k \ - --top_p=0.4 \ - --max_length=32 - sleep 10s - python opt_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --top_k=$top_k \ - --top_p=0.4 \ - --max_length=32 \ - --use_fp16_decoding - sleep 10s - done - python opt_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --top_k=0 \ - --top_p=0.4 \ - --max_length=32 - sleep 10s - python opt_perf.py \ - --model_name_or_path=$model_name \ - --decode_strategy=sampling \ - --top_k=0 \ - --top_p=0.4 \ - --max_length=32 \ - --use_fp16_decoding - sleep 10s - done diff --git a/fast_generation/perf/run_perf_pegasus.sh b/fast_generation/perf/run_perf_pegasus.sh deleted file mode 100644 index 264c28b22c8b..000000000000 --- a/fast_generation/perf/run_perf_pegasus.sh +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -GPU_ID=4 -export CUDA_VISIBLE_DEVICES=${GPU_ID} - -for model_name in IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese; - do - for batch_size in 1 4 8 16; - do - for num_beams in 2 4 6 8; - do - for input_len in 60; - do - for generate_len in 20; - do - for perf_type in pd_faster_fp16 pd_faster_fp32 pd hf; - do - echo model_name: $model_name, perf_type: $perf_type, batch_size:$batch_size, num_beams: $num_beams, input_len: $input_len, generate_len: $generate_len - python pegasus_perf.py \ - --model_name_or_path=$model_name \ - --perf_type=$perf_type \ - --batch_size=$batch_size \ - --num_beams=$num_beams \ - --input_len=$input_len \ - --generate_len=$generate_len \ - --gpu_id ${GPU_ID} - sleep 3s - done - done - done - done - done - done \ No newline at end of file diff --git a/fast_generation/samples/codegen_16b_sample.py b/fast_generation/samples/codegen_16b_sample.py deleted file mode 100644 index 0f556911e813..000000000000 --- a/fast_generation/samples/codegen_16b_sample.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - -from paddlenlp.transformers import CodeGenForCausalLM, CodeGenTokenizer - -# Can be load on A100-40G -paddle.set_default_dtype("float16") -model_name = "Salesforce/codegen-16B-mono" - -tokenizer = CodeGenTokenizer.from_pretrained(model_name) -model = CodeGenForCausalLM.from_pretrained(model_name) -model.eval() - -inputs = "def hello" -input_ids = tokenizer([inputs], return_tensors="pd")["input_ids"] - -# Enable FastGeneration -outputs, _ = model.generate( - input_ids=input_ids, max_length=128, decode_strategy="greedy_search", use_fp16_decoding=True, use_fast=True -) - -result = tokenizer.decode(outputs[0], truncate_before_pattern=[r"\n\n^#", "^'''", "\n\n\n"]) - -print("Model input:", inputs) -print("Result:", result) diff --git a/fast_generation/samples/codegen_sample.py b/fast_generation/samples/codegen_sample.py deleted file mode 100644 index 77cb5c7a335e..000000000000 --- a/fast_generation/samples/codegen_sample.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddlenlp.transformers import CodeGenForCausalLM, CodeGenTokenizer - -model_name = "Salesforce/codegen-350M-mono" - -tokenizer = CodeGenTokenizer.from_pretrained(model_name) -model = CodeGenForCausalLM.from_pretrained(model_name) -model.eval() - -inputs = "def hello" -input_ids = tokenizer([inputs], return_tensors="pd")["input_ids"] - -outputs, _ = model.generate( - input_ids=input_ids, max_length=128, decode_strategy="greedy_search", use_fp16_decoding=True, use_fast=True -) - -result = tokenizer.decode(outputs[0], truncate_before_pattern=[r"\n\n^#", "^'''", "\n\n\n"]) - -print("Model input:", inputs) -print("Result:", result) -# Result: _world(): -# print("Hello World") - -# hello_world() diff --git a/fast_generation/samples/gpt_mp_sample.py b/fast_generation/samples/gpt_mp_sample.py deleted file mode 100644 index 061318e74661..000000000000 --- a/fast_generation/samples/gpt_mp_sample.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -from pprint import pprint - -import paddle - -from paddlenlp.ops import enable_ft_para, get_ft_para_conf -from paddlenlp.transformers import GPTChineseTokenizer, GPTLMHeadModel, GPTTokenizer - -MODEL_CLASSES = { - "gpt-cpm-large-cn": (GPTLMHeadModel, GPTChineseTokenizer), - "gpt-cpm-small-cn-distill": (GPTLMHeadModel, GPTChineseTokenizer), - "gpt2-en": (GPTLMHeadModel, GPTTokenizer), - "gpt2-medium-en": (GPTLMHeadModel, GPTTokenizer), - "gpt2-large-en": (GPTLMHeadModel, GPTTokenizer), - "gpt2-xl-en": (GPTLMHeadModel, GPTTokenizer), -} - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name", - default="gpt-cpm-large-cn", - choices=list(MODEL_CLASSES.keys()), - help="The model name to specify which gpt to use. It can be " + ", ".join(MODEL_CLASSES.keys()), - ) - parser.add_argument("--batch_size", default=4, type=int, help="Batch size.") - parser.add_argument("--max_length", default=50, type=int, help="Maximum output length.") - parser.add_argument( - "--topk", default=1, type=int, help="The number of highest probability tokens to keep for top-k-sampling." - ) - parser.add_argument("--topp", default=1.0, type=float, help="The cumulative probability for top-p-filtering.") - parser.add_argument("--temperature", default=1.0, type=float, help="The temperature to set.") - parser.add_argument("--tensor_para_size", default=2, type=int, help="The size for tensor parallel.") - parser.add_argument("--layer_para_size", default=1, type=int, help="The size for layer parallel.") - parser.add_argument( - "--layer_para_batch_size", - default=None, - type=int, - help="The local batch size for pipeline parallel." 
"It is suggested to use `batch_size // layer_para_size`.", - ) - parser.add_argument("--use_fp16", action="store_true", help="Whether to use fp16 to predict.") - parser.add_argument("--profile", action="store_true", help="Whether to profile.") - args = parser.parse_args() - return args - - -def profile(batch_size, total_step=50, warmup_step=10, rank=0): - def _wrapper(func): - def _impl(*args, **kwargs): - for i in range(total_step): - if i == warmup_step: - paddle.device.cuda.synchronize() - start_time = time.time() - out = func(*args, **kwargs) - paddle.device.cuda.synchronize() - end_time = time.time() - if rank is None or get_ft_para_conf().rank == rank: - time_interval = end_time - start_time - num_batch = total_step - warmup_step - print("Latency: %2fs, QPS: %2f" % (time_interval / num_batch, num_batch * batch_size / time_interval)) - return out - - return _impl - - return _wrapper - - -def main(args): - if args.use_fp16: - paddle.set_default_dtype("float16") - enable_ft_para( - args.tensor_para_size, - args.layer_para_size, - args.batch_size // args.layer_para_size if args.layer_para_batch_size is None else args.layer_para_batch_size, - ) - # TODO(guosheng): Maybe device can be set in `enable_ft_para` - paddle.set_device("gpu:" + str(get_ft_para_conf().rank)) - - model_name = args.model_name - if args.profile: - MODEL_CLASSES[model_name][0].generate = profile(args.batch_size)(MODEL_CLASSES[model_name][0].generate) - tokenizer = MODEL_CLASSES[model_name][-1].from_pretrained(model_name) - model = MODEL_CLASSES[model_name][0].from_pretrained(model_name) - model.eval() - - # NOTE: When using prompt, open this and replace the text with what you want. - input = "花间一壶酒,独酌无相亲。举杯邀明月," - # input = '一时黛玉进了荣府,下了车。众嬷嬷引着,便往东转弯,' - # input = '爱因斯坦曾经说过:' - input_ids = tokenizer(input)["input_ids"] - # NOTE: When generating from the beginning, open this. - # input_ids = [tokenizer.eos_token_id] - input_ids = [input_ids] * args.batch_size - - inputs_ids = paddle.to_tensor(input_ids, dtype="int32") - outputs, _ = model.generate( - input_ids=inputs_ids, - max_length=args.max_length, - decode_strategy="sampling", - top_k=args.topk, - top_p=args.topp, - temperature=args.temperature, - use_fast=True, - ) - - # Only make the first process to output. - if get_ft_para_conf().rank == 0: - for i in range(len(outputs)): - result = tokenizer.convert_ids_to_string(outputs[i].numpy().tolist()) - print("Result:", result) - - -if __name__ == "__main__": - args = parse_args() - pprint(args) - main(args) diff --git a/fast_generation/samples/gpt_sample.py b/fast_generation/samples/gpt_sample.py deleted file mode 100644 index e0cff0bba726..000000000000 --- a/fast_generation/samples/gpt_sample.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import paddle - -from paddlenlp.transformers import GPTChineseTokenizer, GPTLMHeadModel - -model_name = "gpt-cpm-small-cn-distill" - -tokenizer = GPTChineseTokenizer.from_pretrained(model_name) -model = GPTLMHeadModel.from_pretrained(model_name) -model.eval() - -inputs = "花间一壶酒,独酌无相亲。举杯邀明月," -inputs_ids = tokenizer(inputs)["input_ids"] -inputs_ids = paddle.to_tensor(inputs_ids, dtype="int64").unsqueeze(0) - -outputs, _ = model.generate(input_ids=inputs_ids, max_length=10, decode_strategy="greedy_search", use_fast=True) - -result = tokenizer.convert_ids_to_string(outputs[0].numpy().tolist()) - -print("Model input:", inputs) -print("Result:", result) -# 对影成三人。 diff --git a/fast_generation/samples/gptj_sample.py b/fast_generation/samples/gptj_sample.py deleted file mode 100644 index 17615667dfda..000000000000 --- a/fast_generation/samples/gptj_sample.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - -from paddlenlp.transformers import GPTJForCausalLM, GPTJTokenizer - -paddle.set_default_dtype("float16") -model_name = "EleutherAI/gpt-j-6B" - -tokenizer = GPTJTokenizer.from_pretrained(model_name) -model = GPTJForCausalLM.from_pretrained(model_name) -model.eval() - -inputs = "What is PaddleNLP?" -input_ids = tokenizer([inputs], return_tensors="pd")["input_ids"] - -outputs, _ = model.generate( - input_ids=input_ids, - max_length=100, - decode_strategy="sampling", - temperature=0.8, - top_p=0.9, - use_fp16_decoding=True, - use_fast=True, -) - -result = tokenizer.decode(outputs[0]) - -print("Model input:", inputs) -print("Result:", result) diff --git a/fast_generation/samples/mbart_sample.py b/fast_generation/samples/mbart_sample.py deleted file mode 100644 index e16c4e7de176..000000000000 --- a/fast_generation/samples/mbart_sample.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import paddle - -from paddlenlp.transformers import MBart50Tokenizer, MBartForConditionalGeneration - -model_name = "mbart-large-50-many-to-many-mmt" - -tokenizer = MBart50Tokenizer.from_pretrained(model_name, src_lang="en_XX") -model = MBartForConditionalGeneration.from_pretrained(model_name) -model.eval() - - -def postprocess_response(seq, bos_idx, eos_idx): - """Post-process the decoded sequence.""" - eos_pos = len(seq) - 1 - for i, idx in enumerate(seq): - if idx == eos_idx: - eos_pos = i - break - seq = [idx for idx in seq[: eos_pos + 1] if idx != bos_idx and idx != eos_idx] - res = tokenizer.convert_ids_to_string(seq) - return res - - -bos_id = tokenizer.lang_code_to_id["zh_CN"] -eos_id = model.mbart.config["eos_token_id"] - -inputs = "PaddleNLP is a powerful NLP library with Awesome pre-trained models and easy-to-use interface, supporting wide-range of NLP tasks from research to industrial applications." -input_ids = tokenizer(inputs)["input_ids"] -input_ids = paddle.to_tensor(input_ids, dtype="int32").unsqueeze(0) - -outputs, _ = model.generate( - input_ids=input_ids, - forced_bos_token_id=bos_id, - decode_strategy="beam_search", - num_beams=4, - max_length=50, - use_fast=True, -) - -result = postprocess_response(outputs[0].numpy().tolist(), bos_id, eos_id) - -print("Model input:", inputs) - -print("Result:", result) -# PaddleNLP是一个强大的NLP库,具有超乎寻常的预训练模型和易于使用的接口,支持从研究到工业应用的广泛的NLP任务。 diff --git a/fast_generation/samples/opt_sample.py b/fast_generation/samples/opt_sample.py deleted file mode 100644 index 812fd6e01b8f..000000000000 --- a/fast_generation/samples/opt_sample.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - -from paddlenlp.transformers import GPTTokenizer, OPTForCausalLM - -model_name = "facebook/opt-350m" - -tokenizer = GPTTokenizer.from_pretrained(model_name) -model = OPTForCausalLM.from_pretrained(model_name) -model.eval() - -inputs = """a chat between a curious human and Statue of Liberty. -Human: What is your name? -Statue: I am statue of liberty. -Human: where do you live? -Statue: New york city. -Human: how long have you lived there?。""" - -inputs_ids = tokenizer([inputs])["input_ids"] -inputs_ids = paddle.to_tensor(inputs_ids, dtype="int64") - -outputs, _ = model.generate( - input_ids=inputs_ids, - max_length=20, - decode_strategy="greedy_search", - use_fast=True, -) - -result = tokenizer.convert_ids_to_string(outputs[0].numpy().tolist()) - -print("Model input:", inputs) -print("Result:", result) diff --git a/fast_generation/samples/pegasus_sample.py b/fast_generation/samples/pegasus_sample.py deleted file mode 100644 index ddbc340808b6..000000000000 --- a/fast_generation/samples/pegasus_sample.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddlenlp.transformers import ( - PegasusChineseTokenizer, - PegasusForConditionalGeneration, -) - -model = PegasusForConditionalGeneration.from_pretrained("IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese") -tokenizer = PegasusChineseTokenizer.from_pretrained("IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese") -model.eval() - -inputs = "在北京冬奥会自由式滑雪女子坡面障碍技巧决赛中,中国选手谷爱凌夺得银牌。祝贺谷爱凌!今天上午,自由式滑雪女子坡面障碍技巧决赛举行。决赛分三轮进行,取选手最佳成绩排名决出奖牌。第一跳,中国选手谷爱凌获得69.90分。在12位选手中排名第三。完成动作后,谷爱凌又扮了个鬼脸,甚是可爱。第二轮中,谷爱凌在道具区第三个障碍处失误,落地时摔倒。获得16.98分。网友:摔倒了也没关系,继续加油!在第二跳失误摔倒的情况下,谷爱凌顶住压力,第三跳稳稳发挥,流畅落地!获得86.23分!此轮比赛,共12位选手参赛,谷爱凌第10位出场。网友:看比赛时我比谷爱凌紧张,加油!" -tokenized = tokenizer(inputs, return_tensors="pd") -outputs, _ = model.generate( - input_ids=tokenized["input_ids"], - decode_strategy="beam_search", - num_beams=4, - use_fp16_decoding=True, - use_fast=True, -) -result = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=False) - -print("Model input:", inputs) -print("Result:", result) diff --git a/fast_generation/samples/plato_sample.py b/fast_generation/samples/plato_sample.py deleted file mode 100644 index ac79e60918e4..000000000000 --- a/fast_generation/samples/plato_sample.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddlenlp.transformers import ( - UnifiedTransformerLMHeadModel, - UnifiedTransformerTokenizer, -) - -model_name = "plato-mini" - -tokenizer = UnifiedTransformerTokenizer.from_pretrained(model_name) -model = UnifiedTransformerLMHeadModel.from_pretrained(model_name) -model.eval() - - -def postprocess_response(token_ids, tokenizer): - """Post-process the decoded sequence. 
Truncate from the first .""" - eos_pos = len(token_ids) - for i, tok_id in enumerate(token_ids): - if tok_id == tokenizer.sep_token_id: - eos_pos = i - break - token_ids = token_ids[:eos_pos] - tokens = tokenizer.convert_ids_to_tokens(token_ids) - tokens = tokenizer.merge_subword(tokens) - return tokens - - -inputs = "你好啊,你今年多大了" - -inputs_ids = tokenizer.dialogue_encode( - inputs, add_start_token_as_response=True, return_tensors=True, is_split_into_words=False -) - -outputs, _ = model.generate( - input_ids=inputs_ids["input_ids"], - token_type_ids=inputs_ids["token_type_ids"], - position_ids=inputs_ids["position_ids"], - attention_mask=inputs_ids["attention_mask"], - max_length=64, - decode_strategy="sampling", - top_k=5, - use_fast=True, -) - -result = postprocess_response(outputs[0].numpy(), tokenizer) -result = "".join(result) - -print("Model input:", inputs) -print("Result:", result) -# 我今年23岁了,你今年多大了? diff --git a/fast_generation/samples/plato_xl_sample.py b/fast_generation/samples/plato_xl_sample.py deleted file mode 100644 index 9c6138a9721b..000000000000 --- a/fast_generation/samples/plato_xl_sample.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import time -from distutils.util import strtobool -from pprint import pprint - -import paddle - -from paddlenlp.data import DataCollatorWithPadding -from paddlenlp.ops import enable_ft_para, get_ft_para_conf -from paddlenlp.transformers import ( - UnifiedTransformerLMHeadModel, - UnifiedTransformerTokenizer, -) - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--use_role", type=strtobool, default=True, help="Whether to use role embeddings.") - parser.add_argument( - "--position_style", - default="relative", - choices=["continuous", "relative"], - type=str, - help="The type for positional embedding. Default is relative.", - ) - parser.add_argument("--batch_size", default=1, type=int, help="Batch size.") - parser.add_argument( - "--num_return_sequences", default=1, type=int, help="The number of returned sequences for each sample." - ) - parser.add_argument("--max_out_len", default=64, type=int, help="Maximum output sequence length.") - parser.add_argument("--min_out_len", default=1, type=int, help="Minimum output sequence length.") - parser.add_argument( - "--topk", default=1, type=int, help="The number of highest probability tokens to keep for top-k-sampling." 
- ) - parser.add_argument("--topp", default=1.0, type=float, help="The cumulative probability for top-p-filtering.") - parser.add_argument("--temperature", default=1.0, type=float, help="The temperature to set.") - parser.add_argument("--use_fp16", action="store_true", help="Whether to use fp16 to predict.") - parser.add_argument("--profile", action="store_true", help="Whether to profile.") - args = parser.parse_args() - return args - - -def profile(batch_size, total_step=50, warmup_step=10, rank=0): - def _wrapper(func): - def _impl(*args, **kwargs): - for i in range(total_step): - if i == warmup_step: - paddle.device.cuda.synchronize() - start_time = time.time() - out = func(*args, **kwargs) - paddle.device.cuda.synchronize() - end_time = time.time() - if rank is None or get_ft_para_conf().rank == rank: - time_interval = end_time - start_time - num_batch = total_step - warmup_step - print("Latency: %2fs, QPS: %2f" % (time_interval / num_batch, num_batch * batch_size / time_interval)) - return out - - return _impl - - return _wrapper - - -def postprocess_response(token_ids, tokenizer): - """Post-process the decoded sequence. Truncate from the first .""" - eos_pos = len(token_ids) - for i, tok_id in enumerate(token_ids): - if tok_id == tokenizer.sep_token_id: - eos_pos = i - break - token_ids = token_ids[:eos_pos] - tokens = tokenizer.convert_ids_to_tokens(token_ids) - tokens = tokenizer.merge_subword(tokens) - response = " ".join(tokens) - return response - - -def main(args): - # For memory saving when using FastGeneration: - # If environment variable `PPFG_QKV_MEM_OPT` is set and the weights of q/k/v - # is fused, it will try to delete the original unfused weights. Note the - # rollback to original model would not be guarantee anymore when the fast - # model failed if the original weights are deleted. - os.environ["PPFG_QKV_MEM_OPT"] = "1" - if args.use_fp16: - paddle.set_default_dtype("float16") - enable_ft_para() - # TODO(guosheng): Maybe device can be set in `enable_ft_para` - paddle.set_device("gpu:" + str(get_ft_para_conf().rank)) - - if args.profile: - UnifiedTransformerLMHeadModel.generate = profile(args.batch_size)(UnifiedTransformerLMHeadModel.generate) - tokenizer = UnifiedTransformerTokenizer.from_pretrained("plato-xl") - model = UnifiedTransformerLMHeadModel.from_pretrained("plato-xl") - model.eval() - - history = [ - "hi , Mary ! What do you usually like to do in your spare time ?", - "well , I spend a lot of time watching movies .", - "what a confidence ! I always watch a lot of movies , too ." - "oh really , Frank ? 
What kind of movies do you like ?", - ] - inputs = [history] * args.batch_size - inputs = list( - map( - lambda history: tokenizer.dialogue_encode( - history=history, - add_start_token_as_response=True, - return_length=True, - return_role_ids=args.use_role, - position_style=args.position_style, - ), - inputs, - ) - ) - collator = DataCollatorWithPadding(tokenizer) - data = collator(inputs) - - outputs, _ = model.generate( - input_ids=data["input_ids"], - token_type_ids=data["token_type_ids"], - position_ids=data["position_ids"], - attention_mask=data["attention_mask"].cast("float32"), # TODO(guosheng): remove this cast - role_ids=data.get("role_ids", None), - seq_len=data["seq_len"], - max_length=args.max_out_len, - min_length=args.min_out_len, - decode_strategy="sampling", - top_k=args.topk, - top_p=args.topp, - temperature=args.temperature, - num_return_sequences=args.num_return_sequences, - use_fast=True, - use_fp16_decoding=args.use_fp16, - ) - - # Only make the first process to output. - if get_ft_para_conf().rank == 0: - for i in range(len(outputs)): - result = postprocess_response(outputs[i].numpy(), tokenizer) - print("Result:", result) - - -if __name__ == "__main__": - args = parse_args() - pprint(args) - main(args) diff --git a/fast_generation/samples/t5_sample.py b/fast_generation/samples/t5_sample.py deleted file mode 100644 index 53ad13f903c1..000000000000 --- a/fast_generation/samples/t5_sample.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse - -from paddlenlp.transformers import T5ForConditionalGeneration, T5Tokenizer - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--max_length", default=256, type=int, help="Maximum output sequence length.") - parser.add_argument("--beam_size", default=4, type=int, help="The beam size to set.") - parser.add_argument("--use_faster", action="store_true", help="Whether to use faster to predict.") - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 to predict.") - args = parser.parse_args() - return args - - -def predict(args): - model_name = "t5-base" - - model = T5ForConditionalGeneration.from_pretrained(model_name) - model.eval() - tokenizer = T5Tokenizer.from_pretrained(model_name) - - en_text = ' This image section from an infrared recording by the Spitzer telescope shows a "family portrait" of countless generations of stars: the oldest stars are seen as blue dots. 
' - input_ids = tokenizer.encode("translate English to French: " + en_text, return_tensors="pd")["input_ids"] - - output, _ = model.generate( - input_ids=input_ids, - num_beams=args.beam_size, - max_length=args.max_length, - decode_strategy="beam_search", - use_fast=True, # args.use_faster, - use_fp16_decoding=args.use_fp16_decoding, - ) - - translation = tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False) - - print("The original sentence: ", en_text) - print("The translation result: ", translation) - - -if __name__ == "__main__": - args = parse_args() - - predict(args) diff --git a/fast_generation/samples/unimo_text_sample.py b/fast_generation/samples/unimo_text_sample.py deleted file mode 100644 index 29197be47e52..000000000000 --- a/fast_generation/samples/unimo_text_sample.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddlenlp.transformers import UNIMOLMHeadModel, UNIMOTokenizer - -model_name = "unimo-text-1.0-lcsts-new" - -model = UNIMOLMHeadModel.from_pretrained(model_name) -model.eval() -tokenizer = UNIMOTokenizer.from_pretrained(model_name) - - -def postprocess_response(token_ids, tokenizer): - """Post-process the decoded sequence. Truncate from the first .""" - eos_pos = len(token_ids) - for i, tok_id in enumerate(token_ids): - if tok_id == tokenizer.mask_token_id: - eos_pos = i - break - token_ids = token_ids[:eos_pos] - tokens = tokenizer.convert_ids_to_tokens(token_ids) - tokens = tokenizer.merge_subword(tokens) - return tokens - - -inputs = "深度学习是人工智能的核心技术领域。百度飞桨作为中国首个自主研发、功能丰富、开源开放的产业级深度学习平台,将从多层次技术产品、产业AI人才培养和强大的生态资源支持三方面全面护航企业实现快速AI转型升级。" - -inputs_ids = tokenizer.gen_encode( - inputs, add_start_token_for_decoding=True, return_tensors=True, is_split_into_words=False -) - -outputs, _ = model.generate( - input_ids=inputs_ids["input_ids"], - token_type_ids=inputs_ids["token_type_ids"], - position_ids=inputs_ids["position_ids"], - attention_mask=inputs_ids["attention_mask"], - max_length=64, - decode_strategy="beam_search", - num_beams=2, - use_fast=True, -) - -result = postprocess_response(outputs[0].numpy(), tokenizer) -result = "".join(result) - -print("Model input:", inputs) -print("Result:", result) -# 百度飞桨:深度学习助力企业转型升级 diff --git a/paddlenlp/ops/CMakeLists.txt b/paddlenlp/ops/CMakeLists.txt deleted file mode 100644 index d0914969a979..000000000000 --- a/paddlenlp/ops/CMakeLists.txt +++ /dev/null @@ -1,490 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -cmake_minimum_required(VERSION 3.10 FATAL_ERROR) -project(FasterTransformer LANGUAGES C CXX CUDA) - -find_package(CUDA 10.1 REQUIRED) - -find_program(CCACHE_PROGRAM ccache) -if(CCACHE_PROGRAM) - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) -endif() - -INCLUDE(ExternalProject) - -set(CXX_STD "17" CACHE STRING "C++ standard") - -option(ON_INFER "Compiled with inference. " OFF) -option(WITH_GPU "Compiled with GPU/CPU, default use CPU." ON) -option(WITH_MKL "Compile with MKL. Only works when ON_INFER is ON." ON) -option(USE_TENSORRT "Compiled with TensorRT." OFF) -option(WITH_TRANSFORMER "Compiled with Transformer." ON) -option(WITH_GPT "Compiled with GPT." ON) -option(WITH_OPT "Compiled with OPT." ON) -option(WITH_UNIFIED "Compiled with Unified Transformer." ON) -option(WITH_T5 "Compiled with T5." ON) -option(WITH_SP "Compiled with sentencepiece. Only works when WITH_GPT and ON_INFER is ON." OFF) -option(WITH_DECODER "Compile with Transformer Decoder" ON) -option(WITH_ENCODER "Compile with Transformer Encoder" ON) -option(WITH_STATIC_LIB "Compile static lib" OFF) -option(WITH_BART "Compile with BART" ON) -option(WITH_MBART "Compile with MBART" ON) -option(WITH_PARALLEL "Compile with model parallel for GPT" OFF) -option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF) -option(WITH_GPTJ "Compile with GPTJ" ON) -option(WITH_PEGASUS "Compile with Pegasus" ON) - -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") -if(WITH_PARALLEL) - # https://cmake.org/cmake/help/latest/module/FindMPI.html#variables-for-locating-mpi - # https://github.com/Kitware/CMake/blob/master/Modules/FindMPI.cmake - find_package(MPI REQUIRED) - find_package(NCCL REQUIRED) - add_definitions(-DBUILD_GPT) - list(APPEND decoding_op_files parallel_utils.cc) -endif() - -if(NOT WITH_GPU) - message(FATAL_ERROR "Faster transformer custom op doesn't support CPU. Please add the flag -DWITH_GPU=ON to use GPU. 
") -endif() - -list(APPEND decoding_op_files cublas_handle.cc utils.cc) - -if(WITH_TRANSFORMER) - list(APPEND decoding_op_files fusion_decoding_op.cc fusion_decoding_op.cu fusion_force_decoding_op.cc fusion_force_decoding_op.cu) -endif() - -if(WITH_GPT) - list(APPEND decoding_op_files fusion_gpt_op.cc fusion_gpt_op.cu) -endif() - -if(WITH_OPT) - list(APPEND decoding_op_files fusion_opt_op.cc fusion_opt_op.cu) -endif() - -if(WITH_UNIFIED) - list(APPEND decoding_op_files fusion_unified_decoding_op.cc fusion_unified_decoding_op.cu fusion_miro_op.cc fusion_miro_op.cu) -endif() - -if(WITH_ENCODER) - list(APPEND decoding_op_files fusion_encoder_op.cc fusion_encoder_op.cu) -endif() - -if(WITH_DECODER) - list(APPEND decoding_op_files fusion_decoder_op.cc fusion_decoder_op.cu) -endif() - -if(WITH_BART) - list(APPEND decoding_op_files fusion_bart_decoding_op.cc fusion_bart_decoding_op.cu) -endif() - -if(WITH_MBART) - list(APPEND decoding_op_files fusion_mbart_decoding_op.cc fusion_mbart_decoding_op.cu) -endif() - -if(WITH_GPTJ) - list(APPEND decoding_op_files fusion_gptj_op.cc fusion_gptj_op.cu) -endif() - -if(WITH_PEGASUS) - list(APPEND decoding_op_files fusion_pegasus_decoding_op.cc fusion_pegasus_decoding_op.cu) -endif() - -if(WITH_T5) - list(APPEND decoding_op_files fusion_t5_decoding_op.cc fusion_t5_decoding_op.cu) -endif() - -if(NOT WITH_TRANSFORMER AND NOT WITH_GPT AND NOT WITH_DECODER AND NOT WITH_ENCODER AND NOT WITH_BART AND NOT WITH_MBART AND NOT WITH_GPTJ AND NOT WITH_PEGASUS AND NOT WITH_T5) - message(FATAL_ERROR "-DWITH_TRANSFORMER=ON or/and -DWITH_GPT=ON or/and -DWITH_DECODER=ON or/and -DWITH_ENCODER=ON or/and -DWITH_BART=ON or/and -DWITH_MBART=ON or/and -DWITH_GPTJ=ON or/and -DWITH_PEGASUS=ON or/and -DWITH_T5=ON must be set to use FasterTransformer. ") -endif() - -set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR}) - -list(APPEND CMAKE_MODULE_PATH ${CUDA_PATH}/lib64) - -# Setting compiler flags -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall") - -###################################################################################### -# A function for automatic detection of GPUs installed (if autodetection is enabled) -# Usage: -# detect_installed_gpus(out_variable) -function(detect_installed_gpus out_variable) - if(NOT CUDA_gpu_detect_output) - set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu) - - file(WRITE ${cufile} "" - "#include \"stdio.h\"\n" - "#include \"cuda.h\"\n" - "#include \"cuda_runtime.h\"\n" - "int main() {\n" - " int count = 0;\n" - " if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n" - " if (count == 0) return -1;\n" - " for (int device = 0; device < count; ++device) {\n" - " cudaDeviceProp prop;\n" - " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" - " printf(\"%d.%d \", prop.major, prop.minor);\n" - " }\n" - " return 0;\n" - "}\n") - - execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" - "--run" "${cufile}" - WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" - RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - - if(nvcc_res EQUAL 0) - # Only use last item of nvcc_out (the last device's compute capability). - string(REGEX REPLACE "\\." 
"" nvcc_out "${nvcc_out}") - string(REGEX MATCHALL "[0-9()]+" nvcc_out "${nvcc_out}") - list(GET nvcc_out -1 nvcc_out) - set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_installed_gpus tool" FORCE) - endif() - endif() - - if(NOT CUDA_gpu_detect_output) - message(STATUS "Automatic GPU detection failed. Building for all known architectures.") - set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE) - else() - set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE) - endif() -endfunction() - -if (NOT SM) - # TODO(guosheng): Remove it if `GetCUDAComputeCapability` is exposed by paddle. - # Currently, if `CUDA_gpu_detect_output` is not defined, use the detected arch. - detect_installed_gpus(SM) -endif() - -#[[ -if (SM STREQUAL 80 OR - SM STREQUAL 86 OR - SM STREQUAL 70 OR - SM STREQUAL 75 OR - SM STREQUAL 61 OR - SM STREQUAL 60) -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\"") - if (SM STREQUAL 70 OR SM STREQUAL 75 OR SM STREQUAL 80 OR SM STREQUAL 86) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA") - endif() -message("-- Assign GPU architecture (sm=${SM})") - -else() -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \ - -gencode=arch=compute_70,code=\\\"sm_70,compute_70\\\" \ - -gencode=arch=compute_75,code=\\\"sm_75,compute_75\\\" \ - ") - -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA") - -message("-- Assign GPU architecture (sm=70,75)") -endif() -]] - -set(SM_SETS 52 60 61 70 75 80) -set(USING_WMMA False) -set(FIND_SM False) - -foreach(SM_NUM IN LISTS SM_SETS) - string(FIND "${SM}" "${SM_NUM}" SM_POS) - if(SM_POS GREATER -1) - set(FIND_SM True) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM_NUM},code=\\\"sm_${SM_NUM},compute_${SM_NUM}\\\"") - - if (SM_NUM STREQUAL 70 OR SM_NUM STREQUAL 75 OR SM_NUM STREQUAL 80 OR SM_NUM STREQUAL 86) - set(USING_WMMA True) - endif() - - set(CMAKE_CUDA_ARCHITECTURES ${SM_NUM}) - message("-- Assign GPU architecture (sm=${SM_NUM})") - endif() -endforeach() - -if(USING_WMMA STREQUAL True) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA") - message("-- Use WMMA") -endif() - -if(NOT (FIND_SM STREQUAL True)) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \ - -gencode=arch=compute_70,code=\\\"sm_70,compute_70\\\" \ - -gencode=arch=compute_75,code=\\\"sm_75,compute_75\\\" \ - -gencode=arch=compute_80,code=\\\"sm_80,compute_80\\\" \ - ") - - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA") - if(BUILD_PYT) - set(ENV{TORCH_CUDA_ARCH_LIST} "7.0;7.5;8.0") - endif() - set(CMAKE_CUDA_ARCHITECTURES 70 75 80) - message("-- Assign GPU architecture (sm=70,75,80)") -endif() - -set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wall -O0") -set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -O0") -set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall") - -set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std=c++{CXX_STD}") -set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std=gnu++{CXX_STD}") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") 
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++${CXX_STD}") - -set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") -set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3") - -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) - -list(APPEND COMMON_HEADER_DIRS - ${PROJECT_SOURCE_DIR} - ${CUDA_PATH}/include) - -set(COMMON_LIB_DIRS - ${CUDA_PATH}/lib64 -) - -if(WITH_PARALLEL) - list(APPEND COMMON_HEADER_DIRS - ${NCCL_INCLUDE_PATH} - ${MPI_INCLUDE_PATH}) -endif() - -set(THIRD_PATH "third-party") -set(THIRD_PARTY_NAME "fastertransformer") - -include(external/boost) - -set(OPS_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/utils/allocator.h allocator_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/utils/allocator.h allocator_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/utils/common.h common_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/utils/common.h common_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/utils/common_structure.h common_structure_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/utils/common_structure.h common_structure_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/CMakeLists.txt cmakelists_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/CMakeLists.txt cmakelists_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/topk_kernels.cu topk_kernels_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/topk_kernels.cu topk_kernels_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/lightseq_kernels.cu lightseq_kernels_cu_src) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/open_decoder.cu open_decoder_cu_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/open_decoder.cu open_decoder_cu_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/open_decoder.h open_decoder_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/open_decoder.h open_decoder_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/cuda_kernels.h cuda_kernels_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/cuda_kernels.h cuda_kernels_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/cuda_kernels.cu cuda_kernels_cu_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/cuda_kernels.cu cuda_kernels_cu_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/decoding_kernels.cu decoding_kernels_cu_src) -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/transformer_decoding_kernels.cu trans_decoding_kernels_cu_src) -file(TO_NATIVE_PATH 
${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/decoding_kernels.cu decoding_kernels_cu_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/open_decoder.cuh open_decoder_cuh_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/open_decoder.cuh open_decoder_cuh_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/utils/arguments.h arguments_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/utils/arguments.h arguments_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/decoding_beamsearch.h decoding_beamsearch_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/decoding_beamsearch.h decoding_beamsearch_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/decoding_sampling.h decoding_sampling_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/decoding_sampling.h decoding_sampling_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/online_softmax_beamsearch_kernels.cu online_softmax_beamsearch_kernels_cu_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/online_softmax_beamsearch_kernels.cu online_softmax_beamsearch_kernels_cu_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/topk_kernels.cuh topk_kernels_cuh_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/topk_kernels.cuh topk_kernels_cuh_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/transformer_kernels.cu trans_kernels_cu_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/transformer_kernels.cu trans_kernels_cu_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/transformer_kernels.cuh trans_kernels_cuh_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/transformer_kernels.cuh trans_kernels_cuh_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/masked_multihead_attention.cu masked_multihead_attention_cu_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/masked_multihead_attention.cu masked_multihead_attention_cu_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/gpt.h gpt_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/gpt.h gpt_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/opt.h opt_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/opt.h opt_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/gptj.h gptj_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/gptj.h gptj_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/masked_multihead_attention_utils.h 
masked_multihead_attention_utils_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/masked_multihead_attention_utils.h masked_multihead_attention_utils_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/masked_multihead_attention.h masked_multihead_attention_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/masked_multihead_attention.h masked_multihead_attention_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/attention_kernels.cu attention_kernels_cu_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/attention_kernels.cu attention_kernels_cu_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/attention_kernels.cuh attention_kernels_cuh_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/attention_kernels.cuh attention_kernels_cuh_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/t5_beamsearch.h t5_bs_h_src) -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/t5_sampling.h t5_spl_h_src) -set(ft_dst ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/) - -# Encoder patches. -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/bert_encoder_transformer.h bert_encoder_transformer_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/bert_encoder_transformer.h bert_encoder_transformer_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/standard_encoder.h standard_encoder_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/standard_encoder.h standard_encoder_h_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/cuda/open_attention.h open_attention_h_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/open_attention.h open_attention_h_dst) - -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/open_attention.cu open_attention_cu_dst) - -file(TO_NATIVE_PATH ${OPS_SOURCE_DIR}/patches/FasterTransformer/fastertransformer/CMakeLists.txt fastertransformer_cmakelists_src) -file(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/CMakeLists.txt fastertransformer_cmakelists_dst) -# Encoder patches end. 
- -# TODO(guosheng): `find` seems meeting errors missing argument to `-exec', fix it -set(MUTE_COMMAND grep -rl "printf(\"\\[WARNING\\]" ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/ | xargs -i{} sed -i "s/printf(\"\\WWARNING\\W decoding[^)]\\{1,\\})/ /" {}) -set(OPEN_ATTENTION_MUTE_COMMAND grep -rl "printf(\"\\[WARNING\\]" ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/ | xargs -i{} sed -i "s/printf(\"\\WWARNING\\W\\WOpenMultiHeadAttention\\W[^)]\\{1,\\})/ /" {}) - -set(RM_OLD_CUB_COMMAND rm -rf ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/${THIRD_PARTY_NAME}/fastertransformer/cuda/cub) - -set(FT_PATCH_COMMAND - printf \\n\\n > blank_lines - && cp ${allocator_src} ${allocator_dst} - && cp ${common_src} ${common_dst} - && cp ${common_structure_src} ${common_structure_dst} - && cp ${cmakelists_src} ${cmakelists_dst} - && cp ${topk_kernels_src} ${topk_kernels_dst} - && cp ${decoding_beamsearch_h_src} ${decoding_beamsearch_h_dst} - && cp ${decoding_sampling_h_src} ${decoding_sampling_h_dst} - && cp ${online_softmax_beamsearch_kernels_cu_src} ${online_softmax_beamsearch_kernels_cu_dst} - && cp ${arguments_h_src} ${arguments_h_dst} - && cp ${open_decoder_h_src} ${open_decoder_h_dst} - && cp ${standard_encoder_h_src} ${standard_encoder_h_dst} - && cp ${bert_encoder_transformer_h_src} ${bert_encoder_transformer_h_dst} - && cp ${trans_kernels_cu_src} ${trans_kernels_cu_dst} - && cp ${masked_multihead_attention_cu_src} ${masked_multihead_attention_cu_dst} - && cp ${open_attention_h_src} ${open_attention_h_dst} - && cp ${fastertransformer_cmakelists_src} ${fastertransformer_cmakelists_dst} - && cp ${gpt_h_src} ${gpt_h_dst} - && cp ${opt_h_src} ${opt_h_dst} - && cp ${gptj_h_src} ${gptj_h_dst} - && cp ${masked_multihead_attention_h_src} ${masked_multihead_attention_h_dst} - && cp ${t5_bs_h_src} ${ft_dst} - && cp ${t5_spl_h_src} ${ft_dst} - && cat blank_lines ${masked_multihead_attention_utils_h_src} >> ${masked_multihead_attention_utils_h_dst} - && cat blank_lines ${attention_kernels_cu_src} >> ${attention_kernels_cu_dst} - && cat blank_lines ${attention_kernels_cuh_src} >> ${attention_kernels_cuh_dst} - && cat blank_lines ${cuda_kernels_h_src} >> ${cuda_kernels_h_dst} - && cat blank_lines ${lightseq_kernels_cu_src} >> ${topk_kernels_dst} - && cat blank_lines ${cuda_kernels_cu_src} >> ${cuda_kernels_cu_dst} - && cat blank_lines ${decoding_kernels_cu_src} >> ${decoding_kernels_cu_dst} - && cat blank_lines ${topk_kernels_cuh_src} >> ${topk_kernels_cuh_dst} - && cat blank_lines ${trans_decoding_kernels_cu_src} >> ${decoding_kernels_cu_dst} - && cat blank_lines ${open_decoder_cu_src} >> ${open_decoder_cu_dst} - && cat blank_lines ${open_decoder_cuh_src} >> ${open_decoder_cuh_dst} - && cat blank_lines ${trans_kernels_cuh_src} >> ${trans_kernels_cuh_dst} - && sed -i "s/^#define NEW_TRANSPOSE_BATCH_MAJOR 1/#define NEW_TRANSPOSE_BATCH_MAJOR 0/g" ${open_decoder_cu_dst} - && sed -i "2091,2119d" ${open_attention_cu_dst} - && rm blank_lines - && ${MUTE_COMMAND} - && ${OPEN_ATTENTION_MUTE_COMMAND} - && ${RM_OLD_CUB_COMMAND} -) - -# TODO(guosheng): Use UPDATE_COMMAND instead of PATCH_COMMAND to make cmake -# re-run always use the latest patches when the developer changes FT patch codes, -# all patches rather than the changes would re-build, any better way to do this. -# Or maybe hidden this function for simplicity. -set(FT_UPDATE_COMMAND git checkout nccl_dependent_refine && git checkout . 
&& ${FT_PATCH_COMMAND}) - -ExternalProject_Add( - extern_${THIRD_PARTY_NAME} - GIT_REPOSITORY https://gitee.com/paddlepaddle/FasterTransformer.git - GIT_TAG nccl_dependent_refine - PREFIX ${THIRD_PATH} - SOURCE_DIR ${THIRD_PATH}/source/${THIRD_PARTY_NAME} - UPDATE_COMMAND ${FT_UPDATE_COMMAND} # PATCH_COMMAND ${FT_PATCH_COMMAND} - BINARY_DIR ${THIRD_PATH}/build/${THIRD_PARTY_NAME} - INSTALL_COMMAND "" - CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DSM=${SM} -DBUILD_PD=ON -DBUILD_ENCODER=${WITH_ENCODER} -DPY_CMD=${PY_CMD} -DON_INFER=${ON_INFER} -DPADDLE_LIB=${PADDLE_LIB} -DWITH_MKL=${WITH_MKL} -DWITH_STATIC_LIB=${WITH_STATIC_LIB} -DBUILD_GPT=${WITH_PARALLEL} -DWITH_ONNXRUNTIME=${WITH_ONNXRUNTIME} -) -# -DBUILD_GPT=${WITH_GPT} -ExternalProject_Get_property(extern_${THIRD_PARTY_NAME} BINARY_DIR) -ExternalProject_Get_property(extern_${THIRD_PARTY_NAME} SOURCE_DIR) -ExternalProject_Get_property(extern_${THIRD_PARTY_NAME} SOURCE_SUBDIR) - -set(FT_INCLUDE_PATH ${SOURCE_DIR}/${SOURCE_SUBDIR}) -set(FT_LIB_PATH ${BINARY_DIR}/lib) - -include_directories( - ${FT_INCLUDE_PATH} -) - -link_directories( - ${FT_LIB_PATH} -) - -if(ON_INFER AND WITH_GPT AND WITH_SP) - ExternalProject_Add( - extern_sentencepiece - GIT_REPOSITORY https://github.com/google/sentencepiece.git - PREFIX ${THIRD_PATH} - SOURCE_DIR ${THIRD_PATH}/source/sentencepiece/ - BINARY_DIR ${THIRD_PATH}/build/sentencepiece/ - INSTALL_COMMAND "" - ) - - include_directories( - ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source/sentencepiece/src/ - ) - - link_directories( - ${CMAKE_BINARY_DIR}/${THIRD_PATH}/build/sentencepiece/src/ - ) - - add_definitions(-DGPT_ON_SENTENCEPIECE) -endif() - -add_subdirectory(fast_transformer) diff --git a/paddlenlp/ops/__init__.py b/paddlenlp/ops/__init__.py index f18e6d0817ca..d98e77ece75f 100644 --- a/paddlenlp/ops/__init__.py +++ b/paddlenlp/ops/__init__.py @@ -18,16 +18,3 @@ from .distributed import * from .einsum import * -# isort: split -from .fast_transformer.transformer.decoding import * - -# isort: split -from .fast_transformer.transformer.decoder import * -from .fast_transformer.transformer.encoder import * -from .fast_transformer.transformer.fast_transformer import * - -paddle.nn.TransformerEncoderLayer._ft_forward = encoder_layer_forward # noqa F405 -paddle.nn.TransformerEncoder._ft_forward = encoder_forward # noqa F405 - -paddle.nn.TransformerEncoderLayer._ori_forward = paddle.nn.TransformerEncoderLayer.forward -paddle.nn.TransformerEncoder._ori_forward = paddle.nn.TransformerEncoder.forward diff --git a/paddlenlp/ops/cmake/FindNCCL.cmake b/paddlenlp/ops/cmake/FindNCCL.cmake deleted file mode 100644 index 7dc1fa9968f4..000000000000 --- a/paddlenlp/ops/cmake/FindNCCL.cmake +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
-# -# From PyTorch: -# -# Copyright (c) 2016- Facebook, Inc (Adam Paszke) -# Copyright (c) 2014- Facebook, Inc (Soumith Chintala) -# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) -# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) -# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) -# Copyright (c) 2011-2013 NYU (Clement Farabet) -# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) -# Copyright (c) 2006 Idiap Research Institute (Samy Bengio) -# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) -# -# From Caffe2: -# -# Copyright (c) 2016-present, Facebook Inc. All rights reserved. -# -# All contributions by Facebook: -# Copyright (c) 2016 Facebook Inc. -# -# All contributions by Google: -# Copyright (c) 2015 Google Inc. -# All rights reserved. -# -# All contributions by Yangqing Jia: -# Copyright (c) 2015 Yangqing Jia -# All rights reserved. -# -# All contributions by Kakao Brain: -# Copyright 2019-2020 Kakao Brain -# -# All contributions from Caffe: -# Copyright(c) 2013, 2014, 2015, the respective contributors -# All rights reserved. -# -# All other contributions: -# Copyright(c) 2015, 2016 the respective contributors -# All rights reserved. -# -# Caffe2 uses a copyright model similar to Caffe: each contributor holds -# copyright over their contributions to Caffe2. The project versioning records -# all such contribution and copyright details. If a contributor wants to further -# mark their specific copyright on a particular contribution, they should -# indicate their copyright solely in the commit message of the change when it is -# committed. -# -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# -# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America -# and IDIAP Research Institute nor the names of its contributors may be -# used to endorse or promote products derived from this software without -# specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. 
-#
-# Find the nccl libraries
-#
-# The following variables are optionally searched for defaults
-# NCCL_ROOT: Base directory where all NCCL components are found
-# NCCL_INCLUDE_DIR: Directory where NCCL header is found
-# NCCL_LIB_DIR: Directory where NCCL library is found
-#
-# The following are set after configuration is done:
-# NCCL_FOUND
-# NCCL_INCLUDE_DIRS
-# NCCL_LIBRARIES
-#
-# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
-# install NCCL in the same location as the CUDA toolkit.
-# See https://github.com/caffe2/caffe2/issues/1601
-
-set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder contains NVIDIA NCCL headers")
-set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder contains NVIDIA NCCL libraries")
-set(NCCL_VERSION $ENV{NCCL_VERSION} CACHE STRING "Version of NCCL to build with")
-
-if ($ENV{NCCL_ROOT_DIR})
-  message(WARNING "NCCL_ROOT_DIR is deprecated. Please set NCCL_ROOT instead.")
-endif()
-list(APPEND NCCL_ROOT $ENV{NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})
-# Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
-list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT})
-
-find_path(NCCL_INCLUDE_DIRS
-  NAMES nccl.h
-  HINTS ${NCCL_INCLUDE_DIR})
-
-if (USE_STATIC_NCCL)
-  MESSAGE(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.")
-  SET(NCCL_LIBNAME "nccl_static")
-  if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
-    set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
-  endif()
-else()
-  SET(NCCL_LIBNAME "nccl")
-  if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
-    set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
-  endif()
-endif()
-
-find_library(NCCL_LIBRARIES
-  NAMES ${NCCL_LIBNAME}
-  HINTS ${NCCL_LIB_DIR})
-
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
-
-if(NCCL_FOUND) # obtaining NCCL version and some sanity checks
-  set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
-  message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...")
-  set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
-  list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS})
-  include(CheckCXXSymbolExists)
-  check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED)
-
-  if (NCCL_VERSION_DEFINED)
-    set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc")
-    file(WRITE ${file} "
-      #include <iostream>
-      #include <nccl.h>
-      int main()
-      {
-        std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH << std::endl;
-        int x;
-        ncclGetVersion(&x);
-        return x == NCCL_VERSION_CODE;
-      }
-")
-    try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
-      RUN_OUTPUT_VARIABLE NCCL_VERSION_FROM_HEADER
-      LINK_LIBRARIES ${NCCL_LIBRARIES})
-    if (NOT NCCL_VERSION_MATCHED)
-      message(FATAL_ERROR "Found NCCL header version and library version do not match!
\ -(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}) Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.") - endif() - message(STATUS "NCCL version: ${NCCL_VERSION_FROM_HEADER}") - else() - # message(STATUS "NCCL version < 2.3.5-5") - endif () - set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES}) - - message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})") - mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES) -endif() diff --git a/paddlenlp/ops/cmake/external/boost.cmake b/paddlenlp/ops/cmake/external/boost.cmake deleted file mode 100644 index 3140c7a48f46..000000000000 --- a/paddlenlp/ops/cmake/external/boost.cmake +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -include(ExternalProject) - -set(BOOST_PROJECT "extern_boost") -# To release PaddlePaddle as a pip package, we have to follow the -# manylinux1 standard, which features as old Linux kernels and -# compilers as possible and recommends CentOS 5. Indeed, the earliest -# CentOS version that works with NVIDIA CUDA is CentOS 6. And a new -# version of boost, say, 1.66.0, doesn't build on CentOS 6. We -# checked that the devtools package of CentOS 6 installs boost 1.41.0. -# So we use 1.41.0 here. -set(BOOST_VER "1.41.0") -set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE) -set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE) - -MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}") - -set(THIRD_PARTY_PATH ${CMAKE_BINARY_DIR}/${THIRD_PATH}/source) - -set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost) -set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}") - -set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}" CACHE PATH "boost include directory." FORCE) -set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1) -include_directories(${BOOST_INCLUDE_DIR}) - -ExternalProject_Add( - ${BOOST_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR} - URL ${BOOST_URL} - DOWNLOAD_NO_PROGRESS 1 - PREFIX ${BOOST_SOURCES_DIR} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - UPDATE_COMMAND "" - ) - -ExternalProject_Get_property(${BOOST_PROJECT} SOURCE_DIR) - -if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32) - set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c) - file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") - add_library(boost STATIC ${dummyfile}) -else() - add_library(boost INTERFACE) -endif() - -add_dependencies(boost ${BOOST_PROJECT}) -set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR}) -include_directories(${Boost_INCLUDE_DIR}) diff --git a/paddlenlp/ops/ext_utils.py b/paddlenlp/ops/ext_utils.py deleted file mode 100644 index 5891d78abdaa..000000000000 --- a/paddlenlp/ops/ext_utils.py +++ /dev/null @@ -1,367 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import functools -import hashlib -import os -import subprocess -import sys -import sysconfig -import textwrap -from pathlib import Path - -from filelock import FileLock -from paddle.utils.cpp_extension import load_op_meta_info_and_register_op -from paddle.utils.cpp_extension.cpp_extension import CUDA_HOME -from paddle.utils.cpp_extension.cpp_extension import ( - BuildExtension as PaddleBuildExtension, -) -from paddle.utils.cpp_extension.cpp_extension import CppExtension -from paddle.utils.cpp_extension.extension_utils import ( - _import_module_from_library, - _jit_compile, -) -from setuptools import Extension - -from paddlenlp.utils.env import PPNLP_HOME -from paddlenlp.utils.log import logger - -if CUDA_HOME and not os.path.exists(CUDA_HOME): - # CUDA_HOME is only None for Windows CPU version in paddle `find_cuda_home`. - # Clear it for other non-CUDA situations. - CUDA_HOME = None - -LOADED_EXT = {} - - -def file_lock(lock_file_path): - def _wrapper(func): - @functools.wraps(func) - def _impl(*args, **kwargs): - with FileLock(lock_file_path): - func(*args, **kwargs) - - return _impl - - return _wrapper - - -def _get_files(path): - """ - Helps to list all files under the given path. - """ - if os.path.isfile(path): - return [path] - all_files = [] - for root, _dirs, files in os.walk(path, followlinks=True): - for file in files: - file = os.path.join(root, file) - all_files.append(file) - return all_files - - -# copy form distutils.dep_util to avoid import distutils -def newer_group(sources, target, missing="error"): - """Return true if 'target' is out-of-date with respect to any file - listed in 'sources'. In other words, if 'target' exists and is newer - than every file in 'sources', return false; otherwise return true. - 'missing' controls what we do when a source file is missing; the - default ("error") is to blow up with an OSError from inside 'stat()'; - if it is "ignore", we silently drop any missing source files; if it is - "newer", any missing source files make us assume that 'target' is - out-of-date (this is handy in "dry-run" mode: it'll make you pretend to - carry out commands that wouldn't work because inputs are missing, but - that doesn't matter because you're not actually going to run the - commands). - """ - # If the target doesn't even exist, then it's definitely out-of-date. - if not os.path.exists(target): - return 1 - - # Otherwise we have to find out the hard way: if *any* source file - # is more recent than 'target', then 'target' is out-of-date and - # we can immediately return true. If we fall through to the end - # of the loop, then 'target' is up-to-date and we return false. 
- from stat import ST_MTIME - - target_mtime = os.stat(target)[ST_MTIME] - for source in sources: - if not os.path.exists(source): - if missing == "error": # blow up when we stat() the file - pass - elif missing == "ignore": # missing source dropped from - continue # target's dependency list - elif missing == "newer": # missing source means target is - return 1 # out-of-date - - source_mtime = os.stat(source)[ST_MTIME] - if source_mtime > target_mtime: - return 1 - else: - return 0 - - -class CMakeExtension(Extension): - def __init__(self, name, source_dir=None): - # A CMakeExtension needs a source_dir instead of a file list. - Extension.__init__(self, name, sources=[]) - if source_dir is None: - self.source_dir = str(Path(__file__).parent.resolve()) - else: - self.source_dir = os.path.abspath(os.path.expanduser(source_dir)) - self.sources = _get_files(self.source_dir) - - def build_with_command(self, ext_builder): - """ - Custom `build_ext.build_extension` in `Extension` instead of `Command`. - `ext_builder` is the instance of `build_ext` command. - """ - # refer to https://github.com/pybind/cmake_example/blob/master/setup.py - if ext_builder.compiler.compiler_type == "msvc": - raise NotImplementedError - cmake_args = getattr(self, "cmake_args", []) + [ - "-DCMAKE_BUILD_TYPE={}".format("Debug" if ext_builder.debug else "Release"), - "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={}".format(ext_builder.build_lib), - ] - build_args = [] - - # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level - # across all generators. - if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ: - # self.parallel is a Python 3 only way to set parallel jobs by hand - # using -j in the build_ext call, not supported by pip or PyPA-build. - if hasattr(ext_builder, "parallel") and ext_builder.parallel: - # CMake 3.12+ only. - build_args += ["-j{}".format(ext_builder.parallel)] - - build_args += ["-j14"] - - if not os.path.exists(ext_builder.build_temp): - os.makedirs(ext_builder.build_temp) - - # Redirect stdout/stderr to mute, especially when allowing errors - stdout = getattr(self, "_std_out_handle", None) - subprocess.check_call( - ["cmake", self.source_dir] + cmake_args, cwd=ext_builder.build_temp, stdout=stdout, stderr=stdout - ) - subprocess.check_call( - ["cmake", "--build", "."] + build_args, cwd=ext_builder.build_temp, stdout=stdout, stderr=stdout - ) - - def get_target_filename(self): - """ - The file names of libraries. Currently only support one library for - one extension. - """ - raise NotImplementedError - - def get_output_filename(self): - """ - The file names of outputs, which mostly is the same with - `get_target_filename`. - """ - return self.get_target_filename() - - -class FasterTransformerExtension(CMakeExtension): - def __init__(self, name, source_dir=None, need_parallel=False): - super(FasterTransformerExtension, self).__init__(name, source_dir) - self.sources = _get_files(os.path.join(self.source_dir, "fast_transformer", "src")) + _get_files( - os.path.join(self.source_dir, "patches", "FasterTransformer") - ) - self._std_out_handle = None - # Env variable may not work as expected, since jit compile by `load` - # would not re-built if source code is not update. - # self.sm = os.environ.get("PPNLP_GENERATE_CODE", None) - # Whether or not to use model parallel. Note that since the building use - # a new process, we shoud find a way to let it know whether to use model - # parallel. 
- self.need_parallel = need_parallel - - def build_with_command(self, ext_builder): - if CUDA_HOME is None: # GPU only - # TODO(guosheng): should we touch a dummy file or add a quick exit - # method to avoid meaningless process in `load` - logger.warning("FastGeneration is not available because CUDA can not be found.") - raise NotImplementedError - # TODO(guosheng): Multiple -std seems be passed in FastGeneration, - # which is not allowed by NVCC. Fix it later. - self.cmake_args = [f"-DPY_CMD={sys.executable}"] - # `GetCUDAComputeCapability` is not exposed yet, and detect CUDA/GPU - # version in cmake file. - # self.cmake_args += [f"-DSM={self.sm}"] if self.sm is not None else [] - self.cmake_args += "-DWITH_GPT=ON -DON_INFER=OFF -DWITH_MKL=ON -DWITH_ONNXRUNTIME=ON".split() - - self.cmake_args += ["-DCMAKE_C_COMPILER={}".format(os.popen("which gcc").read().replace("\n", ""))] - self.cmake_args += ["-DCMAKE_CXX_COMPILER={}".format(os.popen("which g++").read().replace("\n", ""))] - - self.cmake_args += ["-DPYTHON_LIBRARY={}".format(sysconfig.get_config_var("LIBDIR"))] - self.cmake_args += ["-DPYTHON_INCLUDE_DIR={}".format(sysconfig.get_config_var("INCLUDEPY"))] - - if self.need_parallel: - self.cmake_args += ["-DWITH_PARALLEL=ON"] - - try: - super(FasterTransformerExtension, self).build_with_command(ext_builder) - # FastGeneration cmake file resets `CMAKE_LIBRARY_OUTPUT_DIRECTORY` - # to `CMAKE_BINARY_DIR/lib`, thus copy the lib back to `build_ext.build_lib`. - # Maybe move this copy to CMakeList. - # `copy_tree` or `copy_file`, boost lib might be included - ext_builder.copy_tree(os.path.join(ext_builder.build_temp, "lib"), ext_builder.build_lib) - # TODO(guosheng): Maybe we should delete the build dir especially - # when it is in the dir of paddlenlp package. - # os.remove(ext_builder.build_temp) - except Exception as e: - logger.warning("FastGeneration is not available due to build errors.") - raise e - - def get_target_filename(self): - # CMake file has fixed the name of lib, maybe we can copy it as the name - # returned by `BuildExtension.get_ext_filename` after build. - return "libdecoding_op.so" - - def get_output_filename(self): - return "libdecoding_op.so" - - -class BuildExtension(PaddleBuildExtension): - """ - Support both `CppExtention` of Paddle and custom extensions of PaddleNLP. - """ - - def build_extensions(self): - custom_exts = [] # for - no_custom_exts = [] # for normal extentions paddle.utils.cpp_extension - for ext in self.extensions: - if hasattr(ext, "build_with_command"): - # custom build in Extension - ext.build_with_command(self) - custom_exts.append(ext) - else: - no_custom_exts.append(ext) - if no_custom_exts: - # Build CppExtentio/CUDAExtension with `PaddleBuildExtension` - self.extensions = no_custom_exts - super(BuildExtension, self).build_extensions() - self.extensions = custom_exts + no_custom_exts - - -EXTENSIONS = { - "FastGeneration": FasterTransformerExtension, - # NOTE: Since model parallel code is supported by definitions, to avoid - # performance degrading on non-parallel mode, we use a separated lib for - # model parallel. - "FasterTransformerParallel": FasterTransformerExtension, -} - - -def get_extension_maker(name): - # Use `paddle.utils.cpp_extension.CppExtension` as the default - # TODO(guosheng): Maybe register extension classes into `Extensions`. - return EXTENSIONS.get(name, CppExtension) - - -def _write_setup_file(name, file_path, build_dir, **kwargs): - """ - Automatically generate setup.py and write it into build directory. 
- `kwargws` is arguments for the corresponding Extension initialization. - Any type extension can be jit build. - """ - template = textwrap.dedent( - """ - from setuptools import setup - from paddlenlp.ops.ext_utils import get_extension_maker, BuildExtension - - setup( - name='{name}', - ext_modules=[ - get_extension_maker('{name}')( - name='{name}', - {kwargs_str})], - cmdclass={{'build_ext' : BuildExtension.with_options( - output_dir=r'{build_dir}') - }})""" - ).lstrip() - kwargs_str = "" - for key, value in kwargs.items(): - kwargs_str += key + "=" + (f"'{value}'" if isinstance(value, str) else str(value)) + "," - content = template.format(name=name, kwargs_str=kwargs_str, build_dir=build_dir) - - with open(file_path, "w") as f: - f.write(content) - - -@file_lock(os.path.join(PPNLP_HOME, "load_ext.lock")) -def load(name, build_dir=None, force=False, verbose=False, **kwargs): - # TODO(guosheng): Need better way to resolve unsupported such as CPU. Currently, - # raise NotImplementedError and skip `_jit_compile`. Otherwise, `_jit_compile` - # will output the error to stdout (when verbose is True) and raise `RuntimeError`, - # which is not friendly for users though no other bad effect. - if CUDA_HOME is None: - logger.warning("%s is not available because CUDA can not be found." % name) - raise NotImplementedError - if name in LOADED_EXT.keys(): - # TODO(guosheng): Maybe the key should combined with kwargs since the - # extension object is created using them. - return LOADED_EXT[name] - if build_dir is None: - # build_dir = os.path.join(PPNLP_HOME, 'extenstions') - # Maybe under package dir is better to avoid cmake source path conflict - # with different source path, like this: - # build_dir = os.path.join( - # str(Path(__file__).parent.resolve()), 'extenstions') - # However if it is under the package dir, it might make the package hard - # to uninstall. Thus we put it in PPNLP_HOME with digest of current path, - # like this: - build_dir = os.path.join( - PPNLP_HOME, "extensions", hashlib.md5(str(Path(__file__).parent.resolve()).encode("utf-8")).hexdigest() - ) - build_base_dir = os.path.abspath(os.path.expanduser(os.path.join(build_dir, name))) - if not os.path.exists(build_base_dir): - os.makedirs(build_base_dir) - - extension = get_extension_maker(name)(name, **kwargs) - # Check if 'target' is out-of-date with respect to any file to avoid rebuild - if isinstance(extension, CMakeExtension): - # `CppExtention/CUDAExtension `has version manager by `PaddleBuildExtension` - # Maybe move this to CMakeExtension later. - # TODO(guosheng): flags/args changes may also trigger build, and maybe - # need version manager like `PaddleBuildExtension`. 
- out_filename = extension.get_output_filename() - if isinstance(out_filename, str): - out_filename = [out_filename] - out_filepath = [os.path.join(build_base_dir, f) for f in out_filename] - lib_filename = extension.get_target_filename() - lib_filepath = os.path.join(build_base_dir, lib_filename) - if not force: - ext_sources = extension.sources - if all(os.path.exists(f) and not newer_group(ext_sources, f, "newer") for f in out_filepath): - logger.debug("skipping '%s' extension (up-to-date) build" % name) - ops = load_op_meta_info_and_register_op(lib_filepath) - LOADED_EXT[name] = ops - return LOADED_EXT[name] - - # write setup file and jit compile - file_path = os.path.join(build_dir, name, "{}_setup.py".format(name)) - _write_setup_file(name, file_path, build_base_dir, **kwargs) - _jit_compile(file_path, verbose) - if isinstance(extension, CMakeExtension): - # Load a shared library (if exists) only to register op. - if os.path.exists(lib_filepath): - ops = load_op_meta_info_and_register_op(lib_filepath) - LOADED_EXT[name] = ops - return LOADED_EXT[name] - else: - # Import as callable python api - return _import_module_from_library(name, build_base_dir, verbose) diff --git a/paddlenlp/ops/fast_transformer/CMakeLists.txt b/paddlenlp/ops/fast_transformer/CMakeLists.txt deleted file mode 100644 index be58f747a2fb..000000000000 --- a/paddlenlp/ops/fast_transformer/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -add_subdirectory(src) diff --git a/paddlenlp/ops/fast_transformer/__init__.py b/paddlenlp/ops/fast_transformer/__init__.py deleted file mode 100644 index 185a92b8d94d..000000000000 --- a/paddlenlp/ops/fast_transformer/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/paddlenlp/ops/fast_transformer/sample/bart_decoding_sample.py b/paddlenlp/ops/fast_transformer/sample/bart_decoding_sample.py deleted file mode 100644 index 6e3edff7c33f..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/bart_decoding_sample.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-import time
-from pprint import pprint
-
-import paddle
-
-from paddlenlp.transformers import BartForConditionalGeneration, BartTokenizer
-from paddlenlp.utils.log import logger
-
-
-def postprocess_seq(seq, bos_idx, eos_idx, output_bos=False, output_eos=False):
-    """
-    Post-process the decoded sequence.
-    """
-    eos_pos = len(seq) - 1
-    for i, idx in enumerate(seq):
-        if idx == eos_idx:
-            eos_pos = i
-            break
-    seq = [idx for idx in seq[: eos_pos + 1] if (output_bos or idx != bos_idx) and (output_eos or idx != eos_idx)]
-    return seq
-
-
-def prepare_input(tokenizer, sentences):
-    tokenized = tokenizer(sentences, padding=True)
-    input_ids = paddle.to_tensor(tokenized["input_ids"], dtype="int64")
-    return input_ids
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--model_name_or_path",
-        default="bart-base",
-        type=str,
-        help="The model name to specify the bart to use. Can be one of ['bart-base', 'bart-large',]. ",
-    )
-    parser.add_argument(
-        "--decoding_strategy",
-        default="beam_search",
-        type=str,
-        help="The decoding strategy. Can be one of [greedy_search, beam_search, sampling]",
-    )
-    parser.add_argument("--beam_size", default=5, type=int, help="The parameters for beam search. ")
-    parser.add_argument("--top_k", default=4, type=int, help="The number of candidate to procedure beam search. ")
-    parser.add_argument(
-        "--top_p", default=1.0, type=float, help="The probability threshold to procedure topp sampling. "
-    )
-    parser.add_argument("--max_length", default=20, type=int, help="Maximum output length. ")
-    parser.add_argument("--diversity_rate", default=0.0, type=float, help="The diversity of beam search. ")
-    parser.add_argument(
-        "--length_penalty", default=0.6, type=float, help="The power number in length penalty calculation"
-    )
-    parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ")
-    args = parser.parse_args()
-    return args
-
-
-def do_predict(args):
-    place = "gpu"
-    paddle.set_device(place)
-
-    tokenizer = BartTokenizer.from_pretrained(args.model_name_or_path)
-    logger.info("Loading the model parameters, please wait...")
-    model = BartForConditionalGeneration.from_pretrained(args.model_name_or_path)
-
-    # Set evaluate mode
-    model.eval()
-    sentences = [
-        "I love that girl, but <mask> does not <mask> me.",
-        "She is so <mask> that I can not help glance at <mask>.",
-        "Nothing's gonna <mask> my love for you.",
-        "Drop everything now. Meet me in the pouring <mask>. Kiss me on the sidewalk.",
-    ]
-
-    bos_id = model.bart.config["bos_token_id"]
-    eos_id = model.bart.config["eos_token_id"]
-    input_ids = prepare_input(tokenizer, sentences)
-    # Define model
-    fast_bart = model
-
-    # Set evaluate mode
-    fast_bart.eval()
-
-    with paddle.no_grad():
-        for i in range(100):
-            # For warmup.
- if 50 == i: - # PaddlePaddle >= 2.2 - paddle.device.cuda.synchronize() - start = time.perf_counter() - finished_seq, _ = fast_bart.generate( - input_ids=input_ids, - max_length=args.max_length, - decode_strategy=args.decoding_strategy, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.beam_size, - diversity_rate=args.diversity_rate, - length_penalty=args.length_penalty, - use_fp16_decoding=args.use_fp16_decoding, - use_fast=True, - ) - - paddle.device.cuda.synchronize() - logger.info("Average test time for decoding is %f ms" % ((time.perf_counter() - start) / 50 * 1000)) - - # Output - finished_seq = finished_seq.numpy() - for ins in finished_seq: - generated_ids = postprocess_seq(ins, bos_id, eos_id) - print(tokenizer.convert_ids_to_string(generated_ids)) - - -if __name__ == "__main__": - args = parse_args() - pprint(args) - - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/bart_export_model_sample.py b/paddlenlp/ops/fast_transformer/sample/bart_export_model_sample.py deleted file mode 100644 index e72e5c3ae91a..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/bart_export_model_sample.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -from pprint import pprint - -import paddle - -from paddlenlp.ops import FasterBART -from paddlenlp.transformers import BartForConditionalGeneration, BartTokenizer -from paddlenlp.utils.log import logger - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", default="bart-base", type=str, help="The model name to specify the bart to use. " - ) - parser.add_argument( - "--inference_model_dir", default="./infer_model/", type=str, help="Path to save inference model of bart. " - ) - parser.add_argument("--topk", default=4, type=int, help="The number of candidate to procedure top_k sampling. ") - parser.add_argument( - "--topp", default=1.0, type=float, help="The probability threshold to procedure top_p sampling. " - ) - parser.add_argument("--max_out_len", default=20, type=int, help="Maximum output length. ") - parser.add_argument("--temperature", default=1.0, type=float, help="The temperature to set. ") - parser.add_argument("--num_return_sequences", default=1, type=int, help="The number of returned sequences. ") - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ") - parser.add_argument( - "--decoding_strategy", - default="beam_search", - choices=["sampling", "beam_search"], - type=str, - help="The main strategy to decode. ", - ) - parser.add_argument("--num_beams", default=5, type=int, help="The number of candidate to procedure beam search. ") - parser.add_argument( - "--diversity_rate", default=0.0, type=float, help="The diversity rate to procedure beam search. " - ) - parser.add_argument("--repetition_penalty", default=1.0, type=float, help="The repetition_penalty to set. 
") - parser.add_argument("--length_penalty", default=0.0, type=float, help="The length penalty to decode. ") - parser.add_argument("--early_stopping", action="store_true", help="Whether to do early stopping. ") - - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - model = BartForConditionalGeneration.from_pretrained(args.model_name_or_path) - tokenizer = BartTokenizer.from_pretrained(args.model_name_or_path) - - # For opening faster_encoder - model.eval() - - fast_bart = FasterBART(model=model, use_fp16_decoding=args.use_fp16_decoding) - # Set evaluate mode - fast_bart.eval() - - # Convert dygraph model to static graph model - fast_bart = paddle.jit.to_static( - fast_bart, - input_spec=[ - # input_ids - paddle.static.InputSpec(shape=[None, None], dtype="int32"), - # encoder_output - None, - # seq_len - None, - args.num_beams, # num_beams. - args.topk, - args.topp, - args.decoding_strategy, - tokenizer.bos_token_id, # bos - tokenizer.eos_token_id, # eos - tokenizer.pad_token_id, # pad - tokenizer.eos_token_id, # decoder_start_token_id - args.max_out_len, # max_length - args.diversity_rate, # diversity_rate - args.length_penalty, # length_penalty - args.num_return_sequences, - args.early_stopping, - tokenizer.eos_token_id, # forced_eos_token_id - ], - ) - - # Save converted static graph model - paddle.jit.save(fast_bart, os.path.join(args.inference_model_dir, "bart")) - logger.info("BART has been saved to {}.".format(args.inference_model_dir)) - - -if __name__ == "__main__": - args = parse_args() - pprint(args) - - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/bart_inference.py b/paddlenlp/ops/fast_transformer/sample/bart_inference.py deleted file mode 100644 index 3ba744a0efde..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/bart_inference.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -from pprint import pprint - -import numpy as np -import paddle.inference as paddle_infer - -from paddlenlp.ops.ext_utils import load -from paddlenlp.transformers import BartTokenizer - - -def setup_args(): - """Setup arguments.""" - parser = argparse.ArgumentParser() - parser.add_argument( - "--inference_model_dir", default="./infer_model/", type=str, help="Path to save inference model of BART. " - ) - - args = parser.parse_args() - - return args - - -def prepare_input(tokenizer, sentences): - tokenized = tokenizer(sentences, padding=True) - input_ids = np.asarray(tokenized["input_ids"], dtype="int32") - return input_ids - - -def postprocess_seq(seq, bos_idx, eos_idx, output_bos=False, output_eos=False): - """ - Post-process the decoded sequence. 
- """ - eos_pos = len(seq) - 1 - for i, idx in enumerate(seq): - if idx == eos_idx: - eos_pos = i - break - seq = [idx for idx in seq[: eos_pos + 1] if (output_bos or idx != bos_idx) and (output_eos or idx != eos_idx)] - return seq - - -def infer(args): - model_name = "bart-base" - tokenizer = BartTokenizer.from_pretrained(model_name) - - sentences = [ - "I love that girl, but does not me.", - "She is so that I can not help glance at .", - "Nothing's gonna my love for you.", - "Drop everything now. Meet me in the pouring . Kiss me on the sidewalk.", - ] - - input_ids = prepare_input(tokenizer, sentences) - - # Load FastGeneration lib. - load("FastGeneration", verbose=True) - - config = paddle_infer.Config( - os.path.join(args.inference_model_dir, "bart.pdmodel"), - os.path.join(args.inference_model_dir, "bart.pdiparams"), - ) - - config.enable_use_gpu(100, 0) - config.disable_glog_info() - # `embedding_eltwise_layernorm_fuse_pass` failed - config.delete_pass("embedding_eltwise_layernorm_fuse_pass") - predictor = paddle_infer.create_predictor(config) - - input_names = predictor.get_input_names() - input_handle = predictor.get_input_handle(input_names[0]) - input_handle.copy_from_cpu(input_ids.astype("int32")) - - predictor.run() - - output_names = predictor.get_output_names() - output_handle = predictor.get_output_handle(output_names[0]) - output_data = output_handle.copy_to_cpu() - - for idx, sample in enumerate(output_data.transpose([1, 2, 0]).tolist()): - for beam_idx, beam in enumerate(sample): - if beam_idx >= len(sample) / 2: - break - generated_ids = postprocess_seq(beam, tokenizer.bos_token_id, tokenizer.eos_token_id) - seq = tokenizer.convert_ids_to_string(generated_ids) - print(f"{idx}-{beam_idx}: {seq}") - - -if __name__ == "__main__": - args = setup_args() - pprint(args) - - infer(args) diff --git a/paddlenlp/ops/fast_transformer/sample/config/decoder.sample.yaml b/paddlenlp/ops/fast_transformer/sample/config/decoder.sample.yaml deleted file mode 100644 index 7a9eee1bfc60..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/config/decoder.sample.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# Batch size during inference. -infer_batch_size: 8 -max_out_len: 32 - -# Hyparams for model: -# These following five vocabularies related configurations will be set -# automatically according to the passed vocabulary path and special tokens. -# Size of source word dictionary. -src_vocab_size: 38512 -# Size of target word dictionay -trg_vocab_size: 38512 -# Index for token -bos_idx: 0 -# Index for token -eos_idx: 1 -# Index for token -unk_idx: 2 -# Max length of sequences deciding the size of position encoding table. -max_length: 32 -# The dimension for word embeddings, which is also the last dimension of -# the input and output of multi-head attention, position-wise feed-forward -# networks, encoder and decoder. -d_model: 512 -# Size of the hidden layer in position-wise feed-forward networks. -d_inner_hid: 2048 -# Number of head used in multi-head attention. -n_head: 8 -# Number of sub-layers to be stacked in the encoder. -num_encoder_layers: 6 -# Number of sub-layers to be stacked in the decoder. -num_decoder_layers: 6 -# Dropout rates. -dropout: 0.1 -# The flag indicating whether to share embedding and softmax weights. -# Vocabularies in source and target should be same for weight sharing. 
-weight_sharing: True
-
-# Path of trained parameter, to make prediction
-init_from_params: base_trained_models/step_final/
diff --git a/paddlenlp/ops/fast_transformer/sample/config/decoding.sample.yaml b/paddlenlp/ops/fast_transformer/sample/config/decoding.sample.yaml
deleted file mode 100644
index b0ac5ba2e774..000000000000
--- a/paddlenlp/ops/fast_transformer/sample/config/decoding.sample.yaml
+++ /dev/null
@@ -1,44 +0,0 @@
-# Batch size during inference.
-infer_batch_size: 32
-# Hyparams for generation:
-decoding_strategy: "beam_search"
-# The parameters for beam search.
-beam_size: 4
-# The parameters for topk sampling.
-topk: 4
-# The parameters for topp sampling.
-topp: 0.0
-max_out_len: 32
-# The number of decoded sentences to output.
-n_best: 1
-
-# Hyparams for model:
-# These following five vocabularies related configurations will be set
-# automatically according to the passed vocabulary path and special tokens.
-# Size of source word dictionary.
-src_vocab_size: 30000
-# Size of target word dictionay
-trg_vocab_size: 30000
-# Index for <bos> token
-bos_idx: 0
-# Index for <eos> token
-eos_idx: 1
-# Index for <unk> token
-unk_idx: 2
-# Max length of sequences deciding the size of position encoding table.
-max_length: 32
-# The dimension for word embeddings, which is also the last dimension of
-# the input and output of multi-head attention, position-wise feed-forward
-# networks, encoder and decoder.
-d_model: 512
-# Size of the hidden layer in position-wise feed-forward networks.
-d_inner_hid: 2048
-# Number of head used in multi-head attention.
-n_head: 8
-# Number of sub-layers to be stacked in the encoder and decoder.
-n_layer: 6
-# Dropout rates.
-dropout: 0.1
-# The flag indicating whether to share embedding and softmax weights.
-# Vocabularies in source and target should be same for weight sharing.
-weight_sharing: True
diff --git a/paddlenlp/ops/fast_transformer/sample/decoder_sample.py b/paddlenlp/ops/fast_transformer/sample/decoder_sample.py
deleted file mode 100644
index 6927aa41ad60..000000000000
--- a/paddlenlp/ops/fast_transformer/sample/decoder_sample.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import os
-import time
-from pprint import pprint
-
-import paddle
-import yaml
-from attrdict import AttrDict
-
-from paddlenlp.ops import FasterDecoder
-from paddlenlp.utils.log import logger
-
-
-def get_op_cache_config(use_batch_major_op_cache, size_per_head, is_fp16):
-    x = 8 if is_fp16 else 4
-    use_batch_major_op_cache = True if use_batch_major_op_cache is True and size_per_head % x == 0 else False
-    x = x if use_batch_major_op_cache else 1
-    return use_batch_major_op_cache, x
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--config", default="./config/decoder.sample.yaml", type=str, help="Path of the config file. 
") - parser.add_argument( - "--decoder_lib", default="../../build/lib/libdecoding_op.so", type=str, help="Path of libdecoding_op.so. " - ) - parser.add_argument("--use_fp16_decoder", action="store_true", help="Whether to use fp16 decoder to predict. ") - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - paddle.set_device(place) - - use_batch_major_op_cache = True - size_per_head = args.d_model // args.n_head - use_batch_major_op_cache, x = get_op_cache_config(use_batch_major_op_cache, size_per_head, args.use_fp16_decoder) - print(f"use_batch_major_op_cache={use_batch_major_op_cache}, x={x}") - # Define model - transformer = FasterDecoder( - src_vocab_size=args.src_vocab_size, - trg_vocab_size=args.trg_vocab_size, - max_length=args.max_length + 1, - num_encoder_layers=args.num_encoder_layers, - num_decoder_layers=args.num_decoder_layers, - n_head=args.n_head, - d_model=args.d_model, - d_inner_hid=args.d_inner_hid, - dropout=args.dropout, - weight_sharing=args.weight_sharing, - bos_id=args.bos_idx, - eos_id=args.eos_idx, - max_out_len=args.max_out_len, - decoder_lib=args.decoder_lib, - use_fp16_decoder=args.use_fp16_decoder, - use_batch_major_op_cache=use_batch_major_op_cache, - ) - - # Load checkpoint. - transformer.load(os.path.join(args.init_from_params, "transformer.pdparams")) - # Set evaluate mode - transformer.eval() - - # Generate data randomly - dec_input = paddle.randn(shape=[args.infer_batch_size, 1, args.d_model], dtype="float32") - enc_output = paddle.randn(shape=[args.infer_batch_size, args.max_length, args.d_model], dtype="float32") - mem_seq_lens = paddle.full(shape=[args.infer_batch_size, 1], fill_value=args.max_length, dtype="int32") - dtype = "float32" - if args.use_fp16_decoder: - dtype = "float16" - dec_input = paddle.cast(dec_input, dtype=dtype) - enc_output = paddle.cast(enc_output, dtype=dtype) - if not use_batch_major_op_cache: - self_cache_key = paddle.zeros( - shape=[args.num_decoder_layers, 0, args.infer_batch_size, args.d_model], dtype=dtype - ) - self_cache_value = paddle.zeros( - shape=[args.num_decoder_layers, 0, args.infer_batch_size, args.d_model], dtype=dtype - ) - else: - self_cache_key = paddle.zeros( - shape=[ - args.num_decoder_layers, - args.infer_batch_size, - args.n_head, - size_per_head // x, - args.max_out_len, - x, - ], - dtype=dtype, - ) - self_cache_value = paddle.zeros( - shape=[args.num_decoder_layers, args.infer_batch_size, args.n_head, args.max_out_len, size_per_head], - dtype=dtype, - ) - mem_cache = paddle.zeros( - shape=[args.num_decoder_layers, 2, args.infer_batch_size, args.max_length, args.d_model], dtype=dtype - ) - - with paddle.no_grad(): - for i in range(100): - # For warmup. 
- if 50 == i: - start = time.time() - paddle.device.cuda.synchronize() - dec_output, self_cache_key, self_cache_value, mem_cache = transformer.decoder( - from_tensor=dec_input, - memory_tensor=enc_output, - mem_seq_len=mem_seq_lens, - self_cache_key=self_cache_key, - self_cache_value=self_cache_value, - mem_cache=mem_cache, - step=0, - memory_hidden_dim=args.d_model, - is_fuse_qkv=False, - ) - paddle.device.cuda.synchronize() - logger.info("Average test time for decoder is %f ms" % ((time.time() - start) / 50 * 1000)) - - -if __name__ == "__main__": - ARGS = parse_args() - yaml_file = ARGS.config - with open(yaml_file, "rt") as f: - args = AttrDict(yaml.safe_load(f)) - args.decoder_lib = ARGS.decoder_lib - args.use_fp16_decoder = ARGS.use_fp16_decoder - pprint(args) - - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/decoding_sample.py b/paddlenlp/ops/fast_transformer/sample/decoding_sample.py deleted file mode 100644 index 5ea3f94f1138..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/decoding_sample.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -from pprint import pprint - -import numpy as np -import paddle -import yaml -from attrdict import AttrDict - -from paddlenlp.ops import FasterTransformer -from paddlenlp.utils.log import logger - -paddle.seed(2) -np.random.seed(2) - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--config", - default="./fast_transformer/sample/config/decoding.sample.yaml", - type=str, - help="Path of the config file. ", - ) - parser.add_argument( - "--decoding_lib", default="./build/lib/libdecoding_op.so", type=str, help="Path of libdecoding_op.so. " - ) - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. 
") - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - # Define model - transformer = FasterTransformer( - src_vocab_size=args.src_vocab_size, - trg_vocab_size=args.trg_vocab_size, - max_length=args.max_length + 1, - num_encoder_layers=args.n_layer, - num_decoder_layers=args.n_layer, - n_head=args.n_head, - d_model=args.d_model, - d_inner_hid=args.d_inner_hid, - dropout=args.dropout, - weight_sharing=args.weight_sharing, - bos_id=args.bos_idx, - eos_id=args.eos_idx, - decoding_strategy=args.decoding_strategy, - beam_size=args.beam_size, - topk=args.topk, - topp=args.topp, - max_out_len=args.max_out_len, - decoding_lib=args.decoding_lib, - use_fp16_decoding=args.use_fp16_decoding, - ) - - # Set evaluate mode - transformer.eval() - - enc_output = paddle.randn([args.infer_batch_size, args.max_length, args.d_model]) - if args.use_fp16_decoding: - enc_output = paddle.cast(enc_output, "float16") - mem_seq_len = paddle.randint(1, args.max_length + 1, shape=[args.infer_batch_size], dtype="int32") - with paddle.no_grad(): - for i in range(100): - # For warmup. - if 50 == i: - start = time.time() - transformer.decoding(enc_output=enc_output, memory_seq_lens=mem_seq_len) - logger.info("Average test time for decoding is %f ms" % ((time.time() - start) / 50 * 1000)) - - -if __name__ == "__main__": - ARGS = parse_args() - yaml_file = ARGS.config - with open(yaml_file, "rt") as f: - args = AttrDict(yaml.safe_load(f)) - pprint(args) - args.decoding_lib = ARGS.decoding_lib - args.use_fp16_decoding = ARGS.use_fp16_decoding - - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/encoder_decoder_sample.py b/paddlenlp/ops/fast_transformer/sample/encoder_decoder_sample.py deleted file mode 100644 index eba7667d30ca..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/encoder_decoder_sample.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os -import time -from pprint import pprint - -import paddle -import yaml -from attrdict import AttrDict - -from paddlenlp.ops import FasterDecoder -from paddlenlp.utils.log import logger - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--config", default="./config/decoder.sample.yaml", type=str, help="Path of the config file. ") - parser.add_argument( - "--decoder_lib", default="../../build/lib/libdecoder_op.so", type=str, help="Path of libdecoder_op.so. " - ) - parser.add_argument("--use_fp16_decoder", action="store_true", help="Whether to use fp16 decoder to predict. 
") - args = parser.parse_args() - return args - - -def get_op_cache_config(use_batch_major_op_cache, size_per_head, is_fp16): - x = 8 if is_fp16 else 4 - use_batch_major_op_cache = True if use_batch_major_op_cache is True and size_per_head % x == 0 else False - x = x if use_batch_major_op_cache else 1 - return use_batch_major_op_cache, x - - -def do_predict(args): - place = "gpu" - paddle.set_device(place) - - use_batch_major_op_cache = True - size_per_head = args.d_model // args.n_head - use_batch_major_op_cache, x = get_op_cache_config(use_batch_major_op_cache, size_per_head, args.use_fp16_decoder) - - # Define model - transformer = FasterDecoder( - src_vocab_size=args.src_vocab_size, - trg_vocab_size=args.trg_vocab_size, - max_length=args.max_length + 1, - num_encoder_layers=args.num_encoder_layers, - num_decoder_layers=args.num_decoder_layers, - n_head=args.n_head, - d_model=args.d_model, - d_inner_hid=args.d_inner_hid, - dropout=args.dropout, - weight_sharing=args.weight_sharing, - bos_id=args.bos_idx, - eos_id=args.eos_idx, - max_out_len=args.max_out_len, - decoder_lib=args.decoder_lib, - use_fp16_decoder=args.use_fp16_decoder, - use_batch_major_op_cache=use_batch_major_op_cache, - ) - - # Load checkpoint. - transformer.load(os.path.join(args.init_from_params, "transformer.pdparams")) - # Set evaluate mode - transformer.eval() - - # Generate src_word randomly - src_word = paddle.randint(0, args.src_vocab_size, shape=[args.infer_batch_size, args.max_length], dtype="int64") - - with paddle.no_grad(): - for i in range(100): - # For warmup. - if 50 == i: - start = time.time() - paddle.device.cuda.synchronize() - finished_seq, finished_scores = transformer(src_word=src_word) - paddle.device.cuda.synchronize() - logger.info("Average test time for decoder is %f ms" % ((time.time() - start) / 50 * 1000)) - - -if __name__ == "__main__": - ARGS = parse_args() - yaml_file = ARGS.config - with open(yaml_file, "rt") as f: - args = AttrDict(yaml.safe_load(f)) - args.decoder_lib = ARGS.decoder_lib - args.use_fp16_decoder = ARGS.use_fp16_decoder - pprint(args) - - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/encoder_decoding_sample.py b/paddlenlp/ops/fast_transformer/sample/encoder_decoding_sample.py deleted file mode 100644 index 2c4a092465a2..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/encoder_decoding_sample.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -from pprint import pprint - -import numpy as np -import paddle -import yaml -from attrdict import AttrDict - -from paddlenlp.data import Pad -from paddlenlp.ops import FasterTransformer, enable_fast_encoder -from paddlenlp.utils.log import logger - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--config", - default="./fast_transformer/sample/config/decoding.sample.yaml", - type=str, - help="Path of the config file. 
", - ) - parser.add_argument( - "--decoding_lib", default="./build/lib/libdecoding_op.so", type=str, help="Path of libdecoding_op.so. " - ) - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ") - parser.add_argument( - "--enable_fast_encoder", - action="store_true", - help="Whether to use fast version encoder to predict. This is experimental option for now. ", - ) - parser.add_argument("--use_fp16_encoder", action="store_true", help="Whether to use fp16 encoder to predict. ") - args = parser.parse_args() - return args - - -def generate_src_word(batch_size, vocab_size, max_length, eos_idx, pad_idx): - memory_sequence_length = np.random.randint(low=1, high=max_length, size=batch_size).astype(np.int32) - data = [] - for i in range(batch_size): - data.append(np.random.randint(low=3, high=vocab_size, size=memory_sequence_length[i], dtype=np.int64)) - - word_pad = Pad(pad_idx) - src_word = word_pad([list(word) + [eos_idx] for word in data]) - - return paddle.to_tensor(src_word, dtype="int64") - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - # Define model - transformer = FasterTransformer( - src_vocab_size=args.src_vocab_size, - trg_vocab_size=args.trg_vocab_size, - max_length=args.max_length + 1, - num_encoder_layers=args.n_layer, - num_decoder_layers=args.n_layer, - n_head=args.n_head, - d_model=args.d_model, - d_inner_hid=args.d_inner_hid, - dropout=args.dropout, - weight_sharing=args.weight_sharing, - bos_id=args.bos_idx, - eos_id=args.eos_idx, - decoding_strategy=args.decoding_strategy, - beam_size=args.beam_size, - topk=args.topk, - topp=args.topp, - max_out_len=args.max_out_len, - decoding_lib=args.decoding_lib, - use_fp16_decoding=args.use_fp16_decoding, - enable_fast_encoder=args.enable_fast_encoder, - use_fp16_encoder=args.use_fp16_encoder, - ) - - # Set evaluate mode - transformer.eval() - - if args.enable_fast_encoder: - transformer = enable_fast_encoder(transformer, use_fp16=args.use_fp16_encoder) - - src_word = generate_src_word( - batch_size=args.infer_batch_size, - vocab_size=args.src_vocab_size, - max_length=args.max_length, - eos_idx=args.eos_idx, - pad_idx=args.bos_idx, - ) - - with paddle.no_grad(): - for i in range(100): - # For warmup. - if 50 == i: - paddle.device.cuda.synchronize(place) - start = time.time() - transformer(src_word=src_word) - paddle.device.cuda.synchronize(place) - logger.info("Average test time for encoder-decoding is %f ms" % ((time.time() - start) / 50 * 1000)) - - -if __name__ == "__main__": - ARGS = parse_args() - yaml_file = ARGS.config - with open(yaml_file, "rt") as f: - args = AttrDict(yaml.safe_load(f)) - args.decoding_lib = ARGS.decoding_lib - args.use_fp16_decoding = ARGS.use_fp16_decoding - args.enable_fast_encoder = ARGS.enable_fast_encoder - args.use_fp16_encoder = ARGS.use_fp16_encoder - pprint(args) - - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/gpt_export_model_sample.py b/paddlenlp/ops/fast_transformer/sample/gpt_export_model_sample.py deleted file mode 100644 index f3ab7771bf17..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/gpt_export_model_sample.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os -from pprint import pprint - -import paddle - -from paddlenlp.ops import FasterGPT -from paddlenlp.transformers import GPTChineseTokenizer, GPTLMHeadModel, GPTTokenizer -from paddlenlp.utils.log import logger - -MODEL_CLASSES = { - "gpt-cpm-large-cn": (GPTLMHeadModel, GPTChineseTokenizer), - "gpt2-medium-en": (GPTLMHeadModel, GPTTokenizer), -} - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", - default="gpt2-medium-en", - type=str, - help="The model name to specify the gpt to use. Can be one of ['gpt2-en', 'gpt2-medium-en', 'gpt-cpm-large-cn']. ", - ) - parser.add_argument( - "--decoding_lib", default="../../build/lib/libdecoding_op.so", type=str, help="Path of libdecoding_op.so. " - ) - parser.add_argument( - "--inference_model_dir", default="./infer_model/", type=str, help="Path to save inference model of gpt. " - ) - parser.add_argument("--topk", default=4, type=int, help="The number of candidate to procedure beam search. ") - parser.add_argument( - "--topp", default=0.0, type=float, help="The probability threshold to procedure topp sampling. " - ) - parser.add_argument("--max_out_len", default=32, type=int, help="Maximum output length. ") - parser.add_argument("--temperature", default=1.0, type=float, help="The temperature to set. ") - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. 
") - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - model_class, tokenizer_class = MODEL_CLASSES[args.model_name_or_path] - tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) - logger.info("Loading the model parameters, please wait...") - model = model_class.from_pretrained(args.model_name_or_path, max_predict_len=args.max_out_len) - - gpt = FasterGPT(model=model, decoding_lib=args.decoding_lib, use_fp16_decoding=args.use_fp16_decoding) - - # Set evaluate mode - gpt.eval() - - # Convert dygraph model to static graph model - gpt = paddle.jit.to_static( - gpt, - input_spec=[ - # input_ids - paddle.static.InputSpec(shape=[None, None], dtype="int32"), - # - # If it's necessarry to provide mem_seq_len and attention_mask, - # the parameters should be: - # mem_seq_len - # paddle.static.InputSpec(shape=[None, None], dtype="int32"), - # attention_mask - # paddle.static.InputSpec(shape=[None, None, None], dtype="float16" if args.use_fp16_decoding else "float32"), - # - None, # mem_seq_len - None, # attention_mask - args.topk, - args.topp, - args.max_out_len, - tokenizer.eos_token_id, - tokenizer.eos_token_id, - tokenizer.pad_token_id, - None, # forced_eos_token_id - args.temperature, - ], - ) - - # Save converted static graph model - paddle.jit.save(gpt, os.path.join(args.inference_model_dir, "gpt")) - logger.info("GPT has been saved to {}".format(args.inference_model_dir)) - - gpt.save_resources(tokenizer, args.inference_model_dir) - - -if __name__ == "__main__": - args = parse_args() - pprint(args) - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/gpt_sample.py b/paddlenlp/ops/fast_transformer/sample/gpt_sample.py deleted file mode 100644 index 7f3344fe6063..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/gpt_sample.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -from pprint import pprint - -import numpy as np -import paddle - -from paddlenlp.transformers import GPTChineseTokenizer, GPTLMHeadModel, GPTTokenizer -from paddlenlp.utils.log import logger - -MODEL_CLASSES = { - "gpt-cpm-large-cn": (GPTLMHeadModel, GPTChineseTokenizer), - "gpt2-medium-en": (GPTLMHeadModel, GPTTokenizer), -} - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", - default="gpt2-medium-en", - type=str, - help="The model name to specify the gpt to use. Can be one of ['gpt2-en', 'gpt2-medium-en', 'gpt-cpm-large-cn']. ", - ) - parser.add_argument( - "--decoding_lib", default="../build/lib/libdecoding_op.so", type=str, help="Path of libdecoding_op.so. " - ) - parser.add_argument("--batch_size", default=4, type=int, help="Batch size. ") - parser.add_argument("--topk", default=4, type=int, help="The number of candidate to procedure beam search. 
") - parser.add_argument( - "--topp", default=1.0, type=float, help="The probability threshold to procedure topp sampling. " - ) - parser.add_argument("--max_length", default=32, type=int, help="Maximum output length. ") - parser.add_argument( - "--start_token", default="<|endoftext|>", type=str, help="The start token. Defaults to <|endoftext|>. " - ) - parser.add_argument( - "--end_token", default="<|endoftext|>", type=str, help="The end token. Defaults to <|endoftext|>. " - ) - parser.add_argument("--temperature", default=1.0, type=float, help="The temperature to set. ") - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ") - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - model_class, tokenizer_class = MODEL_CLASSES[args.model_name_or_path] - tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) - logger.info("Loading the model parameters, please wait...") - model = model_class.from_pretrained(args.model_name_or_path) - model.eval() - - bos_id = tokenizer.convert_tokens_to_ids(args.start_token) - eos_id = tokenizer.convert_tokens_to_ids(args.end_token) - - # Define model - gpt = model - - # Set evaluate mode - gpt.eval() - input_ids = np.array([[bos_id] for i in range(args.batch_size * 1)]).astype("int64").reshape([args.batch_size, 1]) - input_ids = paddle.to_tensor(input_ids) - - with paddle.no_grad(): - for i in range(100): - # For warmup. - if 50 == i: - paddle.device.cuda.synchronize(place) - start = time.time() - out_seq, _ = gpt.generate( - input_ids, - top_k=args.topk, - top_p=args.topp, - max_length=args.max_length, - temperature=args.temperature, - bos_token_id=bos_id, - eos_token_id=eos_id, - decode_strategy="sampling", - use_fp16_decoding=args.use_fp16_decoding, - use_fast=True, - ) - output_sequence = out_seq.numpy() - - paddle.device.cuda.synchronize(place) - logger.info("Average test time for decoding is %f ms" % ((time.time() - start) / 50 * 1000)) - output_sequence = out_seq.numpy().tolist() - for i in range(args.batch_size): - print("========== Sample-%d ==========" % i) - print(tokenizer.convert_ids_to_string(output_sequence[i])) - - -if __name__ == "__main__": - args = parse_args() - pprint(args) - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/mbart_decoding_sample.py b/paddlenlp/ops/fast_transformer/sample/mbart_decoding_sample.py deleted file mode 100644 index 76c0d5699323..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/mbart_decoding_sample.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -import time -from pprint import pprint - -import paddle - -from paddlenlp.data import Pad -from paddlenlp.transformers import MBartForConditionalGeneration, MBartTokenizer -from paddlenlp.utils.log import logger - - -def post_process_seq(seq, bos_idx, eos_idx, output_bos=False, output_eos=False): - """ - Post-process the decoded sequence. - """ - eos_pos = len(seq) - 1 - for i, idx in enumerate(seq): - if idx == eos_idx: - eos_pos = i - break - seq = [idx for idx in seq[: eos_pos + 1] if (output_bos or idx != bos_idx) and (output_eos or idx != eos_idx)] - return seq - - -def prepare_input(tokenizer, sentences, pad_id): - word_pad = Pad(pad_id, dtype="int64") - tokenized = tokenizer(sentences, return_length=True) - inputs = word_pad(tokenized["input_ids"]) - input_ids = paddle.to_tensor(inputs) - return input_ids - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", - default="mbart-large-50-one-to-many-mmt", - type=str, - help="The model name to specify the bart to use. ", - choices=[ - "mbart-large-50-one-to-many-mmt", - "mbart-large-50-many-to-one-mmt", - "mbart-large-50-many-to-many-mmt", - "mbart-large-cc25", - "mbart-large-en-ro", - ], - ) - parser.add_argument( - "--decoding_strategy", - default="beam_search", - type=str, - help="The decoding strategy.", - choices=["greedy_search", "beam_search", "sampling"], - ) - parser.add_argument("--beam_size", default=4, type=int, help="The parameters for beam search. ") - parser.add_argument("--top_k", default=4, type=int, help="The number of candidate to procedure beam search. ") - parser.add_argument( - "--top_p", default=1.0, type=float, help="The probability threshold to procedure topp sampling. " - ) - parser.add_argument("--max_length", default=50, type=int, help="Maximum output length. ") - parser.add_argument("--diversity_rate", default=0.0, type=float, help="The diversity of beam search. ") - parser.add_argument( - "--length_penalty", default=0.0, type=float, help="The power number in length penalty calculation" - ) - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ") - parser.add_argument("--not_use_faster", action="store_false", help="Whether to use FastGeneration. ") - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - paddle.set_device(place) - - tokenizer = MBartTokenizer.from_pretrained(args.model_name_or_path, src_lang="en_XX") - logger.info("Loading the model parameters, please wait...") - model = MBartForConditionalGeneration.from_pretrained(args.model_name_or_path) - # Set evaluate mode - model.eval() - sentences = [ - "I love that girl, but she does not love me.", - "She is so beautiful that I can not help glance at her.", - "Nothing's gonna change my love for you.", - "Drop everything now. Meet me in the pouring rain. Kiss me on the sidewalk.", - ] - - eos_id = model.mbart.config["eos_token_id"] - pad_id = model.mbart.config["pad_token_id"] - input_ids = prepare_input(tokenizer, sentences, pad_id) - - with paddle.no_grad(): - for i in range(100): - # For warmup. 
- if 50 == i: - # PaddlePaddle >= 2.2 - paddle.device.cuda.synchronize() - start = time.perf_counter() - finished_seqs, _ = model.generate( - input_ids=input_ids, - forced_bos_token_id=tokenizer.lang_code_to_id["zh_CN"], - max_length=args.max_length, - decode_strategy=args.decoding_strategy, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.beam_size, - diversity_rate=args.diversity_rate, - length_penalty=args.length_penalty, - use_fast=args.not_use_faster, - ) - paddle.device.cuda.synchronize() - logger.info("Average test time for decoding is %f ms" % ((time.perf_counter() - start) / 50 * 1000)) - - # Output - finished_seqs = finished_seqs.numpy().tolist() - for idx, finished_seq in enumerate(finished_seqs): - finished_seq = finished_seq - print(f"source: {sentences[idx]}") - finished_seq = post_process_seq(finished_seq, tokenizer.lang_code_to_id["zh_CN"], eos_id) - print(f"target: {tokenizer.convert_ids_to_string(finished_seq)}\n") - - -if __name__ == "__main__": - args = parse_args() - pprint(args) - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/mbart_export_model_sample.py b/paddlenlp/ops/fast_transformer/sample/mbart_export_model_sample.py deleted file mode 100644 index 330e7aa5da78..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/mbart_export_model_sample.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -from pprint import pprint - -import paddle - -from paddlenlp.ops import FasterMBART -from paddlenlp.transformers import MBartForConditionalGeneration, MBartTokenizer -from paddlenlp.utils.log import logger - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", - default="mbart-large-50-many-to-many-mmt", - type=str, - help="The model name to specify the bart to use. ", - ) - parser.add_argument( - "--inference_model_dir", default="./infer_model/", type=str, help="Path to save inference model of bart. " - ) - parser.add_argument("--topk", default=4, type=int, help="The number of candidate to procedure top_k sampling. ") - parser.add_argument( - "--topp", default=1.0, type=float, help="The probability threshold to procedure top_p sampling. " - ) - parser.add_argument("--max_out_len", default=64, type=int, help="Maximum output length. ") - parser.add_argument("--temperature", default=1.0, type=float, help="The temperature to set. ") - parser.add_argument("--num_return_sequences", default=1, type=int, help="The number of returned sequences. ") - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ") - parser.add_argument( - "--decoding_strategy", - default="beam_search", - choices=["sampling", "beam_search"], - type=str, - help="The main strategy to decode. ", - ) - parser.add_argument("--num_beams", default=5, type=int, help="The number of candidate to procedure beam search. 
") - parser.add_argument( - "--diversity_rate", default=0.0, type=float, help="The diversity rate to procedure beam search. " - ) - parser.add_argument("--repetition_penalty", default=1.0, type=float, help="The repetition_penalty to set. ") - parser.add_argument("--length_penalty", default=0.0, type=float, help="The length penalty to decode. ") - parser.add_argument("--early_stopping", action="store_true", help="Whether to do early stopping. ") - - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - model = MBartForConditionalGeneration.from_pretrained(args.model_name_or_path) - tokenizer = MBartTokenizer.from_pretrained(args.model_name_or_path, src_lang="en_XX") - - # For opening faster_encoder - model.eval() - - fast_mbart = FasterMBART(model=model, use_fp16_decoding=args.use_fp16_decoding) - # Set evaluate mode - fast_mbart.eval() - - # Convert dygraph model to static graph model - fast_mbart = paddle.jit.to_static( - fast_mbart, - input_spec=[ - # input_ids - paddle.static.InputSpec(shape=[None, None], dtype="int32"), - # encoder_output - None, - # seq_len - None, - paddle.static.InputSpec( - shape=[None, 1], dtype="int32" - ), # forced_bos_token_id can be a Tensor or int (bos_id) - args.num_beams, # num_beams. - args.topk, # top_k - args.topp, # top_p - args.decoding_strategy, # decode_strategy - tokenizer.bos_token_id, # bos_token_id - tokenizer.eos_token_id, # eos_token_id - tokenizer.pad_token_id, # pad_token_id - model.mbart.config["decoder_start_token_id"], # decoder_start_token_id - args.max_out_len, # max_length - args.diversity_rate, # diversity_rate - args.length_penalty, # length_penalty - args.temperature, # temperature - args.num_return_sequences, # num_return_sequences - args.early_stopping, # early_stopping - tokenizer.eos_token_id, # forced_eos_token_id - ], - ) - - # Save converted static graph model - paddle.jit.save(fast_mbart, os.path.join(args.inference_model_dir, "mbart")) - logger.info("MBART has been saved to {}.".format(args.inference_model_dir)) - - -if __name__ == "__main__": - args = parse_args() - pprint(args) - - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/mbart_inference.py b/paddlenlp/ops/fast_transformer/sample/mbart_inference.py deleted file mode 100644 index fd4b4d2ad150..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/mbart_inference.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -from pprint import pprint - -import numpy as np -import paddle.inference as paddle_infer - -from paddlenlp.ops.ext_utils import load -from paddlenlp.transformers import MBartTokenizer - - -def setup_args(): - """Setup arguments.""" - parser = argparse.ArgumentParser() - parser.add_argument( - "--inference_model_dir", default="./infer_model/", type=str, help="Path to save inference model of BART. 
" - ) - parser.add_argument("--batch_size", default=1, type=int, help="Batch size. ") - - args = parser.parse_args() - - return args - - -def postprocess_response(tokenizer, seq, bos_idx, eos_idx): - """Post-process the decoded sequence.""" - eos_pos = len(seq) - 1 - for i, idx in enumerate(seq): - if idx == eos_idx: - eos_pos = i - break - seq = [idx for idx in seq[: eos_pos + 1] if idx != bos_idx and idx != eos_idx] - res = tokenizer.convert_ids_to_string(seq) - return res - - -def infer(args): - model_name = "mbart-large-50-many-to-many-mmt" - - tokenizer = MBartTokenizer.from_pretrained(model_name, src_lang="en_XX") - bos_id = tokenizer.lang_code_to_id["zh_CN"] - inputs = "PaddleNLP is a powerful NLP library with Awesome pre-trained models and easy-to-use interface, supporting wide-range of NLP tasks from research to industrial applications." - - eos_id = tokenizer.eos_token_id - - # Input ids - input_ids = tokenizer(inputs)["input_ids"] - input_ids = np.asarray(input_ids, dtype="int32").reshape(1, -1).repeat(args.batch_size, axis=0) - - # Forced bos token ids - forced_bos_token = np.ones([args.batch_size, 1], dtype="int32") * bos_id - - # Load FastGeneration lib. - load("FastGeneration", verbose=True) - - config = paddle_infer.Config( - os.path.join(args.inference_model_dir, "mbart.pdmodel"), - os.path.join(args.inference_model_dir, "mbart.pdiparams"), - ) - - config.enable_use_gpu(100, 0) - config.disable_glog_info() - predictor = paddle_infer.create_predictor(config) - - input_names = predictor.get_input_names() - - # Input ids - input_ids_handle = predictor.get_input_handle(input_names[0]) - input_ids_handle.copy_from_cpu(input_ids.astype("int32")) - - # Forced bos token ids - forced_bos_token_handle = predictor.get_input_handle(input_names[1]) - forced_bos_token_handle.copy_from_cpu(forced_bos_token.astype("int32")) - - predictor.run() - - output_names = predictor.get_output_names() - output_handle = predictor.get_output_handle(output_names[0]) - output_data = output_handle.copy_to_cpu() - - # [batch_size, num_beams * 2, sequence_length] - output_data = output_data.transpose([1, 2, 0]) - - # Only use the best sequence. - result = [postprocess_response(tokenizer, sample.tolist()[0], bos_id, eos_id) for sample in output_data] - print("Model input:", inputs) - print("Result:", "\n".join(result)) - - -if __name__ == "__main__": - args = setup_args() - pprint(args) - - infer(args) diff --git a/paddlenlp/ops/fast_transformer/sample/plato_export_model_sample.py b/paddlenlp/ops/fast_transformer/sample/plato_export_model_sample.py deleted file mode 100644 index 1b3e6a0e877e..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/plato_export_model_sample.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import os -from pprint import pprint - -import paddle - -from paddlenlp.ops import FasterUnifiedTransformer -from paddlenlp.transformers import ( - UnifiedTransformerLMHeadModel, - UnifiedTransformerTokenizer, -) -from paddlenlp.utils.log import logger - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", - default="plato-xl", - type=str, - help="The model name to specify the PLATO/UnifiedTransformer to use. ", - ) - parser.add_argument( - "--inference_model_dir", default="./infer_model/", type=str, help="Path to save inference model of gpt. " - ) - parser.add_argument("--topk", default=4, type=int, help="The number of candidate to procedure top_k sampling. ") - parser.add_argument( - "--topp", default=1.0, type=float, help="The probability threshold to procedure top_p sampling. " - ) - parser.add_argument("--max_out_len", default=64, type=int, help="Maximum output length. ") - parser.add_argument("--min_out_len", default=1, type=int, help="Minimum output length. ") - parser.add_argument("--temperature", default=1.0, type=float, help="The temperature to set. ") - parser.add_argument("--num_return_sequences", default=1, type=int, help="The number of returned sequences. ") - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ") - parser.add_argument( - "--decoding_strategy", - default="sampling", - choices=["sampling", "beam_search"], - type=str, - help="The main strategy to decode. ", - ) - parser.add_argument("--num_beams", default=4, type=int, help="The number of candidate to procedure beam search. ") - parser.add_argument( - "--diversity_rate", default=0.0, type=float, help="The diversity rate to procedure beam search. " - ) - - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - if args.use_fp16_decoding and os.getenv("PPFG_QKV_MEM_OPT", "0") == "1": - paddle.set_default_dtype("float16") - - model_name = "plato-xl" - model = UnifiedTransformerLMHeadModel.from_pretrained(model_name) - tokenizer = UnifiedTransformerTokenizer.from_pretrained(model_name) - - plato = FasterUnifiedTransformer(model=model, use_fp16_decoding=args.use_fp16_decoding) - # Set evaluate mode - plato.eval() - - # Convert dygraph model to static graph model - plato = paddle.jit.to_static( - plato, - input_spec=[ - # input_ids - paddle.static.InputSpec(shape=[None, None], dtype="int32"), - # token_type_ids - paddle.static.InputSpec(shape=[None, None], dtype="int32"), - # attention_mask - paddle.static.InputSpec(shape=[None, 1, None, None], dtype="float32"), - # seq_len - paddle.static.InputSpec(shape=[None], dtype="int32"), - # role_ids - paddle.static.InputSpec(shape=[None, None], dtype="int32"), - # position_ids - paddle.static.InputSpec(shape=[None, None], dtype="int32"), - args.max_out_len, - args.min_out_len, - args.topk, - args.topp, - args.decoding_strategy, - tokenizer.cls_token_id, # cls/bos - tokenizer.sep_token_id, # sep/eos - tokenizer.pad_token_id, # pad - args.num_beams, # num_beams. Used for beam_search. - args.diversity_rate, # diversity rate. Used for beam search. 
- args.temperature, - args.num_return_sequences, - ], - ) - - # Save converted static graph model - paddle.jit.save(plato, os.path.join(args.inference_model_dir, "plato")) - logger.info("PLATO has been saved to {}.".format(args.inference_model_dir)) - - -if __name__ == "__main__": - args = parse_args() - pprint(args) - - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/plato_inference.py b/paddlenlp/ops/fast_transformer/sample/plato_inference.py deleted file mode 100644 index 8f935b7a0d25..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/plato_inference.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -from pprint import pprint - -import numpy as np -import paddle.inference as paddle_infer - -from paddlenlp.ops.ext_utils import load -from paddlenlp.transformers import UnifiedTransformerTokenizer - - -def setup_args(): - """Setup arguments.""" - parser = argparse.ArgumentParser() - parser.add_argument( - "--inference_model_dir", default="./infer_model/", type=str, help="Path to save inference model of PLATO. " - ) - parser.add_argument("--use_role", action="store_true", help="Whether to use role embeddings. ") - parser.add_argument( - "--position_style", - default="relative", - choices=["continuous", "relative"], - type=str, - help="The type for positional embedding. Default is continuous. ", - ) - - args = parser.parse_args() - - return args - - -def postprocess_response(token_ids, tokenizer): - """Post-process the decoded sequence. Truncate from the first .""" - eos_pos = len(token_ids) - for i, tok_id in enumerate(token_ids): - if tok_id == tokenizer.sep_token_id: - eos_pos = i - break - token_ids = token_ids[:eos_pos] - tokens = tokenizer.convert_ids_to_tokens(token_ids) - tokens = tokenizer.merge_subword(tokens) - return tokens - - -def infer(args): - model_name = "plato-xl" - tokenizer = UnifiedTransformerTokenizer.from_pretrained(model_name) - - context = [ - "Hi , Becky , what's up ?", - "Not much , except that my mother-in-law is driving me up the wall .", - "What's the problem ?", - ] - - data = tokenizer.dialogue_encode( - history=context, - add_start_token_as_response=True, - return_length=True, - return_role_ids=args.use_role, - position_style=args.position_style, - ) - - # Load FastGeneration lib. 
- load("FastGeneration", verbose=True) - - config = paddle_infer.Config( - args.inference_model_dir + "plato.pdmodel", args.inference_model_dir + "plato.pdiparams" - ) - config.enable_use_gpu(100, 0) - config.disable_glog_info() - predictor = paddle_infer.create_predictor(config) - - input_handles = {} - for name in predictor.get_input_names(): - input_handles[name] = predictor.get_input_handle(name) - if name == "attention_mask": - input_handles[name].copy_from_cpu(np.expand_dims(np.asarray(data[name], dtype="float32"), axis=(0, 1))) - else: - input_handles[name].copy_from_cpu(np.asarray(data[name], dtype="int32").reshape([1, -1])) - - output_handles = [predictor.get_output_handle(name) for name in predictor.get_output_names()] - - predictor.run() - - output = [output_handle.copy_to_cpu() for output_handle in output_handles] - - for sample in output[0].transpose([1, 0]).tolist(): - print(" ".join(postprocess_response(sample, tokenizer))) - - -if __name__ == "__main__": - args = setup_args() - pprint(args) - - infer(args) diff --git a/paddlenlp/ops/fast_transformer/sample/t5_export_model_sample.py b/paddlenlp/ops/fast_transformer/sample/t5_export_model_sample.py deleted file mode 100644 index 5fdf1c99532e..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/t5_export_model_sample.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -from pprint import pprint - -import paddle - -from paddlenlp.ops import FasterT5 -from paddlenlp.transformers import T5ForConditionalGeneration, T5Tokenizer -from paddlenlp.utils.log import logger - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", default="t5-base", type=str, help="The model name to specify the bart to use. " - ) - parser.add_argument( - "--inference_model_dir", default="./infer_model/", type=str, help="Path to save inference model of bart. " - ) - parser.add_argument("--topk", default=4, type=int, help="The number of candidate to procedure top_k sampling. ") - parser.add_argument( - "--topp", default=1.0, type=float, help="The probability threshold to procedure top_p sampling. " - ) - parser.add_argument("--max_out_len", default=256, type=int, help="Maximum output length. ") - parser.add_argument("--temperature", default=1.0, type=float, help="The temperature to set. ") - parser.add_argument("--num_return_sequences", default=1, type=int, help="The number of returned sequences. ") - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ") - parser.add_argument( - "--decoding_strategy", - default="beam_search", - choices=["sampling", "beam_search"], - type=str, - help="The main strategy to decode. ", - ) - parser.add_argument("--num_beams", default=4, type=int, help="The number of candidate to procedure beam search. 
") - parser.add_argument( - "--diversity_rate", default=0.0, type=float, help="The diversity rate to procedure beam search. " - ) - parser.add_argument("--repetition_penalty", default=1.0, type=float, help="The repetition_penalty to set. ") - parser.add_argument("--length_penalty", default=0.0, type=float, help="The length penalty to decode. ") - parser.add_argument("--early_stopping", action="store_true", help="Whether to do early stopping. ") - - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - model = T5ForConditionalGeneration.from_pretrained(args.model_name_or_path) - tokenizer = T5Tokenizer.from_pretrained(args.model_name_or_path) - - # For opening faster_encoder - model.eval() - - fast_t5 = FasterT5(model=model, use_fp16_decoding=args.use_fp16_decoding) - # Set evaluate mode - fast_t5.eval() - - # Convert dygraph model to static graph model - fast_t5 = paddle.jit.to_static( - fast_t5, - input_spec=[ - # input_ids - paddle.static.InputSpec(shape=[None, None], dtype="int32"), - # encoder_output - None, - # seq_len - None, - args.max_out_len, # max_length - 0, # min_length - args.topk, # top_k - args.topp, # top_p - args.num_beams, # num_beams - args.decoding_strategy, # decode_strategy - None, # decoder_start_token_id - tokenizer.bos_token_id, # bos_token_id - tokenizer.eos_token_id, # eos_token_id - tokenizer.pad_token_id, # pad_token_id - args.diversity_rate, # diversity_rate - args.temperature, # temperature - args.num_return_sequences, # num_return_sequences - args.length_penalty, # length_penalty - args.early_stopping, # early_stopping - tokenizer.eos_token_id, # forced_eos_token_id - ], - ) - - # Save converted static graph model - paddle.jit.save(fast_t5, os.path.join(args.inference_model_dir, "t5")) - logger.info("T5 has been saved to {}.".format(args.inference_model_dir)) - - -if __name__ == "__main__": - args = parse_args() - pprint(args) - - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/t5_inference.py b/paddlenlp/ops/fast_transformer/sample/t5_inference.py deleted file mode 100644 index 585a4a9566f2..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/t5_inference.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -from pprint import pprint - -import paddle.inference as paddle_infer - -from paddlenlp.ops.ext_utils import load -from paddlenlp.transformers import T5Tokenizer - - -def setup_args(): - """Setup arguments.""" - parser = argparse.ArgumentParser() - parser.add_argument( - "--inference_model_dir", default="./infer_model/", type=str, help="Path to save inference model of T5. " - ) - parser.add_argument("--batch_size", default=1, type=int, help="Batch size. 
") - - args = parser.parse_args() - - return args - - -def postprocess_response(tokenizer, seq, bos_idx, eos_idx): - """Post-process the decoded sequence.""" - eos_pos = len(seq) - 1 - for i, idx in enumerate(seq): - if idx == eos_idx: - eos_pos = i - break - seq = [idx for idx in seq[: eos_pos + 1] if idx != bos_idx and idx != eos_idx] - res = tokenizer.convert_ids_to_string(seq) - return res - - -def infer(args): - model_name = "t5-base" - - tokenizer = T5Tokenizer.from_pretrained(model_name) - inputs = ' This image section from an infrared recording by the Spitzer telescope shows a "family portrait" of countless generations of stars: the oldest stars are seen as blue dots. ' - - # Input ids - input_ids = tokenizer.encode("translate English to French: " + inputs, return_tensors="np")["input_ids"] - - # Load FastGeneration lib. - load("FastGeneration", verbose=True) - - config = paddle_infer.Config( - os.path.join(args.inference_model_dir, "t5.pdmodel"), os.path.join(args.inference_model_dir, "t5.pdiparams") - ) - - config.enable_use_gpu(100, 0) - config.disable_glog_info() - predictor = paddle_infer.create_predictor(config) - - input_names = predictor.get_input_names() - - # Input ids - input_ids_handle = predictor.get_input_handle(input_names[0]) - input_ids_handle.copy_from_cpu(input_ids.astype("int32")) - - predictor.run() - - output_names = predictor.get_output_names() - output_handle = predictor.get_output_handle(output_names[0]) - output_data = output_handle.copy_to_cpu() - - # [batch_size, num_beams * 2, sequence_length] - output_data = output_data.transpose([1, 2, 0]) - - # Only use the best sequence. - translation = tokenizer.decode(output_data[0][0], skip_special_tokens=True, clean_up_tokenization_spaces=False) - print("Result:", translation) - - -if __name__ == "__main__": - args = setup_args() - pprint(args) - - infer(args) diff --git a/paddlenlp/ops/fast_transformer/sample/unimo_text_export_model_sample.py b/paddlenlp/ops/fast_transformer/sample/unimo_text_export_model_sample.py deleted file mode 100644 index eb3f2eeda8d8..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/unimo_text_export_model_sample.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -from pprint import pprint - -import paddle - -from paddlenlp.ops import FasterUNIMOText -from paddlenlp.transformers import UNIMOLMHeadModel, UNIMOTokenizer -from paddlenlp.utils.log import logger - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", - default="unimo-text-1.0-lcsts-new", - type=str, - help="The model name to specify the UNIMOText to use. ", - ) - parser.add_argument( - "--inference_model_dir", default="./infer_model/", type=str, help="Path to save inference model of gpt. " - ) - parser.add_argument("--topk", default=4, type=int, help="The number of candidate to procedure top_k sampling. 
") - parser.add_argument( - "--topp", default=1.0, type=float, help="The probability threshold to procedure top_p sampling. " - ) - parser.add_argument("--max_out_len", default=64, type=int, help="Maximum output length. ") - parser.add_argument("--min_out_len", default=1, type=int, help="Minimum output length. ") - parser.add_argument("--num_return_sequence", default=1, type=int, help="The number of returned sequence. ") - parser.add_argument("--temperature", default=1.0, type=float, help="The temperature to set. ") - parser.add_argument("--num_return_sequences", default=1, type=int, help="The number of returned sequences. ") - parser.add_argument("--use_fp16_decoding", action="store_true", help="Whether to use fp16 decoding to predict. ") - parser.add_argument( - "--decoding_strategy", - default="sampling", - choices=["sampling", "beam_search"], - type=str, - help="The main strategy to decode. ", - ) - parser.add_argument("--num_beams", default=4, type=int, help="The number of candidate to procedure beam search. ") - parser.add_argument( - "--diversity_rate", default=0.0, type=float, help="The diversity rate to procedure beam search. " - ) - - args = parser.parse_args() - return args - - -def do_predict(args): - place = "gpu" - place = paddle.set_device(place) - - model_name = "unimo-text-1.0-lcsts-new" - model = UNIMOLMHeadModel.from_pretrained(model_name) - tokenizer = UNIMOTokenizer.from_pretrained(model_name) - - unimo_text = FasterUNIMOText(model=model, use_fp16_decoding=args.use_fp16_decoding) - # Set evaluate mode - unimo_text.eval() - - # Convert dygraph model to static graph model - unimo_text = paddle.jit.to_static( - unimo_text, - input_spec=[ - # input_ids - paddle.static.InputSpec(shape=[None, None], dtype="int32"), - # token_type_ids - paddle.static.InputSpec(shape=[None, None], dtype="int32"), - # attention_mask - paddle.static.InputSpec(shape=[None, 1, None, None], dtype="float32"), - # seq_len - paddle.static.InputSpec(shape=[None], dtype="int32"), - args.max_out_len, - args.min_out_len, - args.topk, - args.topp, - args.num_beams, # num_beams. Used for beam_search. - args.decoding_strategy, - tokenizer.cls_token_id, # cls/bos - tokenizer.mask_token_id, # mask/eos - tokenizer.pad_token_id, # pad - args.diversity_rate, # diversity rate. Used for beam search. - args.temperature, - args.num_return_sequences, - ], - ) - - # Save converted static graph model - paddle.jit.save(unimo_text, os.path.join(args.inference_model_dir, "unimo_text")) - logger.info("UNIMOText has been saved to {}.".format(args.inference_model_dir)) - - -if __name__ == "__main__": - args = parse_args() - pprint(args) - - do_predict(args) diff --git a/paddlenlp/ops/fast_transformer/sample/unimo_text_inference.py b/paddlenlp/ops/fast_transformer/sample/unimo_text_inference.py deleted file mode 100644 index 02effef6af56..000000000000 --- a/paddlenlp/ops/fast_transformer/sample/unimo_text_inference.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -from pprint import pprint - -import numpy as np -import paddle.inference as paddle_infer - -from paddlenlp.ops.ext_utils import load -from paddlenlp.transformers import UNIMOTokenizer - - -def setup_args(): - """Setup arguments.""" - parser = argparse.ArgumentParser() - parser.add_argument( - "--inference_model_dir", default="./infer_model/", type=str, help="Path to save inference model of UNIMOText. " - ) - - args = parser.parse_args() - - return args - - -def postprocess_response(token_ids, tokenizer): - """Post-process the decoded sequence. Truncate from the first .""" - eos_pos = len(token_ids) - for i, tok_id in enumerate(token_ids): - if tok_id == tokenizer.mask_token_id: - eos_pos = i - break - token_ids = token_ids[:eos_pos] - tokens = tokenizer.convert_ids_to_tokens(token_ids) - tokens = tokenizer.merge_subword(tokens) - return tokens - - -def infer(args): - model_name = "unimo-text-1.0-lcsts-new" - tokenizer = UNIMOTokenizer.from_pretrained(model_name) - - inputs = "深度学习是人工智能的核心技术领域。百度飞桨作为中国首个自主研发、功能丰富、开源开放的产业级深度学习平台,将从多层次技术产品、产业AI人才培养和强大的生态资源支持三方面全面护航企业实现快速AI转型升级。" - - data = tokenizer.gen_encode( - inputs, add_start_token_for_decoding=True, return_length=True, is_split_into_words=False - ) - - # Load FastGeneration lib. - load("FastGeneration", verbose=True) - - config = paddle_infer.Config( - args.inference_model_dir + "unimo_text.pdmodel", args.inference_model_dir + "unimo_text.pdiparams" - ) - config.enable_use_gpu(100, 0) - config.disable_glog_info() - predictor = paddle_infer.create_predictor(config) - - input_handles = {} - for name in predictor.get_input_names(): - input_handles[name] = predictor.get_input_handle(name) - if name == "attention_mask": - input_handles[name].copy_from_cpu(np.expand_dims(np.asarray(data[name], dtype="float32"), axis=(0, 1))) - else: - input_handles[name].copy_from_cpu(np.asarray(data[name], dtype="int32").reshape([1, -1])) - - output_handles = [predictor.get_output_handle(name) for name in predictor.get_output_names()] - - predictor.run() - - output = [output_handle.copy_to_cpu() for output_handle in output_handles] - - for sample in output[0].transpose([1, 0]).tolist(): - print("".join(postprocess_response(sample, tokenizer))) - - -if __name__ == "__main__": - args = setup_args() - pprint(args) - - infer(args) diff --git a/paddlenlp/ops/fast_transformer/src/CMakeLists.txt b/paddlenlp/ops/fast_transformer/src/CMakeLists.txt deleted file mode 100644 index 7db53476685b..000000000000 --- a/paddlenlp/ops/fast_transformer/src/CMakeLists.txt +++ /dev/null @@ -1,336 +0,0 @@ -if (${CUDA_VERSION} GREATER_EQUAL 11.0) - message(STATUS "Add DCUDA11_MODE") - add_definitions("-DCUDA11_MODE") -endif() - -add_definitions(-DNDEBUG) -add_definitions(-DPADDLE_CUDA) -# Default is 1 in standard c++ when using gcc8.2 -add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1) - -add_definitions(-w) - -if(ON_INFER) - add_definitions(-DPADDLE_ON_INFERENCE) - - link_directories(${COMMON_LIB_DIRS}) - - set(ft_lib_link - decoder decoding topk cuda_int8_kernels cuda_kernels online_softmax_beamsearch transformer_kernels attention_kernels encoder nccl_utils nvtx_utils - ) - - if(WITH_GPU) - add_definitions("-DPADDLE_WITH_CUDA") - endif() - - if(NOT WITH_STATIC_LIB) - add_definitions("-DPADDLE_WITH_SHARED_LIB") - else() - # PD_INFER_DECL is mainly used to set the dllimport/dllexport attribute in dynamic library mode. 
- # Set it to empty in static library mode to avoid compilation issues. - add_definitions("/DPD_INFER_DECL=") - endif() - - macro(safe_set_static_flag) - foreach(flag_var - CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE - CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) - if(${flag_var} MATCHES "/MD") - string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") - endif(${flag_var} MATCHES "/MD") - endforeach(flag_var) - endmacro() - - if(NOT DEFINED PADDLE_LIB) - message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") - endif() - if(NOT DEFINED DEMO) - message(FATAL_ERROR "please set DEMO with -DDEMO=demo_name") - endif() - - include_directories("${PADDLE_LIB}/paddle/include") - set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/") - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include") - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/include") - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") - if (WITH_ONNXRUNTIME) - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include") - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include") - endif() - - link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") - link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/lib") - link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") - link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") - link_directories("${PADDLE_LIB}/paddle/lib") - if (WITH_ONNXRUNTIME) - link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib") - link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib") - endif() - - if (WIN32) - add_definitions("/DGOOGLE_GLOG_DLL_DECL=") - option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) - if (MSVC_STATIC_CRT) - if (WITH_MKL) - set(FLAG_OPENMP "/openmp") - endif() - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") - safe_set_static_flag() - if (WITH_STATIC_LIB) - add_definitions(-DSTATIC_LIB) - endif() - endif() - else() - if(WITH_MKL) - set(FLAG_OPENMP "-fopenmp") - endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAG_OPENMP}") - endif() - - if (USE_TENSORRT AND WITH_GPU) - set(TENSORRT_ROOT "" CACHE STRING "The root directory of TensorRT library") - if("${TENSORRT_ROOT}" STREQUAL "") - message(FATAL_ERROR "The TENSORRT_ROOT is empty, you must assign it a value with CMake command. 
Such as: -DTENSORRT_ROOT=TENSORRT_ROOT_PATH ") - endif() - set(TENSORRT_INCLUDE_DIR ${TENSORRT_ROOT}/include) - set(TENSORRT_LIB_DIR ${TENSORRT_ROOT}/lib) - endif() - - if (NOT WIN32) - if (USE_TENSORRT AND WITH_GPU) - include_directories("${TENSORRT_INCLUDE_DIR}") - link_directories("${TENSORRT_LIB_DIR}") - endif() - endif(NOT WIN32) - - if(WITH_MKL) - set(MATH_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mklml") - include_directories("${MATH_LIB_PATH}/include") - if(WIN32) - set(MATH_LIB ${MATH_LIB_PATH}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX} - ${MATH_LIB_PATH}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) - else() - set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} - ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) - endif() - set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn") - if(EXISTS ${MKLDNN_PATH}) - include_directories("${MKLDNN_PATH}/include") - if(WIN32) - set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib) - else(WIN32) - set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libdnnl.so.3) - endif(WIN32) - endif() - else() - set(OPENBLAS_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}openblas") - include_directories("${OPENBLAS_LIB_PATH}/include/openblas") - if(WIN32) - set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}) - else() - set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) - endif() - endif() - - if(WITH_STATIC_LIB) - set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX}) - else() - if(WIN32) - set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX}) - else() - set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX}) - endif() - endif() - - if (WITH_ONNXRUNTIME) - set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so paddle2onnx) - endif() - - if (NOT WIN32) - set(EXTERNAL_LIB "-lrt -ldl -lpthread") - set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} - glog gflags protobuf xxhash - ${EXTERNAL_LIB}) - else() - set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} - glog gflags_static libprotobuf xxhash ${EXTERNAL_LIB}) - set(DEPS ${DEPS} shlwapi.lib) - endif(NOT WIN32) - - cuda_add_library(decoding_infer_op ${decoding_op_files} SHARED) - add_dependencies(decoding_infer_op extern_${THIRD_PARTY_NAME} boost) - - string(REPLACE "/" ";" DEMO_PATH ${DEMO}) - - list(LENGTH DEMO_PATH PATH_LEN) - MATH(EXPR PATH_LEN "${PATH_LEN}-1") - list(GET DEMO_PATH ${PATH_LEN} DEMO_NAME) - - string(REPLACE "." 
";" DEMO_NAME ${DEMO_NAME}) - list(GET DEMO_NAME 0 DEMO_NAME) - add_executable(${DEMO_NAME} ${DEMO}) - set(DEPS decoding_infer_op ${ft_lib_link} boost ${DEPS} cublas cudart) - - if(WITH_GPT AND WITH_SP) - set(DEPS ${DEPS} sentencepiece) - add_dependencies(decoding_infer_op extern_sentencepiece) - endif() - - if(WITH_PARALLEL) - set(DEPS ${DEPS} mpi nccl) - endif() - - if(WIN32) - if(USE_TENSORRT) - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX} - ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} - COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX} - ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} - ) - endif() - if(WITH_MKL) - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/mklml.dll ${CMAKE_BINARY_DIR}/Release - COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/libiomp5md.dll ${CMAKE_BINARY_DIR}/Release - COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll ${CMAKE_BINARY_DIR}/Release - ) - else() - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release - ) - endif() - if(NOT WITH_STATIC_LIB) - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_fluid.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} - ) - endif() - endif() - - target_link_libraries(${DEMO_NAME} ${DEPS}) - -else(ON_INFER) - if(NOT PY_CMD) - set(PYTHON_PATH "python" CACHE STRING "Python path") - else() - set(PYTHON_PATH ${PY_CMD} CACHE STRING "Python path") - endif() - - execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import paddle; print(paddle.__version__)" - RESULT_VARIABLE _INC_PYTHON_SUCCESS - OUTPUT_VARIABLE _INC_PYTHON_VALUES) - message(STATUS "PADDLE_VERSION: ${_INC_PYTHON_VALUES}") - - # TODO(gongenlei): support PADDLE_NEW_ALLOCATOR for ON_INFER - if(_INC_PYTHON_VALUES VERSION_GREATER_EQUAL "2.3.0") - add_definitions(-DPADDLE_NEW_ALLOCATOR) - endif() - - if(_INC_PYTHON_VALUES VERSION_GREATER_EQUAL "2.5.0" OR _INC_PYTHON_VALUES VERSION_EQUAL "0.0.0") - find_package(PythonLibs REQUIRED) - include_directories(${PYTHON_INCLUDE_DIRS}) - endif() - - execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import paddle; print(paddle.sysconfig.get_include())" - RESULT_VARIABLE _INC_PYTHON_SUCCESS - OUTPUT_VARIABLE _INC_PYTHON_VALUES) - if (NOT _INC_PYTHON_SUCCESS MATCHES 0) - message(FATAL_ERROR "Python config Error.") - endif() - string(REGEX REPLACE ";" "\\\\;" _INC_PYTHON_VALUES ${_INC_PYTHON_VALUES}) - string(REGEX REPLACE "\n" ";" _INC_PYTHON_VALUES ${_INC_PYTHON_VALUES}) - list(GET _INC_PYTHON_VALUES 0 PY_INCLUDE_DIR) - - list(APPEND COMMON_HEADER_DIRS ${PY_INCLUDE_DIR}) - list(APPEND COMMON_HEADER_DIRS ${PY_INCLUDE_DIR}/third_party) - - include_directories( - ${COMMON_HEADER_DIRS} - ) - - execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import paddle; print(paddle.sysconfig.get_lib())" - RESULT_VARIABLE _LIB_PYTHON_SUCCESS - OUTPUT_VARIABLE _LIB_PYTHON_VALUES) - if (NOT _LIB_PYTHON_SUCCESS MATCHES 0) - message(FATAL_ERROR "Python config Error.") - endif() - string(REGEX REPLACE ";" "\\\\;" _LIB_PYTHON_VALUES ${_LIB_PYTHON_VALUES}) - string(REGEX REPLACE "\n" ";" _LIB_PYTHON_VALUES ${_LIB_PYTHON_VALUES}) - list(GET _LIB_PYTHON_VALUES 0 PY_LIB_DIR) - list(APPEND 
COMMON_LIB_DIRS ${PY_LIB_DIR}) - - link_directories( - ${COMMON_LIB_DIRS} - ) - - include_directories(${PY_INCLUDE_DIR}) - include_directories(${PY_INCLUDE_DIR}/third_party) - - if(EXISTS ${PY_LIB_DIR}/libpaddle_custom_op.so) - set(lib_link - -lpaddle_custom_op - ) - endif() - - if(EXISTS ${PY_LIB_DIR}/../fluid/) - if(EXISTS ${PY_LIB_DIR}/../fluid/libpaddle.so) - set(lib_link - -lpaddle - ) - elseif(EXISTS ${PY_LIB_DIR}/../fluid/core_avx.so) - set(lib_link - -l:core_avx.so - ) - else() - set(lib_link - -l:core_noavx.so - ) - endif() - link_directories( - ${PY_LIB_DIR}/../fluid/ - ) - elseif(EXISTS ${PY_LIB_DIR}/../base/) - if(EXISTS ${PY_LIB_DIR}/../base/libpaddle.so) - set(lib_link - -lpaddle - ) - elseif(EXISTS ${PY_LIB_DIR}/../base/core_avx.so) - set(lib_link - -l:core_avx.so - ) - else() - set(lib_link - -l:core_noavx.so - ) - endif() - link_directories( - ${PY_LIB_DIR}/../base/ - ) - endif() - - set(ft_lib_link - -ldecoder -ldecoding -ltopk -lcuda_int8_kernels -lcuda_kernels -lonline_softmax_beamsearch -ltransformer_kernels -lattention_kernels -lencoder -lnccl_utils -lnvtx_utils - ) - - if(WITH_PARALLEL) - set(ft_lib_link ${ft_lib_link} -lmpi -lnccl) - endif() - - add_definitions(-DPADDLE_WITH_CUDA) - add_definitions(-DEIGEN_USE_GPU) - add_definitions(-DPADDLE_USE_DSO) - if (WITH_MKL) - add_definitions(-DPADDLE_WITH_MKLDNN) - endif() - - add_library(decoding_op SHARED ${decoding_op_files}) - add_dependencies(decoding_op extern_${THIRD_PARTY_NAME} boost) - target_link_libraries(decoding_op PRIVATE -lcublas -lcudart ${lib_link} ${ft_lib_link}) -endif() diff --git a/paddlenlp/ops/fast_transformer/src/cublas_handle.cc b/paddlenlp/ops/fast_transformer/src/cublas_handle.cc deleted file mode 100644 index dfdc9badc005..000000000000 --- a/paddlenlp/ops/fast_transformer/src/cublas_handle.cc +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "cublas_handle.h" - -CublasHandle* CublasHandle::GetInstance() { - static CublasHandle* p_handle = nullptr; - if (p_handle == nullptr) { - p_handle = new CublasHandle(); - } - return p_handle; -} - -CublasHandle::~CublasHandle() { - cublasDestroy(cublas_handle_); - cublasLtDestroy(cublaslt_handle_); -} diff --git a/paddlenlp/ops/fast_transformer/src/cublas_handle.h b/paddlenlp/ops/fast_transformer/src/cublas_handle.h deleted file mode 100644 index d636d0585dd1..000000000000 --- a/paddlenlp/ops/fast_transformer/src/cublas_handle.h +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
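The CublasHandle class implemented above and declared in the header that follows is a lazy, process-wide singleton, presumably so the fused FastGeneration ops share a single cuBLAS/cublasLt handle pair instead of each creating their own. A minimal sketch of how calling code would use it; the wrapper function and its stream parameter are assumptions, not code from these files.

    #include <cublas_v2.h>
    #include <cuda_runtime.h>

    #include "cublas_handle.h"

    // Hypothetical caller: every invocation sees the same handle; only the
    // first GetInstance() call actually creates it.
    void run_gemm_on_stream(cudaStream_t stream) {
      cublasHandle_t blas = CublasHandle::GetInstance()->cublas_handle_;
      cublasSetStream(blas, stream);  // bind the shared handle to this stream
      // ... cublasGemmEx(blas, ...) for the decoding GEMMs would follow ...
    }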
-See the License for the specific language governing permissions and -limitations under the License. */ - - -#include "fastertransformer/utils/common.h" - - -/** - * The CublasHandle class defines the `GetInstance` method that serves as an - * alternative to constructor and lets clients access the same instance of this - * class over and over. - */ -class CublasHandle { - /** - * The CublasHandle's constructor should always be private to prevent direct - * construction calls with the `new` operator. - */ -private: - CublasHandle() { - cublasCreate(&cublas_handle_); - cublasLtCreate(&cublaslt_handle_); - } - -public: - /** - * CublasHandle should not be cloneable. - */ - CublasHandle(CublasHandle& other) = delete; - - /** - * CublasHandle should not be assignable. - */ - void operator=(const CublasHandle&) = delete; - - /** - * This is the static method that controls the access to the singleton - * instance. On the first run, it creates a singleton object and places it - * into the static field. On subsequent runs, it returns the client existing - * object stored in the static field. - */ - static CublasHandle* GetInstance(); - - cublasHandle_t cublas_handle_; - cublasLtHandle_t cublaslt_handle_; - - ~CublasHandle(); -}; diff --git a/paddlenlp/ops/fast_transformer/src/demo/gpt.cc b/paddlenlp/ops/fast_transformer/src/demo/gpt.cc deleted file mode 100644 index 7dd55efb2c7a..000000000000 --- a/paddlenlp/ops/fast_transformer/src/demo/gpt.cc +++ /dev/null @@ -1,321 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef GPT_ON_SENTENCEPIECE -#include -#endif - -#include "helper.h" -#include "utf8.h" - -#include -#include -#include -#include -#include -#include - -DEFINE_int32(batch_size, 1, "Batch size to do inference. "); -DEFINE_int32(gpu_id, 0, "The gpu id to do inference. "); -DEFINE_string(model_dir, - "./infer_model/", - "The directory to the inference model. "); -DEFINE_string(vocab_file, - "./infer_model/vocab.txt", - "The path to the vocabulary file. "); -DEFINE_string(start_token, "<|endoftext|>", "The start token of GPT."); -DEFINE_string(end_token, "<|endoftext|>", "The end token of GPT."); - -using namespace paddle_infer; - -std::string model_dir = ""; -std::string vocab_file = ""; - -const int BOS_IDX = 50256; -const int EOS_IDX = 50256; -const int PAD_IDX = 50256; -const int MAX_LENGTH = 256; - -int batch_size = 1; -int gpu_id = 0; - -namespace paddle { -namespace inference { - -struct DataInput { - std::vector src_data; -}; - -struct DataResult { - std::wstring result_q; -}; - -bool get_result_tensor(const std::unique_ptr& seq_ids, - std::vector& dataresultvec, - std::unordered_map& num2word_dict, - std::unordered_map& byte_decoder) { - // NOTE: Add SentencePiece to do some postprocess on cpm model. 
- // sentencepiece::SentencePieceProcessor processor; - // max_length * batch_size - std::vector output_shape = seq_ids->shape(); - int batch_size = output_shape[1]; - int out_num = std::accumulate( - output_shape.begin(), output_shape.end(), 1, std::multiplies()); - std::vector seq_ids_out; - seq_ids_out.resize(out_num); - seq_ids->CopyToCpu(seq_ids_out.data()); - - dataresultvec.resize(batch_size); - auto max_output_length = output_shape[0]; - - for (int bsz = 0; bsz < batch_size; ++bsz) { - std::u32string tmp_result_q = U""; - for (int len = 1; len < max_output_length; ++len) { - tmp_result_q = - tmp_result_q + num2word_dict[seq_ids_out[len * batch_size + bsz]]; - } - - for (int i = 0; i < tmp_result_q.length(); ++i) { - char32_t tmp = tmp_result_q[i]; - if (byte_decoder.find(tmp) != byte_decoder.end()) { - dataresultvec[bsz].result_q = dataresultvec[bsz].result_q + - static_cast(byte_decoder[tmp]); - } else { - std::cout << "Should not reach here. " << std::endl; - exit(-1); - } - } - } - return true; -} - -std::unordered_map convert_unicode() { - char32_t c0 = U'!'; - char32_t c1 = U'~'; - char32_t c2 = U'¡'; - char32_t c3 = U'¬'; - char32_t c4 = U'®'; - char32_t c5 = U'ÿ'; - - int a0 = c0; - int a1 = c1; - int a2 = c2; - int a3 = c3; - int a4 = c4; - int a5 = c5; - - std::unordered_map ret; - int n = 0; - for (int b = 0; b < 256; ++b) { - char32_t key; - if (b < a0 || (b > a1 && b < a2) || (b < a3 && b > a4) || b > a5) { - key = static_cast(256 + n); - ret.insert(std::pair(key, b)); - n++; - } else { - key = static_cast(b); - ret.insert(std::pair(key, b)); - } - } - - return ret; -} - -class DataReader { -public: - DataReader() {} - - bool NextBatch(std::shared_ptr& predictor, - const int& batch_size, - const std::u32string& start_token, - const std::u32string& end_token, - const int& num_batches, - std::vector& source_query_vec) { - if (current_batches++ >= num_batches) { - return false; - } - - for (int i = 0; i < batch_size; ++i) { - source_query_vec.push_back(start_token); - } - - std::u32string line; - std::vector word_data; - std::vector data_input_vec; - int max_len = 0; - for (int i = 0; i < batch_size; i++) { - DataInput data_input; - data_input.src_data.push_back(word2num_dict[start_token]); - max_len = std::max(max_len, static_cast(data_input.src_data.size())); - max_len = std::min(max_len, MAX_LENGTH); - data_input_vec.push_back(data_input); - } - if (data_input_vec.empty()) { - return false; - } - return TensorMoreBatch( - predictor, data_input_vec, max_len, data_input_vec.size()); - } - - bool GetWordDict() { - std::ifstream fin(vocab_file); - std::string line; - int k = 0; - while (std::getline(fin, line)) { - std::u32string tmp = utf8::utf8to32(line); - word2num_dict[tmp] = k; - num2word_dict[k] = tmp; - k += 1; - } - - fin.close(); - - return true; - } - - int GetCurrentBatch() { return current_batches; } - - std::unordered_map word2num_dict; - std::unordered_map num2word_dict; - std::unique_ptr file; - -private: - bool TensorMoreBatch(std::shared_ptr& predictor, - std::vector& data_input_vec, - int max_len, - int batch_size) { - auto ids_name = predictor->GetInputNames(); - auto ids_t = predictor->GetInputHandle(ids_name[0]); - std::vector ids_vec; - ids_vec.resize(max_len * batch_size); - for (int i = 0; i < batch_size; ++i) { - for (int k = 0; k < max_len; ++k) { - if (k < data_input_vec[i].src_data.size()) { - ids_vec[i * max_len + k] = data_input_vec[i].src_data[k]; - } else { - ids_vec[i * max_len + k] = PAD_IDX; - } - } - } - ids_t->Reshape({batch_size, 
max_len}); - ids_t->CopyFromCpu(ids_vec.data()); - - return true; - } - - int current_batches = 0; -}; - - -template -void SummaryConfig(const paddle_infer::Config& config, - double infer_time, - int num_batches, - int num_samples) { - LOG(INFO) << "----------------------- Perf info -----------------------"; - LOG(INFO) << "batch_size: " << batch_size; - LOG(INFO) << "average_latency(ms): " << infer_time / num_samples << ", " - << "QPS: " << num_samples / (infer_time / 1000.0); -} - - -void Main(const int& batch_size, - const int& gpu_id, - const std::u32string& start_token, - const std::u32string& end_token) { - Config config; - config.SetModel(model_dir + "/gpt.pdmodel", model_dir + "/gpt.pdiparams"); - - config.EnableUseGpu(100, gpu_id); - - config.SwitchUseFeedFetchOps(false); - config.SwitchSpecifyInputNames(true); - auto predictor = CreatePredictor(config); - DataReader reader; - reader.GetWordDict(); - - double whole_time = 0; - Timer timer; - int num_batches = 100; - int warmup = 50; - std::vector source_query_vec; - auto byte_decoder = convert_unicode(); - - while (reader.NextBatch(predictor, - batch_size, - start_token, - end_token, - num_batches, - source_query_vec)) { - int crt_batch = reader.GetCurrentBatch(); - if (crt_batch >= warmup) { - timer.tic(); - } - predictor->Run(); - - if (crt_batch >= warmup) { - whole_time += timer.toc(); - } - - std::vector dataresultvec; - auto output_names = predictor->GetOutputNames(); - get_result_tensor(predictor->GetOutputHandle(output_names[0]), - dataresultvec, - reader.num2word_dict, - byte_decoder); - - for (int i = 0; i < batch_size; ++i) { - std::wcout << dataresultvec[i].result_q; - std::cout << std::endl; - } - source_query_vec.clear(); - } - std::cout << std::endl; - SummaryConfig(config, - whole_time, - num_batches - warmup, - (num_batches - warmup) * batch_size); -} -} // namespace inference -} // namespace paddle - -int main(int argc, char** argv) { - gflags::ParseCommandLineFlags(&argc, &argv, true); - - batch_size = FLAGS_batch_size; - gpu_id = FLAGS_gpu_id; - - model_dir = FLAGS_model_dir; - vocab_file = FLAGS_vocab_file; - - paddle::inference::Main(batch_size, - gpu_id, - utf8::utf8to32(FLAGS_start_token), - utf8::utf8to32(FLAGS_end_token)); - - return 0; -} diff --git a/paddlenlp/ops/fast_transformer/src/demo/helper.h b/paddlenlp/ops/fast_transformer/src/demo/helper.h deleted file mode 100644 index 046ca4c9e3ea..000000000000 --- a/paddlenlp/ops/fast_transformer/src/demo/helper.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#pragma once -#include -#include -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include "paddle_inference_api.h" - -namespace paddle { -namespace inference { -// Timer for timer -class Timer { -public: - std::chrono::high_resolution_clock::time_point start; - std::chrono::high_resolution_clock::time_point startu; - - void tic() { start = std::chrono::high_resolution_clock::now(); } - - double toc() { - startu = std::chrono::high_resolution_clock::now(); - std::chrono::duration time_span = - std::chrono::duration_cast>(startu - - start); - double used_time_ms = static_cast(time_span.count()) * 1000.0; - return used_time_ms; - } -}; - -static void split(const std::string &str, - char sep, - std::vector *pieces) { - pieces->clear(); - if (str.empty()) { - return; - } - size_t pos = 0; - size_t next = str.find(sep, pos); - while (next != std::string::npos) { - pieces->push_back(str.substr(pos, next - pos)); - pos = next + 1; - next = str.find(sep, pos); - } - if (!str.substr(pos).empty()) { - pieces->push_back(str.substr(pos)); - } -} - -} // namespace inference -} // namespace paddle diff --git a/paddlenlp/ops/fast_transformer/src/demo/transformer_e2e.cc b/paddlenlp/ops/fast_transformer/src/demo/transformer_e2e.cc deleted file mode 100644 index 2f1c802f96ed..000000000000 --- a/paddlenlp/ops/fast_transformer/src/demo/transformer_e2e.cc +++ /dev/null @@ -1,281 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "helper.h" - -#include -#include -#include -#include -#include -#include - -using namespace paddle_infer; - -DEFINE_int32(batch_size, 1, "Batch size to do inference. "); -DEFINE_int32(gpu_id, 0, "The gpu id to do inference. "); -DEFINE_string(model_dir, - "./infer_model/", - "The directory to the inference model. "); -DEFINE_string(vocab_file, - "./vocab_all.bpe.33708", - "The path to the vocabulary file. "); -DEFINE_string(data_file, - "./newstest2014.tok.bpe.33708.en", - "The path to the input data file. 
"); - -std::string model_dir = ""; -std::string vocab_file = ""; -std::string data_file = ""; - -const int EOS_IDX = 1; -const int PAD_IDX = 0; -const int MAX_LENGTH = 256; -const int N_BEST = 1; - -int batch_size = 1; -int gpu_id = 0; - -namespace paddle { -namespace inference { - -struct DataInput { - std::vector src_data; -}; - -struct DataResult { - std::string result_q; -}; - -bool get_result_tensor(const std::unique_ptr& seq_ids, - std::vector& dataresultvec, - std::unordered_map& num2word_dict) { - std::vector output_shape = seq_ids->shape(); - int batch_size = output_shape[1]; - int beam_num = output_shape[2]; - int out_num = std::accumulate( - output_shape.begin(), output_shape.end(), 1, std::multiplies()); - std::vector seq_ids_out; - seq_ids_out.resize(out_num); - seq_ids->CopyToCpu(seq_ids_out.data()); - - dataresultvec.resize(batch_size * N_BEST); - auto max_output_length = output_shape[0]; - - for (int bsz = 0; bsz < output_shape[1]; ++bsz) { - for (int k = 0; k < N_BEST; ++k) { - dataresultvec[bsz * N_BEST + k].result_q = ""; - for (int len = 0; len < max_output_length; ++len) { - if (seq_ids_out[len * batch_size * beam_num + bsz * beam_num + k] == - EOS_IDX) - break; - dataresultvec[bsz * N_BEST + k].result_q = - dataresultvec[bsz * N_BEST + k].result_q + - num2word_dict[seq_ids_out[len * batch_size * beam_num + - bsz * beam_num + k]] + - " "; - } - } - } - return true; -} - -class DataReader { -public: - explicit DataReader(const std::string& path) - : file(new std::ifstream(path)) {} - - bool NextBatch(std::shared_ptr& predictor, - const int& batch_size, - std::vector& source_query_vec) { - std::string line; - std::vector word_data; - std::vector data_input_vec; - int max_len = 0; - for (int i = 0; i < batch_size; i++) { - if (!std::getline(*file, line)) { - break; - } - DataInput data_input; - split(line, ' ', &word_data); - std::string query_str = ""; - for (int j = 0; j < word_data.size(); ++j) { - if (j >= MAX_LENGTH) { - break; - } - query_str += word_data[j]; - if (word2num_dict.find(word_data[j]) == word2num_dict.end()) { - data_input.src_data.push_back(word2num_dict[""]); - } else { - data_input.src_data.push_back(word2num_dict[word_data[j]]); - } - } - source_query_vec.push_back(query_str); - data_input.src_data.push_back(EOS_IDX); - max_len = std::max(max_len, static_cast(data_input.src_data.size())); - max_len = std::min(max_len, MAX_LENGTH); - data_input_vec.push_back(data_input); - } - if (data_input_vec.empty()) { - return false; - } - return TensorMoreBatch( - predictor, data_input_vec, max_len, data_input_vec.size()); - } - - bool GetWordDict() { - std::ifstream fin(vocab_file); - std::string line; - int k = 0; - while (std::getline(fin, line)) { - word2num_dict[line] = k; - num2word_dict[k] = line; - k += 1; - } - - fin.close(); - - return true; - } - - std::unordered_map word2num_dict; - std::unordered_map num2word_dict; - std::unique_ptr file; - -private: - bool TensorMoreBatch(std::shared_ptr& predictor, - std::vector& data_input_vec, - int max_len, - int batch_size) { - auto src_word_t = predictor->GetInputHandle("src_word"); - std::vector src_word_vec; - src_word_vec.resize(max_len * batch_size); - for (int i = 0; i < batch_size; ++i) { - for (int k = 0; k < max_len; ++k) { - if (k < data_input_vec[i].src_data.size()) { - src_word_vec[i * max_len + k] = data_input_vec[i].src_data[k]; - } else { - src_word_vec[i * max_len + k] = PAD_IDX; - } - } - } - src_word_t->Reshape({batch_size, max_len}); - src_word_t->CopyFromCpu(src_word_vec.data()); - - // 
NOTE: If the saved model supports force decoding, a nullptr must be - // given to trg_word to ensure predictor work properly when not - // using force decoding. - /* - * auto trg_word_t = predictor->GetInputHandle("trg_word"); - * trg_word_t->Reshape({0, 0}); - * trg_word_t->CopyFromCpu((int*)nullptr); - */ - - return true; - } -}; - - -template -void SummaryConfig(const paddle_infer::Config& config, - double infer_time, - int num_batches, - int num_samples) { - LOG(INFO) << "----------------------- Data info -----------------------"; - LOG(INFO) << "batch_size: " << batch_size; - LOG(INFO) << "num_of_samples: " << num_samples; - LOG(INFO) << "----------------------- Conf info -----------------------"; - LOG(INFO) << "runtime_device: " << (config.use_gpu() ? "gpu" : "cpu"); - LOG(INFO) << "ir_optim: " << (config.ir_optim() ? "true" : "false"); - LOG(INFO) << "----------------------- Perf info -----------------------"; - LOG(INFO) << "average_latency(ms): " << infer_time / num_samples << ", " - << "QPS: " << num_samples / (infer_time / 1000.0); -} - - -void Main(int batch_size, int gpu_id) { - Config config; - config.SetModel(model_dir + "/transformer.pdmodel", - model_dir + "/transformer.pdiparams"); - - config.EnableUseGpu(100, gpu_id); - - config.SwitchUseFeedFetchOps(false); - config.SwitchSpecifyInputNames(true); - // When using fp16, fc_elementwise_layernorm_fuse_pass causes a little - // different translation results with original dygraph prediction, maybe you - // can turn off the IR optimization for same results as following: - // config.SwitchIrOptim(false); - auto predictor = CreatePredictor(config); - DataReader reader(data_file); - reader.GetWordDict(); - - double whole_time = 0; - Timer timer; - int num_batches = 0; - int num_samples = 0; - std::vector source_query_vec; - std::ofstream out("predict.txt"); - - while (reader.NextBatch(predictor, batch_size, source_query_vec)) { - timer.tic(); - predictor->Run(); - std::vector dataresultvec; - auto output_names = predictor->GetOutputNames(); - get_result_tensor(predictor->GetOutputHandle(output_names[0]), - dataresultvec, - reader.num2word_dict); - - whole_time += timer.toc(); - num_batches++; - - if (out.is_open()) { - for (int i = 0; i < dataresultvec.size(); ++i) { - out << dataresultvec[i].result_q << "\n"; - } - } - num_samples += dataresultvec.size(); - - source_query_vec.clear(); - } - SummaryConfig(config, whole_time, num_batches, num_samples); - out.close(); -} -} // namespace inference -} // namespace paddle - -int main(int argc, char** argv) { - gflags::ParseCommandLineFlags(&argc, &argv, true); - - batch_size = FLAGS_batch_size; - gpu_id = FLAGS_gpu_id; - - model_dir = FLAGS_model_dir; - vocab_file = FLAGS_vocab_file; - data_file = FLAGS_data_file; - - paddle::inference::Main(batch_size, gpu_id); - - return 0; -} diff --git a/paddlenlp/ops/fast_transformer/src/demo/utf8.h b/paddlenlp/ops/fast_transformer/src/demo/utf8.h deleted file mode 100644 index 82b13f59f983..000000000000 --- a/paddlenlp/ops/fast_transformer/src/demo/utf8.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The 
copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "utf8/checked.h" -#include "utf8/unchecked.h" - -#endif // header guard diff --git a/paddlenlp/ops/fast_transformer/src/demo/utf8/checked.h b/paddlenlp/ops/fast_transformer/src/demo/utf8/checked.h deleted file mode 100644 index 512dcc2fbac8..000000000000 --- a/paddlenlp/ops/fast_transformer/src/demo/utf8/checked.h +++ /dev/null @@ -1,319 +0,0 @@ -// Copyright 2006-2016 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "core.h" -#include - -namespace utf8 -{ - // Base for the exceptions that may be thrown from the library - class exception : public ::std::exception { - }; - - // Exceptions that may be thrown from the library functions. 
- class invalid_code_point : public exception { - uint32_t cp; - public: - invalid_code_point(uint32_t codepoint) : cp(codepoint) {} - virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid code point"; } - uint32_t code_point() const {return cp;} - }; - - class invalid_utf8 : public exception { - uint8_t u8; - public: - invalid_utf8 (uint8_t u) : u8(u) {} - invalid_utf8 (char c) : u8(static_cast(c)) {} - virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-8"; } - uint8_t utf8_octet() const {return u8;} - }; - - class invalid_utf16 : public exception { - uint16_t u16; - public: - invalid_utf16 (uint16_t u) : u16(u) {} - virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-16"; } - uint16_t utf16_word() const {return u16;} - }; - - class not_enough_room : public exception { - public: - virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Not enough space"; } - }; - - /// The library API - functions intended to be called by the users - - template - octet_iterator append(uint32_t cp, octet_iterator result) - { - if (!utf8::internal::is_code_point_valid(cp)) - throw invalid_code_point(cp); - - return internal::append(cp, result); - } - - template - output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) - { - while (start != end) { - octet_iterator sequence_start = start; - internal::utf_error err_code = utf8::internal::validate_next(start, end); - switch (err_code) { - case internal::UTF8_OK : - for (octet_iterator it = sequence_start; it != start; ++it) - *out++ = *it; - break; - case internal::NOT_ENOUGH_ROOM: - out = utf8::append (replacement, out); - start = end; - break; - case internal::INVALID_LEAD: - out = utf8::append (replacement, out); - ++start; - break; - case internal::INCOMPLETE_SEQUENCE: - case internal::OVERLONG_SEQUENCE: - case internal::INVALID_CODE_POINT: - out = utf8::append (replacement, out); - ++start; - // just one replacement mark for the sequence - while (start != end && utf8::internal::is_trail(*start)) - ++start; - break; - } - } - return out; - } - - template - inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) - { - static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); - return utf8::replace_invalid(start, end, out, replacement_marker); - } - - template - uint32_t next(octet_iterator& it, octet_iterator end) - { - uint32_t cp = 0; - internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); - switch (err_code) { - case internal::UTF8_OK : - break; - case internal::NOT_ENOUGH_ROOM : - throw not_enough_room(); - case internal::INVALID_LEAD : - case internal::INCOMPLETE_SEQUENCE : - case internal::OVERLONG_SEQUENCE : - throw invalid_utf8(static_cast(*it)); - case internal::INVALID_CODE_POINT : - throw invalid_code_point(cp); - } - return cp; - } - - template - uint32_t peek_next(octet_iterator it, octet_iterator end) - { - return utf8::next(it, end); - } - - template - uint32_t prior(octet_iterator& it, octet_iterator start) - { - // can't do much if it == start - if (it == start) - throw not_enough_room(); - - octet_iterator end = it; - // Go back until we hit either a lead octet or start - while (utf8::internal::is_trail(*(--it))) - if (it == start) - throw invalid_utf8(*it); // error - no lead byte in the sequence - return utf8::peek_next(it, end); - } - - template - void advance 
(octet_iterator& it, distance_type n, octet_iterator end) - { - const distance_type zero(0); - if (n < zero) { - // backward - for (distance_type i = n; i < zero; ++i) - utf8::prior(it, end); - } else { - // forward - for (distance_type i = zero; i < n; ++i) - utf8::next(it, end); - } - } - - template - typename std::iterator_traits::difference_type - distance (octet_iterator first, octet_iterator last) - { - typename std::iterator_traits::difference_type dist; - for (dist = 0; first < last; ++dist) - utf8::next(first, last); - return dist; - } - - template - octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) - { - while (start != end) { - uint32_t cp = utf8::internal::mask16(*start++); - // Take care of surrogate pairs first - if (utf8::internal::is_lead_surrogate(cp)) { - if (start != end) { - uint32_t trail_surrogate = utf8::internal::mask16(*start++); - if (utf8::internal::is_trail_surrogate(trail_surrogate)) - cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; - else - throw invalid_utf16(static_cast(trail_surrogate)); - } - else - throw invalid_utf16(static_cast(cp)); - - } - // Lone trail surrogate - else if (utf8::internal::is_trail_surrogate(cp)) - throw invalid_utf16(static_cast(cp)); - - result = utf8::append(cp, result); - } - return result; - } - - template - u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) - { - while (start < end) { - uint32_t cp = utf8::next(start, end); - if (cp > 0xffff) { //make a surrogate pair - *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); - *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); - } - else - *result++ = static_cast(cp); - } - return result; - } - - template - octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) - { - while (start != end) - result = utf8::append(*(start++), result); - - return result; - } - - template - u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) - { - while (start < end) - (*result++) = utf8::next(start, end); - - return result; - } - - // The iterator class - template - class iterator { - octet_iterator it; - octet_iterator range_start; - octet_iterator range_end; - public: - typedef uint32_t value_type; - typedef uint32_t* pointer; - typedef uint32_t& reference; - typedef std::ptrdiff_t difference_type; - typedef std::bidirectional_iterator_tag iterator_category; - iterator () {} - explicit iterator (const octet_iterator& octet_it, - const octet_iterator& rangestart, - const octet_iterator& rangeend) : - it(octet_it), range_start(rangestart), range_end(rangeend) - { - if (it < range_start || it > range_end) - throw std::out_of_range("Invalid utf-8 iterator position"); - } - // the default "big three" are OK - octet_iterator base () const { return it; } - uint32_t operator * () const - { - octet_iterator temp = it; - return utf8::next(temp, range_end); - } - bool operator == (const iterator& rhs) const - { - if (range_start != rhs.range_start || range_end != rhs.range_end) - throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); - return (it == rhs.it); - } - bool operator != (const iterator& rhs) const - { - return !(operator == (rhs)); - } - iterator& operator ++ () - { - utf8::next(it, range_end); - return *this; - } - iterator operator ++ (int) - { - iterator temp = *this; - utf8::next(it, range_end); - return temp; - } - iterator& operator -- () - { - utf8::prior(it, 
range_start); - return *this; - } - iterator operator -- (int) - { - iterator temp = *this; - utf8::prior(it, range_start); - return temp; - } - }; // class iterator - -} // namespace utf8 - -#if UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later -#include "cpp17.h" -#elif UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later -#include "cpp11.h" -#endif // C++ 11 or later - -#endif //header guard - diff --git a/paddlenlp/ops/fast_transformer/src/demo/utf8/core.h b/paddlenlp/ops/fast_transformer/src/demo/utf8/core.h deleted file mode 100644 index 34371ee31c8c..000000000000 --- a/paddlenlp/ops/fast_transformer/src/demo/utf8/core.h +++ /dev/null @@ -1,387 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include - -// Determine the C++ standard version. -// If the user defines UTF_CPP_CPLUSPLUS, use that. -// Otherwise, trust the unreliable predefined macro __cplusplus - -#if !defined UTF_CPP_CPLUSPLUS - #define UTF_CPP_CPLUSPLUS __cplusplus -#endif - -#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later - #define UTF_CPP_OVERRIDE override - #define UTF_CPP_NOEXCEPT noexcept -#else // C++ 98/03 - #define UTF_CPP_OVERRIDE - #define UTF_CPP_NOEXCEPT throw() -#endif // C++ 11 or later - - -namespace utf8 -{ - // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers - // You may need to change them to match your system. - // These typedefs have the same names as ones from cstdint, or boost/cstdint - typedef unsigned char uint8_t; - typedef unsigned short uint16_t; - typedef unsigned int uint32_t; - -// Helper code - not intended to be directly called by the library users. 
May be changed at any time -namespace internal -{ - // Unicode constants - // Leading (high) surrogates: 0xd800 - 0xdbff - // Trailing (low) surrogates: 0xdc00 - 0xdfff - const uint16_t LEAD_SURROGATE_MIN = 0xd800u; - const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; - const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; - const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; - const uint16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10) - const uint32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN - - // Maximum valid value for a Unicode code point - const uint32_t CODE_POINT_MAX = 0x0010ffffu; - - template - inline uint8_t mask8(octet_type oc) - { - return static_cast(0xff & oc); - } - template - inline uint16_t mask16(u16_type oc) - { - return static_cast(0xffff & oc); - } - template - inline bool is_trail(octet_type oc) - { - return ((utf8::internal::mask8(oc) >> 6) == 0x2); - } - - template - inline bool is_lead_surrogate(u16 cp) - { - return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); - } - - template - inline bool is_trail_surrogate(u16 cp) - { - return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); - } - - template - inline bool is_surrogate(u16 cp) - { - return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); - } - - template - inline bool is_code_point_valid(u32 cp) - { - return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); - } - - template - inline typename std::iterator_traits::difference_type - sequence_length(octet_iterator lead_it) - { - uint8_t lead = utf8::internal::mask8(*lead_it); - if (lead < 0x80) - return 1; - else if ((lead >> 5) == 0x6) - return 2; - else if ((lead >> 4) == 0xe) - return 3; - else if ((lead >> 3) == 0x1e) - return 4; - else - return 0; - } - - template - inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length) - { - if (cp < 0x80) { - if (length != 1) - return true; - } - else if (cp < 0x800) { - if (length != 2) - return true; - } - else if (cp < 0x10000) { - if (length != 3) - return true; - } - - return false; - } - - enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; - - /// Helper for get_sequence_x - template - utf_error increase_safely(octet_iterator& it, octet_iterator end) - { - if (++it == end) - return NOT_ENOUGH_ROOM; - - if (!utf8::internal::is_trail(*it)) - return INCOMPLETE_SEQUENCE; - - return UTF8_OK; - } - - #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} - - /// get_sequence_x functions decode utf-8 sequences of the length x - template - utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - return UTF8_OK; - } - - template - utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); - - return UTF8_OK; - } - - template - utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = ((code_point << 12) & 0xffff) + 
((utf8::internal::mask8(*it) << 6) & 0xfff); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point += (*it) & 0x3f; - - return UTF8_OK; - } - - template - utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point += (utf8::internal::mask8(*it) << 6) & 0xfff; - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point += (*it) & 0x3f; - - return UTF8_OK; - } - - #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR - - template - utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - // Save the original value of it so we can go back in case of failure - // Of course, it does not make much sense with i.e. stream iterators - octet_iterator original_it = it; - - uint32_t cp = 0; - // Determine the sequence length based on the lead octet - typedef typename std::iterator_traits::difference_type octet_difference_type; - const octet_difference_type length = utf8::internal::sequence_length(it); - - // Get trail octets and calculate the code point - utf_error err = UTF8_OK; - switch (length) { - case 0: - return INVALID_LEAD; - case 1: - err = utf8::internal::get_sequence_1(it, end, cp); - break; - case 2: - err = utf8::internal::get_sequence_2(it, end, cp); - break; - case 3: - err = utf8::internal::get_sequence_3(it, end, cp); - break; - case 4: - err = utf8::internal::get_sequence_4(it, end, cp); - break; - } - - if (err == UTF8_OK) { - // Decoding succeeded. Now, security checks... - if (utf8::internal::is_code_point_valid(cp)) { - if (!utf8::internal::is_overlong_sequence(cp, length)){ - // Passed! Return here. - code_point = cp; - ++it; - return UTF8_OK; - } - else - err = OVERLONG_SEQUENCE; - } - else - err = INVALID_CODE_POINT; - } - - // Failure branch - restore the original value of the iterator - it = original_it; - return err; - } - - template - inline utf_error validate_next(octet_iterator& it, octet_iterator end) { - uint32_t ignored; - return utf8::internal::validate_next(it, end, ignored); - } - - // Internal implementation of both checked and unchecked append() function - // This function will be invoked by the overloads below, as they will know - // the octet_type. 
- template - octet_iterator append(uint32_t cp, octet_iterator result) { - if (cp < 0x80) // one octet - *(result++) = static_cast(cp); - else if (cp < 0x800) { // two octets - *(result++) = static_cast((cp >> 6) | 0xc0); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else if (cp < 0x10000) { // three octets - *(result++) = static_cast((cp >> 12) | 0xe0); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else { // four octets - *(result++) = static_cast((cp >> 18) | 0xf0); - *(result++) = static_cast(((cp >> 12) & 0x3f)| 0x80); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - return result; - } - - // One of the following overloads will be invoked from the API calls - - // A simple (but dangerous) case: the caller appends byte(s) to a char array - inline char* append(uint32_t cp, char* result) { - return append(cp, result); - } - - // Hopefully, most common case: the caller uses back_inserter - // i.e. append(cp, std::back_inserter(str)); - template - std::back_insert_iterator append - (uint32_t cp, std::back_insert_iterator result) { - return append, - typename container_type::value_type>(cp, result); - } - - // The caller uses some other kind of output operator - not covered above - // Note that in this case we are not able to determine octet_type - // so we assume it's uint_8; that can cause a conversion warning if we are wrong. - template - octet_iterator append(uint32_t cp, octet_iterator result) { - return append(cp, result); - } - -} // namespace internal - - /// The library API - functions intended to be called by the users - - // Byte order mark - const uint8_t bom[] = {0xef, 0xbb, 0xbf}; - - template - octet_iterator find_invalid(octet_iterator start, octet_iterator end) - { - octet_iterator result = start; - while (result != end) { - utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); - if (err_code != internal::UTF8_OK) - return result; - } - return result; - } - - template - inline bool is_valid(octet_iterator start, octet_iterator end) - { - return (utf8::find_invalid(start, end) == end); - } - - template - inline bool starts_with_bom (octet_iterator it, octet_iterator end) - { - return ( - ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && - ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && - ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) - ); - } -} // namespace utf8 - -#endif // header guard - - diff --git a/paddlenlp/ops/fast_transformer/src/demo/utf8/cpp11.h b/paddlenlp/ops/fast_transformer/src/demo/utf8/cpp11.h deleted file mode 100644 index 2366f12915cb..000000000000 --- a/paddlenlp/ops/fast_transformer/src/demo/utf8/cpp11.h +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2018 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies 
or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1 -#define UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1 - -#include "checked.h" -#include - -namespace utf8 -{ - - inline void append(char32_t cp, std::string& s) - { - append(uint32_t(cp), std::back_inserter(s)); - } - - inline std::string utf16to8(const std::u16string& s) - { - std::string result; - utf16to8(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::u16string utf8to16(const std::string& s) - { - std::u16string result; - utf8to16(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::string utf32to8(const std::u32string& s) - { - std::string result; - utf32to8(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::u32string utf8to32(const std::string& s) - { - std::u32string result; - utf8to32(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::size_t find_invalid(const std::string& s) - { - std::string::const_iterator invalid = find_invalid(s.begin(), s.end()); - return (invalid == s.end()) ? std::string::npos : static_cast(invalid - s.begin()); - } - - inline bool is_valid(const std::string& s) - { - return is_valid(s.begin(), s.end()); - } - - inline std::string replace_invalid(const std::string& s, char32_t replacement) - { - std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); - return result; - } - - inline std::string replace_invalid(const std::string& s) - { - std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline bool starts_with_bom(const std::string& s) - { - return starts_with_bom(s.begin(), s.end()); - } - -} // namespace utf8 - -#endif // header guard - diff --git a/paddlenlp/ops/fast_transformer/src/demo/utf8/cpp17.h b/paddlenlp/ops/fast_transformer/src/demo/utf8/cpp17.h deleted file mode 100644 index 32a77ce30750..000000000000 --- a/paddlenlp/ops/fast_transformer/src/demo/utf8/cpp17.h +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2018 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. 
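Taken together with checked.h, these std::string overloads provide the round-trip conversions the GPT demo relies on (utf8::utf8to32 on vocabulary lines). A small self-contained sketch, assuming the code is compiled as C++11 or later so this header is pulled in; the sample string is arbitrary.

    #include <iostream>
    #include <string>

    #include "utf8.h"

    int main() {
      std::string text = "Paddle \xE9\xA3\x9E\xE6\xA1\xA8";  // "Paddle 飞桨" in UTF-8
      try {
        if (utf8::is_valid(text)) {
          std::u32string cps = utf8::utf8to32(text);  // decode to code points
          std::string round = utf8::utf32to8(cps);    // encode back to UTF-8
          std::cout << (round == text) << "\n";       // prints 1 for valid input
        }
      } catch (const utf8::exception& e) {            // invalid_utf8, not_enough_room, ...
        std::cout << e.what() << "\n";
      }
    }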
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9 -#define UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9 - -#include "checked.h" -#include - -namespace utf8 -{ - - inline void append(char32_t cp, std::string& s) - { - append(uint32_t(cp), std::back_inserter(s)); - } - - inline std::string utf16to8(std::u16string_view s) - { - std::string result; - utf16to8(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::u16string utf8to16(std::string_view s) - { - std::u16string result; - utf8to16(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::string utf32to8(std::u32string_view s) - { - std::string result; - utf32to8(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::u32string utf8to32(std::string_view s) - { - std::u32string result; - utf8to32(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline std::size_t find_invalid(std::string_view s) - { - std::string_view::const_iterator invalid = find_invalid(s.begin(), s.end()); - return (invalid == s.end()) ? std::string_view::npos : static_cast(invalid - s.begin()); - } - - inline bool is_valid(std::string_view s) - { - return is_valid(s.begin(), s.end()); - } - - inline std::string replace_invalid(std::string_view s, char32_t replacement) - { - std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); - return result; - } - - inline std::string replace_invalid(std::string_view s) - { - std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline bool starts_with_bom(std::string_view s) - { - return starts_with_bom(s.begin(), s.end()); - } - -} // namespace utf8 - -#endif // header guard - diff --git a/paddlenlp/ops/fast_transformer/src/demo/utf8/unchecked.h b/paddlenlp/ops/fast_transformer/src/demo/utf8/unchecked.h deleted file mode 100644 index 8fe83c9ecbc7..000000000000 --- a/paddlenlp/ops/fast_transformer/src/demo/utf8/unchecked.h +++ /dev/null @@ -1,257 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "core.h" - -namespace utf8 -{ - namespace unchecked - { - template - octet_iterator append(uint32_t cp, octet_iterator result) - { - return internal::append(cp, result); - } - - template - output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) - { - while (start != end) { - octet_iterator sequence_start = start; - internal::utf_error err_code = utf8::internal::validate_next(start, end); - switch (err_code) { - case internal::UTF8_OK : - for (octet_iterator it = sequence_start; it != start; ++it) - *out++ = *it; - break; - case internal::NOT_ENOUGH_ROOM: - out = utf8::unchecked::append (replacement, out); - start = end; - break; - case internal::INVALID_LEAD: - out = utf8::unchecked::append (replacement, out); - ++start; - break; - case internal::INCOMPLETE_SEQUENCE: - case internal::OVERLONG_SEQUENCE: - case internal::INVALID_CODE_POINT: - out = utf8::unchecked::append (replacement, out); - ++start; - // just one replacement mark for the sequence - while (start != end && utf8::internal::is_trail(*start)) - ++start; - break; - } - } - return out; - } - - template - inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) - { - static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); - return utf8::unchecked::replace_invalid(start, end, out, replacement_marker); - } - - template - uint32_t next(octet_iterator& it) - { - uint32_t cp = utf8::internal::mask8(*it); - typename std::iterator_traits::difference_type length = utf8::internal::sequence_length(it); - switch (length) { - case 1: - break; - case 2: - it++; - cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); - break; - case 3: - ++it; - cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); - ++it; - cp += (*it) & 0x3f; - break; - case 4: - ++it; - cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); - ++it; - cp += (utf8::internal::mask8(*it) << 6) & 0xfff; - ++it; - cp += (*it) & 0x3f; - break; - } - ++it; - return cp; - } - - template - uint32_t peek_next(octet_iterator it) - { - return utf8::unchecked::next(it); - } - - template - uint32_t prior(octet_iterator& it) - { - while (utf8::internal::is_trail(*(--it))) ; - octet_iterator temp = it; - return utf8::unchecked::next(temp); - } - - template - void advance (octet_iterator& it, distance_type n) - { - const distance_type zero(0); - if (n < zero) { - // backward - for (distance_type i = n; i < zero; ++i) - utf8::unchecked::prior(it); - } else { - // forward - for (distance_type i = zero; i < n; ++i) - utf8::unchecked::next(it); - } - } - - template - typename std::iterator_traits::difference_type - distance (octet_iterator first, octet_iterator last) - { - typename std::iterator_traits::difference_type dist; - for (dist = 0; first < last; ++dist) - utf8::unchecked::next(first); - return 
dist; - } - - template - octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) - { - while (start != end) { - uint32_t cp = utf8::internal::mask16(*start++); - // Take care of surrogate pairs first - if (utf8::internal::is_lead_surrogate(cp)) { - uint32_t trail_surrogate = utf8::internal::mask16(*start++); - cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; - } - result = utf8::unchecked::append(cp, result); - } - return result; - } - - template - u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) - { - while (start < end) { - uint32_t cp = utf8::unchecked::next(start); - if (cp > 0xffff) { //make a surrogate pair - *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); - *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); - } - else - *result++ = static_cast(cp); - } - return result; - } - - template - octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) - { - while (start != end) - result = utf8::unchecked::append(*(start++), result); - - return result; - } - - template - u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) - { - while (start < end) - (*result++) = utf8::unchecked::next(start); - - return result; - } - - // The iterator class - template - class iterator { - octet_iterator it; - public: - typedef uint32_t value_type; - typedef uint32_t* pointer; - typedef uint32_t& reference; - typedef std::ptrdiff_t difference_type; - typedef std::bidirectional_iterator_tag iterator_category; - iterator () {} - explicit iterator (const octet_iterator& octet_it): it(octet_it) {} - // the default "big three" are OK - octet_iterator base () const { return it; } - uint32_t operator * () const - { - octet_iterator temp = it; - return utf8::unchecked::next(temp); - } - bool operator == (const iterator& rhs) const - { - return (it == rhs.it); - } - bool operator != (const iterator& rhs) const - { - return !(operator == (rhs)); - } - iterator& operator ++ () - { - ::std::advance(it, utf8::internal::sequence_length(it)); - return *this; - } - iterator operator ++ (int) - { - iterator temp = *this; - ::std::advance(it, utf8::internal::sequence_length(it)); - return temp; - } - iterator& operator -- () - { - utf8::unchecked::prior(it); - return *this; - } - iterator operator -- (int) - { - iterator temp = *this; - utf8::unchecked::prior(it); - return temp; - } - }; // class iterator - - } // namespace utf8::unchecked -} // namespace utf8 - - -#endif // header guard - diff --git a/paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.cc deleted file mode 100644 index ed182f854001..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.cc +++ /dev/null @@ -1,352 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include - -#include "fusion_bart_decoding_op.h" -#include "pd_traits.h" - - -std::vector BartDecodingForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const float& temperature, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const int64_t& min_len, - const float& beam_search_diversity_rate, - const bool& rel_len, - const float& alpha, - const bool& early_stopping) { - int batch_size = input.shape()[0]; - int max_out_len = rel_len ? max_len + input.shape()[1] : max_len; - int min_out_len = rel_len ? min_len + input.shape()[1] : min_len; - - std::vector output_dims; - std::vector parent_ids_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - batch_size /= beam_size; - output_dims = {max_out_len, batch_size, beam_size}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_out_len, batch_size, beam_size * 2}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - output_dims = {max_out_len, batch_size}; - parent_ids_dims = {1}; - } else { - PD_THROW("Not supported decoding strategy. 
"); - } - - if (input.place() == paddle::PlaceType::kGPU) { - auto output_ids = paddle::Tensor(paddle::PlaceType::kGPU, output_dims); - auto parent_ids = paddle::Tensor(paddle::PlaceType::kGPU, parent_ids_dims); - auto sequence_length = - paddle::Tensor(paddle::PlaceType::kGPU, sequence_length_dims); - - paddle::Tensor seq_len = paddle::Tensor(paddle::PlaceType::kGPU); - - if (mem_seq_len.place() != paddle::PlaceType::kGPU) { - seq_len = mem_seq_len.copy_to(paddle::PlaceType::kGPU); - } else { - seq_len = mem_seq_len; - } - - return BartDecodingCUDAForward(input, - seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - temperature, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_out_len, - min_out_len, - beam_search_diversity_rate, - alpha, - early_stopping); - } else { - PD_THROW("Not implemented place. Only GPU is supported. "); - } -} - -std::vector> BartDecodingInferShape( - const std::vector& input_shape, - const std::vector& mem_seq_len_shape, - const std::vector& word_embedding_shape, - const std::vector>& self_ln_weight_shapes, - const std::vector>& self_ln_bias_shapes, - const std::vector>& self_q_weight_shapes, - const std::vector>& self_q_bias_shapes, - const std::vector>& self_k_weight_shapes, - const std::vector>& self_k_bias_shapes, - const std::vector>& self_v_weight_shapes, - const std::vector>& self_v_bias_shapes, - const std::vector>& self_out_weight_shapes, - const std::vector>& self_out_bias_shapes, - const std::vector>& cross_ln_weight_shapes, - const std::vector>& cross_ln_bias_shapes, - const std::vector>& cross_q_weight_shapes, - const std::vector>& cross_q_bias_shapes, - const std::vector>& cross_k_weight_shapes, - const std::vector>& cross_k_bias_shapes, - const std::vector>& cross_v_weight_shapes, - const std::vector>& cross_v_bias_shapes, - const std::vector>& cross_out_weight_shapes, - const std::vector>& cross_out_bias_shapes, - const std::vector>& ffn_ln_weight_shapes, - const std::vector>& ffn_ln_bias_shapes, - const std::vector>& ffn_inter_weight_shapes, - const std::vector>& ffn_inter_bias_shapes, - const std::vector>& ffn_out_weight_shapes, - const std::vector>& ffn_out_bias_shapes, - const std::vector& decoder_ln_weight_shape, - const std::vector& decoder_ln_bias_shape, - const std::vector& embedding_weight_shape, - const std::vector& embedding_bias_shape, - const std::vector& positional_embedding_weight_shape, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const float& temperature, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const int64_t& min_len, - const float& beam_search_diversity_rate, - const bool& rel_len, - const float& alpha, - const bool& early_stopping) { - int batch_size = input_shape[0]; - - std::vector output_dims; - std::vector 
sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - if (batch_size != -1) { - batch_size /= beam_size; - } - output_dims = {max_len, batch_size, beam_size}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_len, batch_size, beam_size * 2}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - output_dims = {max_len, batch_size}; - return {output_dims, {1}, sequence_length_dims}; - } else { - PD_THROW("Not supported decoding strategy. "); - } -} - -std::vector BartDecodingInferDtype( - const paddle::DataType& input, - const paddle::DataType& mem_seq_len, - const paddle::DataType& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::DataType& decoder_ln_weight, - const paddle::DataType& decoder_ln_bias, - const paddle::DataType& embedding_weight, - const paddle::DataType& embedding_bias, - const paddle::DataType& positional_embedding_weight) { - return {paddle::DataType::INT32, - paddle::DataType::INT32, - paddle::DataType::INT32}; -} - -PD_BUILD_OP(fusion_bart_decoding) - .Inputs({"Input", - "MemSeqLen", - "WordEmbedding", - paddle::Vec("SelfLayernormWeight"), - paddle::Vec("SelfLayernormBias"), - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfQueryBias"), - paddle::Vec("SelfKeyWeight"), - paddle::Vec("SelfKeyBias"), - paddle::Vec("SelfValueWeight"), - paddle::Vec("SelfValueBias"), - paddle::Vec("SelfOutWeight"), - paddle::Vec("SelfOutBias"), - paddle::Vec("CrossLayernormWeight"), - paddle::Vec("CrossLayernormBias"), - paddle::Vec("CrossQueryWeight"), - paddle::Vec("CrossQueryBias"), - paddle::Vec("CrossKeyWeight"), - paddle::Vec("CrossKeyBias"), - paddle::Vec("CrossValueWeight"), - paddle::Vec("CrossValueBias"), - paddle::Vec("CrossOutWeight"), - paddle::Vec("CrossOutBias"), - paddle::Vec("FFNLayernormWeight"), - paddle::Vec("FFNLayernormBias"), - paddle::Vec("FFNInterWeight"), - paddle::Vec("FFNInterBias"), - paddle::Vec("FFNOutWeight"), - paddle::Vec("FFNOutBias"), - "DecoderLayernormWeight", - "DecoderLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb"}) - .Outputs({"OutputIds", "ParentIds", 
"SequenceLength"}) - .Attrs({"decoding_strategy: std::string", - "beam_size: int", - "topk: int", - "topp: float", - "temperature: float", - "n_head: int", - "size_per_head: int", - "num_layer: int", - "bos_id: int", - "eos_id: int", - "max_len: int64_t", - "min_len: int64_t", - "beam_search_diversity_rate: float", - "rel_len: bool", - "alpha: float", - "early_stopping: bool"}) - .SetKernelFn(PD_KERNEL(BartDecodingForward)) - .SetInferShapeFn(PD_INFER_SHAPE(BartDecodingInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(BartDecodingInferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.cu deleted file mode 100644 index 11d454156788..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.cu +++ /dev/null @@ -1,581 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cublas_handle.h" - -#include "fusion_bart_decoding_op.h" -#include "pd_traits.h" - -template -std::vector bart_decoding_kernel( - const paddle::Tensor& input, - const paddle::Tensor& memory_sequence_length, - const paddle::Tensor& word_emb, - const std::vector& self_layernorm_weight, - const std::vector& self_layernorm_bias, - const std::vector& self_attn_query_weight, - const std::vector& self_attn_query_bias, - const std::vector& self_attn_key_weight, - const std::vector& self_attn_key_bias, - const std::vector& self_attn_value_weight, - const std::vector& self_attn_value_bias, - const std::vector& self_attn_output_weight, - const std::vector& self_attn_output_bias, - const std::vector& cross_layernorm_weight, - const std::vector& cross_layernorm_bias, - const std::vector& cross_attn_query_weight, - const std::vector& cross_attn_query_bias, - const std::vector& cross_attn_key_weight, - const std::vector& cross_attn_key_bias, - const std::vector& cross_attn_value_weight, - const std::vector& cross_attn_value_bias, - const std::vector& cross_attn_output_weight, - const std::vector& cross_attn_output_bias, - const std::vector& ffn_layernorm_weight, - const std::vector& ffn_layernorm_bias, - const std::vector& ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - const paddle::Tensor& decoder_layernorm_weight, - const paddle::Tensor& decoder_layernorm_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& position_encoding_table, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const float& temperature, - const int& head_num_, - const int& size_per_head_, - const int& num_layer_, - const int& start_id_, - const int& end_id_, - const int64_t& max_seq_len_, - const int64_t& 
min_seq_len_, - const float& beam_search_diversity_rate_, - const float& alpha, - const bool& early_stopping, - cudaStream_t stream) { - int beam_width_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? beam_size - : 1; - int candidate_num_ = (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") - ? topk - : 1; - float probability_threshold_ = (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") - ? topp - : 0.0; - - auto input_dims = input.shape(); - int batch_size_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? input_dims[0] / beam_width_ - : input_dims[0]; - const int memory_max_seq_len = input_dims[1]; - const int memory_hidden_dim = input_dims[2]; - const int vocab_size = word_emb.shape()[0]; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - DecodingInitParam decoding_params; - decoding_params.cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - decoding_params.cublaslt_handle = - CublasHandle::GetInstance()->cublaslt_handle_; - - decoding_params.output_ids = output_ids.mutable_data(input.place()); - decoding_params.parent_ids = parent_ids.mutable_data(input.place()); - decoding_params.sequence_length = - sequence_length.mutable_data(input.place()); - - typedef DecoderTransformerTraits DecodingTraits_; - decoding_params.stream = stream; - fastertransformer::Allocator allocator_(stream); - - decoding_params.memory_tensor = - reinterpret_cast(input.data()); - decoding_params.memory_sequence_length = memory_sequence_length.data(); - - //TODO(gongenlei): Support MP & PP - TensorParallelParam tensor_parallel_param; - LayerParallelParam layer_parallel_param; - tensor_parallel_param.rank = 0; - tensor_parallel_param.world_size = 1; - tensor_parallel_param.local_head_num_ = head_num_; - tensor_parallel_param.local_hidden_units_ = memory_hidden_dim; - layer_parallel_param.rank = 0; - layer_parallel_param.world_size = 1; - layer_parallel_param.layers_per_group = num_layer_; - layer_parallel_param.local_batch_size = batch_size_; - - DecoderInitParam* params = - new DecoderInitParam[num_layer_]; - - for (int i = 0; i < num_layer_; i++) { - params[i].stream = stream; - params[i].cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - params[i].cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - - if (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - params[i].request_batch_size = batch_size_ * beam_width_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } else if (decoding_strategy == "sampling" || - decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - params[i].request_batch_size = batch_size_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } - - // self attn - params[i].self_layernorm.gamma = reinterpret_cast( - self_layernorm_weight[i].data()); - params[i].self_layernorm.beta = reinterpret_cast( - self_layernorm_bias[i].data()); - // query - params[i].self_attention.query_weight.kernel = - reinterpret_cast( - self_attn_query_weight[i].data()); - params[i].self_attention.query_weight.bias = - reinterpret_cast( - self_attn_query_bias[i].data()); - // // key - // params[i].self_attention.key_weight.kernel = - // reinterpret_cast( - // 
self_attn_key_weight[i].data()); - // params[i].self_attention.key_weight.bias = - // reinterpret_cast( - // self_attn_key_bias[i].data()); - // // value - // params[i].self_attention.value_weight.kernel = - // reinterpret_cast( - // self_attn_value_weight[i].data()); - // params[i].self_attention.value_weight.bias = - // reinterpret_cast( - // self_attn_value_bias[i].data()); - - // key - params[i].self_attention.key_weight.kernel = nullptr; - params[i].self_attention.key_weight.bias = nullptr; - // value - params[i].self_attention.value_weight.kernel = nullptr; - params[i].self_attention.value_weight.bias = nullptr; - - // out proj - params[i].self_attention.attention_output_weight.kernel = - reinterpret_cast( - self_attn_output_weight[i].data()); - params[i].self_attention.attention_output_weight.bias = - reinterpret_cast( - self_attn_output_bias[i].data()); - - // cross - params[i].cross_layernorm.gamma = reinterpret_cast( - cross_layernorm_weight[i].data()); - params[i].cross_layernorm.beta = reinterpret_cast( - cross_layernorm_bias[i].data()); - // query - params[i].cross_attention.query_weight.kernel = - reinterpret_cast( - cross_attn_query_weight[i].data()); - params[i].cross_attention.query_weight.bias = - reinterpret_cast( - cross_attn_query_bias[i].data()); - // key - params[i].cross_attention.key_weight.kernel = - reinterpret_cast( - cross_attn_key_weight[i].data()); - params[i].cross_attention.key_weight.bias = - reinterpret_cast( - cross_attn_key_bias[i].data()); - // value - params[i].cross_attention.value_weight.kernel = - reinterpret_cast( - cross_attn_value_weight[i].data()); - params[i].cross_attention.value_weight.bias = - reinterpret_cast( - cross_attn_value_bias[i].data()); - // out proj - params[i].cross_attention.attention_output_weight.kernel = - reinterpret_cast( - cross_attn_output_weight[i].data()); - params[i].cross_attention.attention_output_weight.bias = - reinterpret_cast( - cross_attn_output_bias[i].data()); - - // ffn - params[i].ffn_layernorm.gamma = reinterpret_cast( - ffn_layernorm_weight[i].data()); - params[i].ffn_layernorm.beta = reinterpret_cast( - ffn_layernorm_bias[i].data()); - // intermediate proj - params[i].ffn.intermediate_weight.kernel = - reinterpret_cast( - ffn_intermediate_weight[i].data()); - params[i].ffn.intermediate_weight.bias = reinterpret_cast( - ffn_intermediate_bias[i].data()); - // out proj - params[i].ffn.output_weight.kernel = reinterpret_cast( - ffn_output_weight[i].data()); - params[i].ffn.output_weight.bias = - reinterpret_cast(ffn_output_bias[i].data()); - } - - decoding_params.layernorm.gamma = reinterpret_cast( - decoder_layernorm_weight.data()); - decoding_params.layernorm.beta = reinterpret_cast( - decoder_layernorm_bias.data()); - // for embedding - decoding_params.embedding_table = - reinterpret_cast(word_emb.data()); - - // for weight sharing matmul - decoding_params.embedding_kernel = - reinterpret_cast(embedding_weight.data()); - // for matmul bias - decoding_params.embedding_bias = - reinterpret_cast(embedding_bias.data()); - decoding_params.position_encoding_table = reinterpret_cast( - position_encoding_table.data()); - - int finished_candidate_num_ = - ("beam_search_v3" == decoding_strategy) ? 
beam_width_ : beam_width_ * 2; - - if ("beam_search" == decoding_strategy) { - DecodingBeamsearch* decoding_beamsearch_; - decoding_beamsearch_ = new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, /*is_fuse_topk_softMax*/ - true, /*is_fuse_qkv*/ - false, /*keep_alive_beam*/ - alpha, - false, /*normalization_before*/ - 2, /*pos_offset*/ - ActivationType::GELU, - false, /*pos_bias*/ - false, /*prefix_lm*/ - -1, /*finished_candidate_num*/ - false, /*early_stopping*/ - false, /*is_mbart*/ - min_seq_len_); - - decoding_beamsearch_->set_tensor_parallel_param( - tensor_parallel_param); - decoding_beamsearch_->set_layer_parallel_param( - layer_parallel_param); - - decoding_beamsearch_->forward(params, decoding_params); - - delete decoding_beamsearch_; - } else if ("beam_search_v2" == decoding_strategy || - "beam_search_v3" == decoding_strategy) { - DecodingBeamsearch* decoding_beamsearch_; - decoding_beamsearch_ = new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, /*is_fuse_topk_softMax*/ - true, /*is_fuse_qkv*/ - true, /*keep_alive_beam*/ - alpha, - false, /*normalization_before*/ - 2, - ActivationType::GELU, - false, /*pos_bias*/ - false, /*prefix_lm*/ - finished_candidate_num_, - early_stopping, - false, /*is_mbart*/ - min_seq_len_); - - decoding_beamsearch_->set_tensor_parallel_param( - tensor_parallel_param); - decoding_beamsearch_->set_layer_parallel_param( - layer_parallel_param); - - decoding_beamsearch_->forward(params, decoding_params); - - delete decoding_beamsearch_; - } else if ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || - "sampling" == decoding_strategy) { - DecodingSampling* decoding_sampling_; - decoding_sampling_ = - new DecodingSampling(allocator_, - batch_size_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - candidate_num_, - probability_threshold_, - true, /*is_fuse_qkv*/ - false, /*normalization_before*/ - 2, /*pos_offset*/ - ActivationType::GELU, - false, /*pos_bias*/ - temperature, /*temperature*/ - 1.0, /*repeat_penalty*/ - false, /*prefix_lm*/ - false, /*is_mbart*/ - min_seq_len_); - decoding_sampling_->set_tensor_parallel_param( - tensor_parallel_param); - decoding_sampling_->set_layer_parallel_param( - layer_parallel_param); - - decoding_sampling_->forward(params, decoding_params); - - delete decoding_sampling_; - } else { - PD_THROW( - "Only beam_search, topk_sampling and topp_sampling are supported for " - "FastGeneration. 
"); - } - delete[] params; - - return {output_ids, parent_ids, sequence_length}; -} - -std::vector BartDecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const float& temperature, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const int64_t& min_len, - const float& beam_search_diversity_rate, - const float& alpha, - const bool& early_stopping) { - auto stream = input.stream(); - - cublasSetStream(CublasHandle::GetInstance()->cublas_handle_, stream); - - std::vector ret; - - switch (input.type()) { - case paddle::DataType::FLOAT16: { - ret = bart_decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - temperature, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - min_len, - beam_search_diversity_rate, - alpha, - early_stopping, - stream); - break; - } - case paddle::DataType::FLOAT32: { - ret = bart_decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - 
decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - temperature, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - min_len, - beam_search_diversity_rate, - alpha, - early_stopping, - stream); - break; - } - default: { - PD_THROW( - "NOT supported data type. " - "Only float16 and float32 are supported. "); - break; - } - } - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.h b/paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.h deleted file mode 100644 index 82219aba6ebe..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_bart_decoding_op.h +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include -#include - -#include "fastertransformer/decoding_beamsearch.h" -#include "fastertransformer/decoding_sampling.h" -#include "fastertransformer/open_decoder.h" -#include "fastertransformer/utils/common.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - - -std::vector BartDecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const float& temperature, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const int64_t& min_len, - const float& beam_search_diversity_rate, - 
const float& alpha, - const bool& early_stopping); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_decoder_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_decoder_op.cc deleted file mode 100644 index e0d055bde5fe..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_decoder_op.cc +++ /dev/null @@ -1,228 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include - -#include "fusion_decoder_op.h" -#include "pd_traits.h" - - -std::vector DecoderForward( - const paddle::Tensor& from_tensor, - const paddle::Tensor& memory_tensor, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& self_ln_weight, - const paddle::Tensor& self_ln_bias, - const paddle::Tensor& self_q_weight, - const paddle::Tensor& self_q_bias, - const paddle::Tensor& self_k_weight, - const paddle::Tensor& self_k_bias, - const paddle::Tensor& self_v_weight, - const paddle::Tensor& self_v_bias, - const paddle::Tensor& self_out_weight, - const paddle::Tensor& self_out_bias, - const paddle::Tensor& cross_ln_weight, - const paddle::Tensor& cross_ln_bias, - const paddle::Tensor& cross_q_weight, - const paddle::Tensor& cross_q_bias, - const paddle::Tensor& cross_k_weight, - const paddle::Tensor& cross_k_bias, - const paddle::Tensor& cross_v_weight, - const paddle::Tensor& cross_v_bias, - const paddle::Tensor& cross_out_weight, - const paddle::Tensor& cross_out_bias, - const paddle::Tensor& ffn_ln_weight, - const paddle::Tensor& ffn_ln_bias, - const paddle::Tensor& ffn_inter_weight, - const paddle::Tensor& ffn_inter_bias, - const paddle::Tensor& ffn_out_weight, - const paddle::Tensor& ffn_out_bias, - const paddle::Tensor& old_self_cache_key, - const paddle::Tensor& old_self_cache_value, - const paddle::Tensor& old_mem_cache, - const int step, - int n_head, - int size_per_head, - int memory_hidden_dim, - bool is_fuse_qkv) { - const int batch_size = memory_tensor.shape()[0]; - std::vector output_dims; - output_dims = {batch_size, 1, n_head * size_per_head}; - - auto new_self_cache_key = old_self_cache_key; - auto new_self_cache_value = old_self_cache_value; - auto new_mem_cache = old_mem_cache; - - if (from_tensor.place() == paddle::PlaceType::kGPU) { - auto decoder_output = paddle::Tensor(from_tensor.place(), output_dims); - - paddle::Tensor _mem_seq_len = paddle::Tensor(paddle::PlaceType::kGPU); - - if (mem_seq_len.place() != paddle::PlaceType::kGPU) { - _mem_seq_len = mem_seq_len.copy_to(paddle::PlaceType::kGPU); - } else { - _mem_seq_len = mem_seq_len; - } - - return DecoderCUDAForward(from_tensor, - memory_tensor, - _mem_seq_len, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - 
ffn_out_weight, - ffn_out_bias, - old_self_cache_key, - old_self_cache_value, - old_mem_cache, - step, - decoder_output, - new_self_cache_key, - new_self_cache_value, - new_mem_cache, - n_head, - size_per_head, - memory_hidden_dim, - is_fuse_qkv); - } else { - PD_THROW("Not implemented place. Only GPU is supported. "); - } -} - -std::vector> DecoderInferShape( - const std::vector& from_tensor_shape, - const std::vector& memory_tensor_shape, - const std::vector& mem_seq_len_shape, - const std::vector& self_ln_weight_shapes, - const std::vector& self_ln_bias_shapes, - const std::vector& self_q_weight_shapes, - const std::vector& self_q_bias_shapes, - const std::vector& self_k_weight_shapes, - const std::vector& self_k_bias_shapes, - const std::vector& self_v_weight_shapes, - const std::vector& self_v_bias_shapes, - const std::vector& self_out_weight_shapes, - const std::vector& self_out_bias_shapes, - const std::vector& cross_ln_weight_shapes, - const std::vector& cross_ln_bias_shapes, - const std::vector& cross_q_weight_shapes, - const std::vector& cross_q_bias_shapes, - const std::vector& cross_k_weight_shapes, - const std::vector& cross_k_bias_shapes, - const std::vector& cross_v_weight_shapes, - const std::vector& cross_v_bias_shapes, - const std::vector& cross_out_weight_shapes, - const std::vector& cross_out_bias_shapes, - const std::vector& ffn_ln_weight_shapes, - const std::vector& ffn_ln_bias_shapes, - const std::vector& ffn_inter_weight_shapes, - const std::vector& ffn_inter_bias_shapes, - const std::vector& ffn_out_weight_shapes, - const std::vector& ffn_out_bias_shapes, - const std::vector& old_self_cache_key_shape, - const std::vector& old_self_cache_value_shape, - const std::vector& old_mem_cache_shape, - const int& step, - const int& n_head, - const int& size_per_head, - const int& memory_hidden_dim, - const bool& is_fuse_qkv) { - return {from_tensor_shape, - old_self_cache_key_shape, - old_self_cache_value_shape, - old_mem_cache_shape}; -} - -std::vector DecoderInferDtype( - const paddle::DataType& from_tensor, - const paddle::DataType& memory_tensor, - const paddle::DataType& mem_seq_len, - const paddle::DataType& self_ln_weight, - const paddle::DataType& self_ln_bias, - const paddle::DataType& self_q_weight, - const paddle::DataType& self_q_bias, - const paddle::DataType& self_k_weight, - const paddle::DataType& self_k_bias, - const paddle::DataType& self_v_weight, - const paddle::DataType& self_v_bias, - const paddle::DataType& self_out_weight, - const paddle::DataType& self_out_bias, - const paddle::DataType& cross_ln_weight, - const paddle::DataType& cross_ln_bias, - const paddle::DataType& cross_q_weight, - const paddle::DataType& cross_q_bias, - const paddle::DataType& cross_k_weight, - const paddle::DataType& cross_k_bias, - const paddle::DataType& cross_v_weight, - const paddle::DataType& cross_v_bias, - const paddle::DataType& cross_out_weight, - const paddle::DataType& cross_out_bias, - const paddle::DataType& ffn_ln_weight, - const paddle::DataType& ffn_ln_bias, - const paddle::DataType& ffn_inter_weight, - const paddle::DataType& ffn_inter_bias, - const paddle::DataType& ffn_out_weight, - const paddle::DataType& ffn_out_bias, - const paddle::DataType& old_self_cache_key, - const paddle::DataType& old_self_cache_value, - const paddle::DataType& old_mem_cache) { - return {from_tensor, old_self_cache_key, old_self_cache_value, old_mem_cache}; -} - -PD_BUILD_OP(fusion_decoder) - .Inputs( - {"FromTensor", "MemoryTensor", "MemSeqLen", - "SelfLayernormWeight", 
"SelfLayernormBias", "SelfQueryWeight", - "SelfQueryBias", "SelfKeyWeight", "SelfKeyBias", - "SelfValueWeight", "SelfValueBias", "SelfOutWeight", - "SelfOutBias", "CrossLayernormWeight", "CrossLayernormBias", - "CrossQueryWeight", "CrossQueryBias", "CrossKeyWeight", - "CrossKeyBias", "CrossValueWeight", "CrossValueBias", - "CrossOutWeight", "CrossOutBias", "FFNLayernormWeight", - "FFNLayernormBias", "FFNInterWeight", "FFNInterBias", - "FFNOutWeight", "FFNOutBias", "OldSelfCacheKey", - "OldSelfCacheValue", "OldMemCache"}) - .Outputs({"DecoderOutput", - "NewSelfCacheKey", - "NewSelfCacheValue", - "NewMemCache"}) - .Attrs({"step: int", - "n_head: int", - "size_per_head: int", - "memory_hidden_dim: int", - "is_fuse_qkv: bool"}) - .SetKernelFn(PD_KERNEL(DecoderForward)) - .SetInferShapeFn(PD_INFER_SHAPE(DecoderInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(DecoderInferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_decoder_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_decoder_op.cu deleted file mode 100644 index efe05f4be58e..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_decoder_op.cu +++ /dev/null @@ -1,374 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "fusion_decoder_op.h" -#include "pd_traits.h" - - -template -std::vector decoder_kernel( - const paddle::Tensor& from_tensor_input, - const paddle::Tensor& memory_tensor_input, - const paddle::Tensor& mem_seq_len_input, - const paddle::Tensor& self_ln_weight, - const paddle::Tensor& self_ln_bias, - const paddle::Tensor& self_q_weight, - const paddle::Tensor& self_q_bias, - const paddle::Tensor& self_k_weight, - const paddle::Tensor& self_k_bias, - const paddle::Tensor& self_v_weight, - const paddle::Tensor& self_v_bias, - const paddle::Tensor& self_out_weight, - const paddle::Tensor& self_out_bias, - const paddle::Tensor& cross_ln_weight, - const paddle::Tensor& cross_ln_bias, - const paddle::Tensor& cross_q_weight, - const paddle::Tensor& cross_q_bias, - const paddle::Tensor& cross_k_weight, - const paddle::Tensor& cross_k_bias, - const paddle::Tensor& cross_v_weight, - const paddle::Tensor& cross_v_bias, - const paddle::Tensor& cross_out_weight, - const paddle::Tensor& cross_out_bias, - const paddle::Tensor& ffn_ln_weight, - const paddle::Tensor& ffn_ln_bias, - const paddle::Tensor& ffn_inter_weight, - const paddle::Tensor& ffn_inter_bias, - const paddle::Tensor& ffn_out_weight, - const paddle::Tensor& ffn_out_bias, - const paddle::Tensor& old_self_cache_key, - const paddle::Tensor& old_self_cache_value, - const paddle::Tensor& old_mem_cache, - const int step, - paddle::Tensor& decoder_output_tensor, - paddle::Tensor& new_self_cache_key, - paddle::Tensor& new_self_cache_value, - paddle::Tensor& new_mem_cache, - int n_head, - int size_per_head, - int memory_hidden_dim, - bool is_fuse_qkv, - cublasHandle_t cublas_handle_, - cublasLtHandle_t 
cublaslt_handle_, - cudaStream_t stream) { - auto input_dims = memory_tensor_input.shape(); - const int batch_size_ = static_cast(input_dims[0]); - const int max_seq_len_ = static_cast(input_dims[1]); - const int memory_hidden_dim_ = static_cast(memory_hidden_dim); - const bool is_fuse_qkv_ = static_cast(is_fuse_qkv); - - // Detect we use batch major - bool use_batch_major = - (old_self_cache_key.shape().size() == 5) ? true : false; - // we use decoder_max_seq_len == -1 to tell the decoder we use seq major cache - // format - int decoder_max_seq_len = - (use_batch_major) ? (int)old_self_cache_value.shape()[2] : -1; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - typedef DecoderTransformerTraits DecoderTraits_; - OpenDecoder* decoder_; - decoder_ = new OpenDecoder(n_head, - size_per_head, - memory_hidden_dim_, - is_fuse_qkv_, - true, - ActivationType::RELU); - - DataType_* decoder_output = reinterpret_cast( - decoder_output_tensor.mutable_data()); - DataType_* self_cache_key_tensor = reinterpret_cast( - const_cast(old_self_cache_key.data())); - DataType_* self_cache_value_tensor = reinterpret_cast( - const_cast(old_self_cache_value.data())); - DataType_* memory_cache = reinterpret_cast( - const_cast(old_mem_cache.data())); - const DataType_* from_tensor = - reinterpret_cast(from_tensor_input.data()); - const DataType_* memory_tensor = - reinterpret_cast(memory_tensor_input.data()); - const int* memory_sequence_length = mem_seq_len_input.data(); - - DecoderInitParam params; - params.cublas_handle = cublas_handle_; - params.cublaslt_handle = cublaslt_handle_; - params.stream = stream; - params.request_max_mem_seq_len = max_seq_len_; - params.request_batch_size = batch_size_; - fastertransformer::Allocator allocator_(stream); - - params.self_layernorm.gamma = - reinterpret_cast(self_ln_weight.data()); - params.self_layernorm.beta = - reinterpret_cast(self_ln_bias.data()); - params.self_attention.query_weight.kernel = - reinterpret_cast(self_q_weight.data()); - params.self_attention.query_weight.bias = - reinterpret_cast(self_q_bias.data()); - params.self_attention.key_weight.kernel = - reinterpret_cast(self_k_weight.data()); - params.self_attention.key_weight.bias = - reinterpret_cast(self_k_bias.data()); - params.self_attention.value_weight.kernel = - reinterpret_cast(self_v_weight.data()); - params.self_attention.value_weight.bias = - reinterpret_cast(self_v_bias.data()); - params.self_attention.attention_output_weight.kernel = - reinterpret_cast(self_out_weight.data()); - params.self_attention.attention_output_weight.bias = - reinterpret_cast(self_out_bias.data()); - params.cross_layernorm.gamma = - reinterpret_cast(cross_ln_weight.data()); - params.cross_layernorm.beta = - reinterpret_cast(cross_ln_bias.data()); - params.cross_attention.query_weight.kernel = - reinterpret_cast(cross_q_weight.data()); - params.cross_attention.query_weight.bias = - reinterpret_cast(cross_q_bias.data()); - params.cross_attention.key_weight.kernel = - reinterpret_cast(cross_k_weight.data()); - params.cross_attention.key_weight.bias = - reinterpret_cast(cross_k_bias.data()); - params.cross_attention.value_weight.kernel = - reinterpret_cast(cross_v_weight.data()); - params.cross_attention.value_weight.bias = - reinterpret_cast(cross_v_bias.data()); - params.cross_attention.attention_output_weight.kernel = - reinterpret_cast(cross_out_weight.data()); - params.cross_attention.attention_output_weight.bias = - 
reinterpret_cast(cross_out_bias.data()); - params.ffn_layernorm.gamma = - reinterpret_cast(ffn_ln_weight.data()); - params.ffn_layernorm.beta = - reinterpret_cast(ffn_ln_bias.data()); - params.ffn.intermediate_weight.kernel = - reinterpret_cast(ffn_inter_weight.data()); - params.ffn.intermediate_weight.bias = - reinterpret_cast(ffn_inter_bias.data()); - params.ffn.output_weight.kernel = - reinterpret_cast(ffn_out_weight.data()); - params.ffn.output_weight.bias = - reinterpret_cast(ffn_out_bias.data()); - - const int local_step = static_cast(step) + 1; - const int hidden_units = n_head * size_per_head; - DataType_* K_cache = self_cache_key_tensor; - DataType_* V_cache = self_cache_value_tensor; - DataType_* K_mem_cache = memory_cache; - DataType_* V_mem_cache = - memory_cache + batch_size_ * max_seq_len_ * hidden_units; - decoder_->set_max_batch_size(batch_size_); - - const int decoder_buffer_size = - decoder_->getWorkspaceSize() * sizeof(DataType_); - void* buf = - allocator_.malloc(((sizeof(DataType_) == 2) ? CUBLAS_WORKSPACE_SIZE : 0) + - decoder_buffer_size); - void* cublas_workspace = nullptr; - DataType_* decoder_buffer = (DataType_*)buf; - if (sizeof(DataType_) == 2) // half - { - cublas_workspace = buf; - decoder_buffer = - (DataType_*)((char*)cublas_workspace + CUBLAS_WORKSPACE_SIZE); - } - decoder_->initialize(params, decoder_buffer, cublas_workspace); - decoder_->forward(from_tensor, - memory_tensor, - K_cache, - V_cache, - K_mem_cache, - V_mem_cache, - memory_sequence_length, - decoder_output, - local_step, - decoder_max_seq_len, - true); - allocator_.free(decoder_buffer); - delete decoder_; - return {decoder_output_tensor, - new_self_cache_key, - new_self_cache_value, - new_mem_cache}; -} - -std::vector DecoderCUDAForward( - const paddle::Tensor& from_tensor, - const paddle::Tensor& memory_tensor, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& self_ln_weight, - const paddle::Tensor& self_ln_bias, - const paddle::Tensor& self_q_weight, - const paddle::Tensor& self_q_bias, - const paddle::Tensor& self_k_weight, - const paddle::Tensor& self_k_bias, - const paddle::Tensor& self_v_weight, - const paddle::Tensor& self_v_bias, - const paddle::Tensor& self_out_weight, - const paddle::Tensor& self_out_bias, - const paddle::Tensor& cross_ln_weight, - const paddle::Tensor& cross_ln_bias, - const paddle::Tensor& cross_q_weight, - const paddle::Tensor& cross_q_bias, - const paddle::Tensor& cross_k_weight, - const paddle::Tensor& cross_k_bias, - const paddle::Tensor& cross_v_weight, - const paddle::Tensor& cross_v_bias, - const paddle::Tensor& cross_out_weight, - const paddle::Tensor& cross_out_bias, - const paddle::Tensor& ffn_ln_weight, - const paddle::Tensor& ffn_ln_bias, - const paddle::Tensor& ffn_inter_weight, - const paddle::Tensor& ffn_inter_bias, - const paddle::Tensor& ffn_out_weight, - const paddle::Tensor& ffn_out_bias, - const paddle::Tensor& old_self_cache_key, - const paddle::Tensor& old_self_cache_value, - const paddle::Tensor& old_mem_cache, - const int step, - paddle::Tensor& decoder_output, - paddle::Tensor& new_self_cache_key, - paddle::Tensor& new_self_cache_value, - paddle::Tensor& new_mem_cache, - int n_head, - int size_per_head, - int memory_hidden_dim, - bool is_fuse_qkv) { - auto stream = memory_tensor.stream(); - cublasHandle_t cublas_handle_; - cublasCreate(&cublas_handle_); - cublasLtHandle_t cublaslt_handle_; - cublasLtCreate(&cublaslt_handle_); - cublasSetStream(cublas_handle_, stream); - - std::vector ret; - - switch (memory_tensor.type()) { - 
case paddle::DataType::FLOAT16: { - ret = decoder_kernel(from_tensor, - memory_tensor, - mem_seq_len, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - old_self_cache_key, - old_self_cache_value, - old_mem_cache, - step, - decoder_output, - new_self_cache_key, - new_self_cache_value, - new_mem_cache, - n_head, - size_per_head, - memory_hidden_dim, - is_fuse_qkv, - cublas_handle_, - cublaslt_handle_, - stream); - break; - } - case paddle::DataType::FLOAT32: { - ret = decoder_kernel(from_tensor, - memory_tensor, - mem_seq_len, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - old_self_cache_key, - old_self_cache_value, - old_mem_cache, - step, - decoder_output, - new_self_cache_key, - new_self_cache_value, - new_mem_cache, - n_head, - size_per_head, - memory_hidden_dim, - is_fuse_qkv, - cublas_handle_, - cublaslt_handle_, - stream); - break; - } - default: { - PD_THROW( - "NOT supported data type. " - "Only float16 and float32 are supported. "); - break; - } - } - cublasDestroy(cublas_handle_); - cublasLtDestroy(cublaslt_handle_); - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_decoder_op.h b/paddlenlp/ops/fast_transformer/src/fusion_decoder_op.h deleted file mode 100644 index e9cc413b42dc..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_decoder_op.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#pragma once - -#include -#include - -#include "fastertransformer/open_decoder.h" -#include "fastertransformer/utils/common.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - - -std::vector DecoderCUDAForward( - const paddle::Tensor& from_tensor, - const paddle::Tensor& memory_tensor, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& self_ln_weight, - const paddle::Tensor& self_ln_bias, - const paddle::Tensor& self_q_weight, - const paddle::Tensor& self_q_bias, - const paddle::Tensor& self_k_weight, - const paddle::Tensor& self_k_bias, - const paddle::Tensor& self_v_weight, - const paddle::Tensor& self_v_bias, - const paddle::Tensor& self_out_weight, - const paddle::Tensor& self_out_bias, - const paddle::Tensor& cross_ln_weight, - const paddle::Tensor& cross_ln_bias, - const paddle::Tensor& cross_q_weight, - const paddle::Tensor& cross_q_bias, - const paddle::Tensor& cross_k_weight, - const paddle::Tensor& cross_k_bias, - const paddle::Tensor& cross_v_weight, - const paddle::Tensor& cross_v_bias, - const paddle::Tensor& cross_out_weight, - const paddle::Tensor& cross_out_bias, - const paddle::Tensor& ffn_ln_weight, - const paddle::Tensor& ffn_ln_bias, - const paddle::Tensor& ffn_inter_weight, - const paddle::Tensor& ffn_inter_bias, - const paddle::Tensor& ffn_out_weight, - const paddle::Tensor& ffn_out_bias, - const paddle::Tensor& old_self_cache_key, - const paddle::Tensor& old_self_cache_value, - const paddle::Tensor& old_mem_cache, - const int step, - paddle::Tensor& decoder_output, - paddle::Tensor& new_self_cache_key, - paddle::Tensor& new_self_cache_value, - paddle::Tensor& new_mem_cache, - int n_head, - int size_per_head, - int memory_hidden_dim, - bool is_fuse_qkv); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_decoding_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_decoding_op.cc deleted file mode 100644 index 3607f70961fb..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_decoding_op.cc +++ /dev/null @@ -1,337 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include - -#include "fusion_decoding_op.h" -#include "pd_traits.h" - - -std::vector DecodingForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const bool& rel_len, - const float& alpha) { - int batch_size = input.shape()[0]; - int max_out_len = rel_len ? max_len + input.shape()[1] : max_len; - - std::vector output_dims; - std::vector parent_ids_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - batch_size /= beam_size; - output_dims = {max_out_len, batch_size, beam_size}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "beam_search_v2") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_out_len, batch_size, beam_size * 2}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - output_dims = {max_out_len, batch_size}; - parent_ids_dims = {1}; - } else { - PD_THROW("Not supported decoding strategy. 
"); - } - - if (input.place() == paddle::PlaceType::kGPU) { - auto output_ids = paddle::Tensor(paddle::PlaceType::kGPU, output_dims); - auto parent_ids = paddle::Tensor(paddle::PlaceType::kGPU, parent_ids_dims); - auto sequence_length = - paddle::Tensor(paddle::PlaceType::kGPU, sequence_length_dims); - - paddle::Tensor seq_len = paddle::Tensor(paddle::PlaceType::kGPU); - - if (mem_seq_len.place() != paddle::PlaceType::kGPU) { - seq_len = mem_seq_len.copy_to(paddle::PlaceType::kGPU); - } else { - seq_len = mem_seq_len; - } - - return DecodingCUDAForward(input, - seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_out_len, - beam_search_diversity_rate, - alpha); - } else { - PD_THROW("Not implemented place. Only GPU is supported. "); - } -} - -std::vector> DecodingInferShape( - const std::vector& input_shape, - const std::vector& mem_seq_len_shape, - const std::vector& word_embedding_shape, - const std::vector>& self_ln_weight_shapes, - const std::vector>& self_ln_bias_shapes, - const std::vector>& self_q_weight_shapes, - const std::vector>& self_q_bias_shapes, - const std::vector>& self_k_weight_shapes, - const std::vector>& self_k_bias_shapes, - const std::vector>& self_v_weight_shapes, - const std::vector>& self_v_bias_shapes, - const std::vector>& self_out_weight_shapes, - const std::vector>& self_out_bias_shapes, - const std::vector>& cross_ln_weight_shapes, - const std::vector>& cross_ln_bias_shapes, - const std::vector>& cross_q_weight_shapes, - const std::vector>& cross_q_bias_shapes, - const std::vector>& cross_k_weight_shapes, - const std::vector>& cross_k_bias_shapes, - const std::vector>& cross_v_weight_shapes, - const std::vector>& cross_v_bias_shapes, - const std::vector>& cross_out_weight_shapes, - const std::vector>& cross_out_bias_shapes, - const std::vector>& ffn_ln_weight_shapes, - const std::vector>& ffn_ln_bias_shapes, - const std::vector>& ffn_inter_weight_shapes, - const std::vector>& ffn_inter_bias_shapes, - const std::vector>& ffn_out_weight_shapes, - const std::vector>& ffn_out_bias_shapes, - const std::vector& decoder_ln_weight_shape, - const std::vector& decoder_ln_bias_shape, - const std::vector& embedding_weight_shape, - const std::vector& embedding_bias_shape, - const std::vector& positional_embedding_weight_shape, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const bool& rel_len, - const float& alpha) { - int batch_size = input_shape[0]; - - std::vector output_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - if (batch_size != -1) { - batch_size /= beam_size; - } - output_dims = {max_len, 
batch_size, beam_size}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "beam_search_v2") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_len, batch_size, beam_size * 2}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - output_dims = {max_len, batch_size}; - return {output_dims, {1}, sequence_length_dims}; - } else { - PD_THROW("Not supported decoding strategy. "); - } -} - -std::vector DecodingInferDtype( - const paddle::DataType& input, - const paddle::DataType& mem_seq_len, - const paddle::DataType& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::DataType& decoder_ln_weight, - const paddle::DataType& decoder_ln_bias, - const paddle::DataType& embedding_weight, - const paddle::DataType& embedding_bias, - const paddle::DataType& positional_embedding_weight) { - return {paddle::DataType::INT32, - paddle::DataType::INT32, - paddle::DataType::INT32}; -} - -PD_BUILD_OP(fusion_decoding) - .Inputs({"Input", - "MemSeqLen", - "WordEmbedding", - paddle::Vec("SelfLayernormWeight"), - paddle::Vec("SelfLayernormBias"), - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfQueryBias"), - paddle::Vec("SelfKeyWeight"), - paddle::Vec("SelfKeyBias"), - paddle::Vec("SelfValueWeight"), - paddle::Vec("SelfValueBias"), - paddle::Vec("SelfOutWeight"), - paddle::Vec("SelfOutBias"), - paddle::Vec("CrossLayernormWeight"), - paddle::Vec("CrossLayernormBias"), - paddle::Vec("CrossQueryWeight"), - paddle::Vec("CrossQueryBias"), - paddle::Vec("CrossKeyWeight"), - paddle::Vec("CrossKeyBias"), - paddle::Vec("CrossValueWeight"), - paddle::Vec("CrossValueBias"), - paddle::Vec("CrossOutWeight"), - paddle::Vec("CrossOutBias"), - paddle::Vec("FFNLayernormWeight"), - paddle::Vec("FFNLayernormBias"), - paddle::Vec("FFNInterWeight"), - paddle::Vec("FFNInterBias"), - paddle::Vec("FFNOutWeight"), - paddle::Vec("FFNOutBias"), - "DecoderLayernormWeight", - "DecoderLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb"}) - .Outputs({"OutputIds", "ParentIds", "SequenceLength"}) - .Attrs({"decoding_strategy: std::string", - "beam_size: int", - "topk: int", - "topp: float", - "n_head: int", - "size_per_head: int", - "num_layer: int", - "bos_id: int", - "eos_id: int", - "max_len: 
int64_t", - "beam_search_diversity_rate: float", - "rel_len: bool", - "alpha: float"}) - .SetKernelFn(PD_KERNEL(DecodingForward)) - .SetInferShapeFn(PD_INFER_SHAPE(DecodingInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(DecodingInferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_decoding_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_decoding_op.cu deleted file mode 100644 index 3072b19709a7..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_decoding_op.cu +++ /dev/null @@ -1,538 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include -#include -#include - -#include -#include -#include -#include -#include - - -#include "fusion_decoding_op.h" -#include "pd_traits.h" - -template -std::vector decoding_kernel( - const paddle::Tensor& input, - const paddle::Tensor& memory_sequence_length, - const paddle::Tensor& word_emb, - const std::vector& self_layernorm_weight, - const std::vector& self_layernorm_bias, - const std::vector& self_attn_query_weight, - const std::vector& self_attn_query_bias, - const std::vector& self_attn_key_weight, - const std::vector& self_attn_key_bias, - const std::vector& self_attn_value_weight, - const std::vector& self_attn_value_bias, - const std::vector& self_attn_output_weight, - const std::vector& self_attn_output_bias, - const std::vector& cross_layernorm_weight, - const std::vector& cross_layernorm_bias, - const std::vector& cross_attn_query_weight, - const std::vector& cross_attn_query_bias, - const std::vector& cross_attn_key_weight, - const std::vector& cross_attn_key_bias, - const std::vector& cross_attn_value_weight, - const std::vector& cross_attn_value_bias, - const std::vector& cross_attn_output_weight, - const std::vector& cross_attn_output_bias, - const std::vector& ffn_layernorm_weight, - const std::vector& ffn_layernorm_bias, - const std::vector& ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - const paddle::Tensor& decoder_layernorm_weight, - const paddle::Tensor& decoder_layernorm_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& position_encoding_table, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& head_num_, - const int& size_per_head_, - const int& num_layer_, - const int& start_id_, - const int& end_id_, - const int64_t& max_seq_len_, - const float& beam_search_diversity_rate_, - const float& alpha, - cudaStream_t stream) { - int beam_width_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2") - ? beam_size - : 1; - int candidate_num_ = (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") - ? 
topk - : 1; - float probability_threshold_ = (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") - ? topp - : 0.0; - - auto input_dims = input.shape(); - int batch_size_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2") - ? input_dims[0] / beam_width_ - : input_dims[0]; - const int memory_max_seq_len = input_dims[1]; - const int memory_hidden_dim = input_dims[2]; - const int vocab_size = word_emb.shape()[0]; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - DecodingInitParam decoding_params; - decoding_params.cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - decoding_params.cublaslt_handle = - CublasHandle::GetInstance()->cublaslt_handle_; - - decoding_params.output_ids = output_ids.mutable_data(input.place()); - decoding_params.parent_ids = parent_ids.mutable_data(input.place()); - decoding_params.sequence_length = - sequence_length.mutable_data(input.place()); - - typedef DecoderTransformerTraits DecodingTraits_; - decoding_params.stream = stream; - fastertransformer::Allocator allocator_(stream); - - decoding_params.memory_tensor = - reinterpret_cast(input.data()); - decoding_params.memory_sequence_length = memory_sequence_length.data(); - - DecoderInitParam* params = - new DecoderInitParam[num_layer_]; - - int inner_coeff = ffn_intermediate_weight[0].shape()[1] / memory_hidden_dim; - - auto q_weight_shape = self_attn_query_weight[0].shape(); - auto k_weight_shape = self_attn_key_weight[0].shape(); - bool fuse_qkv = (q_weight_shape[1] == k_weight_shape[1]) ? false : true; - - for (int i = 0; i < num_layer_; i++) { - params[i].stream = stream; - params[i].cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - params[i].cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - - if (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2") { - params[i].request_batch_size = batch_size_ * beam_width_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } else if (decoding_strategy == "sampling" || - decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - params[i].request_batch_size = batch_size_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } - - // self attn - params[i].self_layernorm.gamma = reinterpret_cast( - self_layernorm_weight[i].data()); - params[i].self_layernorm.beta = reinterpret_cast( - self_layernorm_bias[i].data()); - // query - params[i].self_attention.query_weight.kernel = - reinterpret_cast( - self_attn_query_weight[i].data()); - params[i].self_attention.query_weight.bias = - reinterpret_cast( - self_attn_query_bias[i].data()); - // key - params[i].self_attention.key_weight.kernel = - reinterpret_cast( - self_attn_key_weight[i].data()); - params[i].self_attention.key_weight.bias = - reinterpret_cast( - self_attn_key_bias[i].data()); - // value - params[i].self_attention.value_weight.kernel = - reinterpret_cast( - self_attn_value_weight[i].data()); - params[i].self_attention.value_weight.bias = - reinterpret_cast( - self_attn_value_bias[i].data()); - // out proj - params[i].self_attention.attention_output_weight.kernel = - reinterpret_cast( - self_attn_output_weight[i].data()); - params[i].self_attention.attention_output_weight.bias = - reinterpret_cast( - self_attn_output_bias[i].data()); - - // cross - params[i].cross_layernorm.gamma = reinterpret_cast( - cross_layernorm_weight[i].data()); - params[i].cross_layernorm.beta = 
reinterpret_cast( - cross_layernorm_bias[i].data()); - // query - params[i].cross_attention.query_weight.kernel = - reinterpret_cast( - cross_attn_query_weight[i].data()); - params[i].cross_attention.query_weight.bias = - reinterpret_cast( - cross_attn_query_bias[i].data()); - // key - params[i].cross_attention.key_weight.kernel = - reinterpret_cast( - cross_attn_key_weight[i].data()); - params[i].cross_attention.key_weight.bias = - reinterpret_cast( - cross_attn_key_bias[i].data()); - // value - params[i].cross_attention.value_weight.kernel = - reinterpret_cast( - cross_attn_value_weight[i].data()); - params[i].cross_attention.value_weight.bias = - reinterpret_cast( - cross_attn_value_bias[i].data()); - // out proj - params[i].cross_attention.attention_output_weight.kernel = - reinterpret_cast( - cross_attn_output_weight[i].data()); - params[i].cross_attention.attention_output_weight.bias = - reinterpret_cast( - cross_attn_output_bias[i].data()); - - // ffn - params[i].ffn_layernorm.gamma = reinterpret_cast( - ffn_layernorm_weight[i].data()); - params[i].ffn_layernorm.beta = reinterpret_cast( - ffn_layernorm_bias[i].data()); - // intermediate proj - params[i].ffn.intermediate_weight.kernel = - reinterpret_cast( - ffn_intermediate_weight[i].data()); - params[i].ffn.intermediate_weight.bias = reinterpret_cast( - ffn_intermediate_bias[i].data()); - // out proj - params[i].ffn.output_weight.kernel = reinterpret_cast( - ffn_output_weight[i].data()); - params[i].ffn.output_weight.bias = - reinterpret_cast(ffn_output_bias[i].data()); - } - - decoding_params.layernorm.gamma = reinterpret_cast( - decoder_layernorm_weight.data()); - decoding_params.layernorm.beta = reinterpret_cast( - decoder_layernorm_bias.data()); - // for embedding - decoding_params.embedding_table = - reinterpret_cast(word_emb.data()); - - // for weight sharing matmul - decoding_params.embedding_kernel = - reinterpret_cast(embedding_weight.data()); - // for matmul bias - decoding_params.embedding_bias = - reinterpret_cast(embedding_bias.data()); - - decoding_params.position_encoding_table = reinterpret_cast( - position_encoding_table.data()); - - if ("beam_search" == decoding_strategy) { - DecodingBeamsearch* decoding_beam_search_; - decoding_beam_search_ = new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, // is_fuse_topk_softMax - fuse_qkv, - false, // keep_alive_beam - 0.6, // alpha - true, // normalization_before - 0, // pos_offset - ActivationType::RELU, // act - false, // pos_bias - false, // prefix_lm - -1, // finished_candidate_num - false, // early_stopping - false, // is_mbart - 0, // min_length - inner_coeff); - - decoding_beam_search_->forward(params, decoding_params); - - delete decoding_beam_search_; - } else if ("beam_search_v2" == decoding_strategy) { - DecodingBeamsearch* decoding_beam_search_; - decoding_beam_search_ = new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, // is_fuse_topk_softMax - fuse_qkv, - true, // keep_alive_beam - alpha, - true, // normalization_before - 0, // pos_offset - ActivationType::RELU, // act - false, // pos_bias - false, // prefix_lm - -1, // finished_candidate_num - false, // early_stopping - 
false, // is_mbart - 0, // min_length - inner_coeff); - - decoding_beam_search_->forward(params, decoding_params); - - delete decoding_beam_search_; - } else if ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || - "sampling" == decoding_strategy) { - DecodingSampling* decoding_sampling_; - decoding_sampling_ = new DecodingSampling( - allocator_, - batch_size_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - candidate_num_, - probability_threshold_, - fuse_qkv, - true, // normalization_before - 0, // pos_offset - ActivationType::RELU, // act - false, // pos_bias - 1.0, // temperature - 1.0, // repeat_penalty - false, // prefix_lm - false, // is_mbart - 0, // min_length - inner_coeff); - - decoding_sampling_->forward(params, decoding_params); - - delete decoding_sampling_; - } else { - PD_THROW( - "Only beam_search, topk_sampling and topp_sampling are supported for " - "FastGeneration. "); - } - delete[] params; - - return {output_ids, parent_ids, sequence_length}; -} - -std::vector DecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const float& alpha) { - auto stream = input.stream(); - - cublasSetStream(CublasHandle::GetInstance()->cublas_handle_, stream); - - std::vector ret; - - switch (input.type()) { - case paddle::DataType::FLOAT16: { - ret = decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - 
decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - beam_search_diversity_rate, - alpha, - stream); - break; - } - case paddle::DataType::FLOAT32: { - ret = decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - beam_search_diversity_rate, - alpha, - stream); - break; - } - default: { - PD_THROW( - "NOT supported data type. " - "Only float16 and float32 are supported. "); - break; - } - } - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_decoding_op.h b/paddlenlp/ops/fast_transformer/src/fusion_decoding_op.h deleted file mode 100644 index 419649092abe..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_decoding_op.h +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#pragma once - -#include -#include - -#include "cublas_handle.h" - -#include "fastertransformer/decoding_beamsearch.h" -#include "fastertransformer/decoding_sampling.h" -#include "fastertransformer/open_decoder.h" -#include "fastertransformer/utils/common.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - - -std::vector DecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const float& alpha); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_encoder_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_encoder_op.cc deleted file mode 100644 index a3a97b92461a..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_encoder_op.cc +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "fusion_encoder_op.h" - -std::vector EncoderForward( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const std::vector& attn_query_weight, - const std::vector& attn_query_bias, - const std::vector& attn_key_weight, - const std::vector& attn_key_bias, - const std::vector& attn_value_weight, - const std::vector& attn_value_bias, - const std::vector& attn_output_weight, - const std::vector& attn_output_bias, - const std::vector& attn_output_layernorm_weight, - const std::vector& attn_output_layernorm_bias, - const std::vector& output_layernorm_weight, - const std::vector& output_layernorm_bias, - const std::vector& ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - // const paddle::Tensor& sequence_id_offset, - // const paddle::Tensor& trt_seqlen_offset, - // const paddle::Tensor& amax_list, - const int64_t& head_num, - const int64_t& size_per_head, - const bool& use_gelu, - const bool& remove_padding, - const int64_t& int8_mode, - const int64_t& num_layer, - const int64_t& layer_idx, - const bool& allow_gemm_test, - const bool& use_trt_kernel, - const bool& normalize_before) { - if (input.place() == paddle::PlaceType::kGPU) { - auto shape = input.shape(); - std::vector encoder_out({ - paddle::Tensor(paddle::PlaceType::kGPU, shape), paddle::Tensor(paddle::PlaceType::kGPU, shape) - }); - - return EncoderCUDAForward(input, - attn_mask, - attn_query_weight, - attn_query_bias, - attn_key_weight, - attn_key_bias, - attn_value_weight, - attn_value_bias, - attn_output_weight, - attn_output_bias, - attn_output_layernorm_weight, - attn_output_layernorm_bias, - output_layernorm_weight, - output_layernorm_bias, - ffn_intermediate_weight, - ffn_intermediate_bias, - ffn_output_weight, - ffn_output_bias, - // sequence_id_offset, - // trt_seqlen_offset, - // amax_list, - encoder_out, - head_num, - size_per_head, - use_gelu, - remove_padding, - int8_mode, // no support now - num_layer, - layer_idx, - allow_gemm_test, - use_trt_kernel, - normalize_before); - } else { - PD_THROW("Not implemented place. Only GPU is supported. 
"); - } -} - -std::vector> EncoderInferShape( - const std::vector& input_shape, - const std::vector& attn_mask_shape, - const std::vector>& attn_query_weight_shape, - const std::vector>& attn_query_bias_shape, - const std::vector>& attn_key_weight_shape, - const std::vector>& attn_key_bias_shape, - const std::vector>& attn_value_weight_shape, - const std::vector>& attn_value_bias_shape, - const std::vector>& attn_output_weight_shape, - const std::vector>& attn_output_bias_shape, - const std::vector>& attn_output_layernorm_weight_shape, - const std::vector>& attn_output_layernorm_bias_shape, - const std::vector>& output_layernorm_weight_shape, - const std::vector>& output_layernorm_bias_shape, - const std::vector>& ffn_intermediate_weight_shape, - const std::vector>& ffn_intermediate_bias_shape, - const std::vector>& ffn_output_weight_shape, - const std::vector>& ffn_output_bias_shape, - // const std::vector& sequence_id_offset, - // const std::vector& trt_seqlen_offset, - // const std::vector& amax_list_shape, - const int64_t& head_num, - const int64_t& size_per_head, - const bool& use_gelu, - const bool& remove_padding, - const int64_t& int8_mode, // no support now - const int64_t& num_layer, - const int64_t& layer_idx, - const bool& allow_gemm_test, - const bool& use_trt_kernel, - const bool& normalize_before) { - return {input_shape}; -} - - -std::vector EncoderInferDtype( - const paddle::DataType& input, - const paddle::DataType& attn_mask, - const std::vector& attn_query_weight, - const std::vector& attn_query_bias, - const std::vector& attn_key_weight, - const std::vector& attn_key_bias, - const std::vector& attn_value_weight, - const std::vector& attn_value_bias, - const std::vector& attn_output_weight, - const std::vector& attn_output_bias, - const std::vector& attn_output_layernorm_weight, - const std::vector& attn_output_layernorm_bias, - const std::vector& output_layernorm_weight, - const std::vector& output_layernorm_bias, - const std::vector& ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias) { - // const paddle::DataType& sequence_id_offset, - // const paddle::DataType& trt_seqlen_offset, - // const paddle::DataType& amax_list) { - return {input}; -} - -PD_BUILD_OP(fusion_encoder) - .Inputs({ - "Input", - "SelfAttnMask", - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfQueryBias"), - paddle::Vec("SelfKeyWeight"), - paddle::Vec("SelfKeyBias"), - paddle::Vec("SelfValueWeight"), - paddle::Vec("SelfValueBias"), - paddle::Vec("SelfAttnOutputWeight"), - paddle::Vec("SelfAttnOutputBias"), - paddle::Vec("SelfAttnOutputLayernormWeight"), - paddle::Vec("SelfAttnOutputLayernormBias"), - paddle::Vec("OutputLayernormWeight"), - paddle::Vec("OutputLayernormBias"), - paddle::Vec("FFNInterWeight"), - paddle::Vec("FFNInterBias"), - paddle::Vec("FFNOutputWeight"), - paddle::Vec("FFNOutputBias"), - // "SequenceIdOffset", - // "TRTSeqLenOffset", - // "AmaxList", - }) - .Outputs({"EncoderOut"}) - .Attrs({"head_num: int64_t", - "size_per_head: int64_t", - "use_gelu: bool", - "remove_padding: bool", - "int8_mode: int64_t", - "num_layer: int64_t", - "layer_idx: int64_t", - "allow_gemm_test: bool", - "use_trt_kernel: bool", - "normalize_before: bool"}) - .SetKernelFn(PD_KERNEL(EncoderForward)) - .SetInferShapeFn(PD_INFER_SHAPE(EncoderInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(EncoderInferDtype)); \ No newline at end of file diff --git a/paddlenlp/ops/fast_transformer/src/fusion_encoder_op.cu 
b/paddlenlp/ops/fast_transformer/src/fusion_encoder_op.cu deleted file mode 100644 index 2fe897147ef8..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_encoder_op.cu +++ /dev/null @@ -1,443 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cublas_handle.h" -#include "fastertransformer/bert_encoder_transformer.h" - -#include "fastertransformer/cuda/cuda_kernels.h" -#include "fastertransformer/standard_encoder.h" -#include "fusion_encoder_op.h" -#include "pd_traits.h" - - -template -std::vector encoder_kernel( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const std::vector& attn_query_weight, - const std::vector& attn_query_bias, - const std::vector& attn_key_weight, - const std::vector& attn_key_bias, - const std::vector& attn_value_weight, - const std::vector& attn_value_bias, - const std::vector& attn_output_weight, - const std::vector& attn_output_bias, - /* - When calling BertEncoderTransformer(Post-Norm): - norm1 coresponds to BertInitParam.self_layernorm - norm2 coresponds to BertInitParam.ffn_layernorm - When calling OpenEncoder(Pre-Norm): - norm1 coresponds to EncoderInitParam.input_layernorm - norm2 coresponds to EncoderInitParam.self_layernorm - */ - const std::vector& norm1_weight, - const std::vector& norm1_bias, - const std::vector& norm2_weight, - const std::vector& norm2_bias, - const std::vector& ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - // const paddle::Tensor& sequence_id_offset, - // const paddle::Tensor& trt_seqlen_offset, - // const paddle::Tensor& amax_list, - std::vector& encoder_out, - int64_t head_num_, - int64_t size_per_head_, - bool use_gelu, - bool remove_padding, - int64_t int8_mode, // no support now - int64_t num_layer_, - int64_t layer_idx_, - bool allow_gemm_test, - bool use_trt_kernel_, - bool normalize_before, - cudaStream_t stream) { - - auto input_shape = input.shape(); - int batch_size_ = input_shape[0]; - int max_seq_len_ = input_shape[1]; - typedef PDTraits traits_; - - fastertransformer::Allocator* allocator_ = - new fastertransformer::Allocator(stream); - - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - int in_id = 0; - int layers = attn_query_weight.size(); - - if (normalize_before == false) { - typedef BertEncoderTransformerTraits - EncoderTraits_; - - // Post-Normalization - BertInitParam encoder_param; - - encoder_param.stream = stream; - encoder_param.cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - encoder_param.cublaslt_handle = - CublasHandle::GetInstance()->cublaslt_handle_; - - encoder_param.attr_mask = - reinterpret_cast(attn_mask.data()); - - BertEncoderTransformer* encoder = - new BertEncoderTransformer( - int8_mode, allow_gemm_test, use_gelu); - - encoder->allocateBuffer(allocator_, 
- batch_size_, - max_seq_len_, - max_seq_len_, - head_num_, - size_per_head_, - use_trt_kernel_); - - std::vector enc_buf({ - encoder_out[0].mutable_data(input.place()), - encoder_out[1].mutable_data(input.place())}); - - for (int layer = 0; layer < layers; ++layer) { - in_id = layer & 0x1; - - if (0 == layer) { - encoder_param.from_tensor = reinterpret_cast( - input.data()); - encoder_param.to_tensor = reinterpret_cast( - input.data()); - encoder_param.transformer_out = reinterpret_cast( - enc_buf[1 - in_id]); - } else { - encoder_param.from_tensor = reinterpret_cast( - enc_buf[in_id]); - encoder_param.to_tensor = reinterpret_cast( - enc_buf[in_id]); - encoder_param.transformer_out = reinterpret_cast( - enc_buf[1 - in_id]); - } - - // self attn - encoder_param.self_attention.query_weight.kernel = - reinterpret_cast(attn_query_weight[layer].data()); - encoder_param.self_attention.query_weight.bias = - reinterpret_cast(attn_query_bias[layer].data()); - encoder_param.self_attention.key_weight.kernel = - reinterpret_cast(attn_key_weight[layer].data()); - encoder_param.self_attention.key_weight.bias = - reinterpret_cast(attn_key_bias[layer].data()); - encoder_param.self_attention.value_weight.kernel = - reinterpret_cast(attn_value_weight[layer].data()); - encoder_param.self_attention.value_weight.bias = - reinterpret_cast(attn_value_bias[layer].data()); - encoder_param.self_attention.attention_output_weight.kernel = - reinterpret_cast(attn_output_weight[layer].data()); - encoder_param.self_attention.attention_output_weight.bias = - reinterpret_cast(attn_output_bias[layer].data()); - - // self_attn_layer_norm - encoder_param.self_layernorm.gamma = - reinterpret_cast(norm1_weight[layer].data()); - encoder_param.self_layernorm.beta = - reinterpret_cast(norm1_bias[layer].data()); - encoder_param.ffn.intermediate_weight.kernel = - reinterpret_cast( - ffn_intermediate_weight[layer].data()); - encoder_param.ffn.intermediate_weight.bias = - reinterpret_cast( - ffn_intermediate_bias[layer].data()); - - encoder_param.ffn.output_weight.kernel = - reinterpret_cast(ffn_output_weight[layer].data()); - encoder_param.ffn.output_weight.bias = - reinterpret_cast(ffn_output_bias[layer].data()); - - // ffn_layer_norm - encoder_param.ffn_layernorm.gamma = - reinterpret_cast(norm2_weight[layer].data()); - encoder_param.ffn_layernorm.beta = - reinterpret_cast(norm2_bias[layer].data()); - - int valid_word_num; - - encoder_param.sequence_id_offset = nullptr; - valid_word_num = batch_size_ * max_seq_len_; - - encoder_param.valid_word_num = valid_word_num; - - encoder_param.trt_seqlen_offset = nullptr; - encoder_param.trt_seqlen_size = batch_size_ + 1; - - encoder_param.amaxList = nullptr; - - encoder->initialize(encoder_param); - encoder->forward(); - } - - encoder->freeBuffer(); - - delete allocator_; - delete encoder; - } else { - typedef OpenEncoderTraits - OpenEncoderTraits_; - - // Pre-Normalization - EncoderInitParam encoder_param; - - encoder_param.stream = stream; - encoder_param.cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - encoder_param.cublaslt_handle = - CublasHandle::GetInstance()->cublaslt_handle_; - encoder_param.attr_mask = - reinterpret_cast(attn_mask.data()); - - OpenEncoder* encoder = - new OpenEncoder( - int8_mode, allow_gemm_test, use_gelu); - - encoder->allocateBuffer(allocator_, - batch_size_, - max_seq_len_, - max_seq_len_, - head_num_, - size_per_head_, - use_trt_kernel_); - - for (int layer = 0; layer < layers; ++layer) { - in_id = layer & 0x1; - - if (0 == layer) { - 
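        // Ping-pong buffering across layers: in_id = layer & 0x1 alternates the
        // read/write roles of the two pre-allocated encoder_out tensors. Layer 0
        // reads the op input directly; every later layer reads the previous
        // layer's output buffer and writes the other one, so the final hidden
        // states end up in encoder_out[1 - in_id].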
encoder_param.from_tensor = reinterpret_cast( - input.data()); - encoder_param.to_tensor = reinterpret_cast( - input.data()); - encoder_param.transformer_out = reinterpret_cast( - encoder_out[1 - in_id].mutable_data(input.place())); - } else { - encoder_param.from_tensor = reinterpret_cast( - encoder_out[in_id].data()); - encoder_param.to_tensor = reinterpret_cast( - encoder_out[in_id].data()); - encoder_param.transformer_out = reinterpret_cast( - encoder_out[1 - in_id].mutable_data(input.place())); - } - - // self attn - encoder_param.self_attention.query_weight.kernel = - reinterpret_cast(attn_query_weight[layer].data()); - encoder_param.self_attention.query_weight.bias = - reinterpret_cast(attn_query_bias[layer].data()); - encoder_param.self_attention.key_weight.kernel = - reinterpret_cast(attn_key_weight[layer].data()); - encoder_param.self_attention.key_weight.bias = - reinterpret_cast(attn_key_bias[layer].data()); - encoder_param.self_attention.value_weight.kernel = - reinterpret_cast(attn_value_weight[layer].data()); - encoder_param.self_attention.value_weight.bias = - reinterpret_cast(attn_value_bias[layer].data()); - encoder_param.self_attention.attention_output_weight.kernel = - reinterpret_cast(attn_output_weight[layer].data()); - encoder_param.self_attention.attention_output_weight.bias = - reinterpret_cast(attn_output_bias[layer].data()); - - // Spicific for Pre-Normalization - encoder_param.input_layernorm.gamma = - reinterpret_cast(norm1_weight[layer].data()); - encoder_param.input_layernorm.beta = - reinterpret_cast(norm1_bias[layer].data()); - - encoder_param.self_layernorm.gamma = - reinterpret_cast(norm2_weight[layer].data()); - encoder_param.self_layernorm.beta = - reinterpret_cast(norm2_bias[layer].data()); - - encoder_param.ffn.intermediate_weight.kernel = - reinterpret_cast( - ffn_intermediate_weight[layer].data()); - encoder_param.ffn.intermediate_weight.bias = - reinterpret_cast( - ffn_intermediate_bias[layer].data()); - - encoder_param.ffn.output_weight.kernel = - reinterpret_cast(ffn_output_weight[layer].data()); - encoder_param.ffn.output_weight.bias = - reinterpret_cast(ffn_output_bias[layer].data()); - - int valid_word_num; - encoder_param.sequence_id_offset = nullptr; - valid_word_num = batch_size_ * max_seq_len_; - - encoder_param.valid_word_num = valid_word_num; - - encoder_param.trt_seqlen_offset = - nullptr; // trt_seqlen_offset.data(); - encoder_param.trt_seqlen_size = batch_size_ + 1; - - encoder_param.amaxList = nullptr; - - encoder->initialize(encoder_param); - encoder->forward(); - } - - encoder->freeBuffer(); - delete allocator_; - delete encoder; - } - - return {encoder_out[1 - in_id]}; -} - -std::vector EncoderCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const std::vector& attn_query_weight, - const std::vector& attn_query_bias, - const std::vector& attn_key_weight, - const std::vector& attn_key_bias, - const std::vector& attn_value_weight, - const std::vector& attn_value_bias, - const std::vector& attn_output_weight, - const std::vector& attn_output_bias, - /* - When calling BertEncoderTransformer(Post-Norm): - norm1 coresponds to BertInitParam.self_layernorm - norm2 coresponds to BertInitParam.ffn_layernorm - When calling OpenEncoder(Pre-Norm): - norm1 coresponds to EncoderInitParam.input_layernorm - norm2 coresponds to EncoderInitParam.self_layernorm - */ - const std::vector& norm1_weight, - const std::vector& norm1_bias, - const std::vector& norm2_weight, - const std::vector& norm2_bias, - const std::vector& 
ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - // const paddle::Tensor& sequence_id_offset, - // const paddle::Tensor& trt_seqlen_offset, - // const paddle::Tensor& amax_list, - std::vector& encoder_out, - int64_t head_num, - int64_t size_per_head, - bool use_gelu, - bool remove_padding, - int64_t int8_mode, - int64_t num_layer, - int64_t layer_idx, - bool allow_gemm_test, - bool use_trt_kernel, - bool normalize_before) { - auto stream = input.stream(); - - cublasSetStream(CublasHandle::GetInstance()->cublas_handle_, stream); - - std::vector ret; - - switch (input.type()) { - case paddle::DataType::FLOAT16: { - ret = encoder_kernel(input, - attn_mask, - attn_query_weight, - attn_query_bias, - attn_key_weight, - attn_key_bias, - attn_value_weight, - attn_value_bias, - attn_output_weight, - attn_output_bias, - norm1_weight, - norm1_bias, - norm2_weight, - norm2_bias, - ffn_intermediate_weight, - ffn_intermediate_bias, - ffn_output_weight, - ffn_output_bias, - // sequence_id_offset, - // trt_seqlen_offset, - // amax_list, - encoder_out, - head_num, - size_per_head, - use_gelu, - remove_padding, - int8_mode, - num_layer, - layer_idx, - allow_gemm_test, - use_trt_kernel, - normalize_before, - stream); - - break; - } - case paddle::DataType::FLOAT32: { - ret = encoder_kernel(input, - attn_mask, - attn_query_weight, - attn_query_bias, - attn_key_weight, - attn_key_bias, - attn_value_weight, - attn_value_bias, - attn_output_weight, - attn_output_bias, - norm1_weight, - norm1_bias, - norm2_weight, - norm2_bias, - ffn_intermediate_weight, - ffn_intermediate_bias, - ffn_output_weight, - ffn_output_bias, - // sequence_id_offset, - // trt_seqlen_offset, - // amax_list, - encoder_out, - head_num, - size_per_head, - use_gelu, - remove_padding, - int8_mode, - num_layer, - layer_idx, - allow_gemm_test, - use_trt_kernel, - normalize_before, - stream); - break; - } - default: { - PD_THROW( - "NOT supported data type. " - "Only float16 and float32 are supported. "); - break; - } - } - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_encoder_op.h b/paddlenlp/ops/fast_transformer/src/fusion_encoder_op.h deleted file mode 100644 index f9427e4c0eca..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_encoder_op.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#pragma once - -#include -#include - -#include "fastertransformer/bert_encoder_transformer.h" -#include "fastertransformer/utils/common.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - - -std::vector EncoderCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const std::vector& attn_query_weight, - const std::vector& attn_query_bias, - const std::vector& attn_key_weight, - const std::vector& attn_key_bias, - const std::vector& attn_value_weight, - const std::vector& attn_value_bias, - const std::vector& attn_output_weight, - const std::vector& attn_output_bias, - const std::vector& norm1_weight, - const std::vector& norm1_bias, - const std::vector& norm2_weight, - const std::vector& norm2_bias, - const std::vector& ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - // const paddle::Tensor& sequence_id_offset, - // const paddle::Tensor& trt_seqlen_offset, - // const paddle::Tensor& amax_list, - std::vector& encoder_out, - int64_t head_num_, - int64_t size_per_head_, - bool use_gelu, - bool remove_padding, - int64_t int8_mode, // no support now - int64_t num_layer_, - int64_t layer_idx_, - bool allow_gemm_test, - bool use_trt_kernel_, - bool normalize_before); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.cc deleted file mode 100644 index 50892e56199b..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.cc +++ /dev/null @@ -1,340 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include - -#include "fusion_force_decoding_op.h" -#include "pd_traits.h" - - -std::vector DecodingForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& trg_word, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const bool& rel_len, - const float& alpha) { - int batch_size = input.shape()[0]; - int max_out_len = rel_len ? max_len + input.shape()[1] : max_len; - - std::vector output_dims; - std::vector parent_ids_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - batch_size /= beam_size; - output_dims = {max_out_len, batch_size, beam_size}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "beam_search_v2") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_out_len, batch_size, beam_size * 2}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "sampling") { - output_dims = {max_out_len, batch_size}; - parent_ids_dims = {1}; - } else { - PD_THROW("Not supported decoding strategy. 
"); - } - - if (input.place() == paddle::PlaceType::kGPU) { - auto output_ids = paddle::Tensor(paddle::PlaceType::kGPU, output_dims); - auto parent_ids = paddle::Tensor(paddle::PlaceType::kGPU, parent_ids_dims); - auto sequence_length = - paddle::Tensor(paddle::PlaceType::kGPU, sequence_length_dims); - - paddle::Tensor seq_len = paddle::Tensor(paddle::PlaceType::kGPU); - - if (mem_seq_len.place() != paddle::PlaceType::kGPU) { - seq_len = mem_seq_len.copy_to(paddle::PlaceType::kGPU); - } else { - seq_len = mem_seq_len; - } - - return DecodingCUDAForward(input, - seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - trg_word, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_out_len, - beam_search_diversity_rate, - alpha); - } else { - PD_THROW("Not implemented place. Only GPU is supported. "); - } -} - -std::vector> DecodingInferShape( - const std::vector& input_shape, - const std::vector& mem_seq_len_shape, - const std::vector& word_embedding_shape, - const std::vector>& self_ln_weight_shapes, - const std::vector>& self_ln_bias_shapes, - const std::vector>& self_q_weight_shapes, - const std::vector>& self_q_bias_shapes, - const std::vector>& self_k_weight_shapes, - const std::vector>& self_k_bias_shapes, - const std::vector>& self_v_weight_shapes, - const std::vector>& self_v_bias_shapes, - const std::vector>& self_out_weight_shapes, - const std::vector>& self_out_bias_shapes, - const std::vector>& cross_ln_weight_shapes, - const std::vector>& cross_ln_bias_shapes, - const std::vector>& cross_q_weight_shapes, - const std::vector>& cross_q_bias_shapes, - const std::vector>& cross_k_weight_shapes, - const std::vector>& cross_k_bias_shapes, - const std::vector>& cross_v_weight_shapes, - const std::vector>& cross_v_bias_shapes, - const std::vector>& cross_out_weight_shapes, - const std::vector>& cross_out_bias_shapes, - const std::vector>& ffn_ln_weight_shapes, - const std::vector>& ffn_ln_bias_shapes, - const std::vector>& ffn_inter_weight_shapes, - const std::vector>& ffn_inter_bias_shapes, - const std::vector>& ffn_out_weight_shapes, - const std::vector>& ffn_out_bias_shapes, - const std::vector& decoder_ln_weight_shape, - const std::vector& decoder_ln_bias_shape, - const std::vector& embedding_weight_shape, - const std::vector& embedding_bias_shape, - const std::vector& positional_embedding_weight_shape, - const std::vector& trg_word_shape, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const bool& rel_len, - const float& alpha) { - int batch_size = input_shape[0]; - - std::vector output_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - if (batch_size != -1) { - 
batch_size /= beam_size; - } - output_dims = {max_len, batch_size, beam_size}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "beam_search_v2") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_len, batch_size, beam_size * 2}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "sampling") { - output_dims = {max_len, batch_size}; - return {output_dims, {1}, sequence_length_dims}; - } else { - PD_THROW("Not supported decoding strategy. "); - } -} - -std::vector DecodingInferDtype( - const paddle::DataType& input, - const paddle::DataType& mem_seq_len, - const paddle::DataType& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::DataType& decoder_ln_weight, - const paddle::DataType& decoder_ln_bias, - const paddle::DataType& embedding_weight, - const paddle::DataType& embedding_bias, - const paddle::DataType& positional_embedding_weight, - const paddle::DataType& trg_word) { - return {paddle::DataType::INT32, - paddle::DataType::INT32, - paddle::DataType::INT32}; -} - -PD_BUILD_OP(fusion_force_decoding) - .Inputs({"Input", - "MemSeqLen", - "WordEmbedding", - paddle::Vec("SelfLayernormWeight"), - paddle::Vec("SelfLayernormBias"), - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfQueryBias"), - paddle::Vec("SelfKeyWeight"), - paddle::Vec("SelfKeyBias"), - paddle::Vec("SelfValueWeight"), - paddle::Vec("SelfValueBias"), - paddle::Vec("SelfOutWeight"), - paddle::Vec("SelfOutBias"), - paddle::Vec("CrossLayernormWeight"), - paddle::Vec("CrossLayernormBias"), - paddle::Vec("CrossQueryWeight"), - paddle::Vec("CrossQueryBias"), - paddle::Vec("CrossKeyWeight"), - paddle::Vec("CrossKeyBias"), - paddle::Vec("CrossValueWeight"), - paddle::Vec("CrossValueBias"), - paddle::Vec("CrossOutWeight"), - paddle::Vec("CrossOutBias"), - paddle::Vec("FFNLayernormWeight"), - paddle::Vec("FFNLayernormBias"), - paddle::Vec("FFNInterWeight"), - paddle::Vec("FFNInterBias"), - paddle::Vec("FFNOutWeight"), - paddle::Vec("FFNOutBias"), - "DecoderLayernormWeight", - "DecoderLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb", - "TrgWord"}) - .Outputs({"OutputIds", "ParentIds", "SequenceLength"}) - .Attrs({"decoding_strategy: std::string", - "beam_size: int", - "topk: int", - "topp: float", - "n_head: int", - "size_per_head: int", - 
"num_layer: int", - "bos_id: int", - "eos_id: int", - "max_len: int64_t", - "beam_search_diversity_rate: float", - "rel_len: bool", - "alpha: float"}) - .SetKernelFn(PD_KERNEL(DecodingForward)) - .SetInferShapeFn(PD_INFER_SHAPE(DecodingInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(DecodingInferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.cu deleted file mode 100644 index ae269e34ae42..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.cu +++ /dev/null @@ -1,572 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include -#include -#include - -#include -#include -#include -#include -#include - - -#include "fusion_force_decoding_op.h" -#include "pd_traits.h" - - -__global__ void get_trg_length(const int* trg_word, - int* trg_length, - const int seq_len, - const int pad_id) { - int bid = threadIdx.x; - - int cnt_nonpads = 0; - for (int i = 0; i < seq_len; ++i) { - if (pad_id != trg_word[bid * seq_len + i]) { - cnt_nonpads++; - } else { - break; - } - } - trg_length[bid] = cnt_nonpads; -} - -template -std::vector decoding_kernel( - const paddle::Tensor& input, - const paddle::Tensor& memory_sequence_length, - const paddle::Tensor& word_emb, - const std::vector& self_layernorm_weight, - const std::vector& self_layernorm_bias, - const std::vector& self_attn_query_weight, - const std::vector& self_attn_query_bias, - const std::vector& self_attn_key_weight, - const std::vector& self_attn_key_bias, - const std::vector& self_attn_value_weight, - const std::vector& self_attn_value_bias, - const std::vector& self_attn_output_weight, - const std::vector& self_attn_output_bias, - const std::vector& cross_layernorm_weight, - const std::vector& cross_layernorm_bias, - const std::vector& cross_attn_query_weight, - const std::vector& cross_attn_query_bias, - const std::vector& cross_attn_key_weight, - const std::vector& cross_attn_key_bias, - const std::vector& cross_attn_value_weight, - const std::vector& cross_attn_value_bias, - const std::vector& cross_attn_output_weight, - const std::vector& cross_attn_output_bias, - const std::vector& ffn_layernorm_weight, - const std::vector& ffn_layernorm_bias, - const std::vector& ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - const paddle::Tensor& decoder_layernorm_weight, - const paddle::Tensor& decoder_layernorm_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& position_encoding_table, - const paddle::Tensor& trg_word, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int head_num_, - const int size_per_head_, - const int num_layer_, - const int start_id_, - 
const int end_id_, - const int64_t max_seq_len_, - const float beam_search_diversity_rate_, - const float alpha, - cudaStream_t stream) { - int beam_width_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2") - ? beam_size - : 1; - int candidate_num_ = (decoding_strategy == "sampling") ? topk : 1; - float probability_threshold_ = (decoding_strategy == "sampling") ? topp : 0.0; - - auto input_dims = input.shape(); - int batch_size_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2") - ? input_dims[0] / beam_width_ - : input_dims[0]; - const int memory_max_seq_len = input_dims[1]; - const int memory_hidden_dim = input_dims[2]; - const int vocab_size = word_emb.shape()[0]; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - DecodingInitParam decoding_params; - decoding_params.cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - decoding_params.cublaslt_handle = - CublasHandle::GetInstance()->cublaslt_handle_; - - decoding_params.output_ids = output_ids.mutable_data(input.place()); - decoding_params.parent_ids = parent_ids.mutable_data(input.place()); - decoding_params.sequence_length = - sequence_length.mutable_data(input.place()); - - typedef DecoderTransformerTraits DecodingTraits_; - decoding_params.stream = stream; - fastertransformer::Allocator allocator_(stream); - - decoding_params.memory_tensor = - reinterpret_cast(input.data()); - decoding_params.memory_sequence_length = memory_sequence_length.data(); - - auto trg_word_shape = trg_word.shape(); - int trg_max_len = - (trg_word_shape.size() == 2) ? static_cast(trg_word_shape[1]) : 0; - - paddle::Tensor trg_length = - (trg_word_shape.size() == 2 && trg_word_shape[0] != 0) - ? paddle::Tensor(paddle::PlaceType::kGPU, {trg_word_shape[0]}) - : paddle::Tensor(paddle::PlaceType::kGPU, {1}); - auto trg_length_ptr = trg_length.mutable_data(input.place()); - - if (trg_word_shape.size() == 2 && trg_word_shape[0] != 0) { - decoding_params.trg_word = trg_word.data(); - - get_trg_length<<<1, trg_word_shape[0], 0, stream>>>( - decoding_params.trg_word, trg_length_ptr, trg_max_len, start_id_); - decoding_params.trg_length = trg_length_ptr; - } - - DecoderInitParam* params = - new DecoderInitParam[num_layer_]; - - int inner_coeff = ffn_intermediate_weight[0].shape()[1] / memory_hidden_dim; - - auto q_weight_shape = self_attn_query_weight[0].shape(); - auto k_weight_shape = self_attn_key_weight[0].shape(); - bool fuse_qkv = (q_weight_shape[1] == k_weight_shape[1]) ? 
false : true; - - for (int i = 0; i < num_layer_; i++) { - params[i].stream = stream; - params[i].cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - params[i].cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - - if (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2") { - params[i].request_batch_size = batch_size_ * beam_width_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } else if (decoding_strategy == "sampling" || - decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - params[i].request_batch_size = batch_size_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } - - // self attn - params[i].self_layernorm.gamma = reinterpret_cast( - self_layernorm_weight[i].data()); - params[i].self_layernorm.beta = reinterpret_cast( - self_layernorm_bias[i].data()); - // query - params[i].self_attention.query_weight.kernel = - reinterpret_cast( - self_attn_query_weight[i].data()); - params[i].self_attention.query_weight.bias = - reinterpret_cast( - self_attn_query_bias[i].data()); - // key - params[i].self_attention.key_weight.kernel = - reinterpret_cast( - self_attn_key_weight[i].data()); - params[i].self_attention.key_weight.bias = - reinterpret_cast( - self_attn_key_bias[i].data()); - // value - params[i].self_attention.value_weight.kernel = - reinterpret_cast( - self_attn_value_weight[i].data()); - params[i].self_attention.value_weight.bias = - reinterpret_cast( - self_attn_value_bias[i].data()); - // out proj - params[i].self_attention.attention_output_weight.kernel = - reinterpret_cast( - self_attn_output_weight[i].data()); - params[i].self_attention.attention_output_weight.bias = - reinterpret_cast( - self_attn_output_bias[i].data()); - - // cross - params[i].cross_layernorm.gamma = reinterpret_cast( - cross_layernorm_weight[i].data()); - params[i].cross_layernorm.beta = reinterpret_cast( - cross_layernorm_bias[i].data()); - // query - params[i].cross_attention.query_weight.kernel = - reinterpret_cast( - cross_attn_query_weight[i].data()); - params[i].cross_attention.query_weight.bias = - reinterpret_cast( - cross_attn_query_bias[i].data()); - // key - params[i].cross_attention.key_weight.kernel = - reinterpret_cast( - cross_attn_key_weight[i].data()); - params[i].cross_attention.key_weight.bias = - reinterpret_cast( - cross_attn_key_bias[i].data()); - // value - params[i].cross_attention.value_weight.kernel = - reinterpret_cast( - cross_attn_value_weight[i].data()); - params[i].cross_attention.value_weight.bias = - reinterpret_cast( - cross_attn_value_bias[i].data()); - // out proj - params[i].cross_attention.attention_output_weight.kernel = - reinterpret_cast( - cross_attn_output_weight[i].data()); - params[i].cross_attention.attention_output_weight.bias = - reinterpret_cast( - cross_attn_output_bias[i].data()); - - // ffn - params[i].ffn_layernorm.gamma = reinterpret_cast( - ffn_layernorm_weight[i].data()); - params[i].ffn_layernorm.beta = reinterpret_cast( - ffn_layernorm_bias[i].data()); - // intermediate proj - params[i].ffn.intermediate_weight.kernel = - reinterpret_cast( - ffn_intermediate_weight[i].data()); - params[i].ffn.intermediate_weight.bias = reinterpret_cast( - ffn_intermediate_bias[i].data()); - // out proj - params[i].ffn.output_weight.kernel = reinterpret_cast( - ffn_output_weight[i].data()); - params[i].ffn.output_weight.bias = - reinterpret_cast(ffn_output_bias[i].data()); - } - - decoding_params.layernorm.gamma = reinterpret_cast( - 
decoder_layernorm_weight.data()); - decoding_params.layernorm.beta = reinterpret_cast( - decoder_layernorm_bias.data()); - // for embedding - decoding_params.embedding_table = - reinterpret_cast(word_emb.data()); - - // for weight sharing matmul - decoding_params.embedding_kernel = - reinterpret_cast(embedding_weight.data()); - // for matmul bias - decoding_params.embedding_bias = - reinterpret_cast(embedding_bias.data()); - - decoding_params.position_encoding_table = reinterpret_cast( - position_encoding_table.data()); - - if ("beam_search" == decoding_strategy) { - DecodingBeamsearch* decoding_beam_search_; - decoding_beam_search_ = new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, // is_fuse_topk_softMax - fuse_qkv, - false, // keep_alive_beam - 0.6, // alpha - true, // normalization_before - 0, // pos_offset - ActivationType::RELU, // act - false, // pos_bias - false, // prefix_lm - -1, // finished_candidate_num - false, // early_stopping - false, // is_mbart - 0, // min_length - inner_coeff); - - decoding_beam_search_->forward(params, decoding_params); - - delete decoding_beam_search_; - } else if ("beam_search_v2" == decoding_strategy) { - DecodingBeamsearch* decoding_beam_search_; - decoding_beam_search_ = new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, // is_fuse_topk_softMax - fuse_qkv, // is_fuse_qkv - true, // keep_alive_beam - alpha, - true, // normalization_before - 0, // pos_offset - ActivationType::RELU, // act - false, // pos_bias - false, // prefix_lm - -1, // finished_candidate_num - false, // early_stopping - false, // is_mbart - 0, // min_length - inner_coeff); - - decoding_beam_search_->forward(params, decoding_params); - - delete decoding_beam_search_; - } else if ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || - "sampling" == decoding_strategy) { - DecodingSampling* decoding_sampling_; - decoding_sampling_ = new DecodingSampling( - allocator_, - batch_size_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - candidate_num_, - probability_threshold_, - fuse_qkv, - true, // normalization_before - 0, // pos_offset - ActivationType::RELU, // act - false, // pos_bias - 1.0, // temperature - 1.0, // repeat_penalty - false, // prefix_lm - false, // is_mbart - 0, // min_length - inner_coeff); - - decoding_sampling_->forward(params, decoding_params); - - delete decoding_sampling_; - } else { - PD_THROW( - "Only beam_search, beam_search_v2 and sampling are supported for " - "FastGeneration. 
"); - } - delete[] params; - - return {output_ids, parent_ids, sequence_length}; -} - -std::vector DecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& trg_word, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const int64_t max_len, - const float beam_search_diversity_rate, - const float alpha) { - auto stream = input.stream(); - cublasSetStream(CublasHandle::GetInstance()->cublas_handle_, stream); - - std::vector ret; - - switch (input.type()) { - case paddle::DataType::FLOAT16: { - ret = decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - trg_word, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - beam_search_diversity_rate, - alpha, - stream); - break; - } - case paddle::DataType::FLOAT32: { - ret = decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - trg_word, - output_ids, 
- parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - beam_search_diversity_rate, - alpha, - stream); - break; - } - default: { - PD_THROW( - "NOT supported data type. " - "Only float16 and float32 are supported. "); - break; - } - } - - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.h b/paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.h deleted file mode 100644 index 99eef25111d6..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_force_decoding_op.h +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include -#include - -#include "cublas_handle.h" - -#include "fastertransformer/decoding_beamsearch.h" -#include "fastertransformer/decoding_sampling.h" -#include "fastertransformer/open_decoder.h" -#include "fastertransformer/utils/common.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - - -std::vector DecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& trg_word, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const int64_t max_len, - const float beam_search_diversity_rate, - const float alpha); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_gpt_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_gpt_op.cc deleted 
file mode 100644 index a32867451e9d..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_gpt_op.cc +++ /dev/null @@ -1,223 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "fusion_gpt_op.h" -#include "pd_traits.h" - - -std::vector GPT2Forward( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const paddle::Tensor& start_length, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& emb_weight, - const int& topk, - const float& topp, - const int& max_len, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - const bool& use_fp16 = false, - const int& tensor_para_size = 1, - const int& layer_para_size = 1, - const int& layer_para_batch_size = 1) { - int batch_size = input.shape()[0]; - int start_len = input.shape()[1]; - int total_len = max_len + start_len; - std::vector output_dims({total_len, batch_size}); - auto output_ids = paddle::Tensor(input.place(), output_dims); - - if (word_embedding.place() == paddle::PlaceType::kGPU) { - return GPT2CUDAForward(input, - attn_mask, - start_length, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - positional_embedding_weight, - emb_weight, - output_ids, - topk, - topp, - total_len, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - use_fp16, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - } else { - PD_THROW("Not implemented place. Only GPU is supported. 
"); - } -} - -std::vector> GPT2InferShape( - const std::vector& input_shape, - const std::vector& attn_mask_shape, - const std::vector& start_length, - const std::vector& word_embedding_shape, - const std::vector>& self_ln_weight_shapes, - const std::vector>& self_ln_bias_shapes, - const std::vector>& self_q_weight_shapes, - const std::vector>& self_q_bias_shapes, - const std::vector>& self_k_weight_shapes, - const std::vector>& self_k_bias_shapes, - const std::vector>& self_v_weight_shapes, - const std::vector>& self_v_bias_shapes, - const std::vector>& self_out_weight_shapes, - const std::vector>& self_out_bias_shapes, - const std::vector>& ffn_ln_weight_shapes, - const std::vector>& ffn_ln_bias_shapes, - const std::vector>& ffn_inter_weight_shapes, - const std::vector>& ffn_inter_bias_shapes, - const std::vector>& ffn_out_weight_shapes, - const std::vector>& ffn_out_bias_shapes, - const std::vector& decoder_ln_weight_shape, - const std::vector& decoder_ln_bias_shape, - const std::vector& positional_embedding_weight_shape, - const std::vector& emb_weight_shape, - const int& topk, - const float& topp, - const int& max_len, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - const bool& use_fp16 = false, - const int& tensor_para_size = 1, - const int& layer_para_size = 1, - const int& layer_para_batch_size = 1) { - int64_t batch_size = input_shape[0]; - int64_t start_len = input_shape[1]; - std::vector output_dims({max_len + start_len, batch_size}); - return {output_dims}; -} - -std::vector GPT2InferDtype( - const paddle::DataType& input_dtype, - const paddle::DataType& attn_mask_dtype, - const paddle::DataType& start_length_dtype, - const paddle::DataType& word_embedding_dtype, - const std::vector& self_ln_weight_dtype, - const std::vector& self_ln_bias_dtype, - const std::vector& self_q_weight_dtype, - const std::vector& self_q_bias_dtype, - const std::vector& self_k_weight_dtype, - const std::vector& self_k_bias_dtype, - const std::vector& self_v_weight_dtype, - const std::vector& self_v_bias_dtype, - const std::vector& self_out_weight_dtype, - const std::vector& self_out_bias_dtype, - const std::vector& ffn_ln_weight_dtype, - const std::vector& ffn_ln_bias_dtype, - const std::vector& ffn_inter_weight_dtype, - const std::vector& ffn_inter_bias_dtype, - const std::vector& ffn_out_weight_dtype, - const std::vector& ffn_out_bias_dtype, - const paddle::DataType& decoder_ln_weight_dtype, - const paddle::DataType& decoder_ln_bias_dtype, - const paddle::DataType& positional_embedding_weight_dtype, - const paddle::DataType& emb_weight_dtype) { - return {paddle::DataType::INT32}; -} - -PD_BUILD_OP(fusion_gpt) - .Inputs({"Input", - "AttentionMask", - "StartLength", - "WordEmbedding", - paddle::Vec("SelfLayernormWeight"), - paddle::Vec("SelfLayernormBias"), - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfQueryBias"), - paddle::Vec("SelfKeyWeight"), - paddle::Vec("SelfKeyBias"), - paddle::Vec("SelfValueWeight"), - paddle::Vec("SelfValueBias"), - paddle::Vec("SelfOutWeight"), - paddle::Vec("SelfOutBias"), - paddle::Vec("FFNLayernormWeight"), - paddle::Vec("FFNLayernormBias"), - paddle::Vec("FFNInterWeight"), - paddle::Vec("FFNInterBias"), - paddle::Vec("FFNOutWeight"), - paddle::Vec("FFNOutBias"), - "DecoderLayernormWeight", - "DecoderLayernormBias", - "PositionEncEmb", - "EmbWeight"}) - .Outputs({"OutputIds"}) - .Attrs({"topk: int", - "topp: float", - "max_len: int", - "n_head: int", - "size_per_head: 
int", - "num_layer: int", - "bos_id: int", - "eos_id: int", - "temperature: float", - "use_fp16: bool", - "tensor_para_size: int", - "layer_para_size: int", - "layer_para_batch_size: int"}) - .SetKernelFn(PD_KERNEL(GPT2Forward)) - .SetInferShapeFn(PD_INFER_SHAPE(GPT2InferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(GPT2InferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_gpt_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_gpt_op.cu deleted file mode 100644 index 29bc57747f04..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_gpt_op.cu +++ /dev/null @@ -1,378 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// TODO(guosheng): `HOST` conflict exists in float.h of paddle and mpi.h of mpi -#include "fusion_gpt_op.h" -#include "pd_traits.h" -#ifdef HOST -#undef HOST -#endif - -#include "fastertransformer/gpt.h" -#include "fastertransformer/utils/common.h" - -#ifdef BUILD_GPT // consistent with FasterTransformer -#include "parallel_utils.h" -#endif - -template -std::vector gpt2_kernel( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const paddle::Tensor& start_length, - const paddle::Tensor& word_emb, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& emb_weight, - paddle::Tensor& output_ids, - const int& topk, - const float& topp, - const int& max_len, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - cublasHandle_t cublas_handle_, - cublasLtHandle_t cublaslt_handle_, - cudaStream_t stream, - const int& tensor_para_size = 1, - const int& layer_para_size = 1, - const int& layer_para_batch_size = 1) { - auto input_dims = input.shape(); - int batch_size_ = input_dims[0]; - int start_len = input_dims[1]; - const int vocab_size = word_emb.shape()[0]; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - DecodingInitParam decoding_params; - decoding_params.cublas_handle = cublas_handle_; - decoding_params.cublaslt_handle = cublaslt_handle_; - - decoding_params.output_ids = output_ids.mutable_data(word_emb.place()); - - typedef 
DecoderTransformerTraits DecodingTraits_; - decoding_params.stream = stream; - fastertransformer::Allocator allocator_(stream); - - const int hidden_unit = size_per_head * n_head; - -#ifdef BUILD_GPT - auto* model_para_desc = ModelParaDescFactory::CreateModelParaDesc( - n_head, - size_per_head, - num_layer, - tensor_para_size, - layer_para_size, - layer_para_batch_size, - const_cast(word_emb.data())); - auto& tensor_parallel_param = model_para_desc->tensor_parallel_param; - auto& layer_parallel_param = model_para_desc->layer_parallel_param; - auto seed = model_para_desc->dist(model_para_desc->gen); -#else - TensorParallelParam tensor_parallel_param; - LayerParallelParam layer_parallel_param; - tensor_parallel_param.rank = 0; - tensor_parallel_param.world_size = 1; - tensor_parallel_param.local_head_num_ = n_head; - tensor_parallel_param.local_hidden_units_ = hidden_unit; - - layer_parallel_param.rank = 0; - layer_parallel_param.world_size = 1; - layer_parallel_param.layers_per_group = num_layer; - layer_parallel_param.local_batch_size = batch_size_; - int seed = -1; -#endif - - DecodingGpt* gpt_decoding; - - decoding_params.request_batch_size = batch_size_; - decoding_params.max_input_len = start_len; - decoding_params.request_input_len = start_len; - decoding_params.request_output_len = max_len - start_len; - - decoding_params.d_start_ids = const_cast(input.data()); - decoding_params.d_attn_mask = - reinterpret_cast(const_cast(attn_mask.data())); - decoding_params.d_start_lengths = start_length.data(); - - gpt_decoding = - new DecodingGpt(allocator_, - batch_size_, - max_len, - n_head, - size_per_head, - vocab_size, - num_layer, - bos_id, - eos_id, - topk, - topp, - temperature, - tensor_para_size, - layer_para_size, - true, /*is_fuse_QKV*/ - 1.0, /*repetition_penalty*/ - seed); - - gpt_decoding->set_tensor_parallel_param(tensor_parallel_param); - gpt_decoding->set_layer_parallel_param(layer_parallel_param); - - DecoderInitParam* params = - new DecoderInitParam[num_layer]; - - for (int i = 0; i < self_ln_weight.size(); ++i) { - // Allow python passing weights of all layers or only passing the - // corresponding layers to save memory. - int layer_idx = self_ln_weight.size() != num_layer - ? layer_parallel_param.rank * - layer_parallel_param.layers_per_group + - i - : i; - - params[layer_idx].stream = stream; - params[layer_idx].cublas_handle = cublas_handle_; - params[layer_idx].cublaslt_handle = cublaslt_handle_; - - params[layer_idx].request_batch_size = batch_size_; - params[layer_idx].request_max_mem_seq_len = start_len; - - params[layer_idx].self_layernorm.gamma = - reinterpret_cast(self_ln_weight[i].data()); - params[layer_idx].self_layernorm.beta = - reinterpret_cast(self_ln_bias[i].data()); - - params[layer_idx].self_attention.query_weight.kernel = - reinterpret_cast(self_q_weight[i].data()); - params[layer_idx].self_attention.query_weight.bias = - reinterpret_cast(self_q_bias[i].data()); - // For `is_fuse_QKV == true`, ignore weight and bias of key and value to - // remove requirements on python passing weights to save memory. 
- // params[layer_idx].self_attention.key_weight.kernel = - // reinterpret_cast(self_k_weight[i].data()); - // params[layer_idx].self_attention.key_weight.bias = - // reinterpret_cast(self_k_bias[i].data()); - // params[layer_idx].self_attention.value_weight.kernel = - // reinterpret_cast(self_v_weight[i].data()); - // params[layer_idx].self_attention.value_weight.bias = - // reinterpret_cast(self_v_bias[i].data()); - - params[layer_idx].self_attention.attention_output_weight.kernel = - reinterpret_cast(self_out_weight[i].data()); - params[layer_idx].self_attention.attention_output_weight.bias = - reinterpret_cast(self_out_bias[i].data()); - - params[layer_idx].ffn_layernorm.gamma = - reinterpret_cast(ffn_ln_weight[i].data()); - params[layer_idx].ffn_layernorm.beta = - reinterpret_cast(ffn_ln_bias[i].data()); - - params[layer_idx].ffn.intermediate_weight.kernel = - reinterpret_cast(ffn_inter_weight[i].data()); - params[layer_idx].ffn.intermediate_weight.bias = - reinterpret_cast(ffn_inter_bias[i].data()); - params[layer_idx].ffn.output_weight.kernel = - reinterpret_cast(ffn_out_weight[i].data()); - params[layer_idx].ffn.output_weight.bias = - reinterpret_cast(ffn_out_bias[i].data()); - } - - decoding_params.layernorm.gamma = - reinterpret_cast(decoder_ln_weight.data()); - decoding_params.layernorm.beta = - reinterpret_cast(decoder_ln_bias.data()); - decoding_params.embedding_table = - reinterpret_cast(word_emb.data()); - decoding_params.embedding_kernel = - reinterpret_cast(emb_weight.data()); - decoding_params.position_encoding_table = reinterpret_cast( - positional_embedding_weight.data()); - - gpt_decoding->forward_context(params, decoding_params); - gpt_decoding->forward(params, decoding_params); - - delete gpt_decoding; - delete[] params; - - return {output_ids}; -} - -std::vector GPT2CUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const paddle::Tensor& start_length, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& emb_weight, - paddle::Tensor& output_ids, - const int& topk, - const float& topp, - const int& max_len, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - const bool& use_fp16 = false, - const int& tensor_para_size = 1, - const int& layer_para_size = 1, - const int& layer_para_batch_size = 1) { - auto stream = word_embedding.stream(); - // TODO(guosheng): use the global cublas handle - cublasHandle_t cublas_handle_; - cublasCreate(&cublas_handle_); - cublasLtHandle_t cublaslt_handle_; - cublasLtCreate(&cublaslt_handle_); - cublasSetStream(cublas_handle_, stream); - - std::vector ret; - - if (use_fp16) { - ret = gpt2_kernel(input, - attn_mask, - start_length, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - 
self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - positional_embedding_weight, - emb_weight, - output_ids, - topk, - topp, - max_len, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - cublas_handle_, - cublaslt_handle_, - stream, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - } else { - ret = gpt2_kernel(input, - attn_mask, - start_length, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - positional_embedding_weight, - emb_weight, - output_ids, - topk, - topp, - max_len, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - cublas_handle_, - cublaslt_handle_, - stream, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - } - - cublasDestroy(cublas_handle_); - cublasLtDestroy(cublaslt_handle_); - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_gpt_op.h b/paddlenlp/ops/fast_transformer/src/fusion_gpt_op.h deleted file mode 100644 index 75394d5a8ee2..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_gpt_op.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-
-#pragma once
-
-#include <string>
-#include <vector>
-
-// #include "fastertransformer/gpt.h"
-// #include "fastertransformer/open_decoder.h"
-// #include "fastertransformer/utils/common.h"
-
-#ifdef PADDLE_ON_INFERENCE
-#include "paddle/extension.h"
-#include "paddle_inference_api.h"
-#include "paddle/common/exception.h"
-#else
-#include "paddle/extension.h"
-#endif
-
-
-std::vector<paddle::Tensor> GPT2CUDAForward(
-    const paddle::Tensor& input,
-    const paddle::Tensor& attn_mask,
-    const paddle::Tensor& start_length,
-    const paddle::Tensor& word_embedding,
-    const std::vector<paddle::Tensor>& self_ln_weight,
-    const std::vector<paddle::Tensor>& self_ln_bias,
-    const std::vector<paddle::Tensor>& self_q_weight,
-    const std::vector<paddle::Tensor>& self_q_bias,
-    const std::vector<paddle::Tensor>& self_k_weight,
-    const std::vector<paddle::Tensor>& self_k_bias,
-    const std::vector<paddle::Tensor>& self_v_weight,
-    const std::vector<paddle::Tensor>& self_v_bias,
-    const std::vector<paddle::Tensor>& self_out_weight,
-    const std::vector<paddle::Tensor>& self_out_bias,
-    const std::vector<paddle::Tensor>& ffn_ln_weight,
-    const std::vector<paddle::Tensor>& ffn_ln_bias,
-    const std::vector<paddle::Tensor>& ffn_inter_weight,
-    const std::vector<paddle::Tensor>& ffn_inter_bias,
-    const std::vector<paddle::Tensor>& ffn_out_weight,
-    const std::vector<paddle::Tensor>& ffn_out_bias,
-    const paddle::Tensor& decoder_ln_weight,
-    const paddle::Tensor& decoder_ln_bias,
-    const paddle::Tensor& positional_embedding_weight,
-    const paddle::Tensor& emb_weight,
-    paddle::Tensor& output_ids,
-    const int& topk,
-    const float& topp,
-    const int& max_len,
-    const int& n_head,
-    const int& size_per_head,
-    const int& num_layer,
-    const int& bos_id,
-    const int& eos_id,
-    const float& temperature,
-    const bool& use_fp16,
-    const int& tensor_para_size,
-    const int& layer_para_size,
-    const int& layer_para_batch_size);
diff --git a/paddlenlp/ops/fast_transformer/src/fusion_gptj_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_gptj_op.cc
deleted file mode 100644
index 81d5411eb4be..000000000000
--- a/paddlenlp/ops/fast_transformer/src/fusion_gptj_op.cc
+++ /dev/null
@@ -1,203 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include <vector>
-
-#include "fusion_gptj_op.h"
-#include "pd_traits.h"
-
-
-std::vector<paddle::Tensor> GPTJForward(
-    const paddle::Tensor& input,
-    const paddle::Tensor& attn_mask,
-    const paddle::Tensor& start_length,
-    const paddle::Tensor& word_embedding,
-    const std::vector<paddle::Tensor>& self_ln_weight,
-    const std::vector<paddle::Tensor>& self_ln_bias,
-    const std::vector<paddle::Tensor>& self_q_weight,
-    const std::vector<paddle::Tensor>& self_out_weight,
-    const std::vector<paddle::Tensor>& ffn_inter_weight,
-    const std::vector<paddle::Tensor>& ffn_inter_bias,
-    const std::vector<paddle::Tensor>& ffn_out_weight,
-    const std::vector<paddle::Tensor>& ffn_out_bias,
-    const paddle::Tensor& decoder_ln_weight,
-    const paddle::Tensor& decoder_ln_bias,
-    const paddle::Tensor& emb_weight,
-    const paddle::Tensor& emb_bias,
-    const int topk,
-    const float topp,
-    const int max_len,
-    const int n_head,
-    const int size_per_head,
-    const int num_layer,
-    const int bos_id,
-    const int eos_id,
-    const float temperature,
-    const int rotary_embedding_dim,
-    const float repetition_penalty,
-    const int min_length,
-    const bool use_fp16 = false,
-    const int tensor_para_size = 1,
-    const int layer_para_size = 1,
-    const int layer_para_batch_size = 1) {
-  int batch_size = input.shape()[0];
-  int start_len = input.shape()[1];
-  int total_len = max_len + start_len;
-  std::vector<int64_t> output_dims({total_len, batch_size});
-
-#ifdef PADDLE_NEW_ALLOCATOR
-  // For PaddlePaddle>=2.3.0
-  auto output_ids = paddle::empty(output_dims, paddle::DataType::INT32, input.place());
-  auto gpu_place = paddle::GPUPlace();
-#else
-  auto output_ids = paddle::Tensor(input.place(), output_dims);
-  auto gpu_place = paddle::PlaceType::kGPU;
-#endif
-
-  if (word_embedding.place() == gpu_place) {
-    return GPTJCUDAForward(input,
-                           attn_mask,
-                           start_length,
-                           word_embedding,
-                           self_ln_weight,
-                           self_ln_bias,
-                           self_q_weight,
-                           self_out_weight,
-                           ffn_inter_weight,
-                           ffn_inter_bias,
-                           ffn_out_weight,
-                           ffn_out_bias,
-                           decoder_ln_weight,
-                           decoder_ln_bias,
-                           emb_weight,
-                           emb_bias,
-                           output_ids,
-                           topk,
-                           topp,
-                           total_len,
-                           n_head,
-                           size_per_head,
-                           num_layer,
-                           bos_id,
-                           eos_id,
-                           temperature,
-                           rotary_embedding_dim,
-                           repetition_penalty,
-                           min_length,
-                           use_fp16,
-                           tensor_para_size,
-                           layer_para_size,
-                           layer_para_batch_size);
-  } else {
-    PD_THROW("Not implemented place. Only GPU is supported. 
"); - } -} - -std::vector> GPTJInferShape( - const std::vector& input_shape, - const std::vector& attn_mask_shape, - const std::vector& start_length, - const std::vector& word_embedding_shape, - const std::vector>& self_ln_weight_shapes, - const std::vector>& self_ln_bias_shapes, - const std::vector>& self_q_weight_shapes, - const std::vector>& self_out_weight_shapes, - const std::vector>& ffn_inter_weight_shapes, - const std::vector>& ffn_inter_bias_shapes, - const std::vector>& ffn_out_weight_shapes, - const std::vector>& ffn_out_bias_shapes, - const std::vector& decoder_ln_weight_shape, - const std::vector& decoder_ln_bias_shape, - const std::vector& emb_weight_shape, - const std::vector& emb_bias_shape, - const int topk, - const float topp, - const int max_len, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const float temperature, - const int rotary_embedding_dim, - const float repetition_penalty, - const int min_length, - const bool use_fp16 = false, - const int tensor_para_size = 1, - const int layer_para_size = 1, - const int layer_para_batch_size = 1) { - int64_t batch_size = input_shape[0]; - int64_t start_len = input_shape[1]; - std::vector output_dims({max_len + start_len, batch_size}); - return {output_dims}; -} - -std::vector GPTJInferDtype( - const paddle::DataType& input_dtype, - const paddle::DataType& attn_mask_dtype, - const paddle::DataType& start_length_dtype, - const paddle::DataType& word_embedding_dtype, - const std::vector& self_ln_weight_dtype, - const std::vector& self_ln_bias_dtype, - const std::vector& self_q_weight_dtype, - const std::vector& self_out_weight_dtype, - const std::vector& ffn_inter_weight_dtype, - const std::vector& ffn_inter_bias_dtype, - const std::vector& ffn_out_weight_dtype, - const std::vector& ffn_out_bias_dtype, - const paddle::DataType& decoder_ln_weight_dtype, - const paddle::DataType& decoder_ln_bias_dtype, - const paddle::DataType& emb_weight_dtype, - const paddle::DataType& emb_bias_dtype) { - return {paddle::DataType::INT32}; -} - -PD_BUILD_OP(fusion_gptj) - .Inputs({"Input", - "AttentionMask", - "StartLength", - "WordEmbedding", - paddle::Vec("SelfLayernormWeight"), - paddle::Vec("SelfLayernormBias"), - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfOutWeight"), - paddle::Vec("FFNInterWeight"), - paddle::Vec("FFNInterBias"), - paddle::Vec("FFNOutWeight"), - paddle::Vec("FFNOutBias"), - "DecoderLayernormWeight", - "DecoderLayernormBias", - "EmbWeight", - "EmbBias"}) - .Outputs({"OutputIds"}) - .Attrs({"topk: int", - "topp: float", - "max_len: int", - "n_head: int", - "size_per_head: int", - "num_layer: int", - "bos_id: int", - "eos_id: int", - "temperature: float", - "rotary_embedding_dim: int", - "repetition_penalty: float", - "min_length: int", - "use_fp16: bool", - "tensor_para_size: int", - "layer_para_size: int", - "layer_para_batch_size: int"}) - .SetKernelFn(PD_KERNEL(GPTJForward)) - .SetInferShapeFn(PD_INFER_SHAPE(GPTJInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(GPTJInferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_gptj_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_gptj_op.cu deleted file mode 100644 index 25e28210796b..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_gptj_op.cu +++ /dev/null @@ -1,334 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// Use the global cublas handle -#include "cublas_handle.h" - -// TODO(guosheng): `HOST` conflict exists in float.h of paddle and mpi.h of mpi -#include "fusion_gptj_op.h" -#include "pd_traits.h" -#ifdef HOST -#undef HOST -#endif - -#include "fastertransformer/utils/common.h" - -#ifdef BUILD_GPT // consistent with FasterTransformer -#include "parallel_utils.h" -#endif - -template -std::vector gptj_kernel( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const paddle::Tensor& start_length, - const paddle::Tensor& word_emb, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_out_weight, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& emb_weight, - const paddle::Tensor& emb_bias, - paddle::Tensor& output_ids, - const int topk, - const float topp, - const int max_len, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const float temperature, - const int rotary_embedding_dim, - const float repetition_penalty, - const int min_length, - cudaStream_t stream, - const int tensor_para_size = 1, - const int layer_para_size = 1, - const int layer_para_batch_size = 1) { - auto input_dims = input.shape(); - int batch_size_ = input_dims[0]; - int start_len = input_dims[1]; - const int vocab_size = emb_bias.shape()[0]; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - DecodingInitParam decoding_params; - decoding_params.cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - decoding_params.cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - -#ifdef PADDLE_NEW_ALLOCATOR - // For PaddlePaddle>=2.3.0 - decoding_params.output_ids = output_ids.data(); -#else - decoding_params.output_ids = output_ids.mutable_data(word_emb.place()); -#endif - - typedef DecoderTransformerTraits DecodingTraits_; - decoding_params.stream = stream; - fastertransformer::Allocator allocator_(stream); - - const int hidden_unit = size_per_head * n_head; - -#ifdef BUILD_GPT - auto* model_para_desc = ModelParaDescFactory::CreateModelParaDesc( - n_head, - size_per_head, - num_layer, - tensor_para_size, - layer_para_size, - layer_para_batch_size, - const_cast(word_emb.data())); - auto& tensor_parallel_param = model_para_desc->tensor_parallel_param; - auto& layer_parallel_param = model_para_desc->layer_parallel_param; - auto seed = model_para_desc->dist(model_para_desc->gen); -#else - TensorParallelParam tensor_parallel_param; - LayerParallelParam layer_parallel_param; - tensor_parallel_param.rank = 0; - tensor_parallel_param.world_size = 1; - tensor_parallel_param.local_head_num_ = n_head; - tensor_parallel_param.local_hidden_units_ = hidden_unit; - - layer_parallel_param.rank = 0; - 
layer_parallel_param.world_size = 1; - layer_parallel_param.layers_per_group = num_layer; - layer_parallel_param.local_batch_size = batch_size_; - int seed = -1; -#endif - - DecodingGptJ* gptj_decoding; - - decoding_params.request_batch_size = batch_size_; - decoding_params.max_input_len = start_len; - decoding_params.request_input_len = start_len; - decoding_params.request_output_len = max_len - start_len; - - decoding_params.d_start_ids = const_cast(input.data()); - decoding_params.d_attn_mask = - reinterpret_cast(const_cast(attn_mask.data())); - decoding_params.d_start_lengths = start_length.data(); - - gptj_decoding = - new DecodingGptJ(allocator_, - batch_size_, - max_len, - n_head, - size_per_head, - vocab_size, - num_layer, - bos_id, - eos_id, - topk, - topp, - temperature, - tensor_para_size, - layer_para_size, - true, /*is_fuse_QKV*/ - repetition_penalty, /*repetition_penalty*/ - seed, - rotary_embedding_dim, - min_length); - - gptj_decoding->set_tensor_parallel_param(tensor_parallel_param); - gptj_decoding->set_layer_parallel_param(layer_parallel_param); - - DecoderInitParam* params = - new DecoderInitParam[num_layer]; - - for (int i = 0; i < self_ln_weight.size(); ++i) { - // Allow python passing weights of all layers or only passing the - // corresponding layers to save memory. - int layer_idx = self_ln_weight.size() != num_layer - ? layer_parallel_param.rank * - layer_parallel_param.layers_per_group + - i - : i; - - params[layer_idx].stream = stream; - params[layer_idx].cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - params[layer_idx].cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - - params[layer_idx].request_batch_size = batch_size_; - params[layer_idx].request_max_mem_seq_len = start_len; - - params[layer_idx].self_layernorm.gamma = - reinterpret_cast(self_ln_weight[i].data()); - params[layer_idx].self_layernorm.beta = - reinterpret_cast(self_ln_bias[i].data()); - - params[layer_idx].self_attention.query_weight.kernel = - reinterpret_cast(self_q_weight[i].data()); - params[layer_idx].self_attention.query_weight.bias = nullptr; - - params[layer_idx].self_attention.attention_output_weight.kernel = - reinterpret_cast(self_out_weight[i].data()); - params[layer_idx].self_attention.attention_output_weight.bias = nullptr; - - params[layer_idx].ffn.intermediate_weight.kernel = - reinterpret_cast(ffn_inter_weight[i].data()); - params[layer_idx].ffn.intermediate_weight.bias = - reinterpret_cast(ffn_inter_bias[i].data()); - params[layer_idx].ffn.output_weight.kernel = - reinterpret_cast(ffn_out_weight[i].data()); - params[layer_idx].ffn.output_weight.bias = - reinterpret_cast(ffn_out_bias[i].data()); - } - - decoding_params.layernorm.gamma = - reinterpret_cast(decoder_ln_weight.data()); - decoding_params.layernorm.beta = - reinterpret_cast(decoder_ln_bias.data()); - decoding_params.embedding_table = - reinterpret_cast(word_emb.data()); - decoding_params.embedding_kernel = - reinterpret_cast(emb_weight.data()); - decoding_params.embedding_bias = - reinterpret_cast(emb_bias.data()); - - gptj_decoding->forward_context(params, decoding_params); - gptj_decoding->forward(params, decoding_params); - - delete gptj_decoding; - delete[] params; - - return {output_ids}; -} - -std::vector GPTJCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const paddle::Tensor& start_length, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const 
std::vector& self_out_weight, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& emb_weight, - const paddle::Tensor& emb_bias, - paddle::Tensor& output_ids, - const int topk, - const float topp, - const int max_len, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const float temperature, - const int rotary_embedding_dim, - const float repetition_penalty, - const int min_length, - const bool use_fp16 = false, - const int tensor_para_size = 1, - const int layer_para_size = 1, - const int layer_para_batch_size = 1) { - - auto stream = word_embedding.stream(); - cublasSetStream(CublasHandle::GetInstance()->cublas_handle_, stream); - - if (use_fp16) { - return gptj_kernel(input, - attn_mask, - start_length, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_out_weight, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - emb_weight, - emb_bias, - output_ids, - topk, - topp, - max_len, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - rotary_embedding_dim, - repetition_penalty, - min_length, - stream, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - } else { - return gptj_kernel(input, - attn_mask, - start_length, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_out_weight, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - emb_weight, - emb_bias, - output_ids, - topk, - topp, - max_len, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - rotary_embedding_dim, - repetition_penalty, - min_length, - stream, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - } -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_gptj_op.h b/paddlenlp/ops/fast_transformer/src/fusion_gptj_op.h deleted file mode 100644 index 48c70553fd52..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_gptj_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include - -#include "fastertransformer/gptj.h" -#include "fastertransformer/open_decoder.h" -#include "fastertransformer/utils/common.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - - -std::vector GPTJCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const paddle::Tensor& start_length, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_out_weight, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& emb_weight, - const paddle::Tensor& emb_bias, - paddle::Tensor& output_ids, - const int topk, - const float topp, - const int max_len, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const float temperature, - const int rotary_embedding_dim, - const float repetition_penalty, - const int min_length, - const bool use_fp16, - const int tensor_para_size, - const int layer_para_size, - const int layer_para_batch_size); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.cc deleted file mode 100644 index 30ecb154c1f5..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.cc +++ /dev/null @@ -1,368 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include - -#include "fusion_mbart_decoding_op.h" -#include "pd_traits.h" - - -std::vector MBartDecodingForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& mbart_ln_weight, - const paddle::Tensor& mbart_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& trg_word, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const bool& rel_len, - const float& alpha, - const bool& early_stopping, - const std::string& hidden_act) { - int batch_size = input.shape()[0]; - int max_out_len = rel_len ? max_len + input.shape()[1] : max_len; - - std::vector output_dims; - std::vector parent_ids_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - batch_size /= beam_size; - output_dims = {max_out_len, batch_size, beam_size}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_out_len, batch_size, beam_size * 2}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - output_dims = {max_out_len, batch_size}; - parent_ids_dims = {1}; - } else { - PD_THROW("Not supported decoding strategy. 
"); - } - - if (input.place() == paddle::PlaceType::kGPU) { - auto output_ids = paddle::Tensor(paddle::PlaceType::kGPU, output_dims); - auto parent_ids = paddle::Tensor(paddle::PlaceType::kGPU, parent_ids_dims); - auto sequence_length = - paddle::Tensor(paddle::PlaceType::kGPU, sequence_length_dims); - - paddle::Tensor seq_len = paddle::Tensor(paddle::PlaceType::kGPU); - - if (mem_seq_len.place() != paddle::PlaceType::kGPU) { - seq_len = mem_seq_len.copy_to(paddle::PlaceType::kGPU); - } else { - seq_len = mem_seq_len; - } - - return MBartDecodingCUDAForward(input, - seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - mbart_ln_weight, - mbart_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - trg_word, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - max_out_len, - beam_search_diversity_rate, - alpha, - early_stopping, - hidden_act); - } else { - PD_THROW("Not implemented place. Only GPU is supported. "); - } -} - -std::vector> MBartDecodingInferShape( - const std::vector& input_shape, - const std::vector& mem_seq_len_shape, - const std::vector& word_embedding_shape, - const std::vector>& self_ln_weight_shapes, - const std::vector>& self_ln_bias_shapes, - const std::vector>& self_q_weight_shapes, - const std::vector>& self_q_bias_shapes, - const std::vector>& self_k_weight_shapes, - const std::vector>& self_k_bias_shapes, - const std::vector>& self_v_weight_shapes, - const std::vector>& self_v_bias_shapes, - const std::vector>& self_out_weight_shapes, - const std::vector>& self_out_bias_shapes, - const std::vector>& cross_ln_weight_shapes, - const std::vector>& cross_ln_bias_shapes, - const std::vector>& cross_q_weight_shapes, - const std::vector>& cross_q_bias_shapes, - const std::vector>& cross_k_weight_shapes, - const std::vector>& cross_k_bias_shapes, - const std::vector>& cross_v_weight_shapes, - const std::vector>& cross_v_bias_shapes, - const std::vector>& cross_out_weight_shapes, - const std::vector>& cross_out_bias_shapes, - const std::vector>& ffn_ln_weight_shapes, - const std::vector>& ffn_ln_bias_shapes, - const std::vector>& ffn_inter_weight_shapes, - const std::vector>& ffn_inter_bias_shapes, - const std::vector>& ffn_out_weight_shapes, - const std::vector>& ffn_out_bias_shapes, - const std::vector& decoder_ln_weight_shape, - const std::vector& decoder_ln_bias_shape, - const std::vector& mbart_ln_weight_shape, - const std::vector& mbart_ln_bias_shape, - const std::vector& embedding_weight_shape, - const std::vector& embedding_bias_shape, - const std::vector& positional_embedding_weight_shape, - const std::vector& trg_word_shape, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const bool& rel_len, - const 
float& alpha, - const bool& early_stopping, - const std::string& hidden_act) { - int batch_size = input_shape[0]; - - std::vector output_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - if (batch_size != -1) { - batch_size /= beam_size; - } - output_dims = {max_len, batch_size, beam_size}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_len, batch_size, beam_size * 2}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - output_dims = {max_len, batch_size}; - return {output_dims, {1}, sequence_length_dims}; - } else { - PD_THROW("Not supported decoding strategy. "); - } -} - -std::vector MBartDecodingInferDtype( - const paddle::DataType& input, - const paddle::DataType& mem_seq_len, - const paddle::DataType& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::DataType& decoder_ln_weight, - const paddle::DataType& decoder_ln_bias, - const paddle::DataType& mbart_ln_weight, - const paddle::DataType& mbart_ln_bias, - const paddle::DataType& embedding_weight, - const paddle::DataType& embedding_bias, - const paddle::DataType& positional_embedding_weight, - const paddle::DataType& trg_word) { - return {paddle::DataType::INT32, - paddle::DataType::INT32, - paddle::DataType::INT32}; -} - -PD_BUILD_OP(fusion_mbart_decoding) - .Inputs({"Input", - "MemSeqLen", - "WordEmbedding", - paddle::Vec("SelfLayernormWeight"), - paddle::Vec("SelfLayernormBias"), - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfQueryBias"), - paddle::Vec("SelfKeyWeight"), - paddle::Vec("SelfKeyBias"), - paddle::Vec("SelfValueWeight"), - paddle::Vec("SelfValueBias"), - paddle::Vec("SelfOutWeight"), - paddle::Vec("SelfOutBias"), - paddle::Vec("CrossLayernormWeight"), - paddle::Vec("CrossLayernormBias"), - paddle::Vec("CrossQueryWeight"), - paddle::Vec("CrossQueryBias"), - paddle::Vec("CrossKeyWeight"), - paddle::Vec("CrossKeyBias"), - paddle::Vec("CrossValueWeight"), - paddle::Vec("CrossValueBias"), - paddle::Vec("CrossOutWeight"), - paddle::Vec("CrossOutBias"), - paddle::Vec("FFNLayernormWeight"), - paddle::Vec("FFNLayernormBias"), 
- paddle::Vec("FFNInterWeight"), - paddle::Vec("FFNInterBias"), - paddle::Vec("FFNOutWeight"), - paddle::Vec("FFNOutBias"), - "DecoderLayernormWeight", - "DecoderLayernormBias", - "MBARTLayernormWeight", - "MBARTLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb", - "TrgWord"}) - .Outputs({"OutputIds", "ParentIds", "SequenceLength"}) - .Attrs({ - "decoding_strategy: std::string", - "beam_size: int", - "topk: int", - "topp: float", - "n_head: int", - "size_per_head: int", - "num_layer: int", - "bos_id: int", - "eos_id: int", - "temperature: float", - "max_len: int64_t", - "beam_search_diversity_rate: float", - "rel_len: bool", - "alpha: float", - "early_stopping: bool", - "hidden_act: std::string", - }) - .SetKernelFn(PD_KERNEL(MBartDecodingForward)) - .SetInferShapeFn(PD_INFER_SHAPE(MBartDecodingInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(MBartDecodingInferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.cu deleted file mode 100644 index 08da17ce7d7f..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.cu +++ /dev/null @@ -1,596 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cublas_handle.h" - -#include "fusion_mbart_decoding_op.h" -#include "pd_traits.h" - - -__global__ void get_trg_length_mbart(const int* trg_word, - int* trg_length, - const int seq_len, - const int pad_id) { - int bid = threadIdx.x; - - int cnt_nonpads = 0; - for (int i = 0; i < seq_len; ++i) { - if (pad_id != trg_word[bid * seq_len + i]) { - cnt_nonpads++; - } else { - break; - } - } - trg_length[bid] = cnt_nonpads; -} - -template -std::vector mbart_decoding_kernel( - const paddle::Tensor& input, - const paddle::Tensor& memory_sequence_length, - const paddle::Tensor& word_emb, - const std::vector& self_layernorm_weight, - const std::vector& self_layernorm_bias, - const std::vector& self_attn_query_weight, - const std::vector& self_attn_query_bias, - const std::vector& self_attn_key_weight, - const std::vector& self_attn_key_bias, - const std::vector& self_attn_value_weight, - const std::vector& self_attn_value_bias, - const std::vector& self_attn_output_weight, - const std::vector& self_attn_output_bias, - const std::vector& cross_layernorm_weight, - const std::vector& cross_layernorm_bias, - const std::vector& cross_attn_query_weight, - const std::vector& cross_attn_query_bias, - const std::vector& cross_attn_key_weight, - const std::vector& cross_attn_key_bias, - const std::vector& cross_attn_value_weight, - const std::vector& cross_attn_value_bias, - const std::vector& cross_attn_output_weight, - const std::vector& cross_attn_output_bias, - const std::vector& ffn_layernorm_weight, - const std::vector& ffn_layernorm_bias, - const std::vector& ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - const paddle::Tensor& decoder_layernorm_weight, - const paddle::Tensor& decoder_layernorm_bias, - const paddle::Tensor& mbart_layernorm_weight, - const paddle::Tensor& mbart_layernorm_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& position_encoding_table, - const paddle::Tensor& trg_word, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& head_num_, - const int& size_per_head_, - const int& num_layer_, - const int& start_id_, - const int& end_id_, - const float& temperature, - const int64_t& max_seq_len_, - const float& beam_search_diversity_rate_, - const float& alpha, - const bool& early_stopping, - const std::string& hidden_act, - cudaStream_t stream) { - int beam_width_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? beam_size - : 1; - int candidate_num_ = (decoding_strategy == "sampling") ? topk : 1; - float probability_threshold_ = (decoding_strategy == "sampling") ? topp : 0.0; - - auto input_dims = input.shape(); - int batch_size_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? 
input_dims[0] / beam_width_ - : input_dims[0]; - const int memory_max_seq_len = input_dims[1]; - const int memory_hidden_dim = input_dims[2]; - const int vocab_size = word_emb.shape()[0]; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - DecodingInitParam decoding_params; - decoding_params.cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - decoding_params.cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - - decoding_params.output_ids = output_ids.mutable_data(input.place()); - decoding_params.parent_ids = parent_ids.mutable_data(input.place()); - decoding_params.sequence_length = - sequence_length.mutable_data(input.place()); - - typedef DecoderTransformerTraits DecodingTraits_; - decoding_params.stream = stream; - fastertransformer::Allocator allocator_(stream); - - decoding_params.memory_tensor = - reinterpret_cast(input.data()); - decoding_params.memory_sequence_length = memory_sequence_length.data(); - - auto trg_word_shape = trg_word.shape(); - int trg_max_len = - (trg_word_shape.size() == 2) ? static_cast(trg_word_shape[1]) : 0; - - paddle::Tensor trg_length = - (trg_word_shape.size() == 2 && trg_word_shape[0] != 0) - ? paddle::Tensor(paddle::PlaceType::kGPU, {trg_word_shape[0]}) - : paddle::Tensor(paddle::PlaceType::kGPU, {1}); - auto trg_length_ptr = trg_length.mutable_data(input.place()); - - if (trg_word_shape.size() == 2 && trg_word_shape[0] != 0) { - decoding_params.trg_word = trg_word.data(); - - get_trg_length_mbart<<<1, trg_word_shape[0], 0, stream>>>( - decoding_params.trg_word, trg_length_ptr, trg_max_len, start_id_); - decoding_params.trg_length = trg_length_ptr; - } - - DecoderInitParam* params = - new DecoderInitParam[num_layer_]; - - for (int i = 0; i < num_layer_; i++) { - params[i].stream = stream; - params[i].cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - params[i].cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - - if (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - params[i].request_batch_size = batch_size_ * beam_width_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } else if (decoding_strategy == "sampling" || - decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - params[i].request_batch_size = batch_size_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } - - // self attn - params[i].self_layernorm.gamma = reinterpret_cast( - self_layernorm_weight[i].data()); - params[i].self_layernorm.beta = reinterpret_cast( - self_layernorm_bias[i].data()); - // query - params[i].self_attention.query_weight.kernel = - reinterpret_cast( - self_attn_query_weight[i].data()); - params[i].self_attention.query_weight.bias = - reinterpret_cast( - self_attn_query_bias[i].data()); - // key - params[i].self_attention.key_weight.kernel = - reinterpret_cast( - self_attn_key_weight[i].data()); - params[i].self_attention.key_weight.bias = - reinterpret_cast( - self_attn_key_bias[i].data()); - // value - params[i].self_attention.value_weight.kernel = - reinterpret_cast( - self_attn_value_weight[i].data()); - params[i].self_attention.value_weight.bias = - reinterpret_cast( - self_attn_value_bias[i].data()); - // out proj - params[i].self_attention.attention_output_weight.kernel = - reinterpret_cast( - self_attn_output_weight[i].data()); - params[i].self_attention.attention_output_weight.bias = - reinterpret_cast( - 
self_attn_output_bias[i].data()); - - // cross - params[i].cross_layernorm.gamma = reinterpret_cast( - cross_layernorm_weight[i].data()); - params[i].cross_layernorm.beta = reinterpret_cast( - cross_layernorm_bias[i].data()); - // query - params[i].cross_attention.query_weight.kernel = - reinterpret_cast( - cross_attn_query_weight[i].data()); - params[i].cross_attention.query_weight.bias = - reinterpret_cast( - cross_attn_query_bias[i].data()); - // key - params[i].cross_attention.key_weight.kernel = - reinterpret_cast( - cross_attn_key_weight[i].data()); - params[i].cross_attention.key_weight.bias = - reinterpret_cast( - cross_attn_key_bias[i].data()); - // value - params[i].cross_attention.value_weight.kernel = - reinterpret_cast( - cross_attn_value_weight[i].data()); - params[i].cross_attention.value_weight.bias = - reinterpret_cast( - cross_attn_value_bias[i].data()); - // out proj - params[i].cross_attention.attention_output_weight.kernel = - reinterpret_cast( - cross_attn_output_weight[i].data()); - params[i].cross_attention.attention_output_weight.bias = - reinterpret_cast( - cross_attn_output_bias[i].data()); - - // ffn - params[i].ffn_layernorm.gamma = reinterpret_cast( - ffn_layernorm_weight[i].data()); - params[i].ffn_layernorm.beta = reinterpret_cast( - ffn_layernorm_bias[i].data()); - // intermediate proj - params[i].ffn.intermediate_weight.kernel = - reinterpret_cast( - ffn_intermediate_weight[i].data()); - params[i].ffn.intermediate_weight.bias = reinterpret_cast( - ffn_intermediate_bias[i].data()); - // out proj - params[i].ffn.output_weight.kernel = reinterpret_cast( - ffn_output_weight[i].data()); - params[i].ffn.output_weight.bias = - reinterpret_cast(ffn_output_bias[i].data()); - } - - decoding_params.layernorm.gamma = reinterpret_cast( - decoder_layernorm_weight.data()); - decoding_params.layernorm.beta = reinterpret_cast( - decoder_layernorm_bias.data()); - - // for mbart embedding layernorm - decoding_params.mbart_layernorm.gamma = reinterpret_cast( - mbart_layernorm_weight.data()); - decoding_params.mbart_layernorm.beta = - reinterpret_cast(mbart_layernorm_bias.data()); - - // for embedding - decoding_params.embedding_table = - reinterpret_cast(word_emb.data()); - - // for weight sharing matmul - decoding_params.embedding_kernel = - reinterpret_cast(embedding_weight.data()); - // for matmul bias - decoding_params.embedding_bias = - reinterpret_cast(embedding_bias.data()); - - decoding_params.position_encoding_table = reinterpret_cast( - position_encoding_table.data()); - - int finished_candidate_num_ = - ("beam_search_v3" == decoding_strategy) ? beam_width_ : beam_width_ * 2; - - ActivationType activate = - (hidden_act == "gelu") ? 
ActivationType::GELU : ActivationType::RELU; - - if ("beam_search" == decoding_strategy) { - DecodingBeamsearch* decoding_beam_search_; - decoding_beam_search_ = new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, /*is_fuse_topk_softMax*/ - false, /*is_fuse_qkv*/ - false, /*keep_alive_beam*/ - alpha, /*alpha not used for this case*/ - true, - 2, /*pos_offset BART and MBART only for now*/ - activate, - false, // pos_bias - false /*prefix_lm*/, - -1, /*finished_candidate_num*/ - false, /*early_stopping*/ - true /*is_mbart */); - - decoding_beam_search_->forward(params, decoding_params); - - delete decoding_beam_search_; - } else if ("beam_search_v2" == decoding_strategy || - "beam_search_v3" == decoding_strategy) { - DecodingBeamsearch* decoding_beam_search_; - decoding_beam_search_ = new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, // is_fuse_topk_softMax - false, // is_fuse_qkv - true, // keep_alive_beam - alpha, - true, - 2, /*pos_offset BART and MBART only for now*/ - activate, - false, // pos_bias - false /*prefix_lm*/, - finished_candidate_num_, /*finished_candidate_num*/ - early_stopping, /*early_stopping*/ - true /*is_mbart */); - - decoding_beam_search_->forward(params, decoding_params); - - delete decoding_beam_search_; - } else if ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || - "sampling" == decoding_strategy) { - DecodingSampling* decoding_sampling_; - decoding_sampling_ = new DecodingSampling( - allocator_, - batch_size_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - candidate_num_, - probability_threshold_, - false, /*is_fuse_qkv*/ - true, - 2, /*pos_offset BART and MBART only for now*/ - activate, - false, // pos_bias - temperature, // temperature - 1.0, // repeat_penalty - false, // prefix_lm - true /*is_mbart */); - - decoding_sampling_->forward(params, decoding_params); - - delete decoding_sampling_; - } else { - PD_THROW( - "Only beam_search, beam_search_v2 and sampling are supported for " - "FastGeneration. 
"); - } - delete[] params; - - return {output_ids, parent_ids, sequence_length}; -} - -std::vector MBartDecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& mbart_ln_weight, - const paddle::Tensor& mbart_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& trg_word, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const float& alpha, - const bool& early_stopping, - const std::string& hidden_act) { - auto stream = input.stream(); - cublasSetStream(CublasHandle::GetInstance()->cublas_handle_, stream); - - std::vector ret; - - switch (input.type()) { - case paddle::DataType::FLOAT16: { - ret = mbart_decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - mbart_ln_weight, - mbart_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - trg_word, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - max_len, - beam_search_diversity_rate, - alpha, - early_stopping, - hidden_act, - stream); - break; - } - case paddle::DataType::FLOAT32: { - ret = mbart_decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - 
cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - mbart_ln_weight, - mbart_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - trg_word, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - max_len, - beam_search_diversity_rate, - alpha, - early_stopping, - hidden_act, - stream); - break; - } - default: { - PD_THROW( - "NOT supported data type. " - "Only float16 and float32 are supported. "); - break; - } - } - - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.h b/paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.h deleted file mode 100644 index cf21beea10f0..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_mbart_decoding_op.h +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include -#include - -#include "fastertransformer/decoding_beamsearch.h" -#include "fastertransformer/decoding_sampling.h" -#include "fastertransformer/open_decoder.h" -#include "fastertransformer/utils/common.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - - -std::vector MBartDecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& mbart_ln_weight, - const paddle::Tensor& mbart_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& trg_word, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& 
sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& head_num_, - const int& size_per_head_, - const int& num_layer_, - const int& start_id_, - const int& end_id_, - const float& temperature, - const int64_t& max_seq_len_, - const float& beam_search_diversity_rate_, - const float& alpha, - const bool& early_stopping, - const std::string& hidden_act); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_miro_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_miro_op.cc deleted file mode 100644 index 0f5a1a2221c8..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_miro_op.cc +++ /dev/null @@ -1,427 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include - -#include "fusion_miro_op.h" -#include "pd_traits.h" - - -std::vector MIROForward( - const paddle::Tensor& input_ids, - const paddle::Tensor& attn_mask, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& type_id, - const paddle::Tensor& decoder_type_id, - const paddle::Tensor& logits_mask, - const paddle::Tensor& word_embedding, - const paddle::Tensor& pre_decoder_ln_weight, - const paddle::Tensor& pre_decoder_ln_bias, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& trans_weight, - const paddle::Tensor& trans_bias, - const paddle::Tensor& lm_ln_weight, - const paddle::Tensor& lm_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& type_embedding_weight, - const paddle::Tensor& role_id, - const paddle::Tensor& decoder_role_id, - const paddle::Tensor& role_embedding_table, - const paddle::Tensor& position_ids, - const paddle::Tensor& decoder_position_ids, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const int& unk_id, - const int& mask_id, - const float& temperature, - const float& len_penalty, - const bool& normalize_before, - const bool& pos_bias, - const std::string& hidden_act, - const bool& rel_len, - const bool& early_stopping, - const int& min_length, - const int& tensor_para_size, - const int& 
layer_para_size, - const int& layer_para_batch_size) { - int batch_size = input_ids.shape()[0]; - int max_out_len = rel_len ? max_len + input_ids.shape()[1] : max_len; - - std::vector output_ids_dims; - std::vector output_scores_dims; - std::vector parent_ids_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - if (batch_size != -1) { - batch_size /= beam_size; - } - output_ids_dims = {max_out_len, batch_size, beam_size}; - output_scores_dims = {batch_size, beam_size}; - parent_ids_dims = output_ids_dims; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_ids_dims = {max_out_len, batch_size, beam_size * 2}; - output_scores_dims = {batch_size, beam_size * 2}; - parent_ids_dims = output_ids_dims; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling" || - decoding_strategy == "sampling") { - output_ids_dims = {max_out_len, batch_size}; - output_scores_dims = {batch_size}; - parent_ids_dims = {1}; - } else { - PD_THROW("Not supported decoding strategy. "); - } - auto output_ids = paddle::Tensor(input_ids.place(), output_ids_dims); - auto parent_ids = paddle::Tensor(input_ids.place(), parent_ids_dims); - auto sequence_length = - paddle::Tensor(input_ids.place(), sequence_length_dims); - auto output_scores = paddle::Tensor(input_ids.place(), output_scores_dims); - - if (input_ids.place() == paddle::PlaceType::kGPU) { - auto mem_seq_length = paddle::Tensor(paddle::PlaceType::kGPU); - - if (mem_seq_len.place() != paddle::PlaceType::kGPU) { - mem_seq_length = mem_seq_len.copy_to(paddle::PlaceType::kGPU); - } else { - mem_seq_length = mem_seq_len; - } - - return MIROCUDAForward(input_ids, - attn_mask, - mem_seq_length, - type_id, - decoder_type_id, - logits_mask, - word_embedding, - pre_decoder_ln_weight, - pre_decoder_ln_bias, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - trans_weight, - trans_bias, - lm_ln_weight, - lm_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - type_embedding_weight, - role_id, - decoder_role_id, - role_embedding_table, - position_ids, - decoder_position_ids, - output_ids, - parent_ids, - sequence_length, - output_scores, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_out_len, - beam_search_diversity_rate, - unk_id, - mask_id, - temperature, - len_penalty, - normalize_before, - pos_bias, - hidden_act, - early_stopping, - min_length, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - } else { - PD_THROW("Not implemented place. Only GPU is supported. 
"); - } -} - -std::vector> MIROInferShape( - const std::vector& input_ids_shape, - const std::vector& attn_mask_shape, - const std::vector& mem_seq_len_shape, - const std::vector& logits_mask_shape, - const std::vector& type_id_shape, - const std::vector& decoder_type_id_shape, - const std::vector& word_embedding_shape, - const std::vector& pre_decoder_ln_weight_shape, - const std::vector& pre_decoder_ln_bias_shape, - const std::vector>& self_ln_weight_shapes, - const std::vector>& self_ln_bias_shapes, - const std::vector>& self_q_weight_shapes, - const std::vector>& self_q_bias_shapes, - const std::vector>& self_k_weight_shapes, - const std::vector>& self_k_bias_shapes, - const std::vector>& self_v_weight_shapes, - const std::vector>& self_v_bias_shapes, - const std::vector>& self_out_weight_shapes, - const std::vector>& self_out_bias_shapes, - const std::vector>& ffn_ln_weight_shapes, - const std::vector>& ffn_ln_bias_shapes, - const std::vector>& ffn_inter_weight_shapes, - const std::vector>& ffn_inter_bias_shapes, - const std::vector>& ffn_out_weight_shapes, - const std::vector>& ffn_out_bias_shapes, - const std::vector& decoder_ln_weight_shape, - const std::vector& decoder_ln_bias_shape, - const std::vector& trans_weight_shape, - const std::vector& trans_bias_shape, - const std::vector& lm_ln_weight_shape, - const std::vector& lm_ln_bias_shape, - const std::vector& embedding_weight_shape, - const std::vector& embedding_bias_shape, - const std::vector& positional_embedding_weight_shape, - const std::vector& type_embedding_weight_shape, - const std::vector& role_id_shape, - const std::vector& decoder_role_id_shape, - const std::vector& role_embedding_table_shape, - const std::vector& position_ids_shape, - const std::vector& decoder_position_ids_shape, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const int& unk_id, - const int& mask_id, - const float& temperature, - const float& len_penalty, - const bool& normalize_before, - const bool& pos_bias, - const std::string& hidden_act, - const bool& rel_len, - const bool& early_stopping, - const int& min_length, - const int& tensor_para_size = 1, - const int& layer_para_size = 1, - const int& layer_para_batch_size = 1) { - int batch_size = input_ids_shape[0]; - - std::vector output_ids_dims; - std::vector output_scores_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - if (batch_size != -1) { - batch_size /= beam_size; - } - output_ids_dims = {max_len, batch_size, beam_size}; - output_scores_dims = {batch_size, beam_size}; - return {output_ids_dims, output_ids_dims, sequence_length_dims, output_scores_dims}; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. 
- if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_ids_dims = {max_len, batch_size, beam_size * 2}; - output_scores_dims = {batch_size, beam_size * 2}; - return {output_ids_dims, output_ids_dims, sequence_length_dims, output_scores_dims}; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling" || - decoding_strategy == "sampling") { - output_ids_dims = {max_len, batch_size}; - output_scores_dims = {batch_size}; - return {output_ids_dims, {1}, sequence_length_dims, output_scores_dims}; - } else { - PD_THROW("Not supported decoding strategy. "); - } -} - -std::vector MIROInferDtype( - const paddle::DataType& input_ids, - const paddle::DataType& attn_mask, - const paddle::DataType& mem_seq_len, - const paddle::DataType& logits_mask, - const paddle::DataType& type_id, - const paddle::DataType& decoder_type_id, - const paddle::DataType& word_embedding, - const paddle::DataType& pre_decoder_ln_weight, - const paddle::DataType& pre_decoder_ln_bias, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::DataType& decoder_ln_weight, - const paddle::DataType& decoder_ln_bias, - const paddle::DataType& trans_weight, - const paddle::DataType& trans_bias, - const paddle::DataType& lm_ln_weight, - const paddle::DataType& lm_ln_bias, - const paddle::DataType& embedding_weight, - const paddle::DataType& embedding_bias, - const paddle::DataType& positional_embedding_weight, - const paddle::DataType& type_embedding_weight, - const paddle::DataType& role_id, - const paddle::DataType& decoder_role_id, - const paddle::DataType& role_embedding_table, - const paddle::DataType& position_ids, - const paddle::DataType& decoder_position_ids) { - return {paddle::DataType::INT32, - paddle::DataType::INT32, - paddle::DataType::INT32, - paddle::DataType::FLOAT32}; -} - -PD_BUILD_OP(fusion_miro) - .Inputs({"InputIds", - "AttnMask", - "MemSeqLen", - "TypeIds", - "DecTypeIds", - "LogitsMask", - "WordEmbedding", - "PreDecoderLayernormWeight", - "PreDecoderLayernormBias", - paddle::Vec("SelfLayernormWeight"), - paddle::Vec("SelfLayernormBias"), - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfQueryBias"), - paddle::Vec("SelfKeyWeight"), - paddle::Vec("SelfKeyBias"), - paddle::Vec("SelfValueWeight"), - paddle::Vec("SelfValueBias"), - paddle::Vec("SelfOutWeight"), - paddle::Vec("SelfOutBias"), - paddle::Vec("FFNLayernormWeight"), - paddle::Vec("FFNLayernormBias"), - paddle::Vec("FFNInterWeight"), - paddle::Vec("FFNInterBias"), - paddle::Vec("FFNOutWeight"), - paddle::Vec("FFNOutBias"), - "DecoderLayernormWeight", - "DecoderLayernormBias", - "TransWeight", - "TransBias", - "LMLayernormWeight", - "LMLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb", - "TypeEmb", - "RoleIds", - "DecRoleIds", - "RoleEmbedding", - "PositionIds", - "DecPositionIds"}) - .Outputs({"OutputIds", "ParentIds", "SequenceLength", "OutputScores"}) - 
.Attrs({"decoding_strategy: std::string", - "beam_size: int", - "topk: int", - "topp: float", - "n_head: int", - "size_per_head: int", - "num_layer: int", - "bos_id: int", - "eos_id: int", - "max_len: int64_t", - "beam_search_diversity_rate: float", - "unk_id: int", - "mask_id: int", - "temperature: float", - "len_penalty: float", - "normalize_before: bool", - "pos_bias: bool", - "hidden_act: std::string", - "rel_len: bool", - "early_stopping: bool", - "min_length: int", - "tensor_para_size: int", - "layer_para_size: int", - "layer_para_batch_size: int"}) - .SetKernelFn(PD_KERNEL(MIROForward)) - .SetInferShapeFn(PD_INFER_SHAPE(MIROInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(MIROInferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_miro_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_miro_op.cu deleted file mode 100644 index db3d57d7d423..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_miro_op.cu +++ /dev/null @@ -1,710 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// TODO(guosheng): `HOST` conflict exists in float.h of paddle and mpi.h of mpi -#include "fusion_miro_op.h" -#include "pd_traits.h" -#ifdef HOST -#undef HOST -#endif - -#include "fastertransformer/decoding_beamsearch.h" -#include "fastertransformer/decoding_sampling.h" -#include "fastertransformer/utils/common.h" -#include "fastertransformer/utils/arguments.h" - -#ifdef BUILD_GPT // consistent with FasterTransformer -#include "parallel_utils.h" -#endif - - -template -std::vector miro_decoding_kernel( - const paddle::Tensor& input_ids, - const paddle::Tensor& attn_mask, - const paddle::Tensor& memory_sequence_length, - const paddle::Tensor& type_id, - const paddle::Tensor& decoder_type_id, - const paddle::Tensor& logits_mask, - const paddle::Tensor& word_emb, - const paddle::Tensor& pre_decoder_layernorm_weight, - const paddle::Tensor& pre_decoder_layernorm_bias, - const std::vector& self_layernorm_weight, - const std::vector& self_layernorm_bias, - const std::vector& self_attn_query_weight, - const std::vector& self_attn_query_bias, - const std::vector& self_attn_key_weight, - const std::vector& self_attn_key_bias, - const std::vector& self_attn_value_weight, - const std::vector& self_attn_value_bias, - const std::vector& self_attn_output_weight, - const std::vector& self_attn_output_bias, - const std::vector& ffn_layernorm_weight, - const std::vector& ffn_layernorm_bias, - const std::vector& ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - const paddle::Tensor& decoder_layernorm_weight, - const paddle::Tensor& decoder_layernorm_bias, - const paddle::Tensor& trans_weight, - const paddle::Tensor& trans_bias, - const paddle::Tensor& lm_layernorm_weight, - const paddle::Tensor& lm_layernorm_bias, - const paddle::Tensor& embedding_weight, - const 
paddle::Tensor& embedding_bias, - const paddle::Tensor& position_encoding_table, - const paddle::Tensor& type_embedding_weight, - const paddle::Tensor& role_id, - const paddle::Tensor& decoder_role_id, - const paddle::Tensor& role_embedding_table, - const paddle::Tensor& position_ids, - const paddle::Tensor& decoder_position_ids, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - paddle::Tensor& output_scores, - const std::string& decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int head_num_, - const int size_per_head_, - const int num_layer_, - const int start_id_, - const int end_id_, - const int64_t max_seq_len_, - const float beam_search_diversity_rate_, - const int unk_id, - const int mask_id, - const float temperature, - const float len_penalty, - const bool normalize_before, - const bool pos_bias, - const std::string& hidden_act, - const bool early_stopping, - const int min_length, - cudaStream_t stream, - const int tensor_para_size = 1, - const int layer_para_size = 1, - const int layer_para_batch_size = 1) { - int beam_width_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? beam_size - : 1; - int candidate_num_ = - ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || "sampling" == decoding_strategy) - ? topk - : 1; - float probability_threshold_ = - ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || "sampling" == decoding_strategy) - ? topp - : 0.0; - - auto input_ids_dims = input_ids.shape(); - int batch_size_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? 
input_ids_dims[0] / beam_width_ - : input_ids_dims[0]; - const int memory_max_seq_len = input_ids_dims[1]; - const int memory_hidden_dim = head_num_ * size_per_head_; - const int vocab_size = word_emb.shape()[0]; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - DecodingInitParam decoding_params; - decoding_params.cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - decoding_params.cublaslt_handle = - CublasHandle::GetInstance()->cublaslt_handle_; - - decoding_params.output_ids = output_ids.mutable_data(input_ids.place()); - decoding_params.parent_ids = parent_ids.mutable_data(input_ids.place()); - decoding_params.sequence_length = - sequence_length.mutable_data(input_ids.place()); - decoding_params.output_scores = output_scores.mutable_data(input_ids.place()); - - typedef DecoderTransformerTraits DecodingTraits_; - decoding_params.stream = stream; - fastertransformer::Allocator allocator_(stream); - - decoding_params.d_start_ids = const_cast(input_ids.data()); - decoding_params.d_attn_mask = - reinterpret_cast(const_cast(attn_mask.data())); - decoding_params.d_start_lengths = memory_sequence_length.data(); - - decoding_params.memory_sequence_length = memory_sequence_length.data(); - decoding_params.type_id = type_id.data(); - decoding_params.decoder_type_id = decoder_type_id.data(); - - if (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - decoding_params.request_batch_size = batch_size_ * beam_width_; - } else if (decoding_strategy == "sampling" || - decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - decoding_params.request_batch_size = batch_size_; - } - decoding_params.max_input_len = memory_max_seq_len; - decoding_params.request_input_len = memory_max_seq_len; - decoding_params.request_output_len = max_seq_len_; - -#ifdef BUILD_GPT - auto* model_para_desc = ModelParaDescFactory::CreateModelParaDesc( - head_num_, - size_per_head_, - num_layer_, - tensor_para_size, - layer_para_size, - layer_para_batch_size, - const_cast(word_emb.data())); - auto& tensor_parallel_param = model_para_desc->tensor_parallel_param; - auto& layer_parallel_param = model_para_desc->layer_parallel_param; - auto seed = model_para_desc->dist(model_para_desc->gen); -#else - TensorParallelParam tensor_parallel_param; - LayerParallelParam layer_parallel_param; - tensor_parallel_param.rank = 0; - tensor_parallel_param.world_size = 1; - tensor_parallel_param.local_head_num_ = head_num_; - tensor_parallel_param.local_hidden_units_ = memory_hidden_dim; - - layer_parallel_param.rank = 0; - layer_parallel_param.world_size = 1; - layer_parallel_param.layers_per_group = num_layer_; - layer_parallel_param.local_batch_size = batch_size_; - int seed = -1; -#endif - - DecoderInitParam* params = - new DecoderInitParam[num_layer_]; - - // Allow python passing partial weights for model parallel. - int inner_coeff = - (memory_hidden_dim == self_attn_output_weight[0].shape()[0]) - ? ffn_intermediate_weight[0].shape()[1] / memory_hidden_dim - : (ffn_intermediate_weight[0].shape()[1] * tensor_para_size / - memory_hidden_dim); - - for (int i = 0; i < self_layernorm_weight.size(); i++) { - // Allow python passing weights of all layers or only passing the - // corresponding layers to save memory. - int layer_idx = self_layernorm_weight.size() != num_layer_ - ? 
layer_parallel_param.rank * - layer_parallel_param.layers_per_group + - i - : i; - params[layer_idx].stream = stream; - params[layer_idx].cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - params[layer_idx].cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - - if (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - params[layer_idx].request_batch_size = batch_size_ * beam_width_; - params[layer_idx].request_max_mem_seq_len = memory_max_seq_len; - } else if (decoding_strategy == "sampling" || - decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - params[layer_idx].request_batch_size = batch_size_; - params[layer_idx].request_max_mem_seq_len = memory_max_seq_len; - } - - // self attn - params[layer_idx].self_layernorm.gamma = reinterpret_cast( - self_layernorm_weight[i].data()); - params[layer_idx].self_layernorm.beta = reinterpret_cast( - self_layernorm_bias[i].data()); - // query - params[layer_idx].self_attention.query_weight.kernel = - reinterpret_cast( - self_attn_query_weight[i].data()); - params[layer_idx].self_attention.query_weight.bias = - reinterpret_cast( - self_attn_query_bias[i].data()); - // key - params[layer_idx].self_attention.key_weight.kernel = - reinterpret_cast( - self_attn_key_weight[i].data()); - params[layer_idx].self_attention.key_weight.bias = - reinterpret_cast( - self_attn_key_bias[i].data()); - // value - params[layer_idx].self_attention.value_weight.kernel = - reinterpret_cast( - self_attn_value_weight[i].data()); - params[layer_idx].self_attention.value_weight.bias = - reinterpret_cast( - self_attn_value_bias[i].data()); - // out proj - params[layer_idx].self_attention.attention_output_weight.kernel = - reinterpret_cast( - self_attn_output_weight[i].data()); - - params[layer_idx].self_attention.attention_output_weight.bias = - reinterpret_cast( - self_attn_output_bias[i].data()); - - // ffn - params[layer_idx].ffn_layernorm.gamma = reinterpret_cast( - ffn_layernorm_weight[i].data()); - params[layer_idx].ffn_layernorm.beta = reinterpret_cast( - ffn_layernorm_bias[i].data()); - // intermediate proj - params[layer_idx].ffn.intermediate_weight.kernel = - reinterpret_cast( - ffn_intermediate_weight[i].data()); - params[layer_idx].ffn.intermediate_weight.bias = reinterpret_cast( - ffn_intermediate_bias[i].data()); - // out proj - params[layer_idx].ffn.output_weight.kernel = reinterpret_cast( - ffn_output_weight[i].data()); - params[layer_idx].ffn.output_weight.bias = - reinterpret_cast(ffn_output_bias[i].data()); - } - - decoding_params.pre_layernorm.gamma = reinterpret_cast( - pre_decoder_layernorm_weight.data()); - decoding_params.pre_layernorm.beta = reinterpret_cast( - pre_decoder_layernorm_bias.data()); - - decoding_params.layernorm.gamma = reinterpret_cast( - decoder_layernorm_weight.data()); - decoding_params.layernorm.beta = reinterpret_cast( - decoder_layernorm_bias.data()); - decoding_params.trans_kernel = - reinterpret_cast(trans_weight.data()); - decoding_params.trans_bias = - reinterpret_cast(trans_bias.data()); - - decoding_params.lm_layernorm.gamma = - reinterpret_cast(lm_layernorm_weight.data()); - decoding_params.lm_layernorm.beta = - reinterpret_cast(lm_layernorm_bias.data()); - - // For embedding - decoding_params.embedding_table = - reinterpret_cast(word_emb.data()); - // For weight sharing matmul - decoding_params.embedding_kernel = - reinterpret_cast(embedding_weight.data()); - // For matmul bias - 
decoding_params.embedding_bias = - reinterpret_cast(embedding_bias.data()); - decoding_params.position_encoding_table = reinterpret_cast( - position_encoding_table.data()); - - // For masking some id during gen. - decoding_params.logits_mask = - reinterpret_cast(logits_mask.data()); - - decoding_params.type_table = - reinterpret_cast(type_embedding_weight.data()); - - // For role embedding. - auto role_id_shape = role_id.shape(); - if (role_id_shape.size() > 0 && numel(role_id_shape) > 0) { - decoding_params.role_id = role_id.data(); - decoding_params.decoder_role_id = decoder_role_id.data(); - decoding_params.role_embedding_table = - reinterpret_cast(role_embedding_table.data()); - } - - auto position_id_shape = position_ids.shape(); - if (position_id_shape.size() > 0 && numel(position_id_shape) > 0) { - decoding_params.position_ids = position_ids.data(); - decoding_params.decoder_position_ids = decoder_position_ids.data(); - } - - ActivationType activate = - (hidden_act == "gelu") ? ActivationType::GELU : ActivationType::RELU; - - int finished_candidate_num_ = - ("beam_search_v3" == decoding_strategy) ? beam_width_ : beam_width_ * 2; - - if ("beam_search" == decoding_strategy) { - DecodingBeamsearch* miro_beam_search_; - - miro_beam_search_ = - new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, /*is_fuse_topk_softMax*/ - true, /*is_fuse_qkv*/ - false, /*keep_alive_beam*/ - len_penalty, /*alpha not used for this case*/ - normalize_before, - 0, /*pos_offset BART only for now*/ - activate, - pos_bias, - true, /*prefix_lm*/ - -1, /*finished_candidate_num*/ - false, /*early_stopping*/ - false, /*is_mbart*/ - min_length, - inner_coeff, - true); /*is_miro*/ - miro_beam_search_->set_tensor_parallel_param( - tensor_parallel_param); - miro_beam_search_->set_layer_parallel_param( - layer_parallel_param); - miro_beam_search_->forward_context(params, decoding_params); - miro_beam_search_->forward(params, decoding_params); - - delete miro_beam_search_; - } else if ("beam_search_v2" == decoding_strategy || - "beam_search_v3" == decoding_strategy) { - DecodingBeamsearch* miro_beam_search_; - - miro_beam_search_ = - new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, /*is_fuse_topk_softMax*/ - true, /*is_fuse_qkv*/ - true, /*keep_alive_beam*/ - len_penalty, - normalize_before, - 0, /*pos_offset BART only for now*/ - activate, - pos_bias, - true, /*prefix_lm*/ - finished_candidate_num_, - early_stopping, - false, /*is_mbart*/ - min_length, - inner_coeff, - true); /*is_miro*/ - miro_beam_search_->forward_context(params, decoding_params); - miro_beam_search_->forward(params, decoding_params); - - delete miro_beam_search_; - } else if ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || - "sampling" == decoding_strategy) { - DecodingSampling* miro_sampling_; - - miro_sampling_ = new DecodingSampling( - allocator_, - batch_size_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - candidate_num_, - probability_threshold_, - true, /*is_fuse_qkv*/ - normalize_before, - 0, /*pos_offset BART only for now*/ 
- activate, - pos_bias, - temperature, - 1.0, /*repeat_penalty*/ - true, /*prefix_lm*/ - false, /*is_mbart*/ - min_length, - inner_coeff, - seed, - tensor_para_size, - layer_para_size, - true); /*is_miro*/ - miro_sampling_->set_tensor_parallel_param( - tensor_parallel_param); - miro_sampling_->set_layer_parallel_param(layer_parallel_param); - miro_sampling_->forward_context(params, decoding_params); - miro_sampling_->forward(params, decoding_params); - - delete miro_sampling_; - } else { - PD_THROW( - "Only beam_search, beam_search_v2, topk_sampling and topp_sampling are " - "supported for " - "FasterTransformer. "); - } - delete[] params; - - return {output_ids, parent_ids, sequence_length, output_scores}; -} - -std::vector MIROCUDAForward( - const paddle::Tensor& input_ids, - const paddle::Tensor& attn_mask, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& type_id, - const paddle::Tensor& decoder_type_id, - const paddle::Tensor& logits_mask, - const paddle::Tensor& word_embedding, - const paddle::Tensor& pre_decoder_ln_weight, - const paddle::Tensor& pre_decoder_ln_bias, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& trans_weight, - const paddle::Tensor& trans_bias, - const paddle::Tensor& lm_ln_weight, - const paddle::Tensor& lm_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& type_embedding_weight, - const paddle::Tensor& role_id, - const paddle::Tensor& decoder_role_id, - const paddle::Tensor& role_embedding_table, - const paddle::Tensor& position_ids, - const paddle::Tensor& decoder_position_ids, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - paddle::Tensor& output_scores, - const std::string& decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const int64_t max_len, - const float beam_search_diversity_rate, - const int unk_id, - const int mask_id, - const float temperature, - const float len_penalty, - const bool normalize_before, - const bool pos_bias, - const std::string& hidden_act, - const bool early_stopping, - const int min_length, - const int tensor_para_size = 1, - const int layer_para_size = 1, - const int layer_para_batch_size = 1) { - auto stream = input_ids.stream(); - - cublasSetStream(CublasHandle::GetInstance()->cublas_handle_, stream); - - std::vector ret; - - switch (self_ln_weight[0].type()) { - case paddle::DataType::FLOAT16: { - ret = miro_decoding_kernel( - input_ids, - attn_mask, - mem_seq_len, - type_id, - decoder_type_id, - logits_mask, - word_embedding, - pre_decoder_ln_weight, - pre_decoder_ln_bias, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - 
self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - trans_weight, - trans_bias, - lm_ln_weight, - lm_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - type_embedding_weight, - role_id, - decoder_role_id, - role_embedding_table, - position_ids, - decoder_position_ids, - output_ids, - parent_ids, - sequence_length, - output_scores, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - beam_search_diversity_rate, - unk_id, - mask_id, - temperature, - len_penalty, - normalize_before, - pos_bias, - hidden_act, - early_stopping, - min_length, - stream, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - break; - } - case paddle::DataType::FLOAT32: { - ret = miro_decoding_kernel( - input_ids, - attn_mask, - mem_seq_len, - type_id, - decoder_type_id, - logits_mask, - word_embedding, - pre_decoder_ln_weight, - pre_decoder_ln_bias, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - trans_weight, - trans_bias, - lm_ln_weight, - lm_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - type_embedding_weight, - role_id, - decoder_role_id, - role_embedding_table, - position_ids, - decoder_position_ids, - output_ids, - parent_ids, - sequence_length, - output_scores, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - beam_search_diversity_rate, - unk_id, - mask_id, - temperature, - len_penalty, - normalize_before, - pos_bias, - hidden_act, - early_stopping, - min_length, - stream, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - break; - } - default: { - PD_THROW( - "NOT supported data type. " - "Only float16 and float32 are supported. "); - break; - } - } - - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_miro_op.h b/paddlenlp/ops/fast_transformer/src/fusion_miro_op.h deleted file mode 100644 index c8213cb1dcad..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_miro_op.h +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#pragma once - -#include -#include - -// #include "fastertransformer/decoding_beamsearch.h" -// #include "fastertransformer/decoding_sampling.h" -// #include "fastertransformer/open_decoder.h" -// #include "fastertransformer/utils/common.h" -#include "cublas_handle.h" -#include "utils.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - - -std::vector MIROCUDAForward( - const paddle::Tensor& input_ids, - const paddle::Tensor& attn_mask, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& type_id, - const paddle::Tensor& decoder_type_id, - const paddle::Tensor& logits_mask, - const paddle::Tensor& word_embedding, - const paddle::Tensor& pre_decoder_ln_weight, - const paddle::Tensor& pre_decoder_ln_bias, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& trans_weight, - const paddle::Tensor& trans_bias, - const paddle::Tensor& lm_ln_weight, - const paddle::Tensor& lm_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& type_embedding_weight, - const paddle::Tensor& role_id, - const paddle::Tensor& decoder_role_id, - const paddle::Tensor& role_embedding_table, - const paddle::Tensor& position_ids, - const paddle::Tensor& decoder_position_ids, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - paddle::Tensor& output_scores, - const std::string& decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const int64_t max_len, - const float beam_search_diversity_rate, - const int unk_id, - const int mask_id, - const float temperature, - const float len_penalty, - const bool normalize_before, - const bool pos_bias, - const std::string& hidden_act, - const bool early_stopping, - const int min_length, - const int tensor_para_size, - const int layer_para_size, - const int layer_para_batch_size); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_opt_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_opt_op.cc deleted file mode 100644 index 21d4f1b2bad0..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_opt_op.cc +++ /dev/null @@ -1,227 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "fusion_opt_op.h" -#include "pd_traits.h" - - -std::vector OPTForward( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const paddle::Tensor& start_length, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& emb_weight, - const bool& normalize_before, - const int& topk, - const float& topp, - const int& max_len, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - const bool& use_fp16 = false, - const int& tensor_para_size = 1, - const int& layer_para_size = 1, - const int& layer_para_batch_size = 1) { - int batch_size = input.shape()[0]; - int start_len = input.shape()[1]; - int total_len = max_len + start_len; - std::vector output_dims({total_len, batch_size}); - auto output_ids = paddle::Tensor(input.place(), output_dims); - - if (word_embedding.place() == paddle::PlaceType::kGPU) { - return OPTCUDAForward(input, - attn_mask, - start_length, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - positional_embedding_weight, - emb_weight, - output_ids, - normalize_before, - topk, - topp, - total_len, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - use_fp16, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - } else { - PD_THROW("Not implemented place. Only GPU is supported. 
"); - } -} - -std::vector> OPTInferShape( - const std::vector& input_shape, - const std::vector& attn_mask_shape, - const std::vector& start_length, - const std::vector& word_embedding_shape, - const std::vector>& self_ln_weight_shapes, - const std::vector>& self_ln_bias_shapes, - const std::vector>& self_q_weight_shapes, - const std::vector>& self_q_bias_shapes, - const std::vector>& self_k_weight_shapes, - const std::vector>& self_k_bias_shapes, - const std::vector>& self_v_weight_shapes, - const std::vector>& self_v_bias_shapes, - const std::vector>& self_out_weight_shapes, - const std::vector>& self_out_bias_shapes, - const std::vector>& ffn_ln_weight_shapes, - const std::vector>& ffn_ln_bias_shapes, - const std::vector>& ffn_inter_weight_shapes, - const std::vector>& ffn_inter_bias_shapes, - const std::vector>& ffn_out_weight_shapes, - const std::vector>& ffn_out_bias_shapes, - const std::vector& decoder_ln_weight_shape, - const std::vector& decoder_ln_bias_shape, - const std::vector& positional_embedding_weight_shape, - const std::vector& emb_weight_shape, - const bool& normalize_before, - const int& topk, - const float& topp, - const int& max_len, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - const bool& use_fp16 = false, - const int& tensor_para_size = 1, - const int& layer_para_size = 1, - const int& layer_para_batch_size = 1) { - int64_t batch_size = input_shape[0]; - int64_t start_len = input_shape[1]; - std::vector output_dims({max_len + start_len, batch_size}); - return {output_dims}; -} - -std::vector OPTInferDtype( - const paddle::DataType& input_dtype, - const paddle::DataType& attn_mask_dtype, - const paddle::DataType& start_length_dtype, - const paddle::DataType& word_embedding_dtype, - const std::vector& self_ln_weight_dtype, - const std::vector& self_ln_bias_dtype, - const std::vector& self_q_weight_dtype, - const std::vector& self_q_bias_dtype, - const std::vector& self_k_weight_dtype, - const std::vector& self_k_bias_dtype, - const std::vector& self_v_weight_dtype, - const std::vector& self_v_bias_dtype, - const std::vector& self_out_weight_dtype, - const std::vector& self_out_bias_dtype, - const std::vector& ffn_ln_weight_dtype, - const std::vector& ffn_ln_bias_dtype, - const std::vector& ffn_inter_weight_dtype, - const std::vector& ffn_inter_bias_dtype, - const std::vector& ffn_out_weight_dtype, - const std::vector& ffn_out_bias_dtype, - const paddle::DataType& decoder_ln_weight_dtype, - const paddle::DataType& decoder_ln_bias_dtype, - const paddle::DataType& positional_embedding_weight_dtype, - const paddle::DataType& emb_weight_dtype) { - return {paddle::DataType::INT32}; -} - -PD_BUILD_OP(fusion_opt) - .Inputs({"Input", - "AttentionMask", - "StartLength", - "WordEmbedding", - paddle::Vec("SelfLayernormWeight"), - paddle::Vec("SelfLayernormBias"), - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfQueryBias"), - paddle::Vec("SelfKeyWeight"), - paddle::Vec("SelfKeyBias"), - paddle::Vec("SelfValueWeight"), - paddle::Vec("SelfValueBias"), - paddle::Vec("SelfOutWeight"), - paddle::Vec("SelfOutBias"), - paddle::Vec("FFNLayernormWeight"), - paddle::Vec("FFNLayernormBias"), - paddle::Vec("FFNInterWeight"), - paddle::Vec("FFNInterBias"), - paddle::Vec("FFNOutWeight"), - paddle::Vec("FFNOutBias"), - "DecoderLayernormWeight", - "DecoderLayernormBias", - "PositionEncEmb", - "EmbWeight"}) - .Outputs({"OutputIds"}) - .Attrs({"normalize_before: bool", - "topk: int", - "topp: 
float", - "max_len: int", - "n_head: int", - "size_per_head: int", - "num_layer: int", - "bos_id: int", - "eos_id: int", - "temperature: float", - "use_fp16: bool", - "tensor_para_size: int", - "layer_para_size: int", - "layer_para_batch_size: int"}) - .SetKernelFn(PD_KERNEL(OPTForward)) - .SetInferShapeFn(PD_INFER_SHAPE(OPTInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(OPTInferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_opt_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_opt_op.cu deleted file mode 100644 index 6af9f9a381ba..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_opt_op.cu +++ /dev/null @@ -1,384 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// TODO(guosheng): `HOST` conflict exists in float.h of paddle and mpi.h of mpi -#include "fusion_opt_op.h" -#include "pd_traits.h" -#ifdef HOST -#undef HOST -#endif - -#include "fastertransformer/opt.h" -#include "fastertransformer/utils/common.h" - -#ifdef BUILD_GPT // consistent with FasterTransformer -#include "parallel_utils.h" -#endif - -template -std::vector opt_kernel( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const paddle::Tensor& start_length, - const paddle::Tensor& word_emb, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& emb_weight, - paddle::Tensor& output_ids, - const bool& normalize_before, - const int& topk, - const float& topp, - const int& max_len, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - cublasHandle_t cublas_handle_, - cublasLtHandle_t cublaslt_handle_, - cudaStream_t stream, - const int& tensor_para_size = 1, - const int& layer_para_size = 1, - const int& layer_para_batch_size = 1) { - auto input_dims = input.shape(); - int batch_size_ = input_dims[0]; - int start_len = input_dims[1]; - const int vocab_size = word_emb.shape()[0]; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - DecodingInitParam decoding_params; - decoding_params.cublas_handle = cublas_handle_; - decoding_params.cublaslt_handle = cublaslt_handle_; - - 
decoding_params.output_ids = output_ids.mutable_data(word_emb.place()); - - typedef DecoderTransformerTraits DecodingTraits_; - decoding_params.stream = stream; - fastertransformer::Allocator allocator_(stream); - - const int hidden_unit = size_per_head * n_head; - -#ifdef BUILD_GPT - auto* model_para_desc = ModelParaDescFactory::CreateModelParaDesc( - n_head, - size_per_head, - num_layer, - tensor_para_size, - layer_para_size, - layer_para_batch_size, - const_cast(word_emb.data())); - auto& tensor_parallel_param = model_para_desc->tensor_parallel_param; - auto& layer_parallel_param = model_para_desc->layer_parallel_param; - auto seed = model_para_desc->dist(model_para_desc->gen); -#else - TensorParallelParam tensor_parallel_param; - LayerParallelParam layer_parallel_param; - tensor_parallel_param.rank = 0; - tensor_parallel_param.world_size = 1; - tensor_parallel_param.local_head_num_ = n_head; - tensor_parallel_param.local_hidden_units_ = hidden_unit; - - layer_parallel_param.rank = 0; - layer_parallel_param.world_size = 1; - layer_parallel_param.layers_per_group = num_layer; - layer_parallel_param.local_batch_size = batch_size_; - int seed = -1; -#endif - - DecodingOpt* opt_decoding; - - decoding_params.request_batch_size = batch_size_; - decoding_params.max_input_len = start_len; - decoding_params.request_input_len = start_len; - decoding_params.request_output_len = max_len - start_len; - - decoding_params.d_start_ids = const_cast(input.data()); - - decoding_params.d_attn_mask = - reinterpret_cast(const_cast(attn_mask.data())); - decoding_params.d_start_lengths = start_length.data(); - - opt_decoding = - new DecodingOpt(allocator_, - batch_size_, - max_len, - n_head, - size_per_head, - vocab_size, - num_layer, - bos_id, - eos_id, - topk, - topp, - temperature, - tensor_para_size, - layer_para_size, - true, /*is_fuse_QKV*/ - normalize_before, - 1.0, /*repetition_penalty*/ - seed); - - opt_decoding->set_tensor_parallel_param(tensor_parallel_param); - opt_decoding->set_layer_parallel_param(layer_parallel_param); - - DecoderInitParam* params = - new DecoderInitParam[num_layer]; - - for (int i = 0; i < self_ln_weight.size(); ++i) { - // Allow python passing weights of all layers or only passing the - // corresponding layers to save memory. - int layer_idx = self_ln_weight.size() != num_layer - ? layer_parallel_param.rank * - layer_parallel_param.layers_per_group + - i - : i; - - params[layer_idx].stream = stream; - params[layer_idx].cublas_handle = cublas_handle_; - params[layer_idx].cublaslt_handle = cublaslt_handle_; - - params[layer_idx].request_batch_size = batch_size_; - params[layer_idx].request_max_mem_seq_len = start_len; - - params[layer_idx].self_layernorm.gamma = - reinterpret_cast(self_ln_weight[i].data()); - params[layer_idx].self_layernorm.beta = - reinterpret_cast(self_ln_bias[i].data()); - - params[layer_idx].self_attention.query_weight.kernel = - reinterpret_cast(self_q_weight[i].data()); - params[layer_idx].self_attention.query_weight.bias = - reinterpret_cast(self_q_bias[i].data()); - // For `is_fuse_QKV == true`, ignore weight and bias of key and value to - // remove requirements on python passing weights to save memory. 
- // params[layer_idx].self_attention.key_weight.kernel = - // reinterpret_cast(self_k_weight[i].data()); - // params[layer_idx].self_attention.key_weight.bias = - // reinterpret_cast(self_k_bias[i].data()); - // params[layer_idx].self_attention.value_weight.kernel = - // reinterpret_cast(self_v_weight[i].data()); - // params[layer_idx].self_attention.value_weight.bias = - // reinterpret_cast(self_v_bias[i].data()); - - params[layer_idx].self_attention.attention_output_weight.kernel = - reinterpret_cast(self_out_weight[i].data()); - params[layer_idx].self_attention.attention_output_weight.bias = - reinterpret_cast(self_out_bias[i].data()); - - params[layer_idx].ffn_layernorm.gamma = - reinterpret_cast(ffn_ln_weight[i].data()); - params[layer_idx].ffn_layernorm.beta = - reinterpret_cast(ffn_ln_bias[i].data()); - - params[layer_idx].ffn.intermediate_weight.kernel = - reinterpret_cast(ffn_inter_weight[i].data()); - params[layer_idx].ffn.intermediate_weight.bias = - reinterpret_cast(ffn_inter_bias[i].data()); - params[layer_idx].ffn.output_weight.kernel = - reinterpret_cast(ffn_out_weight[i].data()); - params[layer_idx].ffn.output_weight.bias = - reinterpret_cast(ffn_out_bias[i].data()); - } - - decoding_params.layernorm.gamma = - reinterpret_cast(decoder_ln_weight.data()); - decoding_params.layernorm.beta = - reinterpret_cast(decoder_ln_bias.data()); - decoding_params.embedding_table = - reinterpret_cast(word_emb.data()); - decoding_params.embedding_kernel = - reinterpret_cast(emb_weight.data()); - decoding_params.position_encoding_table = reinterpret_cast( - positional_embedding_weight.data()); - - opt_decoding->forward_context(params, decoding_params); - opt_decoding->forward(params, decoding_params); - - delete opt_decoding; - delete[] params; - - return {output_ids}; -} - -std::vector OPTCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const paddle::Tensor& start_length, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& emb_weight, - paddle::Tensor& output_ids, - const bool& normalize_before, - const int& topk, - const float& topp, - const int& max_len, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - const bool& use_fp16 = false, - const int& tensor_para_size = 1, - const int& layer_para_size = 1, - const int& layer_para_batch_size = 1) { - auto stream = word_embedding.stream(); - // TODO(guosheng): use the global cublas handle - cublasHandle_t cublas_handle_; - cublasCreate(&cublas_handle_); - cublasLtHandle_t cublaslt_handle_; - cublasLtCreate(&cublaslt_handle_); - cublasSetStream(cublas_handle_, stream); - - std::vector ret; - - if (use_fp16) { - ret = opt_kernel(input, - attn_mask, - start_length, - word_embedding, - self_ln_weight, - 
self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - positional_embedding_weight, - emb_weight, - output_ids, - normalize_before, - topk, - topp, - max_len, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - cublas_handle_, - cublaslt_handle_, - stream, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - } else { - ret = opt_kernel(input, - attn_mask, - start_length, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - positional_embedding_weight, - emb_weight, - output_ids, - normalize_before, - topk, - topp, - max_len, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - cublas_handle_, - cublaslt_handle_, - stream, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - } - - cublasDestroy(cublas_handle_); - cublasLtDestroy(cublaslt_handle_); - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_opt_op.h b/paddlenlp/ops/fast_transformer/src/fusion_opt_op.h deleted file mode 100644 index 0519df524010..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_opt_op.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include - -// #include "fastertransformer/gpt.h" -// #include "fastertransformer/open_decoder.h" -// #include "fastertransformer/utils/common.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - -std::vector OPTCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& attn_mask, - const paddle::Tensor& start_length, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& emb_weight, - paddle::Tensor& output_ids, - const bool& normalize_before, - const int& topk, - const float& topp, - const int& max_len, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const float& temperature, - const bool& use_fp16, - const int& tensor_para_size, - const int& layer_para_size, - const int& layer_para_batch_size); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.cc deleted file mode 100644 index c98fd9f744a7..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.cc +++ /dev/null @@ -1,372 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include - -#include "fusion_pegasus_decoding_op.h" -#include "pd_traits.h" - - -std::vector PegasusDecodingForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const std::string decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const float temperature, - const int64_t max_len, - const int64_t min_len, - const float beam_search_diversity_rate, - const bool rel_len, - const float alpha, - const bool early_stopping, - const std::string hidden_act) { - int batch_size = input.shape()[0]; - int max_out_len = rel_len ? max_len + input.shape()[1] : max_len; - - std::vector output_dims; - std::vector parent_ids_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - batch_size /= beam_size; - output_dims = {max_out_len, batch_size, beam_size}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_out_len, batch_size, beam_size * 2}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - output_dims = {max_out_len, batch_size}; - parent_ids_dims = {1}; - } else { - PD_THROW("Not supported decoding strategy. 
"); - } - -#ifdef PADDLE_NEW_ALLOCATOR - // For PaddlePaddle>=2.3.0 - if (input.place() == paddle::GPUPlace()) { - auto output_ids = paddle::empty(output_dims, paddle::DataType::INT32, input.place()); - auto parent_ids = paddle::empty(parent_ids_dims, paddle::DataType::INT32, input.place()); - auto sequence_length = paddle::empty(sequence_length_dims, paddle::DataType::INT32, input.place()); - - paddle::Tensor seq_len = paddle::empty(mem_seq_len.shape(), mem_seq_len.dtype(), input.place()); - - if (mem_seq_len.place() != paddle::GPUPlace()) { - seq_len = mem_seq_len.copy_to(paddle::GPUPlace()); - } else { - seq_len = mem_seq_len; - } -#else - if (input.place() == paddle::PlaceType::kGPU) { - auto output_ids = paddle::Tensor(paddle::PlaceType::kGPU, output_dims); - auto parent_ids = paddle::Tensor(paddle::PlaceType::kGPU, parent_ids_dims); - auto sequence_length = - paddle::Tensor(paddle::PlaceType::kGPU, sequence_length_dims); - - paddle::Tensor seq_len = paddle::Tensor(paddle::PlaceType::kGPU); - - if (mem_seq_len.place() != paddle::PlaceType::kGPU) { - seq_len = mem_seq_len.copy_to(paddle::PlaceType::kGPU); - } else { - seq_len = mem_seq_len; - } -#endif - return PegasusDecodingCUDAForward(input, - seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - max_out_len, - min_len, - beam_search_diversity_rate, - alpha, - early_stopping, - hidden_act); - } else { - PD_THROW("Not implemented place. Only GPU is supported. 
"); - } -} - -std::vector> PegasusDecodingInferShape( - const std::vector& input_shape, - const std::vector& mem_seq_len_shape, - const std::vector& word_embedding_shape, - const std::vector>& self_ln_weight_shapes, - const std::vector>& self_ln_bias_shapes, - const std::vector>& self_q_weight_shapes, - const std::vector>& self_q_bias_shapes, - const std::vector>& self_k_weight_shapes, - const std::vector>& self_k_bias_shapes, - const std::vector>& self_v_weight_shapes, - const std::vector>& self_v_bias_shapes, - const std::vector>& self_out_weight_shapes, - const std::vector>& self_out_bias_shapes, - const std::vector>& cross_ln_weight_shapes, - const std::vector>& cross_ln_bias_shapes, - const std::vector>& cross_q_weight_shapes, - const std::vector>& cross_q_bias_shapes, - const std::vector>& cross_k_weight_shapes, - const std::vector>& cross_k_bias_shapes, - const std::vector>& cross_v_weight_shapes, - const std::vector>& cross_v_bias_shapes, - const std::vector>& cross_out_weight_shapes, - const std::vector>& cross_out_bias_shapes, - const std::vector>& ffn_ln_weight_shapes, - const std::vector>& ffn_ln_bias_shapes, - const std::vector>& ffn_inter_weight_shapes, - const std::vector>& ffn_inter_bias_shapes, - const std::vector>& ffn_out_weight_shapes, - const std::vector>& ffn_out_bias_shapes, - const std::vector& decoder_ln_weight_shape, - const std::vector& decoder_ln_bias_shape, - const std::vector& embedding_weight_shape, - const std::vector& embedding_bias_shape, - const std::vector& positional_embedding_weight_shape, - const std::string decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const float temperature, - const int64_t max_len, - const int64_t min_len, - const float beam_search_diversity_rate, - const bool rel_len, - const float alpha, - const bool early_stopping, - const std::string hidden_act) { - int batch_size = input_shape[0]; - - std::vector output_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - if (batch_size != -1) { - batch_size /= beam_size; - } - output_dims = {max_len, batch_size, beam_size}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_len, batch_size, beam_size * 2}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - output_dims = {max_len, batch_size}; - return {output_dims, {1}, sequence_length_dims}; - } else { - PD_THROW("Not supported decoding strategy. 
"); - } -} - -std::vector PegasusDecodingInferDtype( - const paddle::DataType& input, - const paddle::DataType& mem_seq_len, - const paddle::DataType& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::DataType& decoder_ln_weight, - const paddle::DataType& decoder_ln_bias, - const paddle::DataType& embedding_weight, - const paddle::DataType& embedding_bias, - const paddle::DataType& positional_embedding_weight) { - return {paddle::DataType::INT32, - paddle::DataType::INT32, - paddle::DataType::INT32}; -} - -PD_BUILD_OP(fusion_pegasus_decoding) - .Inputs({"Input", - "MemSeqLen", - "WordEmbedding", - paddle::Vec("SelfLayernormWeight"), - paddle::Vec("SelfLayernormBias"), - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfQueryBias"), - paddle::Vec("SelfKeyWeight"), - paddle::Vec("SelfKeyBias"), - paddle::Vec("SelfValueWeight"), - paddle::Vec("SelfValueBias"), - paddle::Vec("SelfOutWeight"), - paddle::Vec("SelfOutBias"), - paddle::Vec("CrossLayernormWeight"), - paddle::Vec("CrossLayernormBias"), - paddle::Vec("CrossQueryWeight"), - paddle::Vec("CrossQueryBias"), - paddle::Vec("CrossKeyWeight"), - paddle::Vec("CrossKeyBias"), - paddle::Vec("CrossValueWeight"), - paddle::Vec("CrossValueBias"), - paddle::Vec("CrossOutWeight"), - paddle::Vec("CrossOutBias"), - paddle::Vec("FFNLayernormWeight"), - paddle::Vec("FFNLayernormBias"), - paddle::Vec("FFNInterWeight"), - paddle::Vec("FFNInterBias"), - paddle::Vec("FFNOutWeight"), - paddle::Vec("FFNOutBias"), - "DecoderLayernormWeight", - "DecoderLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb"}) - .Outputs({"OutputIds", "ParentIds", "SequenceLength"}) - .Attrs({ - "decoding_strategy: std::string", - "beam_size: int", - "topk: int", - "topp: float", - "n_head: int", - "size_per_head: int", - "num_layer: int", - "bos_id: int", - "eos_id: int", - "temperature: float", - "max_len: int64_t", - "min_len: int64_t", - "beam_search_diversity_rate: float", - "rel_len: bool", - "alpha: float", - "early_stopping: bool", - "hidden_act: std::string", - }) - .SetKernelFn(PD_KERNEL(PegasusDecodingForward)) - .SetInferShapeFn(PD_INFER_SHAPE(PegasusDecodingInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(PegasusDecodingInferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.cu deleted file mode 100644 index 753f70bf2ae0..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.cu +++ /dev/null @@ -1,554 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cublas_handle.h" - -#include "fusion_pegasus_decoding_op.h" -#include "pd_traits.h" - -template -std::vector pegasus_decoding_kernel( - const paddle::Tensor& input, - const paddle::Tensor& memory_sequence_length, - const paddle::Tensor& word_emb, - const std::vector& self_layernorm_weight, - const std::vector& self_layernorm_bias, - const std::vector& self_attn_query_weight, - const std::vector& self_attn_query_bias, - const std::vector& self_attn_key_weight, - const std::vector& self_attn_key_bias, - const std::vector& self_attn_value_weight, - const std::vector& self_attn_value_bias, - const std::vector& self_attn_output_weight, - const std::vector& self_attn_output_bias, - const std::vector& cross_layernorm_weight, - const std::vector& cross_layernorm_bias, - const std::vector& cross_attn_query_weight, - const std::vector& cross_attn_query_bias, - const std::vector& cross_attn_key_weight, - const std::vector& cross_attn_key_bias, - const std::vector& cross_attn_value_weight, - const std::vector& cross_attn_value_bias, - const std::vector& cross_attn_output_weight, - const std::vector& cross_attn_output_bias, - const std::vector& ffn_layernorm_weight, - const std::vector& ffn_layernorm_bias, - const std::vector& ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - const paddle::Tensor& decoder_layernorm_weight, - const paddle::Tensor& decoder_layernorm_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& position_encoding_table, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int head_num_, - const int size_per_head_, - const int num_layer_, - const int start_id_, - const int end_id_, - const float temperature, - const int64_t max_seq_len_, - const int64_t min_seq_len_, - const float beam_search_diversity_rate_, - const float alpha, - const bool early_stopping, - const std::string& hidden_act, - cudaStream_t stream) { - int beam_width_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? beam_size - : 1; - int candidate_num_ = (decoding_strategy == "sampling") ? topk : 1; - float probability_threshold_ = (decoding_strategy == "sampling") ? topp : 0.0; - - auto input_dims = input.shape(); - int batch_size_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? 
input_dims[0] / beam_width_ - : input_dims[0]; - const int memory_max_seq_len = input_dims[1]; - const int memory_hidden_dim = input_dims[2]; - const int vocab_size = word_emb.shape()[0]; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - DecodingInitParam decoding_params; - decoding_params.cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - decoding_params.cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - -#ifdef PADDLE_NEW_ALLOCATOR - // For PaddlePaddle>=2.3.0 - decoding_params.output_ids = output_ids.data(); - decoding_params.parent_ids = parent_ids.data(); - decoding_params.sequence_length = sequence_length.data(); -#else - decoding_params.output_ids = output_ids.mutable_data(input.place()); - decoding_params.parent_ids = parent_ids.mutable_data(input.place()); - decoding_params.sequence_length = sequence_length.mutable_data(input.place()); -#endif - typedef DecoderTransformerTraits DecodingTraits_; - decoding_params.stream = stream; - fastertransformer::Allocator allocator_(stream); - - decoding_params.memory_tensor = - reinterpret_cast(input.data()); - decoding_params.memory_sequence_length = memory_sequence_length.data(); - - DecoderInitParam* params = - new DecoderInitParam[num_layer_]; - - for (int i = 0; i < num_layer_; i++) { - params[i].stream = stream; - params[i].cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - params[i].cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - - if (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - params[i].request_batch_size = batch_size_ * beam_width_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } else if (decoding_strategy == "sampling" || - decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - params[i].request_batch_size = batch_size_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } - - // self attn - params[i].self_layernorm.gamma = reinterpret_cast( - self_layernorm_weight[i].data()); - params[i].self_layernorm.beta = reinterpret_cast( - self_layernorm_bias[i].data()); - // query - params[i].self_attention.query_weight.kernel = - reinterpret_cast( - self_attn_query_weight[i].data()); - params[i].self_attention.query_weight.bias = - reinterpret_cast( - self_attn_query_bias[i].data()); - // key - params[i].self_attention.key_weight.kernel = - reinterpret_cast( - self_attn_key_weight[i].data()); - params[i].self_attention.key_weight.bias = - reinterpret_cast( - self_attn_key_bias[i].data()); - // value - params[i].self_attention.value_weight.kernel = - reinterpret_cast( - self_attn_value_weight[i].data()); - params[i].self_attention.value_weight.bias = - reinterpret_cast( - self_attn_value_bias[i].data()); - // out proj - params[i].self_attention.attention_output_weight.kernel = - reinterpret_cast( - self_attn_output_weight[i].data()); - params[i].self_attention.attention_output_weight.bias = - reinterpret_cast( - self_attn_output_bias[i].data()); - - // cross - params[i].cross_layernorm.gamma = reinterpret_cast( - cross_layernorm_weight[i].data()); - params[i].cross_layernorm.beta = reinterpret_cast( - cross_layernorm_bias[i].data()); - // query - params[i].cross_attention.query_weight.kernel = - reinterpret_cast( - cross_attn_query_weight[i].data()); - params[i].cross_attention.query_weight.bias = - reinterpret_cast( - cross_attn_query_bias[i].data()); - // key - 
params[i].cross_attention.key_weight.kernel = - reinterpret_cast( - cross_attn_key_weight[i].data()); - params[i].cross_attention.key_weight.bias = - reinterpret_cast( - cross_attn_key_bias[i].data()); - // value - params[i].cross_attention.value_weight.kernel = - reinterpret_cast( - cross_attn_value_weight[i].data()); - params[i].cross_attention.value_weight.bias = - reinterpret_cast( - cross_attn_value_bias[i].data()); - // out proj - params[i].cross_attention.attention_output_weight.kernel = - reinterpret_cast( - cross_attn_output_weight[i].data()); - params[i].cross_attention.attention_output_weight.bias = - reinterpret_cast( - cross_attn_output_bias[i].data()); - - // ffn - params[i].ffn_layernorm.gamma = reinterpret_cast( - ffn_layernorm_weight[i].data()); - params[i].ffn_layernorm.beta = reinterpret_cast( - ffn_layernorm_bias[i].data()); - // intermediate proj - params[i].ffn.intermediate_weight.kernel = - reinterpret_cast( - ffn_intermediate_weight[i].data()); - params[i].ffn.intermediate_weight.bias = reinterpret_cast( - ffn_intermediate_bias[i].data()); - // out proj - params[i].ffn.output_weight.kernel = reinterpret_cast( - ffn_output_weight[i].data()); - params[i].ffn.output_weight.bias = - reinterpret_cast(ffn_output_bias[i].data()); - } - - decoding_params.layernorm.gamma = reinterpret_cast( - decoder_layernorm_weight.data()); - decoding_params.layernorm.beta = reinterpret_cast( - decoder_layernorm_bias.data()); - - // for embedding - decoding_params.embedding_table = - reinterpret_cast(word_emb.data()); - - // for weight sharing matmul - decoding_params.embedding_kernel = - reinterpret_cast(embedding_weight.data()); - // for matmul bias - decoding_params.embedding_bias = - reinterpret_cast(embedding_bias.data()); - - decoding_params.position_encoding_table = reinterpret_cast( - position_encoding_table.data()); - - int finished_candidate_num_ = - ("beam_search_v3" == decoding_strategy) ? beam_width_ : beam_width_ * 2; - - ActivationType activate = - (hidden_act == "gelu") ? 
ActivationType::GELU : ActivationType::RELU; - - if ("beam_search" == decoding_strategy) { - DecodingBeamsearch* decoding_beam_search_; - decoding_beam_search_ = new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, /*is_fuse_topk_softMax*/ - true, /*is_fuse_qkv*/ - false, /*keep_alive_beam*/ - alpha, /*alpha not used for this case*/ - true, - 0, /*pos_offset BART and MBART only for now*/ - activate, - false, // pos_bias - false /*prefix_lm*/, - -1, /*finished_candidate_num*/ - false, /*early_stopping*/ - false, /*is_mbart */ - min_seq_len_ /*min_length*/); - - decoding_beam_search_->forward(params, decoding_params); - - delete decoding_beam_search_; - } else if ("beam_search_v2" == decoding_strategy || - "beam_search_v3" == decoding_strategy) { - DecodingBeamsearch* decoding_beam_search_; - decoding_beam_search_ = new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, // is_fuse_topk_softMax - true, // is_fuse_qkv - true, // keep_alive_beam - alpha, - true, // normalize_before - 0, /*pos_offset BART and MBART only for now*/ - activate, - false, // pos_bias - false /*prefix_lm*/, - finished_candidate_num_, /*finished_candidate_num*/ - early_stopping, /*early_stopping*/ - false, /*is_mbart */ - min_seq_len_ /*min_length*/); - - decoding_beam_search_->forward(params, decoding_params); - - delete decoding_beam_search_; - } else if ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || - "sampling" == decoding_strategy) { - DecodingSampling* decoding_sampling_; - decoding_sampling_ = new DecodingSampling( - allocator_, - batch_size_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - candidate_num_, - probability_threshold_, - true, /*is_fuse_qkv*/ - true, // normalize_before - 0, /*pos_offset BART and MBART only for now*/ - activate, - false, // pos_bias - temperature, // temperature - 1.0, // repeat_penalty - false, // prefix_lm - false, /*is_mbart */ - min_seq_len_ /*min_length*/); - - decoding_sampling_->forward(params, decoding_params); - - delete decoding_sampling_; - } else { - PD_THROW( - "Only beam_search, beam_search_v2 and sampling are supported for " - "FastGeneration. 
"); - } - delete[] params; - - return {output_ids, parent_ids, sequence_length}; -} - -std::vector PegasusDecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const float temperature, - const int64_t max_len, - const int64_t min_len, - const float beam_search_diversity_rate, - const float alpha, - const bool early_stopping, - const std::string hidden_act) { - auto stream = input.stream(); - cublasSetStream(CublasHandle::GetInstance()->cublas_handle_, stream); - - std::vector ret; - - switch (input.type()) { - case paddle::DataType::FLOAT16: { - ret = pegasus_decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - max_len, - min_len, - beam_search_diversity_rate, - alpha, - early_stopping, - hidden_act, - stream); - break; - } - case paddle::DataType::FLOAT32: { - ret = pegasus_decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - 
ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - max_len, - min_len, - beam_search_diversity_rate, - alpha, - early_stopping, - hidden_act, - stream); - break; - } - default: { - PD_THROW( - "NOT supported data type. " - "Only float16 and float32 are supported. "); - break; - } - } - - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.h b/paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.h deleted file mode 100644 index 43ccad5a23c7..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_pegasus_decoding_op.h +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include -#include - -#include "fastertransformer/decoding_beamsearch.h" -#include "fastertransformer/decoding_sampling.h" -#include "fastertransformer/open_decoder.h" -#include "fastertransformer/utils/common.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - - -std::vector PegasusDecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int head_num, - const int size_per_head, - const int num_layer, - const int start_id, - const int end_id, - const float temperature, - const int64_t max_len, - const int64_t 
min_len, - const float beam_search_diversity_rate, - const float alpha, - const bool early_stopping, - const std::string hidden_act); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.cc deleted file mode 100644 index 840b23b03929..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.cc +++ /dev/null @@ -1,377 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "fusion_t5_decoding_op.h" - -#include -#include - -#include "pd_traits.h" - - -std::vector T5DecodingForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight_0, - const std::vector& ffn_inter_bias_0, - const std::vector& ffn_inter_weight_1, - const std::vector& ffn_inter_bias_1, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& self_relative_attention_bias_weight, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const bool& rel_len, - const float& alpha, - const float& temperature, - const bool& early_stopping, - const int& max_distance, - const int& num_buckets, - const bool& tie_word_embeddings, - const std::string& act) { - int batch_size = input.shape()[0]; - int max_out_len = rel_len ? 
max_len + input.shape()[1] : max_len; - - std::vector output_dims; - std::vector parent_ids_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - batch_size /= beam_size; - output_dims = {max_out_len, batch_size, beam_size}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_out_len, batch_size, beam_size * 2}; - parent_ids_dims = output_dims; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling" || - decoding_strategy == "sampling") { - output_dims = {max_out_len, batch_size}; - parent_ids_dims = {1}; - } else { - PD_THROW("Not supported decoding strategy. "); - } - - if (input.place() == paddle::PlaceType::kGPU) { - auto output_ids = paddle::Tensor(paddle::PlaceType::kGPU, output_dims); - auto parent_ids = paddle::Tensor(paddle::PlaceType::kGPU, parent_ids_dims); - auto sequence_length = - paddle::Tensor(paddle::PlaceType::kGPU, sequence_length_dims); - - paddle::Tensor seq_len = paddle::Tensor(paddle::PlaceType::kGPU); - - if (mem_seq_len.place() != paddle::PlaceType::kGPU) { - seq_len = mem_seq_len.copy_to(paddle::PlaceType::kGPU); - } else { - seq_len = mem_seq_len; - } - - return T5DecodingCUDAForward(input, - seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight_0, - ffn_inter_bias_0, - ffn_inter_weight_1, - ffn_inter_bias_1, - ffn_out_weight, - ffn_out_bias, - self_relative_attention_bias_weight, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_out_len, - beam_search_diversity_rate, - alpha, - temperature, - early_stopping, - max_distance, - num_buckets, - tie_word_embeddings, - act); - } else { - PD_THROW("Not implemented place. Only GPU is supported. 
"); - } -} - -std::vector> T5DecodingInferShape( - const std::vector& input_shape, - const std::vector& mem_seq_len_shape, - const std::vector& word_embedding_shape, - const std::vector>& self_ln_weight_shapes, - const std::vector>& self_ln_bias_shapes, - const std::vector>& self_q_weight_shapes, - const std::vector>& self_q_bias_shapes, - const std::vector>& self_k_weight_shapes, - const std::vector>& self_k_bias_shapes, - const std::vector>& self_v_weight_shapes, - const std::vector>& self_v_bias_shapes, - const std::vector>& self_out_weight_shapes, - const std::vector>& self_out_bias_shapes, - const std::vector>& cross_ln_weight_shapes, - const std::vector>& cross_ln_bias_shapes, - const std::vector>& cross_q_weight_shapes, - const std::vector>& cross_q_bias_shapes, - const std::vector>& cross_k_weight_shapes, - const std::vector>& cross_k_bias_shapes, - const std::vector>& cross_v_weight_shapes, - const std::vector>& cross_v_bias_shapes, - const std::vector>& cross_out_weight_shapes, - const std::vector>& cross_out_bias_shapes, - const std::vector>& ffn_ln_weight_shapes, - const std::vector>& ffn_ln_bias_shapes, - const std::vector>& ffn_inter_weight_0_shapes, - const std::vector>& ffn_inter_bias_0_shapes, - const std::vector>& ffn_inter_weight_1_shapes, - const std::vector>& ffn_inter_bias_1_shapes, - const std::vector>& ffn_out_weight_shapes, - const std::vector>& ffn_out_bias_shapes, - const std::vector& self_relative_attention_bias_weight_shapes, - const std::vector& decoder_ln_weight_shape, - const std::vector& decoder_ln_bias_shape, - const std::vector& embedding_weight_shape, - const std::vector& embedding_bias_shape, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const bool& rel_len, - const float& alpha, - const float& temperature, - const bool& early_stopping, - const int& max_distance, - const int& num_buckets, - const bool& tie_word_embeddings, - const std::string& act) { - int batch_size = input_shape[0]; - - std::vector output_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - if (batch_size != -1) { - batch_size /= beam_size; - } - output_dims = {max_len, batch_size, beam_size}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_dims = {max_len, batch_size, beam_size * 2}; - return {output_dims, output_dims, sequence_length_dims}; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling" || - decoding_strategy == "sampling") { - output_dims = {max_len, batch_size}; - return {output_dims, {1}, sequence_length_dims}; - } else { - PD_THROW("Not supported decoding strategy. 
"); - } -} - -std::vector T5DecodingInferDtype( - const paddle::DataType& input, - const paddle::DataType& mem_seq_len, - const paddle::DataType& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight_0, - const std::vector& ffn_inter_bias_0, - const std::vector& ffn_inter_weight_1, - const std::vector& ffn_inter_bias_1, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::DataType& self_relative_attention_bias_weight, - const paddle::DataType& decoder_ln_weight, - const paddle::DataType& decoder_ln_bias, - const paddle::DataType& embedding_weight, - const paddle::DataType& embedding_bias) { - return {paddle::DataType::INT32, - paddle::DataType::INT32, - paddle::DataType::INT32}; -} - - -PD_BUILD_OP(fusion_t5_decoding) - .Inputs({"Input", - "MemSeqLen", - "WordEmbedding", - paddle::Vec("SelfLayernormWeight"), - paddle::Vec("SelfLayernormBias"), - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfQueryBias"), - paddle::Vec("SelfKeyWeight"), - paddle::Vec("SelfKeyBias"), - paddle::Vec("SelfValueWeight"), - paddle::Vec("SelfValueBias"), - paddle::Vec("SelfOutWeight"), - paddle::Vec("SelfOutBias"), - paddle::Vec("CrossLayernormWeight"), - paddle::Vec("CrossLayernormBias"), - paddle::Vec("CrossQueryWeight"), - paddle::Vec("CrossQueryBias"), - paddle::Vec("CrossKeyWeight"), - paddle::Vec("CrossKeyBias"), - paddle::Vec("CrossValueWeight"), - paddle::Vec("CrossValueBias"), - paddle::Vec("CrossOutWeight"), - paddle::Vec("CrossOutBias"), - paddle::Vec("FFNLayernormWeight"), - paddle::Vec("FFNLayernormBias"), - paddle::Vec("FFNInterWeight0"), - paddle::Vec("FFNInterBias0"), - paddle::Vec("FFNInterWeight1"), - paddle::Vec("FFNInterBias1"), - paddle::Vec("FFNOutWeight"), - paddle::Vec("FFNOutBias"), - "SelfRelativeAttentionBiasWeight", - "DecoderLayernormWeight", - "DecoderLayernormBias", - "EmbWeight", - "EmbBias"}) - .Outputs({"OutputIds", "ParentIds", "SequenceLength"}) - .Attrs({"decoding_strategy: std::string", - "beam_size: int", - "topk: int", - "topp: float", - "n_head: int", - "size_per_head: int", - "num_layer: int", - "bos_id: int", - "eos_id: int", - "max_len: int64_t", - "beam_search_diversity_rate: float", - "rel_len: bool", - "alpha: float", - "temperature: float", - "early_stopping: bool", - "max_distance: int", - "num_buckets: int", - "tie_word_embeddings: bool", - "act: std::string"}) - .SetKernelFn(PD_KERNEL(T5DecodingForward)) - .SetInferShapeFn(PD_INFER_SHAPE(T5DecodingInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(T5DecodingInferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.cu deleted file mode 100644 index 5fd211f5fd22..000000000000 --- 
a/paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.cu +++ /dev/null @@ -1,635 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include -#include -#include - -#include -#include -#include -#include -#include - - -#include "fusion_t5_decoding_op.h" -#include "pd_traits.h" - -template -std::vector t5_decoding_kernel( - const paddle::Tensor& input, - const paddle::Tensor& memory_sequence_length, - const paddle::Tensor& word_emb, - const std::vector& self_layernorm_weight, - const std::vector& self_layernorm_bias, - const std::vector& self_attn_query_weight, - const std::vector& self_attn_query_bias, - const std::vector& self_attn_key_weight, - const std::vector& self_attn_key_bias, - const std::vector& self_attn_value_weight, - const std::vector& self_attn_value_bias, - const std::vector& self_attn_output_weight, - const std::vector& self_attn_output_bias, - const std::vector& cross_layernorm_weight, - const std::vector& cross_layernorm_bias, - const std::vector& cross_attn_query_weight, - const std::vector& cross_attn_query_bias, - const std::vector& cross_attn_key_weight, - const std::vector& cross_attn_key_bias, - const std::vector& cross_attn_value_weight, - const std::vector& cross_attn_value_bias, - const std::vector& cross_attn_output_weight, - const std::vector& cross_attn_output_bias, - const std::vector& ffn_layernorm_weight, - const std::vector& ffn_layernorm_bias, - const std::vector& ffn_intermediate_weight_0, - const std::vector& ffn_intermediate_bias_0, - const std::vector& ffn_intermediate_weight_1, - const std::vector& ffn_intermediate_bias_1, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - const paddle::Tensor& self_relative_attention_bias_weight, - const paddle::Tensor& decoder_layernorm_weight, - const paddle::Tensor& decoder_layernorm_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& head_num_, - const int& size_per_head_, - const int& num_layer_, - const int& start_id_, - const int& end_id_, - const int64_t& max_seq_len_, - const float& beam_search_diversity_rate_, - const float& alpha, - const float& temperature, - const bool& early_stopping, - const int& max_distance, - const int& num_buckets, - const bool& tie_word_embeddings, - const std::string& act, - cudaStream_t stream) { - int beam_width_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? beam_size - : 1; - int candidate_num_ = - (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling" || decoding_strategy == "sampling") - ? 
topk - : 1; - float probability_threshold_ = - (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling" || decoding_strategy == "sampling") - ? topp - : 0.0; - - auto input_dims = input.shape(); - int batch_size_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? input_dims[0] / beam_width_ - : input_dims[0]; - const int memory_max_seq_len = input_dims[1]; - const int memory_hidden_dim = input_dims[2]; - const int vocab_size = word_emb.shape()[0]; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - DecodingInitParam decoding_params; - decoding_params.cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - decoding_params.cublaslt_handle = - CublasHandle::GetInstance()->cublaslt_handle_; - - decoding_params.output_ids = output_ids.mutable_data(input.place()); - decoding_params.parent_ids = parent_ids.mutable_data(input.place()); - decoding_params.sequence_length = - sequence_length.mutable_data(input.place()); - - typedef DecoderTransformerTraits DecodingTraits_; - decoding_params.stream = stream; - fastertransformer::Allocator allocator_(stream); - - decoding_params.memory_tensor = - reinterpret_cast(input.data()); - decoding_params.memory_sequence_length = memory_sequence_length.data(); - - DecoderInitParam* params = - new DecoderInitParam[num_layer_]; - - int inner_coeff = ffn_intermediate_weight_0[0].shape()[1] / memory_hidden_dim; - int inner_size = ffn_intermediate_weight_0[0].shape()[1]; - - auto q_weight_shape = self_attn_query_weight[0].shape(); - auto k_weight_shape = self_attn_key_weight[0].shape(); - - if (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - decoding_params.request_batch_size = batch_size_ * beam_width_; - } else if (decoding_strategy == "sampling" || - decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - decoding_params.request_batch_size = batch_size_; - } - - bool use_gated = false; - - for (int i = 0; i < num_layer_; i++) { - params[i].stream = stream; - params[i].cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - params[i].cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - - if (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - params[i].request_batch_size = batch_size_ * beam_width_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } else if (decoding_strategy == "sampling" || - decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - params[i].request_batch_size = batch_size_; - params[i].request_max_mem_seq_len = memory_max_seq_len; - } - - // self attn - params[i].self_layernorm.gamma = reinterpret_cast( - self_layernorm_weight[i].data()); - - if (self_layernorm_bias[i].shape()[0] != 1) { - params[i].self_layernorm.beta = reinterpret_cast( - self_layernorm_bias[i].data()); - } else { - params[i].self_layernorm.beta = nullptr; - } - - // query - params[i].self_attention.query_weight.kernel = - reinterpret_cast( - self_attn_query_weight[i].data()); - params[i].self_attention.query_weight.bias = - reinterpret_cast( - self_attn_query_bias[i].data()); - // key - params[i].self_attention.key_weight.kernel = - reinterpret_cast( - self_attn_key_weight[i].data()); - params[i].self_attention.key_weight.bias = - reinterpret_cast( - 
self_attn_key_bias[i].data()); - // value - params[i].self_attention.value_weight.kernel = - reinterpret_cast( - self_attn_value_weight[i].data()); - params[i].self_attention.value_weight.bias = - reinterpret_cast( - self_attn_value_bias[i].data()); - // out proj - params[i].self_attention.attention_output_weight.kernel = - reinterpret_cast( - self_attn_output_weight[i].data()); - params[i].self_attention.attention_output_weight.bias = - reinterpret_cast( - self_attn_output_bias[i].data()); - - // cross - params[i].cross_layernorm.gamma = reinterpret_cast( - cross_layernorm_weight[i].data()); - if (cross_layernorm_bias[i].shape()[0] != 1) { - params[i].cross_layernorm.beta = reinterpret_cast( - cross_layernorm_bias[i].data()); - } else { - params[i].cross_layernorm.beta = nullptr; - } - // query - params[i].cross_attention.query_weight.kernel = - reinterpret_cast( - cross_attn_query_weight[i].data()); - params[i].cross_attention.query_weight.bias = - reinterpret_cast( - cross_attn_query_bias[i].data()); - // key - params[i].cross_attention.key_weight.kernel = - reinterpret_cast( - cross_attn_key_weight[i].data()); - params[i].cross_attention.key_weight.bias = - reinterpret_cast( - cross_attn_key_bias[i].data()); - // value - params[i].cross_attention.value_weight.kernel = - reinterpret_cast( - cross_attn_value_weight[i].data()); - params[i].cross_attention.value_weight.bias = - reinterpret_cast( - cross_attn_value_bias[i].data()); - // out proj - params[i].cross_attention.attention_output_weight.kernel = - reinterpret_cast( - cross_attn_output_weight[i].data()); - params[i].cross_attention.attention_output_weight.bias = - reinterpret_cast( - cross_attn_output_bias[i].data()); - - // ffn - params[i].ffn_layernorm.gamma = reinterpret_cast( - ffn_layernorm_weight[i].data()); - if (ffn_layernorm_bias[i].shape()[0] != 1) { - params[i].ffn_layernorm.beta = reinterpret_cast( - ffn_layernorm_bias[i].data()); - } else { - params[i].ffn_layernorm.beta = nullptr; - } - // intermediate proj - params[i].ffn.intermediate_weight.kernel = - reinterpret_cast( - ffn_intermediate_weight_0[i].data()); - params[i].ffn.intermediate_weight.bias = reinterpret_cast( - ffn_intermediate_bias_0[i].data()); - - if (ffn_intermediate_weight_1[i].shape()[0] != 1) { - use_gated = true; - params[i].ffn.intermediate_weight_1.kernel = - reinterpret_cast( - ffn_intermediate_weight_1[i].data()); - params[i].ffn.intermediate_weight_1.bias = reinterpret_cast( - ffn_intermediate_bias_1[i].data()); - } else { - params[i].ffn.intermediate_weight_1.kernel = nullptr; - params[i].ffn.intermediate_weight_1.bias = nullptr; - } - - // out proj - params[i].ffn.output_weight.kernel = reinterpret_cast( - ffn_output_weight[i].data()); - params[i].ffn.output_weight.bias = - reinterpret_cast(ffn_output_bias[i].data()); - } - - // relative bias - decoding_params.self_relative_attention_bias_weight = - reinterpret_cast( - self_relative_attention_bias_weight.data()); - - decoding_params.layernorm.gamma = reinterpret_cast( - decoder_layernorm_weight.data()); - if (decoder_layernorm_bias.shape()[0] != 1) { - decoding_params.layernorm.beta = reinterpret_cast( - decoder_layernorm_bias.data()); - } else { - decoding_params.layernorm.beta = nullptr; - } - // for embedding - decoding_params.embedding_table = - reinterpret_cast(word_emb.data()); - - // for weight sharing matmul - decoding_params.embedding_kernel = - reinterpret_cast(embedding_weight.data()); - // for matmul bias - decoding_params.embedding_bias = - 
reinterpret_cast(embedding_bias.data()); - - int finished_candidate_num_ = - ("beam_search_v3" == decoding_strategy) ? beam_width_ : beam_width_ * 2; - - ActivationType activate = - (act == "gelu") ? ActivationType::GELU : ActivationType::RELU; - - if ("beam_search" == decoding_strategy) { - T5DecodingBeamsearch* decoding_beam_search_; - decoding_beam_search_ = new T5DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, // is_fuse_topk_softMax - true, // fuse_qkv - false, // keep_alive_beam - 0.6, // alpha - true, // normalization_before - activate, - -1, // finished_candidate_num - false, // early_stopping - 0, // min_length - inner_coeff, - inner_size, - num_buckets, - max_distance, - tie_word_embeddings, - use_gated); - - decoding_beam_search_->forward(params, decoding_params); - - delete decoding_beam_search_; - } else if ("beam_search_v2" == decoding_strategy || - "beam_search_v3" == decoding_strategy) { - T5DecodingBeamsearch* decoding_beam_search_; - decoding_beam_search_ = new T5DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, // is_fuse_topk_softMax - true, // fuse_qkv - true, // keep_alive_beam - alpha, - true, // normalization_before - activate, - finished_candidate_num_, - early_stopping, - 0, // min_length - inner_coeff, - inner_size, - num_buckets, - max_distance, - tie_word_embeddings, - use_gated); - - decoding_beam_search_->forward(params, decoding_params); - - delete decoding_beam_search_; - } else if ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || - "sampling" == decoding_strategy) { - - T5DecodingSampling* decoding_sampling_; - decoding_sampling_ = new T5DecodingSampling( - allocator_, - batch_size_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - candidate_num_, - probability_threshold_, - true, // fuse_qkv - true, // normalization_before - activate, - 1.0, // temperature - 1.0, // repeat_penalty - 0, // min_length - inner_coeff, - inner_size, - -1, // seed - 1, // tensor_para_size - 1, // layer_para_size - num_buckets, - max_distance, - tie_word_embeddings, - use_gated); - - decoding_sampling_->forward(params, decoding_params); - - delete decoding_sampling_; - - } else { - PD_THROW( - "Only beam_search, topk_sampling and topp_sampling are supported for " - "FastGeneration. 
"); - } - delete[] params; - - return {output_ids, parent_ids, sequence_length}; -} - -std::vector T5DecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight_0, - const std::vector& ffn_inter_bias_0, - const std::vector& ffn_inter_weight_1, - const std::vector& ffn_inter_bias_1, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& self_relative_attention_bias_weight, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const float& alpha, - const float& temperature, - const bool& early_stopping, - const int& max_distance, - const int& num_buckets, - const bool& tie_word_embeddings, - const std::string& act) { - auto stream = input.stream(); - - cublasSetStream(CublasHandle::GetInstance()->cublas_handle_, stream); - - std::vector ret; - - switch (input.type()) { - case paddle::DataType::FLOAT16: { - ret = t5_decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight_0, - ffn_inter_bias_0, - ffn_inter_weight_1, - ffn_inter_bias_1, - ffn_out_weight, - ffn_out_bias, - self_relative_attention_bias_weight, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - beam_search_diversity_rate, - alpha, - temperature, - early_stopping, - max_distance, - num_buckets, - tie_word_embeddings, - act, - stream); - break; - } - case paddle::DataType::FLOAT32: { - ret = t5_decoding_kernel( - input, - mem_seq_len, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - 
cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight_0, - ffn_inter_bias_0, - ffn_inter_weight_1, - ffn_inter_bias_1, - ffn_out_weight, - ffn_out_bias, - self_relative_attention_bias_weight, - decoder_ln_weight, - decoder_ln_bias, - embedding_weight, - embedding_bias, - output_ids, - parent_ids, - sequence_length, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - beam_search_diversity_rate, - alpha, - temperature, - early_stopping, - max_distance, - num_buckets, - tie_word_embeddings, - act, - stream); - break; - } - default: { - PD_THROW( - "NOT supported data type. " - "Only float16 and float32 are supported. "); - break; - } - } - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.h b/paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.h deleted file mode 100644 index 1fe581d4879c..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_t5_decoding_op.h +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include -#include - -#include "cublas_handle.h" -#include "fastertransformer/open_decoder.h" -#include "fastertransformer/t5_beamsearch.h" -#include "fastertransformer/t5_sampling.h" -#include "fastertransformer/utils/common.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - - -std::vector T5DecodingCUDAForward( - const paddle::Tensor& input, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& cross_ln_weight, - const std::vector& cross_ln_bias, - const std::vector& cross_q_weight, - const std::vector& cross_q_bias, - const std::vector& cross_k_weight, - const std::vector& cross_k_bias, - const std::vector& cross_v_weight, - const std::vector& cross_v_bias, - const std::vector& cross_out_weight, - const std::vector& cross_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight_0, - const std::vector& ffn_inter_bias_0, - const std::vector& ffn_inter_weight_1, - const std::vector& ffn_inter_bias_1, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& self_relative_attention_bias_weight, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& 
embedding_weight, - const paddle::Tensor& embedding_bias, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const float& alpha, - const float& temperature, - const bool& early_stopping, - const int& max_distance, - const int& num_buckets, - const bool& tie_word_embeddings, - const std::string& act); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.cc b/paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.cc deleted file mode 100644 index 6053ad48a9b1..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.cc +++ /dev/null @@ -1,417 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include - -#include "fusion_unified_decoding_op.h" -#include "pd_traits.h" - - -std::vector UnifiedDecodingForward( - const paddle::Tensor& input_ids, - const paddle::Tensor& attn_mask, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& type_id, - const paddle::Tensor& decoder_type_id, - const paddle::Tensor& logits_mask, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& trans_weight, - const paddle::Tensor& trans_bias, - const paddle::Tensor& lm_ln_weight, - const paddle::Tensor& lm_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& type_embedding_weight, - const paddle::Tensor& role_id, - const paddle::Tensor& decoder_role_id, - const paddle::Tensor& role_embedding_table, - const paddle::Tensor& position_ids, - const paddle::Tensor& decoder_position_ids, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const int& unk_id, - const int& mask_id, - const float& temperature, - const float& len_penalty, - const bool& normalize_before, - const 
bool& pos_bias, - const std::string& hidden_act, - const bool& rel_len, - const bool& early_stopping, - const int& min_length, - const int& tensor_para_size, - const int& layer_para_size, - const int& layer_para_batch_size) { - int batch_size = input_ids.shape()[0]; - int max_out_len = rel_len ? max_len + input_ids.shape()[1] : max_len; - - std::vector output_ids_dims; - std::vector output_scores_dims; - std::vector parent_ids_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - if (batch_size != -1) { - batch_size /= beam_size; - } - output_ids_dims = {max_out_len, batch_size, beam_size}; - output_scores_dims = {batch_size, beam_size}; - parent_ids_dims = output_ids_dims; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. - if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_ids_dims = {max_out_len, batch_size, beam_size * 2}; - output_scores_dims = {batch_size, beam_size * 2}; - parent_ids_dims = output_ids_dims; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling" || - decoding_strategy == "sampling") { - output_ids_dims = {max_out_len, batch_size}; - output_scores_dims = {batch_size}; - parent_ids_dims = {1}; - } else { - PD_THROW("Not supported decoding strategy. "); - } - auto output_ids = paddle::Tensor(input_ids.place(), output_ids_dims); - auto parent_ids = paddle::Tensor(input_ids.place(), parent_ids_dims); - auto sequence_length = - paddle::Tensor(input_ids.place(), sequence_length_dims); - auto output_scores = paddle::Tensor(input_ids.place(), output_scores_dims); - - if (input_ids.place() == paddle::PlaceType::kGPU) { - auto mem_seq_length = paddle::Tensor(paddle::PlaceType::kGPU); - - if (mem_seq_len.place() != paddle::PlaceType::kGPU) { - mem_seq_length = mem_seq_len.copy_to(paddle::PlaceType::kGPU); - } else { - mem_seq_length = mem_seq_len; - } - - return UnifiedDecodingCUDAForward(input_ids, - attn_mask, - mem_seq_length, - type_id, - decoder_type_id, - logits_mask, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - trans_weight, - trans_bias, - lm_ln_weight, - lm_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - type_embedding_weight, - role_id, - decoder_role_id, - role_embedding_table, - position_ids, - decoder_position_ids, - output_ids, - parent_ids, - sequence_length, - output_scores, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_out_len, - beam_search_diversity_rate, - unk_id, - mask_id, - temperature, - len_penalty, - normalize_before, - pos_bias, - hidden_act, - early_stopping, - min_length, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - } else { - PD_THROW("Not implemented place. Only GPU is supported. 
"); - } -} - -std::vector> UnifiedDecodingInferShape( - const std::vector& input_ids_shape, - const std::vector& attn_mask_shape, - const std::vector& mem_seq_len_shape, - const std::vector& logits_mask_shape, - const std::vector& type_id_shape, - const std::vector& decoder_type_id_shape, - const std::vector& word_embedding_shape, - const std::vector>& self_ln_weight_shapes, - const std::vector>& self_ln_bias_shapes, - const std::vector>& self_q_weight_shapes, - const std::vector>& self_q_bias_shapes, - const std::vector>& self_k_weight_shapes, - const std::vector>& self_k_bias_shapes, - const std::vector>& self_v_weight_shapes, - const std::vector>& self_v_bias_shapes, - const std::vector>& self_out_weight_shapes, - const std::vector>& self_out_bias_shapes, - const std::vector>& ffn_ln_weight_shapes, - const std::vector>& ffn_ln_bias_shapes, - const std::vector>& ffn_inter_weight_shapes, - const std::vector>& ffn_inter_bias_shapes, - const std::vector>& ffn_out_weight_shapes, - const std::vector>& ffn_out_bias_shapes, - const std::vector& decoder_ln_weight_shape, - const std::vector& decoder_ln_bias_shape, - const std::vector& trans_weight_shape, - const std::vector& trans_bias_shape, - const std::vector& lm_ln_weight_shape, - const std::vector& lm_ln_bias_shape, - const std::vector& embedding_weight_shape, - const std::vector& embedding_bias_shape, - const std::vector& positional_embedding_weight_shape, - const std::vector& type_embedding_weight_shape, - const std::vector& role_id_shape, - const std::vector& decoder_role_id_shape, - const std::vector& role_embedding_table_shape, - const std::vector& position_ids_shape, - const std::vector& decoder_position_ids_shape, - const std::string& decoding_strategy, - const int& beam_size, - const int& topk, - const float& topp, - const int& n_head, - const int& size_per_head, - const int& num_layer, - const int& bos_id, - const int& eos_id, - const int64_t& max_len, - const float& beam_search_diversity_rate, - const int& unk_id, - const int& mask_id, - const float& temperature, - const float& len_penalty, - const bool& normalize_before, - const bool& pos_bias, - const std::string& hidden_act, - const bool& rel_len, - const bool& early_stopping, - const int& min_length, - const int& tensor_para_size = 1, - const int& layer_para_size = 1, - const int& layer_para_batch_size = 1) { - int batch_size = input_ids_shape[0]; - - std::vector output_ids_dims; - std::vector output_scores_dims; - std::vector sequence_length_dims({batch_size}); - if (decoding_strategy == "beam_search") { - if (batch_size != -1) { - batch_size /= beam_size; - } - output_ids_dims = {max_len, batch_size, beam_size}; - output_scores_dims = {batch_size, beam_size}; - return {output_ids_dims, output_ids_dims, sequence_length_dims, output_scores_dims}; - } else if (decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - // Use separated alive and finish beam queues to avoid the decrease of alive - // beams. The outputs must include both the finish and alive to trace full - // path. 
- if (batch_size != -1) { - sequence_length_dims = {batch_size * 2}; - batch_size /= beam_size; - } else { - sequence_length_dims = {batch_size}; - } - output_ids_dims = {max_len, batch_size, beam_size * 2}; - output_scores_dims = {batch_size, beam_size * 2}; - return {output_ids_dims, output_ids_dims, sequence_length_dims, output_scores_dims}; - } else if (decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling" || - decoding_strategy == "sampling") { - output_ids_dims = {max_len, batch_size}; - output_scores_dims = {batch_size}; - return {output_ids_dims, {1}, sequence_length_dims, output_scores_dims}; - } else { - PD_THROW("Not supported decoding strategy. "); - } -} - -std::vector UnifiedDecodingInferDtype( - const paddle::DataType& input_ids, - const paddle::DataType& attn_mask, - const paddle::DataType& mem_seq_len, - const paddle::DataType& logits_mask, - const paddle::DataType& type_id, - const paddle::DataType& decoder_type_id, - const paddle::DataType& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::DataType& decoder_ln_weight, - const paddle::DataType& decoder_ln_bias, - const paddle::DataType& trans_weight, - const paddle::DataType& trans_bias, - const paddle::DataType& lm_ln_weight, - const paddle::DataType& lm_ln_bias, - const paddle::DataType& embedding_weight, - const paddle::DataType& embedding_bias, - const paddle::DataType& positional_embedding_weight, - const paddle::DataType& type_embedding_weight, - const paddle::DataType& role_id, - const paddle::DataType& decoder_role_id, - const paddle::DataType& role_embedding_table, - const paddle::DataType& position_ids, - const paddle::DataType& decoder_position_ids) { - return {paddle::DataType::INT32, - paddle::DataType::INT32, - paddle::DataType::INT32, - paddle::DataType::FLOAT32}; -} - -PD_BUILD_OP(fusion_unified_decoding) - .Inputs({"InputIds", - "AttnMask", - "MemSeqLen", - "TypeIds", - "DecTypeIds", - "LogitsMask", - "WordEmbedding", - paddle::Vec("SelfLayernormWeight"), - paddle::Vec("SelfLayernormBias"), - paddle::Vec("SelfQueryWeight"), - paddle::Vec("SelfQueryBias"), - paddle::Vec("SelfKeyWeight"), - paddle::Vec("SelfKeyBias"), - paddle::Vec("SelfValueWeight"), - paddle::Vec("SelfValueBias"), - paddle::Vec("SelfOutWeight"), - paddle::Vec("SelfOutBias"), - paddle::Vec("FFNLayernormWeight"), - paddle::Vec("FFNLayernormBias"), - paddle::Vec("FFNInterWeight"), - paddle::Vec("FFNInterBias"), - paddle::Vec("FFNOutWeight"), - paddle::Vec("FFNOutBias"), - "DecoderLayernormWeight", - "DecoderLayernormBias", - "TransWeight", - "TransBias", - "LMLayernormWeight", - "LMLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb", - "TypeEmb", - "RoleIds", - "DecRoleIds", - "RoleEmbedding", - "PositionIds", - "DecPositionIds"}) - .Outputs({"OutputIds", "ParentIds", "SequenceLength", "OutputScores"}) - .Attrs({"decoding_strategy: std::string", - "beam_size: int", - "topk: int", - "topp: float", - "n_head: int", - "size_per_head: int", - "num_layer: int", - 
"bos_id: int", - "eos_id: int", - "max_len: int64_t", - "beam_search_diversity_rate: float", - "unk_id: int", - "mask_id: int", - "temperature: float", - "len_penalty: float", - "normalize_before: bool", - "pos_bias: bool", - "hidden_act: std::string", - "rel_len: bool", - "early_stopping: bool", - "min_length: int", - "tensor_para_size: int", - "layer_para_size: int", - "layer_para_batch_size: int"}) - .SetKernelFn(PD_KERNEL(UnifiedDecodingForward)) - .SetInferShapeFn(PD_INFER_SHAPE(UnifiedDecodingInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(UnifiedDecodingInferDtype)); diff --git a/paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.cu b/paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.cu deleted file mode 100644 index 2df429cc9ee9..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.cu +++ /dev/null @@ -1,693 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// TODO(guosheng): `HOST` conflict exists in float.h of paddle and mpi.h of mpi -#include "fusion_unified_decoding_op.h" -#include "pd_traits.h" -#ifdef HOST -#undef HOST -#endif - -#include "fastertransformer/decoding_beamsearch.h" -#include "fastertransformer/decoding_sampling.h" -#include "fastertransformer/utils/common.h" - -#ifdef BUILD_GPT // consistent with FasterTransformer -#include "parallel_utils.h" -#endif - - -template -std::vector unified_decoding_kernel( - const paddle::Tensor& input_ids, - const paddle::Tensor& attn_mask, - const paddle::Tensor& memory_sequence_length, - const paddle::Tensor& type_id, - const paddle::Tensor& decoder_type_id, - const paddle::Tensor& logits_mask, - const paddle::Tensor& word_emb, - const std::vector& self_layernorm_weight, - const std::vector& self_layernorm_bias, - const std::vector& self_attn_query_weight, - const std::vector& self_attn_query_bias, - const std::vector& self_attn_key_weight, - const std::vector& self_attn_key_bias, - const std::vector& self_attn_value_weight, - const std::vector& self_attn_value_bias, - const std::vector& self_attn_output_weight, - const std::vector& self_attn_output_bias, - const std::vector& ffn_layernorm_weight, - const std::vector& ffn_layernorm_bias, - const std::vector& ffn_intermediate_weight, - const std::vector& ffn_intermediate_bias, - const std::vector& ffn_output_weight, - const std::vector& ffn_output_bias, - const paddle::Tensor& decoder_layernorm_weight, - const paddle::Tensor& decoder_layernorm_bias, - const paddle::Tensor& trans_weight, - const paddle::Tensor& trans_bias, - const paddle::Tensor& lm_layernorm_weight, - const paddle::Tensor& lm_layernorm_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& position_encoding_table, - const paddle::Tensor& type_embedding_weight, - const paddle::Tensor& role_id, - const paddle::Tensor& decoder_role_id, - const paddle::Tensor& 
role_embedding_table, - const paddle::Tensor& position_ids, - const paddle::Tensor& decoder_position_ids, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - paddle::Tensor& output_scores, - const std::string& decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int head_num_, - const int size_per_head_, - const int num_layer_, - const int start_id_, - const int end_id_, - const int64_t max_seq_len_, - const float beam_search_diversity_rate_, - const int unk_id, - const int mask_id, - const float temperature, - const float len_penalty, - const bool normalize_before, - const bool pos_bias, - const std::string& hidden_act, - const bool early_stopping, - const int min_length, - cudaStream_t stream, - const int tensor_para_size = 1, - const int layer_para_size = 1, - const int layer_para_batch_size = 1) { - int beam_width_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? beam_size - : 1; - int candidate_num_ = - ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || "sampling" == decoding_strategy) - ? topk - : 1; - float probability_threshold_ = - ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || "sampling" == decoding_strategy) - ? topp - : 0.0; - - auto input_ids_dims = input_ids.shape(); - int batch_size_ = (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") - ? input_ids_dims[0] / beam_width_ - : input_ids_dims[0]; - const int memory_max_seq_len = input_ids_dims[1]; - const int memory_hidden_dim = head_num_ * size_per_head_; - const int vocab_size = word_emb.shape()[0]; - - typedef PDTraits traits_; - typedef typename traits_::DataType DataType_; - typedef typename traits_::data_t data_t_; - - DecodingInitParam decoding_params; - decoding_params.cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - decoding_params.cublaslt_handle = - CublasHandle::GetInstance()->cublaslt_handle_; - - decoding_params.output_ids = output_ids.mutable_data(input_ids.place()); - decoding_params.parent_ids = parent_ids.mutable_data(input_ids.place()); - decoding_params.sequence_length = - sequence_length.mutable_data(input_ids.place()); - decoding_params.output_scores = output_scores.mutable_data(input_ids.place()); - - typedef DecoderTransformerTraits DecodingTraits_; - decoding_params.stream = stream; - fastertransformer::Allocator allocator_(stream); - - decoding_params.d_start_ids = const_cast(input_ids.data()); - decoding_params.d_attn_mask = - reinterpret_cast(const_cast(attn_mask.data())); - decoding_params.d_start_lengths = memory_sequence_length.data(); - - decoding_params.memory_sequence_length = memory_sequence_length.data(); - decoding_params.type_id = type_id.data(); - decoding_params.decoder_type_id = decoder_type_id.data(); - - if (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - decoding_params.request_batch_size = batch_size_ * beam_width_; - } else if (decoding_strategy == "sampling" || - decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - decoding_params.request_batch_size = batch_size_; - } - decoding_params.max_input_len = memory_max_seq_len; - decoding_params.request_input_len = memory_max_seq_len; - decoding_params.request_output_len = max_seq_len_; - -#ifdef BUILD_GPT - auto* 
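The opening of the kernel above folds the op attributes into the FasterTransformer-style parameters (beam width, top-k candidate count, top-p threshold, and the effective batch sizes). A hedged Python sketch of that mapping, with illustrative names:

```python
def decoding_runtime_params(decoding_strategy, beam_size, topk, topp, input_batch):
    """Sketch of the attribute-to-parameter mapping in unified_decoding_kernel."""
    is_beam = decoding_strategy in ("beam_search", "beam_search_v2", "beam_search_v3")
    is_sampling = decoding_strategy in ("topk_sampling", "topp_sampling", "sampling")

    beam_width = beam_size if is_beam else 1
    candidate_num = topk if is_sampling else 1             # top-k cut-off
    probability_threshold = topp if is_sampling else 0.0   # top-p cut-off

    # InputIds arrives already tiled by beam for beam search, so the logical
    # batch size is recovered by dividing, while the request batch size handed
    # to the decoder keeps the tiled layout.
    batch_size = input_batch // beam_width if is_beam else input_batch
    request_batch_size = batch_size * beam_width if is_beam else batch_size
    return beam_width, candidate_num, probability_threshold, batch_size, request_batch_size


print(decoding_runtime_params("topp_sampling", beam_size=4, topk=0, topp=0.9, input_batch=8))
# (1, 0, 0.9, 8, 8)
```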
model_para_desc = ModelParaDescFactory::CreateModelParaDesc( - head_num_, - size_per_head_, - num_layer_, - tensor_para_size, - layer_para_size, - layer_para_batch_size, - const_cast(word_emb.data())); - auto& tensor_parallel_param = model_para_desc->tensor_parallel_param; - auto& layer_parallel_param = model_para_desc->layer_parallel_param; - auto seed = model_para_desc->dist(model_para_desc->gen); -#else - TensorParallelParam tensor_parallel_param; - LayerParallelParam layer_parallel_param; - tensor_parallel_param.rank = 0; - tensor_parallel_param.world_size = 1; - tensor_parallel_param.local_head_num_ = head_num_; - tensor_parallel_param.local_hidden_units_ = memory_hidden_dim; - - layer_parallel_param.rank = 0; - layer_parallel_param.world_size = 1; - layer_parallel_param.layers_per_group = num_layer_; - layer_parallel_param.local_batch_size = batch_size_; - int seed = -1; -#endif - - DecoderInitParam* params = - new DecoderInitParam[num_layer_]; - - // Allow python passing partial weights for model parallel. - int inner_coeff = - (memory_hidden_dim == self_attn_output_weight[0].shape()[0]) - ? ffn_intermediate_weight[0].shape()[1] / memory_hidden_dim - : (ffn_intermediate_weight[0].shape()[1] * tensor_para_size / - memory_hidden_dim); - - for (int i = 0; i < self_layernorm_weight.size(); i++) { - // Allow python passing weights of all layers or only passing the - // corresponding layers to save memory. - int layer_idx = self_layernorm_weight.size() != num_layer_ - ? layer_parallel_param.rank * - layer_parallel_param.layers_per_group + - i - : i; - params[layer_idx].stream = stream; - params[layer_idx].cublas_handle = CublasHandle::GetInstance()->cublas_handle_; - params[layer_idx].cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_; - - if (decoding_strategy == "beam_search" || - decoding_strategy == "beam_search_v2" || - decoding_strategy == "beam_search_v3") { - params[layer_idx].request_batch_size = batch_size_ * beam_width_; - params[layer_idx].request_max_mem_seq_len = memory_max_seq_len; - } else if (decoding_strategy == "sampling" || - decoding_strategy == "topk_sampling" || - decoding_strategy == "topp_sampling") { - params[layer_idx].request_batch_size = batch_size_; - params[layer_idx].request_max_mem_seq_len = memory_max_seq_len; - } - - // self attn - params[layer_idx].self_layernorm.gamma = reinterpret_cast( - self_layernorm_weight[i].data()); - params[layer_idx].self_layernorm.beta = reinterpret_cast( - self_layernorm_bias[i].data()); - // query - params[layer_idx].self_attention.query_weight.kernel = - reinterpret_cast( - self_attn_query_weight[i].data()); - params[layer_idx].self_attention.query_weight.bias = - reinterpret_cast( - self_attn_query_bias[i].data()); - // key - params[layer_idx].self_attention.key_weight.kernel = - reinterpret_cast( - self_attn_key_weight[i].data()); - params[layer_idx].self_attention.key_weight.bias = - reinterpret_cast( - self_attn_key_bias[i].data()); - // value - params[layer_idx].self_attention.value_weight.kernel = - reinterpret_cast( - self_attn_value_weight[i].data()); - params[layer_idx].self_attention.value_weight.bias = - reinterpret_cast( - self_attn_value_bias[i].data()); - // out proj - params[layer_idx].self_attention.attention_output_weight.kernel = - reinterpret_cast( - self_attn_output_weight[i].data()); - - params[layer_idx].self_attention.attention_output_weight.bias = - reinterpret_cast( - self_attn_output_bias[i].data()); - - // ffn - params[layer_idx].ffn_layernorm.gamma = reinterpret_cast( - 
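Two small pieces of arithmetic in the per-layer weight loop above are easy to miss: the layer-index remapping that lets Python pass either all layers' weights or only the local pipeline stage's slice, and `inner_coeff`, the FFN expansion ratio corrected for tensor-parallel weight shards. A hedged sketch of both (function names are illustrative):

```python
def global_layer_index(num_passed_layers, num_layer, layer_para_rank, layers_per_group, i):
    """If only the local slice of weights was passed, offset the loop index by
    this pipeline stage's first layer; otherwise local index == global index."""
    if num_passed_layers != num_layer:
        return layer_para_rank * layers_per_group + i
    return i


def ffn_inner_coeff(ffn_inter_cols, hidden_dim, attn_out_rows, tensor_para_size):
    """FFN expansion ratio; when the weights were already split for tensor
    parallelism, multiply back by the tensor-parallel degree."""
    if attn_out_rows == hidden_dim:                      # unsplit weights
        return ffn_inter_cols // hidden_dim
    return ffn_inter_cols * tensor_para_size // hidden_dim  # per-rank shard


# A 2-stage pipeline with 12 layers and 6 layers passed per stage:
print(global_layer_index(6, 12, layer_para_rank=1, layers_per_group=6, i=2))  # 8
print(ffn_inner_coeff(4096, 1024, attn_out_rows=1024, tensor_para_size=2))    # 4
```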
ffn_layernorm_weight[i].data()); - params[layer_idx].ffn_layernorm.beta = reinterpret_cast( - ffn_layernorm_bias[i].data()); - // intermediate proj - params[layer_idx].ffn.intermediate_weight.kernel = - reinterpret_cast( - ffn_intermediate_weight[i].data()); - params[layer_idx].ffn.intermediate_weight.bias = reinterpret_cast( - ffn_intermediate_bias[i].data()); - // out proj - params[layer_idx].ffn.output_weight.kernel = reinterpret_cast( - ffn_output_weight[i].data()); - params[layer_idx].ffn.output_weight.bias = - reinterpret_cast(ffn_output_bias[i].data()); - } - - decoding_params.layernorm.gamma = reinterpret_cast( - decoder_layernorm_weight.data()); - decoding_params.layernorm.beta = reinterpret_cast( - decoder_layernorm_bias.data()); - decoding_params.trans_kernel = - reinterpret_cast(trans_weight.data()); - decoding_params.trans_bias = - reinterpret_cast(trans_bias.data()); - - decoding_params.lm_layernorm.gamma = - reinterpret_cast(lm_layernorm_weight.data()); - decoding_params.lm_layernorm.beta = - reinterpret_cast(lm_layernorm_bias.data()); - - // For embedding - decoding_params.embedding_table = - reinterpret_cast(word_emb.data()); - // For weight sharing matmul - decoding_params.embedding_kernel = - reinterpret_cast(embedding_weight.data()); - // For matmul bias - decoding_params.embedding_bias = - reinterpret_cast(embedding_bias.data()); - decoding_params.position_encoding_table = reinterpret_cast( - position_encoding_table.data()); - - // For masking some id during gen. - decoding_params.logits_mask = - reinterpret_cast(logits_mask.data()); - - decoding_params.type_table = - reinterpret_cast(type_embedding_weight.data()); - - // For role embedding. - auto role_id_shape = role_id.shape(); - if (role_id_shape.size() > 0 && numel(role_id_shape) > 0) { - decoding_params.role_id = role_id.data(); - decoding_params.decoder_role_id = decoder_role_id.data(); - decoding_params.role_embedding_table = - reinterpret_cast(role_embedding_table.data()); - } - - auto position_id_shape = position_ids.shape(); - if (position_id_shape.size() > 0 && numel(position_id_shape) > 0) { - decoding_params.position_ids = position_ids.data(); - decoding_params.decoder_position_ids = decoder_position_ids.data(); - } - - ActivationType activate = - (hidden_act == "gelu") ? ActivationType::GELU : ActivationType::RELU; - - int finished_candidate_num_ = - ("beam_search_v3" == decoding_strategy) ? 
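The role-id and position-id inputs just above are treated as optional: they are only wired into `decoding_params` when the passed tensor actually has elements, as decided by a `numel` check over its shape. A tiny hedged restatement of that convention:

```python
import math


def is_provided(shape):
    """The "optional tensor" convention used for RoleIds/PositionIds above: an
    input counts as provided only if its shape is non-empty and it holds at
    least one element (illustrative helper, not part of the removed op)."""
    return len(shape) > 0 and math.prod(shape) > 0


print(is_provided([2, 16]))  # True  -> role/position ids are wired into decoding_params
print(is_provided([0]))      # False -> the corresponding fields stay unset
print(is_provided([]))       # False
```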
beam_width_ : beam_width_ * 2; - - if ("beam_search" == decoding_strategy) { - DecodingBeamsearch* unified_decoding_beam_search_; - - unified_decoding_beam_search_ = - new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, /*is_fuse_topk_softMax*/ - true, /*is_fuse_qkv*/ - false, /*keep_alive_beam*/ - len_penalty, /*alpha not used for this case*/ - normalize_before, - 0, /*pos_offset BART only for now*/ - activate, - pos_bias, - true, /*prefix_lm*/ - -1, /*finished_candidate_num*/ - false, /*early_stopping*/ - false, /*is_mbart*/ - min_length, - inner_coeff); - unified_decoding_beam_search_->set_tensor_parallel_param( - tensor_parallel_param); - unified_decoding_beam_search_->set_layer_parallel_param( - layer_parallel_param); - unified_decoding_beam_search_->forward_context(params, decoding_params); - unified_decoding_beam_search_->forward(params, decoding_params); - - delete unified_decoding_beam_search_; - } else if ("beam_search_v2" == decoding_strategy || - "beam_search_v3" == decoding_strategy) { - DecodingBeamsearch* unified_decoding_beam_search_; - - unified_decoding_beam_search_ = - new DecodingBeamsearch( - allocator_, - batch_size_, - beam_width_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - beam_search_diversity_rate_, - true, /*is_fuse_topk_softMax*/ - true, /*is_fuse_qkv*/ - true, /*keep_alive_beam*/ - len_penalty, - normalize_before, - 0, /*pos_offset BART only for now*/ - activate, - pos_bias, - true, /*prefix_lm*/ - finished_candidate_num_, - early_stopping, - false, /*is_mbart*/ - min_length, - inner_coeff); - unified_decoding_beam_search_->forward_context(params, decoding_params); - unified_decoding_beam_search_->forward(params, decoding_params); - - delete unified_decoding_beam_search_; - } else if ("topk_sampling" == decoding_strategy || - "topp_sampling" == decoding_strategy || - "sampling" == decoding_strategy) { - DecodingSampling* unified_decoding_sampling_; - - unified_decoding_sampling_ = new DecodingSampling( - allocator_, - batch_size_, - max_seq_len_, - head_num_, - size_per_head_, - vocab_size, - num_layer_, - memory_hidden_dim, - memory_max_seq_len, - start_id_, - end_id_, - candidate_num_, - probability_threshold_, - true, /*is_fuse_qkv*/ - normalize_before, - 0, /*pos_offset BART only for now*/ - activate, - pos_bias, - temperature, - 1.0, /*repeat_penalty*/ - true, /*prefix_lm*/ - false, /*is_mbart*/ - min_length, - inner_coeff, - seed, - tensor_para_size, - layer_para_size); - unified_decoding_sampling_->set_tensor_parallel_param( - tensor_parallel_param); - unified_decoding_sampling_->set_layer_parallel_param(layer_parallel_param); - unified_decoding_sampling_->forward_context(params, decoding_params); - unified_decoding_sampling_->forward(params, decoding_params); - - delete unified_decoding_sampling_; - } else { - PD_THROW( - "Only beam_search, beam_search_v2, topk_sampling and topp_sampling are " - "supported for " - "FastGeneration. 
"); - } - delete[] params; - - return {output_ids, parent_ids, sequence_length, output_scores}; -} - -std::vector UnifiedDecodingCUDAForward( - const paddle::Tensor& input_ids, - const paddle::Tensor& attn_mask, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& type_id, - const paddle::Tensor& decoder_type_id, - const paddle::Tensor& logits_mask, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& trans_weight, - const paddle::Tensor& trans_bias, - const paddle::Tensor& lm_ln_weight, - const paddle::Tensor& lm_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& type_embedding_weight, - const paddle::Tensor& role_id, - const paddle::Tensor& decoder_role_id, - const paddle::Tensor& role_embedding_table, - const paddle::Tensor& position_ids, - const paddle::Tensor& decoder_position_ids, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - paddle::Tensor& output_scores, - const std::string& decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const int64_t max_len, - const float beam_search_diversity_rate, - const int unk_id, - const int mask_id, - const float temperature, - const float len_penalty, - const bool normalize_before, - const bool pos_bias, - const std::string& hidden_act, - const bool early_stopping, - const int min_length, - const int tensor_para_size = 1, - const int layer_para_size = 1, - const int layer_para_batch_size = 1) { - auto stream = input_ids.stream(); - - cublasSetStream(CublasHandle::GetInstance()->cublas_handle_, stream); - - std::vector ret; - - switch (self_ln_weight[0].type()) { - case paddle::DataType::FLOAT16: { - ret = unified_decoding_kernel( - input_ids, - attn_mask, - mem_seq_len, - type_id, - decoder_type_id, - logits_mask, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - trans_weight, - trans_bias, - lm_ln_weight, - lm_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - type_embedding_weight, - role_id, - decoder_role_id, - role_embedding_table, - position_ids, - decoder_position_ids, - output_ids, - parent_ids, - sequence_length, - output_scores, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - beam_search_diversity_rate, - unk_id, - mask_id, - temperature, - len_penalty, - normalize_before, - pos_bias, - hidden_act, 
- early_stopping, - min_length, - stream, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - break; - } - case paddle::DataType::FLOAT32: { - ret = unified_decoding_kernel( - input_ids, - attn_mask, - mem_seq_len, - type_id, - decoder_type_id, - logits_mask, - word_embedding, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - trans_weight, - trans_bias, - lm_ln_weight, - lm_ln_bias, - embedding_weight, - embedding_bias, - positional_embedding_weight, - type_embedding_weight, - role_id, - decoder_role_id, - role_embedding_table, - position_ids, - decoder_position_ids, - output_ids, - parent_ids, - sequence_length, - output_scores, - decoding_strategy, - beam_size, - topk, - topp, - n_head, - size_per_head, - num_layer, - bos_id, - eos_id, - max_len, - beam_search_diversity_rate, - unk_id, - mask_id, - temperature, - len_penalty, - normalize_before, - pos_bias, - hidden_act, - early_stopping, - min_length, - stream, - tensor_para_size, - layer_para_size, - layer_para_batch_size); - break; - } - default: { - PD_THROW( - "NOT supported data type. " - "Only float16 and float32 are supported. "); - break; - } - } - - return ret; -} diff --git a/paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.h b/paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.h deleted file mode 100644 index 071636b6029c..000000000000 --- a/paddlenlp/ops/fast_transformer/src/fusion_unified_decoding_op.h +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
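The removed `UnifiedDecodingCUDAForward` wrapper above is essentially one dtype switch: the dtype of the first layer-norm weight picks the float16 or float32 kernel instantiation, and anything else is rejected. A hedged sketch of that dispatch (the callables are placeholders for the two template instantiations):

```python
def pick_kernel(weight_dtype, fp16_kernel, fp32_kernel):
    """Sketch of the dtype switch in UnifiedDecodingCUDAForward."""
    kernels = {"float16": fp16_kernel, "float32": fp32_kernel}
    if weight_dtype not in kernels:
        raise TypeError("NOT supported data type. Only float16 and float32 are supported.")
    return kernels[weight_dtype]


run = pick_kernel("float16", fp16_kernel=lambda: "fp16 path", fp32_kernel=lambda: "fp32 path")
print(run())  # fp16 path
```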
*/ -#pragma once - -#include -#include - -// #include "fastertransformer/decoding_beamsearch.h" -// #include "fastertransformer/decoding_sampling.h" -// #include "fastertransformer/open_decoder.h" -// #include "fastertransformer/utils/common.h" -#include "cublas_handle.h" -#include "utils.h" - -#ifdef PADDLE_ON_INFERENCE -#include "paddle/extension.h" -#include "paddle_inference_api.h" -#include "paddle/common/exception.h" -#else -#include "paddle/extension.h" -#endif - - -std::vector UnifiedDecodingCUDAForward( - const paddle::Tensor& input_ids, - const paddle::Tensor& attn_mask, - const paddle::Tensor& mem_seq_len, - const paddle::Tensor& type_id, - const paddle::Tensor& decoder_type_id, - const paddle::Tensor& logits_mask, - const paddle::Tensor& word_embedding, - const std::vector& self_ln_weight, - const std::vector& self_ln_bias, - const std::vector& self_q_weight, - const std::vector& self_q_bias, - const std::vector& self_k_weight, - const std::vector& self_k_bias, - const std::vector& self_v_weight, - const std::vector& self_v_bias, - const std::vector& self_out_weight, - const std::vector& self_out_bias, - const std::vector& ffn_ln_weight, - const std::vector& ffn_ln_bias, - const std::vector& ffn_inter_weight, - const std::vector& ffn_inter_bias, - const std::vector& ffn_out_weight, - const std::vector& ffn_out_bias, - const paddle::Tensor& decoder_ln_weight, - const paddle::Tensor& decoder_ln_bias, - const paddle::Tensor& trans_weight, - const paddle::Tensor& trans_bias, - const paddle::Tensor& lm_ln_weight, - const paddle::Tensor& lm_ln_bias, - const paddle::Tensor& embedding_weight, - const paddle::Tensor& embedding_bias, - const paddle::Tensor& positional_embedding_weight, - const paddle::Tensor& type_embedding_weight, - const paddle::Tensor& role_id, - const paddle::Tensor& decoder_role_id, - const paddle::Tensor& role_embedding_table, - const paddle::Tensor& position_ids, - const paddle::Tensor& decoder_position_ids, - paddle::Tensor& output_ids, - paddle::Tensor& parent_ids, - paddle::Tensor& sequence_length, - paddle::Tensor& output_scores, - const std::string& decoding_strategy, - const int beam_size, - const int topk, - const float topp, - const int n_head, - const int size_per_head, - const int num_layer, - const int bos_id, - const int eos_id, - const int64_t max_len, - const float beam_search_diversity_rate, - const int unk_id, - const int mask_id, - const float temperature, - const float len_penalty, - const bool normalize_before, - const bool pos_bias, - const std::string& hidden_act, - const bool early_stopping, - const int min_length, - const int tensor_para_size, - const int layer_para_size, - const int layer_para_batch_size); diff --git a/paddlenlp/ops/fast_transformer/src/parallel_utils.cc b/paddlenlp/ops/fast_transformer/src/parallel_utils.cc deleted file mode 100644 index 730c3a398649..000000000000 --- a/paddlenlp/ops/fast_transformer/src/parallel_utils.cc +++ /dev/null @@ -1,148 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "parallel_utils.h" - -static std::mutex mpi_global_mutex; -static std::once_flag once_flag_init_mpi; - -void MPIExit() { - std::unique_lock global_lock(mpi_global_mutex); - MPICHECK(MPI_Finalize()); -} - -void InitMPIOnce() { - // Initialize MPI environment - std::call_once(once_flag_init_mpi, []() { - MPICHECK(MPI_Init(nullptr, nullptr)); - if (std::atexit(MPIExit)) { - throw std::runtime_error("Fail to register the MPI exit handler"); - } - }); -} - -void InitNCCLComm(ncclUniqueId& tensor_para_nccl_uid, - ncclUniqueId& layer_para_nccl_uid, - ncclComm_t& tensor_para_nccl_comm, - ncclComm_t& layer_para_nccl_comm, - int rank, - int tensor_para_size, - int layer_para_size, - int tensor_para_rank, - int layer_para_rank) { - // assume gpu_num = n * k, - // tensor parallelism group size is n - // layer parallelism group size is k - - if (tensor_para_rank == 0) { - // get the uid of each tensor parallelism group - // here, 0, 1, ..., n-1 are in group 0, - // n, ..., 2n - 1 are in group 1. - NCCLCHECK(ncclGetUniqueId(&tensor_para_nccl_uid)); - for (int i = 1; i < tensor_para_size; i++) { - printf("[INFO] rank %d sends tensor_para_nccl_uid to rank %d \n", - rank, - rank + i); - MPICHECK(MPI_Send(&tensor_para_nccl_uid, - sizeof(tensor_para_nccl_uid), - MPI_BYTE, - rank + i, - 0, - MPI_COMM_WORLD)); - } - } else { - MPI_Status status; - printf("[INFO] rank %d receives tensor_para_nccl_uid from rank %d \n", - rank, - rank - tensor_para_rank); - MPICHECK(MPI_Recv(&tensor_para_nccl_uid, - sizeof(tensor_para_nccl_uid), - MPI_BYTE, - rank - tensor_para_rank, - 0, - MPI_COMM_WORLD, - &status)); - } - - if (layer_para_rank == 0) { - // get the uid of each layer parallelism group - // 0, k, 2k, are in group 0 - // 1, k+1, 2k+1 are in group 1 - NCCLCHECK(ncclGetUniqueId(&layer_para_nccl_uid)); - for (int i = 1; i < layer_para_size; i++) { - printf("[INFO] rank %d sends layer_para_nccl_uid to rank %d \n", - rank, - rank + i * tensor_para_size); - MPICHECK(MPI_Send(&layer_para_nccl_uid, - sizeof(layer_para_nccl_uid), - MPI_BYTE, - rank + i * tensor_para_size, - 0, - MPI_COMM_WORLD)); - } - } else { - MPI_Status status; - printf("[INFO] rank %d receives layer_para_nccl_uid from rank %d \n", - rank, - rank % tensor_para_size); - MPICHECK(MPI_Recv(&layer_para_nccl_uid, - sizeof(layer_para_nccl_uid), - MPI_BYTE, - rank % tensor_para_size, - 0, - MPI_COMM_WORLD, - &status)); - } - - NCCLCHECK(ncclCommInitRank(&tensor_para_nccl_comm, - tensor_para_size, - tensor_para_nccl_uid, - tensor_para_rank)); - NCCLCHECK(ncclCommInitRank(&layer_para_nccl_comm, - layer_para_size, - layer_para_nccl_uid, - layer_para_rank)); -} - -// Make model parallel settings init only once for one model by using a global -// dict mapping parameters representing different models to corresponding -// settings. Note: `paddle::Tensor` for custom_op is re-created every step and -// we use pointers as keys. Maybe using weakref as keys is better. 
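The `InitNCCLComm` routine removed above assumes the rank layout described in its comments: with `gpu_num = n * k`, ranks `0..n-1` form tensor-parallel group 0, `n..2n-1` group 1, and ranks sharing the same `rank % n` form a layer-parallel (pipeline) group; the group leader broadcasts the NCCL unique id to the other members over MPI. A small sketch of that layout and of the per-rank coordinates (illustrative names only):

```python
def parallel_groups(world_size, tensor_para_size):
    """Rank layout assumed by InitNCCLComm: contiguous tensor-parallel groups,
    strided layer-parallel groups."""
    assert world_size % tensor_para_size == 0
    layer_para_size = world_size // tensor_para_size
    tensor_groups = [list(range(g * tensor_para_size, (g + 1) * tensor_para_size))
                     for g in range(layer_para_size)]
    layer_groups = [list(range(r, world_size, tensor_para_size))
                    for r in range(tensor_para_size)]
    return tensor_groups, layer_groups


def group_ranks(rank, tensor_para_size):
    """Per-rank coordinates used when exchanging the NCCL unique ids."""
    return rank % tensor_para_size, rank // tensor_para_size  # (tensor, layer)


tg, lg = parallel_groups(world_size=8, tensor_para_size=4)
print(tg)                 # [[0, 1, 2, 3], [4, 5, 6, 7]]
print(lg)                 # [[0, 4], [1, 5], [2, 6], [3, 7]]
print(group_ranks(6, 4))  # (2, 1)
```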
-static std::unordered_map> - model_para_infos; - -ModelParaDesc* ModelParaDescFactory::CreateModelParaDesc( - int head_num, - int size_per_head, - int layer_num, - int tensor_para_size, - int layer_para_size, - int layer_para_batch_size, - void* param_ptr = nullptr) { - InitMPIOnce(); - auto it = model_para_infos.find(param_ptr); - if (it != model_para_infos.end()) { - return it->second.get(); - } else { - model_para_infos.emplace(param_ptr, - std::unique_ptr( - new ModelParaDesc(head_num, - size_per_head, - layer_num, - tensor_para_size, - layer_para_size, - layer_para_batch_size))); - return model_para_infos[param_ptr].get(); - } -} diff --git a/paddlenlp/ops/fast_transformer/src/parallel_utils.h b/paddlenlp/ops/fast_transformer/src/parallel_utils.h deleted file mode 100644 index 461e79702d18..000000000000 --- a/paddlenlp/ops/fast_transformer/src/parallel_utils.h +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include - -#include "fastertransformer/utils/nccl_utils.h" - - -void MPIExit(); - -void InitMPIOnce(); - -void InitNCCLComm(ncclUniqueId& tensor_para_nccl_uid, - ncclUniqueId& layer_para_nccl_uid, - ncclComm_t& tensor_para_nccl_comm, - ncclComm_t& layer_para_nccl_comm, - int rank, - int tensor_para_size, - int layer_para_size, - int tensor_para_rank, - int layer_para_rank); - -struct ModelParaDesc { - TensorParallelParam tensor_parallel_param; - LayerParallelParam layer_parallel_param; - ncclComm_t tensor_para_nccl_comm, layer_para_nccl_comm; - std::mt19937_64 gen; - std::uniform_int_distribution<> dist{0, std::numeric_limits::max()}; - - ModelParaDesc(int head_num, - int size_per_head, - int layer_num, - int tensor_para_size, - int layer_para_size, - int layer_para_batch_size) { - int rank; - MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &rank)); - const int local_head_num = head_num / tensor_para_size; - const int local_hidden_units = local_head_num * size_per_head; - const int layers_per_group = layer_num / layer_para_size; - assert(layer_num % layer_para_size == 0); - const int tensor_para_rank = rank % tensor_para_size; - const int layer_para_rank = rank / tensor_para_size; - ncclUniqueId tensor_para_nccl_uid, layer_para_nccl_uid; - InitNCCLComm(tensor_para_nccl_uid, - layer_para_nccl_uid, - tensor_para_nccl_comm, - layer_para_nccl_comm, - rank, - tensor_para_size, - layer_para_size, - tensor_para_rank, - layer_para_rank); - tensor_parallel_param.rank = tensor_para_rank; - tensor_parallel_param.world_size = tensor_para_size; - tensor_parallel_param.local_head_num_ = local_head_num; - tensor_parallel_param.local_hidden_units_ = local_hidden_units; - tensor_parallel_param.nccl_comm = tensor_para_nccl_comm; - layer_parallel_param.rank = layer_para_rank; - layer_parallel_param.world_size = layer_para_size; - layer_parallel_param.layers_per_group = layers_per_group; - layer_parallel_param.local_batch_size = layer_para_batch_size; - 
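The factory removed above caches the expensive MPI/NCCL setup per model, keyed by the raw pointer of one of its parameters, so repeated forward calls reuse the same descriptor. A hedged Python sketch of the same create-once pattern (`make_desc` stands in for the ModelParaDesc constructor):

```python
_model_para_infos = {}  # keyed by the word-embedding parameter's identity


def create_model_para_desc(param, make_desc):
    """Sketch of ModelParaDescFactory::CreateModelParaDesc: set up MPI/NCCL
    once per model and hand back the cached descriptor afterwards."""
    key = id(param)  # the C++ code uses the parameter's data pointer as the key
    if key not in _model_para_infos:
        _model_para_infos[key] = make_desc()
    return _model_para_infos[key]


word_emb = object()
desc_a = create_model_para_desc(word_emb, make_desc=lambda: {"nccl": "initialized"})
desc_b = create_model_para_desc(word_emb, make_desc=lambda: {"nccl": "would re-init"})
print(desc_a is desc_b)  # True: the second call reuses the cached setup
```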
layer_parallel_param.nccl_comm = layer_para_nccl_comm; - // fix the seed to prevent the seed of different gpu are differnet in Tensor - // Parallel - size_t meta_seed = - *(reinterpret_cast(tensor_para_nccl_uid.internal)); - gen = std::mt19937_64(meta_seed); - } - - ~ModelParaDesc() { - if (tensor_para_nccl_comm) ncclCommDestroy(tensor_para_nccl_comm); - if (layer_para_nccl_comm) ncclCommDestroy(layer_para_nccl_comm); - } -}; - -struct ModelParaDescFactory { - static ModelParaDesc* CreateModelParaDesc(int head_num, - int size_per_head, - int layer_num, - int tensor_para_size, - int layer_para_size, - int layer_para_batch_size, - void* param_ptr); -}; diff --git a/paddlenlp/ops/fast_transformer/src/pd_traits.h b/paddlenlp/ops/fast_transformer/src/pd_traits.h deleted file mode 100644 index 0a7a1e26dd90..000000000000 --- a/paddlenlp/ops/fast_transformer/src/pd_traits.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include "fastertransformer/utils/common.h" - -using namespace fastertransformer; - -template -class PDTraits; - -template <> -class PDTraits { -public: - typedef float DataType; - typedef float data_t; - static const OperationType OpType = OperationType::FP32; -}; - -template <> -class PDTraits { -public: - typedef half DataType; - typedef paddle::float16 data_t; - static const OperationType OpType = OperationType::FP16; -}; diff --git a/paddlenlp/ops/fast_transformer/src/utils.cc b/paddlenlp/ops/fast_transformer/src/utils.cc deleted file mode 100644 index fe9652422806..000000000000 --- a/paddlenlp/ops/fast_transformer/src/utils.cc +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "utils.h" - - -const int64_t numel(const std::vector& tensor_shape) { - int size = tensor_shape.size(); - int64_t n = 1; - for (int i = 0; i < size; ++i) { - n *= tensor_shape[i]; - } - return n; -} diff --git a/paddlenlp/ops/fast_transformer/src/utils.h b/paddlenlp/ops/fast_transformer/src/utils.h deleted file mode 100644 index b4731958ab7e..000000000000 --- a/paddlenlp/ops/fast_transformer/src/utils.h +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
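Two tiny pieces from the removed support files above, restated as a hedged Python sketch: the float32/float16 trait mapping that `pd_traits.h` encodes (each Paddle dtype fixes the device compute type and the FasterTransformer OperationType), and the `numel()` element-count helper from `utils.cc`:

```python
import math

# Sketch of the dtype traits: Paddle dtype -> (device compute type, OperationType).
PD_TRAITS = {
    "float32": ("float", "FP32"),
    "float16": ("half", "FP16"),
}


def numel(tensor_shape):
    """Python equivalent of the numel() helper in utils.cc."""
    return math.prod(tensor_shape)  # 1 for an empty shape, matching the C++ loop


print(PD_TRAITS["float16"])         # ('half', 'FP16')
print(numel([3, 4, 5]), numel([]))  # 60 1
```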
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - - -const int64_t numel(const std::vector& tensor_shape); diff --git a/paddlenlp/ops/fast_transformer/transformer/__init__.py b/paddlenlp/ops/fast_transformer/transformer/__init__.py deleted file mode 100644 index 185a92b8d94d..000000000000 --- a/paddlenlp/ops/fast_transformer/transformer/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/paddlenlp/ops/fast_transformer/transformer/decoder.py b/paddlenlp/ops/fast_transformer/transformer/decoder.py deleted file mode 100644 index 82b0f2339aec..000000000000 --- a/paddlenlp/ops/fast_transformer/transformer/decoder.py +++ /dev/null @@ -1,586 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import os - -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from paddlenlp.ops import transfer_param -from paddlenlp.ops.ext_utils import LOADED_EXT, load -from paddlenlp.transformers import ( - PositionalEmbedding, - WordEmbedding, - position_encoding_init, -) -from paddlenlp.utils.log import logger - -from .decoding import run_custom - - -def infer_transformer_decoder( - from_tensor, - memory_tensor, - mem_seq_len, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - old_self_cache_key, - old_self_cache_value, - old_mem_cache, - step, - n_head, - size_per_head, - memory_hidden_dim, - is_fuse_qkv=False, -): - inputs_names = [ - "FromTensor", - "MemoryTensor", - "MemSeqLen", - "SelfLayernormWeight", - "SelfLayernormBias", - "SelfQueryWeight", - "SelfQueryBias", - "SelfKeyWeight", - "SelfKeyBias", - "SelfValueWeight", - "SelfValueBias", - "SelfOutWeight", - "SelfOutBias", - "CrossLayernormWeight", - "CrossLayernormBias", - "CrossQueryWeight", - "CrossQueryBias", - "CrossKeyWeight", - "CrossKeyBias", - "CrossValueWeight", - "CrossValueBias", - "CrossOutWeight", - "CrossOutBias", - "FFNLayernormWeight", - "FFNLayernormBias", - "FFNInterWeight", - "FFNInterBias", - "FFNOutWeight", - "FFNOutBias", - "OldSelfCacheKey", - "OldSelfCacheValue", - ] - - inputs_var = [ - from_tensor, - memory_tensor, - mem_seq_len, - self_ln_weight, - self_ln_bias, - self_q_weight, - self_q_bias, - self_k_weight, - self_k_bias, - self_v_weight, - self_v_bias, - self_out_weight, - self_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - old_self_cache_key, - old_self_cache_value, - old_mem_cache, - ] - - attrs_names = ["step", "n_head", "size_per_head", "memory_hidden_dim", "is_fuse_qkv"] - - attrs_val = [step, n_head, size_per_head, memory_hidden_dim, is_fuse_qkv] - - outputs_names = ["DecoderOutput", "NewSelfCacheKey", "NewSelfCacheValue", "NewMemCache"] - - outputs_dtype = [memory_tensor.dtype] * len(outputs_names) - - return run_custom("fusion_decoder", inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype) - - -def get_op_cache_config(use_batch_major_op_cache, size_per_head, is_fp16): - x = 8 if is_fp16 else 4 - use_batch_major_op_cache = True if use_batch_major_op_cache is True and size_per_head % x == 0 else False - x = x if use_batch_major_op_cache else 1 - return use_batch_major_op_cache, x - - -class InferTransformerDecoder(nn.Layer): - """ - FasterTransformer decoder block. - - Args: - decoder (`TransformerDecoder`): - Transformer decoder block. - n_head (`int`): - The number of head used in multi-head attention. - size_per_head (`int`): - The size of per head used in multi-head attention. - decoder_lib (`str`): - The path to decoder_lib. Default to None. - use_fp16_decoder (`bool`): - Whether to use fp16 for decoder. Default to False. 
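The `get_op_cache_config` helper removed above decides whether the batch-major KV cache layout can be used: it requires `size_per_head` to be divisible by 8 in fp16 or 4 in fp32 (presumably the number of elements in a 16-byte vector), and otherwise falls back with `x = 1`. A quick standalone check of that rule, restating the function:

```python
def get_op_cache_config(use_batch_major_op_cache, size_per_head, is_fp16):
    # Restated from the removed decoder.py for illustration.
    x = 8 if is_fp16 else 4
    use_batch_major_op_cache = use_batch_major_op_cache and size_per_head % x == 0
    x = x if use_batch_major_op_cache else 1
    return use_batch_major_op_cache, x


for is_fp16, size_per_head in [(True, 64), (True, 60), (False, 64)]:
    print(is_fp16, size_per_head, "->", get_op_cache_config(True, size_per_head, is_fp16))
# True 64 -> (True, 8)
# True 60 -> (False, 1)
# False 64 -> (True, 4)
```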
- """ - - def __init__( - self, decoder, n_head, size_per_head, decoder_lib=None, use_fp16_decoder=False, use_batch_major_op_cache=False - ): - - if decoder_lib is not None and os.path.isfile(decoder_lib): - # Maybe it has been loadad by `ext_utils.load` - if "FastGeneration" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(decoder_lib) - LOADED_EXT["FastGeneration"] = ops - else: - if decoder_lib is not None: - logger.warning("The specified decoder_lib does not exist, and it will be built automatically.") - load("FastGeneration", verbose=True) - - super(InferTransformerDecoder, self).__init__() - self.n_head = n_head - self.size_per_head = size_per_head - self.use_batch_major_op_cache = use_batch_major_op_cache - - if use_fp16_decoder: - for idx, mod in enumerate(decoder.layers): - mod.norm1.weight = transfer_param(mod.norm1.weight) - mod.norm1.bias = transfer_param(mod.norm1.bias, is_bias=True) - mod.self_attn.q_proj.weight = transfer_param(mod.self_attn.q_proj.weight) - mod.self_attn.q_proj.bias = transfer_param(mod.self_attn.q_proj.bias, is_bias=True) - mod.self_attn.k_proj.weight = transfer_param(mod.self_attn.k_proj.weight) - mod.self_attn.k_proj.bias = transfer_param(mod.self_attn.k_proj.bias, is_bias=True) - mod.self_attn.v_proj.weight = transfer_param(mod.self_attn.v_proj.weight) - mod.self_attn.v_proj.bias = transfer_param(mod.self_attn.v_proj.bias, is_bias=True) - mod.self_attn.out_proj.weight = transfer_param(mod.self_attn.out_proj.weight) - mod.self_attn.out_proj.bias = transfer_param(mod.self_attn.out_proj.bias, is_bias=True) - - mod.norm2.weight = transfer_param(mod.norm2.weight) - mod.norm2.bias = transfer_param(mod.norm2.bias, is_bias=True) - mod.cross_attn.q_proj.weight = transfer_param(mod.cross_attn.q_proj.weight) - mod.cross_attn.q_proj.bias = transfer_param(mod.cross_attn.q_proj.bias, is_bias=True) - mod.cross_attn.k_proj.weight = transfer_param(mod.cross_attn.k_proj.weight) - mod.cross_attn.k_proj.bias = transfer_param(mod.cross_attn.k_proj.bias, is_bias=True) - mod.cross_attn.v_proj.weight = transfer_param(mod.cross_attn.v_proj.weight) - mod.cross_attn.v_proj.bias = transfer_param(mod.cross_attn.v_proj.bias, is_bias=True) - mod.cross_attn.out_proj.weight = transfer_param(mod.cross_attn.out_proj.weight) - mod.cross_attn.out_proj.bias = transfer_param(mod.cross_attn.out_proj.bias, is_bias=True) - - mod.norm3.weight = transfer_param(mod.norm3.weight) - mod.norm3.bias = transfer_param(mod.norm3.bias, is_bias=True) - mod.linear1.weight = transfer_param(mod.linear1.weight) - mod.linear1.bias = transfer_param(mod.linear1.bias, is_bias=True) - mod.linear2.weight = transfer_param(mod.linear2.weight) - mod.linear2.bias = transfer_param(mod.linear2.bias, is_bias=True) - - self.weights = [] - for idx, mod in enumerate(decoder.layers): - layer_weight = [] - layer_weight.append(mod.norm1.weight) - layer_weight.append(mod.norm1.bias) - layer_weight.append(mod.self_attn.q_proj.weight) - layer_weight.append(mod.self_attn.q_proj.bias) - layer_weight.append(mod.self_attn.k_proj.weight) - layer_weight.append(mod.self_attn.k_proj.bias) - layer_weight.append(mod.self_attn.v_proj.weight) - layer_weight.append(mod.self_attn.v_proj.bias) - layer_weight.append(mod.self_attn.out_proj.weight) - layer_weight.append(mod.self_attn.out_proj.bias) - layer_weight.append(mod.norm2.weight) - layer_weight.append(mod.norm2.bias) - layer_weight.append(mod.cross_attn.q_proj.weight) - layer_weight.append(mod.cross_attn.q_proj.bias) - 
layer_weight.append(mod.cross_attn.k_proj.weight) - layer_weight.append(mod.cross_attn.k_proj.bias) - layer_weight.append(mod.cross_attn.v_proj.weight) - layer_weight.append(mod.cross_attn.v_proj.bias) - layer_weight.append(mod.cross_attn.out_proj.weight) - layer_weight.append(mod.cross_attn.out_proj.bias) - layer_weight.append(mod.norm3.weight) - layer_weight.append(mod.norm3.bias) - layer_weight.append(mod.linear1.weight) - layer_weight.append(mod.linear1.bias) - layer_weight.append(mod.linear2.weight) - layer_weight.append(mod.linear2.bias) - self.weights.append(layer_weight) - - def forward( - self, - from_tensor, - memory_tensor, - mem_seq_len, - self_cache_key, - self_cache_value, - mem_cache, - step, - memory_hidden_dim, - is_fuse_qkv, - ): - decoder_output = from_tensor - self_caches_key = [] - self_caches_value = [] - mem_caches = [] - if not self.use_batch_major_op_cache: - self_cache_key = paddle.concat( - [ - self_cache_key, - paddle.zeros( - shape=[len(self.weights), 1, memory_tensor.shape[0], self.n_head * self.size_per_head], - dtype=self_cache_key.dtype, - ), - ], - axis=1, - ) - self_cache_value = paddle.concat( - [ - self_cache_value, - paddle.zeros( - shape=[len(self.weights), 1, memory_tensor.shape[0], self.n_head * self.size_per_head], - dtype=self_cache_value.dtype, - ), - ], - axis=1, - ) - for idx in range(len(self.weights)): - weight = self.weights[idx] - decoder_output, new_self_cache_key, new_self_cache_value, new_mem_cache = infer_transformer_decoder( - from_tensor=decoder_output, - memory_tensor=memory_tensor, - mem_seq_len=mem_seq_len, - self_ln_weight=weight[0], - self_ln_bias=weight[1], - self_q_weight=weight[2], - self_q_bias=weight[3], - self_k_weight=weight[4], - self_k_bias=weight[5], - self_v_weight=weight[6], - self_v_bias=weight[7], - self_out_weight=weight[8], - self_out_bias=weight[9], - cross_ln_weight=weight[10], - cross_ln_bias=weight[11], - cross_q_weight=weight[12], - cross_q_bias=weight[13], - cross_k_weight=weight[14], - cross_k_bias=weight[15], - cross_v_weight=weight[16], - cross_v_bias=weight[17], - cross_out_weight=weight[18], - cross_out_bias=weight[19], - ffn_ln_weight=weight[20], - ffn_ln_bias=weight[21], - ffn_inter_weight=weight[22], - ffn_inter_bias=weight[23], - ffn_out_weight=weight[24], - ffn_out_bias=weight[25], - old_self_cache_key=self_cache_key[idx], - old_self_cache_value=self_cache_value[idx], - old_mem_cache=mem_cache[idx], - step=step, - n_head=self.n_head, - size_per_head=self.size_per_head, - memory_hidden_dim=memory_hidden_dim, - is_fuse_qkv=is_fuse_qkv, - ) - self_caches_key.append(new_self_cache_key) - self_caches_value.append(new_self_cache_value) - mem_caches.append(new_mem_cache) - - self_cache_key = paddle.stack(self_caches_key, axis=0) - self_cache_value = paddle.stack(self_caches_value, axis=0) - mem_cache = paddle.stack(mem_caches, axis=0) - return decoder_output, self_cache_key, self_cache_value, mem_cache - - -class FasterDecoder(nn.Layer): - """ - FasterTransformer decoder for auto-regressive generation. - - Args: - src_vocab_size (`int`): - The size of source vocabulary. - trg_vocab_size (`int`): - The size of target vocabulary. - max_length (`int`): - The maximum length of input sequences. - num_encoder_layers (`int`): - The number of sub-layers to be stacked in the encoder. - num_decoder_layers (`int`): - The number of sub-layers to be stacked in the decoder. - n_head (`int`): - The number of head used in multi-head attention. 
- d_model (`int`): - The dimension for word embeddings, which is also the last dimension of - the input and output of multi-head attention, position-wise feed-forward - networks, encoder and decoder. - d_inner_hid (`int`): - Size of the hidden layer in position-wise feed-forward networks. - dropout (`float`): - Dropout rates. Used for pre-process, activation and inside attention. - weight_sharing (`bool`): - Whether to use weight sharing. - bos_id (`int`, optional): - The start token id and also is used as padding id. Defaults to 0. - eos_id (`int`, optional): - The end token id. Defaults to 1. - max_out_len (int, optional): - The maximum output length. Defaults to 256. - decoder_lib (`str`): - The path to decoder_lib. Default to None. - use_fp16_decoder (`bool`): - Whether to use fp16 for decoder. Default to False. - """ - - def __init__( - self, - src_vocab_size, - trg_vocab_size, - max_length, - num_encoder_layers, - num_decoder_layers, - n_head, - d_model, - d_inner_hid, - dropout, - weight_sharing, - bos_id=0, - eos_id=1, - max_out_len=256, - decoder_lib=None, - use_fp16_decoder=False, - use_batch_major_op_cache=False, - ): - super().__init__() - self.trg_vocab_size = trg_vocab_size - self.n_head = n_head - self.emb_dim = d_model - self.bos_id = bos_id - self.eos_id = eos_id - self.dropout = dropout - self.max_out_len = max_out_len - self.max_length = max_length - self.use_fp16_decoder = use_fp16_decoder - self.num_decoder_layers = num_decoder_layers - self.d_model = d_model - self.size_per_head = d_model // n_head - self.use_batch_major_op_cache, self.x = get_op_cache_config( - use_batch_major_op_cache, self.size_per_head, use_fp16_decoder - ) - - self.src_word_embedding = WordEmbedding(vocab_size=src_vocab_size, emb_dim=d_model, bos_id=self.bos_id) - # print(self.src_word_embedding.word_embedding.weight) - self.src_pos_embedding = PositionalEmbedding(emb_dim=d_model, max_length=max_length) - if weight_sharing: - assert ( - src_vocab_size == trg_vocab_size - ), "Vocabularies in source and target should be same for weight sharing." 
- self.trg_word_embedding = self.src_word_embedding - self.trg_pos_embedding = self.src_pos_embedding - else: - self.trg_word_embedding = WordEmbedding(vocab_size=trg_vocab_size, emb_dim=d_model, bos_id=self.bos_id) - self.trg_pos_embedding = PositionalEmbedding(emb_dim=d_model, max_length=max_length) - - self.transformer = paddle.nn.Transformer( - d_model=d_model, - nhead=n_head, - num_encoder_layers=num_encoder_layers, - num_decoder_layers=num_decoder_layers, - dim_feedforward=d_inner_hid, - dropout=dropout, - activation="relu", - normalize_before=True, - ) - - self.decoder = InferTransformerDecoder( - decoder=self.transformer.decoder, - n_head=n_head, - size_per_head=self.size_per_head, - decoder_lib=decoder_lib, - use_fp16_decoder=use_fp16_decoder, - use_batch_major_op_cache=self.use_batch_major_op_cache, - ) - - if weight_sharing: - self.linear = lambda x: paddle.matmul( - x=x, y=self.trg_word_embedding.word_embedding.weight, transpose_y=True - ) - else: - self.linear = nn.Linear(in_features=d_model, out_features=trg_vocab_size, bias_attr=False) - - def forward(self, src_word): - src_max_len = src_word.shape[-1] - mem_seq_lens = paddle.sum( - paddle.cast(src_word != self.bos_id, dtype="int32"), axis=-1, keepdim=True, dtype="int32" - ) - - src_slf_attn_bias = ( - paddle.cast(src_word == self.bos_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9 - ) - - src_slf_attn_bias.stop_gradient = True - - src_pos = paddle.cast(src_word != self.bos_id, dtype="int64") * paddle.arange(start=0, end=src_max_len) - - src_emb = self.src_word_embedding(src_word) - - src_pos_emb = self.src_pos_embedding(src_pos) - src_emb = src_emb + src_pos_emb - enc_input = F.dropout(src_emb, p=self.dropout, training=self.training) if self.dropout else src_emb - enc_output = self.transformer.encoder(enc_input, src_mask=src_slf_attn_bias) - - batch_size, _, memory_hidden_dim = enc_output.shape - end_token_tensor = paddle.full(shape=[batch_size, 1], fill_value=self.eos_id, dtype="int64") - - predict_ids = [] - log_probs = paddle.full(shape=[batch_size, 1], fill_value=0, dtype="float32") - trg_word = paddle.full(shape=[batch_size, 1], fill_value=self.bos_id, dtype="int64") - - if self.use_fp16_decoder: - enc_output = paddle.cast(enc_output, "float16") - - # Init cache - if not self.use_batch_major_op_cache: - self_cache_key = paddle.zeros( - shape=[self.num_decoder_layers, 0, batch_size, self.d_model], dtype=enc_output.dtype - ) - self_cache_value = paddle.zeros( - shape=[self.num_decoder_layers, 0, batch_size, self.d_model], dtype=enc_output.dtype - ) - else: - self_cache_key = paddle.zeros( - shape=[ - self.num_decoder_layers, - batch_size, - self.n_head, - self.size_per_head // self.x, - self.max_out_len, - self.x, - ], - dtype=enc_output.dtype, - ) - self_cache_value = paddle.zeros( - shape=[self.num_decoder_layers, batch_size, self.n_head, self.max_out_len, self.size_per_head], - dtype=enc_output.dtype, - ) - mem_cache = paddle.zeros( - shape=[self.num_decoder_layers, 2, batch_size, src_max_len, self.d_model], dtype=enc_output.dtype - ) - for i in range(self.max_out_len): - trg_pos = paddle.full(shape=trg_word.shape, fill_value=i, dtype="int64") - trg_emb = self.trg_word_embedding(trg_word) - trg_pos_emb = self.trg_pos_embedding(trg_pos) - trg_emb = trg_emb + trg_pos_emb - dec_input = F.dropout(trg_emb, p=self.dropout, training=self.training) if self.dropout else trg_emb - - # TODO(gongenlei): do cast in op - if self.use_fp16_decoder: - dec_input = paddle.cast(dec_input, "float16") - dec_output, 
self_cache_key, self_cache_value, mem_cache = self.decoder( - from_tensor=dec_input, - memory_tensor=enc_output, - mem_seq_len=mem_seq_lens, - self_cache_key=self_cache_key, - self_cache_value=self_cache_value, - mem_cache=mem_cache, - step=i, - memory_hidden_dim=memory_hidden_dim, - is_fuse_qkv=False, - ) - - if self.use_fp16_decoder: - dec_output = paddle.cast(dec_output, "float32") - - dec_output = paddle.reshape(dec_output, shape=[-1, dec_output.shape[-1]]) - - logits = self.linear(dec_output) - step_log_probs = paddle.log(F.softmax(logits, axis=-1)) - log_probs = paddle.add(x=step_log_probs, y=log_probs) - scores = log_probs - topk_scores, topk_indices = paddle.topk(x=scores, k=1) - - finished = paddle.equal(topk_indices, end_token_tensor) - trg_word = topk_indices - log_probs = topk_scores - - predict_ids.append(topk_indices) - - # TODO(gongenlei): support static graph - if paddle.all(finished).numpy(): - break - - predict_ids = paddle.stack(predict_ids, axis=0) - finished_seq = paddle.transpose(predict_ids, [1, 2, 0]) - finished_scores = topk_scores - - return finished_seq, finished_scores - - def load(self, init_from_params): - # Load the trained model - assert init_from_params, "Please set init_from_params to load the infer model." - - model_dict = paddle.load(init_from_params, return_numpy=True) - - # To set weight[padding_idx] to 0. - model_dict["trg_word_embedding.word_embedding.weight"][self.bos_id] = [0] * self.d_model - - # To avoid a longer length than training, reset the size of position - # encoding to max_length - model_dict["encoder.pos_encoder.weight"] = position_encoding_init(self.max_length, self.d_model) - model_dict["decoder.pos_encoder.weight"] = position_encoding_init(self.max_length, self.d_model) - - if self.use_fp16_decoder: - for item in self.state_dict(): - if "decoder.layers" in item: - model_dict[item] = np.float16(model_dict[item]) - - self.load_dict(model_dict) diff --git a/paddlenlp/ops/fast_transformer/transformer/decoding.py b/paddlenlp/ops/fast_transformer/transformer/decoding.py deleted file mode 100644 index 28b30faebc2b..000000000000 --- a/paddlenlp/ops/fast_transformer/transformer/decoding.py +++ /dev/null @@ -1,4550 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
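The step loop in `FasterDecoder.forward` above is a greedy (top-1) decode: embed the previously chosen token, run one fused decoder step, take the best next token from the log-softmax, accumulate the score, and stop once every sequence has emitted EOS. A simplified, hedged NumPy sketch of that control flow (`step_logits_fn` stands in for the fused decoder plus output projection; it is not part of the removed code, and the scoring is condensed relative to the original):

```python
import numpy as np


def greedy_decode(step_logits_fn, batch_size, bos_id, eos_id, max_out_len):
    """Top-1 decoding loop, mirroring the shape of FasterDecoder.forward."""
    tokens = np.full((batch_size, 1), bos_id, dtype=np.int64)
    scores = np.zeros((batch_size, 1), dtype=np.float32)
    predict_ids = []
    for step in range(max_out_len):
        logits = step_logits_fn(tokens, step)                               # [batch, vocab]
        log_probs = logits - np.log(np.exp(logits).sum(-1, keepdims=True))  # log-softmax
        tokens = log_probs.argmax(-1)[:, None]                              # top-1, like paddle.topk(k=1)
        scores = scores + np.take_along_axis(log_probs, tokens, axis=-1)
        predict_ids.append(tokens)
        if np.all(tokens == eos_id):                                        # early exit when all finished
            break
    return np.concatenate(predict_ids, axis=-1), scores


def toy_model(tokens, step):
    # Always prefers token 3, then switches to EOS (id 1) from step 2 onwards.
    row = np.array([0.0, 5.0 if step >= 2 else 0.0, 0.0, 3.0], dtype=np.float32)
    return np.tile(row, (tokens.shape[0], 1))


seq, scores = greedy_decode(toy_model, batch_size=2, bos_id=0, eos_id=1, max_out_len=8)
print(seq)  # [[3 3 1]
            #  [3 3 1]]
```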
-import functools -import os -from collections import defaultdict -from functools import partial - -import numpy as np -import paddle -import paddle.nn as nn -from paddle.common_ops_import import LayerHelper -from paddle.framework import core - -import paddlenlp -from paddlenlp.ops.ext_utils import LOADED_EXT, load -from paddlenlp.transformers import OPTForCausalLM -from paddlenlp.transformers.t5.modeling import T5DenseGatedGeluDense, T5DenseReluDense -from paddlenlp.transformers.utils import fn_args_to_dict -from paddlenlp.utils.log import logger - - -def run_custom(op_name, inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype=None): - ret = [] - - if paddle.in_dynamic_mode(): - new_inputs_var = [] - for k, v in zip(inputs_names, inputs_var): - if not k.endswith("@VECTOR") and isinstance(v, (list, tuple)) and len(v) == 1: - new_inputs_var.append(v[0]) - else: - new_inputs_var.append(v) - outs = core.eager._run_custom_op(op_name, *new_inputs_var, *attrs_val) - return outs[0] if len(outs) == 1 else outs - else: - inputs = dict(zip(inputs_names, inputs_var)) - attrs = dict(zip(attrs_names, attrs_val)) - outputs = {} - - helper = LayerHelper(op_name, **locals()) - - for i, name in enumerate(outputs_names): - outputs[name] = helper.create_variable(dtype=outputs_dtype[i]) - ret.append(outputs[name]) - - helper.append_op(type=op_name, inputs=inputs, outputs=outputs, attrs=attrs) - - return ret - - -def infer_transformer_decoding( - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - linear_weight, - linear_bias, - pos_emb, - _decoding_strategy, - _beam_size, - _topk, - _topp, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _diversity_rate, - _rel_len, - _alpha, -): - - inputs_names = [ - "Input", - "MemSeqLen", - "WordEmbedding", - "SelfLayernormWeight@VECTOR", - "SelfLayernormBias@VECTOR", - "SelfQueryWeight@VECTOR", - "SelfQueryBias@VECTOR", - "SelfKeyWeight@VECTOR", - "SelfKeyBias@VECTOR", - "SelfValueWeight@VECTOR", - "SelfValueBias@VECTOR", - "SelfOutWeight@VECTOR", - "SelfOutBias@VECTOR", - "CrossLayernormWeight@VECTOR", - "CrossLayernormBias@VECTOR", - "CrossQueryWeight@VECTOR", - "CrossQueryBias@VECTOR", - "CrossKeyWeight@VECTOR", - "CrossKeyBias@VECTOR", - "CrossValueWeight@VECTOR", - "CrossValueBias@VECTOR", - "CrossOutWeight@VECTOR", - "CrossOutBias@VECTOR", - "FFNLayernormWeight@VECTOR", - "FFNLayernormBias@VECTOR", - "FFNInterWeight@VECTOR", - "FFNInterBias@VECTOR", - "FFNOutWeight@VECTOR", - "FFNOutBias@VECTOR", - "DecoderLayernormWeight", - "DecoderLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb", - ] - - inputs_var = [ - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - 
ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - linear_weight, - linear_bias, - pos_emb, - ] - - attrs_names = [ - "decoding_strategy", - "beam_size", - "topk", - "topp", - "n_head", - "size_per_head", - "num_layer", - "bos_id", - "eos_id", - "max_len", - "beam_search_diversity_rate", - "rel_len", - "alpha", - ] - - attrs_val = [ - _decoding_strategy, - _beam_size, - _topk, - _topp, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _diversity_rate, - _rel_len, - _alpha, - ] - - outputs_names = ["OutputIds", "ParentIds", "SequenceLength"] - - outputs_dtype = ["int32"] * len(outputs_names) - - return run_custom( - "fusion_decoding", inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype - ) - - -def infer_force_decoding( - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - linear_weight, - linear_bias, - pos_emb, - trg_word, - _decoding_strategy, - _beam_size, - _topk, - _topp, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _diversity_rate, - _rel_len, - _alpha, -): - - inputs_names = [ - "Input", - "MemSeqLen", - "WordEmbedding", - "SelfLayernormWeight@VECTOR", - "SelfLayernormBias@VECTOR", - "SelfQueryWeight@VECTOR", - "SelfQueryBias@VECTOR", - "SelfKeyWeight@VECTOR", - "SelfKeyBias@VECTOR", - "SelfValueWeight@VECTOR", - "SelfValueBias@VECTOR", - "SelfOutWeight@VECTOR", - "SelfOutBias@VECTOR", - "CrossLayernormWeight@VECTOR", - "CrossLayernormBias@VECTOR", - "CrossQueryWeight@VECTOR", - "CrossQueryBias@VECTOR", - "CrossKeyWeight@VECTOR", - "CrossKeyBias@VECTOR", - "CrossValueWeight@VECTOR", - "CrossValueBias@VECTOR", - "CrossOutWeight@VECTOR", - "CrossOutBias@VECTOR", - "FFNLayernormWeight@VECTOR", - "FFNLayernormBias@VECTOR", - "FFNInterWeight@VECTOR", - "FFNInterBias@VECTOR", - "FFNOutWeight@VECTOR", - "FFNOutBias@VECTOR", - "DecoderLayernormWeight", - "DecoderLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb", - # The input of custom op must be given. - # Dispensable() and Intermediate() are not supported. - "TrgWord", - ] - - inputs_var = [ - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - linear_weight, - linear_bias, - pos_emb, - # The input of custom op must be given. - # Dispensable() and Intermediate() are not supported. 
- trg_word, - ] - - attrs_names = [ - "decoding_strategy", - "beam_size", - "topk", - "topp", - "n_head", - "size_per_head", - "num_layer", - "bos_id", - "eos_id", - "max_len", - "beam_search_diversity_rate", - "rel_len", - "alpha", - ] - - attrs_val = [ - _decoding_strategy, - _beam_size, - _topk, - _topp, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _diversity_rate, - _rel_len, - _alpha, - ] - - outputs_names = ["OutputIds", "ParentIds", "SequenceLength"] - - outputs_dtype = ["int32"] * len(outputs_names) - - return run_custom( - "fusion_force_decoding", inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype - ) - - -def infer_opt_decoding( - input, - attn_mask, - mem_seq_len, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - pos_emb, - linear_weight, - normalize_before, - topk, - topp, - max_out_len, - head_num, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - use_fp16_decoding, -): - helper = LayerHelper("fusion_opt", **locals()) - - inputs = { - "Input": input, - "AttentionMask": attn_mask, - "StartLength": mem_seq_len, - "WordEmbedding": word_emb, - "SelfLayernormWeight@VECTOR": slf_ln_weight, - "SelfLayernormBias@VECTOR": slf_ln_bias, - "SelfQueryWeight@VECTOR": slf_q_weight, - "SelfQueryBias@VECTOR": slf_q_bias, - "SelfKeyWeight@VECTOR": slf_k_weight, - "SelfKeyBias@VECTOR": slf_k_bias, - "SelfValueWeight@VECTOR": slf_v_weight, - "SelfValueBias@VECTOR": slf_v_bias, - "SelfOutWeight@VECTOR": slf_out_weight, - "SelfOutBias@VECTOR": slf_out_bias, - "FFNLayernormWeight@VECTOR": ffn_ln_weight, - "FFNLayernormBias@VECTOR": ffn_ln_bias, - "FFNInterWeight@VECTOR": ffn_inter_weight, - "FFNInterBias@VECTOR": ffn_inter_bias, - "FFNOutWeight@VECTOR": ffn_out_weight, - "FFNOutBias@VECTOR": ffn_out_bias, - "DecoderLayernormWeight": decoder_ln_weight, - "DecoderLayernormBias": decoder_ln_bias, - "PositionEncEmb": pos_emb, - "EmbWeight": linear_weight, - } - tensor_para_size = get_ft_para_conf().tensor_para_size - layer_para_size = get_ft_para_conf().layer_para_size - layer_para_batch_size = get_ft_para_conf().layer_para_batch_size - attrs = { - "normalize_before": normalize_before, - "topk": topk, - "topp": topp, - "max_len": max_out_len, - "n_head": head_num, - "size_per_head": size_per_head, - "num_layer": num_layer, - "bos_id": bos_id, - "eos_id": eos_id, - "temperature": temperature, - "use_fp16": use_fp16_decoding, - "tensor_para_size": tensor_para_size, - "layer_para_size": layer_para_size, - "layer_para_batch_size": layer_para_batch_size, - } - - output_ids = helper.create_variable(dtype="int32") - outputs = {"OutputIds": output_ids} - - helper.append_op(type="fusion_opt", inputs=inputs, outputs=outputs, attrs=attrs) - - return output_ids - - -def infer_gpt_decoding( - input, - attn_mask, - mem_seq_len, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - pos_emb, - linear_weight, - topk, - topp, - max_out_len, - head_num, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - 
use_fp16_decoding, -): - - tensor_para_size = get_ft_para_conf().tensor_para_size - layer_para_size = get_ft_para_conf().layer_para_size - layer_para_batch_size = get_ft_para_conf().layer_para_batch_size - - inputs_names = [ - "Input", - "AttentionMask", - "StartLength", - "WordEmbedding", - "SelfLayernormWeight@VECTOR", - "SelfLayernormBias@VECTOR", - "SelfQueryWeight@VECTOR", - "SelfQueryBias@VECTOR", - "SelfKeyWeight@VECTOR", - "SelfKeyBias@VECTOR", - "SelfValueWeight@VECTOR", - "SelfValueBias@VECTOR", - "SelfOutWeight@VECTOR", - "SelfOutBias@VECTOR", - "FFNLayernormWeight@VECTOR", - "FFNLayernormBias@VECTOR", - "FFNInterWeight@VECTOR", - "FFNInterBias@VECTOR", - "FFNOutWeight@VECTOR", - "FFNOutBias@VECTOR", - "DecoderLayernormWeight", - "DecoderLayernormBias", - "PositionEncEmb", - "EmbWeight", - ] - - inputs_var = [ - input, - attn_mask, - mem_seq_len, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - pos_emb, - linear_weight, - ] - - attrs_names = [ - "topk", - "topp", - "max_len", - "n_head", - "size_per_head", - "num_layer", - "bos_id", - "eos_id", - "temperature", - "use_fp16", - "tensor_para_size", - "layer_para_size", - "layer_para_batch_size", - ] - - attrs_val = [ - topk, - topp, - max_out_len, - head_num, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - use_fp16_decoding, - tensor_para_size, - layer_para_size, - layer_para_batch_size, - ] - - outputs_names = ["OutputIds"] - - outputs_dtype = ["int32"] - - return run_custom("fusion_gpt", inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype) - - -def infer_unified_decoding( - input_ids, - attn_mask, - memory_seq_lens, - type_id, - decoder_type_id, - logits_mask, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - trans_weight, - trans_bias, - lm_ln_weight, - lm_ln_bias, - linear_weight, - linear_bias, - pos_emb, - type_emb, - role_id, - decoder_role_id, - role_emb, - position_id, - decoder_position_id, - _decoding_strategy, - _beam_size, - _topk, - _topp, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _diversity_rate, - _unk_id, - _mask_id, - _temperature, - _len_penalty, - _normalize_before, - _pos_bias, - _hidden_act, - _rel_len, - _early_stopping, - _min_length, -): - - tensor_para_size = get_ft_para_conf().tensor_para_size - layer_para_size = get_ft_para_conf().layer_para_size - layer_para_batch_size = get_ft_para_conf().layer_para_batch_size - - inputs_names = [ - "InputIds", - "AttnMask", - "MemSeqLen", - "TypeIds", - "DecTypeIds", - "LogitsMask", - "WordEmbedding", - "SelfLayernormWeight@VECTOR", - "SelfLayernormBias@VECTOR", - "SelfQueryWeight@VECTOR", - "SelfQueryBias@VECTOR", - "SelfKeyWeight@VECTOR", - "SelfKeyBias@VECTOR", - "SelfValueWeight@VECTOR", - "SelfValueBias@VECTOR", - "SelfOutWeight@VECTOR", - "SelfOutBias@VECTOR", - "FFNLayernormWeight@VECTOR", - "FFNLayernormBias@VECTOR", - "FFNInterWeight@VECTOR", - "FFNInterBias@VECTOR", - "FFNOutWeight@VECTOR", - "FFNOutBias@VECTOR", - "DecoderLayernormWeight", - 
"DecoderLayernormBias", - "TransWeight", - "TransBias", - "LMLayernormWeight", - "LMLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb", - "TypeEmb", - "RoleIds", - "DecRoleIds", - "RoleEmbedding", - "PositionIds", - "DecPositionIds", - ] - - inputs_var = [ - input_ids, - attn_mask, - memory_seq_lens, - type_id, - decoder_type_id, - logits_mask, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - trans_weight, - trans_bias, - lm_ln_weight, - lm_ln_bias, - linear_weight, - linear_bias, - pos_emb, - type_emb, - role_id, - decoder_role_id, - role_emb, - position_id, - decoder_position_id, - ] - - attrs_names = [ - "decoding_strategy", - "beam_size", - "topk", - "topp", - "n_head", - "size_per_head", - "num_layer", - "bos_id", - "eos_id", - "max_len", - "beam_search_diversity_rate", - "unk_id", - "mask_id", - "temperature", - "len_penalty", - "normalize_before", - "pos_bias", - "hidden_act", - "rel_len", - "early_stopping", - "min_length", - "tensor_para_size", - "layer_para_size", - "layer_para_batch_size", - ] - - attrs_val = [ - _decoding_strategy, - _beam_size, - _topk, - float(_topp), - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _diversity_rate, - _unk_id, - _mask_id, - _temperature, - _len_penalty, - _normalize_before, - _pos_bias, - _hidden_act, - _rel_len, - _early_stopping, - _min_length, - tensor_para_size, - layer_para_size, - layer_para_batch_size, - ] - - outputs_names = ["OutputIds", "ParentIds", "SequenceLength", "OutputScores"] - - outputs_dtype = ["int32", "int32", "int32", "float32"] - - return run_custom( - "fusion_unified_decoding", inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype - ) - - -def infer_miro_decoding( - input_ids, - attn_mask, - memory_seq_lens, - type_id, - decoder_type_id, - logits_mask, - word_emb, - pre_decoder_ln_weight, - pre_decoder_ln_bias, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - trans_weight, - trans_bias, - lm_ln_weight, - lm_ln_bias, - linear_weight, - linear_bias, - pos_emb, - type_emb, - role_id, - decoder_role_id, - role_emb, - position_id, - decoder_position_id, - _decoding_strategy, - _beam_size, - _topk, - _topp, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _diversity_rate, - _unk_id, - _mask_id, - _temperature, - _len_penalty, - _normalize_before, - _pos_bias, - _hidden_act, - _rel_len, - _early_stopping, - _min_length, -): - - tensor_para_size = get_ft_para_conf().tensor_para_size - layer_para_size = get_ft_para_conf().layer_para_size - layer_para_batch_size = get_ft_para_conf().layer_para_batch_size - - inputs_names = [ - "InputIds", - "AttnMask", - "MemSeqLen", - "TypeIds", - "DecTypeIds", - "LogitsMask", - "WordEmbedding", - "PreDecoderLayernormWeight", - "PreDecoderLayernormBias", - "SelfLayernormWeight@VECTOR", - "SelfLayernormBias@VECTOR", - "SelfQueryWeight@VECTOR", - "SelfQueryBias@VECTOR", - "SelfKeyWeight@VECTOR", - "SelfKeyBias@VECTOR", - "SelfValueWeight@VECTOR", - "SelfValueBias@VECTOR", - 
"SelfOutWeight@VECTOR", - "SelfOutBias@VECTOR", - "FFNLayernormWeight@VECTOR", - "FFNLayernormBias@VECTOR", - "FFNInterWeight@VECTOR", - "FFNInterBias@VECTOR", - "FFNOutWeight@VECTOR", - "FFNOutBias@VECTOR", - "DecoderLayernormWeight", - "DecoderLayernormBias", - "TransWeight", - "TransBias", - "LMLayernormWeight", - "LMLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb", - "TypeEmb", - "RoleIds", - "DecRoleIds", - "RoleEmbedding", - "PositionIds", - "DecPositionIds", - ] - - inputs_var = [ - input_ids, - attn_mask, - memory_seq_lens, - type_id, - decoder_type_id, - logits_mask, - word_emb, - pre_decoder_ln_weight, - pre_decoder_ln_bias, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - trans_weight, - trans_bias, - lm_ln_weight, - lm_ln_bias, - linear_weight, - linear_bias, - pos_emb, - type_emb, - role_id, - decoder_role_id, - role_emb, - position_id, - decoder_position_id, - ] - - attrs_names = [ - "decoding_strategy", - "beam_size", - "topk", - "topp", - "n_head", - "size_per_head", - "num_layer", - "bos_id", - "eos_id", - "max_len", - "beam_search_diversity_rate", - "unk_id", - "mask_id", - "temperature", - "len_penalty", - "normalize_before", - "pos_bias", - "hidden_act", - "rel_len", - "early_stopping", - "min_length", - "tensor_para_size", - "layer_para_size", - "layer_para_batch_size", - ] - - attrs_val = [ - _decoding_strategy, - _beam_size, - _topk, - float(_topp), - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _diversity_rate, - _unk_id, - _mask_id, - _temperature, - _len_penalty, - _normalize_before, - _pos_bias, - _hidden_act, - _rel_len, - _early_stopping, - _min_length, - tensor_para_size, - layer_para_size, - layer_para_batch_size, - ] - - outputs_names = ["OutputIds", "ParentIds", "SequenceLength", "OutputScores"] - - outputs_dtype = ["int32", "int32", "int32", "float32"] - - return run_custom("fusion_miro", inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype) - - -def infer_bart_decoding( - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - linear_weight, - linear_bias, - pos_emb, - _decoding_strategy, - _beam_size, - _topk, - _topp, - _temperature, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _min_out_len, - _diversity_rate, - _rel_len, - _alpha, - _early_stopping, -): - - inputs_names = [ - "Input", - "MemSeqLen", - "WordEmbedding", - "SelfLayernormWeight@VECTOR", - "SelfLayernormBias@VECTOR", - "SelfQueryWeight@VECTOR", - "SelfQueryBias@VECTOR", - "SelfKeyWeight@VECTOR", - "SelfKeyBias@VECTOR", - "SelfValueWeight@VECTOR", - "SelfValueBias@VECTOR", - "SelfOutWeight@VECTOR", - "SelfOutBias@VECTOR", - "CrossLayernormWeight@VECTOR", - "CrossLayernormBias@VECTOR", - "CrossQueryWeight@VECTOR", - "CrossQueryBias@VECTOR", - "CrossKeyWeight@VECTOR", - 
"CrossKeyBias@VECTOR", - "CrossValueWeight@VECTOR", - "CrossValueBias@VECTOR", - "CrossOutWeight@VECTOR", - "CrossOutBias@VECTOR", - "FFNLayernormWeight@VECTOR", - "FFNLayernormBias@VECTOR", - "FFNInterWeight@VECTOR", - "FFNInterBias@VECTOR", - "FFNOutWeight@VECTOR", - "FFNOutBias@VECTOR", - "DecoderLayernormWeight", - "DecoderLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb", - ] - - inputs_var = [ - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - linear_weight, - linear_bias, - pos_emb, - ] - - attrs_names = [ - "decoding_strategy", - "beam_size", - "topk", - "topp", - "temperature", - "n_head", - "size_per_head", - "num_layer", - "bos_id", - "eos_id", - "max_len", - "min_len", - "beam_search_diversity_rate", - "rel_len", - "alpha", - "early_stopping", - ] - - attrs_val = [ - _decoding_strategy, - _beam_size, - _topk, - _topp, - _temperature, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _min_out_len, - _diversity_rate, - _rel_len, - _alpha, - _early_stopping, - ] - - outputs_names = ["OutputIds", "ParentIds", "SequenceLength"] - - outputs_dtype = ["int32"] * len(outputs_names) - - return run_custom( - "fusion_bart_decoding", inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype - ) - - -def infer_mbart_decoding( - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - mbart_ln_weight, - mbart_ln_bias, - linear_weight, - linear_bias, - pos_emb, - trg_word, - _decoding_strategy, - _beam_size, - _topk, - _topp, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _diversity_rate, - _rel_len, - _alpha, - _temperature, - _early_stopping, - _hidden_act, -): - - inputs_names = [ - "Input", - "MemSeqLen", - "WordEmbedding", - "SelfLayernormWeight@VECTOR", - "SelfLayernormBias@VECTOR", - "SelfQueryWeight@VECTOR", - "SelfQueryBias@VECTOR", - "SelfKeyWeight@VECTOR", - "SelfKeyBias@VECTOR", - "SelfValueWeight@VECTOR", - "SelfValueBias@VECTOR", - "SelfOutWeight@VECTOR", - "SelfOutBias@VECTOR", - "CrossLayernormWeight@VECTOR", - "CrossLayernormBias@VECTOR", - "CrossQueryWeight@VECTOR", - "CrossQueryBias@VECTOR", - "CrossKeyWeight@VECTOR", - "CrossKeyBias@VECTOR", - "CrossValueWeight@VECTOR", - "CrossValueBias@VECTOR", - "CrossOutWeight@VECTOR", - "CrossOutBias@VECTOR", - "FFNLayernormWeight@VECTOR", - "FFNLayernormBias@VECTOR", - "FFNInterWeight@VECTOR", - "FFNInterBias@VECTOR", - "FFNOutWeight@VECTOR", - "FFNOutBias@VECTOR", - "DecoderLayernormWeight", - "DecoderLayernormBias", - "MBARTLayernormWeight", - "MBARTLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb", - # 
The input of custom op must be given. - # Dispensable() and Intermediate() are not supported. - "TrgWord", - ] - - inputs_var = [ - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - mbart_ln_weight, - mbart_ln_bias, - linear_weight, - linear_bias, - pos_emb, - # The input of custom op must be given. - # Dispensable() and Intermediate() are not supported. - trg_word, - ] - - attrs_names = [ - "decoding_strategy", - "beam_size", - "topk", - "topp", - "n_head", - "size_per_head", - "num_layer", - "bos_id", - "eos_id", - "temperature", - "max_len", - "beam_search_diversity_rate", - "rel_len", - "alpha", - "early_stopping", - "hidden_act", - ] - - attrs_val = [ - _decoding_strategy, - _beam_size, - _topk, - _topp, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _temperature, - _max_out_len, - _diversity_rate, - _rel_len, - _alpha, - _early_stopping, - _hidden_act, - ] - - outputs_names = ["OutputIds", "ParentIds", "SequenceLength"] - - outputs_dtype = ["int32"] * len(outputs_names) - - return run_custom( - "fusion_mbart_decoding", inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype - ) - - -def infer_gptj_decoding( - input, - attn_mask, - mem_seq_len, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_out_weight, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - linear_weight, - linear_bias, - topk, - topp, - max_out_len, - head_num, - size_per_head, - num_layer, - bos_id, - eos_id, - temperature, - rotary_embedding_dim, - repetition_penalty, - min_length, - use_fp16_decoding, -): - tensor_para_size = get_ft_para_conf().tensor_para_size - layer_para_size = get_ft_para_conf().layer_para_size - layer_para_batch_size = get_ft_para_conf().layer_para_batch_size - - inputs = { - "Input": input, - "AttentionMask": attn_mask, - "StartLength": mem_seq_len, - "WordEmbedding": word_emb, - "SelfLayernormWeight@VECTOR": slf_ln_weight, - "SelfLayernormBias@VECTOR": slf_ln_bias, - "SelfQueryWeight@VECTOR": slf_q_weight, - "SelfOutWeight@VECTOR": slf_out_weight, - "FFNInterWeight@VECTOR": ffn_inter_weight, - "FFNInterBias@VECTOR": ffn_inter_bias, - "FFNOutWeight@VECTOR": ffn_out_weight, - "FFNOutBias@VECTOR": ffn_out_bias, - "DecoderLayernormWeight": decoder_ln_weight, - "DecoderLayernormBias": decoder_ln_bias, - "EmbWeight": linear_weight, - "EmbBias": linear_bias, - } - - attrs = { - "topk": topk, - "topp": topp, - "max_len": max_out_len, - "n_head": head_num, - "size_per_head": size_per_head, - "num_layer": num_layer, - "bos_id": bos_id, - "eos_id": eos_id, - "temperature": temperature, - "rotary_embedding_dim": rotary_embedding_dim, - "repetition_penalty": repetition_penalty, - "min_length": min_length, - "use_fp16": use_fp16_decoding, - "tensor_para_size": tensor_para_size, - "layer_para_size": layer_para_size, - "layer_para_batch_size": layer_para_batch_size, - } - - outputs_names = ["OutputIds"] - outputs_dtype = ["int32"] - - return run_custom( - op_name="fusion_gptj", - inputs_names=inputs.keys(), - 
inputs_var=inputs.values(), - attrs_names=attrs.keys(), - attrs_val=attrs.values(), - outputs_names=outputs_names, - outputs_dtype=outputs_dtype, - ) - - -def infer_pegasus_decoding( - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - linear_weight, - linear_bias, - pos_emb, - _decoding_strategy, - _beam_size, - _topk, - _topp, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _max_out_len, - _min_out_len, - _diversity_rate, - _rel_len, - _alpha, - _temperature, - _early_stopping, - _hidden_act, -): - - inputs_names = [ - "Input", - "MemSeqLen", - "WordEmbedding", - "SelfLayernormWeight@VECTOR", - "SelfLayernormBias@VECTOR", - "SelfQueryWeight@VECTOR", - "SelfQueryBias@VECTOR", - "SelfKeyWeight@VECTOR", - "SelfKeyBias@VECTOR", - "SelfValueWeight@VECTOR", - "SelfValueBias@VECTOR", - "SelfOutWeight@VECTOR", - "SelfOutBias@VECTOR", - "CrossLayernormWeight@VECTOR", - "CrossLayernormBias@VECTOR", - "CrossQueryWeight@VECTOR", - "CrossQueryBias@VECTOR", - "CrossKeyWeight@VECTOR", - "CrossKeyBias@VECTOR", - "CrossValueWeight@VECTOR", - "CrossValueBias@VECTOR", - "CrossOutWeight@VECTOR", - "CrossOutBias@VECTOR", - "FFNLayernormWeight@VECTOR", - "FFNLayernormBias@VECTOR", - "FFNInterWeight@VECTOR", - "FFNInterBias@VECTOR", - "FFNOutWeight@VECTOR", - "FFNOutBias@VECTOR", - "DecoderLayernormWeight", - "DecoderLayernormBias", - "EmbWeight", - "EmbBias", - "PositionEncEmb", - # The input of custom op must be given. - # Dispensable() and Intermediate() are not supported. - ] - - inputs_var = [ - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - decoder_ln_weight, - decoder_ln_bias, - linear_weight, - linear_bias, - pos_emb, - # The input of custom op must be given. - # Dispensable() and Intermediate() are not supported. 
- ] - - attrs_names = [ - "decoding_strategy", - "beam_size", - "topk", - "topp", - "n_head", - "size_per_head", - "num_layer", - "bos_id", - "eos_id", - "temperature", - "max_len", - "min_len", - "beam_search_diversity_rate", - "rel_len", - "alpha", - "early_stopping", - "hidden_act", - "emb_scale", - ] - - attrs_val = [ - _decoding_strategy, - _beam_size, - _topk, - _topp, - _n_head, - _size_per_head, - _n_layer, - _bos_id, - _eos_id, - _temperature, - _max_out_len, - _min_out_len, - _diversity_rate, - _rel_len, - _alpha, - _early_stopping, - _hidden_act, - ] - - outputs_names = ["OutputIds", "ParentIds", "SequenceLength"] - - outputs_dtype = ["int32"] * len(outputs_names) - - return run_custom( - "fusion_pegasus_decoding", inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype - ) - - -def infer_t5_decoding( - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight_0, - ffn_inter_bias_0, - ffn_inter_weight_1, - ffn_inter_bias_1, - ffn_out_weight, - ffn_out_bias, - relative_attention_bias_weight, - decoder_ln_weight, - decoder_ln_bias, - linear_weight, - linear_bias, - decoding_strategy, - beam_size, - top_k, - top_p, - head_num, - size_per_head, - num_decoder_layers, - start_id, - end_id, - max_out_len, - diversity_rate, - rel_len, - alpha, - temperature, - early_stopping, - max_distance, - relative_attention_num_buckets, - tie_word_embeddings, - act, -): - - inputs_names = [ - "Input", - "MemSeqLen", - "WordEmbedding", - "SelfLayernormWeight@VECTOR", - "SelfLayernormBias@VECTOR", - "SelfQueryWeight@VECTOR", - "SelfQueryBias@VECTOR", - "SelfKeyWeight@VECTOR", - "SelfKeyBias@VECTOR", - "SelfValueWeight@VECTOR", - "SelfValueBias@VECTOR", - "SelfOutWeight@VECTOR", - "SelfOutBias@VECTOR", - "CrossLayernormWeight@VECTOR", - "CrossLayernormBias@VECTOR", - "CrossQueryWeight@VECTOR", - "CrossQueryBias@VECTOR", - "CrossKeyWeight@VECTOR", - "CrossKeyBias@VECTOR", - "CrossValueWeight@VECTOR", - "CrossValueBias@VECTOR", - "CrossOutWeight@VECTOR", - "CrossOutBias@VECTOR", - "FFNLayernormWeight@VECTOR", - "FFNLayernormBias@VECTOR", - "FFNInterWeight0@VECTOR", - "FFNInterBias0@VECTOR", - "FFNInterWeight1@VECTOR", - "FFNInterBias1@VECTOR", - "FFNOutWeight@VECTOR", - "FFNOutBias@VECTOR", - "SelfRelativeAttentionBiasWeight", - "DecoderLayernormWeight", - "DecoderLayernormBias", - "EmbWeight", - "EmbBias", - ] - - inputs_var = [ - enc_output, - memory_seq_lens, - word_emb, - slf_ln_weight, - slf_ln_bias, - slf_q_weight, - slf_q_bias, - slf_k_weight, - slf_k_bias, - slf_v_weight, - slf_v_bias, - slf_out_weight, - slf_out_bias, - cross_ln_weight, - cross_ln_bias, - cross_q_weight, - cross_q_bias, - cross_k_weight, - cross_k_bias, - cross_v_weight, - cross_v_bias, - cross_out_weight, - cross_out_bias, - ffn_ln_weight, - ffn_ln_bias, - ffn_inter_weight_0, - ffn_inter_bias_0, - ffn_inter_weight_1, - ffn_inter_bias_1, - ffn_out_weight, - ffn_out_bias, - relative_attention_bias_weight, - decoder_ln_weight, - decoder_ln_bias, - linear_weight, - linear_bias, - ] - - attrs_names = [ - "decoding_strategy", - "beam_size", - "topk", - "topp", - "n_head", - "size_per_head", - "num_layer", - "bos_id", - "eos_id", - "max_len", - 
"beam_search_diversity_rate", - "rel_len", - "alpha", - "temperature", - "early_stopping", - "max_distance", - "num_buckets", - "tie_word_embeddings", - "act", - ] - - attrs_val = [ - decoding_strategy, - beam_size, - top_k, - top_p, - head_num, - size_per_head, - num_decoder_layers, - start_id, - end_id, - max_out_len, - diversity_rate, - rel_len, - alpha, - temperature, - early_stopping, - max_distance, - relative_attention_num_buckets, - tie_word_embeddings, - act, - ] - - outputs_names = ["OutputIds", "ParentIds", "SequenceLength"] - - outputs_dtype = ["int32"] * len(outputs_names) - - return run_custom( - "fusion_t5_decoding", inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype - ) - - -def finalize( - beam_size, - output_ids, - parent_ids, - out_seq_lens, - forced_eos_token_id=None, - max_seq_len=None, - decoding_strategy="beam_search", -): - if max_seq_len is None: - max_seq_len = paddle.max(out_seq_lens) - ids = paddle.slice(output_ids, [0], [0], [max_seq_len]) - if decoding_strategy.startswith("beam_search"): - parent_ids = paddle.slice(parent_ids, [0], [0], [max_seq_len]) % ( - beam_size * 2 if decoding_strategy.endswith("_v2") or decoding_strategy.endswith("_v3") else beam_size - ) - ids = paddle.nn.functional.gather_tree(ids, parent_ids) - if forced_eos_token_id is not None: - ids[-1, :, :] = forced_eos_token_id - else: - if forced_eos_token_id is not None: - ids[-1, :] = forced_eos_token_id - return ids - - -def transfer_param(p, is_bias=False, dtype="float16", restore_data=False): - param_shape = p.shape - # Allow CPU/GPU and float16/float32 transfer - # NOTE: str(p.place) differs between paddle develop and 2.2 - if str(p.dtype)[-len(dtype) :] == dtype and ("gpu" in str(p.place).lower() or "cuda" in str(p.place).lower()): - return p - if restore_data: - if paddle.in_dynamic_mode(): - param_data = p.numpy() - # Creating parameters with Assign initializer is too slow. Maybe we - # can cast to fp16 directly and get a tensor, while we do it more - # elaborately to get a ParamBase. Also note `VarBase.set_value` - # enforce the same dtype and can not be used directly. 
- new_p = type(p)(shape=param_shape, dtype=dtype, is_bias=is_bias) - new_p.value().get_tensor().set(param_data.astype(dtype), paddle.framework._current_expected_place()) - return new_p - else: - param_data = np.array(paddle.static.global_scope().find_var(p.name).get_tensor()) - return paddle.create_parameter( - shape=param_shape, - dtype=dtype, - is_bias=is_bias, - default_initializer=paddle.nn.initializer.Assign(param_data) if restore_data else None, - ) - - -def _convert_qkv(q_proj, k_proj, v_proj, attr="weight", use_numpy=True, del_param=False, dummy_tensor=None): - ft_para_conf = get_ft_para_conf() - # TODO(guosheng): maybe static graph need this - # p = fast_model.create_parameter( - # shape=[q.shape[0], q.shape[1] + k.shape[1] + v.shape[1]], - # dtype=q.dtype, - # is_bias=is_bias) - q = getattr(q_proj, attr) - k = getattr(k_proj, attr) - v = getattr(v_proj, attr) - if use_numpy: - q = q.numpy() - if del_param: - if attr == "weight": - del q_proj.weight - else: - del q_proj.bias - k = k.numpy() - if del_param: - if attr == "weight": - del k_proj.weight - else: - del k_proj.bias - v = v.numpy() - if del_param: - if attr == "weight": - del v_proj.weight - else: - del v_proj.bias - else: - if del_param: - for i in [q_proj, k_proj, v_proj]: - if attr == "weight": - del i.weight - else: - del i.bias - q = ft_para_conf.slice_weight(q, 1) - k = ft_para_conf.slice_weight(k, 1) - v = ft_para_conf.slice_weight(v, 1) - if del_param: - # NOTE: dygraph_to_static/convert_call_func.py would log the converted - # function. For linear layer, if we delete the params, log would fail. - # And the log requires weight to be a 2D tensor. - # NOTE: Assignment to parameter 'weight' should be of type - # Parameter or None, thus delete before in case of tensor. - setattr(q_proj, attr, dummy_tensor) - setattr(k_proj, attr, dummy_tensor) - setattr(v_proj, attr, dummy_tensor) - if use_numpy: - p = paddle.to_tensor(np.concatenate([q, k, v], axis=-1)) - else: - p = paddle.concat([q, k, v], axis=-1) - return p - - -def convert_params(fast_model, model, fuse_qkv=1, use_fp16=False, restore_data=False): - r""" - Convert parameters included in Transformer layer (`nn.TransformerEncoder` - and `gpt.modeling.TransformerDecoder`) from original models to the format - of faster models. - - Args: - fast_model (Layer): The faster model object. - model (Layer): The Transformer layer. It can be an instance of - `nn.TransformerEncoder` or `gpt.modeling.TransformerDecoder` - currently, and `nn.TransformerDecoder` would be supported soon. - fuse_qkv (int): 0 for nofuse, 1 for fuse, 2 for fuse and delete the - unfused parameters. If environment variable `PPFG_QKV_MEM_OPT` is - set and the weights of q/k/v is fused, it will try to delete the - original unfused weights. Note the rollback to original model would - not be guarantee anymore when the faster model failed if the original - weights are deleted. Default to 1. - use_fp16 (bool): Whether to use float16. Maybe we should use the default - dtype as the highest priority later. Default to `False`. - restore_data (bool): If `False`, need to reload the weight values. It - should be `True` for weight loaded models. Default to `False`. - - Returns: - defaultdict: Each value is a list including converted parameters in all - layers. For other parameters not included in Transformer module to - be converted, such as embeddings, you can achieve it by using the - returned dict `params` though `params['word_emb'].append()` directly - which would do CPU/GPU and fp32/fp16 transfer automatically. 
- """ - if fuse_qkv == 1: - fuse_qkv = 2 if os.getenv("PPFG_QKV_MEM_OPT", "0") == "1" else 1 - ft_para_conf = get_ft_para_conf() - - class _list(list): - def append(self, item): - def attr_handle_func(x): - return x - - if isinstance(item[0], nn.Layer): - # Axis is used for tensor slice in tensor parallel. - # Use None to make no slice on the tensor. - if len(item) == 2: - layer, attr = item - axis = None - else: - layer, attr, axis = item - param = getattr(layer, attr) - if axis is not None and isinstance(layer, nn.Linear): - param = ft_para_conf.slice_weight(param, axis) - param = transfer_param( - param, - is_bias=attr.endswith("bias"), - dtype="float16" if use_fp16 else "float32", - restore_data=restore_data, - ) - # NOTE: Assignment to parameter 'weight' should be of type - # Parameter or None, thus delete first in case of param is - # a tensor. - # TODO(guosheng): Make slice_weight use `output_param=True` - # and remove delattr. Currently, if `param` is Tensor rather - # than Parameter, it would not be in state_dict. - delattr(layer, attr) - setattr(layer, attr, param) - else: - # NOTE: Compared with if branch, there is no layer attribute - # refered to the transfered param, thus we should set it as - # the layer attribute to be able to convert to static graph. - # Additionally, we suppose no need to process tensor parallel - # here since the param passed in might have been processed. - if len(item) == 2: - param, is_bias = item - attr_handle = attr_handle_func - else: - param, is_bias, attr_handle = item - param = transfer_param( - param, is_bias=is_bias, dtype="float16" if use_fp16 else "float32", restore_data=restore_data - ) - attr_handle(param) - return super().append(param) - - params = defaultdict(_list) - - def _convert(module): - if isinstance( - module, - ( - nn.TransformerEncoder, - nn.TransformerDecoder, - paddlenlp.transformers.gpt.modeling.TransformerDecoder, - paddlenlp.transformers.opt.modeling.TransformerDecoder, - ), - ): - num_layer = len(module.layers) - for i, layer in enumerate(module.layers): - if not ft_para_conf.is_load(i, num_layer): - continue - # fuse_qkv: 0 for nofuse, 1 for fuse, - # 2 for fuse and delete the unfused - if fuse_qkv == 0: - params["slf_q_weight"].append((layer.self_attn.q_proj, "weight", 1)) - params["slf_q_bias"].append((layer.self_attn.q_proj, "bias", 1)) - params["slf_k_weight"].append((layer.self_attn.k_proj, "weight", 1)) - params["slf_k_bias"].append((layer.self_attn.k_proj, "bias", 1)) - params["slf_v_weight"].append((layer.self_attn.v_proj, "weight", 1)) - params["slf_v_bias"].append((layer.self_attn.v_proj, "bias", 1)) - - else: - # TODO(guosheng): Tensor with size 0 might be failed in - # paddle develop, thus use tensor with size 1 instead - # temporarily. Besides, we use 2D tensor since jit log - # requires that on linear weight. While size 0 seems all - # right in jit.to_static/jit.save. - dummy_tensor = paddle.zeros([1, 1]) - w = _convert_qkv( - layer.self_attn.q_proj, - layer.self_attn.k_proj, - layer.self_attn.v_proj, - attr="weight", - use_numpy=fuse_qkv == 2, - del_param=fuse_qkv == 2, - dummy_tensor=dummy_tensor, - ) - b = _convert_qkv( - layer.self_attn.q_proj, - layer.self_attn.k_proj, - layer.self_attn.v_proj, - attr="bias", - use_numpy=fuse_qkv == 2, - del_param=fuse_qkv == 2, - dummy_tensor=dummy_tensor, - ) - params["slf_q_weight"].append((w, False)) - params["slf_q_bias"].append((b, True)) - # NOTE: Use `params["slf_q_weight"][-1]` rather than `w`, - # since the appended tensor might be a new transfered tensor. 
- # Besides, to allow convert_params be called more than once, - # we find a attr name not existing to avoid overwriting the - # existing attr. - attr = "slf_q_weight_" + str(i) - while hasattr(fast_model, attr): - attr += "_" - setattr(fast_model, attr, params["slf_q_weight"][-1]) - attr = "slf_q_bias_" + str(i) - while hasattr(fast_model, attr): - attr += "_" - setattr(fast_model, attr, params["slf_q_bias"][-1]) - for key in [f"slf_{m}_{n}" for m in ("k", "v") for n in ("weight", "bias")]: - params[key].append((dummy_tensor, True if key.endswith("bias") else False)) - attr = key + "_" + str(i) - while hasattr(fast_model, attr): - attr += "_" - setattr(fast_model, attr, params[key][-1]) - if hasattr(layer, "cross_attn"): - # nn.TransformerDecoder - params["cross_q_weight"].append((layer.cross_attn.q_proj, "weight", 1)) - params["cross_q_bias"].append((layer.cross_attn.q_proj, "bias", 1)) - params["cross_k_weight"].append((layer.cross_attn.k_proj, "weight", 1)) - params["cross_k_bias"].append((layer.cross_attn.k_proj, "bias", 1)) - params["cross_v_weight"].append((layer.cross_attn.v_proj, "weight", 1)) - params["cross_v_bias"].append((layer.cross_attn.v_proj, "bias", 1)) - params["cross_out_weight"].append((layer.cross_attn.out_proj, "weight", 0)) - params["cross_out_bias"].append((layer.cross_attn.out_proj, "bias", 0)) - - params["slf_out_weight"].append((layer.self_attn.out_proj, "weight", 0)) - params["slf_out_bias"].append((layer.self_attn.out_proj, "bias")) - params["slf_ln_weight"].append((layer.norm1, "weight")) - params["slf_ln_bias"].append((layer.norm1, "bias")) - # Slice tensor when append according to axis(1 or 0) if parallel - # is enable. - params["ffn_inter_weight"].append((layer.linear1, "weight", 1)) - params["ffn_inter_bias"].append((layer.linear1, "bias", 1)) - params["ffn_out_weight"].append((layer.linear2, "weight", 0)) - params["ffn_out_bias"].append((layer.linear2, "bias")) - if hasattr(layer, "norm3"): - # nn.TransformerDecoder - params["cross_ln_weight"].append((layer.norm2, "weight")) - params["cross_ln_bias"].append((layer.norm2, "bias")) - params["ffn_ln_weight"].append((layer.norm3, "weight")) - params["ffn_ln_bias"].append((layer.norm3, "bias")) - else: - params["ffn_ln_weight"].append((layer.norm2, "weight")) - params["ffn_ln_bias"].append((layer.norm2, "bias")) - - if getattr(module, "norm", None) is not None: - params["decoder_ln_weight"].append((module.norm, "weight")) - params["decoder_ln_bias"].append((module.norm, "bias")) - elif isinstance(module, (paddlenlp.transformers.t5.modeling.T5Stack)) and module.is_decoder: - num_layer = len(module.block) - for i, block in enumerate(module.block): - if not ft_para_conf.is_load(i, num_layer): - continue - # fuse_qkv: 0 for nofuse, 1 for fuse, - # 2 for fuse and delete the unfused - if fuse_qkv == 0: - params["slf_q_weight"].append((block.layer[0].SelfAttention.q, "weight", 1)) - if getattr(block.layer[0].SelfAttention.q, "bias", None) is not None: - params["slf_q_bias"].append((block.layer[0].SelfAttention.q, "bias", 1)) - - params["slf_k_weight"].append((block.layer[0].SelfAttention.k, "weight", 1)) - if getattr(block.layer[0].SelfAttention.k, "bias", None) is not None: - params["slf_k_bias"].append((block.layer[0].SelfAttention.k, "bias", 1)) - - params["slf_v_weight"].append((block.layer[0].SelfAttention.v, "weight", 1)) - if getattr(block.layer[0].SelfAttention.v, "bias", None) is not None: - params["slf_k_bias"].append((block.layer[0].SelfAttention.v, "bias", 1)) - - else: - dummy_tensor = paddle.zeros([1, 
1]) - w = _convert_qkv( - block.layer[0].SelfAttention.q, - block.layer[0].SelfAttention.k, - block.layer[0].SelfAttention.v, - attr="weight", - use_numpy=(fuse_qkv == 2), - del_param=(fuse_qkv == 2), - dummy_tensor=dummy_tensor, - ) - params["slf_q_weight"].append((w, False)) - - if ( - getattr(block.layer[0].SelfAttention.q, "bias", None) is not None - and getattr(block.layer[0].SelfAttention.k, "bias", None) is not None - and getattr(block.layer[0].SelfAttention.v, "bias", None) is not None - ): - b = _convert_qkv( - block.layer[0].SelfAttention.q, - block.layer[0].SelfAttention.k, - block.layer[0].SelfAttention.v, - attr="bias", - use_numpy=(fuse_qkv == 2), - del_param=(fuse_qkv == 2), - dummy_tensor=dummy_tensor, - ) - params["slf_q_bias"].append((b, True)) - - # NOTE: Use `params["slf_q_weight"][-1]` rather than `w`, - # since the appended tensor might be a new transfered tensor. - # Besides, to allow convert_params be called more than once, - # we find a attr name not existing to avoid overwriting the - # existing attr. - attr = "slf_q_weight_" + str(i) - while hasattr(fast_model, attr): - attr += "_" - setattr(fast_model, attr, params["slf_q_weight"][-1]) - - param_type = "weight" - if "slf_q_bias" in params.keys(): - attr = "slf_q_bias_" + str(i) - while hasattr(fast_model, attr): - attr += "_" - setattr(fast_model, attr, params["slf_q_bias"][-1]) - param_type.append("bias") - - for key in [f"slf_{m}_{n}" for m in ("k", "v") for n in param_type]: - params[key].append((dummy_tensor, True if key.endswith("bias") else False)) - attr = key + "_" + str(i) - while hasattr(fast_model, attr): - attr += "_" - setattr(fast_model, attr, params[key][-1]) - - ffn_index = 1 - if len(block.layer) == 3: - ffn_index = 2 - - params["cross_q_weight"].append((block.layer[1].EncDecAttention.q, "weight", 1)) - if getattr(block.layer[1].EncDecAttention.q, "bias", None) is not None: - params["cross_q_bias"].append((block.layer[1].EncDecAttention.q, "bias", 1)) - - params["cross_k_weight"].append((block.layer[1].EncDecAttention.k, "weight", 1)) - if getattr(block.layer[1].EncDecAttention.k, "bias", None) is not None: - params["cross_k_bias"].append((block.layer[1].EncDecAttention.k, "bias", 1)) - - params["cross_v_weight"].append((block.layer[1].EncDecAttention.v, "weight", 1)) - if getattr(block.layer[1].EncDecAttention.v, "bias", None) is not None: - params["cross_v_bias"].append((block.layer[1].EncDecAttention.v, "bias", 1)) - - params["cross_out_weight"].append((block.layer[1].EncDecAttention.o, "weight", 0)) - if getattr(block.layer[1].EncDecAttention.o, "bias", None) is not None: - params["cross_out_bias"].append((block.layer[1].EncDecAttention.o, "bias", 0)) - - params["cross_ln_weight"].append((block.layer[1].layer_norm, "weight", 0)) - if getattr(block.layer[1].layer_norm, "bias", None) is not None: - params["cross_ln_bias"].append((block.layer[1].layer_norm, "bias", 0)) - - if hasattr(block.layer[ffn_index], "DenseReluDense"): - if isinstance(block.layer[ffn_index].DenseReluDense, (T5DenseReluDense)): - params["ffn_inter_weight_0"].append((block.layer[ffn_index].DenseReluDense.wi, "weight", 1)) - if getattr(block.layer[ffn_index].DenseReluDense.wi, "bias", None) is not None: - params["ffn_inter_bias_0"].append((block.layer[ffn_index].DenseReluDense.wi, "bias", 1)) - - params["ffn_out_weight"].append((block.layer[ffn_index].DenseReluDense.wo, "weight", 0)) - if getattr(block.layer[ffn_index].DenseReluDense.wo, "bias", None) is not None: - 
params["ffn_out_bias"].append((block.layer[ffn_index].DenseReluDense.wo, "bias")) - elif isinstance(block.layer[ffn_index].DenseReluDense, (T5DenseGatedGeluDense)): - params["ffn_inter_weight_0"].append((block.layer[ffn_index].DenseReluDense.wi_0, "weight", 1)) - if getattr(block.layer[ffn_index].DenseReluDense.wi_0, "bias", None) is not None: - params["ffn_inter_bias_0"].append((block.layer[ffn_index].DenseReluDense.wi_0, "bias", 1)) - - params["ffn_inter_weight_1"].append((block.layer[ffn_index].DenseReluDense.wi_1, "weight", 1)) - if getattr(block.layer[ffn_index].DenseReluDense.wi_1, "bias", None) is not None: - params["ffn_inter_bias_1"].append((block.layer[ffn_index].DenseReluDense.wi_1, "bias", 1)) - - params["ffn_out_weight"].append((block.layer[ffn_index].DenseReluDense.wo, "weight", 0)) - if getattr(block.layer[ffn_index].DenseReluDense.wo, "bias", None) is not None: - params["ffn_out_bias"].append((block.layer[ffn_index].DenseReluDense.wo, "bias")) - else: - raise NotImplementedError("Faster only support T5DenseReluDense and T5DenseGatedGeluDense. ") - - params["ffn_ln_weight"].append((block.layer[ffn_index].layer_norm, "weight")) - if getattr(block.layer[ffn_index].layer_norm, "bias", None) is not None: - params["ffn_ln_bias"].append((block.layer[ffn_index].layer_norm, "bias")) - - params["slf_out_weight"].append((block.layer[0].SelfAttention.o, "weight", 0)) - if getattr(block.layer[0].SelfAttention.o, "bias", None) is not None: - params["slf_out_bias"].append((block.layer[0].SelfAttention.o, "bias")) - - params["slf_ln_weight"].append((block.layer[0].layer_norm, "weight")) - if getattr(block.layer[0].layer_norm, "bias", None) is not None: - params["slf_ln_bias"].append((block.layer[0].layer_norm, "bias")) - - if getattr(module, "norm", None) is not None: - params["decoder_ln_weight"].append((module.final_layer_norm, "weight")) - if getattr(module.final_layer_norm, "bias", None) is not None: - params["decoder_ln_bias"].append((module.final_layer_norm, "bias")) - - model.apply(_convert) - return params - - -class InferBase(nn.Layer): - def __init__(self, use_fp16_decoding): - super(InferBase, self).__init__() - self._use_fp16_decoding = use_fp16_decoding - - def default_bias(self, weight, index, is_null=False): - if is_null: - size = 1 - elif isinstance(weight, (list, tuple)): - size = weight[0].shape[index] - else: - size = weight.shape[index] - - if not hasattr(self, "default_bias_" + str(size)): - setattr( - self, - "default_bias_" + str(size), - paddle.zeros(shape=[size], dtype="float16" if self._use_fp16_decoding else "float32"), - ) - - if isinstance(weight, (list, tuple)): - return [getattr(self, "default_bias_" + str(size))] * len(weight) - else: - return [getattr(self, "default_bias_" + str(size))] - - -class InferTransformerDecoding(nn.Layer): - def __init__( - self, - decoder, - word_embedding, - positional_embedding, - linear, - num_decoder_layers, - n_head, - d_model, - bos_id=0, - eos_id=1, - decoding_strategy="beam_search", - beam_size=4, - topk=1, - topp=0.0, - max_out_len=256, - diversity_rate=0.0, - decoding_lib=None, - use_fp16_decoding=False, - rel_len=False, - alpha=0.6, - ): - # if decoding_lib is None: - # raise ValueError( - # "The args decoding_lib must be set to use FastGeneration. 
") - # elif not os.path.exists(decoding_lib): - # raise ValueError("The path to decoding lib is not exist.") - if decoding_lib is not None and os.path.isfile(decoding_lib): - # Maybe it has been loadad by `ext_utils.load` - if "FastGeneration" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(decoding_lib) - LOADED_EXT["FastGeneration"] = ops - else: - if decoding_lib is not None: - logger.warning("The specified decoding_lib does not exist, and it will be built automatically.") - load("FastGeneration", verbose=True) - - size_per_head = d_model / n_head - # fuse_qkv can only support size_per_head of [32, 64, 128]. - if size_per_head in [32, 64, 128]: - self._fuse_qkv = True - else: - self._fuse_qkv = False - - super(InferTransformerDecoding, self).__init__() - for arg, value in locals().items(): - if arg not in ["self", "decoder", "word_embedding", "positional_embedding", "linear"]: - setattr(self, "_" + arg, value) - # process weights - if use_fp16_decoding: - for mod in decoder.layers: - mod.norm1.weight = transfer_param(mod.norm1.weight) - mod.norm1.bias = transfer_param(mod.norm1.bias, is_bias=True) - mod.self_attn.q_proj.weight = transfer_param(mod.self_attn.q_proj.weight) - mod.self_attn.q_proj.bias = transfer_param(mod.self_attn.q_proj.bias, is_bias=True) - mod.self_attn.k_proj.weight = transfer_param(mod.self_attn.k_proj.weight) - mod.self_attn.k_proj.bias = transfer_param(mod.self_attn.k_proj.bias, is_bias=True) - mod.self_attn.v_proj.weight = transfer_param(mod.self_attn.v_proj.weight) - mod.self_attn.v_proj.bias = transfer_param(mod.self_attn.v_proj.bias, is_bias=True) - mod.self_attn.out_proj.weight = transfer_param(mod.self_attn.out_proj.weight) - mod.self_attn.out_proj.bias = transfer_param(mod.self_attn.out_proj.bias, is_bias=True) - - mod.norm2.weight = transfer_param(mod.norm2.weight) - mod.norm2.bias = transfer_param(mod.norm2.bias, is_bias=True) - mod.cross_attn.q_proj.weight = transfer_param(mod.cross_attn.q_proj.weight) - mod.cross_attn.q_proj.bias = transfer_param(mod.cross_attn.q_proj.bias, is_bias=True) - mod.cross_attn.k_proj.weight = transfer_param(mod.cross_attn.k_proj.weight) - mod.cross_attn.k_proj.bias = transfer_param(mod.cross_attn.k_proj.bias, is_bias=True) - mod.cross_attn.v_proj.weight = transfer_param(mod.cross_attn.v_proj.weight) - mod.cross_attn.v_proj.bias = transfer_param(mod.cross_attn.v_proj.bias, is_bias=True) - mod.cross_attn.out_proj.weight = transfer_param(mod.cross_attn.out_proj.weight) - mod.cross_attn.out_proj.bias = transfer_param(mod.cross_attn.out_proj.bias, is_bias=True) - - mod.norm3.weight = transfer_param(mod.norm3.weight) - mod.norm3.bias = transfer_param(mod.norm3.bias, is_bias=True) - mod.linear1.weight = transfer_param(mod.linear1.weight) - mod.linear1.bias = transfer_param(mod.linear1.bias, is_bias=True) - mod.linear2.weight = transfer_param(mod.linear2.weight) - mod.linear2.bias = transfer_param(mod.linear2.bias, is_bias=True) - - decoder.norm.weight = transfer_param(decoder.norm.weight) - decoder.norm.bias = transfer_param(decoder.norm.bias, is_bias=True) - - linear.weight = transfer_param(linear.weight) - linear.bias = transfer_param(linear.bias, is_bias=True) - - positional_embedding.weight = transfer_param(positional_embedding.weight) - word_embedding.weight = transfer_param(word_embedding.weight) - - self.slf_ln_weight = [] - self.slf_ln_bias = [] - self.slf_q_weight = [] - self.slf_q_bias = [] - self.slf_k_weight = [] - self.slf_k_bias = [] - self.slf_v_weight = [] - 
self.slf_v_bias = [] - self.slf_out_weight = [] - self.slf_out_bias = [] - - self.cross_ln_weight = [] - self.cross_ln_bias = [] - self.cross_q_weight = [] - self.cross_q_bias = [] - self.cross_k_weight = [] - self.cross_k_bias = [] - self.cross_v_weight = [] - self.cross_v_bias = [] - self.cross_out_weight = [] - self.cross_out_bias = [] - - self.ffn_ln_weight = [] - self.ffn_ln_bias = [] - self.ffn_inter_weight = [] - self.ffn_inter_bias = [] - self.ffn_out_weight = [] - self.ffn_out_bias = [] - - for i, mod in enumerate(decoder.layers): - self.slf_ln_weight.append(mod.norm1.weight) - self.slf_ln_bias.append(mod.norm1.bias) - - if self._fuse_qkv: - q_weight_shape = mod.self_attn.q_proj.weight.shape - k_weight_shape = mod.self_attn.k_proj.weight.shape - v_weight_shape = mod.self_attn.v_proj.weight.shape - - q_weights = self.create_parameter( - shape=[q_weight_shape[0], q_weight_shape[1] + k_weight_shape[1] + v_weight_shape[1]], - dtype="float16" if use_fp16_decoding else "float32", - ) - setattr(self, "slf_q_weight_" + str(i), q_weights) - self.slf_q_weight.append(getattr(self, "slf_q_weight_" + str(i))) - - q_bias_shape = mod.self_attn.q_proj.bias.shape - k_bias_shape = mod.self_attn.k_proj.bias.shape - v_bias_shape = mod.self_attn.v_proj.bias.shape - - q_biases = self.create_parameter( - shape=[q_bias_shape[0] + k_bias_shape[0] + v_bias_shape[0]], - dtype="float16" if use_fp16_decoding else "float32", - is_bias=True, - ) - setattr(self, "slf_q_bias_" + str(i), q_biases) - self.slf_q_bias.append(getattr(self, "slf_q_bias_" + str(i))) - else: - self.slf_q_weight.append(mod.self_attn.q_proj.weight) - self.slf_q_bias.append(mod.self_attn.q_proj.bias) - - self.slf_k_weight.append(mod.self_attn.k_proj.weight) - self.slf_k_bias.append(mod.self_attn.k_proj.bias) - self.slf_v_weight.append(mod.self_attn.v_proj.weight) - self.slf_v_bias.append(mod.self_attn.v_proj.bias) - self.slf_out_weight.append(mod.self_attn.out_proj.weight) - self.slf_out_bias.append(mod.self_attn.out_proj.bias) - - self.cross_ln_weight.append(mod.norm2.weight) - self.cross_ln_bias.append(mod.norm2.bias) - self.cross_q_weight.append(mod.cross_attn.q_proj.weight) - self.cross_q_bias.append(mod.cross_attn.q_proj.bias) - self.cross_k_weight.append(mod.cross_attn.k_proj.weight) - self.cross_k_bias.append(mod.cross_attn.k_proj.bias) - self.cross_v_weight.append(mod.cross_attn.v_proj.weight) - self.cross_v_bias.append(mod.cross_attn.v_proj.bias) - self.cross_out_weight.append(mod.cross_attn.out_proj.weight) - self.cross_out_bias.append(mod.cross_attn.out_proj.bias) - - self.ffn_ln_weight.append(mod.norm3.weight) - self.ffn_ln_bias.append(mod.norm3.bias) - self.ffn_inter_weight.append(mod.linear1.weight) - self.ffn_inter_bias.append(mod.linear1.bias) - self.ffn_out_weight.append(mod.linear2.weight) - self.ffn_out_bias.append(mod.linear2.bias) - - self.decoder_ln_weight = [decoder.norm.weight] - self.decoder_ln_bias = [decoder.norm.bias] - - self.pos_emb = [positional_embedding.weight] - self.word_emb = [word_embedding.weight] - - self.linear_weight = [linear.weight] - self.linear_bias = [linear.bias] - - def forward(self, enc_output, memory_seq_lens, trg_word=None): - def parse_function(func_name): - return partial( - func_name, - word_emb=self.word_emb, - slf_ln_weight=self.slf_ln_weight, - slf_ln_bias=self.slf_ln_bias, - slf_q_weight=self.slf_q_weight, - slf_q_bias=self.slf_q_bias, - slf_k_weight=self.slf_k_weight, - slf_k_bias=self.slf_k_bias, - slf_v_weight=self.slf_v_weight, - slf_v_bias=self.slf_v_bias, - 
slf_out_weight=self.slf_out_weight, - slf_out_bias=self.slf_out_bias, - cross_ln_weight=self.cross_ln_weight, - cross_ln_bias=self.cross_ln_bias, - cross_q_weight=self.cross_q_weight, - cross_q_bias=self.cross_q_bias, - cross_k_weight=self.cross_k_weight, - cross_k_bias=self.cross_k_bias, - cross_v_weight=self.cross_v_weight, - cross_v_bias=self.cross_v_bias, - cross_out_weight=self.cross_out_weight, - cross_out_bias=self.cross_out_bias, - ffn_ln_weight=self.ffn_ln_weight, - ffn_ln_bias=self.ffn_ln_bias, - ffn_inter_weight=self.ffn_inter_weight, - ffn_inter_bias=self.ffn_inter_bias, - ffn_out_weight=self.ffn_out_weight, - ffn_out_bias=self.ffn_out_bias, - decoder_ln_weight=self.decoder_ln_weight, - decoder_ln_bias=self.decoder_ln_bias, - linear_weight=self.linear_weight, - linear_bias=self.linear_bias, - pos_emb=self.pos_emb, - _decoding_strategy=self._decoding_strategy, - _beam_size=self._beam_size, - _topk=self._topk, - _topp=self._topp, - _n_head=self._n_head, - _size_per_head=int(self._d_model / self._n_head), - _n_layer=self._num_decoder_layers, - _bos_id=self._bos_id, - _eos_id=self._eos_id, - _max_out_len=self._max_out_len, - _diversity_rate=self._diversity_rate, - _rel_len=self._rel_len, - _alpha=self._alpha, - ) - - if self._decoding_strategy.startswith("beam_search"): - # TODO: Due to paddle.tile bug in static graph, tile_beam_merge_with_batch - # cannot work properly. These comments should be opened after PaddlePaddle v2.2.2. - if paddle.__version__ <= "2.1.3": - enc_output = nn.decode.BeamSearchDecoder.tile_beam_merge_with_batch(enc_output, self._beam_size) - memory_seq_lens = nn.decode.BeamSearchDecoder.tile_beam_merge_with_batch( - memory_seq_lens, self._beam_size - ) - else: - enc_output_shape = enc_output.shape - batch_size = enc_output_shape[0] - max_seq_len = enc_output_shape[1] - enc_output = enc_output.unsqueeze([1]) - memory_seq_lens = memory_seq_lens.unsqueeze([1]) - enc_output = paddle.expand( - enc_output, shape=[batch_size, self._beam_size, max_seq_len, self._d_model] - ).reshape([batch_size * self._beam_size, max_seq_len, self._d_model]) - memory_seq_lens = paddle.expand(memory_seq_lens, shape=[batch_size, self._beam_size]).reshape( - [batch_size * self._beam_size] - ) - - if trg_word is None: - output_ids, parent_ids, sequence_length = parse_function(infer_transformer_decoding)( - enc_output=[enc_output], memory_seq_lens=[memory_seq_lens] - ) - else: - output_ids, parent_ids, sequence_length = parse_function(infer_force_decoding)( - enc_output=[enc_output], memory_seq_lens=[memory_seq_lens], trg_word=[trg_word] - ) - - ids = finalize( - self._beam_size, output_ids, parent_ids, sequence_length, decoding_strategy=self._decoding_strategy - ) - - return ids - - -# Patch for parallel inference to save memory -class FTParaConf(object): - r""" - Configurations for model parallel in FastGeneration. Currently only - support GPT. Please refer to `Megatron `__ - for details. - - Args: - tensor_para_size (int, optional): The size for tensor parallel. If it is - 1, tensor parallel would not be used. Default to 1. - layer_para_size (int, optional): The size for layer parallel. If it is - 1, layer parallel would not be used. Default to 1. - layer_para_batch_size (int, optional): The local batch size for pipeline - parallel. It is suggested to use `batch_size // layer_para_size`. - Default to 1. 
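The settings described above imply a fixed mapping from a process's flat rank to its tensor-parallel and layer-parallel group, matching the arithmetic in `__init__` below. A minimal standalone sketch of that mapping (plain Python, illustrative values only):

def para_ranks(rank, tensor_para_size, layer_para_size):
    # world_size must equal tensor_para_size * layer_para_size (asserted in __init__ below).
    tensor_para_rank = rank % tensor_para_size   # position inside the tensor-parallel group
    layer_para_rank = rank // tensor_para_size   # which layer-parallel (pipeline) group this rank belongs to
    return tensor_para_rank, layer_para_rank

# e.g. tensor_para_size=2, layer_para_size=2:
#   rank 0 -> (0, 0), rank 1 -> (1, 0), rank 2 -> (0, 1), rank 3 -> (1, 1)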
- """ - - def __init__(self, tensor_para_size=None, layer_para_size=None, layer_para_batch_size=1): - self.world_size = self._env2int( - [ # MPICH, OpenMPI, IMPI - "MPI_LOCALNRANKS", - "OMPI_COMM_WORLD_SIZE", - "PMI_SIZE", - "MV2_COMM_WORLD_SIZE", - "WORLD_SIZE", - ], - 1, - ) - self.rank = self._env2int( - [ # MPICH, OpenMPI, IMPI - "MPI_LOCALRANKID", - "OMPI_COMM_WORLD_RANK", - "PMI_RANK", - "MV2_COMM_WORLD_RANK", - "RANK", - ], - 0, - ) - if layer_para_size is None: - layer_para_size = 1 - if tensor_para_size is None: - tensor_para_size = self.world_size // layer_para_size - self.no_para = tensor_para_size == 1 and layer_para_size == 1 - self.tensor_para_size = tensor_para_size - self.layer_para_size = layer_para_size - self.layer_para_batch_size = layer_para_batch_size - - assert ( - self.world_size == tensor_para_size * layer_para_size - ), "tensor_para_size * layer_para_size must be equal to world_size." - self.tensor_para_rank = self.rank % self.tensor_para_size - self.layer_para_rank = self.rank // self.tensor_para_size - self.is_partial_model = False - - @staticmethod - def _env2int(env_list, default=-1): - for e in env_list: - val = int(os.environ.get(e, -1)) - if val >= 0: - return val - return default - - def is_last_group(self): - r""" - For layer parallel, only the process corresponding to the last layer - group can get the predict results. It is used to check whether this is - the process corresponding to the last layer group. - """ - return self.layer_para_rank == self.layer_para_size - 1 - - def is_load(self, i, num_layer): - r""" - Whether or not the given transformer layer of should be loaded to the - current parallel model. For layer parallel, there is no need not to load - other layer groups. - - Args: - i (int): The index of Transformer layer. - num_layer (int): The number of Transformer layers. - - Returns: - bool: Indicate whether or not the given transformer layer of should - be loaded to the current parallel model. - """ - if self.no_para: - return True - # Take into account model only including partial weights. - if self.is_partial_model: - return True - layers_per_device = num_layer // self.layer_para_size - return (i >= layers_per_device * self.layer_para_rank) and i < layers_per_device * (self.layer_para_rank + 1) - - def slice_weight(self, weight, axis, phase=1, out_param=False): - r""" - Get the weight slice for tensor parallel. - - Args: - weight (Tensor or ndarray): The weight or bias to be sliced. - axis (int): The axis to perform slice. - phase (int, optional): 0 is used for creating partial model when - initializing and `from_pretrained`. While 1 is used in converting - parameters to FastGeneration. No slice would be performed if - it is 1, since parameters have been sliced in `phase=0`. - out_param (bool, optional): If true, `weight` should be a Parameter - and force the output to be a Parameter. - - Returns: - Tensor or ndarray: The sliced weight. - """ - # weight can be parameter/tensor/ndarray - if self.no_para: - return weight - # Take into account model only including partial weights. - if self.is_partial_model: - if phase == 1: - # 0 for init - # 1 for convert param to FT - # TODO(guosheng): Maybe we can remove slice_weight in converting - # parameters to FT if we have sliced parameters at phase 0, while - # we allow to use non-partial model when converting parameters - # to FT currently. 
- return weight - if len(weight.shape) == 1: - axis = 0 - local_size = weight.shape[axis] // self.tensor_para_size - start_offset = self.tensor_para_rank * local_size - end_offset = start_offset + local_size - if len(weight.shape) == 1: - w_slice = weight[start_offset:end_offset] - else: - w_slice = weight[:, start_offset:end_offset] if axis == 1 else weight[start_offset:end_offset, :] - if out_param: - # Assume weight is also a Parameter. - w = type(weight)(shape=w_slice.shape, dtype=weight.dtype, is_bias=len(weight.shape) == 1) - # NOTE: `VarBase.set_value` would use `w.numpy()` while w is not - # initialized and can not be used directly. - # TODO(guosheng): If `w.place `can be used here, use `w.place` to - # avoid w.place and _current_expected_place are different. - w.value().get_tensor().set(w_slice, paddle.framework._current_expected_place()) - return w - else: - return w_slice - - def set_partial_model(self, is_partial_model): - r""" - This is used to set whether or not the current model has complete - parameters. - - Args: - is_partial_model (bool): It is used to set whether or not the - current model has complete parameters. - """ - self.is_partial_model = is_partial_model - - def fit_partial_model(self, model, state_to_load): - r""" - Slice every values included in `state_to_load` according to the shape - of corresponding parameters in `model`. This is used in `from_pratrained` - to get sliced parameter values. - - Args: - model (PretrainedModel): The model to use. - state_to_load (dict): The state dict including complete parameter - values of model. - - Returns: - dict: The state dict contains adjusted values. - """ - if self.no_para or not self.is_partial_model: - return state_to_load - - def fit_param(p, v): - if p.shape[0] != v.shape[0]: - return _ft_para_conf.slice_weight(v, axis=0, phase=0) - if len(p.shape) == 2 and p.shape[1] != v.shape[1]: - return _ft_para_conf.slice_weight(v, axis=1, phase=0) - return v - - for k, v in model.state_dict().items(): - if k in state_to_load: - state_to_load[k] = fit_param(v, state_to_load[k]) - return state_to_load - - -# TODO(guosheng): Maybe use context-manager to allow multiple models. -_ft_para_conf = FTParaConf() - - -def get_ft_para_conf(): - r""" - Get settings for model parallel. - - Returns: - FTParaConf: The settings for model parallel. - """ - return _ft_para_conf - - -def enable_ft_para(tensor_para_size=None, layer_para_size=None, layer_para_batch_size=1): - r""" - Enable model parallel with the given settings in FastGeneration. Currently only - support GPT. Please refer to `Megatron `__ - for details. - - Args: - tensor_para_size (int, optional): The size for tensor parallel. If it is - 1, tensor parallel would not be used. When it is None, tensor parallel - size would be set as `world_size / layer_para_size`. Default to None. - layer_para_size (int, optional): The size for layer parallel. If it is - 1, layer parallel would not be used. When it is None, it would be set - as 1. Default to None. - layer_para_batch_size (int, optional): The local batch size for pipeline - parallel. It is suggested to use `batch_size // layer_para_size`. - Default to 1. - """ - global _ft_para_conf - _ft_para_conf = FTParaConf(tensor_para_size, layer_para_size, layer_para_batch_size) - if _ft_para_conf.no_para: - return - - def reset_param(layer, attr, axis): - param = getattr(layer, attr) - # NOTE: Assignment to parameter 'weight' should be of type Parameter or - # None. 
Additionaly, we cannot delattr and setattr which would remove - # the param from layer._parameters and state_dict, thus cannot fit_partial_model - param = _ft_para_conf.slice_weight(param, axis, phase=0, out_param=True) - setattr(layer, attr, param) - - def layer_init_wrapper(func): - @functools.wraps(func) - def _impl(self, *args, **kwargs): - init_dict = fn_args_to_dict(func, *((self,) + args), **kwargs) - init_dict.pop("self") - assert ( - init_dict["nhead"] % _ft_para_conf.tensor_para_size == 0 - ), "The number of heads(%d) cannot be evenly divisible by `tensor_para_size`(%d)." % ( - init_dict["nhead"], - _ft_para_conf.tensor_para_size, - ) - func(self, *args, **kwargs) - # Reset parameters with corresponding slice. - for x, attr in [(m, n) for m in ("q", "k", "v") for n in ("weight", "bias")]: - reset_param(getattr(self.self_attn, x + "_proj"), attr, 1) - reset_param(self.self_attn.out_proj, "weight", 0) - reset_param(self.linear1, "weight", 1) - reset_param(self.linear1, "bias", 1) - reset_param(self.linear2, "weight", 0) - - return _impl - - def block_init_wrapper(func): - @functools.wraps(func) - def _impl(self, *args, **kwargs): - init_dict = fn_args_to_dict(func, *((self,) + args), **kwargs) - init_dict.pop("self") - num_layers = init_dict["num_hidden_layers"] - init_dict["num_hidden_layers"] //= _ft_para_conf.layer_para_size - func(self, **init_dict) - self.num_layers = num_layers - self.config["num_hidden_layers"] = num_layers - - return _impl - - def block_state_wrapper(func): - # TODO(guosheng): Uset state hook instead of block_state_wrapper. - # self.register_state_dict_hook(reidx_state_layer) - @functools.wraps(func) - def _impl(self, *args, **kwargs): - state_dict = func(self, *args, **kwargs) - arg_dict = fn_args_to_dict(func, *((self,) + args), **kwargs) - structured_name_prefix = arg_dict["structured_name_prefix"] - - def reidx_state_layer(state_dict): - prefix = structured_name_prefix + "decoder.layers." - prefix_len = len(prefix) - for name, param in list(state_dict.items()): - if name.startswith(prefix): - layer_idx_len = 0 - for i in name[prefix_len:]: - if i == ".": - break - else: - layer_idx_len += 1 - layer_idx = int(name[prefix_len : prefix_len + layer_idx_len]) - new_name = ( - name[:prefix_len] - + str(_ft_para_conf.layer_para_rank * len(self.decoder.layers) + layer_idx) - + name[prefix_len + layer_idx_len :] - ) - state_dict[new_name] = state_dict.pop(name) - - reidx_state_layer(state_dict) - return state_dict - - return _impl - - # GPT - layer_init_fn = paddlenlp.transformers.gpt.modeling.TransformerDecoderLayer.__init__ - paddlenlp.transformers.gpt.modeling.TransformerDecoderLayer.__init__ = layer_init_wrapper(layer_init_fn) - # Note that Transformer block in GPT is not created in TransformerDecoder - # but in GPTModel. - block_init_fn = paddlenlp.transformers.gpt.modeling.GPTModel.__init__ - paddlenlp.transformers.gpt.modeling.GPTModel.__init__ = block_init_wrapper(block_init_fn) - block_state_fn = paddlenlp.transformers.gpt.modeling.GPTModel.state_dict - paddlenlp.transformers.gpt.modeling.GPTModel.state_dict = block_state_wrapper(block_state_fn) - # PLATO - paddle.nn.TransformerEncoderLayer.__init__ = layer_init_wrapper(paddle.nn.TransformerEncoderLayer.__init__) - _ft_para_conf.set_partial_model(True) - # TODO(guosheng): Should we set device here, sometimes we want to create - # models on CPU first to save memory. 
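`reset_param` above relies on `slice_weight` to give every tensor-parallel rank its own contiguous shard of each projection. A rough NumPy analogue of that slicing (illustrative only; the real code operates on Paddle parameters):

import numpy as np

def slice_weight(weight, axis, tensor_para_size, tensor_para_rank):
    # 1-D biases are always sliced along axis 0, mirroring the Paddle implementation.
    if weight.ndim == 1:
        axis = 0
    local_size = weight.shape[axis] // tensor_para_size
    start = tensor_para_rank * local_size
    end = start + local_size
    return weight[:, start:end] if (weight.ndim > 1 and axis == 1) else weight[start:end]

# Per reset_param above: the q/k/v projections and linear1 weights are split along
# axis 1 (columns), while out_proj and linear2 weights are split along axis 0 (rows).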
- # paddle.set_device("gpu:" + str(_ft_para_conf.rank)) - # yield - - -class InferOptDecoding(nn.Layer): - """extract infer model parameters and feed it into the cuda decoder""" - - def __init__(self, model: OPTForCausalLM, decoding_lib=None, use_fp16_decoding=False): - if decoding_lib is not None and os.path.isfile(decoding_lib): - if "FastGeneration" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(decoding_lib) - LOADED_EXT["FastGeneration"] = ops - else: - if decoding_lib is not None: - logger.warning("The specified decoding_lib does not exist, and it will be built automatically.") - load( - "FastGeneration" if get_ft_para_conf().no_para else "FasterTransformerParallel", - verbose=True, - need_parallel=not get_ft_para_conf().no_para, - ) - - super(InferOptDecoding, self).__init__() - - self.use_fp16_decoding = use_fp16_decoding - self.model = model - self.head_num = self.model.opt.config["num_attention_heads"] - self.size_per_head = int(self.model.opt.config["hidden_size"] / self.head_num) - self.num_layer = self.model.opt.config["num_hidden_layers"] - self.inner_size = self.model.opt.config["intermediate_size"] - - params = convert_params(self, model, fuse_qkv=1, use_fp16=use_fp16_decoding, restore_data=True) - - if self.model.opt.embeddings.project_in is not None: - self.word_emb = paddle.matmul( - self.model.opt.embeddings.word_embeddings.weight, self.model.opt.embeddings.project_in.weight - ) - # set the linear_weight - self.linear_weight = paddle.matmul( - self.model.opt.embeddings.word_embeddings.weight, self.model.opt.decoder.project_out.weight.T - ) - else: - self.word_emb = self.model.opt.embeddings.word_embeddings.weight - self.linear_weight = self.model.opt.embeddings.word_embeddings.weight - - # reset the offset in position embedding - position_embedding = self.model.opt.embeddings.position_embeddings - self.pos_emb = paddle.concat([position_embedding.weight[2:], position_embedding.weight[:2]]) - - # if there is no final layer norm, pass empty tensor to fusion opt op - final_layer_norm = self.model.opt.decoder.final_layer_norm - if final_layer_norm is None: - self.decoder_ln_weight = paddle.empty(shape=[0]) - self.decoder_ln_bias = paddle.empty(shape=[0]) - else: - self.decoder_ln_weight = final_layer_norm.weight - self.decoder_ln_bias = final_layer_norm.bias - - self.normalize_before = self.model.decoder.final_layer_norm is not None - - for k, v in params.items(): - setattr(self, k, v) - - # check the dtype of embedding - dtype = "float16" if use_fp16_decoding else "float32" - self.word_emb = transfer_param(self.word_emb, dtype=dtype, is_bias=False, restore_data=True) - self.linear_weight = transfer_param(self.linear_weight, dtype=dtype, is_bias=False, restore_data=True) - self.pos_emb = transfer_param(self.pos_emb, dtype=dtype, is_bias=False, restore_data=True) - self.decoder_ln_weight = transfer_param(self.decoder_ln_weight, dtype=dtype, is_bias=False, restore_data=True) - self.decoder_ln_bias = transfer_param(self.decoder_ln_bias, dtype=dtype, is_bias=True, restore_data=True) - - def forward( - self, - input_ids, - mem_seq_len, - attention_mask=None, - topk=4, - topp=0.0, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - forced_eos_token_id=None, - max_out_len=256, - temperature=1, - ): - if attention_mask is None: - batch_size = input_ids.shape[0] - attention_mask = paddle.tril( - paddle.ones( - [batch_size, mem_seq_len, mem_seq_len], dtype="float16" if self.use_fp16_decoding else "float32" - ) - ) - elif 
self.use_fp16_decoding and attention_mask.dtype == paddle.float32: - attention_mask = paddle.cast(attention_mask, dtype="float16") - - output_ids = infer_opt_decoding( - input=[input_ids], - attn_mask=[attention_mask], - mem_seq_len=[mem_seq_len], - word_emb=self.word_emb, - slf_ln_weight=self.slf_ln_weight, - slf_ln_bias=self.slf_ln_bias, - slf_q_weight=self.slf_q_weight, - slf_q_bias=self.slf_q_bias, - slf_k_weight=self.slf_k_weight, - slf_k_bias=self.slf_k_bias, - slf_v_weight=self.slf_v_weight, - slf_v_bias=self.slf_v_bias, - slf_out_weight=self.slf_out_weight, - slf_out_bias=self.slf_out_bias, - ffn_ln_weight=self.ffn_ln_weight, - ffn_ln_bias=self.ffn_ln_bias, - ffn_inter_weight=self.ffn_inter_weight, - ffn_inter_bias=self.ffn_inter_bias, - ffn_out_weight=self.ffn_out_weight, - ffn_out_bias=self.ffn_out_bias, - decoder_ln_weight=self.decoder_ln_weight, - decoder_ln_bias=self.decoder_ln_bias, - pos_emb=self.pos_emb, - linear_weight=self.linear_weight, - normalize_before=self.normalize_before, - topk=topk, - topp=topp, - max_out_len=max_out_len, - head_num=self.head_num, - size_per_head=self.size_per_head, - num_layer=self.num_layer, - bos_id=bos_token_id, - eos_id=eos_token_id, - temperature=temperature, - use_fp16_decoding=self.use_fp16_decoding, - ) - - output_ids = output_ids[input_ids.shape[-1] :, :] - if forced_eos_token_id is not None: - output_ids[:, -1] = forced_eos_token_id - return output_ids - - -class InferGptDecoding(nn.Layer): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False): - if decoding_lib is not None and os.path.isfile(decoding_lib): - if "FastGeneration" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(decoding_lib) - LOADED_EXT["FastGeneration"] = ops - else: - if decoding_lib is not None: - logger.warning("The specified decoding_lib does not exist, and it will be built automatically.") - load( - "FastGeneration" if get_ft_para_conf().no_para else "FasterTransformerParallel", - verbose=True, - need_parallel=not get_ft_para_conf().no_para, - ) - - super(InferGptDecoding, self).__init__() - - self.use_fp16_decoding = use_fp16_decoding - self.model = model - self.head_num = self.model.gpt.config["num_attention_heads"] - self.size_per_head = int(self.model.gpt.config["hidden_size"] / self.head_num) - self.num_layer = self.model.gpt.config["num_hidden_layers"] - self.inner_size = self.model.gpt.config["intermediate_size"] - - params = convert_params(self, model, fuse_qkv=1, use_fp16=use_fp16_decoding, restore_data=True) - params["word_emb"].append((self.model.gpt.embeddings.word_embeddings, "weight")) - params["pos_emb"].append((self.model.gpt.embeddings.position_embeddings, "weight")) - - # if model share word_embeddings weight - if id(self.model.gpt.embeddings.word_embeddings) == id(self.model.lm_head.weight): - params["linear_weight"].append((self.model.gpt.embeddings.word_embeddings, "weight")) - else: - params["linear_weight"].append((self.model.lm_head.weight, False, partial(setattr, self, "weight"))) - - for k, v in params.items(): - setattr(self, k, v) - - def forward( - self, - input_ids, - mem_seq_len, - attention_mask=None, - topk=4, - topp=0.0, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - forced_eos_token_id=None, - max_out_len=256, - temperature=1, - ): - if attention_mask is None: - batch_size = input_ids.shape[0] - attention_mask = paddle.tril( - paddle.ones( - [batch_size, paddle.max(mem_seq_len), paddle.max(mem_seq_len)], - dtype="float16" if self.use_fp16_decoding 
else "float32", - ) - ) - elif self.use_fp16_decoding and attention_mask.dtype == paddle.float32: - attention_mask = paddle.cast(attention_mask, dtype="float16") - - (output_ids,) = infer_gpt_decoding( - input=[input_ids], - attn_mask=[attention_mask], - mem_seq_len=[mem_seq_len], - word_emb=self.word_emb, - slf_ln_weight=self.slf_ln_weight, - slf_ln_bias=self.slf_ln_bias, - slf_q_weight=self.slf_q_weight, - slf_q_bias=self.slf_q_bias, - slf_k_weight=self.slf_k_weight, - slf_k_bias=self.slf_k_bias, - slf_v_weight=self.slf_v_weight, - slf_v_bias=self.slf_v_bias, - slf_out_weight=self.slf_out_weight, - slf_out_bias=self.slf_out_bias, - ffn_ln_weight=self.ffn_ln_weight, - ffn_ln_bias=self.ffn_ln_bias, - ffn_inter_weight=self.ffn_inter_weight, - ffn_inter_bias=self.ffn_inter_bias, - ffn_out_weight=self.ffn_out_weight, - ffn_out_bias=self.ffn_out_bias, - decoder_ln_weight=self.decoder_ln_weight, - decoder_ln_bias=self.decoder_ln_bias, - pos_emb=self.pos_emb, - linear_weight=self.linear_weight, - topk=topk, - topp=topp, - max_out_len=max_out_len, - head_num=self.head_num, - size_per_head=self.size_per_head, - num_layer=self.num_layer, - bos_id=bos_token_id, - eos_id=eos_token_id, - temperature=temperature, - use_fp16_decoding=self.use_fp16_decoding, - ) - - output_ids = output_ids[input_ids.shape[-1] :, :] - if forced_eos_token_id is not None: - output_ids[:, -1] = forced_eos_token_id - return output_ids - - -class InferUnifiedDecoding(nn.Layer): - def __init__( - self, - model, - decoding_lib=None, - use_fp16_decoding=False, - logits_mask=None, - n_head=8, - hidden_dims=512, - size_per_head=64, - n_layer=6, - unk_id=0, - mask_id=30000, - normalize_before=True, - hidden_act="gelu", - ): - if decoding_lib is not None and os.path.isfile(decoding_lib): - # Maybe it has been loadad by `ext_utils.load` - if "FastGeneration" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(decoding_lib) - LOADED_EXT["FastGeneration"] = ops - else: - if decoding_lib is not None: - logger.warning("The specified decoding_lib does not exist, and it will be built automatically.") - load( - "FastGeneration" if get_ft_para_conf().no_para else "FasterTransformerParallel", - verbose=True, - need_parallel=not get_ft_para_conf().no_para, - ) - - super(InferUnifiedDecoding, self).__init__() - for arg, value in locals().items(): - if arg not in ["self"]: - setattr(self, "_" + arg, value) - - params = convert_params(self, model, fuse_qkv=1, use_fp16=use_fp16_decoding, restore_data=True) - params["word_emb"].append((model.embeddings.word_embeddings, "weight")) - params["pos_emb"].append((model.embeddings.position_embeddings, "weight")) - params["type_emb"].append((model.embeddings.token_type_embeddings, "weight")) - if getattr(model.embeddings, "role_embeddings", None) is not None: - params["role_emb"].append((model.embeddings.role_embeddings, "weight")) - else: - # inputs of custom op cannot be None - params["role_emb"].append((paddle.zeros(shape=[1]), False, partial(setattr, self, "default_role_emb"))) - if not self._normalize_before: - # pre-norm params has been converted in `convert_params`, and this - # is only for post-norm such as UNIMO. 
- params["decoder_ln_weight"].append((model.encoder_norm, "weight")) - params["decoder_ln_bias"].append((model.encoder_norm, "bias")) - params["trans_weight"].append((model.lm_head.transform, "weight")) - params["trans_bias"].append((model.lm_head.transform, "bias")) - params["lm_ln_weight"].append((model.lm_head.layer_norm, "weight")) - params["lm_ln_bias"].append((model.lm_head.layer_norm, "bias")) - # NOTE: newly created tensors should be layer attribute refered to be - # able to convert to static graph. - params["linear_weight"].append((model.lm_head.decoder_weight.t(), False, partial(setattr, self, "dec_weight"))) - params["linear_bias"].append( - (paddle.assign(model.lm_head.decoder_bias), True, partial(setattr, self, "dec_bias")) - ) - for k, v in params.items(): - setattr(self, k, v) - - def forward( - self, - input_ids, - attn_mask, - memory_seq_lens, - type_id, - decoder_type_id, - role_id=None, - decoder_role_id=None, - position_id=None, - decoder_position_id=None, - beam_size=4, - topk=4, - topp=0.0, - decoding_strategy="greedy_search", - max_out_len=256, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - forced_eos_token_id=None, - temperature=1.0, - length_penalty=1.0, - diversity_rate=0.0, - pos_bias=True, - rel_len=False, - early_stopping=False, - min_length=0, - ): - if role_id is None: - role_id = paddle.zeros(shape=[0], dtype="int32") - decoder_role_id = paddle.zeros(shape=[0], dtype="int32") - if position_id is None: - position_id = paddle.zeros(shape=[0], dtype="int32") - decoder_position_id = paddle.zeros(shape=[0], dtype="int32") - - if decoding_strategy == "greedy_search": - decoding_strategy = "topk_sampling" - topk = 1 - topp = 0.0 - elif decoding_strategy in ["sampling", "topk_sampling", "topp_sampling"]: - if topp == 1 and topk > 0: - decoding_strategy = "topk_sampling" - topp = 0.0 - elif topp > 0 and topk == 0: - decoding_strategy = "topp_sampling" - else: - raise AttributeError( - "Only topk sampling or topp sampling are supported. " - "Topk sampling and topp sampling cannot be both applied in the fast version." 
- ) - elif decoding_strategy.startswith("beam_search"): - decoding_strategy = "beam_search_v3" - - output_ids, parent_ids, sequence_length, output_scores = infer_unified_decoding( - input_ids=[input_ids], - attn_mask=[attn_mask], - memory_seq_lens=[memory_seq_lens], - type_id=[type_id], - decoder_type_id=[decoder_type_id], - logits_mask=[self._logits_mask], - word_emb=self.word_emb, - slf_ln_weight=self.slf_ln_weight, - slf_ln_bias=self.slf_ln_bias, - slf_q_weight=self.slf_q_weight, - slf_q_bias=self.slf_q_bias, - slf_k_weight=self.slf_k_weight, - slf_k_bias=self.slf_k_bias, - slf_v_weight=self.slf_v_weight, - slf_v_bias=self.slf_v_bias, - slf_out_weight=self.slf_out_weight, - slf_out_bias=self.slf_out_bias, - ffn_ln_weight=self.ffn_ln_weight, - ffn_ln_bias=self.ffn_ln_bias, - ffn_inter_weight=self.ffn_inter_weight, - ffn_inter_bias=self.ffn_inter_bias, - ffn_out_weight=self.ffn_out_weight, - ffn_out_bias=self.ffn_out_bias, - decoder_ln_weight=self.decoder_ln_weight, - decoder_ln_bias=self.decoder_ln_bias, - trans_weight=self.trans_weight, - trans_bias=self.trans_bias, - lm_ln_weight=self.lm_ln_weight, - lm_ln_bias=self.lm_ln_bias, - linear_weight=self.linear_weight, - linear_bias=self.linear_bias, - pos_emb=self.pos_emb, - type_emb=self.type_emb, - role_id=[role_id], - decoder_role_id=[decoder_role_id], - role_emb=self.role_emb, - position_id=[position_id], - decoder_position_id=[decoder_position_id], - _decoding_strategy=decoding_strategy, - _beam_size=beam_size, - _topk=topk, - _topp=topp, - _n_head=self._n_head, - _size_per_head=self._size_per_head, - _n_layer=self._n_layer, - _bos_id=bos_token_id, - _eos_id=eos_token_id, - _max_out_len=max_out_len, - _diversity_rate=-diversity_rate, - _unk_id=self._unk_id, - _mask_id=self._mask_id, - _temperature=temperature, - _len_penalty=length_penalty, - _normalize_before=self._normalize_before, - _pos_bias=pos_bias, - _hidden_act=self._hidden_act, - _rel_len=rel_len, - _early_stopping=early_stopping, - _min_length=min_length, - ) - ids = finalize( - beam_size, - output_ids, - parent_ids, - sequence_length, - forced_eos_token_id=forced_eos_token_id, - decoding_strategy=decoding_strategy, - ) - return ids, output_scores - - -class InferMIRODecoding(nn.Layer): - def __init__( - self, - model, - decoding_lib=None, - use_fp16_decoding=False, - logits_mask=None, - n_head=8, - hidden_dims=512, - size_per_head=64, - n_layer=6, - unk_id=0, - mask_id=30000, - normalize_before=True, - hidden_act="relu", - ): - - if decoding_lib is not None and os.path.isfile(decoding_lib): - # Maybe it has been loadad by `ext_utils.load` - if "FasterTransformer" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(decoding_lib) - LOADED_EXT["FasterTransformer"] = ops - else: - if decoding_lib is not None: - logger.warning("The specified decoding_lib does not exist, and it will be built automatically.") - load( - "FasterTransformer" if get_ft_para_conf().no_para else "FasterTransformerParallel", - verbose=True, - need_parallel=not get_ft_para_conf().no_para, - ) - - super(InferMIRODecoding, self).__init__() - for arg, value in locals().items(): - if arg not in ["self"]: - setattr(self, "_" + arg, value) - - params = convert_params(self, model, fuse_qkv=1, use_fp16=use_fp16_decoding, restore_data=True) - params["word_emb"].append((model.embeddings.word_embeddings, "weight")) - params["pos_emb"].append((model.embeddings.position_embeddings, "weight")) - params["type_emb"].append((model.embeddings.token_type_embeddings, "weight")) - if 
getattr(model.embeddings, "role_embeddings", None) is not None: - params["role_emb"].append((model.embeddings.role_embeddings, "weight")) - else: - # inputs of custom op cannot be None - params["role_emb"].append((paddle.zeros(shape=[1]), False, partial(setattr, self, "default_role_emb"))) - # if not self._normalize_before: - # # pre-norm params has been converted in `convert_params`, and this - # # is only for post-norm such as UNIMO. - # params["decoder_ln_weight"].append((model.encoder_norm, "weight")) - # params["decoder_ln_bias"].append((model.encoder_norm, "bias")) - params["pre_decoder_ln_weight"].append((model.encoder_norm, "weight")) - params["pre_decoder_ln_bias"].append((model.encoder_norm, "bias")) - - params["trans_weight"].append((model.lm_head.transform, "weight")) - params["trans_bias"].append((model.lm_head.transform, "bias")) - params["lm_ln_weight"].append((model.lm_head.layer_norm, "weight")) - params["lm_ln_bias"].append((model.lm_head.layer_norm, "bias")) - # NOTE: newly created tensors should be layer attribute refered to be - # able to convert to static graph. - params["linear_weight"].append((model.lm_head.decoder_weight.t(), False, partial(setattr, self, "dec_weight"))) - params["linear_bias"].append( - (paddle.assign(model.lm_head.decoder_bias), True, partial(setattr, self, "dec_bias")) - ) - for k, v in params.items(): - setattr(self, k, v) - - def forward( - self, - input_ids, - attn_mask, - memory_seq_lens, - type_id, - decoder_type_id, - role_id=None, - decoder_role_id=None, - position_id=None, - decoder_position_id=None, - beam_size=4, - topk=4, - topp=0.0, - decoding_strategy="greedy_search", - max_out_len=256, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - forced_eos_token_id=None, - temperature=1.0, - length_penalty=1.0, - diversity_rate=0.0, - pos_bias=True, - rel_len=False, - early_stopping=False, - min_length=0, - ): - if role_id is None: - role_id = paddle.zeros(shape=[0], dtype="int32") - decoder_role_id = paddle.zeros(shape=[0], dtype="int32") - if position_id is None: - position_id = paddle.zeros(shape=[0], dtype="int32") - decoder_position_id = paddle.zeros(shape=[0], dtype="int32") - - if decoding_strategy == "greedy_search": - decoding_strategy = "topk_sampling" - topk = 1 - topp = 0.0 - elif decoding_strategy in ["sampling", "topk_sampling", "topp_sampling"]: - if topp == 1 and topk > 0: - decoding_strategy = "topk_sampling" - topp = 0.0 - elif topp > 0 and topk == 0: - decoding_strategy = "topp_sampling" - else: - raise AttributeError( - "Only topk sampling or topp sampling are supported. " - "Topk sampling and topp sampling cannot be both applied in the faster version." 
- ) - elif decoding_strategy.startswith("beam_search"): - decoding_strategy = "beam_search_v3" - - output_ids, parent_ids, sequence_length, output_scores = infer_miro_decoding( - input_ids=[input_ids], - attn_mask=[attn_mask], - memory_seq_lens=[memory_seq_lens], - type_id=[type_id], - decoder_type_id=[decoder_type_id], - logits_mask=[self._logits_mask], - word_emb=self.word_emb, - pre_decoder_ln_weight=self.pre_decoder_ln_weight, - pre_decoder_ln_bias=self.pre_decoder_ln_bias, - slf_ln_weight=self.slf_ln_weight, - slf_ln_bias=self.slf_ln_bias, - slf_q_weight=self.slf_q_weight, - slf_q_bias=self.slf_q_bias, - slf_k_weight=self.slf_k_weight, - slf_k_bias=self.slf_k_bias, - slf_v_weight=self.slf_v_weight, - slf_v_bias=self.slf_v_bias, - slf_out_weight=self.slf_out_weight, - slf_out_bias=self.slf_out_bias, - ffn_ln_weight=self.ffn_ln_weight, - ffn_ln_bias=self.ffn_ln_bias, - ffn_inter_weight=self.ffn_inter_weight, - ffn_inter_bias=self.ffn_inter_bias, - ffn_out_weight=self.ffn_out_weight, - ffn_out_bias=self.ffn_out_bias, - decoder_ln_weight=self.decoder_ln_weight, - decoder_ln_bias=self.decoder_ln_bias, - trans_weight=self.trans_weight, - trans_bias=self.trans_bias, - lm_ln_weight=self.lm_ln_weight, - lm_ln_bias=self.lm_ln_bias, - linear_weight=self.linear_weight, - linear_bias=self.linear_bias, - pos_emb=self.pos_emb, - type_emb=self.type_emb, - role_id=[role_id], - decoder_role_id=[decoder_role_id], - role_emb=self.role_emb, - position_id=[position_id], - decoder_position_id=[decoder_position_id], - _decoding_strategy=decoding_strategy, - _beam_size=beam_size, - _topk=topk, - _topp=topp, - _n_head=self._n_head, - _size_per_head=self._size_per_head, - _n_layer=self._n_layer, - _bos_id=bos_token_id, - _eos_id=eos_token_id, - _max_out_len=max_out_len, - _diversity_rate=-diversity_rate, - _unk_id=self._unk_id, - _mask_id=self._mask_id, - _temperature=temperature, - _len_penalty=length_penalty, - _normalize_before=self._normalize_before, - _pos_bias=pos_bias, - _hidden_act=self._hidden_act, - _rel_len=rel_len, - _early_stopping=early_stopping, - _min_length=min_length, - ) - - ids = finalize( - beam_size, - output_ids, - parent_ids, - sequence_length, - forced_eos_token_id=forced_eos_token_id, - decoding_strategy=decoding_strategy, - ) - - return ids, output_scores - - -class InferBartDecoding(nn.Layer): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False): - if decoding_lib is not None and os.path.isfile(decoding_lib): - # Maybe it has been loadad by `ext_utils.load` - if "FastGeneration" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(decoding_lib) - LOADED_EXT["FastGeneration"] = ops - else: - if decoding_lib is not None: - logger.warning("The specified decoding_lib does not exist, and it will be built automatically.") - load("FastGeneration", verbose=True) - - super(InferBartDecoding, self).__init__() - for arg, value in locals().items(): - if arg not in ["self", "model", "word_embedding", "positional_embedding", "linear"]: - setattr(self, "_" + arg, value) - self._num_decoder_layers = model.bart.config["decoder_layers"] - self._n_head = model.bart.config["decoder_attention_heads"] - self._d_model = model.bart.config["d_model"] - - params = convert_params(self, model.get_decoder(), fuse_qkv=2, use_fp16=use_fp16_decoding, restore_data=True) - params["decoder_ln_weight"].append((model.decoder.decoder_layernorm_embedding, "weight")) - params["decoder_ln_bias"].append((model.decoder.decoder_layernorm_embedding, "bias")) - 
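`convert_params` here is called with fuse_qkv=2, and InferTransformerDecoding above allocates a single fused parameter for q/k/v. The fusion itself amounts to concatenating the three projections along the output dimension so one GEMM yields Q, K and V together; a rough NumPy sketch:

import numpy as np

def fuse_qkv_weights(q_w, k_w, v_w):
    # q_w, k_w, v_w: [hidden, hidden] each -> fused weight: [hidden, 3 * hidden]
    return np.concatenate([q_w, k_w, v_w], axis=-1)

def fuse_qkv_biases(q_b, k_b, v_b):
    # biases: [hidden] each -> fused bias: [3 * hidden]
    return np.concatenate([q_b, k_b, v_b], axis=0)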
params["word_emb"].append((model.decoder.embed_tokens, "weight")) - params["pos_emb"].append((model.decoder.decoder_embed_positions, "weight")) - params["linear_weight"].append((model.lm_head_weight.t(), False, partial(setattr, self, "lm_head_weight_"))) - params["linear_bias"].append((model.final_logits_bias, True, partial(setattr, self, "lm_head_bias_"))) - for k, v in params.items(): - setattr(self, k, v) - - def forward( - self, - enc_output, - memory_seq_lens, - beam_size=4, - top_k=1, - top_p=0.0, - temperature=1.0, - decoding_strategy="beam_search_v3", - max_out_len=256, - min_out_len=256, - diversity_rate=0.0, - rel_len=False, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - forced_eos_token_id=None, - alpha=0.6, - early_stopping=False, - ): - # beam_search/beam_search_v2/beam_search_v3 should be corrected to beam_search_v3. - if decoding_strategy.startswith("beam_search"): - decoding_strategy = "beam_search_v3" - elif decoding_strategy == "greedy_search": - decoding_strategy = "topk_sampling" - top_k = 1 - top_p = 0.0 - elif decoding_strategy in ["sampling", "topk_sampling", "topp_sampling"]: - if top_p == 1 and top_k > 0: - decoding_strategy = "topk_sampling" - top_p = 0.0 - elif top_p > 0 and top_k == 0: - decoding_strategy = "topp_sampling" - else: - raise AttributeError( - "Only topk sampling or topp sampling are supported. " - "Topk sampling and topp sampling cannot be both applied in the fast version. " - ) - - output_ids, parent_ids, sequence_length = infer_bart_decoding( - [enc_output], - [memory_seq_lens], - self.word_emb, - self.slf_ln_weight, - self.slf_ln_bias, - self.slf_q_weight, - self.slf_q_bias, - self.slf_k_weight, - self.slf_k_bias, - self.slf_v_weight, - self.slf_v_bias, - self.slf_out_weight, - self.slf_out_bias, - self.cross_ln_weight, - self.cross_ln_bias, - self.cross_q_weight, - self.cross_q_bias, - self.cross_k_weight, - self.cross_k_bias, - self.cross_v_weight, - self.cross_v_bias, - self.cross_out_weight, - self.cross_out_bias, - self.ffn_ln_weight, - self.ffn_ln_bias, - self.ffn_inter_weight, - self.ffn_inter_bias, - self.ffn_out_weight, - self.ffn_out_bias, - self.decoder_ln_weight, - self.decoder_ln_bias, - self.linear_weight, - self.linear_bias, - self.pos_emb, - decoding_strategy, - beam_size, - top_k, - top_p, - temperature, - self._n_head, - int(self._d_model / self._n_head), - self._num_decoder_layers, - bos_token_id, - eos_token_id, - max_out_len, - min_out_len, - -diversity_rate, - rel_len, - alpha, - early_stopping, - ) - - ids = finalize( - beam_size, - output_ids, - parent_ids, - sequence_length, - forced_eos_token_id=forced_eos_token_id, - decoding_strategy=decoding_strategy, - ) - return ids - - -class InferMBartDecoding(nn.Layer): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False, hidden_act="gelu"): - if decoding_lib is not None and os.path.isfile(decoding_lib): - # Maybe it has been loadad by `ext_utils.load` - if "FastGeneration" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(decoding_lib) - LOADED_EXT["FastGeneration"] = ops - else: - if decoding_lib is not None: - logger.warning("The specified decoding_lib does not exist, and it will be built automatically.") - load("FastGeneration", verbose=True) - - super(InferMBartDecoding, self).__init__() - for arg, value in locals().items(): - if arg not in ["self", "model", "word_embedding", "positional_embedding", "linear"]: - setattr(self, "_" + arg, value) - self._num_decoder_layers = 
model.mbart.config["decoder_layers"] - self._n_head = model.mbart.config["decoder_attention_heads"] - self._d_model = model.mbart.config["d_model"] - - # process weights - if use_fp16_decoding: - for mod in model.mbart.decoder.decoder.layers: - mod.norm1.weight = transfer_param(mod.norm1.weight, restore_data=True) - mod.norm1.bias = transfer_param(mod.norm1.bias, is_bias=True, restore_data=True) - mod.self_attn.q_proj.weight = transfer_param(mod.self_attn.q_proj.weight, restore_data=True) - mod.self_attn.q_proj.bias = transfer_param(mod.self_attn.q_proj.bias, is_bias=True, restore_data=True) - mod.self_attn.k_proj.weight = transfer_param(mod.self_attn.k_proj.weight, restore_data=True) - mod.self_attn.k_proj.bias = transfer_param(mod.self_attn.k_proj.bias, is_bias=True, restore_data=True) - mod.self_attn.v_proj.weight = transfer_param(mod.self_attn.v_proj.weight, restore_data=True) - mod.self_attn.v_proj.bias = transfer_param(mod.self_attn.v_proj.bias, is_bias=True, restore_data=True) - mod.self_attn.out_proj.weight = transfer_param(mod.self_attn.out_proj.weight, restore_data=True) - mod.self_attn.out_proj.bias = transfer_param( - mod.self_attn.out_proj.bias, is_bias=True, restore_data=True - ) - - mod.norm2.weight = transfer_param(mod.norm2.weight, restore_data=True) - mod.norm2.bias = transfer_param(mod.norm2.bias, is_bias=True, restore_data=True) - mod.cross_attn.q_proj.weight = transfer_param(mod.cross_attn.q_proj.weight, restore_data=True) - mod.cross_attn.q_proj.bias = transfer_param( - mod.cross_attn.q_proj.bias, is_bias=True, restore_data=True - ) - mod.cross_attn.k_proj.weight = transfer_param(mod.cross_attn.k_proj.weight, restore_data=True) - mod.cross_attn.k_proj.bias = transfer_param( - mod.cross_attn.k_proj.bias, is_bias=True, restore_data=True - ) - mod.cross_attn.v_proj.weight = transfer_param(mod.cross_attn.v_proj.weight, restore_data=True) - mod.cross_attn.v_proj.bias = transfer_param( - mod.cross_attn.v_proj.bias, is_bias=True, restore_data=True - ) - mod.cross_attn.out_proj.weight = transfer_param(mod.cross_attn.out_proj.weight, restore_data=True) - mod.cross_attn.out_proj.bias = transfer_param( - mod.cross_attn.out_proj.bias, is_bias=True, restore_data=True - ) - - mod.norm3.weight = transfer_param(mod.norm3.weight, restore_data=True) - mod.norm3.bias = transfer_param(mod.norm3.bias, is_bias=True, restore_data=True) - mod.linear1.weight = transfer_param(mod.linear1.weight, restore_data=True) - mod.linear1.bias = transfer_param(mod.linear1.bias, is_bias=True, restore_data=True) - mod.linear2.weight = transfer_param(mod.linear2.weight, restore_data=True) - mod.linear2.bias = transfer_param(mod.linear2.bias, is_bias=True, restore_data=True) - - model.decoder.decoder_layernorm_embedding.weight = transfer_param( - model.decoder.decoder_layernorm_embedding.weight, restore_data=True - ) - model.decoder.decoder_layernorm_embedding.bias = transfer_param( - model.decoder.decoder_layernorm_embedding.bias, is_bias=True, restore_data=True - ) - - model.decoder.decoder.norm.weight = transfer_param(model.decoder.decoder.norm.weight, restore_data=True) - model.decoder.decoder.norm.bias = transfer_param( - model.decoder.decoder.norm.bias, is_bias=True, restore_data=True - ) - - model.lm_head_weight = transfer_param(model.lm_head_weight, restore_data=True) - model.final_logits_bias = transfer_param(model.final_logits_bias, is_bias=True, restore_data=True) - - model.decoder.decoder_embed_positions.weight = transfer_param( - model.decoder.decoder_embed_positions.weight, restore_data=True - 
) - model.decoder.embed_tokens.weight = transfer_param(model.decoder.embed_tokens.weight, restore_data=True) - - self.slf_ln_weight = [] - self.slf_ln_bias = [] - self.slf_q_weight = [] - self.slf_q_bias = [] - self.slf_k_weight = [] - self.slf_k_bias = [] - self.slf_v_weight = [] - self.slf_v_bias = [] - self.slf_out_weight = [] - self.slf_out_bias = [] - - self.cross_ln_weight = [] - self.cross_ln_bias = [] - self.cross_q_weight = [] - self.cross_q_bias = [] - self.cross_k_weight = [] - self.cross_k_bias = [] - self.cross_v_weight = [] - self.cross_v_bias = [] - self.cross_out_weight = [] - self.cross_out_bias = [] - - self.ffn_ln_weight = [] - self.ffn_ln_bias = [] - self.ffn_inter_weight = [] - self.ffn_inter_bias = [] - self.ffn_out_weight = [] - self.ffn_out_bias = [] - - for mod in model.mbart.decoder.decoder.layers: - self.slf_ln_weight.append(mod.norm1.weight) - self.slf_ln_bias.append(mod.norm1.bias) - self.slf_q_weight.append(mod.self_attn.q_proj.weight) - self.slf_q_bias.append(mod.self_attn.q_proj.bias) - self.slf_k_weight.append(mod.self_attn.k_proj.weight) - self.slf_k_bias.append(mod.self_attn.k_proj.bias) - self.slf_v_weight.append(mod.self_attn.v_proj.weight) - self.slf_v_bias.append(mod.self_attn.v_proj.bias) - self.slf_out_weight.append(mod.self_attn.out_proj.weight) - self.slf_out_bias.append(mod.self_attn.out_proj.bias) - - self.cross_ln_weight.append(mod.norm2.weight) - self.cross_ln_bias.append(mod.norm2.bias) - self.cross_q_weight.append(mod.cross_attn.q_proj.weight) - self.cross_q_bias.append(mod.cross_attn.q_proj.bias) - self.cross_k_weight.append(mod.cross_attn.k_proj.weight) - self.cross_k_bias.append(mod.cross_attn.k_proj.bias) - self.cross_v_weight.append(mod.cross_attn.v_proj.weight) - self.cross_v_bias.append(mod.cross_attn.v_proj.bias) - self.cross_out_weight.append(mod.cross_attn.out_proj.weight) - self.cross_out_bias.append(mod.cross_attn.out_proj.bias) - - self.ffn_ln_weight.append(mod.norm3.weight) - self.ffn_ln_bias.append(mod.norm3.bias) - self.ffn_inter_weight.append(mod.linear1.weight) - self.ffn_inter_bias.append(mod.linear1.bias) - self.ffn_out_weight.append(mod.linear2.weight) - self.ffn_out_bias.append(mod.linear2.bias) - - self.decoder_ln_weight = [model.decoder.decoder.norm.weight] - self.decoder_ln_bias = [model.decoder.decoder.norm.bias] - - self.mbart_ln_weight = [model.decoder.decoder_layernorm_embedding.weight] - self.mbart_ln_bias = [model.decoder.decoder_layernorm_embedding.bias] - - self.pos_emb = [model.decoder.decoder_embed_positions.weight] - self.word_emb = [model.decoder.embed_tokens.weight] - - setattr(self, "lm_head_weight_", model.lm_head_weight.t()) - self.linear_weight = [getattr(self, "lm_head_weight_")] - self.linear_bias = [model.final_logits_bias] - - def forward( - self, - enc_output, - memory_seq_lens, - trg_word=None, - beam_size=4, - top_k=1, - top_p=0.0, - decoding_strategy="beam_search_v3", - max_out_len=256, - diversity_rate=0.0, - rel_len=False, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - alpha=0.6, - temperature=1.0, - early_stopping=False, - ): - # Beam_search/beam_search_v2/beam_search_v3 should be corrected to beam_search_v3. 
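The same strategy normalization recurs in the forward method of each of these wrappers (UnifiedTransformer, MIRO, BART, MBart, Pegasus): any beam_search* variant collapses to beam_search_v3, greedy search becomes top-k sampling with k=1, and top-k / top-p sampling are mutually exclusive. A compact sketch of that rule:

def normalize_strategy(decoding_strategy, top_k, top_p):
    if decoding_strategy.startswith("beam_search"):
        return "beam_search_v3", top_k, top_p
    if decoding_strategy == "greedy_search":
        return "topk_sampling", 1, 0.0
    if decoding_strategy in ["sampling", "topk_sampling", "topp_sampling"]:
        if top_p == 1 and top_k > 0:
            return "topk_sampling", top_k, 0.0
        if top_p > 0 and top_k == 0:
            return "topp_sampling", top_k, top_p
        raise AttributeError("Only top-k sampling or top-p sampling is supported, not both at once.")
    return decoding_strategy, top_k, top_p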
- if decoding_strategy.startswith("beam_search"): - decoding_strategy = "beam_search_v3" - elif decoding_strategy == "greedy_search": - decoding_strategy = "topk_sampling" - top_k = 1 - top_p = 0.0 - elif decoding_strategy in ["sampling", "topk_sampling", "topp_sampling"]: - if top_p == 1 and top_k > 0: - decoding_strategy = "topk_sampling" - top_p = 0.0 - elif top_p > 0 and top_k == 0: - decoding_strategy = "topp_sampling" - else: - raise AttributeError( - "Only topk sampling or topp sampling are supported. " - "Topk sampling and topp sampling cannot be both applied in the fast version. " - ) - output_ids, parent_ids, sequence_length = infer_mbart_decoding( - [enc_output], - [memory_seq_lens], - self.word_emb, - self.slf_ln_weight, - self.slf_ln_bias, - self.slf_q_weight, - self.slf_q_bias, - self.slf_k_weight, - self.slf_k_bias, - self.slf_v_weight, - self.slf_v_bias, - self.slf_out_weight, - self.slf_out_bias, - self.cross_ln_weight, - self.cross_ln_bias, - self.cross_q_weight, - self.cross_q_bias, - self.cross_k_weight, - self.cross_k_bias, - self.cross_v_weight, - self.cross_v_bias, - self.cross_out_weight, - self.cross_out_bias, - self.ffn_ln_weight, - self.ffn_ln_bias, - self.ffn_inter_weight, - self.ffn_inter_bias, - self.ffn_out_weight, - self.ffn_out_bias, - self.decoder_ln_weight, - self.decoder_ln_bias, - self.mbart_ln_weight, - self.mbart_ln_bias, - self.linear_weight, - self.linear_bias, - self.pos_emb, - trg_word, - decoding_strategy, - beam_size, - top_k, - top_p, - self._n_head, - int(self._d_model / self._n_head), - self._num_decoder_layers, - bos_token_id, - eos_token_id, - max_out_len, - -diversity_rate, - rel_len, - alpha, - temperature, - early_stopping, - self._hidden_act, - ) - - ids = finalize(beam_size, output_ids, parent_ids, sequence_length, decoding_strategy=decoding_strategy) - return ids - - -def convert_gptj_params(fast_model, model, fuse_qkv=1, use_fp16=False, restore_data=False, permutation=None): - r""" - Convert parameters included in Transformer layer from original models - to the format of faster models. - - Args: - fast_model (Layer): The faster model object. - model (Layer): The Transformer layer. - fuse_qkv (int): 0 for nofuse, 1 for fuse, 2 for fuse and delete the - unfused parameters. If environment variable `PPFG_QKV_MEM_OPT` is - set and the weights of q/k/v is fused, it will try to delete the - original unfused weights. Note the rollback to original model would - not be guarantee anymore when the faster model failed if the original - weights are deleted. Default to 1. - use_fp16 (bool): Whether to use float16. Maybe we should use the default - dtype as the highest priority later. Default to `False`. - restore_data (bool): If `False`, need to reload the weight values. It - should be `True` for weight loaded models. Default to `False`. - - Returns: - defaultdict: Each value is a list including converted parameters in all - layers. For other parameters not included in Transformer module to - be converted, such as embeddings, you can achieve it by using the - returned dict `params` though `params['word_emb'].append()` directly - which would do CPU/GPU and fp32/fp16 transfer automatically. - """ - if fuse_qkv == 1: - fuse_qkv = 2 if os.getenv("PPFG_QKV_MEM_OPT", "0") == "1" else 1 - ft_para_conf = get_ft_para_conf() - - class _list(list): - def append(self, item): - def attr_handle_func(x): - return x - - if isinstance(item[0], nn.Layer): - # Axis is used for tensor slice in tensor parallel. - # Use None to make no slice on the tensor. 
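The fuse_qkv argument documented above has three effective values, and value 1 is promoted to 2 when the PPFG_QKV_MEM_OPT environment variable is set, trading the ability to roll back to the original weights for lower memory use. A minimal sketch of that switch:

import os

def resolve_fuse_qkv(fuse_qkv=1):
    # 0: keep q/k/v separate, 1: fuse, 2: fuse and delete the unfused copies.
    if fuse_qkv == 1 and os.getenv("PPFG_QKV_MEM_OPT", "0") == "1":
        return 2
    return fuse_qkv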
- if len(item) == 2: - layer, attr = item - axis = None - else: - layer, attr, axis = item - param = getattr(layer, attr) - if axis is not None and isinstance(layer, nn.Linear): - param = ft_para_conf.slice_weight(param, axis) - param = transfer_param( - param, - is_bias=attr.endswith("bias"), - dtype="float16" if use_fp16 else "float32", - restore_data=restore_data, - ) - # NOTE: Assignment to parameter 'weight' should be of type - # Parameter or None, thus delete first in case of param is - # a tensor. - # TODO(guosheng): Make slice_weight use `output_param=True` - # and remove delattr. Currently, if `param` is Tensor rather - # than Parameter, it would not be in state_dict. - delattr(layer, attr) - setattr(layer, attr, param) - else: - # NOTE: Compared with if branch, there is no layer attribute - # refered to the transfered param, thus we should set it as - # the layer attribute to be able to convert to static graph. - # Additionally, we suppose no need to process tensor parallel - # here since the param passed in might have been processed. - if len(item) == 2: - param, is_bias = item - attr_handle = attr_handle_func - else: - param, is_bias, attr_handle = item - param = transfer_param( - param, is_bias=is_bias, dtype="float16" if use_fp16 else "float32", restore_data=restore_data - ) - attr_handle(param) - return super().append(param) - - params = defaultdict(_list) - - def _convert(module): - num_layer = len(module) - for i, layer in enumerate(module): - if not ft_para_conf.is_load(i, num_layer): - continue - # TODO(guosheng): Tensor with size 0 might be failed in - # paddle develop, thus use tensor with size 1 instead - # temporarily. Besides, we use 2D tensor since jit log - # requires that on linear weight. While size 0 seems all - # right in jit.to_static/jit.save. - dummy_tensor = paddle.zeros([1, 1]) - if permutation is not None: - qkv = layer.attn.qkv_proj.weight.numpy() - qkv = qkv[:, permutation] - if fuse_qkv == 2: - del layer.attn.qkv_proj.weight - setattr(layer.attn.qkv_proj, "weight", dummy_tensor) - w = paddle.to_tensor(qkv) - else: - w = _convert_qkv( - layer.attn.q_proj, - layer.attn.k_proj, - layer.attn.v_proj, - attr="weight", - use_numpy=fuse_qkv == 2, - del_param=fuse_qkv == 2, - dummy_tensor=dummy_tensor, - ) - params["slf_q_weight"].append((w, False)) - # NOTE: Use `params["slf_q_weight"][-1]` rather than `w`, - # since the appended tensor might be a new transfered tensor. - # Besides, to allow convert_params be called more than once, - # we find a attr name not existing to avoid overwriting the - # existing attr. - attr = "slf_q_weight_" + str(i) - while hasattr(fast_model, attr): - attr += "_" - setattr(fast_model, attr, params["slf_q_weight"][-1]) - - params["slf_out_weight"].append((layer.attn.out_proj, "weight", 0)) - params["slf_ln_weight"].append((layer.ln_1, "weight")) - params["slf_ln_bias"].append((layer.ln_1, "bias")) - # Slice tensor when append according to axis(1 or 0) if parallel - # is enable. 
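For CodeGen-style checkpoints, InferGptJDecoding below builds a column permutation so the interleaved blocks of the fused qkv projection are reordered into the layout the fused op expects; the slice qkv[:, permutation] above then applies it. A small sketch of the index construction, with a hypothetically tiny width just for illustration:

import numpy as np

n_embd = 12                      # hypothetical; real models take this from the config
local_dim = n_embd // 4
base_permutation = [0, 3, 6, 9, 2, 5, 8, 11, 1, 4, 7, 10]
permutation = np.concatenate([np.arange(i * local_dim, (i + 1) * local_dim) for i in base_permutation])
# permutation has 3 * n_embd entries, one per column of the fused qkv weight.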
- params["ffn_inter_weight"].append((layer.mlp.fc_in, "weight", 1)) - params["ffn_inter_bias"].append((layer.mlp.fc_in, "bias", 1)) - params["ffn_out_weight"].append((layer.mlp.fc_out, "weight", 0)) - params["ffn_out_bias"].append((layer.mlp.fc_out, "bias")) - - _convert(model) - return params - - -class InferGptJDecoding(nn.Layer): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False, transpose_qkv=False): - if decoding_lib is not None and os.path.isfile(decoding_lib): - if "FastGeneration" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(decoding_lib) - LOADED_EXT["FastGeneration"] = ops - else: - if decoding_lib is not None: - logger.warning("The specified decoding_lib does not exist, and it will be built automatically.") - load( - "FastGeneration" if get_ft_para_conf().no_para else "FasterTransformerParallel", - verbose=True, - need_parallel=not get_ft_para_conf().no_para, - ) - - super(InferGptJDecoding, self).__init__() - - self.use_fp16_decoding = use_fp16_decoding - self.model = model - self.head_num = self.model.transformer.config["n_head"] - self.size_per_head = int(self.model.transformer.config["n_embd"] / self.head_num) - self.num_layer = self.model.transformer.config["n_layer"] - self.rotary_embedding_dim = self.model.transformer.config["rotary_dim"] - logger.info("Converting model weights, it will cost a few seconds.....") - permutation = None - if transpose_qkv: - # GPTJ is different with CodeGen in attention project layer. - local_dim = self.model.transformer.config["n_embd"] // 4 - base_permutation = [0, 3, 6, 9, 2, 5, 8, 11, 1, 4, 7, 10] - permutation = np.concatenate([np.arange(i * local_dim, (i + 1) * local_dim) for i in base_permutation]) - params = convert_gptj_params( - self, - model.transformer.h, - fuse_qkv=2, - use_fp16=use_fp16_decoding, - restore_data=True, - permutation=permutation, - ) - - params["word_emb"].append((self.model.transformer.wte, "weight")) - params["decoder_ln_weight"].append((self.model.transformer.ln_f, "weight")) - params["decoder_ln_bias"].append((self.model.transformer.ln_f, "bias")) - params["linear_weight"].append((self.model.lm_head.weight.t(), partial(setattr, self, "linear_weight_out"))) - params["linear_bias"].append((self.model.lm_head, "bias")) - - for k, v in params.items(): - setattr(self, k, v) - logger.info("Already converted model weights.") - - def forward( - self, - input_ids, - mem_seq_len, - attention_mask=None, - topk=4, - topp=0.0, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - forced_eos_token_id=None, - max_out_len=256, - temperature=1, - repetition_penalty=1.0, - min_length=0, - ): - if attention_mask is None: - batch_size, input_length = input_ids.shape - attention_mask = paddle.unsqueeze((input_ids != pad_token_id).astype("float32"), axis=[1]) - causal_mask = paddle.tril(paddle.ones([batch_size, input_length, input_length], dtype="float32")) - attention_mask = paddle.logical_and(attention_mask, causal_mask) - if not self.use_fp16_decoding: - attention_mask = paddle.cast(attention_mask, dtype="float32") - else: - attention_mask = paddle.cast(attention_mask, dtype="float16") - - if self.use_fp16_decoding and attention_mask.dtype == paddle.float32: - attention_mask = paddle.cast(attention_mask, dtype="float16") - - (output_ids,) = infer_gptj_decoding( - input=[input_ids], - attn_mask=[attention_mask], - mem_seq_len=[mem_seq_len], - word_emb=self.word_emb, - slf_ln_weight=self.slf_ln_weight, - slf_ln_bias=self.slf_ln_bias, - 
slf_q_weight=self.slf_q_weight, - slf_out_weight=self.slf_out_weight, - ffn_inter_weight=self.ffn_inter_weight, - ffn_inter_bias=self.ffn_inter_bias, - ffn_out_weight=self.ffn_out_weight, - ffn_out_bias=self.ffn_out_bias, - decoder_ln_weight=self.decoder_ln_weight, - decoder_ln_bias=self.decoder_ln_bias, - linear_weight=self.linear_weight, - linear_bias=self.linear_bias, - topk=topk, - topp=topp, - max_out_len=max_out_len, - head_num=self.head_num, - size_per_head=self.size_per_head, - num_layer=self.num_layer, - bos_id=bos_token_id, - eos_id=eos_token_id, - temperature=temperature, - rotary_embedding_dim=self.rotary_embedding_dim, - repetition_penalty=repetition_penalty, - min_length=min_length, - use_fp16_decoding=self.use_fp16_decoding, - ) - - output_ids = output_ids[input_ids.shape[-1] :, :] - if forced_eos_token_id is not None: - output_ids[:, -1] = forced_eos_token_id - return output_ids - - -class InferPegasusDecoding(nn.Layer): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False, hidden_act="gelu"): - if decoding_lib is not None and os.path.isfile(decoding_lib): - # Maybe it has been loadad by `ext_utils.load` - if "FastGeneration" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(decoding_lib) - LOADED_EXT["FastGeneration"] = ops - else: - if decoding_lib is not None: - logger.warning("The specified decoding_lib does not exist, and it will be built automatically.") - load("FastGeneration", verbose=True) - - super(InferPegasusDecoding, self).__init__() - self._hidden_act = hidden_act - self._num_decoder_layers = model.pegasus.config["num_decoder_layers"] - self._n_head = model.pegasus.config["decoder_attention_heads"] - self._d_model = model.pegasus.config["d_model"] - - params = convert_params(self, model.decoder.decoder, fuse_qkv=2, use_fp16=use_fp16_decoding, restore_data=True) - - self.decoder_ln_weight = [ - transfer_param( - model.decoder.decoder_layernorm.weight, - is_bias=False, - dtype="float16" if use_fp16_decoding else "float32", - restore_data=True, - ) - ] - self.decoder_ln_bias = [ - transfer_param( - model.decoder.decoder_layernorm.bias, - is_bias=True, - dtype="float16" if use_fp16_decoding else "float32", - restore_data=True, - ) - ] - - self.pos_emb = [ - transfer_param( - model.decoder.decoder_embed_positions.weight, - is_bias=False, - dtype="float16" if use_fp16_decoding else "float32", - restore_data=True, - ) - ] - self.word_emb = [ - transfer_param( - model.decoder.embed_tokens.weight, - is_bias=False, - dtype="float16" if use_fp16_decoding else "float32", - restore_data=True, - ) - ] - setattr( - self, - "lm_head_weight_", - transfer_param( - model.lm_head_weight.t(), - is_bias=False, - dtype="float16" if use_fp16_decoding else "float32", - restore_data=True, - ), - ) - self.linear_weight = [getattr(self, "lm_head_weight_")] - self.linear_bias = [ - transfer_param( - model.final_logits_bias, - is_bias=True, - dtype="float16" if use_fp16_decoding else "float32", - restore_data=True, - ) - ] - for k, v in params.items(): - setattr(self, k, v) - - def forward( - self, - enc_output, - memory_seq_lens, - beam_size=4, - top_k=1, - top_p=0.0, - decoding_strategy="beam_search_v3", - max_out_len=256, - min_out_len=256, - diversity_rate=0.0, - rel_len=False, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - alpha=0.6, - temperature=1.0, - early_stopping=False, - forced_eos_token_id=None, - ): - # Beam_search/beam_search_v2/beam_search_v3 should be corrected to beam_search_v3. 
- if decoding_strategy.startswith("beam_search"): - decoding_strategy = "beam_search_v3" - elif decoding_strategy == "greedy_search": - decoding_strategy = "topk_sampling" - top_k = 1 - top_p = 0.0 - elif decoding_strategy in ["sampling", "topk_sampling", "topp_sampling"]: - if top_p == 1 and top_k > 0: - decoding_strategy = "topk_sampling" - top_p = 0.0 - elif top_p > 0 and top_k == 0: - decoding_strategy = "topp_sampling" - else: - raise AttributeError( - "Only topk sampling or topp sampling are supported. " - "Topk sampling and topp sampling cannot be both applied in the fast version. " - ) - output_ids, parent_ids, sequence_length = infer_pegasus_decoding( - [enc_output], - [memory_seq_lens], - self.word_emb, - self.slf_ln_weight, - self.slf_ln_bias, - self.slf_q_weight, - self.slf_q_bias, - self.slf_k_weight, - self.slf_k_bias, - self.slf_v_weight, - self.slf_v_bias, - self.slf_out_weight, - self.slf_out_bias, - self.cross_ln_weight, - self.cross_ln_bias, - self.cross_q_weight, - self.cross_q_bias, - self.cross_k_weight, - self.cross_k_bias, - self.cross_v_weight, - self.cross_v_bias, - self.cross_out_weight, - self.cross_out_bias, - self.ffn_ln_weight, - self.ffn_ln_bias, - self.ffn_inter_weight, - self.ffn_inter_bias, - self.ffn_out_weight, - self.ffn_out_bias, - self.decoder_ln_weight, - self.decoder_ln_bias, - self.linear_weight, - self.linear_bias, - self.pos_emb, - decoding_strategy, - beam_size, - top_k, - top_p, - self._n_head, - int(self._d_model / self._n_head), - self._num_decoder_layers, - bos_token_id, - eos_token_id, - max_out_len, - min_out_len, - diversity_rate, - rel_len, - alpha, - temperature, - early_stopping, - self._hidden_act, - ) - - ids = finalize( - beam_size, - output_ids, - parent_ids, - sequence_length, - forced_eos_token_id=forced_eos_token_id, - decoding_strategy=decoding_strategy, - ) - return ids - - -class InferT5Decoding(InferBase): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False): - - if decoding_lib is not None and os.path.isfile(decoding_lib): - # Maybe it has been loadad by `ext_utils.load` - if "FastGeneration" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(decoding_lib) - LOADED_EXT["FastGeneration"] = ops - else: - if decoding_lib is not None: - logger.warning("The specified decoding_lib does not exist, and it will be built automatically.") - load("FastGeneration", verbose=True) - - super(InferT5Decoding, self).__init__(use_fp16_decoding) - for arg, value in locals().items(): - if arg not in ["self", "model"]: - setattr(self, "_" + arg, value) - - self._num_decoder_layers = model.config.num_decoder_layers - self._n_head = model.config.num_heads - self._d_model = model.config.d_model - self._relative_attention_num_buckets = model.config.relative_attention_num_buckets - self.tie_word_embeddings = model.config.tie_word_embeddings - self.act = model.config.feed_forward_proj - - if "gelu" in self.act: - self.act = "gelu" - elif "relu" in self.act: - self.act = "relu" - else: - raise ValueError("Only gelu and relu are available in Faster. ") - - # NOTE: using config when support. 
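The branchy strategy handling above (repeated for both the Pegasus and T5 wrappers) reduces to a small mapping. The sketch below restates it as a standalone function for readability; it is an illustration inferred from the removed code, not part of the library API:

def normalize_strategy(strategy, top_k, top_p):
    # All beam-search variants are routed to the v3 kernel.
    if strategy.startswith("beam_search"):
        return "beam_search_v3", top_k, top_p
    # Greedy search is expressed as top-k sampling with k = 1.
    if strategy == "greedy_search":
        return "topk_sampling", 1, 0.0
    if strategy in ("sampling", "topk_sampling", "topp_sampling"):
        if top_p == 1 and top_k > 0:
            return "topk_sampling", top_k, 0.0
        if top_p > 0 and top_k == 0:
            return "topp_sampling", top_k, top_p
        raise AttributeError(
            "Top-k sampling and top-p sampling cannot both be applied in the fast version."
        )
    return strategy, top_k, top_p

assert normalize_strategy("beam_search_v2", 4, 0.0)[0] == "beam_search_v3"
assert normalize_strategy("greedy_search", 4, 0.9) == ("topk_sampling", 1, 0.0)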
- self._max_distance = 128 - - params = convert_params(self, model.t5.decoder, fuse_qkv=2, use_fp16=use_fp16_decoding, restore_data=True) - - self.decoder_ln_weight = [ - transfer_param( - model.t5.decoder.final_layer_norm.weight, - is_bias=False, - dtype="float16" if use_fp16_decoding else "float32", - restore_data=True, - ) - ] - - self.word_emb = [ - transfer_param( - model.t5.decoder.embed_tokens.weight, - is_bias=False, - dtype="float16" if use_fp16_decoding else "float32", - restore_data=True, - ) - ] - - if self.tie_word_embeddings: - setattr( - self, - "lm_head_weight_", - transfer_param( - model.t5.decoder.embed_tokens.weight.t(), - is_bias=False, - dtype="float16" if use_fp16_decoding else "float32", - restore_data=True, - ), - ) - else: - setattr( - self, - "lm_head_weight_", - transfer_param( - paddle.assign(model.lm_head.weight), - is_bias=False, - dtype="float16" if use_fp16_decoding else "float32", - restore_data=True, - ), - ) - - self.linear_weight = [getattr(self, "lm_head_weight_")] - self.linear_bias = self.default_bias(self.linear_weight, 1) - - setattr( - self, - "relative_attn_bias_w", - transfer_param( - model.t5.decoder.block[0].layer[0].SelfAttention.relative_attention_bias.weight, - is_bias=False, - dtype="float16" if use_fp16_decoding else "float32", - restore_data=True, - ), - ) - self.relative_attention_bias_weight = [getattr(self, "relative_attn_bias_w")] - for k, v in params.items(): - setattr(self, k, v) - - self.zeros_t = paddle.zeros(shape=[1, 1], dtype="float16" if use_fp16_decoding else "float32") - if getattr(self, "slf_k_weight", None) is None: - self.slf_k_weight = [self.zeros_t] * model.t5.config["num_decoder_layers"] - if getattr(self, "slf_v_weight", None) is None: - self.slf_v_weight = [self.zeros_t] * model.t5.config["num_decoder_layers"] - - def forward( - self, - enc_output, - memory_seq_lens, - beam_size=4, - top_k=1, - top_p=0.0, - decoding_strategy="beam_search_v3", - max_out_len=256, - diversity_rate=0.0, - rel_len=False, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - alpha=0.6, - temperature=1.0, - early_stopping=False, - ): - # Beam_search/beam_search_v2/beam_search_v3 should be corrected to beam_search_v3. - if decoding_strategy.startswith("beam_search"): - decoding_strategy = "beam_search_v3" - elif decoding_strategy == "greedy_search": - decoding_strategy = "topk_sampling" - top_k = 1 - top_p = 0.0 - elif decoding_strategy in ["sampling", "topk_sampling", "topp_sampling"]: - if top_p == 1 and top_k > 0: - decoding_strategy = "topk_sampling" - top_p = 0.0 - elif top_p > 0 and top_k == 0: - decoding_strategy = "topp_sampling" - else: - raise AttributeError( - "Only topk sampling or topp sampling are supported. " - "Topk sampling and topp sampling cannot be both applied in the fast version. 
" - ) - - output_ids, parent_ids, sequence_length = infer_t5_decoding( - enc_output=[enc_output], - memory_seq_lens=[memory_seq_lens], - word_emb=self.word_emb, - slf_ln_weight=self.slf_ln_weight, - slf_ln_bias=getattr(self, "slf_ln_bias", self.default_bias(self.slf_ln_weight, 0, True)), - slf_q_weight=self.slf_q_weight, - slf_q_bias=getattr(self, "slf_q_bias", self.default_bias(self.slf_q_weight, 1)), - slf_k_weight=self.slf_k_weight, - slf_k_bias=getattr(self, "slf_k_bias", self.default_bias(self.slf_k_weight, 1)), - slf_v_weight=self.slf_v_weight, - slf_v_bias=getattr(self, "slf_v_bias", self.default_bias(self.slf_v_weight, 1)), - slf_out_weight=self.slf_out_weight, - slf_out_bias=getattr(self, "slf_out_bias", self.default_bias(self.slf_out_weight, 1)), - relative_attention_bias_weight=self.relative_attention_bias_weight, - cross_ln_weight=self.cross_ln_weight, - cross_ln_bias=getattr(self, "cross_ln_bias", self.default_bias(self.cross_ln_weight, 0, True)), - cross_q_weight=self.cross_q_weight, - cross_q_bias=getattr(self, "cross_q_bias", self.default_bias(self.cross_q_weight, 1)), - cross_k_weight=self.cross_k_weight, - cross_k_bias=getattr(self, "cross_k_bias", self.default_bias(self.cross_k_weight, 1)), - cross_v_weight=self.cross_v_weight, - cross_v_bias=getattr(self, "cross_v_bias", self.default_bias(self.cross_v_weight, 1)), - cross_out_weight=self.cross_out_weight, - cross_out_bias=getattr(self, "cross_out_bias", self.default_bias(self.cross_out_weight, 1)), - ffn_ln_weight=self.ffn_ln_weight, - ffn_ln_bias=getattr(self, "ffn_ln_bias", self.default_bias(self.ffn_ln_weight, 0, True)), - ffn_inter_weight_0=self.ffn_inter_weight_0, - ffn_inter_bias_0=getattr(self, "ffn_inter_bias_0", self.default_bias(self.ffn_inter_weight_0, 1)), - ffn_inter_weight_1=getattr( - self, "ffn_inter_weight_1", self.default_bias(self.ffn_inter_weight_0, 1, True) - ), - ffn_inter_bias_1=getattr(self, "ffn_inter_bias_1", self.default_bias(self.ffn_inter_weight_1, 1)) - if hasattr(self, "ffn_inter_weight_1") - else getattr(self, "ffn_inter_bias_1", self.default_bias(self.ffn_inter_weight_0, 1, True)), - ffn_out_weight=self.ffn_out_weight, - ffn_out_bias=getattr(self, "ffn_out_bias", self.default_bias(self.ffn_out_weight, 1)), - decoder_ln_weight=self.decoder_ln_weight, - decoder_ln_bias=getattr(self, "decoder_ln_bias", self.default_bias(self.decoder_ln_weight, 0, True)), - linear_weight=self.linear_weight, - linear_bias=getattr(self, "linear_bias", self.default_bias(self.linear_weight, 1)), - decoding_strategy=decoding_strategy, - beam_size=beam_size, - top_k=top_k, - top_p=top_p, - head_num=self._n_head, - size_per_head=int(self._d_model / self._n_head), - num_decoder_layers=self._num_decoder_layers, - start_id=bos_token_id, - end_id=eos_token_id, - max_out_len=max_out_len, - diversity_rate=-diversity_rate, - rel_len=rel_len, - alpha=alpha, - temperature=temperature, - early_stopping=early_stopping, - max_distance=self._max_distance, - relative_attention_num_buckets=self._relative_attention_num_buckets, - tie_word_embeddings=self.tie_word_embeddings, - act=self.act, - ) - - ids = finalize(beam_size, output_ids, parent_ids, sequence_length, decoding_strategy=decoding_strategy) - - return ids diff --git a/paddlenlp/ops/fast_transformer/transformer/encoder.py b/paddlenlp/ops/fast_transformer/transformer/encoder.py deleted file mode 100644 index da14723dbb54..000000000000 --- a/paddlenlp/ops/fast_transformer/transformer/encoder.py +++ /dev/null @@ -1,456 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -from paddle.nn import TransformerEncoder, TransformerEncoderLayer - -from paddlenlp.ops.ext_utils import LOADED_EXT, load -from paddlenlp.ops.fast_transformer.transformer.decoding import transfer_param -from paddlenlp.utils.log import logger - -from .decoding import run_custom - - -def infer_transformer_encoder( - input, - attn_mask, - q_weight, - q_bias, - k_weight, - k_bias, - v_weight, - v_bias, - attn_out_weight, - attn_out_bias, - norm1_weight, - norm1_bias, - norm2_weight, - norm2_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - # sequence_id_offset, - # trt_seqlen_offset, - # amax_list, - n_head, - size_per_head, - n_layer=12, - use_gelu=True, - remove_padding=False, - int8_mode=0, - layer_idx=0, - allow_gemm_test=False, - use_trt_kernel=False, - normalize_before=False, -): - """ - Fusion Encoder API intergrating Encoder inference in FastGeneration. It - accepts the weight and bias of TransformerEncoder and some other parameters - for inference. - """ - inputs_names = [ - "Input", - "SelfAttnMask", - "SelfQueryWeight@VECTOR", - "SelfQueryBias@VECTOR", - "SelfKeyWeight@VECTOR", - "SelfKeyBias@VECTOR", - "SelfValueWeight@VECTOR", - "SelfValueBias@VECTOR", - "SelfAttnOutputWeight@VECTOR", - "SelfAttnOutputBias@VECTOR", - "SelfAttnOutputLayernormWeight@VECTOR", - "SelfAttnOutputLayernormBias@VECTOR", - "OutputLayernormWeight@VECTOR", - "OutputLayernormBias@VECTOR", - "FFNInterWeight@VECTOR", - "FFNInterBias@VECTOR", - "FFNOutputWeight@VECTOR", - "FFNOutputBias@VECTOR", - # 'SequenceIdOffset', - # "TRTSeqLenOffset", - # 'AmaxList' - ] - - inputs_var = [ - input, - attn_mask, - q_weight, - q_bias, - k_weight, - k_bias, - v_weight, - v_bias, - attn_out_weight, - attn_out_bias, - norm1_weight, - norm1_bias, - norm2_weight, - norm2_bias, - ffn_inter_weight, - ffn_inter_bias, - ffn_out_weight, - ffn_out_bias, - # 'SequenceIdOffset': sequence_id_offset, - # "TRTSeqLenOffset": trt_seqlen_offset, - # 'AmaxList': amax_list - ] - - attrs_names = [ - "head_num", - "size_per_head", - "use_gelu", - "remove_padding", - "int8_mode", - "num_layer", - "layer_idx", - "allow_gemm_test", - "use_trt_kernel", - "normalize_before", - ] - - attrs_val = [ - n_head, - size_per_head, - use_gelu, - remove_padding, - int8_mode, - n_layer, - layer_idx, - allow_gemm_test, - use_trt_kernel, - normalize_before, - ] - - outputs_names = ["EncoderOut"] - - outputs_dtype = [input[0].dtype] - - return run_custom("fusion_encoder", inputs_names, inputs_var, attrs_names, attrs_val, outputs_names, outputs_dtype) - - -def encoder_layer_forward(self, src, src_mask, cache=None, sequence_id_offset=None, trt_seq_len=None): - """ - Redefines `forward` function of `paddle.nn.TransformerEncoderLayer` for - integrating FastGeneration for inference. - - The original `forward` function would not be replaced unless - `enable_fast_encoder` is called by objects of its base class. 
After - replacing, objects of `paddle.nn.TransformerEncoderLayer` also have the - same member variables as before. - - After inference, `disable_fast_encoder` could be called to restore the - `forward` function of `paddle.nn.TransformerEncoder` and - `paddle.nn.TransformerEncoderLayer`. - - Args: - src (Tensor): - The input of Transformer encoder layer. It is a tensor with shape - `[batch_size, sequence_length, d_model]`. The data type should be - float32 or float64. - src_mask (Tensor, optional): - A tensor used in multi-head attention to prevents attention to some - unwanted positions, usually the paddings or the subsequent - positions. It is a tensor with shape `[batch_size, 1, 1, sequence_length]`. - When the data type is bool, the unwanted positions have `False` - values and the others have `True` values. When the data type is int, - the unwanted positions have 0 values and the others have 1 values. - When the data type is float, the unwanted positions have `-INF` - values and the others have 0 values. It can be None when nothing - wanted or needed to be prevented attention to. Defaults to None. - - Returns: - src(Tensor|tuple): - It is a tensor that has the same shape and data type as `enc_input`, - representing the output of Transformer encoder layer. Or a tuple if - `cache` is not None, except for encoder layer output, the tuple - includes the new cache which is same as input `cache` argument but - `incremental_cache` has an incremental length. See - `paddle.nn.MultiHeadAttention.gen_cache` and - `paddle.nn.MultiHeadAttention.forward` for more details. - """ - if cache is not None: - raise NotImplementedError("cache in encoder is not supported now") - - src = infer_transformer_encoder( - input=[src], - attn_mask=[src_mask], - q_weight=[self.self_attn.q_proj.weight], - q_bias=[self.self_attn.q_proj.bias], - k_weight=[self.self_attn.k_proj.weight], - k_bias=[self.self_attn.k_proj.bias], - v_weight=[self.self_attn.v_proj.weight], - v_bias=[self.self_attn.v_proj.bias], - attn_out_weight=[self.self_attn.out_proj.weight], - attn_out_bias=[self.self_attn.out_proj.bias], - norm1_weight=[self.norm1.weight], - norm1_bias=[self.norm1.bias], - norm2_weight=[self.norm2.weight], - norm2_bias=[self.norm2.bias], - ffn_inter_weight=[self.linear1.weight], - ffn_inter_bias=[self.linear1.bias], - ffn_out_weight=[self.linear2.weight], - ffn_out_bias=[self.linear2.bias], - # sequence_id_offset=paddle.to_tensor([]), - # trt_seqlen_offset=paddle.to_tensor([]), - # amax_list=paddle.to_tensor([]), # int8 mode is not supported. - n_head=self._config["nhead"], - size_per_head=self._config["d_model"] // self._config["nhead"], - use_gelu=self._config["activation"] == "gelu", - normalize_before=self._config["normalize_before"] is True, - ) - - return src - - -def encoder_forward(self, src, src_mask=None, cache=None): - """ - Redefines `forward` function of `paddle.nn.TransformerEncoder` for - integrating FastGeneration for inference. - - The original `forward` function would not be replaced unless - `enable_fast_encoder` is called by objects of its base class. After - replacing, objects of `paddle.nn.TransformerEncoder` also have the same - member variables as before. - - After inference, `disable_fast_encoder` could be called to restore the - `forward` function of `paddle.nn.TransformerEncoder` and - `paddle.nn.TransformerEncoderLayer`. - - Args: - src (Tensor): - The input of Transformer encoder. It is a tensor - with shape `[batch_size, sequence_length, d_model]`. The data - type should be float32 or float16. 
- src_mask (Tensor, optional): - A tensor used in multi-head attention to prevents attention to - some unwanted positions, usually the paddings or the subsequent - positions. It is a tensor with shape `[batch_size, 1, 1, sequence_length]`. - The data type must be float, the unwanted positions have `-INF` values or other non-zeros - and the wanted positions must be 0.0. - Returns: - output (Tensor|tuple): - It is a tensor that has the same shape and data type as `src`, - representing the output of Transformer encoder. Or a tuple if - `cache` is not None, except for encoder output, the tuple includes - the new cache which is same as input `cache` argument but - `incremental_cache` in it has an incremental length. See - `paddle.nn.MultiHeadAttention.gen_cache` and - `paddle.nn.MultiHeadAttention.forward` for more details. - """ - if cache is not None: - raise NotImplementedError("cache in encoder is not supported now") - - if src_mask.dtype == paddle.float16: - src_mask = paddle.cast(src_mask, dtype="float32") - src_mask = src_mask == 0.0 - if src_mask.dtype != src.dtype: - src_mask = paddle.cast(src_mask, src.dtype) - - if len(src_mask.shape) == 4: - # transpose_src_mask: [batch_size, 1, sequence_length, 1] - transpose_src_mask = paddle.transpose(src_mask, perm=[0, 1, 3, 2]) - # src_mask: [batch_size, 1, sequence_length, sequence_length] - src_mask = src_mask * transpose_src_mask - - if getattr(self, "q_weight", None) is None: - self.q_weight = [] - self.q_bias = [] - self.k_weight = [] - self.k_bias = [] - self.v_weight = [] - self.v_bias = [] - self.attn_out_weight = [] - self.attn_out_bias = [] - self.norm1_weight = [] - self.norm1_bias = [] - self.norm2_weight = [] - self.norm2_bias = [] - self.ffn_inter_weight = [] - self.ffn_inter_bias = [] - self.ffn_out_weight = [] - self.ffn_out_bias = [] - for layer in self.layers: - self.q_weight.append(layer.self_attn.q_proj.weight) - self.q_bias.append(layer.self_attn.q_proj.bias) - self.k_weight.append(layer.self_attn.k_proj.weight) - self.k_bias.append(layer.self_attn.k_proj.bias) - self.v_weight.append(layer.self_attn.v_proj.weight) - self.v_bias.append(layer.self_attn.v_proj.bias) - self.attn_out_weight.append(layer.self_attn.out_proj.weight) - self.attn_out_bias.append(layer.self_attn.out_proj.bias) - self.norm1_weight.append(layer.norm1.weight) - self.norm1_bias.append(layer.norm1.bias) - self.norm2_weight.append(layer.norm2.weight) - self.norm2_bias.append(layer.norm2.bias) - self.ffn_inter_weight.append(layer.linear1.weight) - self.ffn_inter_bias.append(layer.linear1.bias) - self.ffn_out_weight.append(layer.linear2.weight) - self.ffn_out_bias.append(layer.linear2.bias) - - output = infer_transformer_encoder( - input=[src], - attn_mask=[src_mask], - q_weight=self.q_weight, - q_bias=self.q_bias, - k_weight=self.k_weight, - k_bias=self.k_bias, - v_weight=self.v_weight, - v_bias=self.v_bias, - attn_out_weight=self.attn_out_weight, - attn_out_bias=self.attn_out_bias, - norm1_weight=self.norm1_weight, - norm1_bias=self.norm1_bias, - norm2_weight=self.norm2_weight, - norm2_bias=self.norm2_bias, - ffn_inter_weight=self.ffn_inter_weight, - ffn_inter_bias=self.ffn_inter_bias, - ffn_out_weight=self.ffn_out_weight, - ffn_out_bias=self.ffn_out_bias, - # sequence_id_offset=paddle.to_tensor([]), - # trt_seqlen_offset=paddle.to_tensor([]), - # amax_list=paddle.to_tensor([]), # int8 mode is not supported. 
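`encoder_forward` above turns the additive attention mask (0 for wanted positions, a large negative value for padding) into a 0/1 mask and, for 4D inputs, expands `[batch, 1, 1, seq_len]` into `[batch, 1, seq_len, seq_len]` by multiplying with its own transpose. A small numpy sketch of the same arithmetic:

import numpy as np

additive_mask = np.array([[[[0.0, 0.0, -1e9]]]], dtype="float32")   # [1, 1, 1, 3]
binary_mask = (additive_mask == 0.0).astype("float32")              # 1 = attend, 0 = masked
transposed = np.transpose(binary_mask, (0, 1, 3, 2))                # [1, 1, 3, 1]
full_mask = binary_mask * transposed                                 # [1, 1, 3, 3]

assert full_mask.shape == (1, 1, 3, 3)
assert full_mask[0, 0, 0, 2] == 0.0   # nothing may attend to the padded position
assert full_mask[0, 0, 2, 0] == 0.0   # and the padded position attends to nothing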
- n_head=self.layers[0]._config["nhead"], - size_per_head=self.layers[0]._config["d_model"] // self.layers[0]._config["nhead"], - use_gelu=self.layers[0]._config["activation"] == "gelu", - normalize_before=self.layers[0]._config["normalize_before"] is True, - ) - - if self.norm is not None: - output = self.norm(output) - return output - - -def enable_fast_encoder(self, use_fp16=False, encoder_lib=None): - """ - Compiles fusion encoder operator intergrated FastGeneration using the - method of JIT(Just-In-Time) and replaces the `forward` function of - `paddle.nn.TransformerEncoder` and `paddle.nn.TransformerEncoderLayer` - objects inherited from `self` to support inference using FastGeneration. - - Examples: - - .. code-block:: python - - from paddlenlp.ops import enable_fast_encoder, disable_fast_encoder - - model.eval() - model = enable_fast_encoder(model) - enc_out = model(src, src_mask) - model = disable_fast_encoder(model) - """ - - def init_func(layer): - if isinstance(layer, TransformerEncoderLayer): - is_usable = True - if layer._config["bias_attr"] is False: - logger.warning( - "`False` for paddle.nn.TransformerEncoder's" - " parameter `bias_attr` is not supported in " - "FastGeneration by now. The original forward" - " will be involved." - ) - is_usable = False - if layer._config["activation"] not in ("relu", "gelu"): - logger.warning("Only 'relu' or 'gelu' is supported by now. " "The original forward will be involved.") - is_usable = False - if is_usable: - layer.forward = layer._ft_forward - elif isinstance(layer, TransformerEncoder): - layer.forward = layer._ft_forward - if use_fp16: - convert_to_fp16(layer) - - if not self.training: - try: - # Pass decoding lib to prevent re-building encoder. - # Todo: check weather decoding lib have contained encoder or not. - if encoder_lib is not None: - if "FastGeneration" not in LOADED_EXT.keys(): - ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(encoder_lib) - LOADED_EXT["FastGeneration"] = ops - else: - load("FastGeneration", verbose=True) - except Exception: - logger.warning("Exception occurs when using FasterEncoder. " "The original forward will be involved. ") - return self - for layer in self.children(): - layer.apply(init_func) - return self - - -def disable_fast_encoder(self): - """ - Restores the original `forward` function of `paddle.nn.TransformerEncoder` - and `paddle.nn.TransformerEncoderLayer` objects inherited from `self`. - - Examples: - - .. code-block:: python - - from paddlenlp.ops import enable_fast_encoder, disable_fast_encoder - - model.eval() - model = enable_fast_encoder(model) - enc_out = model(src, src_mask) - model = disable_fast_encoder(model) - """ - - def init_func(layer): - if isinstance(layer, (TransformerEncoderLayer, TransformerEncoder)): - layer.forward = layer._ori_forward - - for layer in self.children(): - layer.apply(init_func) - return self - - -def convert_to_fp16(transformer_encoder): - """Convert paddle.nn.TransformerEncoder's parameter from float32 to float16 - - Args: - transformer_encoder (obeject, paddle.nn.TransformerEncoder): - The object to be converted to float16 inplaced, it must be an isinstance - of paddle.nn.TransformerEncoder. 
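`enable_fast_encoder` and `disable_fast_encoder` above work by swapping the bound `forward` of each encoder layer rather than by subclassing. A dependency-free sketch of that pattern (the `_ft_forward`/`_ori_forward` attribute names follow the removed code, while the toy class and helper functions are purely illustrative):

class ToyEncoderLayer:
    def forward(self, x):
        return ("original", x)

    def _ft_forward(self, x):
        return ("fast", x)

def enable_fast(layer):
    layer._ori_forward = layer.forward   # keep the original bound method around
    layer.forward = layer._ft_forward    # route calls to the fused implementation
    return layer

def disable_fast(layer):
    layer.forward = layer._ori_forward   # restore the original behaviour
    return layer

layer = enable_fast(ToyEncoderLayer())
assert layer.forward(1)[0] == "fast"
layer = disable_fast(layer)
assert layer.forward(1)[0] == "original"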
- """ - if not isinstance(transformer_encoder, paddle.nn.TransformerEncoder): - logger.warning( - "transformer_encoder is not isinstance of paddle.nn.TransformerEncoder, return itself with no parameters convertion.".format - ) - return transformer_encoder - else: - encoder_layers = transformer_encoder.layers - - for mod in encoder_layers: - mod.norm1.weight = transfer_param(mod.norm1.weight, restore_data=True) - mod.norm1.bias = transfer_param(mod.norm1.bias, is_bias=True, restore_data=True) - mod.norm2.weight = transfer_param(mod.norm2.weight, restore_data=True) - mod.norm2.bias = transfer_param(mod.norm2.bias, is_bias=True, restore_data=True) - - mod.linear1.weight = transfer_param(mod.linear1.weight, restore_data=True) - mod.linear1.bias = transfer_param(mod.linear1.bias, is_bias=True, restore_data=True) - - mod.self_attn.q_proj.weight = transfer_param(mod.self_attn.q_proj.weight, restore_data=True) - mod.self_attn.q_proj.bias = transfer_param(mod.self_attn.q_proj.bias, is_bias=True, restore_data=True) - mod.self_attn.k_proj.weight = transfer_param(mod.self_attn.k_proj.weight, restore_data=True) - mod.self_attn.k_proj.bias = transfer_param(mod.self_attn.k_proj.bias, is_bias=True, restore_data=True) - mod.self_attn.v_proj.weight = transfer_param(mod.self_attn.v_proj.weight, restore_data=True) - mod.self_attn.v_proj.bias = transfer_param(mod.self_attn.v_proj.bias, is_bias=True, restore_data=True) - mod.self_attn.out_proj.weight = transfer_param(mod.self_attn.out_proj.weight, restore_data=True) - mod.self_attn.out_proj.bias = transfer_param(mod.self_attn.out_proj.bias, is_bias=True, restore_data=True) - - mod.linear2.weight = transfer_param(mod.linear2.weight, restore_data=True) - mod.linear2.bias = transfer_param(mod.linear2.bias, is_bias=True, restore_data=True) - logger.info("Convert transformer_encoder's parameters from float32 to float16 succeessfully.") diff --git a/paddlenlp/ops/fast_transformer/transformer/fast_transformer.py b/paddlenlp/ops/fast_transformer/transformer/fast_transformer.py deleted file mode 100644 index b7b87c47a4c2..000000000000 --- a/paddlenlp/ops/fast_transformer/transformer/fast_transformer.py +++ /dev/null @@ -1,2021 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import os -import shutil - -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from paddlenlp.ops import ( - InferBartDecoding, - InferGptDecoding, - InferGptJDecoding, - InferMBartDecoding, - InferMIRODecoding, - InferOptDecoding, - InferPegasusDecoding, - InferT5Decoding, - InferTransformerDecoding, - InferUnifiedDecoding, -) -from paddlenlp.transformers import ( - BartPretrainedModel, - CodeGenPreTrainedModel, - GPTChineseTokenizer, - GPTJPretrainedModel, - GPTPretrainedModel, - GPTTokenizer, - InferTransformerModel, - MBartPretrainedModel, - OPTPretrainedModel, - PegasusPretrainedModel, - PositionalEmbedding, - T5PretrainedModel, - TransformerModel, - UnifiedTransformerPretrainedModel, - UNIMOPretrainedModel, - WordEmbedding, - position_encoding_init, -) -from paddlenlp.utils.log import logger - -from .encoder import enable_fast_encoder - - -class FasterTransformer(TransformerModel): - """ - FasterTransformer is a fast version for generation with the Transformer - model. It uses a custom op based on and enhancing NV FasterTransformer to - do fast generation. - - Args: - src_vocab_size (int): - The size of source vocabulary. - trg_vocab_size (int): - The size of target vocabulary. - max_length (int): - The maximum length of input sequences. - num_encoder_layers (int): - The number of sub-layers to be stacked in the encoder. - num_decoder_layers (int): - The number of sub-layers to be stacked in the decoder. - n_head (int): - The number of head used in multi-head attention. - d_model (int): - The dimension for word embeddings, which is also the last dimension of - the input and output of multi-head attention, position-wise feed-forward - networks, encoder and decoder. - d_inner_hid (int): - Size of the hidden layer in position-wise feed-forward networks. - dropout (float): - Dropout rates. Used for pre-process, activation and inside attention. - weight_sharing (bool): - Whether to use weight sharing. - attn_dropout (float): - The dropout probability used in MHA to drop some attention target. - If None, use the value of dropout. Defaults to None. - act_dropout (float): - The dropout probability used after FFN activition. If None, use - the value of dropout. Defaults to None. - bos_id (int, optional): - The start token id and also is used as padding id. Defaults to 0. - eos_id (int, optional): - The end token id. Defaults to 1. - pad_id (int, optional): - The pad token id. Defaults to None. If it's None, the bos_id will be used as pad_id. - decoding_strategy (str, optional): - Indicating the strategy of decoding. It can be 'beam_search', 'beam_search_v2', - 'topk_sampling' and 'topp_sampling'. For beam search strategies, - 'v2' would select the top `beam_size * 2` beams and process the top - `beam_size` alive and finish beams in them separately, while 'v1' - would only select the top `beam_size` beams and mix up the alive and - finish beams. 'v2' always searchs more and get better results, since - the alive beams would always be `beam_size` while the number of alive - beams in `v1` might decrease when meeting the end token. However, - 'v2' always generates longer results thus might do more calculation - and be slower. - beam_size (int, optional): - The beam width for beam search. Defaults to 4. - topk (int, optional): - The number of highest probability tokens to keep for top-k sampling. - Defaults to 4. - topp (float, optional): - The most probable tokens whose cumulative probability is not less than - `topp` are kept for top-p sampling. 
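The `topp` argument documented above keeps the smallest set of most-probable tokens whose cumulative probability reaches the threshold. A numpy sketch of that selection rule (illustration only, not the removed CUDA sampling kernel):

import numpy as np

def nucleus_candidates(probs, p):
    order = np.argsort(probs)[::-1]              # most probable first
    cumulative = np.cumsum(probs[order])
    cutoff = np.searchsorted(cumulative, p) + 1  # smallest prefix reaching p
    return order[:cutoff]

probs = np.array([0.5, 0.3, 0.1, 0.1])
assert set(nucleus_candidates(probs, p=0.7)) == {0, 1}   # 0.5 + 0.3 already covers 0.7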
Defaults to 4. - max_out_len (int, optional): - The maximum output length. Defaults to 256. - diversity_rate (float, optional): - Refer to `A Simple, Fast Diverse Decoding Algorithm for Neural Generation `_ - for details. Bigger `diversity_rate` would lead to more diversity. - if `diversity_rate == 0` is equivalent to naive BeamSearch. Default - to 0 if not set. - use_fp16_decoding(bool, optional): - Whether to use fp16 for decoding. - enable_fast_encoder(bool, optional): - Whether to use the fast version of encoder. This is experimental option for now. - Defaults to False. - use_fp16_encoder(bool, optional): - Whether to use fp16 for encoder. Only works when enable_fast_encoder is True. - Defaults to False. - rel_len(bool, optional): - Indicating whether `max_out_len` in is the length relative to that - of source text. Only works in `v2` temporarily. It is suggest to set - a small `max_out_len` and use `rel_len=True`. Default to False if - not set. - alpha(float, optional): - The power number in length penalty calculation. Only works in `v2` - temporarily. Refer to `GNMT `_. - Default to 0.6 if not set. - """ - - def __init__( - self, - src_vocab_size, - trg_vocab_size, - max_length, - num_encoder_layers, - num_decoder_layers, - n_head, - d_model, - d_inner_hid, - dropout, - weight_sharing, - attn_dropout=None, - act_dropout=None, - bos_id=0, - eos_id=1, - pad_id=None, - decoding_strategy="beam_search", - beam_size=4, - topk=1, - topp=0.0, - max_out_len=256, - diversity_rate=0.0, - decoding_lib=None, - use_fp16_decoding=False, - enable_fast_encoder=False, - use_fp16_encoder=False, - rel_len=False, - alpha=0.6, - ): - # if decoding_lib is None: - # raise ValueError( - # "The args decoding_lib must be set to use FasterTransformer. ") - # elif not os.path.exists(decoding_lib): - # raise ValueError("The path to decoding lib is not exist.") - - args = dict(locals()) - args.pop("self") - args.pop("__class__", None) - self.decoding_strategy = args.pop("decoding_strategy") - self.beam_size = args.pop("beam_size") - self.topk = args.pop("topk") - self.topp = args.pop("topp") - self.max_out_len = args.pop("max_out_len") - self.diversity_rate = args.pop("diversity_rate") - self.decoding_lib = args.pop("decoding_lib") - self.use_fp16_decoding = args.pop("use_fp16_decoding") - self.enable_fast_encoder = args.pop("enable_fast_encoder") - self.use_fp16_encoder = args.pop("use_fp16_encoder") - self.rel_len = args.pop("rel_len") - self.alpha = args.pop("alpha") - self.dropout = dropout - self.weight_sharing = weight_sharing - self.trg_vocab_size = trg_vocab_size - self.d_model = d_model - self.bos_id = bos_id - self.pad_id = pad_id if pad_id is not None else self.bos_id - self.max_length = max_length - super(FasterTransformer, self).__init__(**args) - - if self.enable_fast_encoder: - logger.warning("enable_fast_encoder is an experimental option and subject to change.") - elif self.use_fp16_encoder: - self.use_fp16_encoder = False - - self.decoding_linear = nn.Linear(in_features=d_model, out_features=trg_vocab_size) - - if weight_sharing: - self.trg_word_embedding = WordEmbedding(vocab_size=trg_vocab_size, emb_dim=d_model, bos_id=self.bos_id) - self.trg_pos_embedding = PositionalEmbedding(emb_dim=d_model, max_length=max_length) - - self.decoding = InferTransformerDecoding( - decoder=self.transformer.decoder, - word_embedding=self.trg_word_embedding.word_embedding, - positional_embedding=self.trg_pos_embedding.pos_encoder, - linear=self.decoding_linear, - num_decoder_layers=num_decoder_layers, - 
n_head=n_head, - d_model=d_model, - bos_id=bos_id, - eos_id=eos_id, - decoding_strategy=decoding_strategy, - beam_size=beam_size, - topk=topk, - topp=topp, - max_out_len=max_out_len, - diversity_rate=self.diversity_rate, - decoding_lib=self.decoding_lib, - use_fp16_decoding=self.use_fp16_decoding, - rel_len=self.rel_len, - alpha=self.alpha, - ) - - def forward(self, src_word, trg_word=None): - src_max_len = src_word.shape[-1] - src_slf_attn_bias = ( - paddle.cast(src_word == self.pad_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9 - ) - src_pos = paddle.cast(src_word != self.pad_id, dtype=src_word.dtype) * paddle.arange(start=0, end=src_max_len) - - # Run encoder - src_emb = self.src_word_embedding(src_word) - src_pos_emb = self.src_pos_embedding(src_pos) - src_emb = src_emb + src_pos_emb - enc_input = F.dropout(src_emb, p=self.dropout, training=False) if self.dropout else src_emb - - if self.enable_fast_encoder and self.use_fp16_encoder: - enc_input = paddle.cast(enc_input, dtype="float16") - - enc_output = self.transformer.encoder(enc_input, src_slf_attn_bias) - - if self.use_fp16_decoding and enc_output.dtype != paddle.float16: - enc_output = paddle.cast(enc_output, dtype="float16") - elif not self.use_fp16_decoding and enc_output.dtype != paddle.float32: - enc_output = paddle.cast(enc_output, dtype="float32") - - mem_seq_lens = paddle.sum(paddle.cast(src_word != self.pad_id, dtype="int32"), dtype="int32", axis=1) - ids = self.decoding(enc_output, mem_seq_lens, trg_word=trg_word) - - return ids - - def load(self, init_from_params=None, state_dict=None): - # Load the trained model - if init_from_params is None and state_dict is None: - raise ValueError("Either init_from_params or state_dict must be given to load the infer model. ") - - if state_dict is None: - state_dict = paddle.load(init_from_params, return_numpy=True) - else: - for state in state_dict: - # NOTE: This API only used in dygraph, so paddle.Tensor is enough. - if isinstance(state_dict[state], paddle.Tensor): - state_dict[state] = state_dict[state].numpy() - - # To set weight[padding_idx] to 0. - state_dict["trg_word_embedding.word_embedding.weight"][self.bos_id] = [0] * self.d_model - - # Dealing with weight sharing. - if self.weight_sharing: - state_dict["decoding_linear.weight"] = np.transpose(state_dict["trg_word_embedding.word_embedding.weight"]) - else: - state_dict["decoding_linear.weight"] = state_dict["linear.weight"] - - if self.decoding._fuse_qkv: - for item in self.state_dict(): - if "decoder" in item and "self_attn.q_proj" in item: - num_layer = item.split(".")[3] - param_type = item.split(".")[-1] - - state_dict["decoding.slf_q_" + param_type + "_" + num_layer] = np.concatenate( - ( - state_dict[item], - state_dict["transformer.decoder.layers." + num_layer + ".self_attn.k_proj." + param_type], - state_dict["transformer.decoder.layers." + num_layer + ".self_attn.v_proj." 
+ param_type], - ), - axis=-1, - ) - - if self.use_fp16_decoding: - for item in self.state_dict(): - if "decoder" in item or "decoding.slf" in item: - state_dict[item] = np.float16(state_dict[item]) - state_dict["decoding_linear.weight"] = np.float16(state_dict["decoding_linear.weight"]) - state_dict["trg_word_embedding.word_embedding.weight"] = np.float16( - state_dict["trg_word_embedding.word_embedding.weight"] - ) - state_dict["trg_pos_embedding.pos_encoder.weight"] = np.float16( - state_dict["trg_pos_embedding.pos_encoder.weight"] - ) - state_dict["decoding_linear.bias"] = np.zeros([self.trg_vocab_size], dtype="float16") - - self.load_dict(state_dict) - - if self.enable_fast_encoder: - self = enable_fast_encoder(self, use_fp16=self.use_fp16_encoder) - - def export_params(self, init_from_params, place): - """ - This method is used for load static graph from dygraph checkpoint - or export inference model using static graph. - Do NOT support faster encoder. - - Args: - init_from_params (string): - The path to dygraph checkpoint. - place (paddle.Place): - The place to execute static graph. - - Example: - .. code-block:: - paddle.enable_static() - place = "gpu" - place = paddle.set_device(place) - reader.adapt_vocab_size(args) - - test_program = paddle.static.Program() - startup_program = paddle.static.Program() - with paddle.static.program_guard(test_program, startup_program): - src_word = paddle.static.data( - name="src_word", shape=[None, None], dtype="int64") - - # Define model - transformer = FasterTransformer( - src_vocab_size=args.src_vocab_size, - trg_vocab_size=args.trg_vocab_size, - max_length=args.max_length + 1, - num_encoder_layers=args.n_layer, - num_decoder_layers=args.n_layer, - n_head=args.n_head, - d_model=args.d_model, - d_inner_hid=args.d_inner_hid, - dropout=args.dropout, - weight_sharing=args.weight_sharing, - bos_id=args.bos_idx, - eos_id=args.eos_idx, - decoding_strategy=args.decoding_strategy, - beam_size=args.beam_size, - max_out_len=args.max_out_len, - decoding_lib=args.decoding_lib, - use_fp16_decoding=args.use_fp16_decoding, - rel_len=args.use_rel_len, - alpha=args.alpha) - - finished_seq = transformer(src_word=src_word) - - test_program = test_program.clone(for_test=True) - - exe = paddle.static.Executor(place) - exe.run(startup_program) - - # Load checkpoint. - transformer.export_params( - init_from_params=os.path.join(args.init_from_params, - "transformer.pdparams"), - place=place) - - paddle.static.save_inference_model( - os.path.join(args.inference_model_dir, "transformer"), - feed_vars=src_word, - fetch_vars=finished_seq, - executor=exe, - program=test_program) - """ - # Load the trained model - assert init_from_params, "Please set init_from_params to load the infer model." - - model_dict = paddle.load(init_from_params, return_numpy=True) - - # To set weight[padding_idx] to 0. - model_dict["trg_word_embedding.word_embedding.weight"][self.bos_id] = [0] * self.d_model - - # Dealing with weight sharing. 
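`load` above (and `export_params` just below) rewrite the checkpoint in two ways before handing it to the custom op: with weight sharing, the target-side word embedding is transposed to become the output projection, and per decoder layer the separate Q/K/V projection weights are concatenated along the last axis into one fused key such as `decoding.slf_q_weight_0`. A compact numpy sketch of those two rewrites with toy shapes:

import numpy as np

d_model, vocab, n_layers = 4, 10, 2
state = {"trg_word_embedding.word_embedding.weight": np.random.rand(vocab, d_model).astype("float32")}
for i in range(n_layers):
    for proj in ("q", "k", "v"):
        state[f"transformer.decoder.layers.{i}.self_attn.{proj}_proj.weight"] = np.random.rand(
            d_model, d_model
        ).astype("float32")

# (1) weight sharing: embedding [vocab, d_model] -> output projection [d_model, vocab]
state["decoding_linear.weight"] = np.transpose(state["trg_word_embedding.word_embedding.weight"])

# (2) fused QKV per layer: three [d_model, d_model] weights -> one [d_model, 3 * d_model]
for i in range(n_layers):
    state[f"decoding.slf_q_weight_{i}"] = np.concatenate(
        [state[f"transformer.decoder.layers.{i}.self_attn.{p}_proj.weight"] for p in ("q", "k", "v")],
        axis=-1,
    )
    assert state[f"decoding.slf_q_weight_{i}"].shape == (d_model, 3 * d_model)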
- if self.weight_sharing: - model_dict["decoding_linear.weight"] = np.transpose(model_dict["trg_word_embedding.word_embedding.weight"]) - else: - model_dict["decoding_linear.weight"] = model_dict["linear.weight"] - - # To avoid a longer length than training, reset the size of position - # encoding to max_length - model_dict["encoder.pos_encoder.weight"] = position_encoding_init(self.max_length, self.d_model) - model_dict["decoder.pos_encoder.weight"] = position_encoding_init(self.max_length, self.d_model) - - if self.decoding._fuse_qkv: - for item in self.state_dict(): - if "decoder" in item and "self_attn.q_proj" in item: - num_layer = item.split(".")[3] - param_type = item.split(".")[-1] - - model_dict["decoding.slf_q_" + param_type + "_" + num_layer] = np.concatenate( - ( - model_dict[item], - model_dict["transformer.decoder.layers." + num_layer + ".self_attn.k_proj." + param_type], - model_dict["transformer.decoder.layers." + num_layer + ".self_attn.v_proj." + param_type], - ), - axis=-1, - ) - - if self.use_fp16_decoding: - for item in self.state_dict(): - if "decoder" in item or "decoding.slf" in item: - model_dict[item] = np.float16(model_dict[item]) - model_dict["decoding_linear.weight"] = np.float16(model_dict["decoding_linear.weight"]) - model_dict["trg_word_embedding.word_embedding.weight"] = np.float16( - model_dict["trg_word_embedding.word_embedding.weight"] - ) - model_dict["trg_pos_embedding.pos_encoder.weight"] = np.float16( - model_dict["trg_pos_embedding.pos_encoder.weight"] - ) - model_dict["decoding_linear.bias"] = np.zeros([self.trg_vocab_size], dtype="float16") - - for item in self.state_dict(): - param = self - attr_list = item.split(".") - for attr in attr_list: - param = getattr(param, attr) - param_name = param.name - var = paddle.static.global_scope().find_var(param_name).get_tensor() - var.set(model_dict[item], place) - - -class TransformerGenerator(paddle.nn.Layer): - """ - The Transformer model for auto-regressive generation with beam search. It wraps - `FasterTransformer` and `InferTransformerModel`, and automatically chioces using - `FasterTransformer` (with jit building) or the slower verison `InferTransformerModel`. - - Args: - src_vocab_size (int): - The size of source vocabulary. - trg_vocab_size (int): - The size of target vocabulary. - max_length (int): - The maximum length of input sequences. - num_encoder_layers (int): - The number of sub-layers to be stacked in the encoder. - num_decoder_layers (int): - The number of sub-layers to be stacked in the decoder. - n_head (int): - The number of head used in multi-head attention. - d_model (int): - The dimension for word embeddings, which is also the last dimension of - the input and output of multi-head attention, position-wise feed-forward - networks, encoder and decoder. - d_inner_hid (int): - Size of the hidden layer in position-wise feed-forward networks. - dropout (float): - Dropout rates. Used for pre-process, activation and inside attention. - weight_sharing (bool): - Whether to use weight sharing. - bos_id (int, optional): - The start token id and also is used as padding id. Defaults to 0. - eos_id (int, optional): - The end token id. Defaults to 1. - beam_size (int, optional): - The beam width for beam search. Defaults to 4. - max_out_len (int, optional): - The maximum output length. Defaults to 256. - activation (str, optional): - The activation used in FFN. Defaults to "relu". - normalize_before (bool, optional): - Whether to apply pre-normalization. Defaults to True. 
- kwargs: - The key word arguments can be `output_time_major`, `use_ft`, `use_fp16_decoding`, - `rel_len`, `alpha`: - - - `output_time_major(bool, optional)`: Indicate the data layout of predicted - Tensor. If `False`, the data layout would be batch major with shape - `[batch_size, seq_len, beam_size]`. If `True`, the data layout would - be time major with shape `[seq_len, batch_size, beam_size]`. Default - to `False`. - - - `use_ft(bool, optional)`: Whether to use FastGeneration - for decoding. Default to True if not set. - - - `use_fp16_decoding(bool, optional)`: Whether to use fp16 - for decoding. Only works when using FastGeneration. - - - `beam_search_version(str, optional)`: Indicating the strategy of - beam search. It can be 'v1' or 'v2'. 'v2' would select the top - `beam_size * 2` beams and process the top `beam_size` alive and - finish beams in them separately, while 'v1' would only select the - top `beam_size` beams and mix up the alive and finish beams. 'v2' always - searchs more and get better results, since the alive beams would - always be `beam_size` while the number of alive beams in `v1` might - decrease when meeting the end token. However, 'v2' always generates - longer results thus might do more calculation and be slower. - - - `rel_len(bool, optional)`: Indicating whether `max_out_len` in is - the length relative to that of source text. Only works in `v2` temporarily. - It is suggest to set a small `max_out_len` and use `rel_len=True`. - Default to False if not set. - - - `alpha(float, optional)`: The power number in length penalty - calculation. Refer to `GNMT `_. - Only works in `v2` temporarily. Default to 0.6 if not set. - - - diversity_rate(float, optional): Refer to `A Simple, Fast Diverse - Decoding Algorithm for Neural Generation `_ - for details. Bigger `diversity_rate` would lead to more diversity. - if `diversity_rate == 0` is equivalent to naive BeamSearch. Default - to 0 if not set. **NOTE**: Only works when using FastGeneration - temporarily. - """ - - def __init__( - self, - src_vocab_size, - trg_vocab_size, - max_length, - num_encoder_layers, - num_decoder_layers, - n_head, - d_model, - d_inner_hid, - dropout, - weight_sharing, - bos_id=0, - eos_id=1, - pad_id=None, - beam_size=4, - max_out_len=256, - activation="relu", - normalize_before=True, - **kwargs - ): - logger.warning("TransformerGenerator is an experimental API and subject to change.") - # `kwargs` can include output_time_major, use_fp16_decoding, topk, topp. - # The later three arguments can only work when using FastGeneration, - # and expose topk, topp later. - super(TransformerGenerator, self).__init__() - self.d_model = d_model - self.max_length = max_length - self.output_time_major = kwargs.pop("output_time_major", True) - # Only works for FastGeneration. - # TODO: original version supports diversity rate. - diversity_rate = kwargs.pop("diversity_rate", 0.0) - use_fp16_decoding = kwargs.pop("use_fp16_decoding", False) - use_ft = kwargs.pop("use_ft", True) - beam_search_version = kwargs.pop("beam_search_version", "v1") - rel_len = kwargs.pop("rel_len", False) - alpha = kwargs.pop("alpha", 0.6) - - # TODO: Faster version needs to update attr to support custom - # activation and normalize_before which are both aupport in cpp codes. 
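The `alpha` option above is the exponent of the GNMT-style length penalty. The formula from the GNMT paper (Wu et al., 2016) is sketched below; the exact constant handling inside the removed decoding kernels is not reproduced here:

def gnmt_length_penalty(length, alpha=0.6):
    # lp(Y) = ((5 + |Y|) / 6) ** alpha
    return ((5.0 + length) / 6.0) ** alpha

# Beam scores are divided by the penalty, so with alpha > 0 longer hypotheses
# are penalized less per token and the bias toward short outputs is reduced.
scores = [-2.0, -2.5]
lengths = [4, 8]
normalized = [s / gnmt_length_penalty(n) for s, n in zip(scores, lengths)]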
- if use_ft and activation == "relu" and normalize_before: - try: - decoding_strategy = "beam_search_v2" if beam_search_version == "v2" else "beam_search" - self.transformer = FasterTransformer( - src_vocab_size=src_vocab_size, - trg_vocab_size=trg_vocab_size, - max_length=max_length, - num_encoder_layers=num_encoder_layers, - num_decoder_layers=num_decoder_layers, - n_head=n_head, - d_model=d_model, - d_inner_hid=d_inner_hid, - dropout=dropout, - weight_sharing=weight_sharing, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - beam_size=beam_size, - max_out_len=max_out_len, - diversity_rate=diversity_rate, - decoding_strategy=decoding_strategy, - use_fp16_decoding=use_fp16_decoding, - rel_len=rel_len, - alpha=alpha, - ) - except Exception: - logger.warning( - "Exception occurs when using FastGeneration. " "The original forward will be involved. " - ) - if diversity_rate != 0: - logger.warning( - "diversity_rate would not work since it is only " "supported by FastGeneration temporarily." - ) - self.transformer = InferTransformerModel( - src_vocab_size=src_vocab_size, - trg_vocab_size=trg_vocab_size, - max_length=max_length, - num_encoder_layers=num_encoder_layers, - num_decoder_layers=num_decoder_layers, - n_head=n_head, - d_model=d_model, - d_inner_hid=d_inner_hid, - dropout=dropout, - weight_sharing=weight_sharing, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - beam_size=beam_size, - max_out_len=max_out_len, - output_time_major=self.output_time_major, - beam_search_version=beam_search_version, - activation=activation, - normalize_before=normalize_before, - rel_len=rel_len, - alpha=alpha, - ) - else: - if diversity_rate != 0: - logger.warning( - "diversity_rate would not work since it is only " "supported by FastGeneration temporarily." - ) - self.transformer = InferTransformerModel( - src_vocab_size=src_vocab_size, - trg_vocab_size=trg_vocab_size, - max_length=max_length, - num_encoder_layers=num_encoder_layers, - num_decoder_layers=num_decoder_layers, - n_head=n_head, - d_model=d_model, - d_inner_hid=d_inner_hid, - dropout=dropout, - weight_sharing=weight_sharing, - bos_id=bos_id, - eos_id=eos_id, - pad_id=pad_id, - beam_size=beam_size, - max_out_len=max_out_len, - output_time_major=self.output_time_major, - beam_search_version=beam_search_version, - activation=activation, - normalize_before=normalize_before, - rel_len=rel_len, - alpha=alpha, - ) - - def forward(self, src_word, trg_word=None): - r""" - Performs decoding for transformer model. - - Args: - src_word (Tensor): - The ids of source sequence words. It is a tensor with shape - `[batch_size, source_sequence_length]` and its data type can be - int or int64. - trg_word (Tensor): - The ids of target sequence words. Normally, it should NOT be - given. If it's given, force decoding with previous output token - will be trigger. Defaults to None. - - Returns: - Tensor: - An int64 tensor shaped indicating the predicted ids. Its shape is - `[batch_size, seq_len, beam_size]` or `[seq_len, batch_size, beam_size]` - according to `output_time_major`. While, when using FastGeneration - and beam search v2, the beam dimension would be doubled to include - both the top `beam_size` alive and finish beams, thus the tensor - shape is `[batch_size, seq_len, beam_size * 2]` or `[seq_len, batch_size, beam_size * 2]`. - - Example: - .. 
code-block:: - - import paddle - from paddlenlp.ops import TransformerGenerator - - transformer = TransformerGenerator( - src_vocab_size=30000, - trg_vocab_size=30000, - max_length=256, - num_encoder_layers=6, - num_decoder_layers=6, - n_head=8, - d_model=512, - d_inner_hid=2048, - dropout=0.1, - weight_sharing=True, - bos_id=0, - eos_id=1, - beam_size=4, - max_out_len=256) - - batch_size = 5 - seq_len = 10 - transformer( - src_word=paddle.randint(low=3, high=30000, shape=[batch_size, seq_len])) - """ - out = self.transformer(src_word, trg_word=trg_word) - # TODO(guosheng): FasterTransformer has an output with layout - # `[seq_len, batch_size, beam_size]`. While the output layout of - # original one is `[batch_size, seq_len, beam_size]`. Maybe we need - # unify them later. - if not self.output_time_major and isinstance(self.transformer, FasterTransformer): - out = paddle.transpose(out, [1, 0, 2]) - return out - - def load(self, path=None, state_dict=None): - if path is None and state_dict is None: - raise ValueError("Either path or state_dict must be given to load the infer model. ") - - if isinstance(self.transformer, FasterTransformer): - self.transformer.load(path, state_dict) - else: - if state_dict is None: - state_dict = paddle.load(path) - self.transformer.load_dict(state_dict) - - -class FasterOPT(OPTPretrainedModel): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False): - super(FasterOPT, self).__init__(model.config) - self._model = model - self.use_fp16_decoding = use_fp16_decoding - self.decoding = InferOptDecoding(model=model, decoding_lib=decoding_lib, use_fp16_decoding=use_fp16_decoding) - - def forward( - self, - input_ids, - seq_len=None, - attention_mask=None, - top_k=4, - top_p=0.0, - max_length=256, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - forced_eos_token_id=None, - temperature=0, - decode_strategy="sample", - num_return_sequences=1, - **model_kwargs - ): - if input_ids.dtype == paddle.int64: - input_ids = paddle.cast(input_ids, "int32") - - # change top_p to zero if not using top_p sampling for FT - if decode_strategy == "greedy_search": - top_p = 0.0 - top_k = 1 - if top_p == 1.0: - top_p = 0.0 - if seq_len is None: - seq_len = paddle.sum(paddle.cast(input_ids != pad_token_id, dtype="int32"), axis=-1, dtype="int32") - - if bos_token_id == pad_token_id and paddle.sum(paddle.any(input_ids == pad_token_id), dtype="int64") > 0: - seq_len = seq_len + 1 - - if num_return_sequences > 1: - input_ids, model_kwargs = self.expand_inputs_for_generation( - input_ids, expand_size=num_return_sequences, seq_len=seq_len, attention_mask=attention_mask - ) - seq_len = model_kwargs["seq_len"] - attention_mask = model_kwargs.get("attention_mask", None) - - return self.decoding( - input_ids, - mem_seq_len=seq_len, - attention_mask=attention_mask, - topk=top_k, - topp=top_p, - max_out_len=max_length, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - forced_eos_token_id=forced_eos_token_id, - temperature=temperature, - ) - - def export_params(self, state_to_load, place): - for item in state_to_load: - param_data = np.array(state_to_load[item]) - if self.use_fp16_decoding: - param_data = np.float16(param_data) - - param = self - attr_list = item.split(".") - attr_list = ["decoding", "model"] + attr_list - for attr in attr_list: - param = getattr(param, attr) - param_name = param.name - var = paddle.static.global_scope().find_var(param_name).get_tensor() - var.set(param_data, place) - - def save_resources(self, 
tokenizer, path): - vocab_file = os.path.join(path, "vocab.txt") - if isinstance(tokenizer, GPTTokenizer): - with open(vocab_file, "w", encoding="utf-8") as f: - for token in tokenizer.encoder: - f.write(token + "\n") - merges_file = os.path.join(path, "merges.txt") - shutil.copyfile(tokenizer._merges_file, merges_file) - elif isinstance(tokenizer, GPTChineseTokenizer): - tokenizer.save_resources(path) - - generate = forward - - -class FasterGPT(GPTPretrainedModel): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False): - super(FasterGPT, self).__init__(model.config) - self._model = model - self.use_fp16_decoding = use_fp16_decoding - self.decoding = InferGptDecoding(model=model, decoding_lib=decoding_lib, use_fp16_decoding=use_fp16_decoding) - - def forward( - self, - input_ids, - seq_len=None, - attention_mask=None, - top_k=4, - top_p=0.0, - max_length=256, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - forced_eos_token_id=None, - temperature=0, - decode_strategy="sample", - num_return_sequences=1, - **model_kwargs - ): - if input_ids.dtype == paddle.int64: - input_ids = paddle.cast(input_ids, "int32") - - # change top_p to zero if not using top_p sampling for FT - if decode_strategy == "greedy_search": - top_p = 0.0 - top_k = 1 - if top_p == 1.0: - top_p = 0.0 - if seq_len is None: - seq_len = paddle.sum(paddle.cast(input_ids != pad_token_id, dtype="int32"), axis=-1, dtype="int32") - - if bos_token_id == pad_token_id and paddle.sum(paddle.any(input_ids == pad_token_id), dtype="int64") > 0: - seq_len = seq_len + 1 - - if num_return_sequences > 1: - input_ids, model_kwargs = self.expand_inputs_for_generation( - input_ids, expand_size=num_return_sequences, seq_len=seq_len, attention_mask=attention_mask - ) - seq_len = model_kwargs["seq_len"] - attention_mask = model_kwargs.get("attention_mask", None) - - return self.decoding( - input_ids, - mem_seq_len=seq_len, - attention_mask=attention_mask, - topk=top_k, - topp=top_p, - max_out_len=max_length, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - forced_eos_token_id=forced_eos_token_id, - temperature=temperature, - ) - - def export_params(self, state_to_load, place): - for item in state_to_load: - param_data = np.array(state_to_load[item]) - if self.use_fp16_decoding: - param_data = np.float16(param_data) - - param = self - attr_list = item.split(".") - attr_list = ["decoding", "model"] + attr_list - for attr in attr_list: - param = getattr(param, attr) - param_name = param.name - var = paddle.static.global_scope().find_var(param_name).get_tensor() - var.set(param_data, place) - - def save_resources(self, tokenizer, path): - vocab_file = os.path.join(path, "vocab.txt") - if isinstance(tokenizer, GPTTokenizer): - with open(vocab_file, "w", encoding="utf-8") as f: - for token in tokenizer.encoder: - f.write(token + "\n") - merges_file = os.path.join(path, "merges.txt") - shutil.copyfile(tokenizer._merges_file, merges_file) - elif isinstance(tokenizer, GPTChineseTokenizer): - tokenizer.save_resources(path) - - generate = forward - - -class FasterUnifiedTransformer(UnifiedTransformerPretrainedModel): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False): - super(FasterUnifiedTransformer, self).__init__(model.config) - self._model = model - self._use_fp16_decoding = use_fp16_decoding - self.vocab_size = model.lm_head.decoder_bias.shape[0] - self.unk_token_id = self._model.config.unk_token_id - self.mask_token_id = self._model.config.mask_token_id - 
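When `seq_len` is not provided, `FasterGPT.forward` above (and `FasterOPT.forward` before it) derive it by counting non-pad tokens per sample, adding one back when the bos and pad ids coincide so the leading bos token is not mistaken for padding. A numpy sketch of that bookkeeping:

import numpy as np

pad_token_id = bos_token_id = 0
input_ids = np.array([[0, 11, 12, 0, 0],
                      [0, 21, 22, 23, 0]])

seq_len = (input_ids != pad_token_id).sum(axis=-1).astype("int32")
if bos_token_id == pad_token_id and (input_ids == pad_token_id).any():
    seq_len = seq_len + 1

assert list(seq_len) == [3, 4]   # the leading bos (id 0) is counted exactly once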
self.bos_token_id = self._model.config.bos_token_id - self.pad_token_id = self._model.config.pad_token_id - self.logits_mask = self.generate_logits_mask(use_fp16_decoding) - self._n_head = self._model.config.num_attention_heads - self._hidden_dims = self._model.config.hidden_size - self._normalize_before = self._model.config.normalize_before - self._size_per_head = self._hidden_dims // self._n_head - self._n_layer = self._model.config.num_hidden_layers - self._hidden_act = self._model.config.hidden_act - - self.decoding = InferUnifiedDecoding( - model=self._model, - decoding_lib=decoding_lib, - use_fp16_decoding=use_fp16_decoding, - logits_mask=self.logits_mask, - n_head=self._n_head, - hidden_dims=self._hidden_dims, - size_per_head=self._size_per_head, - n_layer=self._n_layer, - unk_id=self.unk_token_id, - mask_id=self.mask_token_id, - normalize_before=self._normalize_before, - hidden_act=self._hidden_act, - ) - - def prepare_inputs_for_generation( - self, input_ids, token_type_ids, attention_mask, seq_len, position_ids=None, role_ids=None, **kwargs - ): - input_ids = input_ids[:, :-1] - if input_ids.dtype == paddle.int64: - input_ids = paddle.cast(input_ids, dtype="int32") - - if token_type_ids.dtype == paddle.int64: - token_type_ids = paddle.cast(token_type_ids, dtype="int32") - decoder_type_ids = token_type_ids[:, -1:] - token_type_ids = token_type_ids[:, :-1] - - # TODO(guosheng): attention_mask of UnifiedTransformer uses 0/-INF - # and is 4D. While now we want to use 1/0 to unify all models and - # tokenizers. - attention_mask = attention_mask[:, :, :-1, :-1] if attention_mask.ndim == 4 else attention_mask[:, :-1, :-1] - attention_mask = paddle.cast(attention_mask == 0, dtype="float16" if self._use_fp16_decoding else "float32") - - seq_len = seq_len - 1 - if seq_len.dtype == paddle.int64: - seq_len = paddle.cast(seq_len, dtype="int32") - - if position_ids is not None: - if position_ids.dtype == paddle.int64: - position_ids = paddle.cast(position_ids, dtype="int32") - decoder_position_ids = position_ids[:, -1:] - position_ids = position_ids[:, :-1] - else: - decoder_position_ids = None - - field_values = {} - if role_ids is not None: - if role_ids.dtype == paddle.int64: - role_ids = paddle.cast(role_ids, dtype="int32") - decoder_role_ids = role_ids[:, -1:] - role_ids = role_ids[:, :-1] - else: - decoder_role_ids = None - - field_values["input_ids"] = input_ids - field_values["token_type_ids"] = token_type_ids - field_values["attention_mask"] = attention_mask - field_values["seq_len"] = seq_len - field_values["decoder_type_ids"] = decoder_type_ids - field_values["position_ids"] = position_ids - field_values["decoder_position_ids"] = decoder_position_ids - field_values["role_ids"] = role_ids - field_values["decoder_role_ids"] = decoder_role_ids - - return field_values - - def generate_logits_mask(self, use_fp16_decoding): - # pre-process distribution - logits_mask = np.zeros(shape=[self.vocab_size], dtype=np.float32) - - if use_fp16_decoding: - logits_mask[self.unk_token_id] = -1e4 - logits_mask[self.bos_token_id] = -1e4 - logits_mask[self.pad_token_id] = -1e4 - else: - logits_mask[self.unk_token_id] = -1e9 - logits_mask[self.bos_token_id] = -1e9 - logits_mask[self.pad_token_id] = -1e9 - - logits_mask_t = paddle.assign(logits_mask) - if use_fp16_decoding: - return paddle.cast(logits_mask_t, dtype="float16") - else: - return logits_mask_t - - def forward( - self, - input_ids, - token_type_ids, - attention_mask, - seq_len=None, - role_ids=None, - position_ids=None, - max_length=128, - 
min_length=0, - top_k=4, - top_p=0.0, - decode_strategy="sampling", - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - num_beams=4, - diversity_rate=0.0, - temperature=1.0, - num_return_sequences=1, - length_penalty=0.6, - early_stopping=False, - forced_eos_token_id=None, - **model_kwargs - ): - - if seq_len is None: - assert input_ids is not None, "You have to specify either input_ids when generating seq_len." - seq_len = paddle.sum( - paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, keepdim=True, dtype="int32" - ) - if decode_strategy.startswith("beam_search"): - input_ids, model_kwargs = self.expand_inputs_for_generation( - input_ids, - expand_size=num_beams, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=attention_mask, - seq_len=seq_len, - role_ids=role_ids, - ) - elif decode_strategy == "sampling": - input_ids, model_kwargs = self.expand_inputs_for_generation( - input_ids, - expand_size=num_return_sequences, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=attention_mask, - seq_len=seq_len, - role_ids=role_ids, - ) - elif decode_strategy == "greedy_search": - model_kwargs = { - "token_type_ids": token_type_ids, - "position_ids": position_ids, - "attention_mask": attention_mask, - "seq_len": seq_len, - "role_ids": role_ids, - } - else: - raise ValueError("Only greedy search, beam search and sampling are supported. ") - - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - seq_len = model_inputs.pop("seq_len") - decoder_type_ids = model_inputs.pop("decoder_type_ids") - role_ids = model_inputs.pop("role_ids", None) - decoder_role_ids = model_inputs.pop("decoder_role_ids", None) - position_ids = model_inputs.pop("position_ids", None) - decoder_position_ids = model_inputs.pop("decoder_position_ids", None) - - return self.decoding( - input_ids=model_inputs["input_ids"], - attn_mask=model_inputs["attention_mask"], - memory_seq_lens=seq_len, - type_id=model_inputs["token_type_ids"], - decoder_type_id=decoder_type_ids, - role_id=role_ids, - decoder_role_id=decoder_role_ids, - position_id=position_ids, - decoder_position_id=decoder_position_ids, - beam_size=num_beams, - diversity_rate=diversity_rate, - topk=top_k, - topp=top_p, - decoding_strategy=decode_strategy, - max_out_len=max_length, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - temperature=temperature, - length_penalty=length_penalty, - pos_bias=True, - forced_eos_token_id=forced_eos_token_id, - early_stopping=early_stopping, - min_length=min_length, - ) - - generate = forward - - -class FasterUNIMOText(UNIMOPretrainedModel): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False, **kwargs): - super(FasterUNIMOText, self).__init__(model.config) - self._model = model - self._use_fp16_decoding = use_fp16_decoding - self.unk_token_id = self._model.config.unk_token_id - self.mask_token_id = self._model.config.mask_token_id - self.bos_token_id = self._model.config.bos_token_id - self.pad_token_id = self._model.config.pad_token_id - self.vocab_size = model.lm_head.decoder_bias.shape[0] - - self.logits_mask = self.generate_logits_mask(use_fp16_decoding) - self._n_head = self._model.config.num_attention_heads - self._hidden_dims = self._model.config.hidden_size - self._normalize_before = self._model.config.normalize_before - self._size_per_head = self._hidden_dims // self._n_head - self._n_layer = self._model.config.num_hidden_layers - self._hidden_act = 
self._model.config.hidden_act - self.trans_out = kwargs.get("trans_out", False) - - self.decoding = InferUnifiedDecoding( - model=self._model, - decoding_lib=decoding_lib, - use_fp16_decoding=use_fp16_decoding, - logits_mask=self.logits_mask, - n_head=self._n_head, - hidden_dims=self._hidden_dims, - size_per_head=self._size_per_head, - n_layer=self._n_layer, - unk_id=self.unk_token_id, - mask_id=self.mask_token_id, - normalize_before=self._normalize_before, - hidden_act=self._hidden_act, - ) - - def prepare_inputs_for_generation(self, input_ids, token_type_ids, attention_mask, **kwargs): - input_ids = input_ids[:, :-1] - if input_ids.dtype == paddle.int64: - input_ids = paddle.cast(input_ids, dtype="int32") - - if token_type_ids.dtype == paddle.int64: - token_type_ids = paddle.cast(token_type_ids, dtype="int32") - decoder_type_ids = token_type_ids[:, -1:] - token_type_ids = token_type_ids[:, :-1] - - attention_mask = attention_mask[:, :, :-1, :-1] - attention_mask = paddle.cast(attention_mask == 0, dtype="float16" if self._use_fp16_decoding else "float32") - - seq_len = kwargs.get("seq_len") - 1 - if seq_len.dtype == paddle.int64: - seq_len = paddle.cast(seq_len, dtype="int32") - - return { - "input_ids": input_ids, - "token_type_ids": token_type_ids, - "attention_mask": attention_mask, - "seq_len": seq_len, - "decoder_type_ids": decoder_type_ids, - } - - def generate_logits_mask(self, use_fp16_decoding): - # pre-process distribution - logits_mask = np.zeros(shape=[self.vocab_size], dtype=np.float32) - - if use_fp16_decoding: - logits_mask[self.unk_token_id] = -1e4 - logits_mask[self.bos_token_id] = -1e4 - logits_mask[self.pad_token_id] = -1e4 - else: - logits_mask[self.unk_token_id] = -1e9 - logits_mask[self.bos_token_id] = -1e9 - logits_mask[self.pad_token_id] = -1e9 - - logits_mask_t = paddle.assign(logits_mask) - if use_fp16_decoding: - return paddle.cast(logits_mask_t, dtype="float16") - else: - return logits_mask_t - - def forward( - self, - input_ids, - token_type_ids, - attention_mask, - seq_len=None, - max_length=128, - min_length=0, - top_k=4, - top_p=0.0, - num_beams=4, - decode_strategy="sampling", - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - diversity_rate=0.0, - temperature=1.0, - num_return_sequences=1, - length_penalty=0.6, - early_stopping=False, - forced_eos_token_id=None, - position_ids=None, - **model_kwargs - ): - - if seq_len is None: - assert input_ids is not None, "You have to specify either input_ids when generating seq_len." - seq_len = paddle.sum(paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, dtype="int32") - if decode_strategy.startswith("beam_search"): - input_ids, model_kwargs = self.expand_inputs_for_generation( - input_ids, - expand_size=num_beams, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=attention_mask, - seq_len=seq_len, - ) - elif decode_strategy == "sampling": - input_ids, model_kwargs = self.expand_inputs_for_generation( - input_ids, - expand_size=num_return_sequences, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=attention_mask, - seq_len=seq_len, - ) - elif decode_strategy == "greedy_search": - model_kwargs = { - "token_type_ids": token_type_ids, - "position_ids": position_ids, - "attention_mask": attention_mask, - "seq_len": seq_len, - } - else: - raise ValueError("Only greedy search, beam search and sampling are supported. 
") - - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - seq_len = model_inputs.pop("seq_len") - decoder_type_ids = model_inputs.pop("decoder_type_ids") - - ids, output_scores = self.decoding( - input_ids=model_inputs["input_ids"], - attn_mask=model_inputs["attention_mask"], - memory_seq_lens=seq_len, - type_id=model_inputs["token_type_ids"], - decoder_type_id=decoder_type_ids, - beam_size=num_beams, - diversity_rate=diversity_rate, - topk=top_k, - topp=top_p, - decoding_strategy=decode_strategy, - max_out_len=max_length, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - temperature=temperature, - length_penalty=length_penalty, - forced_eos_token_id=forced_eos_token_id, - pos_bias=False, - early_stopping=early_stopping, - min_length=min_length, - ) - if self.trans_out: - if decode_strategy.startswith("beam_search"): - ids = ids.transpose([1, 2, 0]) - else: - ids = ids.transpose([1, 0]) - return ids, output_scores - - generate = forward - - -class FasterMIRO(UNIMOPretrainedModel): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False, **kwargs): - super(FasterMIRO, self).__init__(model.config) - self._model = model - self._use_fp16_decoding = use_fp16_decoding - self.unk_token_id = self._model.config.unk_token_id - self.mask_token_id = self._model.config.mask_token_id - self.bos_token_id = self._model.config.bos_token_id - self.pad_token_id = self._model.config.pad_token_id - self.vocab_size = model.lm_head.decoder_bias.shape[0] - - self.logits_mask = self.generate_logits_mask(use_fp16_decoding) - self._n_head = self._model.config.num_attention_heads - self._hidden_dims = self._model.config.hidden_size - self._normalize_before = self._model.config.normalize_before - self._size_per_head = self._hidden_dims // self._n_head - self._n_layer = self._model.config.num_hidden_layers - self._hidden_act = self._model.config.hidden_act - self.trans_out = kwargs.get("trans_out", False) - - self.decoding = InferMIRODecoding( - model=self._model, - decoding_lib=decoding_lib, - use_fp16_decoding=use_fp16_decoding, - logits_mask=self.logits_mask, - n_head=self._n_head, - hidden_dims=self._hidden_dims, - size_per_head=self._size_per_head, - n_layer=self._n_layer, - unk_id=self.unk_token_id, - mask_id=self.mask_token_id, - normalize_before=self._normalize_before, - hidden_act=self._hidden_act, - ) - - def prepare_inputs_for_generation(self, input_ids, token_type_ids, attention_mask, **kwargs): - input_ids = input_ids[:, :-1] - if input_ids.dtype == paddle.int64: - input_ids = paddle.cast(input_ids, dtype="int32") - - if token_type_ids.dtype == paddle.int64: - token_type_ids = paddle.cast(token_type_ids, dtype="int32") - decoder_type_ids = token_type_ids[:, -1:] - token_type_ids = token_type_ids[:, :-1] - - attention_mask = attention_mask[:, :, :-1, :-1] - attention_mask = paddle.cast(attention_mask == 0, dtype="float16" if self._use_fp16_decoding else "float32") - - seq_len = kwargs.get("seq_len") - 1 - if seq_len.dtype == paddle.int64: - seq_len = paddle.cast(seq_len, dtype="int32") - - return { - "input_ids": input_ids, - "token_type_ids": token_type_ids, - "attention_mask": attention_mask, - "seq_len": seq_len, - "decoder_type_ids": decoder_type_ids, - } - - def generate_logits_mask(self, use_fp16_decoding): - # pre-process distribution - logits_mask = np.zeros(shape=[self.vocab_size], dtype=np.float32) - - if use_fp16_decoding: - logits_mask[self.unk_token_id] = -1e4 - logits_mask[self.bos_token_id] = -1e4 - 
logits_mask[self.pad_token_id] = -1e4 - else: - logits_mask[self.unk_token_id] = -1e9 - logits_mask[self.bos_token_id] = -1e9 - logits_mask[self.pad_token_id] = -1e9 - - logits_mask_t = paddle.assign(logits_mask) - if use_fp16_decoding: - return paddle.cast(logits_mask_t, dtype="float16") - else: - return logits_mask_t - - def forward( - self, - input_ids, - token_type_ids, - attention_mask, - seq_len=None, - max_length=128, - min_length=0, - top_k=4, - top_p=0.0, - num_beams=4, - decode_strategy="sampling", - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - diversity_rate=0.0, - temperature=1.0, - num_return_sequences=1, - length_penalty=0.6, - early_stopping=False, - forced_eos_token_id=None, - position_ids=None, - **model_kwargs - ): - - if seq_len is None: - assert input_ids is not None, "You have to specify either input_ids when generating seq_len." - seq_len = paddle.sum(paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, dtype="int32") - if decode_strategy.startswith("beam_search"): - input_ids, model_kwargs = self.expand_inputs_for_generation( - input_ids, - expand_size=num_beams, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=attention_mask, - seq_len=seq_len, - ) - elif decode_strategy == "sampling": - input_ids, model_kwargs = self.expand_inputs_for_generation( - input_ids, - expand_size=num_return_sequences, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=attention_mask, - seq_len=seq_len, - ) - elif decode_strategy == "greedy_search": - model_kwargs = { - "token_type_ids": token_type_ids, - "position_ids": position_ids, - "attention_mask": attention_mask, - "seq_len": seq_len, - } - else: - raise ValueError("Only greedy search, beam search and sampling are supported. 
") - - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - seq_len = model_inputs.pop("seq_len") - decoder_type_ids = model_inputs.pop("decoder_type_ids") - - ids, output_scores = self.decoding( - input_ids=model_inputs["input_ids"], - attn_mask=model_inputs["attention_mask"], - memory_seq_lens=seq_len, - type_id=model_inputs["token_type_ids"], - decoder_type_id=decoder_type_ids, - beam_size=num_beams, - diversity_rate=diversity_rate, - topk=top_k, - topp=top_p, - decoding_strategy=decode_strategy, - max_out_len=max_length, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - temperature=temperature, - length_penalty=length_penalty, - forced_eos_token_id=forced_eos_token_id, - pos_bias=False, - early_stopping=early_stopping, - min_length=min_length, - ) - if self.trans_out: - if decode_strategy.startswith("beam_search"): - ids = ids.transpose([1, 2, 0]) - else: - ids = ids.transpose([1, 0]) - return ids, output_scores - - generate = forward - - -class FasterBART(BartPretrainedModel): - enable_faster_encoder_func = enable_fast_encoder - - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False, enable_fast_encoder=True): - super(FasterBART, self).__init__(model.config) - self.use_fp16_decoding = use_fp16_decoding - self._model = model - if use_fp16_decoding: - weight_attr = paddle.ParamAttr(initializer=nn.initializer.Assign(model.bart.encoder.embed_tokens.weight)) - model.bart.encoder.embed_tokens = nn.Embedding( - *model.bart.encoder.embed_tokens.weight.shape, weight_attr=weight_attr - ) - self.encoder = model.bart.get_encoder() - self.decoder = model.bart.get_decoder() - self.pad_token_id = model.bart.config["pad_token_id"] - self.enable_fast_encoder = enable_fast_encoder - - self.decoding = InferBartDecoding( - model=self._model, decoding_lib=decoding_lib, use_fp16_decoding=use_fp16_decoding - ) - if self.enable_fast_encoder: - # Must use `enable_fast_encoder` in `__init__` when dygraph to static graph. - self.encoder = FasterBART.enable_faster_encoder_func(self.encoder) - - def get_encoder(self): - return self.encoder - - def get_decoder(self): - return self.decoder - - def forward( - self, - input_ids=None, - encoder_output=None, - seq_len=None, - num_beams=4, - top_k=1, - top_p=0.0, - temperature=1.0, - decode_strategy="beam_search", - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - decoder_start_token_id=None, - min_length=0, - max_length=20, - diversity_rate=0.0, - length_penalty=0.6, - num_return_sequences=1, - early_stopping=False, - forced_eos_token_id=None, - **model_kwargs - ): - - if encoder_output is None: - assert input_ids is not None, "You have to specify either input_ids or encoder_output." - encoder_output = self.prepare_encoder_decoder_kwargs_for_generation(input_ids, model_kwargs)[ - "encoder_output" - ] - if seq_len is None: - assert input_ids is not None, "You have to specify either input_ids when generating seq_len." 
- seq_len = paddle.sum( - paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, keepdim=True, dtype="int32" - ) - if self.use_fp16_decoding: - encoder_output = paddle.cast(encoder_output, "float16") - if decode_strategy.startswith("beam_search") and num_beams > 1: - encoder_output, expanded_kwargs = self.expand_inputs_for_generation( - encoder_output, expand_size=num_beams, seq_len=seq_len - ) - seq_len = expanded_kwargs["seq_len"] - elif decode_strategy == "sampling" and num_return_sequences > 1: - encoder_output, expanded_kwargs = self.expand_inputs_for_generation( - encoder_output, expand_size=num_return_sequences, seq_len=seq_len - ) - seq_len = expanded_kwargs["seq_len"] - if decoder_start_token_id is not None: - bos_token_id = decoder_start_token_id - - return self.decoding( - enc_output=encoder_output, - memory_seq_lens=seq_len, - beam_size=num_beams, - top_k=top_k, - decoding_strategy=decode_strategy, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - top_p=top_p, - max_out_len=max_length, - min_out_len=min_length, - temperature=temperature, - diversity_rate=diversity_rate, - alpha=length_penalty, - early_stopping=early_stopping, - forced_eos_token_id=forced_eos_token_id, - ) - - generate = forward - - -class FasterMBART(MBartPretrainedModel): - enable_faster_encoder_func = enable_fast_encoder - - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False, enable_fast_encoder=False): - super(FasterMBART, self).__init__(model.config) - self.use_fp16_decoding = use_fp16_decoding - self._model = model - if use_fp16_decoding: - weight_attr = paddle.ParamAttr(initializer=nn.initializer.Assign(model.mbart.encoder.embed_tokens.weight)) - model.mbart.encoder.embed_tokens = nn.Embedding( - *model.mbart.encoder.embed_tokens.weight.shape, weight_attr=weight_attr - ) - self.encoder = model.mbart.get_encoder() - self.decoder = model.mbart.get_decoder() - self.pad_token_id = model.mbart.config["pad_token_id"] - self.enable_fast_encoder = enable_fast_encoder - - self.decoding = InferMBartDecoding( - model=self._model, - decoding_lib=decoding_lib, - use_fp16_decoding=use_fp16_decoding, - hidden_act=model.mbart.config["activation_function"], - ) - - if self.enable_fast_encoder: - # Must use `enable_fast_encoder` in `__init__` when dygraph to static graph. 
- self.encoder = FasterMBART.enable_faster_encoder_func(self.encoder) - - def get_encoder(self): - return self.encoder - - def get_decoder(self): - return self.decoder - - def forward( - self, - input_ids=None, - encoder_output=None, - seq_len=None, - forced_bos_token_id=None, - num_beams=4, - top_k=1, - top_p=0.0, - decode_strategy="beam_search_v3", - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - decoder_start_token_id=None, - max_length=256, - diversity_rate=0.0, - length_penalty=0.6, - temperature=1.0, - num_return_sequences=1, - early_stopping=False, - forced_eos_token_id=None, - **model_kwargs - ): - - bos_token_id = bos_token_id if bos_token_id is not None else getattr(self._model, "bos_token_id", None) - eos_token_id = eos_token_id if eos_token_id is not None else getattr(self._model, "eos_token_id", None) - pad_token_id = pad_token_id if pad_token_id is not None else getattr(self._model, "pad_token_id", None) - decoder_start_token_id = ( - decoder_start_token_id - if decoder_start_token_id is not None - else getattr(self._model, "decoder_start_token_id", None) - ) - - # (gongenlei) Not enable_fast_encoder temporarily - if encoder_output is None: - assert input_ids is not None, "You have to specify either input_ids or encoder_output." - encoder_output = self.prepare_encoder_decoder_kwargs_for_generation(input_ids, model_kwargs)[ - "encoder_output" - ] - batch_size = encoder_output.shape[0] - if seq_len is None: - assert input_ids is not None, "You have to specify either input_ids when generating seq_len." - seq_len = paddle.sum(paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, dtype="int32") - if self.use_fp16_decoding: - encoder_output = paddle.cast(encoder_output, "float16") - if decode_strategy.startswith("beam_search") and num_beams > 1: - encoder_output, expanded_kwargs = self.expand_inputs_for_generation( - encoder_output, expand_size=num_beams, seq_len=seq_len - ) - seq_len = expanded_kwargs["seq_len"] - elif decode_strategy == "sampling" and num_return_sequences > 1: - encoder_output, expanded_kwargs = self.expand_inputs_for_generation( - encoder_output, expand_size=num_return_sequences, seq_len=seq_len - ) - seq_len = expanded_kwargs["seq_len"] - if decoder_start_token_id is not None: - bos_token_id = decoder_start_token_id - - if not isinstance(forced_bos_token_id, type(input_ids)): - if forced_bos_token_id is not None: - if decode_strategy == "sampling": - forced_bos_token_id = paddle.full( - [batch_size * num_return_sequences, 1], forced_bos_token_id, dtype="int32" - ) - else: - forced_bos_token_id = paddle.full([batch_size, 1], forced_bos_token_id, dtype="int32") - else: - forced_bos_token_id = paddle.zeros([0]) - elif decode_strategy == "sampling": - num_samples = encoder_output.shape[0] - forced_bos_token_id = paddle.expand(forced_bos_token_id, shape=[num_samples, 1]) - - return self.decoding( - enc_output=encoder_output, - memory_seq_lens=seq_len, - beam_size=num_beams, - trg_word=forced_bos_token_id, - top_k=top_k, - top_p=top_p, - decoding_strategy=decode_strategy, - diversity_rate=diversity_rate, - max_out_len=max_length, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - alpha=length_penalty, - temperature=temperature, - early_stopping=early_stopping, - ) - - generate = forward - - -class FasterGPTJ(GPTJPretrainedModel): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False): - super(FasterGPTJ, self).__init__(model.config) - self._model = model - self.use_fp16_decoding = 
use_fp16_decoding - self.decoding = InferGptJDecoding(model=model, decoding_lib=decoding_lib, use_fp16_decoding=use_fp16_decoding) - - def forward( - self, - input_ids, - seq_len=None, - attention_mask=None, - top_k=4, - top_p=0.0, - min_length=0, - max_length=256, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - forced_eos_token_id=None, - temperature=0, - repetition_penalty=1.0, - decode_strategy="sampling", - num_return_sequences=1, - **model_kwargs - ): - if input_ids.dtype == paddle.int64: - input_ids = paddle.cast(input_ids, "int32") - - # change top_p to zero if not using top_p sampling for FT - if decode_strategy == "greedy_search": - top_p = 0.0 - top_k = 1 - if top_p == 1.0: - top_p = 0.0 - if seq_len is None: - seq_len = paddle.sum(paddle.cast(input_ids != pad_token_id, dtype="int32"), axis=-1, dtype="int32") - - if num_return_sequences > 1: - input_ids, model_kwargs = self.expand_inputs_for_generation( - input_ids, expand_size=num_return_sequences, seq_len=seq_len, attention_mask=attention_mask - ) - seq_len = model_kwargs["seq_len"] - attention_mask = model_kwargs.get("attention_mask", None) - - return self.decoding( - input_ids, - mem_seq_len=seq_len, - attention_mask=attention_mask, - topk=top_k, - topp=top_p, - max_out_len=max_length, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - forced_eos_token_id=forced_eos_token_id, - temperature=temperature, - repetition_penalty=repetition_penalty, - min_length=min_length, - ) - - generate = forward - - -class FasterCodeGen(CodeGenPreTrainedModel): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False): - super(FasterCodeGen, self).__init__(model.config) - self._model = model - self.use_fp16_decoding = use_fp16_decoding - self.decoding = InferGptJDecoding( - model=model, decoding_lib=decoding_lib, use_fp16_decoding=use_fp16_decoding, transpose_qkv=True - ) - - def forward( - self, - input_ids, - seq_len=None, - attention_mask=None, - top_k=4, - top_p=0.0, - min_length=0, - max_length=256, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - forced_eos_token_id=None, - temperature=0, - repetition_penalty=1.0, - decode_strategy="sampling", - num_return_sequences=1, - **model_kwargs - ): - if input_ids.dtype == paddle.int64: - input_ids = paddle.cast(input_ids, "int32") - - # change top_p to zero if not using top_p sampling for FT - if decode_strategy == "greedy_search": - top_p = 0.0 - top_k = 1 - if top_p == 1.0: - top_p = 0.0 - if seq_len is None: - seq_len = paddle.sum(paddle.cast(input_ids != pad_token_id, dtype="int32"), axis=-1, dtype="int32") - - if num_return_sequences > 1: - input_ids, model_kwargs = self.expand_inputs_for_generation( - input_ids, expand_size=num_return_sequences, seq_len=seq_len, attention_mask=attention_mask - ) - seq_len = model_kwargs["seq_len"] - attention_mask = model_kwargs.get("attention_mask", None) - - return self.decoding( - input_ids, - mem_seq_len=seq_len, - attention_mask=attention_mask, - topk=top_k, - topp=top_p, - max_out_len=max_length, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - forced_eos_token_id=forced_eos_token_id, - temperature=temperature, - repetition_penalty=repetition_penalty, - min_length=min_length, - ) - - generate = forward - - -class FasterPegasus(PegasusPretrainedModel): - enable_faster_encoder_func = enable_fast_encoder - - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False, enable_fast_encoder=False, **kwargs): - 
super(FasterPegasus, self).__init__(model.config) - self.use_fp16_decoding = use_fp16_decoding - self._model = model - self.encoder = model.get_encoder() - self.decoder = model.get_decoder() - self.pad_token_id = model.pegasus.config["pad_token_id"] - self.enable_fast_encoder = enable_fast_encoder - self.trans_out = kwargs.get("trans_out", False) - - self.decoding = InferPegasusDecoding( - model=self._model, - decoding_lib=decoding_lib, - use_fp16_decoding=use_fp16_decoding, - hidden_act=model.pegasus.config["activation_function"], - ) - - # TODO(gongenlei): Support faster_encoder - # if self.enable_fast_encoder: - # # Must use `enable_fast_encoder` in `__init__` when dygraph to static graph. - # self.encoder = FasterPegasus.enable_faster_encoder_func(self.encoder) - - def get_encoder(self): - return self.encoder - - def get_decoder(self): - return self.decoder - - def forward( - self, - input_ids=None, - encoder_output=None, - seq_len=None, - min_length=0, - max_length=256, - num_beams=4, - decode_strategy="beam_search_v3", - decoder_start_token_id=None, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - diversity_rate=0.0, - length_penalty=0.6, - top_k=1, - top_p=0.0, - temperature=1.0, - num_return_sequences=1, - early_stopping=False, - forced_bos_token_id=None, - forced_eos_token_id=None, - **model_kwargs - ): - - bos_token_id = bos_token_id if bos_token_id is not None else getattr(self._model, "bos_token_id", None) - eos_token_id = eos_token_id if eos_token_id is not None else getattr(self._model, "eos_token_id", None) - pad_token_id = pad_token_id if pad_token_id is not None else getattr(self._model, "pad_token_id", None) - decoder_start_token_id = ( - decoder_start_token_id - if decoder_start_token_id is not None - else getattr(self._model, "decoder_start_token_id", None) - ) - - if encoder_output is None: - assert input_ids is not None, "You have to specify either input_ids or encoder_output." - encoder_output = self.prepare_encoder_decoder_kwargs_for_generation(input_ids, model_kwargs)[ - "encoder_output" - ] - - if seq_len is None: - assert input_ids is not None, "You have to specify either input_ids when generating seq_len." 
- seq_len = paddle.sum(paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, dtype="int32") - if self.use_fp16_decoding: - encoder_output = paddle.cast(encoder_output, "float16") - if decode_strategy.startswith("beam_search") and num_beams > 1: - encoder_output, expanded_kwargs = self.expand_inputs_for_generation( - encoder_output, expand_size=num_beams, seq_len=seq_len - ) - seq_len = expanded_kwargs["seq_len"] - elif decode_strategy == "sampling" and num_return_sequences > 1: - encoder_output, expanded_kwargs = self.expand_inputs_for_generation( - encoder_output, expand_size=num_return_sequences, seq_len=seq_len - ) - seq_len = expanded_kwargs["seq_len"] - if decoder_start_token_id is not None: - bos_token_id = decoder_start_token_id - - ids = self.decoding( - enc_output=encoder_output, - memory_seq_lens=seq_len, - beam_size=num_beams, - top_k=top_k, - top_p=top_p, - decoding_strategy=decode_strategy, - max_out_len=max_length, - min_out_len=min_length, - diversity_rate=diversity_rate, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - alpha=length_penalty, - temperature=temperature, - early_stopping=early_stopping, - forced_eos_token_id=forced_eos_token_id, - ) - - if self.trans_out: - if decode_strategy.startswith("beam_search"): - ids = ids.transpose([1, 2, 0]) - else: - ids = ids.transpose([1, 0]) - - return ids - - generate = forward - - -class FasterT5(T5PretrainedModel): - def __init__(self, model, decoding_lib=None, use_fp16_decoding=False): - super(FasterT5, self).__init__(model.config) - self.use_fp16_decoding = use_fp16_decoding - self._model = model - if use_fp16_decoding: - weight_attr = paddle.ParamAttr(initializer=nn.initializer.Assign(model.encoder.embed_tokens.weight)) - model.encoder.embed_tokens = nn.Embedding( - *model.encoder.embed_tokens.weight.shape, weight_attr=weight_attr - ) - self.encoder = model.t5.get_encoder() - self.decoder = model.t5.get_decoder() - self.pad_token_id = model.t5.config["pad_token_id"] - - self.decoding = InferT5Decoding( - model=self._model, decoding_lib=decoding_lib, use_fp16_decoding=use_fp16_decoding - ) - - def get_encoder(self): - return self.encoder - - def get_decoder(self): - return self.decoder - - def forward( - self, - input_ids=None, - encoder_output=None, - seq_len=None, - max_length=128, - min_length=0, - top_k=4, - top_p=0.0, - num_beams=4, - decode_strategy="sampling", - decoder_start_token_id=None, - bos_token_id=None, - eos_token_id=None, - pad_token_id=None, - diversity_rate=0.0, - temperature=1.0, - num_return_sequences=1, - length_penalty=0.6, - early_stopping=False, - forced_eos_token_id=None, - **model_kwargs - ): - - bos_token_id = bos_token_id if bos_token_id is not None else getattr(self._model, "bos_token_id", None) - eos_token_id = eos_token_id if eos_token_id is not None else getattr(self._model, "eos_token_id", None) - pad_token_id = pad_token_id if pad_token_id is not None else getattr(self._model, "pad_token_id", None) - - if encoder_output is None: - assert input_ids is not None, "You have to specify either input_ids or encoder_output." - encoder_output = self.prepare_encoder_decoder_kwargs_for_generation(input_ids, model_kwargs)[ - "encoder_output" - ] - - if isinstance(encoder_output, (list, tuple)): - encoder_output = encoder_output[0] - - if seq_len is None: - assert input_ids is not None, "You have to specify either input_ids when generating seq_len." 
- seq_len = paddle.sum(paddle.cast(input_ids != self.pad_token_id, dtype="int32"), axis=-1, dtype="int32") - if self.use_fp16_decoding: - encoder_output = paddle.cast(encoder_output, "float16") - if decode_strategy.startswith("beam_search") and num_beams > 1: - encoder_output, expanded_kwargs = self.expand_inputs_for_generation( - encoder_output, expand_size=num_beams, seq_len=seq_len - ) - seq_len = expanded_kwargs["seq_len"] - elif decode_strategy == "sampling" and num_return_sequences > 1: - encoder_output, expanded_kwargs = self.expand_inputs_for_generation( - encoder_output, expand_size=num_return_sequences, seq_len=seq_len - ) - seq_len = expanded_kwargs["seq_len"] - if decoder_start_token_id is not None: - bos_token_id = decoder_start_token_id - - return self.decoding( - enc_output=encoder_output, - memory_seq_lens=seq_len, - beam_size=num_beams, - top_k=top_k, - top_p=top_p, - decoding_strategy=decode_strategy, - max_out_len=max_length, - diversity_rate=diversity_rate, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - alpha=length_penalty, - temperature=temperature, - early_stopping=early_stopping, - ) - - generate = forward diff --git a/paddlenlp/ops/patches/FasterTransformer/CMakeLists.txt b/paddlenlp/ops/patches/FasterTransformer/CMakeLists.txt deleted file mode 100644 index 9e4e460d265a..000000000000 --- a/paddlenlp/ops/patches/FasterTransformer/CMakeLists.txt +++ /dev/null @@ -1,418 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-cmake_minimum_required(VERSION 3.8 FATAL_ERROR) -project(FasterTransformer LANGUAGES CXX CUDA) - -find_package(CUDA 10.1 REQUIRED) - -find_program(CCACHE_PROGRAM ccache) -if(CCACHE_PROGRAM) - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) -endif() - -option(BUILD_PD "Build in PaddlePaddle mode" ON) -option(BUILD_GPT "Build project with gpt" ON) -option(BUILD_ENCODER "Build project with encoder" ON) - -if(BUILD_ENCODER) - add_definitions(-DBUILD_ENCODER) -endif() - -if(BUILD_GPT) - message(STATUS "Add DBUILD_GPT, requires MPI and NCCL") - add_definitions("-DBUILD_GPT") - set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) - find_package(MPI REQUIRED) - find_package(NCCL REQUIRED) - #if(${NCCL_VERSION} LESS 2.7) - # message(FATAL_ERROR "NCCL_VERSION ${NCCL_VERSION} is less than 2.7") - #endif() - set(CMAKE_MODULE_PATH "") # prevent the bugs for pytorch building -endif() - -set(CXX_STD "17" CACHE STRING "C++ standard") - -set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR}) - -list(APPEND CMAKE_MODULE_PATH ${CUDA_PATH}/lib64) - -if (${CUDA_VERSION} GREATER_EQUAL 11.0) - message(STATUS "Add DCUDA11_MODE") - add_definitions("-DCUDA11_MODE") -endif() - -# profiling -option(USE_NVTX "Whether or not to use nvtx" OFF) -if(USE_NVTX) - message(STATUS "NVTX is enabled.") - add_definitions("-DUSE_NVTX") -endif() - -# setting compiler flags -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall -ldl") - -# if (SM STREQUAL 80 OR -# SM STREQUAL 86 OR -# SM STREQUAL 70 OR -# SM STREQUAL 75 OR -# SM STREQUAL 61 OR -# SM STREQUAL 60) -# #set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\" -rdc=true") -# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\"") -# if (SM STREQUAL 70 OR SM STREQUAL 75 OR SM STREQUAL 80 OR SM STREQUAL 86) -# set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA") -# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA") -# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA") -# endif() -# message("-- Assign GPU architecture (sm=${SM})") - -# else() -# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \ -# -gencode=arch=compute_70,code=\\\"sm_70,compute_70\\\" \ -# -gencode=arch=compute_75,code=\\\"sm_75,compute_75\\\" \ -# ") -# # -rdc=true") -# set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA") -# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA") -# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA") -# message("-- Assign GPU architecture (sm=70,75)") -# endif() - -set(SM_SETS 52 60 61 70 75 80) -set(USING_WMMA False) -set(FIND_SM False) - -foreach(SM_NUM IN LISTS SM_SETS) - string(FIND "${SM}" "${SM_NUM}" SM_POS) - if(SM_POS GREATER -1) - if(FIND_SM STREQUAL False) - set(ENV{TORCH_CUDA_ARCH_LIST} "") - endif() - set(FIND_SM True) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM_NUM},code=\\\"sm_${SM_NUM},compute_${SM_NUM}\\\"") - - if (SM_NUM STREQUAL 70 OR SM_NUM STREQUAL 75 OR SM_NUM STREQUAL 80 OR SM_NUM STREQUAL 86) - set(USING_WMMA True) - endif() - - set(CMAKE_CUDA_ARCHITECTURES ${SM_NUM}) - message("-- Assign GPU architecture (sm=${SM_NUM})") - endif() -endforeach() - -if(USING_WMMA STREQUAL True) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA") - message("-- Use WMMA") -endif() - -if(NOT (FIND_SM STREQUAL True)) - 
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \ - -gencode=arch=compute_70,code=\\\"sm_70,compute_70\\\" \ - -gencode=arch=compute_75,code=\\\"sm_75,compute_75\\\" \ - -gencode=arch=compute_80,code=\\\"sm_80,compute_80\\\" \ - ") - # -rdc=true") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA") - - set(CMAKE_CUDA_ARCHITECTURES 70 75 80) - message("-- Assign GPU architecture (sm=70,75,80)") -endif() - -set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wall -O0") -set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -O0") -# set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall --ptxas-options=-v --resource-usage") -set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall") - -set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std=c++{CXX_STD}") -set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std=gnu++{CXX_STD}") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++${CXX_STD}") - -set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") -# set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3 --ptxas-options=--verbose") -set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3") - -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) - -set(COMMON_HEADER_DIRS - ${PROJECT_SOURCE_DIR} - ${CUDA_PATH}/include -) - -set(COMMON_LIB_DIRS - ${CUDA_PATH}/lib64 -) - -if(NOT PY_CMD) - set(PYTHON_PATH "python" CACHE STRING "Python path") -else() - set(PYTHON_PATH ${PY_CMD} CACHE STRING "Python path") -endif() - -add_definitions(-w) - -if(BUILD_PD) - add_definitions(-DPADDLE_WITH_CUDA) - - if(ON_INFER) - add_definitions(-DPADDLE_ON_INFERENCE) - - link_directories(${COMMON_LIB_DIRS}) - - if(NOT WITH_STATIC_LIB) - add_definitions("-DPADDLE_WITH_SHARED_LIB") - else() - # PD_INFER_DECL is mainly used to set the dllimport/dllexport attribute in dynamic library mode. - # Set it to empty in static library mode to avoid compilation issues. 
- add_definitions("/DPD_INFER_DECL=") - endif() - - macro(safe_set_static_flag) - foreach(flag_var - CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE - CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) - if(${flag_var} MATCHES "/MD") - string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") - endif(${flag_var} MATCHES "/MD") - endforeach(flag_var) - endmacro() - - if(NOT DEFINED PADDLE_LIB) - message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") - endif() - - include_directories("${PADDLE_LIB}/paddle/include/") - set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/") - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include") - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/include") - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") - if (WITH_ONNXRUNTIME) - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include") - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include") - endif() - - link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") - link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/lib") - link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") - link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") - link_directories("${PADDLE_LIB}/paddle/lib") - if (WITH_ONNXRUNTIME) - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib") - include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib") - endif() - - if(WITH_MKL) - set(FLAG_OPENMP "-fopenmp") - endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAG_OPENMP}") - - if (USE_TENSORRT AND WITH_GPU) - set(TENSORRT_ROOT "" CACHE STRING "The root directory of TensorRT library") - if("${TENSORRT_ROOT}" STREQUAL "") - message(FATAL_ERROR "The TENSORRT_ROOT is empty, you must assign it a value with CMake command. 
Such as: -DTENSORRT_ROOT=TENSORRT_ROOT_PATH ") - endif() - set(TENSORRT_INCLUDE_DIR ${TENSORRT_ROOT}/include) - set(TENSORRT_LIB_DIR ${TENSORRT_ROOT}/lib) - endif() - - if (USE_TENSORRT AND WITH_GPU) - include_directories("${TENSORRT_INCLUDE_DIR}") - link_directories("${TENSORRT_LIB_DIR}") - endif() - - if(WITH_MKL) - set(MATH_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mklml") - include_directories("${MATH_LIB_PATH}/include") - set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn") - if(EXISTS ${MKLDNN_PATH}) - include_directories("${MKLDNN_PATH}/include") - set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) - endif() - else() - set(OPENBLAS_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}openblas") - include_directories("${OPENBLAS_LIB_PATH}/include/openblas") - endif() - - else() - execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import paddle; print(paddle.sysconfig.get_include())" - RESULT_VARIABLE _INC_PYTHON_SUCCESS - OUTPUT_VARIABLE _INC_PYTHON_VALUES) - if (NOT _INC_PYTHON_SUCCESS MATCHES 0) - message(FATAL_ERROR "Python config Error.") - endif() - string(REGEX REPLACE ";" "\\\\;" _INC_PYTHON_VALUES ${_INC_PYTHON_VALUES}) - string(REGEX REPLACE "\n" ";" _INC_PYTHON_VALUES ${_INC_PYTHON_VALUES}) - list(GET _INC_PYTHON_VALUES 0 PY_INCLUDE_DIR) - - list(APPEND COMMON_HEADER_DIRS ${PY_INCLUDE_DIR}) - list(APPEND COMMON_HEADER_DIRS ${PY_INCLUDE_DIR}/third_party) - - execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import paddle; print(paddle.sysconfig.get_lib())" - RESULT_VARIABLE _LIB_PYTHON_SUCCESS - OUTPUT_VARIABLE _LIB_PYTHON_VALUES) - if (NOT _LIB_PYTHON_SUCCESS MATCHES 0) - message(FATAL_ERROR "Python config Error.") - endif() - string(REGEX REPLACE ";" "\\\\;" _LIB_PYTHON_VALUES ${_LIB_PYTHON_VALUES}) - string(REGEX REPLACE "\n" ";" _LIB_PYTHON_VALUES ${_LIB_PYTHON_VALUES}) - list(GET _LIB_PYTHON_VALUES 0 PY_LIB_DIR) - list(APPEND COMMON_LIB_DIRS ${PY_LIB_DIR}) - - include_directories(${PY_INCLUDE_DIR}) - include_directories(${PY_INCLUDE_DIR}\third_party) - - endif() -endif() - -if(BUILD_GPT) - list(APPEND COMMON_HEADER_DIRS ${NCCL_INCLUDE_DIRS}) - get_filename_component(NCCL_LIB_DIRS ${NCCL_LIBRARIES} DIRECTORY) - list(APPEND COMMON_LIB_DIRS ${NCCL_LIB_DIRS}) -endif() - -list(APPEND COMMON_HEADER_DIRS ${MPI_INCLUDE_PATH}) - -include_directories( - ${COMMON_HEADER_DIRS} -) - -list(APPEND COMMON_LIB_DIRS /usr/local/mpi/lib) - -link_directories( - ${COMMON_LIB_DIRS} -) - -add_subdirectory(fastertransformer) -add_subdirectory(tools) -# add_subdirectory(sample) - -######################################## - -if(BUILD_GPT) -# Following feature requires cmake 3.15 -# TODO Remove this part or modify such that we can run it under cmake 3.10 -cmake_minimum_required(VERSION 3.15 FATAL_ERROR) -add_library(transformer-static STATIC - $ - $ - $ - $ - $ - $ - # trt_fused_multi_head_attention, gpt_triton_backend have been removed to - # resolve encoder ON_INFER compiling issue. 
- # $ - $ - $ - $ - $ - $ - $ - $) -set_property(TARGET transformer-static PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET transformer-static PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(transformer-static PUBLIC -lcublas -lcudart -lcurand -lnccl -lmpi nvtx_utils) - -add_library(transformer-shared SHARED - $ - $ - $ - $ - $ - $ - # $ - $ - $ - $ - $ - $ - $ - $) - # $) -## add_library(transformer-shared SHARED $) -set_target_properties(transformer-shared PROPERTIES POSITION_INDEPENDENT_CODE ON) -set_target_properties(transformer-shared PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) -set_target_properties(transformer-shared PROPERTIES LINKER_LANGUAGE CXX) -target_link_libraries(transformer-shared PUBLIC ${NCCL_LIBRARIES} ${MPI_LIBRARIES} -lcublas -lcublasLt -lcudart -lcurand ) - -include(GNUInstallDirs) -set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/FasterTransformer) - -include(CMakePackageConfigHelpers) -configure_package_config_file( - ${CMAKE_CURRENT_LIST_DIR}/cmake/FasterTransformerConfig.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/FasterTransformerConfig.cmake - INSTALL_DESTINATION ${INSTALL_CONFIGDIR} -) - -install( - FILES - ${CMAKE_CURRENT_BINARY_DIR}/FasterTransformerConfig.cmake - DESTINATION ${INSTALL_CONFIGDIR} -) - -install( - TARGETS - transformer-shared - EXPORT - transformer-shared-targets - LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer - ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer -) - -install( - EXPORT - transformer-shared-targets - FILE - FasterTransformerTargets.cmake - DESTINATION - ${INSTALL_CONFIGDIR} -) - -file(GLOB_RECURSE HEADER_FILES "*.h" "*.hpp" "*.cuh") -foreach ( file ${HEADER_FILES} ) - file( RELATIVE_PATH rfile ${CMAKE_CURRENT_SOURCE_DIR} ${file} ) - get_filename_component( dir ${rfile} DIRECTORY ) - install( FILES ${file} DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer/include/${dir} ) -endforeach() - - -################################################################################ -add_executable(gpt sample/cpp/gpt_sample.cc ) -target_link_libraries(gpt PUBLIC -lcublas -lcublasLt -lcudart -lcurand -lnccl -lmpi transformer-static) -# target_link_libraries(gpt PUBLIC -lcublas -lcublasLt -lcudart -lcurand -lnccl -lmpi decoder decoding) -export( - EXPORT - transformer-shared-targets - FILE - ${CMAKE_CURRENT_BINARY_DIR}/FasterTransformerTargets.cmake - NAMESPACE - TritonCore:: -) - -export(PACKAGE FasterTransformer) - -endif() # BUILD_GPT diff --git a/paddlenlp/ops/patches/FasterTransformer/fastertransformer/CMakeLists.txt b/paddlenlp/ops/patches/FasterTransformer/fastertransformer/CMakeLists.txt deleted file mode 100644 index 12fbd83615b5..000000000000 --- a/paddlenlp/ops/patches/FasterTransformer/fastertransformer/CMakeLists.txt +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-cmake_minimum_required(VERSION 3.8) -add_subdirectory(cuda) -add_subdirectory(utils) -add_subdirectory(gemm_test) -if(BUILD_TF) - add_subdirectory(tf_op) -endif() - -if(BUILD_PYT) - add_subdirectory(th_op) -endif() - -# add_subdirectory(trt_fused_multihead_attention) -# add_subdirectory(triton_backend) diff --git a/paddlenlp/ops/patches/FasterTransformer/fastertransformer/bert_encoder_transformer.h b/paddlenlp/ops/patches/FasterTransformer/fastertransformer/bert_encoder_transformer.h deleted file mode 100644 index 8bd5738f00ee..000000000000 --- a/paddlenlp/ops/patches/FasterTransformer/fastertransformer/bert_encoder_transformer.h +++ /dev/null @@ -1,1123 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * BERT Encoder transformer - **/ - -#pragma once - -#include -#include "fastertransformer/cuda/cuda_int8_kernels.h" -#include "fastertransformer/cuda/cuda_kernels.h" -#include "fastertransformer/cuda/open_attention.h" -#include "fastertransformer/gemm_test/encoder_gemm_func.h" -#include "fastertransformer/gemm_test/encoder_igemm_func.h" -#include "fastertransformer/utils/allocator.h" -#include "fastertransformer/utils/common_structure.h" -#include "fastertransformer/utils/functions.h" - -namespace fastertransformer { - -template -class BertInitParam { -public: - const T *from_tensor = nullptr; - const T *to_tensor = nullptr; - - AttentionWeight self_attention; - const T *attr_mask = nullptr; - LayerNormWeight self_layernorm; - - FFNWeight ffn; - LayerNormWeight ffn_layernorm; - - T *transformer_out; - cublasHandle_t cublas_handle = nullptr; - cublasLtHandle_t cublaslt_handle = nullptr; - cudaStream_t stream = 0; - - const int *sequence_id_offset = nullptr; - int valid_word_num = -1; - int layer_idx = 0; - int layer_num = 12; - - // Part 1: - // First 80 are for activation amaxs. 
For each activation amax, there are 4 - // values: amax, amax/127.0f, amax/127.0f/127.0f, 127.0f/amax -- input_amax - // 0-3 , Q_aftergemm_amax 4-7, Qbias_amax 8-11, K_aftergemm_amax 12-15, - // Kbias_amax 16-19, V_aftergemm_amax 20-23, Vbias_amax 24-27, bmm1_amax - // 28-31, Softmax_amax 32-35, bmm2_amax 36-39, Proj_aftergemm_scale 40-43, - // ProjBiasNorm_amax 44-47, FC1_aftergemm_amax 48-51, F1Bias_amax 52-55, - // FC2_aftergemm_amax 56-59, F2BiasNorm_amax 60-63, reserve 64-79 - // Part 2: - // Kernel amaxs, for each kernel amax list, there are output_channel values : - // query_weight_amax_list, key_weight_amax_list, value_weight_amax_list, - // proj_weight_amax_list, FC1_weight_amax_list, FC2_weight_amax_list - // Part 3: - // Int8 gemm deQFactor list (8 values): Q_deQ_scale, K_deQ_scale, - // V_deQ_scale, bmm1_deQ_scale, bmm2_deQ_scale, FC0_deQ_scale, FC1_deQ_scale, - // FC2_deQ_scale - // Part 4: - // Amax used in trt fused mha kernel (3 values) : QKVbias_amax, Softmax_amax, - // bmm2_amax - const float *amaxList = nullptr; - const int *trt_seqlen_offset = nullptr; - int trt_seqlen_size = -1; -}; - -template class MultiHeadAttention_> -class BertEncoderTransformerTraits; - -template