Skip to content

[xpu] add xpu custom ops support for llama2-7b #8515

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions csrc/xpu/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# ernie-bot-custom-ops
ernie bot 昆仑自定义算子库。

## 快速开始
### 构建 XDNN plugin 和 Paddle 自定义算子库
```
$ cd src
$ wget https://baidu-kunlun-product.su.bcebos.com/KL-SDK/klsdk-dev/20240429/xdnn-ubuntu_x86_64.tar.gz
$ wget https://baidu-kunlun-product.su.bcebos.com/KL-SDK/klsdk-dev/20240429/xre-ubuntu_x86_64.tar.gz
$ wget -q --no-check-certificate https://klx-sdk-release-public.su.bcebos.com/xtdk_llvm15/dev/2.7.98.2/xtdk-llvm15-ubuntu1604_x86_64.tar.gz
$ tar -xf xdnn-ubuntu_x86_64.tar.gz
$ tar -xf xre-ubuntu_x86_64.tar.gz
$ tar -xf xtdk-llvm15-ubuntu1604_x86_64.tar.gz
$ export WORKSPACE=$(pwd)
$ export XDNN_PATH=${WORKSPACE}/xdnn-ubuntu_x86_64/
$ export XRE_PATH=${WORKSPACE}/xre-ubuntu_x86_64/
$ export CLANG_PATH=${WORKSPACE}/xtdk-llvm15-ubuntu1604_x86_64/
$ bash ./cmake_build.sh
```

## 测试
### 运行 get_padding_offset_v2 单测
```
$ cd test/python
$ python test_get_padding_offset_v2.py
```

## 如何贡献
```
$ pip install pre-commit==2.17.0
$ pre-commit install
```
29 changes: 29 additions & 0 deletions csrc/xpu/src/cmake_build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/bash

# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Abort the whole build on the first command that fails.
set -e

# Toolchain locations consumed by the plugin build — export these before
# running this script (see csrc/xpu/README.md for where to download them):
# export XDNN_PATH=Paddle/build/third_party/xpu/src/extern_xpu/xdnn-ubuntu_x86_64/ # <path_to_xdnn>
# export XRE_PATH=Paddle/build/third_party/xpu/src/extern_xpu/xre-ubuntu_x86_64/ # <path_to_xre>
# export CLANG_PATH=xtdk-ubuntu_1604_x86_64 # <path_to_xtdk>
# export HOST_SYSROOT=/opt/compiler/gcc-8.2/bin/gcc # <path_to_gcc>

# Build the XDNN plugin first; the custom-op extension links against it.
cd plugin
./cmake_build.sh
cd -

# Replace any previously installed paddlenlp_ops with the fresh build.
python -m pip uninstall paddlenlp_ops -y
python setup.py install
94 changes: 94 additions & 0 deletions csrc/xpu/src/get_padding_offset_v2.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <paddle/phi/backends/xpu/xpu_context.h>
#include "paddle/extension.h"
#include "xpu/plugin.h"

// Builds the bookkeeping tensors for running on a padding-free token stream:
// the flattened input ids (x_remove_padding), per-token padding offsets, a
// device-side copy of cum_offsets, and the cumulative sequence-length arrays
// cu_seqlens_q / cu_seqlens_k (each of length batch + 1).
std::vector<paddle::Tensor> GetPaddingOffset(const paddle::Tensor& input_ids,
                                             const paddle::Tensor& cum_offsets,
                                             const paddle::Tensor& token_num,
                                             const paddle::Tensor& seq_len) {
  // Fetch the XPU device context for the currently selected device.
  phi::XPUPlace device_place(phi::backends::xpu::GetXPUCurrentDeviceId());
  auto* pool_ctx =
      paddle::experimental::DeviceContextPool::Instance().Get(device_place);
  auto* xpu_ctx = static_cast<const phi::XPUContext*>(pool_ctx);

  const int batch_size = seq_len.shape()[0];
  const int max_seq_len = input_ids.shape()[1];

  // Work on a device-side copy of cum_offsets; token_num must be read on the
  // host because it sizes the outputs allocated below.
  auto cum_offsets_out = cum_offsets.copy_to(cum_offsets.place(), false);
  auto token_num_cpu = token_num.copy_to(paddle::CPUPlace(), false);
  const int total_tokens = token_num_cpu.data<int64_t>()[0];

  // All outputs live on the same device as the input ids.
  const auto& out_place = input_ids.place();
  auto x_remove_padding =
      paddle::full({total_tokens}, 0, paddle::DataType::INT64, out_place);
  auto padding_offset =
      paddle::full({total_tokens}, 0, paddle::DataType::INT32, out_place);
  auto cu_seqlens_q =
      paddle::full({batch_size + 1}, 0, paddle::DataType::INT32, out_place);
  auto cu_seqlens_k =
      paddle::full({batch_size + 1}, 0, paddle::DataType::INT32, out_place);

  const int ret = baidu::xpu::api::plugin::get_padding_offset(
      xpu_ctx->x_context(),
      padding_offset.data<int>(),
      cum_offsets_out.data<int>(),
      cu_seqlens_q.data<int>(),
      cu_seqlens_k.data<int>(),
      x_remove_padding.data<int64_t>(),
      input_ids.data<int64_t>(),
      cum_offsets.data<int>(),
      seq_len.data<int>(),
      max_seq_len,
      batch_size);
  PD_CHECK(ret == 0, "baidu::xpu::api::plugin::get_padding_offset failed.");

  return {x_remove_padding,
          cum_offsets_out,
          padding_offset,
          cu_seqlens_q,
          cu_seqlens_k};
}

// Infers output shapes for get_padding_offset_v2.
//
// Outputs, in registration order: x_remove_padding and padding_offset have a
// run-time length (the total token count, unknown at compile time, hence -1);
// cum_offsets_out has one entry per sequence; cu_seqlens_q and cu_seqlens_k
// each hold bsz + 1 cumulative entries.
//
// Fix: the original computed an unused local `seq_len` from
// input_ids_shape[1]; it has been removed (unused-variable warning).
std::vector<std::vector<int64_t>> GetPaddingOffsetInferShape(
    const std::vector<int64_t>& input_ids_shape,
    const std::vector<int64_t>& cum_offsets_shape,
    const std::vector<int64_t>& token_num_shape,
    const std::vector<int64_t>& seq_len_shape) {
  const int64_t bsz = seq_len_shape[0];
  return {{-1}, {bsz}, {-1}, {bsz + 1}, {bsz + 1}};
}

// Infers output dtypes for get_padding_offset_v2: x_remove_padding keeps the
// input-id dtype, and the four remaining outputs all share seq_len's dtype.
std::vector<paddle::DataType> GetPaddingOffsetInferDtype(
    const paddle::DataType& input_ids_dtype,
    const paddle::DataType& cum_offsets_dtype,
    const paddle::DataType& token_num_dtype,
    const paddle::DataType& seq_len_dtype) {
  std::vector<paddle::DataType> out_dtypes(5, seq_len_dtype);
  out_dtypes[0] = input_ids_dtype;
  return out_dtypes;
}

// Registers the custom op `get_padding_offset_v2` with Paddle, wiring the
// kernel, shape-inference, and dtype-inference functions defined above.
// NOTE(review): the registered Inputs order places "token_num" before
// "cum_offsets", while the kernel signature GetPaddingOffset(input_ids,
// cum_offsets, token_num, seq_len) lists cum_offsets first — custom-op
// inputs are bound positionally, so confirm the Python call site passes
// arguments in this registered order.
PD_BUILD_OP(get_padding_offset_v2)
    .Inputs({"input_ids", "token_num", "cum_offsets", "seq_len"})
    .Outputs({"x_remove_padding",
              "cum_offsets_out",
              "padding_offset",
              "cu_seqlens_q",
              "cu_seqlens_k"})
    .SetKernelFn(PD_KERNEL(GetPaddingOffset))
    .SetInferShapeFn(PD_INFER_SHAPE(GetPaddingOffsetInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(GetPaddingOffsetInferDtype));
108 changes: 108 additions & 0 deletions csrc/xpu/src/get_token_penalty_multi_scores_v2.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <paddle/phi/backends/xpu/xpu_context.h>
#include "paddle/extension.h"
#include "paddle/phi/core/enforce.h"
#include "xpu/plugin.h"

// Adjusts next-token `logits` in place by dispatching to the XPU plugin
// kernel `token_penalty_multi_scores`, which consumes the per-sequence
// penalty/frequency/presence scores, temperatures, the bad-token list, and
// the cur_len/min_len/eos_token_id tensors. Supports float16 and float32
// logits (score tensors share the logits dtype; ids and lengths are int64,
// temperatures are float32).
//
// Fix: the enforce message previously claimed "bsz <= 1024" while the check
// enforces bs <= 640; the message now matches the enforced limit.
void TokenPenaltyMultiScores(const paddle::Tensor& pre_ids,
                             const paddle::Tensor& logits,
                             const paddle::Tensor& penalty_scores,
                             const paddle::Tensor& frequency_scores,
                             const paddle::Tensor& presence_scores,
                             const paddle::Tensor& temperatures,
                             const paddle::Tensor& bad_tokens,
                             const paddle::Tensor& cur_len,
                             const paddle::Tensor& min_len,
                             const paddle::Tensor& eos_token_id) {
  // Fetch the XPU device context for the currently selected device.
  phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
  auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
  auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
  int64_t bs = logits.shape()[0];
  // Upper bound enforced by the plugin call path; message kept in sync with
  // the enforced limit (640).
  PADDLE_ENFORCE_LE(
      bs,
      640,
      phi::errors::InvalidArgument(
          "Only support bsz <= 640, but received bsz is %d", bs));
  int64_t length = logits.shape()[1];             // vocabulary size
  int64_t length_id = pre_ids.shape()[1];         // generated-id history length
  int64_t length_bad_words = bad_tokens.shape()[0];
  int64_t end_length = eos_token_id.shape()[0];
  // Dispatch on the logits dtype; the plugin is templated on the XPU type.
  switch (logits.type()) {
    case paddle::DataType::FLOAT16: {
      using XPUType = typename XPUTypeTrait<float16>::Type;
      typedef paddle::float16 data_t;
      int r = baidu::xpu::api::plugin::token_penalty_multi_scores(
          xpu_ctx->x_context(),
          pre_ids.data<int64_t>(),
          // logits is the op's in-place output, hence the const_cast.
          reinterpret_cast<XPUType*>(
              const_cast<data_t*>(logits.data<data_t>())),
          reinterpret_cast<const XPUType*>(penalty_scores.data<data_t>()),
          reinterpret_cast<const XPUType*>(frequency_scores.data<data_t>()),
          reinterpret_cast<const XPUType*>(presence_scores.data<data_t>()),
          temperatures.data<float>(),
          cur_len.data<int64_t>(),
          min_len.data<int64_t>(),
          eos_token_id.data<int64_t>(),
          bad_tokens.data<int64_t>(),
          bs,
          length,
          length_id,
          end_length,
          length_bad_words);
      PD_CHECK(r == 0, "xpu::plugin::token_penalty_multi_scores failed.");
    } break;
    case paddle::DataType::FLOAT32: {
      int r = baidu::xpu::api::plugin::token_penalty_multi_scores(
          xpu_ctx->x_context(),
          pre_ids.data<int64_t>(),
          // logits is the op's in-place output, hence the const_cast.
          const_cast<float*>(logits.data<float>()),
          penalty_scores.data<float>(),
          frequency_scores.data<float>(),
          presence_scores.data<float>(),
          temperatures.data<float>(),
          cur_len.data<int64_t>(),
          min_len.data<int64_t>(),
          eos_token_id.data<int64_t>(),
          bad_tokens.data<int64_t>(),
          bs,
          length,
          length_id,
          end_length,
          length_bad_words);
      PD_CHECK(r == 0, "xpu::plugin::token_penalty_multi_scores failed.");
    } break;
    default:
      PD_THROW(
          "NOT supported data type. "
          "Only float16 and float32 are supported. ");
      break;
  }
}

// Registers the custom op `get_token_penalty_multi_scores_v2` with Paddle.
// The op has no separately allocated output: "logits" is modified in place
// and exposed to callers under the alias "logits_out" via SetInplaceMap.
PD_BUILD_OP(get_token_penalty_multi_scores_v2)
    .Inputs({"pre_ids",
             "logits",
             "penalty_scores",
             "frequency_scores",
             "presence_scores",
             "temperatures",
             "bad_tokens",
             "cur_len",
             "min_len",
             "eos_token_id"})
    .Outputs({"logits_out"})
    .SetInplaceMap({{"logits", "logits_out"}})
    .SetKernelFn(PD_KERNEL(TokenPenaltyMultiScores));
Loading
Loading