
Commit d96f9bb

Author: 395822456@qq.com
Commit message: update docs and shell
1 parent f5c1503 commit d96f9bb

File tree: 11 files changed (+715, -35 lines)


csrc/cpu/0001-patch-fp16-and-bf16.patch

Lines changed: 280 additions & 0 deletions
Large diffs are not rendered by default.

csrc/cpu/0001-patch-fp32.patch

Lines changed: 302 additions & 0 deletions
Large diffs are not rendered by default.

csrc/cpu/README.md

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+# cpu-custom-ops
+
+## Quick start
+# Build the CPU custom-op library
+```
+$ Prerequisite: the machine must support AVX instructions
+$ cd src
+$ bash setup.sh
+```
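Before running setup.sh it can help to confirm the AVX prerequisite up front. Below is a minimal Python sketch, assuming a Linux host where /proc/cpuinfo lists the CPU flags; setup.sh itself only branches on avx512_bf16, so treating these two flags as the minimum requirement is an assumption, not something this commit states.

```
# Check the CPU flags the build cares about (avx512_bf16 selects the bf16/fp16 patch in setup.sh).
def cpu_flags():
    with open("/proc/cpuinfo") as f:  # Linux-only assumption
        for line in f:
            if line.startswith("flags"):
                return set(line.split(":", 1)[1].split())
    return set()

flags = cpu_flags()
print("avx:", "avx" in flags)                  # the README's stated prerequisite
print("avx512_bf16:", "avx512_bf16" in flags)  # enables the fp16/bf16 patch path
```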

csrc/cpu/setup.sh

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# 1. download XFT
+if [ ! -d xFasterTransformer ]; then
+    git clone --branch v1.7.2 https://github.com/intel/xFasterTransformer.git
+fi
+
+# 2. copy patch
+cd xFasterTransformer
+git checkout .
+cd ..
+
+if lscpu | grep -q "avx512_bf16"; then
+    echo "apply bf16 and fp16."
+    if [ ! -f 0001-patch-fp16-and-bf16.patch ]; then
+        echo "Error: 0001-patch-fp16-and-bf16.patch does not exist."
+        exit 1
+    fi
+    # apply patch
+    cp ./0001-patch-fp16-and-bf16.patch ./xFasterTransformer/paddle.patch
+else
+    echo "apply fp32"
+    if [ ! -f 0001-patch-fp32.patch ]; then
+        echo "Error: 0001-patch-fp32.patch does not exist."
+        exit 1
+    fi
+    cp ./0001-patch-fp32.patch ./xFasterTransformer/paddle.patch
+fi
+
+# 3. apply patch
+cd xFasterTransformer
+git apply paddle.patch
+
+# 4. build xFasterTransformer
+sh ./3rdparty/prepare_oneccl.sh
+source ./3rdparty/oneccl/build/_install/env/setvars.sh
+source /workspace/cpu_repo/xFasterTransformer/3rdparty/oneccl/build/_install/env/setvars.sh
+
+rm -rf build
+mkdir build && cd build
+cmake ..
+make -j
+
+# xft
+export XFT_HEADER_DIR=$PWD
+export XFT_LIB_DIR=$XFT_HEADER_DIR/build
+export LD_LIBRARY_PATH=$XFT_LIB_DIR:$LD_LIBRARY_PATH
+
+# setup cpu paddle_nlp ops
+cd ..
+python ./src/setup_cpu.py install
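setup_cpu.py reads the XFT paths from the environment, so a quick pre-flight before the final install step can catch a missing export. This is a sketch only: the library file name libxfastertransformer.so is an assumption about the xFasterTransformer build output, not something this commit states.

```
# Pre-flight check before "python ./src/setup_cpu.py install".
import os

for var in ("XFT_HEADER_DIR", "XFT_LIB_DIR"):
    value = os.environ.get(var)
    print(f"{var} = {value}")
    if not value or not os.path.isdir(value):
        raise SystemExit(f"{var} is unset or not a directory; re-run the exports from setup.sh")

lib = os.path.join(os.environ["XFT_LIB_DIR"], "libxfastertransformer.so")  # assumed file name
print("xFT library found" if os.path.exists(lib) else f"warning: {lib} not found")
```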

csrc/cpu/src/set_value_by_flags.cc

Lines changed: 1 addition & 2 deletions
@@ -27,10 +27,9 @@ void set_value_by_flag_and_id(const bool *stop_flags, int64_t *pre_ids_all, cons
 
 std::vector<paddle::Tensor> SetValueByFlagsAndIdx(const paddle::Tensor& pre_ids_all, const paddle::Tensor& pre_ids_now, const paddle::Tensor& step_idx, const paddle::Tensor& stop_flags) {
     std::vector<int64_t> pre_ids_all_shape = pre_ids_all.shape();
-    auto stop_flags_out = stop_flags.copy_to(stop_flags.place(), false); // gpu -> gpu
+    auto stop_flags_out = stop_flags.copy_to(stop_flags.place(), false);
 
     int bs = stop_flags.shape()[0];
-    // length of max_len
     int length = pre_ids_all_shape[1];
 
     set_value_by_flag_and_id(stop_flags.data<bool>(), const_cast<int64_t*>(pre_ids_all.data<int64_t>()), pre_ids_now.data<int64_t>(), step_idx.data<int64_t>(), bs, length);

csrc/cpu/src/setup.py renamed to csrc/cpu/src/setup_cpu.py

Lines changed: 36 additions & 14 deletions
@@ -14,6 +14,7 @@
 
 import os
 import site
+import subprocess
 
 from paddle.utils.cpp_extension import CppExtension, setup
 
@@ -32,21 +33,45 @@ def build_extensions(self):
         super().build_extensions()
 
 
+def check_avx512_bf16_support():
+    try:
+        result = subprocess.run(
+            "lscpu | grep avx512_bf16",
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            shell=True,
+        )
+
+        if "avx512_bf16" in result.stdout.lower():
+            return True
+        else:
+            return False
+
+    except Exception as e:
+        print(f"Error checking AVX512 support: {e}")
+        return False
+
+
 # cc flags
 paddle_extra_compile_args = [
     "-std=c++17",
     "-shared",
     "-fPIC",
     "-Wno-parentheses",
     "-DPADDLE_WITH_CUSTOM_KERNEL",
-    "-DAVX512_FP32_WEIGHT_ONLY_FP16=true",
-    "-DAVX512_FP32_WEIGHT_ONLY_INT8=true",
-    # bf16 machine
-    # "-DAVX512_BF16_WEIGHT_ONLY_BF16=true",
-    # "-DAVX512_BF16_WEIGHT_ONLY_BF16=true",
 ]
 
-
+if check_avx512_bf16_support():
+    paddle_extra_compile_args += [
+        "-DAVX512_BF16_WEIGHT_ONLY_BF16=true",
+    ]
+else:
+    paddle_extra_compile_args += [
+        "-DAVX512_FP32_WEIGHT_ONLY_FP16=true",
+        "-DAVX512_FP32_WEIGHT_ONLY_INT8=true",
+    ]
 # include path
 site_packages_path = site.getsitepackages()
 paddle_custom_kernel_include = [os.path.join(path, "paddle", "include") for path in site_packages_path]
@@ -55,10 +80,7 @@ def build_extensions(self):
 XFT_LIBRARY_DIR = os.environ["XFT_LIB_DIR"]
 
 # include path third_party
-compile_third_party_path = os.path.join(os.environ["PADDLE_BINARY_DIR"], "third_party")
 paddle_custom_kernel_include += [
-    os.path.join(compile_third_party_path, "install/gflags/include"),  # gflags
-    os.path.join(compile_third_party_path, "install/glog/include"),  # glog
     os.path.join(XFT_INCLUDE_DIR, "include"),  # glog
     os.path.join(XFT_INCLUDE_DIR, "src/common"),  # src
     os.path.join(XFT_INCLUDE_DIR, "src/kernel"),  # src
@@ -79,11 +101,11 @@ def build_extensions(self):
 
 custom_kernel_dot_module = CppExtension(
     sources=[
-        "xft_llama_layer.cc",
-        "../../generation/save_with_output.cc",
-        "token_penalty_multi_scores.cc",
-        "stop_generation_multi_ends.cc",
-        "set_value_by_flags.cc",
+        "./src/xft_llama_layer.cc",
+        "../generation/save_with_output.cc",
+        "./src/token_penalty_multi_scores.cc",
+        "./src/stop_generation_multi_ends.cc",
+        "./src/set_value_by_flags.cc",
     ],
     include_dirs=paddle_custom_kernel_include,
     library_dirs=paddle_custom_kernel_library_dir,

csrc/cpu/src/xft_llama_layer.cc

Lines changed: 11 additions & 7 deletions
@@ -44,9 +44,9 @@ std::vector<paddle::Tensor> InvokeLLaMALayer(
     auto out = paddle::empty_like(input);
     auto batchSize = input.shape()[0];
     auto inputSeqLen = input.shape()[1];
-    auto past_seq_len=pastSeqLen.data<int64_t>()[0];
-    auto cur_seq_len=currentSeqLen.data<int64_t>()[0];
-    auto step_id=step.data<int64_t>()[0];
+    auto past_seq_len = pastSeqLen.data<int64_t>()[0];
+    auto cur_seq_len = currentSeqLen.data<int64_t>()[0];
+    auto step_id = step.data<int64_t>()[0];
     auto input_ptr = reinterpret_cast<const void *>(input.data<float>());
     auto ln1Gamma_ptr = reinterpret_cast<const float *>(ln1Gamma.data<float>());
     auto qkvWeight_ptr = reinterpret_cast<const void *>(qkvWeight.data<float>());
@@ -64,12 +64,16 @@ std::vector<paddle::Tensor> InvokeLLaMALayer(
         xft_data_type = xft::DataType::bf16;
     }
     auto xft_act_type = xft::ActivationType::SILU;
-    if (activation == "silu") {
-        xft_act_type = xft::ActivationType::SILU;
+    if (activation == "relu") {
+        xft_act_type = xft::ActivationType::RELU;
+    } else if (activation == "gelu") {
+        xft_act_type = xft::ActivationType::GELU;
+    } else if (activation == "swiglu") {
+        xft_act_type = xft::ActivationType::SWIGLU;
     }
     auto xft_norm_type = xft::NormType::RMS;
-    if (normType == "rmsnorm") {
-        xft_norm_type = xft::NormType::RMS;
+    if (normType == "layernorm") {
+        xft_norm_type = xft::NormType::LN;
     }
     invokeLayerLLaMA(xft_data_type,
                      xft_act_type,

llm/docs/inference.md

Lines changed: 9 additions & 0 deletions
@@ -97,6 +97,9 @@ cd ./paddlenlp/csrc/xpu/src && sh cmake_build.sh
 # Reference command for dynamic-graph model inference
 python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16
 
+# Reference for dynamic-graph inference on a CPU device with AVX instructions
+python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float32 --avx_mode --avx_type "fp16" --device "cpu"
+
 # Reference for dynamic-graph inference with PrefixTuning
 python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --export_precache true --prefix_path ./checkpoints/llama_prefix_ckpts
 
@@ -117,6 +120,9 @@ python ./predict/predictor.py --model_name_or_path checkpoints/llama_ptq_ckpts -
 # Reference command for dynamic-to-static export
 python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16
 
+# Reference for dynamic-to-static export on CPU with AVX instructions
+python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --avx_mode --avx_type "fp16" --device "cpu"
+
 # Reference for dynamic-to-static export with PrefixTuning
 python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --export_precache true
 
@@ -137,6 +143,9 @@ python ./predict/export_model.py --model_name_or_path checkpoints/llama_ptq_ckpt
 # Reference command for static-graph inference
 python ./predict/predictor.py --model_name_or_path ./inference --inference_model --quant_type weight_only_int8 --dtype "float16" --mode "static"
 
+# Reference for static-graph inference on CPU with AVX instructions
+python ./predict/predictor.py --model_name_or_path ./inference --inference_model --avx_mode --avx_type "fp16" --dtype "float32" --mode "static" --device "cpu"
+
 # Reference for static-graph inference with PrefixTuning
 python ./predict/predictor.py --model_name_or_path ./inference --inference_model --quant_type weight_only_int8 --dtype "float16" --mode "static" --export_precache true --prefix_path ./checkpoints/llama_prefix_ckpts
 

llm/predict/predictor.py

Lines changed: 3 additions & 0 deletions
@@ -401,6 +401,9 @@ def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer):
         self.arange_tensor_encoder = paddle.arange(config.total_max_length, dtype=self.dtype)
 
         if config.device == "cpu" and config.avx_model:
+            assert (
+                "llama" in self.architectures and self.model_config.model_type != "llama-img2txt"
+            ), "avx_mode only supports llama for now"
            self.cache_kvs = None
            self.attention_mask = None
            self.tgt_generation_mask = None

paddlenlp/experimental/transformers/fused_transformer_layers.py

Lines changed: 1 addition & 5 deletions
@@ -1093,11 +1093,7 @@ def init_weight_shape(self, config):
         self.gate_weight_shape = [self.embed_dim, self.dim_feedforward]
         self.up_weight_shape = [self.embed_dim, self.dim_feedforward]
         self.down_weight_shape = [self.dim_feedforward, self.embed_dim]
-        self.qkv_weight_shape = (
-            [self.embed_dim, (self.num_heads + 2 * self.kv_num_heads) * self.head_dim]
-            if config.trans_qkvw
-            else [self.embed_dim, (self.num_heads + 2 * self.kv_num_heads) * self.head_dim]
-        )
+        self.qkv_weight_shape = [self.embed_dim, (self.num_heads + 2 * self.kv_num_heads) * self.head_dim]
         self.linear_weight_shape = [self.num_heads * self.head_dim, self.embed_dim]
         self.ffn1_weight_shape = (
             [self.embed_dim, self.dim_feedforward * 2]
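For a concrete sense of the fused QKV shape computed above, here is a small worked example with hypothetical Llama-7B-style dimensions (embed_dim=4096, num_heads=kv_num_heads=32, head_dim=128 are illustrative values, not taken from this diff).

```
# Illustrative arithmetic for qkv_weight_shape with hypothetical dimensions.
embed_dim, num_heads, kv_num_heads, head_dim = 4096, 32, 32, 128
qkv_weight_shape = [embed_dim, (num_heads + 2 * kv_num_heads) * head_dim]
print(qkv_weight_shape)  # [4096, 12288]: Q, K and V projections fused along the output dim
```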

paddlenlp/experimental/transformers/llama/modeling.py

Lines changed: 0 additions & 7 deletions
@@ -161,10 +161,6 @@ def __init__(self, config: LlamaConfig):
     def set_transformer_block(self, transformer_config, max_position_embeddings, compute_type):
         self.transformer_block = FusedMultiTransformerAvx(transformer_config, max_position_embeddings, compute_type)
 
-    def remove_padding(self, input_ids, seq_lens_this_time):
-        pass
-
-    # This function is a little different from prepare_input_ids_for_generation in paddlenlp/transformers/generation/utils.py
     @staticmethod
     def prepare_input_ids_for_generation(bos_token_id, encoder_output=None):
         batch_size = 1
@@ -193,7 +189,6 @@ def forward(
             raise ValueError("You have to specify either input_ids or inputs_embeds")
 
         # genereate a fake input_ids according to inputs_embeds
-        # this is usually occurred in img2txt multimodal model when first enter into this forward function.
         if input_ids is None and inputs_embeds is not None:
             input_ids = self.prepare_input_ids_for_generation(self.config.bos_token_id, inputs_embeds)
         if inputs_embeds is not None:
@@ -295,13 +290,11 @@ def set_state_dict(self, state_dict):
             concated_ffn1_weight = np.concatenate(
                 [unfused_state_dict["mlp.gate_proj.weight"], unfused_state_dict["mlp.up_proj.weight"]], axis=-1
             )
-            # ffn1_weight_tensor = paddle.to_tensor(concated_ffn1_weight)
             gate_up_list = split_fn(concated_ffn1_weight)
             gate_weight_tensor = paddle.to_tensor(gate_up_list[0])
             up_weight_tensor = paddle.to_tensor(gate_up_list[1])
 
             qkv_weight_tensor = paddle.to_tensor(concated_qkv_weight)
-            # no need to process quantized weights offline
             self.transformer_block.qkv_weights[idx].set_value(
                 qkv_weight_tensor.cast(self.transformer_block.qkv_weights[idx].dtype)
             )
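The set_state_dict path above concatenates the gate and up projections into one ffn1 weight and then splits them back with split_fn before assigning each tensor. Below is a tiny NumPy sketch of that round trip; the shapes and the even two-way split are illustrative assumptions, not the actual split_fn.

```
import numpy as np

# Stand-ins for mlp.gate_proj.weight and mlp.up_proj.weight (hypothetical small shapes).
gate = np.zeros((8, 16))
up = np.ones((8, 16))

concated_ffn1_weight = np.concatenate([gate, up], axis=-1)       # shape (8, 32)
gate_back, up_back = np.split(concated_ffn1_weight, 2, axis=-1)  # stand-in for split_fn

assert (gate_back == gate).all() and (up_back == up).all()
```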
