Commit 9c40b5d

Author: 395822456@qq.com
Commit message: update docs and shell
1 parent f5c1503 commit 9c40b5d

File tree

11 files changed: +701 -30 lines

csrc/cpu/0001-patch-fp16-and-bf16.patch

Lines changed: 280 additions & 0 deletions
Large diffs are not rendered by default.

csrc/cpu/0001-patch-fp32.patch

Lines changed: 302 additions & 0 deletions
Large diffs are not rendered by default.

csrc/cpu/README.md

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+# cpu-custom-ops
+
+## Quick Start
+# Build the CPU custom ops library
+```
+# Prerequisite: the machine must support AVX instructions
+$ cd csrc/cpu
+$ bash setup.sh
+```
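
A quick pre-flight check can mirror the AVX detection that setup.sh performs before you kick off the build. This is a minimal sketch (not part of the commit) assuming a Linux host with lscpu on the PATH:

```python
# Minimal pre-flight sketch; mirrors the avx512_bf16 probe in csrc/cpu/setup.sh.
import subprocess

flags = subprocess.run(["lscpu"], capture_output=True, text=True).stdout
print("avx supported:        ", "avx" in flags)
print("avx512_bf16 supported:", "avx512_bf16" in flags)  # selects the bf16/fp16 patch
```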

csrc/cpu/setup.sh

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# 1. download XFT
+if [ ! -d xFasterTransformer ]; then
+    git clone --branch v1.7.2 https://github.com/intel/xFasterTransformer.git
+fi
+
+# 2. copy the matching patch
+cd xFasterTransformer
+git checkout .
+cd ..
+
+if lscpu | grep -q "avx512_bf16"; then
+    echo "apply bf16 and fp16."
+    if [ ! -f 0001-patch-fp16-and-bf16.patch ]; then
+        echo "Error: 0001-patch-fp16-and-bf16.patch does not exist."
+        exit 1
+    fi
+    # apply patch
+    cp ./0001-patch-fp16-and-bf16.patch ./xFasterTransformer/paddle.patch
+else
+    echo "apply fp32."
+    if [ ! -f 0001-patch-fp32.patch ]; then
+        echo "Error: 0001-patch-fp32.patch does not exist."
+        exit 1
+    fi
+    cp ./0001-patch-fp32.patch ./xFasterTransformer/paddle.patch
+fi
+
+# 3. apply patch
+cd xFasterTransformer
+git apply paddle.patch
+
+# 4. build xFasterTransformer
+sh ./3rdparty/prepare_oneccl.sh
+source ./3rdparty/oneccl/build/_install/env/setvars.sh
+
+rm -rf build
+mkdir build && cd build
+cmake ..
+make -j
+cd ..
+
+# xft environment for the custom-op build
+export XFT_HEADER_DIR=$PWD
+export XFT_LIB_DIR=$XFT_HEADER_DIR/build
+export LD_LIBRARY_PATH=$XFT_LIB_DIR:$LD_LIBRARY_PATH
+
+# setup cpu paddle_nlp ops
+cd ..
+python ./src/setup_cpu.py install
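
Once the script finishes, a short sanity check can confirm the exported environment before the custom-op build is attempted. A sketch under the assumption that xFasterTransformer's CMake build drops a libxfastertransformer*.so into build/ (the name pattern is not verified by this commit):

```python
# Hedged post-build check; the shared-library name pattern is an assumption.
import os

hdr = os.environ.get("XFT_HEADER_DIR", "")
lib = os.environ.get("XFT_LIB_DIR", "")
assert os.path.isdir(os.path.join(hdr, "include")), "XFT headers not found"
sos = [f for f in os.listdir(lib) if f.startswith("libxfastertransformer")]
print("xFT shared libraries:", sos or "none found; check the make output")
```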

csrc/cpu/src/set_value_by_flags.cc

Lines changed: 0 additions & 1 deletion
@@ -30,7 +30,6 @@ std::vector<paddle::Tensor> SetValueByFlagsAndIdx(const paddle::Tensor& pre_ids_
     auto stop_flags_out = stop_flags.copy_to(stop_flags.place(), false);  // gpu -> gpu
 
     int bs = stop_flags.shape()[0];
-    // length of max_len
     int length = pre_ids_all_shape[1];
 
     set_value_by_flag_and_id(stop_flags.data<bool>(), const_cast<int64_t*>(pre_ids_all.data<int64_t>()), pre_ids_now.data<int64_t>(), step_idx.data<int64_t>(), bs, length);

csrc/cpu/src/setup.py renamed to csrc/cpu/src/setup_cpu.py

Lines changed: 9 additions & 9 deletions
@@ -41,7 +41,7 @@ def build_extensions(self):
         "-DPADDLE_WITH_CUSTOM_KERNEL",
         "-DAVX512_FP32_WEIGHT_ONLY_FP16=true",
         "-DAVX512_FP32_WEIGHT_ONLY_INT8=true",
-        # bf16 machine
+        # AVX512_FP16 optimization
         # "-DAVX512_BF16_WEIGHT_ONLY_BF16=true",
         # "-DAVX512_BF16_WEIGHT_ONLY_BF16=true",
     ]
@@ -55,10 +55,10 @@ def build_extensions(self):
     XFT_LIBRARY_DIR = os.environ["XFT_LIB_DIR"]
 
-    # include path third_party
-    compile_third_party_path = os.path.join(os.environ["PADDLE_BINARY_DIR"], "third_party")
+    # include paths now come from the xFasterTransformer checkout only
     paddle_custom_kernel_include += [
-        os.path.join(compile_third_party_path, "install/gflags/include"),  # gflags
-        os.path.join(compile_third_party_path, "install/glog/include"),  # glog
         os.path.join(XFT_INCLUDE_DIR, "include"),  # xft public headers
         os.path.join(XFT_INCLUDE_DIR, "src/common"),  # src
         os.path.join(XFT_INCLUDE_DIR, "src/kernel"),  # src
@@ -79,11 +79,11 @@ def build_extensions(self):
 
     custom_kernel_dot_module = CppExtension(
         sources=[
-            "xft_llama_layer.cc",
-            "../../generation/save_with_output.cc",
-            "token_penalty_multi_scores.cc",
-            "stop_generation_multi_ends.cc",
-            "set_value_by_flags.cc",
+            "./src/xft_llama_layer.cc",
+            "../generation/save_with_output.cc",
+            "./src/token_penalty_multi_scores.cc",
+            "./src/stop_generation_multi_ends.cc",
+            "./src/set_value_by_flags.cc",
         ],
         include_dirs=paddle_custom_kernel_include,
         library_dirs=paddle_custom_kernel_library_dir,
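
Note that the relative source paths in the last hunk resolve only when the build is launched from csrc/cpu, which is exactly what setup.sh does. A small sketch of the resolution (the checkout location is hypothetical):

```python
# How the relative sources in setup_cpu.py resolve when run from csrc/cpu.
import os

run_dir = "/path/to/PaddleNLP/csrc/cpu"  # hypothetical checkout location
for s in ["./src/xft_llama_layer.cc", "../generation/save_with_output.cc"]:
    print(os.path.normpath(os.path.join(run_dir, s)))
# -> csrc/cpu/src/xft_llama_layer.cc and csrc/generation/save_with_output.cc
```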

csrc/cpu/src/xft_llama_layer.cc

Lines changed: 11 additions & 7 deletions
@@ -44,9 +44,9 @@ std::vector<paddle::Tensor> InvokeLLaMALayer(
     auto out = paddle::empty_like(input);
     auto batchSize = input.shape()[0];
     auto inputSeqLen = input.shape()[1];
-    auto past_seq_len=pastSeqLen.data<int64_t>()[0];
-    auto cur_seq_len=currentSeqLen.data<int64_t>()[0];
-    auto step_id=step.data<int64_t>()[0];
+    auto past_seq_len = pastSeqLen.data<int64_t>()[0];
+    auto cur_seq_len = currentSeqLen.data<int64_t>()[0];
+    auto step_id = step.data<int64_t>()[0];
     auto input_ptr = reinterpret_cast<const void *>(input.data<float>());
     auto ln1Gamma_ptr = reinterpret_cast<const float *>(ln1Gamma.data<float>());
     auto qkvWeight_ptr = reinterpret_cast<const void *>(qkvWeight.data<float>());
@@ -64,12 +64,16 @@ std::vector<paddle::Tensor> InvokeLLaMALayer(
         xft_data_type = xft::DataType::bf16;
     }
     auto xft_act_type = xft::ActivationType::SILU;
-    if (activation == "silu") {
-        xft_act_type = xft::ActivationType::SILU;
+    if (activation == "relu") {
+        xft_act_type = xft::ActivationType::RELU;
+    } else if (activation == "gelu") {
+        xft_act_type = xft::ActivationType::GELU;
+    } else if (activation == "swiglu") {
+        xft_act_type = xft::ActivationType::SWIGLU;
     }
     auto xft_norm_type = xft::NormType::RMS;
-    if (normType == "rmsnorm") {
-        xft_norm_type = xft::NormType::RMS;
+    if (normType == "layernorm") {
+        xft_norm_type = xft::NormType::LN;
     }
     invokeLayerLLaMA(xft_data_type,
                      xft_act_type,

llm/docs/inference.md

Lines changed: 9 additions & 0 deletions
@@ -97,6 +97,9 @@ cd ./paddlenlp/csrc/xpu/src && sh cmake_build.sh
 # Dynamic-graph model inference (reference command)
 python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16
 
+# CPU dynamic-graph inference with AVX instructions (reference command)
+python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float32 --avx_mode --avx_type "fp16" --device "cpu"
+
 # PrefixTuning dynamic-graph inference (reference command)
 python ./predict/predictor.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --dtype float16 --export_precache true --prefix_path ./checkpoints/llama_prefix_ckpts
 
@@ -117,6 +120,9 @@ python ./predict/predictor.py --model_name_or_path checkpoints/llama_ptq_ckpts -
 # Dynamic-to-static export (reference command)
 python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16
 
+# CPU dynamic-to-static export with AVX instructions (reference command)
+python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --avx_mode --avx_type "fp16" --device "cpu"
+
 # PrefixTuning dynamic-to-static export (reference command)
 python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --export_precache true
 
@@ -137,6 +143,9 @@ python ./predict/export_model.py --model_name_or_path checkpoints/llama_ptq_ckpt
 # Static-graph inference (reference command)
 python ./predict/predictor.py --model_name_or_path ./inference --inference_model --quant_type weight_only_int8 --dtype "float16" --mode "static"
 
+# CPU static-graph inference with AVX instructions (reference command)
+python ./predict/predictor.py --model_name_or_path ./inference --inference_model --avx_mode --avx_type "fp16" --dtype "float32" --mode "static" --device "cpu"
+
 # PrefixTuning static-graph inference (reference command)
 python ./predict/predictor.py --model_name_or_path ./inference --inference_model --quant_type weight_only_int8 --dtype "float16" --mode "static" --export_precache true --prefix_path ./checkpoints/llama_prefix_ckpts

llm/predict/predictor.py

Lines changed: 17 additions & 1 deletion
@@ -28,7 +28,11 @@
 import paddle.incubate.multiprocessing as mp
 from paddle.base.framework import in_cinn_mode, in_pir_executor_mode
 from paddle.distributed import fleet
-from utils.utils import (
+
+current_dir = os.path.dirname(os.path.abspath(__file__))
+module_dir = os.path.join(current_dir, "..", "utils")
+sys.path.append(module_dir)
+from utils import (
     dybatch_preprocess,
     get_alibi_slopes,
     get_default_max_decoding_length,
@@ -401,6 +405,9 @@ def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer):
         self.arange_tensor_encoder = paddle.arange(config.total_max_length, dtype=self.dtype)
 
         if config.device == "cpu" and config.avx_model:
+            assert (
+                "llama" in self.architectures and self.model_config.model_type != "llama-img2txt"
+            ), "avx_mode only supports llama for now"
             self.cache_kvs = None
             self.attention_mask = None
             self.tgt_generation_mask = None
@@ -690,6 +697,8 @@ def _create_predictor(self, predictor_args: PredictorArgument):
         elif predictor_args.device == "cpu" and predictor_args.avx_model:
             config.disable_gpu()
             config.switch_ir_optim(False)
+            config.use_optimized_model(True)
+            config.enable_new_ir(True)
         else:
             device_id = int(os.environ.get("FLAGS_selected_gpus", 0))
             config.enable_use_gpu(100, device_id)
@@ -1671,14 +1680,21 @@ def benchmark(predictor, predictor_args, model_args):
     for _ in range(warmup_time):
         for bs, batch_source_text in enumerate(batch_benchmark_texts):
             outputs = predictor.predict(batch_source_text)
+    import paddle.profiler as profiler
 
+    p = profiler.Profiler(
+        targets=[profiler.ProfilerTarget.CPU], scheduler=(3, 7), on_trace_ready=profiler.export_chrome_tracing("./log")
+    )
     print("***********Start Speed Test**********")
     start = time.perf_counter()
     output_tokens = 0
+    p.start()
     for _ in range(test_time):
         for bs, batch_source_text in enumerate(batch_benchmark_texts):
             outputs = predictor.predict(batch_source_text)
             output_tokens += sum([len(output) for output in outputs])
+        p.step()
+    p.stop()
     end = time.perf_counter()
     print("Avg Elapse time is: ", (end - start) / test_time)
     print("Output tokens is: ", output_tokens)

paddlenlp/experimental/transformers/fused_transformer_layers.py

Lines changed: 1 addition & 5 deletions
@@ -1093,11 +1093,7 @@ def init_weight_shape(self, config):
         self.gate_weight_shape = [self.embed_dim, self.dim_feedforward]
         self.up_weight_shape = [self.embed_dim, self.dim_feedforward]
         self.down_weight_shape = [self.dim_feedforward, self.embed_dim]
-        self.qkv_weight_shape = (
-            [self.embed_dim, (self.num_heads + 2 * self.kv_num_heads) * self.head_dim]
-            if config.trans_qkvw
-            else [self.embed_dim, (self.num_heads + 2 * self.kv_num_heads) * self.head_dim]
-        )
+        self.qkv_weight_shape = [self.embed_dim, (self.num_heads + 2 * self.kv_num_heads) * self.head_dim]
         self.linear_weight_shape = [self.num_heads * self.head_dim, self.embed_dim]
         self.ffn1_weight_shape = (
             [self.embed_dim, self.dim_feedforward * 2]
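
Both branches of the removed ternary computed the same value, so the shape is now unconditional. A worked example with hypothetical grouped-query-attention dimensions (not taken from this commit) shows why the kv heads enter the fused width:

```python
# Q contributes num_heads * head_dim columns; K and V contribute
# kv_num_heads * head_dim each, hence the (num_heads + 2 * kv_num_heads) factor.
embed_dim, num_heads, kv_num_heads, head_dim = 4096, 32, 8, 128
qkv_weight_shape = [embed_dim, (num_heads + 2 * kv_num_heads) * head_dim]
print(qkv_weight_shape)  # [4096, 6144]
```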

paddlenlp/experimental/transformers/llama/modeling.py

Lines changed: 0 additions & 7 deletions
@@ -161,10 +161,6 @@ def __init__(self, config: LlamaConfig):
     def set_transformer_block(self, transformer_config, max_position_embeddings, compute_type):
         self.transformer_block = FusedMultiTransformerAvx(transformer_config, max_position_embeddings, compute_type)
 
-    def remove_padding(self, input_ids, seq_lens_this_time):
-        pass
-
-    # This function is a little different from prepare_input_ids_for_generation in paddlenlp/transformers/generation/utils.py
     @staticmethod
     def prepare_input_ids_for_generation(bos_token_id, encoder_output=None):
         batch_size = 1
@@ -193,7 +189,6 @@ def forward(
             raise ValueError("You have to specify either input_ids or inputs_embeds")
 
         # generate a fake input_ids according to inputs_embeds
-        # this usually occurs in img2txt multimodal models when first entering this forward function
         if input_ids is None and inputs_embeds is not None:
             input_ids = self.prepare_input_ids_for_generation(self.config.bos_token_id, inputs_embeds)
         if inputs_embeds is not None:
@@ -295,13 +290,11 @@ def set_state_dict(self, state_dict):
             concated_ffn1_weight = np.concatenate(
                 [unfused_state_dict["mlp.gate_proj.weight"], unfused_state_dict["mlp.up_proj.weight"]], axis=-1
             )
-            # ffn1_weight_tensor = paddle.to_tensor(concated_ffn1_weight)
             gate_up_list = split_fn(concated_ffn1_weight)
             gate_weight_tensor = paddle.to_tensor(gate_up_list[0])
             up_weight_tensor = paddle.to_tensor(gate_up_list[1])
 
             qkv_weight_tensor = paddle.to_tensor(concated_qkv_weight)
-            # no need to process quantized weights offline
             self.transformer_block.qkv_weights[idx].set_value(
                 qkv_weight_tensor.cast(self.transformer_block.qkv_weights[idx].dtype)
             )
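
set_state_dict concatenates the gate and up projections and then splits them back out via split_fn. A small numpy sketch of that round trip, assuming split_fn halves the concatenated weight along its last axis:

```python
# Fuse-then-split round trip as in set_state_dict; shapes are stand-ins.
import numpy as np

gate = np.ones((16, 44), dtype="float32")   # stand-in for mlp.gate_proj.weight
up = np.zeros((16, 44), dtype="float32")    # stand-in for mlp.up_proj.weight
ffn1 = np.concatenate([gate, up], axis=-1)  # concated_ffn1_weight, shape (16, 88)
gate_back, up_back = np.split(ffn1, 2, axis=-1)
assert np.array_equal(gate_back, gate) and np.array_equal(up_back, up)
```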

0 commit comments