
Commit ac28f8f

Author: 395822456@qq.com committed "update"
1 parent d96f9bb commit ac28f8f

File tree

5 files changed: +5 −7 lines changed

csrc/cpu/README.md

Lines changed: 0 additions & 1 deletion

````diff
@@ -4,6 +4,5 @@
 # Build the cpu custom operator library
 ```
 $ Prerequisite: the machine must support AVX instructions
-$ cd src
 $ bash setup.sh
 ```
````

csrc/cpu/setup.sh

Lines changed: 2 additions & 3 deletions

```diff
@@ -43,18 +43,17 @@ fi
 cd xFasterTransformer
 git apply paddle.patch
 
-#4. build xFasterTransformer
+# #4. build xFasterTransformer
 sh ./3rdparty/prepare_oneccl.sh
 source ./3rdparty/oneccl/build/_install/env/setvars.sh
-source /workspace/cpu_repo/xFasterTransformer/3rdparty/oneccl/build/_install/env/setvars.sh
 
 rm -rf build
 mkdir build && cd build
 cmake ..
 make -j
 
 #xft
-export XFT_HEADER_DIR=$PWD
+export XFT_HEADER_DIR=$PWD
 export XFT_LIB_DIR=$XFT_HEADER_DIR/build
 export LD_LIBRARY_PATH=$XFT_LIB_DIR:$LD_LIBRARY_PATH
 
```
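The setup.sh above leaves three environment variables behind (XFT_HEADER_DIR, XFT_LIB_DIR, LD_LIBRARY_PATH). A minimal sketch of that environment as a preflight check, with `os.environ` standing in for the shell exports; the path layout is an assumption mirroring the script, not part of this commit:

```python
import os

def xft_env(header_dir: str) -> dict:
    """Derive the variables setup.sh exports from the checkout directory."""
    lib_dir = os.path.join(header_dir, "build")
    old = os.environ.get("LD_LIBRARY_PATH", "")
    return {
        "XFT_HEADER_DIR": header_dir,
        "XFT_LIB_DIR": lib_dir,
        # lib dir must be first on the search path so the freshly built
        # libraries shadow any system-wide copies
        "LD_LIBRARY_PATH": lib_dir + (":" + old if old else ""),
    }
```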

llm/docs/inference.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -121,7 +121,7 @@ python ./predict/predictor.py --model_name_or_path checkpoints/llama_ptq_ckpts -
 python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16
 
 # CPU dynamic-to-static export with AVX instructions, for reference
-python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --avx_mode --avx_type "fp16" --device "cpu"
+python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float32 --avx_mode --avx_type "fp16" --device "cpu"
 
 # PrefixTuning dynamic-to-static export command, for reference
 python ./predict/export_model.py --model_name_or_path meta-llama/Llama-2-7b-chat --inference_model --output_path ./inference --dtype float16 --export_precache true
```

llm/predict/predictor.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -398,7 +398,6 @@ def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer):
 
         self.dtype = config.dtype or self.model_config
         self.pre_ids = paddle.full([config.batch_size, config.total_max_length], -1, dtype="int64")
-        self.arange_tensor_encoder = paddle.arange(config.total_max_length, dtype=self.dtype)
 
         if config.device == "cpu" and config.avx_model:
             assert (
@@ -409,6 +408,7 @@ def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer):
             self.tgt_generation_mask = None
             self.tgt_pos = None
         else:
+            self.arange_tensor_encoder = paddle.arange(config.total_max_length, dtype=self.dtype)
             self.cache_kvs = [paddle.zeros(shape, dtype=self.dtype) for shape in self.cache_kvs_shape]
             self.num_layers, self.num_attention_heads, self.head_dim = (
                 len(self.cache_kvs),
```
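The predictor.py change moves the `arange_tensor_encoder` allocation into the non-AVX branch, so the AVX CPU path no longer builds a buffer it never reads. A minimal sketch of that conditional-allocation pattern (not the project's code; plain lists stand in for paddle tensors, and `init_buffers` is a hypothetical helper):

```python
def init_buffers(device: str, avx_model: bool, total_max_length: int) -> dict:
    """Allocate only the buffers the chosen execution path actually uses."""
    buffers = {}
    if device == "cpu" and avx_model:
        # AVX path: mask/position buffers are unused, so leave them unset
        buffers["tgt_generation_mask"] = None
        buffers["tgt_pos"] = None
    else:
        # non-AVX path: build the encoder position range up front
        buffers["arange_tensor_encoder"] = list(range(total_max_length))
    return buffers
```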

paddlenlp/experimental/transformers/llama/modeling.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -143,7 +143,7 @@ def __init__(self, config: LlamaConfig):
             self.hidden_size,
             self.num_attention_heads,
             self.intermediate_size,
-            activation="swiglu",
+            activation="silu",
             num_layers=config.num_hidden_layers,
             ln_scale_attrs=ln_scale_attrs,
             qkv_weight_attrs=qkv_weight_attrs,
```
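For context on the `"swiglu"` → `"silu"` rename: SiLU is the scalar activation x·sigmoid(x), while SwiGLU is the gated variant that applies SiLU to a gate projection and multiplies it into the value path. A plausible reading of the change is that the fused layer expects the base activation name and applies the gating itself, but that is an assumption; the scalar definitions below are standard:

```python
import math

def silu(x: float) -> float:
    # SiLU (a.k.a. swish): x * sigmoid(x)
    return x * (1.0 / (1.0 + math.exp(-x)))

def swiglu(x: float, gate: float) -> float:
    # SwiGLU: SiLU of the gate, multiplied elementwise into the value path
    return silu(gate) * x
```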
