
Commit 7ab4a98

[LLM][NPU] reformat npu scripts
1 parent 0331b4c


7 files changed: +129, -13 lines


llm/npu/llama/README.md

Lines changed: 3 additions & 4 deletions
@@ -86,15 +86,15 @@ python -m pip install -e .
 # The PaddleNLP repo ships a set of Ascend-specific fused operators so that users get the most compressed inference cost
 cd csrc/npu
 python setup.py build bdist_wheel
-pip install dist/paddlenlp_ops-0.0.0-cp39-cp39-linux_aarch64.whl
+pip install dist/paddlenlp_ops-0.0.0-cp39-cp39-linux_x86_64.whl
 cd -
 ```

 ### (2) Data preparation (this takes about 2-5 minutes)
 SFT is the fine-tuning strategy; we provide a demo advertisement-generation dataset for easy debugging
 ```
 # Fine-tuning: for convenient testing, we also provide the advertisement-generation dataset, ready to use:
-cd llm/llama/npu
+cd llm/npu/llama
 wget https://bj.bcebos.com/paddlenlp/datasets/examples/AdvertiseGen.tar.gz
 tar -zxvf AdvertiseGen.tar.gz
 ```
@@ -145,8 +145,7 @@ bash llama_npu_sft_N1C8.sh
 ```
 To guarantee the most compressed inference cost, we use a static-graph implementation, so a static-graph model must be exported from the dynamic-graph model produced by training. Run the following command to export:
 ```
-cd ../..
-bash export_npu.sh ./llama/npu/output/sft_bf16_llama_N1C8/ ./inference
+bash export_npu.sh ./output/sft_bf16_llama_N1C8/ ./inference
 ```
 Finally, we run inference with the static-graph model:
 ```
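In this repo the export itself is handled by predict/export_model.py, driven through export_npu.sh below. As a minimal, self-contained sketch of the same dynamic-to-static idea (the TinyNet layer, shapes, and output path are invented for illustration, not the repo's actual model):

```
# Hedged sketch: trace a dynamic-graph Layer and save a static-graph
# inference model (.pdmodel / .pdiparams). TinyNet and the paths are toys.
import paddle


class TinyNet(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.linear = paddle.nn.Linear(16, 16)

    def forward(self, x):
        return self.linear(x)


net = TinyNet()
net.eval()
# input_spec tells the tracer the (batch-agnostic) input signature.
paddle.jit.save(
    net,
    "./inference/tiny",  # writes tiny.pdmodel + tiny.pdiparams
    input_spec=[paddle.static.InputSpec(shape=[None, 16], dtype="float32")],
)
```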

llm/export_npu.sh renamed to llm/npu/llama/export_npu.sh

Lines changed: 7 additions & 2 deletions
@@ -14,11 +14,16 @@

 set -x

-src_path=${1:-"./llama/npu/output/sft_bf16_llama_N1C8/"}
+src_path=${1:-".npu/llama/output/sft_bf16_llama_N1C8/"}
 dst_path=${2:-"./inference"}

 source /usr/local/Ascend/ascend-toolkit/set_env.sh
 source /usr/local/Ascend/atb/set_env.sh

+src_path=`realpath $src_path`
+dst_path=`realpath $dst_path`
+cd ../../
+
 export PYTHONPATH=../:$PYTHONPATH
-python export_model.py --model_name_or_path ${src_path} --inference_model --output_path ${dst_path} --dtype float16 --device npu --block_attn
+python predict/export_model.py --model_name_or_path ${src_path} --inference_model --output_path ${dst_path} --dtype float16 --device npu --block_attn
+cd -

llm/npu/llama/export_utils.py

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+import numpy as np
+import paddle
+from tqdm import tqdm
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_path", default="inference/model", help="The directory of exported model.")
+    return parser.parse_args()
+
+
+def trans_weight(var):
+    shape = var.desc.shape()
+    new_shape = [shape[1], shape[0]]
+    var.desc.set_shape(new_shape)
+
+    var_data = np.array(var.get_value())
+    var.get_value().set(var_data.T, paddle.CPUPlace())
+
+
+def convert_dequant_scale(var):
+    deq_scale = np.array(var.get_value()).astype(np.float32)
+    new_deq_scale = np.stack([deq_scale.reshape(-1, 1), np.zeros_like(deq_scale).reshape(-1, 1)], axis=-1).reshape(-1)
+    var.get_value().set(np.frombuffer(new_deq_scale.tobytes(), dtype=np.int64), paddle.CPUPlace())
+
+
+def process_params(model_path):
+    paddle.enable_static()
+    exe = paddle.static.Executor(paddle.CPUPlace())
+
+    prog = paddle.static.Program()
+    startup_prog = paddle.static.Program()
+    scope = paddle.static.Scope()
+    with paddle.base.scope_guard(scope):
+        with paddle.base.program_guard(prog, startup_prog):
+            [program, feed_target_names, fetch_targets] = paddle.static.io.load_inference_model(model_path, exe)
+
+            feed_targets = []
+            for var in program.list_vars():
+                if var.name in feed_target_names:
+                    feed_targets.append(var)
+
+            block = program.global_block()
+
+            for op in tqdm(block.ops, desc="processing the linear layer for NPU"):
+                if op.type == "matmul_v2":
+                    w_name = op.input_arg_names[-1]
+                    if w_name.endswith("qkv_weight") and not op.attr("trans_y"):
+                        op._set_attr("trans_y", True)
+                        w = block.var(w_name)
+                        trans_weight(w)
+                    elif w_name.endswith("out_proj_weight") and not op.attr("trans_y"):
+                        op._set_attr("trans_y", True)
+                        w = block.var(w_name)
+                        trans_weight(w)
+                    elif w_name.endswith("ffn1_weight") and not op.attr("trans_y"):
+                        op._set_attr("trans_y", True)
+                        w = block.var(w_name)
+                        trans_weight(w)
+                    elif w_name.endswith("ffn2_weight") and not op.attr("trans_y"):
+                        op._set_attr("trans_y", True)
+                        w = block.var(w_name)
+                        trans_weight(w)
+                    elif w_name == "llama_lm_head_0.w_0" and not op.attr("trans_y"):
+                        op._set_attr("trans_y", True)
+                        w = block.var(w_name)
+                        trans_weight(w)
+
+            for var_name in tqdm(block.vars, desc="processing the dequant layer for NPU"):
+                if var_name.endswith("qkv_out_scale"):
+                    var = block.var(var_name)
+                    convert_dequant_scale(var)
+                elif var_name.endswith("linear_out_scale"):
+                    var = block.var(var_name)
+                    convert_dequant_scale(var)
+                elif var_name.endswith("ffn1_out_scale"):
+                    var = block.var(var_name)
+                    convert_dequant_scale(var)
+                elif var_name.endswith("ffn2_out_scale"):
+                    var = block.var(var_name)
+                    convert_dequant_scale(var)
+
+            paddle.static.save_inference_model(
+                model_path, feed_targets, fetch_targets, exe, program=program, skip_prune_program=True
+            )
+
+
+def main():
+    args = parse_arguments()
+    process_params(args.model_path)
+
+
+if __name__ == "__main__":
+    main()
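The least obvious step above is convert_dequant_scale: each float32 dequantization scale is interleaved with a 0.0 and every resulting 8-byte pair is reinterpreted as one int64, presumably the packed layout the Ascend fused kernels consume. A self-contained numpy sketch of just that packing, with toy scale values:

```
# Toy reproduction of the packing in convert_dequant_scale (numpy only).
import numpy as np

deq_scale = np.array([0.5, 0.25, 2.0], dtype=np.float32)

# Interleave each scale with a float32 zero: [s0, 0, s1, 0, s2, 0].
packed = np.stack(
    [deq_scale.reshape(-1, 1), np.zeros_like(deq_scale).reshape(-1, 1)],
    axis=-1,
).reshape(-1)

# Reinterpret every (scale, 0.0) byte pair as a single int64.
as_int64 = np.frombuffer(packed.tobytes(), dtype=np.int64)
assert as_int64.shape == deq_scale.shape  # one int64 per original scale
print(as_int64)
```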

llm/npu/llama/llama_npu_lora_N1C8.sh

Lines changed: 2 additions & 2 deletions
@@ -30,12 +30,12 @@ export MC2_Recompute=1

 source /usr/local/Ascend/ascend-toolkit/set_env.sh
 export PYTHONPATH=../../../:$PYTHONPATH
-ps aux | grep finetune_generation.py | grep -v grep | awk '{print $2}' | xargs kill -9
+ps aux | grep run_finetune.py | grep -v grep | awk '{print $2}' | xargs kill -9

 python -u -m paddle.distributed.launch \
     --devices "0,1,2,3,4,5,6,7" \
     --log_dir "./lora_bf16_llama_N1C8" \
-    ../../finetune_generation.py \
+    ../../run_finetune.py \
     --device "npu" \
     --model_name_or_path "meta-llama/Llama-2-13b-chat" \
     --dataset_name_or_path "data/" \

llm/npu/llama/llama_npu_sft_N1C8.sh

Lines changed: 2 additions & 2 deletions
@@ -30,12 +30,12 @@ export MC2_Recompute=1

 source /usr/local/Ascend/ascend-toolkit/set_env.sh
 export PYTHONPATH=../../../:$PYTHONPATH
-ps aux | grep finetune_generation.py | grep -v grep | awk '{print $2}' | xargs kill -9
+ps aux | grep run_finetune.py | grep -v grep | awk '{print $2}' | xargs kill -9

 python -u -m paddle.distributed.launch \
     --devices "0,1,2,3,4,5,6,7" \
     --log_dir "./sft_bf16_llama_N1C8" \
-    ../../finetune_generation.py \
+    ../../run_finetune.py \
     --device "npu" \
     --model_name_or_path "meta-llama/Llama-2-13b" \
     --dataset_name_or_path "data/" \

llm/predict_npu.sh renamed to llm/npu/llama/predict_npu.sh

Lines changed: 4 additions & 2 deletions
@@ -18,6 +18,8 @@ model_path=${1:-"./inference"}

 source /usr/local/Ascend/ascend-toolkit/set_env.sh
 source /usr/local/Ascend/atb/set_env.sh
-
+model_path=`realpath $model_path`
+cd ../../
 export PYTHONPATH=../:$PYTHONPATH
-python predictor.py --model_name_or_path ${model_path} --inference_model --dtype "float16" --mode "static" --block_attn --device npu
+python predict/predictor.py --model_name_or_path ${model_path} --inference_model --dtype "float16" --mode "static" --block_attn --device npu
+cd -

llm/predict/export_model.py

Lines changed: 1 addition & 1 deletion
@@ -101,7 +101,7 @@ def main():
     validate_pdmodel(export_args.output_path, predictor_args.model_prefix, predictor_args.device)

     if predictor_args.device == "npu":
-        from llama.npu.export_utils import process_params
+        from npu.llama.export_utils import process_params

         process_params(os.path.join(export_args.output_path, predictor_args.model_prefix))
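Since export_utils.py has its own argparse entry point, with --model_path defaulting to "inference/model", the NPU post-processing can also be run standalone after an export. A sketch, assuming llm/ as the working directory:

```
# Hedged sketch: call the post-processing step directly instead of through
# export_model.py. "./inference/model" mirrors the script's --model_path
# default; adjust it to your actual export prefix.
from npu.llama.export_utils import process_params

process_params("./inference/model")
```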
