Skip to content

Commit cb1e378

Browse files
lixcliMangodadada
authored and committed
[Inference] Add a8w8(fp8) a8w8c8(int8) quant_type support (PaddlePaddle#9032)
* 1. add a8w8(fp8) a8w8c8(int8) quant_type support 2. add llama3.1 and qwen2 ptq config 3. update quantization.md * fix load_quant_model bug * fix load quant bug * update llm/README.md
1 parent f4f89f7 commit cb1e378

38 files changed

+2559
-79
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,3 +124,6 @@ FETCH_HEAD
124124
# vscode
125125
.vscode
126126
./ppdiffusers/ppdiffusers/version.py
127+
128+
dataset/
129+
output/

llm/README.md

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,15 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" ./alignment/dpo
210210
</div>
211211
<div align="center">
212212
<font size ="1">
213-
飞桨量化算法效果展示
213+
飞桨 W4和 W8A8量化算法效果展示
214+
</font>
215+
</div>
216+
<div align="center">
217+
<img width="300" alt="llm" src="https://github.com/user-attachments/assets/ab8d04ba-d589-4f54-acf1-b00c0fd9159e">
218+
</div>
219+
<div align="center">
220+
<font size ="1">
221+
飞桨 W8A8C8和 FP8量化效果展示
214222
</font>
215223
</div>
216224

@@ -220,6 +228,12 @@ python run_finetune.py ./config/llama/ptq_argument.json
220228

221229
# GPTQ 量化启动命令参考
222230
python run_finetune.py ./config/llama/ptq_argument.json
231+
232+
# W8A8C8(INT)量化启动命令参考
233+
python run_finetune.py ./config/llama/ptq_c8_argument.json
234+
235+
# W8A8(FP8)量化启动命令参考
236+
python run_finetune.py ./config/llama/fp8_ptq_argument.json
223237
```
224238

225239
更多技术细节和模型量化使用详见[量化文档](./docs/quantization.md)
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
{
2+
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
3+
"quant_type": "a8w8",
4+
"per_device_train_batch_size": 8,
5+
"per_device_eval_batch_size": 8,
6+
"eval_accumulation_steps":16,
7+
"src_length": 1024,
8+
"max_length": 2048,
9+
"fp16": true,
10+
"fp16_opt_level": "O2",
11+
"dataset_name_or_path": "../dataset/AdvertiseGen",
12+
"output_dir": "../output/llama3.1/w8a8_ptq_ckpts_AdvertiseGen",
13+
"do_eval": true,
14+
"eval_with_do_generation": false,
15+
"do_ptq": true,
16+
"ptq_step": 16,
17+
"unified_checkpoint": false,
18+
"smooth": true,
19+
"smooth_step": 16,
20+
"smooth_all_linears": true,
21+
"smooth_piecewise_search": true,
22+
"smooth_k_piece": 3,
23+
"smooth_search_piece": true,
24+
"act_quant_method": "avg",
25+
"cachekv_quant_method": "avg_headwise"
26+
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
{
2+
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
3+
"quant_type": "a8w8c8",
4+
"per_device_train_batch_size": 8,
5+
"per_device_eval_batch_size": 8,
6+
"eval_accumulation_steps":16,
7+
"src_length": 1024,
8+
"max_length": 2048,
9+
"fp16": true,
10+
"fp16_opt_level": "O2",
11+
"dataset_name_or_path": "../dataset/AdvertiseGen",
12+
"output_dir": "../output/llama3.1/w8a8_ptq_ckpts_AdvertiseGen",
13+
"do_eval": true,
14+
"eval_with_do_generation": false,
15+
"do_ptq": true,
16+
"ptq_step": 16,
17+
"unified_checkpoint": false,
18+
"smooth": true,
19+
"smooth_step": 16,
20+
"smooth_all_linears": true,
21+
"smooth_piecewise_search": true,
22+
"smooth_k_piece": 3,
23+
"smooth_search_piece": true,
24+
"act_quant_method": "avg",
25+
"cachekv_quant_method": "avg_headwise"
26+
}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
{
2+
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
3+
"quant_type": "a8w8",
4+
"use_fp8": "WA",
5+
"fp8_type": ["e4m3", "e4m3"],
6+
"per_device_train_batch_size": 8,
7+
"per_device_eval_batch_size": 8,
8+
"eval_accumulation_steps":16,
9+
"src_length": 1024,
10+
"max_length": 2048,
11+
"fp16": true,
12+
"fp16_opt_level": "O2",
13+
"dataset_name_or_path": "../dataset/AdvertiseGen",
14+
"output_dir": "../output/llama3.1/w8a8_ptq_ckpts_AdvertiseGen",
15+
"do_eval": true,
16+
"eval_with_do_generation": false,
17+
"do_ptq": true,
18+
"ptq_step": 16,
19+
"unified_checkpoint": false,
20+
"smooth": false,
21+
"weight_quant_method": "abs_max",
22+
"act_quant_method": "abs_max",
23+
"cachekv_quant_method": "abs_max"
24+
}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
{
2+
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
3+
"quant_type": "a8w8",
4+
"per_device_train_batch_size": 8,
5+
"per_device_eval_batch_size": 8,
6+
"eval_accumulation_steps":16,
7+
"src_length": 1024,
8+
"max_length": 2048,
9+
"fp16": true,
10+
"fp16_opt_level": "O2",
11+
"output_dir": "../output/llama3.1/w8a8_ptq_ckpts_ceval",
12+
"do_eval": true,
13+
"eval_with_do_generation": false,
14+
"do_ptq": false,
15+
"ptq_step": 1,
16+
"unified_checkpoint": false,
17+
"smooth": true,
18+
"smooth_step": 8,
19+
"smooth_all_linears": true,
20+
"smooth_piecewise_search": true,
21+
"smooth_k_piece": 1,
22+
"smooth_search_piece": true,
23+
"act_quant_method": "avg",
24+
"cachekv_quant_method": "avg_headwise",
25+
"load_quant_model": true,
26+
"do_ceval": true,
27+
"cot": false,
28+
"few_shot": true,
29+
"ntrain": 5,
30+
"with_prompt": false,
31+
"constrained_decoding": true,
32+
"temperature": 0.2,
33+
"n_times": 1,
34+
"do_save_csv": false,
35+
"do_test": false,
36+
"ceval_data_path": "../dataset/ceval"
37+
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
{
2+
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
3+
"quant_type": "a8w8",
4+
"use_fp8": "WA",
5+
"per_device_train_batch_size": 8,
6+
"per_device_eval_batch_size": 8,
7+
"eval_accumulation_steps":16,
8+
"src_length": 1024,
9+
"max_length": 2048,
10+
"fp16": true,
11+
"fp16_opt_level": "O2",
12+
"dataset_name_or_path": "../dataset/ceval_ptq",
13+
"output_dir": "../output/llama3.1/wfp8afp8_ptq_ckpts_ceval",
14+
"do_eval": true,
15+
"eval_with_do_generation": false,
16+
"do_ptq": false,
17+
"ptq_step": 1,
18+
"unified_checkpoint": false,
19+
"smooth": false,
20+
"weight_quant_method": "abs_max",
21+
"act_quant_method": "abs_max",
22+
"cachekv_quant_method": "abs_max",
23+
"load_quant_model": true,
24+
"do_ceval": true
25+
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
{
2+
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
3+
"quant_type": "a8w8",
4+
"per_device_train_batch_size": 8,
5+
"per_device_eval_batch_size": 8,
6+
"eval_accumulation_steps":16,
7+
"src_length": 1024,
8+
"max_length": 2048,
9+
"fp16": true,
10+
"fp16_opt_level": "O2",
11+
"dataset_name_or_path": "../dataset/ceval_ptq",
12+
"output_dir": "../output/llama3.1/w8a8_ptq_ckpts_ceval",
13+
"do_eval": true,
14+
"eval_with_do_generation": false,
15+
"do_ptq": true,
16+
"ptq_step": 16,
17+
"unified_checkpoint": false,
18+
"smooth": true,
19+
"smooth_step": 16,
20+
"smooth_all_linears": true,
21+
"smooth_piecewise_search": true,
22+
"smooth_k_piece": 3,
23+
"smooth_search_piece": true,
24+
"act_quant_method": "avg",
25+
"cachekv_quant_method": "avg_headwise"
26+
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
{
2+
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
3+
"quant_type": "a8w8c8",
4+
"per_device_train_batch_size": 8,
5+
"per_device_eval_batch_size": 8,
6+
"eval_accumulation_steps":16,
7+
"src_length": 1024,
8+
"max_length": 2048,
9+
"fp16": true,
10+
"fp16_opt_level": "O2",
11+
"dataset_name_or_path": "../dataset/ceval_ptq",
12+
"output_dir": "../output/llama3.1/w8a8c8_ptq_ckpts_ceval",
13+
"do_eval": true,
14+
"eval_with_do_generation": false,
15+
"do_ptq": true,
16+
"ptq_step": 16,
17+
"unified_checkpoint": false,
18+
"smooth": true,
19+
"smooth_step": 16,
20+
"smooth_all_linears": true,
21+
"smooth_piecewise_search": true,
22+
"smooth_k_piece": 3,
23+
"smooth_search_piece": true,
24+
"act_quant_method": "abs_max",
25+
"cachekv_quant_method": "abs_max_headwise"
26+
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
{
2+
"model_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
3+
"quant_type": "a8w8",
4+
"use_fp8": "WA",
5+
"per_device_train_batch_size": 8,
6+
"per_device_eval_batch_size": 8,
7+
"eval_accumulation_steps":16,
8+
"src_length": 1024,
9+
"max_length": 2048,
10+
"fp16": true,
11+
"fp16_opt_level": "O2",
12+
"dataset_name_or_path": "../dataset/ceval_ptq",
13+
"output_dir": "../output/llama3.1/wfp8afp8_ptq_ckpts_ceval",
14+
"do_eval": true,
15+
"eval_with_do_generation": false,
16+
"do_ptq": true,
17+
"ptq_step": 16,
18+
"unified_checkpoint": false,
19+
"smooth": false,
20+
"weight_quant_method": "abs_max",
21+
"act_quant_method": "abs_max",
22+
"cachekv_quant_method": "abs_max"
23+
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
{
2+
"model_name_or_path": "meta-llama/Meta-Llama-3-8B",
3+
"per_device_train_batch_size": 8,
4+
"per_device_eval_batch_size": 8,
5+
"eval_accumulation_steps":16,
6+
"src_length": 1024,
7+
"max_length": 2048,
8+
"bf16": true,
9+
"fp16_opt_level": "O2",
10+
"dataset_name_or_path": "./data",
11+
"output_dir": "./checkpoints/llama_ptq_ckpts",
12+
"do_eval": true,
13+
"eval_with_do_generation": false,
14+
"do_ptq": false,
15+
"ptq_step": 1,
16+
"unified_checkpoint": false,
17+
"smooth": true,
18+
"smooth_step": 8,
19+
"smooth_all_linears": true,
20+
"smooth_piecewise_search": true,
21+
"smooth_k_piece": 1,
22+
"smooth_search_piece": true,
23+
"load_quant_model": true,
24+
"do_ceval": true,
25+
"ceval_data_path": "../dataset/ceval"
26+
}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
{
2+
"model_name_or_path": "meta-llama/Meta-Llama-3-8B",
3+
"quant_type": "W8A8",
4+
"use_fp8": "WA",
5+
"fp8_type": ["e4m3", "e4m3"],
6+
"per_device_train_batch_size": 8,
7+
"per_device_eval_batch_size": 8,
8+
"eval_accumulation_steps":16,
9+
"src_length": 1024,
10+
"max_length": 2048,
11+
"fp16": true,
12+
"fp16_opt_level": "O2",
13+
"dataset_name_or_path": "./data",
14+
"output_dir": "./checkpoints/llama_ptq_ckpts",
15+
"do_eval": true,
16+
"eval_with_do_generation": false,
17+
"do_ptq": true,
18+
"ptq_step": 16,
19+
"unified_checkpoint": false,
20+
"smooth": false,
21+
"weight_quant_method": "abs_max",
22+
"act_quant_method": "abs_max",
23+
"cachekv_quant_method": "abs_max"
24+
}

llm/config/llama/ptq_c8_argument.json

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
{
2+
"model_name_or_path": "meta-llama/Meta-Llama-3-8B",
3+
"quant_type": "a8w8c8",
4+
"per_device_train_batch_size": 8,
5+
"per_device_eval_batch_size": 8,
6+
"eval_accumulation_steps":16,
7+
"src_length": 1024,
8+
"max_length": 2048,
9+
"fp16": true,
10+
"fp16_opt_level": "O2",
11+
"dataset_name_or_path": "./data",
12+
"output_dir": "./checkpoints/llama_ptq_c8_ckpts",
13+
"do_eval": true,
14+
"eval_with_do_generation": false,
15+
"do_ptq": true,
16+
"ptq_step": 16,
17+
"unified_checkpoint": false,
18+
"smooth": true,
19+
"smooth_step": 16,
20+
"smooth_all_linears": true,
21+
"smooth_piecewise_search": true,
22+
"smooth_k_piece": 3,
23+
"smooth_search_piece": true,
24+
"act_quant_method": "avg",
25+
"cachekv_quant_method": "avg_headwise"
26+
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
{
2+
"model_name_or_path": "Qwen/Qwen2-7B-Instruct",
3+
"quant_type": "a8w8",
4+
"per_device_train_batch_size": 8,
5+
"per_device_eval_batch_size": 8,
6+
"eval_accumulation_steps":16,
7+
"src_length": 1024,
8+
"max_length": 2048,
9+
"fp16": true,
10+
"fp16_opt_level": "O2",
11+
"dataset_name_or_path": "../dataset/AdvertiseGen",
12+
"output_dir": "../output/qwen2/w8a8_ptq_ckpts_AdvertiseGen",
13+
"do_eval": true,
14+
"eval_with_do_generation": false,
15+
"do_ptq": true,
16+
"ptq_step": 16,
17+
"unified_checkpoint": false,
18+
"smooth": true,
19+
"smooth_step": 16,
20+
"smooth_all_linears": true,
21+
"smooth_piecewise_search": true,
22+
"smooth_k_piece": 3,
23+
"smooth_search_piece": true,
24+
"act_quant_method": "abs_max",
25+
"cachekv_quant_method": "abs_max_headwise"
26+
}

0 commit comments

Comments (0)