Commit 94b78d7

Merge remote-tracking branch 'dev/develop' into uc/speed_check
2 parents: d9ddc29 + 57b22e7

File tree: 98 files changed, +7997 -1003 lines

Some content is hidden: large commits have part of their content collapsed by default, which is why several of the new files below appear without file names.

Makefile

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ unit-test:

.PHONY: install
install:
+	pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html
	pip install -r requirements-dev.txt
	pip install -r requirements.txt
	pip install -r paddlenlp/experimental/autonlp/requirements.txt

llm/argument.py

Lines changed: 8 additions & 0 deletions
@@ -126,6 +126,14 @@ class ModelArgument:
    lora: bool = field(default=False, metadata={"help": "Whether to use LoRA technique"})
    lora_path: str = field(default=None, metadata={"help": "Initialize lora state dict."})
    lora_rank: int = field(default=8, metadata={"help": "Lora attention dimension"})
+   use_quick_lora: bool = field(
+       default=False,
+       metadata={
+           "help": "Whether to use Quick LoRA. Quick LoRA only takes effect when lora_dropout is set to 0."
+       },
+   )
+   rslora: bool = field(default=False, metadata={"help": "Whether to use RsLoRA"})
+   lora_plus_scale: float = field(default=1.0, metadata={"help": "Lora B scale in LoRA+ technique"})

    # prefix tuning related parameters
    prefix_tuning: bool = field(default=False, metadata={"help": "Whether to use Prefix technique"})
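
For context (not part of the commit): these ModelArgument fields are normally filled from command-line flags by PaddleNLP's PdArgumentParser, with flag names following the field names. A minimal, hypothetical sketch of parsing the new options; the model path is a placeholder, and the parser behaviour is assumed to mirror Hugging Face's HfArgumentParser:

# Hypothetical sketch: parse the new LoRA-related flags into a ModelArgument.
from paddlenlp.trainer import PdArgumentParser
from argument import ModelArgument  # llm/argument.py

parser = PdArgumentParser(ModelArgument)
(model_args,) = parser.parse_args_into_dataclasses(
    [
        "--model_name_or_path", "path/to/base/model",  # placeholder
        "--lora", "true",
        "--rslora", "true",            # rank-stabilized LoRA scaling
        "--lora_plus_scale", "4.0",    # extra scale for the LoRA B matrix (LoRA+)
        "--use_quick_lora", "true",    # only effective when lora_dropout is 0
    ]
)
print(model_args.rslora, model_args.lora_plus_scale, model_args.use_quick_lora)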

llm/finetune_generation.py

Lines changed: 6 additions & 1 deletion
@@ -112,6 +112,7 @@ def main():
        weight_double_quant=model_args.weight_double_quant,
        weight_double_quant_block_size=model_args.weight_double_quant_block_size,
    )
+
    if training_args.pipeline_parallel_degree > 1:
        if data_args.eval_with_do_generation and training_args.do_eval:
            raise ValueError("Please set eval_with_do_generation to false in pipeline parallel mode.")

@@ -418,16 +419,20 @@ def neft_post_hook(module, input, output):
        lora_config = LoRAConfig(
            target_modules=target_modules,
            r=model_args.lora_rank,
-           lora_alpha=2 * model_args.lora_rank,
+           lora_alpha=2 * model_args.lora_rank if not model_args.rslora else 4,
+           rslora=model_args.rslora,
+           lora_plus_scale=model_args.lora_plus_scale,
            merge_weights=False,
            tensor_parallel_degree=training_args.tensor_parallel_degree,
            dtype=dtype,
            do_qat=quant_args.do_qat,
            base_model_name_or_path=model_args.model_name_or_path,
+           use_quick_lora=model_args.use_quick_lora,
        )
        model = LoRAModel(model, lora_config)
    else:
        model = LoRAModel.from_pretrained(model=model, lora_path=model_args.lora_path)
+
    model.print_trainable_parameters()

    def compute_metrics_do_generation(eval_preds):
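
A note on the lora_alpha switch above: standard LoRA scales the low-rank update by lora_alpha / r, whereas rsLoRA (rank-stabilized LoRA) scales it by lora_alpha / sqrt(r), which is presumably why a fixed lora_alpha of 4 replaces 2 * lora_rank when rslora is enabled; lora_plus_scale is the additional scale LoRA+ applies to the B matrix. A small sketch of the two scaling rules (formulas from the LoRA and rsLoRA papers, not from PaddleNLP internals):

import math

# Effective scaling applied to the low-rank update B @ A under each scheme (sketch only).
def lora_scaling(lora_alpha: float, r: int, rslora: bool) -> float:
    return lora_alpha / math.sqrt(r) if rslora else lora_alpha / r

print(lora_scaling(lora_alpha=2 * 8, r=8, rslora=False))  # classic LoRA, r=8: 16 / 8 = 2.0
print(lora_scaling(lora_alpha=4, r=8, rslora=True))       # rsLoRA, r=8: 4 / sqrt(8) ~= 1.41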

llm/llama/auto_parallel/run_pretrain_auto_static.py

Lines changed: 2 additions & 1 deletion
@@ -552,7 +552,8 @@ def main():
    # if training_args.bf16:
    #     dtype = "bfloat16"

-   model = model_class._from_config(config)
+   # The static-graph `amp` pass can't accept a model initialized with dtype float16 or bfloat16
+   model = model_class._from_config(config, dtype="float32")

    if training_args.recompute:

llm/llama/fused_layers.py

Lines changed: 5 additions & 3 deletions
@@ -58,16 +58,18 @@ def backward(ctx, y_grad):

        if hasattr(weight, "main_grad") and hasattr(bias, "main_grad"):
            weight.main_grad, bias.main_grad = _C_ops.fused_linear_param_grad_add(
-               x, y_grad, weight.main_grad, bias.main_grad, True
+               x, y_grad, weight.main_grad, bias.main_grad, True, True
            )
            return x_grad, None, None
        else:
            if weight.grad is not None:
                assert bias.grad is not None
-               weight.grad, bias.grad = _C_ops.fused_linear_param_grad_add(x, y_grad, weight.grad, bias.grad, False)
+               weight.grad, bias.grad = _C_ops.fused_linear_param_grad_add(
+                   x, y_grad, weight.grad, bias.grad, False, True
+               )
            return x_grad, None, None
    else:
-       weight_grad, bias_grad = _C_ops.fused_linear_param_grad_add(x, y_grad, None, None, False)
+       weight_grad, bias_grad = _C_ops.fused_linear_param_grad_add(x, y_grad, None, None, False, True)
        return x_grad, weight_grad, bias_grad

New files (names hidden in this large commit): benchmark launch scripts, 21 added lines each.

@@ -0,0 +1,21 @@
+model_item=gpt_auto_pir
+dp_degree=2
+mp_degree=1
+pp_degree=4
+bs_item=16 # micro * dp * pp
+fp_item=fp16O1
+run_mode=DP2-MP1-PP4-SD2-stage1
+device_num=N1C8
+sharding_degree=2 # sharding_degree = dp_degree
+sharding_stage=1
+level=o1
+local_batch_size=8
+
+model=gpt
+micro_bs=2 # local_batch_size / pp_degree
+
+cd ./benchmarks
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
+# run
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
+${sharding_degree} ${sharding_stage} ${level} 2>&1;
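
The batch-size knobs in these scripts follow the relationships noted in their comments; a small sketch (not part of the commit) checking the arithmetic for the configuration above:

# Sanity check of the batch-size relationships stated in the script comments.
dp_degree, mp_degree, pp_degree = 2, 1, 4
local_batch_size = 8

micro_bs = local_batch_size // pp_degree      # 8 / 4 = 2, matches micro_bs=2
bs_item = micro_bs * dp_degree * pp_degree    # 2 * 2 * 4 = 16, matches bs_item=16

assert (micro_bs, bs_item) == (2, 16)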

@@ -0,0 +1,21 @@
+model_item=gpt_auto_pir
+dp_degree=2
+mp_degree=1
+pp_degree=4
+bs_item=16 # micro * dp * pp
+fp_item=fp16O1
+run_mode=DP2-MP1-PP4-SD2-stage2
+device_num=N1C8
+sharding_degree=2 # sharding_degree = dp_degree
+sharding_stage=2
+level=o1
+local_batch_size=8
+
+model=gpt
+micro_bs=2 # local_batch_size / pp_degree
+
+cd ./benchmarks
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
+# run
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
+${sharding_degree} ${sharding_stage} ${level} 2>&1;

@@ -0,0 +1,21 @@
+model_item=gpt_auto_pir
+dp_degree=2
+mp_degree=2
+pp_degree=2
+bs_item=16 # micro * dp * pp
+fp_item=fp16O1
+run_mode=DP2-MP2-PP2-SD2-stage1
+device_num=N1C8
+sharding_degree=2 # sharding_degree = dp_degree
+sharding_stage=1
+level=o1
+local_batch_size=8
+
+model=gpt
+micro_bs=4 # local_batch_size / pp_degree
+
+cd ./benchmarks
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
+# run
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
+${sharding_degree} ${sharding_stage} ${level} 2>&1;

@@ -0,0 +1,21 @@
+model_item=gpt_auto_pir
+dp_degree=2
+mp_degree=2
+pp_degree=2
+bs_item=16 # micro * dp * pp
+fp_item=fp16O1
+run_mode=DP2-MP2-PP2-SD2-stage2
+device_num=N1C8
+sharding_degree=2 # sharding_degree = dp_degree
+sharding_stage=2
+level=o1
+local_batch_size=8
+
+model=gpt
+micro_bs=4 # local_batch_size / pp_degree
+
+cd ./benchmarks
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
+# run
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
+${sharding_degree} ${sharding_stage} ${level} 2>&1;

@@ -0,0 +1,21 @@
+model_item=gpt_auto_pir
+dp_degree=2
+mp_degree=1
+pp_degree=4
+bs_item=16 # micro * dp * pp
+fp_item=fp16O2
+run_mode=DP2-MP1-PP4-SD2-stage1
+device_num=N1C8
+sharding_degree=2 # sharding_degree = dp_degree
+sharding_stage=1
+level=o2
+local_batch_size=8
+
+model=gpt
+micro_bs=2 # local_batch_size / pp_degree
+
+cd ./benchmarks
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
+# run
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
+${sharding_degree} ${sharding_stage} ${level} 2>&1;

@@ -0,0 +1,21 @@
+model_item=gpt_auto_pir
+dp_degree=2
+mp_degree=1
+pp_degree=4
+bs_item=16 # micro * dp * pp
+fp_item=fp16O2
+run_mode=DP2-MP1-PP4-SD2-stage2
+device_num=N1C8
+sharding_degree=2 # sharding_degree = dp_degree
+sharding_stage=2
+level=o2
+local_batch_size=8
+
+model=gpt
+micro_bs=2 # local_batch_size / pp_degree
+
+cd ./benchmarks
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
+# run
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
+${sharding_degree} ${sharding_stage} ${level} 2>&1;

@@ -0,0 +1,21 @@
+model_item=gpt_auto_pir
+dp_degree=2
+mp_degree=2
+pp_degree=2
+bs_item=16 # micro * dp * pp
+fp_item=fp16O2
+run_mode=DP2-MP2-PP2-SD2-stage1
+device_num=N1C8
+sharding_degree=2 # sharding_degree = dp_degree
+sharding_stage=1
+level=o2
+local_batch_size=8
+
+model=gpt
+micro_bs=4 # local_batch_size / pp_degree
+
+cd ./benchmarks
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
+# run
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
+${sharding_degree} ${sharding_stage} ${level} 2>&1;

@@ -0,0 +1,21 @@
+model_item=gpt_auto_pir
+dp_degree=2
+mp_degree=2
+pp_degree=2
+bs_item=16 # micro * dp * pp
+fp_item=fp16O2
+run_mode=DP2-MP2-PP2-SD2-stage2
+device_num=N1C8
+sharding_degree=2 # sharding_degree = dp_degree
+sharding_stage=2
+level=o2
+local_batch_size=8
+
+model=gpt
+micro_bs=4 # local_batch_size / pp_degree
+
+cd ./benchmarks
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
+# run
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
+${sharding_degree} ${sharding_stage} ${level} 2>&1;

@@ -0,0 +1,21 @@
+model_item=gpt_auto_pir
+dp_degree=2
+mp_degree=1
+pp_degree=4
+bs_item=16 # micro * dp * pp
+fp_item=fp16O3
+run_mode=DP2-MP1-PP4-SD2-stage1
+device_num=N1C8
+sharding_degree=2 # sharding_degree = dp_degree
+sharding_stage=1
+level=o3
+local_batch_size=8
+
+model=gpt
+micro_bs=2 # local_batch_size / pp_degree
+
+cd ./benchmarks
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
+# run
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
+${sharding_degree} ${sharding_stage} ${level} 2>&1;

@@ -0,0 +1,21 @@
+model_item=gpt_auto_pir
+dp_degree=2
+mp_degree=1
+pp_degree=4
+bs_item=16 # micro * dp * pp
+fp_item=fp16O3
+run_mode=DP2-MP1-PP4-SD2-stage2
+device_num=N1C8
+sharding_degree=2 # sharding_degree = dp_degree
+sharding_stage=2
+level=o3
+local_batch_size=8
+
+model=gpt
+micro_bs=2 # local_batch_size / pp_degree
+
+cd ./benchmarks
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
+# run
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
+${sharding_degree} ${sharding_stage} ${level} 2>&1;

@@ -0,0 +1,21 @@
+model_item=gpt_auto_pir
+dp_degree=2
+mp_degree=2
+pp_degree=2
+bs_item=16 # micro * dp * pp
+fp_item=fp16O3
+run_mode=DP2-MP2-PP2-SD2-stage1
+device_num=N1C8
+sharding_degree=2 # sharding_degree = dp_degree
+sharding_stage=1
+level=o3
+local_batch_size=8
+
+model=gpt
+micro_bs=4 # local_batch_size / pp_degree
+
+cd ./benchmarks
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
+# run
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
+${sharding_degree} ${sharding_stage} ${level} 2>&1;

@@ -0,0 +1,21 @@
+model_item=gpt_auto_pir
+dp_degree=2
+mp_degree=2
+pp_degree=2
+bs_item=16 # micro * dp * pp
+fp_item=fp16O3
+run_mode=DP2-MP2-PP2-SD2-stage2
+device_num=N1C8
+sharding_degree=2 # sharding_degree = dp_degree
+sharding_stage=2
+level=o3
+local_batch_size=8
+
+model=gpt
+micro_bs=4 # local_batch_size / pp_degree
+
+cd ./benchmarks
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
+# run
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
+${sharding_degree} ${sharding_stage} ${level} 2>&1;

@@ -0,0 +1,21 @@
+model_item=gpt_auto_pir
+dp_degree=1
+mp_degree=1
+pp_degree=8
+bs_item=8 # micro * dp * pp
+fp_item=fp16O1
+run_mode=DP1-MP1-PP8-SD1-stage1
+device_num=N1C8
+sharding_degree=1
+sharding_stage=1
+level=o1
+local_batch_size=8
+
+model=gpt
+micro_bs=1 # local_batch_size / pp_degree
+
+cd ./benchmarks
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
+# run
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
+${sharding_degree} ${sharding_stage} ${level} 2>&1;

@@ -0,0 +1,21 @@
+model_item=gpt_auto_pir
+dp_degree=1
+mp_degree=2
+pp_degree=4
+bs_item=8 # micro * dp * pp
+fp_item=fp16O1
+run_mode=DP1-MP2-PP4-SD1-stage1
+device_num=N1C8
+sharding_degree=1 # sharding_degree = dp_degree
+sharding_stage=1
+level=o1
+local_batch_size=8
+
+model=gpt
+micro_bs=2 # local_batch_size / pp_degree
+
+cd ./benchmarks
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
+# run
+bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
+${sharding_degree} ${sharding_stage} ${level} 2>&1;
