 disable_p2p_cache_shape, if you max sequence length is varying, please set disable_p2p_cache_shape.
 disable_partial_send_recv, optmize send speed for tensor parallel.
-enable_delay_scale_loss, accumulate gradients util optimizer step, all gradients div by inner pipeline accumute step. instead of div accumute step on loss directly.
+enable_delay_scale_loss, accumulate gradients until optimizer step, all gradients div by inner pipeline accumute step. instead of div accumute step on loss directly.
 enable_dp_comm_overlap, fuse data parallel gradient communication.
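For context, the options listed above are consumed as a single space-separated string. A minimal sketch of setting pipeline_parallel_config, assuming the standard PaddleNLP TrainingArguments fields this PR touches and purely illustrative values (it would normally run under paddle.distributed.launch):

    from paddlenlp.trainer import TrainingArguments

    # Illustrative values only; pipeline_parallel_config takes a space-separated
    # list of the flags documented in the diff above.
    args = TrainingArguments(
        output_dir="./checkpoints",  # placeholder path
        pipeline_parallel_degree=2,  # assumes a multi-stage pipeline launched with paddle.distributed.launch
        pipeline_parallel_config="disable_p2p_cache_shape enable_delay_scale_loss enable_dp_comm_overlap",
    )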
paddlenlp/trainer/training_args.py (33 additions, 5 deletions)
@@ -241,13 +241,16 @@ class TrainingArguments:
 enable_mp_async_allreduce, it supports all_reduce(dx) overlap with matmul(dw) in ColumnParallelLinear backward when it set True, which can accelerate model parallel performance.
 enable_mp_skip_c_identity, it supports skip c_identity in ColumnParallelLinear and RowParallelLinear. It only works when set mp_async_allreduce is True. It can accelerate model parallel further.
 enable_mp_fused_linear_param_grad_add, it supports fused_linear_param_grad_add in ColumnParallelLinear (cuda >= 11.6). It only works when mp_async_allreduce is true. It can accelerate model parallel further.
-enable_delay_scale_loss, accumulate gradients util optimizer step, all gradients div by accumute step. instead of div accumute step on loss directly.
+enable_delay_scale_loss, accumulate gradients until optimizer step, all gradients div by accumute step. instead of div accumute step on loss directly.
+sync_param, in optimizer step, use broadcast to sync parameters those attr 'is_distributed' is False.
+sync_grad, in optimizer step, use broadcast to sync gradients those attr 'is_distributed' is False.
+sync_moment, in optimizer step, use broadcast to sync momentums those attr 'is_distributed' is False.
 pipeline_parallel_config (`str`, *optional*)(
 Some additional config it highly affect the useage of pipeline parallel, we provide some option to config it.
 following config is support:
 disable_p2p_cache_shape, if you max sequence length is varying, please set disable_p2p_cache_shape.
 disable_partial_send_recv, optmize send speed for tensor parallel.
-enable_delay_scale_loss, accumulate gradients util optimizer step, all gradients div by inner pipeline accumute step. instead of div accumute step on loss directly.
+enable_delay_scale_loss, accumulate gradients until optimizer step, all gradients div by inner pipeline accumute step. instead of div accumute step on loss directly.
 enable_dp_comm_overlap, fuse data parallel gradient communication.
 enable_release_grads, reduce peak memory usage by releasing gradients after each iteration. The creation of gradients will be postponed until backward propagation of the next iteration.
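A rough sketch of enabling the tensor-parallel options described above, including the newly added sync_param / sync_grad / sync_moment flags; the flag names come from this diff, every other value below is illustrative:

    from paddlenlp.trainer import TrainingArguments

    # Illustrative settings; tensor_parallel_config is the space-separated string
    # whose accepted tokens are listed in the docstring above.
    args = TrainingArguments(
        output_dir="./checkpoints",  # placeholder path
        tensor_parallel_degree=2,    # assumes 2-way model parallelism under paddle.distributed.launch
        tensor_parallel_config="enable_mp_async_allreduce enable_delay_scale_loss sync_param sync_grad sync_moment",
    )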
@@ -600,7 +603,10 @@ class TrainingArguments:
 "enable_mp_async_allreduce, it supports all_reduce(dx) overlap with matmul(dw) in ColumnParallelLinear backward when it set True, which can accelerate model parallel performance. \n"
 "enable_mp_skip_c_identity, it supports skip c_identity in ColumnParallelLinear and RowParallelLinear. It only works when set mp_async_allreduce is True. It can accelerate model parallel further.\n"
 "enable_mp_fused_linear_param_grad_add, it supports fused_linear_param_grad_add in ColumnParallelLinear (cuda >= 11.6). It only works when mp_async_allreduce is true. It can accelerate model parallel further.\n"
-"enable_delay_scale_loss, accumulate gradients util optimizer step, all gradients div by accumute step. instead of div accumute step on loss directly.\n"
+"enable_delay_scale_loss, accumulate gradients until optimizer step, all gradients div by accumute step. instead of div accumute step on loss directly.\n"
+"sync_param, in optimizer step, use broadcast to sync parameters those attr 'is_distributed' is False.\n"
+"sync_grad, in optimizer step, use broadcast to sync gradients those attr 'is_distributed' is False.\n"
+"sync_moment, in optimizer step, use broadcast to sync momentums those attr 'is_distributed' is False.\n"
 )
 },
 )
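The strings above are help text attached to a dataclass field; a minimal sketch of the pattern implied by the surrounding field(..., metadata={"help": (...)}) structure follows. The class name, default value, and wording here are assumptions for illustration, not taken from the PR:

    from dataclasses import dataclass, field

    @dataclass
    class SketchArguments:
        # Mirrors the tensor_parallel_config help-string pattern shown above;
        # the real TrainingArguments class defines many more fields.
        tensor_parallel_config: str = field(
            default="",  # assumed default
            metadata={
                "help": (
                    "Space-separated flags, e.g. enable_mp_async_allreduce "
                    "sync_param sync_grad sync_moment."
                )
            },
        )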
@@ -612,7 +618,7 @@ class TrainingArguments:
 "following config is support:\n"
 "disable_p2p_cache_shape, if you max sequence length is varying, please set disable_p2p_cache_shape. \n"
 "disable_partial_send_recv, optmize send speed for tensor parallel.\n"
-"enable_delay_scale_loss, accumulate gradients util optimizer step, all gradients div by inner pipeline accumute step. instead of div accumute step on loss directly.\n"
+"enable_delay_scale_loss, accumulate gradients until optimizer step, all gradients div by inner pipeline accumute step. instead of div accumute step on loss directly.\n"
 "enable_dp_comm_overlap, fuse data parallel gradient communication. \n"
 "enable_overlap_p2p_comm, overlap p2p communication with computation. \n"
@@ -1062,10 +1068,13 @@ def __post_init__(self):
     "enable_mp_skip_c_identity",
     "enable_mp_fused_linear_param_grad_add",
     "enable_delay_scale_loss",
+    "sync_param",
+    "sync_grad",
+    "sync_moment",
 ]:
     raise ValueError(
         f"Found unknown tensor parallell config {x}, "
-        f"accept config is enable_mp_async_allreduce, enable_mp_skip_c_identity and enable_mp_fused_linear_param_grad_add"
+        f"accept config is enable_mp_async_allreduce, enable_mp_skip_c_identity, enable_mp_fused_linear_param_grad_add, sync_param, sync_grad and sync_moment."
     )
 try:
     if "enable_mp_async_allreduce" in mp_config:
@@ -1083,6 +1092,25 @@ def __post_init__(self):
 warnings.warn(
     "enable_mp_fused_linear_param_grad_add only works with enable_mp_async_allreduce. It will not work."
 )
+
+sync_param = sync_grad = sync_moment = True  # For CI test
+
+# sync_param = "sync_param" in mp_config
+# sync_grad = "sync_grad" in mp_config
+# sync_moment = "sync_moment" in mp_config
+
+# sync_param_name = [""] matches any parameter name.
+# If sync_param, sync_grad and sync_moment are not set, the default value in Paddle is :
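The hunk above hard-codes all three sync flags to True for a CI test and leaves the intended parsing commented out. A hedged sketch of that commented-out intent (paraphrased here, not the merged code) is plain membership testing on the space-separated tensor_parallel_config string:

    # Sketch of the commented-out parsing shown above: each flag is enabled when
    # its token appears in the space-separated tensor_parallel_config string.
    mp_config = "enable_mp_async_allreduce sync_param sync_grad".split()  # illustrative value

    sync_param = "sync_param" in mp_config
    sync_grad = "sync_grad" in mp_config
    sync_moment = "sync_moment" in mp_config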