You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
default=None, metadata={"help": "Build-in pretrained model name or the path to local model."}
107
107
)
108
-
use_flash_attention: bool=field(default=False, metadata={"help": "Whether to use flash attention"})
109
108
tokenizer_name_or_path: Optional[str] =field(
110
109
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
111
110
)
112
-
use_fused_rms_norm: bool=field(
113
-
default=False,
114
-
metadata={"help": "llama or other model, use_fused_rms_norm"},
115
-
)
116
111
fuse_attention_qkv: bool=field(
117
-
default=False,
112
+
default=None,
118
113
metadata={"help": "whether to fuse attention qkv"},
119
114
)
120
115
fuse_attention_ffn: bool=field(
121
-
default=False,
116
+
default=None,
122
117
metadata={"help": "whether to fuse first up and gate proj in mlp block"},
123
118
)
124
-
recompute_granularity: str=field(
125
-
default="full",
126
-
metadata={"help": "Choose among ['full', 'core_attn', 'full_attn']"},
127
-
)
128
-
virtual_pp_degree: int=field(
129
-
default=1,
130
-
metadata={"help": "virtual_pp_degree"},
131
-
)
132
119
hidden_dropout_prob: float=field(default=0.1, metadata={"help": "The hidden dropout prob."})
133
120
attention_probs_dropout_prob: float=field(default=0.1, metadata={"help": "The attention hidden dropout prob."})
134
121
@@ -138,32 +125,6 @@ class ModelArgument:
138
125
"help": "Pre-training from existing paddlenlp model weights. Default False and model will train from scratch. If set True, the model_name_or_path argument must exist in the paddlenlp models."
139
126
},
140
127
)
141
-
sequence_parallel: bool=field(
142
-
default=False,
143
-
metadata={"help": "whether to use sequence parallel"},
144
-
)
145
-
fuse_sequence_parallel_allreduce: bool=field(
146
-
default=False,
147
-
metadata={"help": "whether to use fuse sequence parallel allreduce"},
148
-
)
149
-
use_fused_rope: Optional[bool] =field(
150
-
default=False,
151
-
metadata={"help": "Enable rope fusion or not."},
152
-
)
153
-
no_recompute_layers: Optional[List[int]] =field(
154
-
default=None,
155
-
metadata={"help": "Specify the full transformer layers that should not be recomputed."},
156
-
)
157
-
pp_recompute_interval: int=field(
158
-
default=1,
159
-
metadata={
160
-
"help": "The interval for the number of layers at which recomputation occurs. A value of 0 indicates no recomputation. Default is 0."
0 commit comments