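Update Qwen embedding training config (dataset path, sequence lengths, bf16, sharding, flash attention) and add `embedding_matryoshka_dims` to EmbeddingArgument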

DesmonDay · DesmonDay · commit 0a618b0dceb3 · 2024-12-10T21:43:37.000+08:00
diff --git a/llm/config/qwen/emb_argument.json b/llm/config/qwen/emb_argument.json
@@ -1,9 +1,9 @@
 {
   "model_name_or_path": "Qwen/Qwen2-0.5B",
-  "dataset_name_or_path": "./data",
+  "dataset_name_or_path": "./dureader_data",
   "output_dir": "./checkpoints/sft_ckpts",
   "per_device_train_batch_size": 1,
-  "gradient_accumulation_steps": 128,
+  "gradient_accumulation_steps": 4,
   "per_device_eval_batch_size": 1,
   "eval_accumulation_steps": 1,
   "max_steps": 2000,
@@ -12,10 +12,10 @@
   "logging_steps": 1,
   "evaluation_strategy": "no",
   "save_strategy": "epoch",
-  "max_query_len": 1024,
-  "max_passage_len": 2048,
+  "max_query_len": 512,
+  "max_passage_len": 512,
   "group_size": 4,
-  "bp16": true,
+  "bf16": true,
   "fp16_opt_level": "O2",
   "do_train": true,
   "do_eval": false,
@@ -27,8 +27,10 @@
   "save_total_limit": 1,
   "tensor_parallel_degree": 1,
   "pipeline_parallel_degree": 1,
-  "sharding": "stage2",
+  "sharding": "stage1",
   "zero_padding": false,
-  "unified_checkpoint": false,
-  "use_flash_attention": false
+  "unified_checkpoint": true,
+  "use_flash_attention": true,
+  "amp_custom_black_list": "elementwise_div",
+  "release_grads": true
 }
diff --git a/llm/utils/argument.py b/llm/utils/argument.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass, field
+from typing import List, Optional
 
 
 @dataclass
@@ -83,3 +84,7 @@ class EmbeddingArgument:
         default=True,
         metadata={"help": "Whether to share the negatives across all GPUs."},
     )
+    embedding_matryoshka_dims: Optional[List[int]] = field(
+        default=None,
+        metadata={"help": "The dims for matryoshka training."},
+    )