|
1 | 1 | {
|
2 | 2 | "model_name_or_path": "Qwen/Qwen2-0.5B",
|
3 |
| - "dataset_name_or_path": "./data", |
| 3 | + "dataset_name_or_path": "./dureader_data", |
4 | 4 | "output_dir": "./checkpoints/sft_ckpts",
|
5 | 5 | "per_device_train_batch_size": 1,
|
6 |
| - "gradient_accumulation_steps": 128, |
| 6 | + "gradient_accumulation_steps": 4, |
7 | 7 | "per_device_eval_batch_size": 1,
|
8 | 8 | "eval_accumulation_steps": 1,
|
9 | 9 | "max_steps": 2000,
|
|
12 | 12 | "logging_steps": 1,
|
13 | 13 | "evaluation_strategy": "no",
|
14 | 14 | "save_strategy": "epoch",
|
15 |
| - "max_query_len": 1024, |
16 |
| - "max_passage_len": 2048, |
| 15 | + "max_query_len": 512, |
| 16 | + "max_passage_len": 512, |
17 | 17 | "group_size": 4,
|
18 |
| - "bp16": true, |
| 18 | + "bf16": true, |
19 | 19 | "fp16_opt_level": "O2",
|
20 | 20 | "do_train": true,
|
21 | 21 | "do_eval": false,
|
|
27 | 27 | "save_total_limit": 1,
|
28 | 28 | "tensor_parallel_degree": 1,
|
29 | 29 | "pipeline_parallel_degree": 1,
|
30 |
| - "sharding": "stage2", |
| 30 | + "sharding": "stage1", |
31 | 31 | "zero_padding": false,
|
32 |
| - "unified_checkpoint": false, |
33 |
| - "use_flash_attention": false |
| 32 | + "unified_checkpoint": true, |
| 33 | + "use_flash_attention": true, |
| 34 | + "amp_custom_black_list": "elementwise_div", |
| 35 | + "release_grads": true |
34 | 36 | }
|
0 commit comments