Commit 6a41b11

fix T5 readme
Support megatron dataset for T5

1 parent 4a7665d

3 files changed (+24, -15 lines)

examples/language_model/t5/README.md (11 additions & 7 deletions)
````diff
@@ -20,17 +20,19 @@
 
 The data pipeline is a critical part of pretraining. The [preprocessing documentation](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/model_zoo/ernie-1.0/preprocess/README.md) sketches the overall data transformation flow; see it for the details of dataset preparation.
 
-In the token-ID conversion step, we need to set tokenizer_name to the tokenizer matching the T5 model. Running the script below produces the processed pretraining data: the token ids [`baike_sample_ids.npy`](https://paddlenlp.bj.bcebos.com/models/transformers/t5/data//baike_sample_ids.npy) and the article index [`baike_sample_idx.npz`](https://paddlenlp.bj.bcebos.com/models/transformers/t5/data//baike_sample_idx.npz). (A ready-made pretraining dataset is provided; click the links to download.)
+In the token-ID conversion step, we need to set tokenizer_name to the tokenizer matching the T5 model. Running the script below produces the processed pretraining data: the token ids [`t5_openwebtext.bin`](https://paddlenlp.bj.bcebos.com/models/transformers/t5/data/t5_openwebtext.bin) and the article index [`t5_openwebtext.idx`](https://paddlenlp.bj.bcebos.com/models/transformers/t5/data/t5_openwebtext.idx). (A ready-made pretraining dataset is provided; click the links to download.)
 
 ```shell
 python -u create_pretraining_data.py \
     --model_name t5-small \
     --tokenizer_name T5Tokenizer \
-    --input_path baike_sample.jsonl \
-    --split_sentences\
-    --output_prefix baike_sample \
+    --data_format JSON \
+    --input_path openwebtext/2020-04.jsonl.zst \
+    --split_sentences \
+    --output_prefix t5_openwebtext \
     --workers 1 \
-    --log_interval 5
+    --log_interval 5 \
+    --data_impl mmap
 ```
 
 #### 2. Start training
````
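After the command finishes, the output directory should contain the megatron-style pair for the chosen prefix; a quick sanity check (hypothetical local paths):

```python
# Verify that preprocessing emitted the megatron-style pair:
# token ids in <prefix>.bin plus the sample index in <prefix>.idx.
import os

prefix = "t5_openwebtext"  # matches --output_prefix above
for ext in (".bin", ".idx"):
    path = prefix + ext
    size = os.path.getsize(path) if os.path.exists(path) else None
    print(path, f"{size} bytes" if size is not None else "missing")
```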
````diff
@@ -73,8 +75,9 @@ python -u -m paddle.distributed.launch \
     --disable_tqdm true \
     --do_train \
     --do_eval \
-    --seed 1234\
-    --device "gpu"
+    --seed 1234 \
+    --device "gpu" \
+    --data_impl "mmap"
 ```
 
 The parameters are explained as follows:
````
````diff
@@ -95,6 +98,7 @@ python -u -m paddle.distributed.launch \
 - `dataloader_num_workers` Number of DataLoader worker processes; if data loading is the bottleneck, try increasing it.
 - `eval_steps` Interval between model evaluations.
 - `device` Training device; defaults to GPU.
+- `data_impl` Format of the preprocessed input data; defaults to mmap, and may be mmap or lazy. The mmap format builds a memory map when reading the data, while the lazy format reads directly from the file.
 
 ### GLUE tasks
 
````
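For readers new to these modes, here is a rough, self-contained sketch of the difference between the two read strategies (made-up file name and offsets; this is not PaddleNLP's actual indexed-dataset reader):

```python
# Contrast of mmap vs. lazy reads over a flat binary file of token ids.
# In the real dataset the per-sample offsets come from the .idx file;
# here start/end are supplied by hand.
import numpy as np

DTYPE = np.int32  # assumed token-id dtype

def read_sample_mmap(bin_path, start, end):
    # mmap: map the file once; the OS pages in only the slice we touch.
    ids = np.memmap(bin_path, dtype=DTYPE, mode="r")
    return np.asarray(ids[start:end])

def read_sample_lazy(bin_path, start, end):
    # lazy: seek to the sample and read exactly its bytes from disk.
    itemsize = np.dtype(DTYPE).itemsize
    with open(bin_path, "rb") as f:
        f.seek(start * itemsize)
        return np.frombuffer(f.read((end - start) * itemsize), dtype=DTYPE)
```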
examples/language_model/t5/t5_run_pretrain_trainer.py (13 additions & 7 deletions; file mode 100644 → 100755)
```diff
@@ -120,6 +120,10 @@ class DataArguments:
         default=3,
         metadata={"help": "Max N Grams"},
     )
+    data_impl: str = field(
+        default="mmap",
+        metadata={"help": "mmap/lazy format converted from preprocessed data."},
+    )
 
 
 @dataclass
```
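For context, a field added to this dataclass is expected to surface as the `--data_impl` flag used in the README. A minimal sketch of that dataclass-to-flag pattern using only the standard library (PaddleNLP's own argument parser is assumed to behave similarly):

```python
# Standard-library sketch of turning dataclass fields into CLI flags.
import argparse
from dataclasses import dataclass, field, fields

@dataclass
class DataArguments:
    data_impl: str = field(
        default="mmap",
        metadata={"help": "mmap/lazy format converted from preprocessed data."},
    )

parser = argparse.ArgumentParser()
for f in fields(DataArguments):
    parser.add_argument(f"--{f.name}", type=f.type, default=f.default, help=f.metadata.get("help"))

args = parser.parse_args(["--data_impl", "lazy"])
print(DataArguments(data_impl=args.data_impl))  # DataArguments(data_impl='lazy')
```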
```diff
@@ -183,12 +187,13 @@ def create_pretrained_dataset(
 
     def print_dataset(data, mode="train"):
         logger.info(f"Sample data for {mode} mode")
-        # text_enc, text_dec, labels, loss_mask, truncated, enc_mask, dec_mask, enc_dec_mask = data
-        # print("line 195 t5 run pretain trainer", text_enc)
-        print(data)
-        print(tokenizer.convert_ids_to_tokens(token for token in list(data["text_enc"])))
-        print(tokenizer.convert_ids_to_tokens(token for token in list(data["text_dec"])))
-        # print(tokenizer.convert_ids_to_tokens(token for token in list(data["labels"])))
+        text_enc, text_dec = data["text_enc"], data["text_dec"]
+        if tokenizer.pad_token_id in text_enc:
+            text_enc = text_enc[0 : list(text_enc).index(tokenizer.pad_token_id)]
+        logger.info(tokenizer._decode(text_enc))
+        if tokenizer.pad_token_id in text_dec:
+            text_dec = text_dec[0 : list(text_dec).index(tokenizer.pad_token_id)]
+        logger.info(tokenizer._decode(text_dec))
 
     print_dataset(train_ds[0], "train")
     print_dataset(valid_ds[0], "valid")
```
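The effect of the new truncation, shown on made-up ids (pad_token_id 0 matches the standard T5 tokenizer, but treat it as an assumption here):

```python
# Everything from the first pad token onward is dropped before decoding,
# so the logged sample shows only real tokens.
pad_token_id = 0
text_enc = [200, 345, 17, pad_token_id, pad_token_id]
if pad_token_id in text_enc:
    text_enc = text_enc[0 : list(text_enc).index(pad_token_id)]
print(text_enc)  # [200, 345, 17], then tokenizer._decode(text_enc) in the trainer
```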
```diff
@@ -224,9 +229,10 @@ def get_train_data_file(args):
     files = [
         os.path.join(args.input_dir, f)
         for f in os.listdir(args.input_dir)
-        if (os.path.isfile(os.path.join(args.input_dir, f)) and "_idx.npz" in str(f))
+        if (os.path.isfile(os.path.join(args.input_dir, f)) and ("_idx.npz" in str(f) or ".idx" in str(f)))
     ]
     files = [x.replace("_idx.npz", "") for x in files]
+    files = [x.replace(".idx", "") for x in files]
 
     if len(files) > 1:
         ret = []
```
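A standalone check of the extended prefix discovery (hypothetical file names): both the old `<prefix>_idx.npz` layout and the new megatron-style `<prefix>.idx` layout now reduce to the same dataset prefix:

```python
# Mirror of the filtering/stripping logic above, run on an in-memory list.
names = ["t5_openwebtext.idx", "t5_openwebtext.bin", "baike_sample_idx.npz", "baike_sample_ids.npy"]
index_files = [f for f in names if ("_idx.npz" in f or ".idx" in f)]
prefixes = [f.replace("_idx.npz", "").replace(".idx", "") for f in index_files]
print(prefixes)  # ['t5_openwebtext', 'baike_sample']
```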

model_zoo/ernie-1.0/args.py (0 additions & 1 deletion)

```diff
@@ -33,7 +33,6 @@ def parse_args(MODEL_CLASSES):
     parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the training logs and checkpoints will be written.")
     parser.add_argument("--split", type=str, default='949,50,1', help="Train/valid/test data split.")
     parser.add_argument("--data_impl", type=str, default='mmap', help="mmap/lazy format converted from preprocessed data.")
-
     parser.add_argument("--binary_head", type=strtobool, default=True, help="True for NSP task.")
     parser.add_argument("--max_seq_len", type=int, default=1024, help="Max sequence length.")
     parser.add_argument("--micro_batch_size", default=8, type=int, help="Batch size per device for one step training.", )
```
