Commit f1d65ab

Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleNLP into update_predict_new

2 parents 5c2f595 + 7c18d9d · commit f1d65ab

10 files changed: +58 -271 lines

.markdownlint.yaml

Lines changed: 0 additions & 241 deletions
This file was deleted.

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
@@ -44,6 +44,7 @@ repos:
       entry: python .copyright.hook
       language: system
       files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps|py|sh)$
+# For Markdown files
 - repo: local
   hooks:
     - id: add-spaces-between-chinese-and-english

llm/predict/predictor.py

Lines changed: 32 additions & 14 deletions
@@ -248,16 +248,19 @@ def _preprocess(self, source):
     def _infer(self, inputs):
         raise NotImplementedError

-    def _postprocess(self, predictions):
+    def _postprocess(self, predictions, return_tokens=False):
         decoded_predictions = self.tokenizer.batch_decode(
             predictions, skip_special_tokens=True, clean_up_tokenization_spaces=False
         )
-        return decoded_predictions
+        if return_tokens:
+            return decoded_predictions, predictions
+        else:
+            return decoded_predictions

-    def predict(self, input_texts: str | list[str]):
+    def predict(self, input_texts: str | list[str], return_tokens=False):
         tokenized_source = self._preprocess(input_texts)
         predictions = self._infer(tokenized_source)
-        decoded_predictions = self._postprocess(predictions)
+        decoded_predictions = self._postprocess(predictions, return_tokens=return_tokens)
         return decoded_predictions


@@ -470,13 +473,16 @@ def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer):
         )
         self.generation_config = None

-    def _postprocess(self, predictions):
+    def _postprocess(self, predictions, return_tokens=False):
         if paddle.distributed.get_rank() == 0:
             tokens: np.ndarray = load_real_time_tokens()
             decoded_predictions = self.tokenizer.batch_decode(
                 tokens.tolist(), skip_special_tokens=True, clean_up_tokenization_spaces=False
             )
-            return decoded_predictions
+            if return_tokens:
+                return decoded_predictions, tokens.tolist()
+            else:
+                return decoded_predictions
         else:
             return None

@@ -1034,7 +1040,7 @@ def _infer(self, inputs: dict[str, paddle.Tensor]):
         )

     @paddle.no_grad()
-    def predict(self, input_texts: str | list[str]):
+    def predict(self, input_texts: str | list[str], return_tokens=False):
         self._preprocess(input_texts)

         result_queue = mp.Queue()
@@ -1055,9 +1061,15 @@ def predict(self, input_texts: str | list[str]):
             self.used_list[i] = []

         outputs = []
+        output_tokens = []
         while len(outputs) < self.batch_size:
-            outputs.append(result_queue.get(timeout=1)[-1])
-        return outputs
+            result = result_queue.get(timeout=1)
+            outputs.append(result[-1])
+            output_tokens.append(result[-2])
+        if return_tokens:
+            return outputs, output_tokens
+        else:
+            return outputs


 class StaticBlockInferencePredictor(BlockInferencePredictorMixin, BasePredictor):
@@ -1180,7 +1192,7 @@ def _share_data(self):
     def _infer(self):
         self.predictor.run()

-    def predict(self, input_texts: str | list[str]):
+    def predict(self, input_texts: str | list[str], return_tokens=False):

         s_time = time.time()
         self._preprocess(input_texts)
@@ -1213,9 +1225,15 @@ def predict(self, input_texts: str | list[str]):
             self.used_list[i] = []

         outputs = []
+        output_tokens = []
         while len(outputs) < self.batch_size:
-            outputs.append(result_queue.get(timeout=1)[-1])
-        return outputs
+            result = result_queue.get(timeout=1)
+            outputs.append(result[-1])
+            output_tokens.append(result[-2])
+        if return_tokens:
+            return outputs, output_tokens
+        else:
+            return outputs

     def _preprocess(self, source):
         BlockInferencePredictorMixin._preprocess(self, source)
@@ -1681,8 +1699,8 @@ def benchmark(predictor, predictor_args, model_args):
     output_tokens = 0
     for _ in range(test_time):
        for bs, batch_source_text in enumerate(batch_benchmark_texts):
-            outputs = predictor.predict(batch_source_text)
-            output_tokens += sum([len(output) for output in outputs])
+            outputs, batch_tokens = predictor.predict(batch_source_text, return_tokens=True)
+            output_tokens += sum([len(tokens) for tokens in batch_tokens])
     end = time.perf_counter()
     print("Avg Elapse time is: ", (end - start) / test_time)
     print("Output tokens is: ", output_tokens)

llm/utils/utils.py

Lines changed: 2 additions & 2 deletions
@@ -797,7 +797,7 @@ def read_res(model_name_or_path: str, tensor_queue: mp.Queue, result_queue: mp.Q
             break
     output = np.concatenate(outputs, axis=1).tolist()
     seqs = tokenizer.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=False)
-    for i, seq in enumerate(seqs):
-        result_queue.put([i, seq])
+    for i, (out, seq) in enumerate(zip(output, seqs)):
+        result_queue.put([i, out, seq])

     logger.info("Finish read result message")

paddlenlp/data/causal_dataset.py

Lines changed: 2 additions & 1 deletion
@@ -94,10 +94,11 @@ def get_datasets_weights_and_num_samples(data_prefix, train_val_test_num_samples
     # Add 0.5% (the 1.005 factor) so in case the bleding dataset does
     # not uniformly distribute the number of samples, we still have
     # samples left to feed to the network.
+    # (NOTE, yujun06): This is a workaround to avoid issues with indexing in the blending dataset. Therefore, we need to add 20 samples to each dataset.
     datasets_train_valid_test_num_samples = []
     for weight in weights:
         datasets_train_valid_test_num_samples.append(
-            [int(math.ceil(val * weight * 1.005)) for val in train_val_test_num_samples]
+            [int(math.ceil(val * weight * 1.005)) + 20 for val in train_val_test_num_samples]
         )

     return prefixes, weights, datasets_train_valid_test_num_samples
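With the new +20 term, each per-dataset sample count gets both the 0.5% multiplicative margin and a flat cushion of 20 samples per split. A quick illustrative calculation with made-up weight and split sizes:

import math

# Illustrative only: the weight and requested sample counts are invented.
weight = 0.3
train_val_test_num_samples = [100000, 2000, 2000]

per_dataset = [int(math.ceil(val * weight * 1.005)) + 20 for val in train_val_test_num_samples]
print(per_dataset)  # [30170, 623, 623]: 0.5% headroom plus a flat 20-sample cushion per split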

paddlenlp/generation/utils.py

Lines changed: 2 additions & 0 deletions
@@ -1211,6 +1211,8 @@ def sample(
             probs = TopPProcess(probs, top_p, min_tokens_to_keep)
         if paddle.device.is_compiled_with_custom_device("gcu"):
             probs = paddle.cast(probs, "float32")
+        if paddle.device.is_compiled_with_xpu():
+            probs = paddle.cast(probs, "float32")

         # multinomial already support fp16 and bf16 currently, fix issue: https://github.com/PaddlePaddle/Paddle/issues/51852
         next_tokens = paddle.multinomial(probs)
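The added branch mirrors the existing GCU handling: on XPU builds the sampling probabilities are upcast to float32 before paddle.multinomial. A small standalone sketch of the same guard, with a fabricated distribution:

import paddle

# Fabricated per-row distribution; at inference time this would be the
# top-p filtered probability tensor, often in float16/bfloat16.
probs = paddle.to_tensor([[0.1, 0.2, 0.7]], dtype="float32")

# Mirror of the added branch: XPU builds upcast before sampling; other
# builds rely on multinomial's fp16/bf16 support noted in the diff.
if paddle.device.is_compiled_with_xpu():
    probs = paddle.cast(probs, "float32")

next_tokens = paddle.multinomial(probs)  # one sampled token index per row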
