Commit 8831906

[NPU] Fix baichuan2-13b-chat infer
1 parent e204b6d commit 8831906

File tree

3 files changed (+35 / -4 lines)


llm/predict/predictor.py

Lines changed: 33 additions & 1 deletion
@@ -132,7 +132,10 @@ class PredictorArgument:
 
     @property
     def total_max_length(self):
-        return 8192  # Maximum sequence length.
+        if self.device == "npu":
+            return self.src_length + self.max_length
+        else:
+            return 8192  # Maximum sequence length.
 
 
 @dataclass
@@ -859,6 +862,35 @@ def init_model_inputs(self, config: PredictorArgument):
             self.model_inputs["tgt_mask"] = (
                 alibi_decoder + (1 - self.model_inputs["tgt_mask"]) * paddle.finfo(self.dtype).min
             ).cast(self.dtype)
+        elif config.device == "npu" and self.model_config.get("alibi", False):
+            lower_one_tril = paddle.tril(
+                paddle.ones(shape=(config.total_max_length, config.total_max_length), dtype=self.dtype)
+            )
+            lower_one_tril = lower_one_tril[None, None, :, :]
+            src_mask = lower_one_tril.tile([config.batch_size, 1, 1, 1])
+            tgt_mask = paddle.full(
+                shape=[config.batch_size, 1, 1, config.total_max_length], fill_value=1, dtype=self.dtype
+            )
+            arange_tensor_encoder = paddle.arange(config.total_max_length).astype(self.dtype)
+            alibi_slopes = llm_utils.get_alibi_slopes(self.num_attention_heads)
+            alibi = alibi_slopes[None, :, None, None] * arange_tensor_encoder
+            alibi_encoder = alibi.tile([config.batch_size, 1, config.total_max_length, 1])
+            alibi_decoder = alibi.tile(
+                [
+                    config.batch_size,
+                    1,
+                    1,
+                    1,
+                ]
+            )
+            # self.model_inputs["src_mask/tgt_mask"] is read only, will not be updated!
+            src_mask = (
+                alibi_encoder + (1 - src_mask) * paddle.finfo(self.dtype).min
+            ).cast(self.dtype)
+            tgt_mask = (
+                alibi_decoder + (1 - tgt_mask) * paddle.finfo(self.dtype).min
+            ).cast(self.dtype)
+            self.model_inputs["rope_emb"] = paddle.concat([src_mask.reshape([-1]), tgt_mask.reshape([-1])])
 
     def _preprocess(self, input_text: list[str]):
         if self.tokenizer.chat_template is not None:
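For reference, a minimal, self-contained sketch of the mask construction the new NPU branch performs, using toy sizes (batch_size=1, total_max_length=4, 2 attention heads; these values are assumed purely for illustration, and the slopes are computed inline instead of via llm_utils.get_alibi_slopes). With the new total_max_length property, total_max_length on NPU would be src_length + max_length. Only the Paddle ops that appear in the diff above are used:

# Toy-sized sketch of the NPU ALiBi mask construction above (illustrative only).
import numpy as np
import paddle

batch_size, total_max_length, num_heads = 1, 4, 2
dtype = "float32"

# Causal (lower-triangular) visibility mask for the prefill step, tiled over the batch.
lower_one_tril = paddle.tril(paddle.ones(shape=(total_max_length, total_max_length), dtype=dtype))
src_mask = lower_one_tril[None, None, :, :].tile([batch_size, 1, 1, 1])

# Decode-step mask: every cached position is visible to the single new token.
tgt_mask = paddle.full(shape=[batch_size, 1, 1, total_max_length], fill_value=1, dtype=dtype)

# Per-head ALiBi slopes (standard ALiBi formula for a power-of-two head count),
# multiplied by the position index.
closest_power_of_2 = 2 ** int(np.floor(np.log2(num_heads)))
base = 2 ** (-(2 ** -(np.log2(closest_power_of_2) - 3)))
slopes = np.power(base, np.arange(1, 1 + closest_power_of_2)).astype(dtype)
positions = paddle.arange(total_max_length).astype(dtype)
alibi = paddle.to_tensor(slopes[None, :, None, None]) * positions  # [1, heads, 1, L]

alibi_encoder = alibi.tile([batch_size, 1, total_max_length, 1])    # [B, heads, L, L]
alibi_decoder = alibi.tile([batch_size, 1, 1, 1])                   # [B, heads, 1, L]

# Masked positions get a very large negative value; visible ones keep the ALiBi bias.
neg_inf = paddle.finfo(paddle.float32).min
src_mask = (alibi_encoder + (1 - src_mask) * neg_inf).cast(dtype)
tgt_mask = (alibi_decoder + (1 - tgt_mask) * neg_inf).cast(dtype)

# Both masks are flattened and packed into the "rope_emb" model input.
rope_emb = paddle.concat([src_mask.reshape([-1]), tgt_mask.reshape([-1])])
print(rope_emb.shape)  # [B*heads*L*L + B*heads*L] = [32 + 8] = [40]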

paddlenlp/experimental/transformers/fused_transformer_layers.py

Lines changed: 1 addition & 2 deletions
@@ -38,8 +38,7 @@
         "The paddlenlp_ops package is not installed. you can read the docs and install it by hand, "
         "you can refer to: https://github.com/PaddlePaddle/PaddleNLP/blob/develop/csrc/README.md"
     )
-if core.is_compiled_with_xpu() or core.is_compiled_with_cuda():
-    from paddlenlp_ops import rebuild_padding_v2
+from paddlenlp_ops import rebuild_padding_v2
 
 if core.is_compiled_with_cuda():
     from paddlenlp_ops import (
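With the XPU/CUDA guard removed, rebuild_padding_v2 is expected to be present in every paddlenlp_ops build, including the NPU one. A quick, illustrative way to confirm a locally built package exposes it (standard library only; the module and attribute names come from the import above):

# Illustrative check that the installed custom-ops package exposes the kernel that is
# now imported unconditionally (assumes paddlenlp_ops was built per csrc/README.md).
import importlib

ops = importlib.import_module("paddlenlp_ops")
assert hasattr(ops, "rebuild_padding_v2"), "paddlenlp_ops was built without rebuild_padding_v2"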

paddlenlp/utils/llm_utils.py

Lines changed: 1 addition & 1 deletion
@@ -461,7 +461,7 @@ def get_alibi_slopes(num_heads):
         extra_base = 2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3)))
         num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2)
         extra_powers = np.arange(1, 1 + 2 * num_remaining_heads, 2)
-        slopes = np.concatante([slopes, np.power(extra_base, extra_powers)], axis=0)
+        slopes = np.concatenate([slopes, np.power(extra_base, extra_powers)], axis=0)
 
     return slopes.astype("float32")
