[Prediction] Update LLM prediction. #8778

Merged · 8 commits · Jul 25, 2024
Changes from all commits
15 changes: 7 additions & 8 deletions paddlenlp/experimental/transformers/chatglm/modeling.py
```diff
@@ -27,6 +27,7 @@
 from paddlenlp.experimental.transformers.generation_utils import (
     GenerationInferenceModel,
 )
+from paddlenlp.experimental.transformers.utils import infererence_model_from_pretrained
 from paddlenlp.transformers import ChatGLMConfig, ChatGLMPretrainedModel
 from paddlenlp.transformers.model_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
```
```diff
@@ -388,20 +389,20 @@
         head_dim = embed_dim // config.num_attention_heads

         for k, v in state_dict.items():
-            if k.startswith("transformer.word_embeddings.weight"):
+            if k.startswith("chatglm.transformer.word_embeddings.weight"):
                 self.word_embeddings.weight.set_value(v.astype(dtype))
                 continue
-            elif k.startswith("transformer.final_layernorm.weight"):
+            elif k.startswith("chatglm.transformer.final_layernorm.weight"):
                 self.transformer_block.ffn_ln_scales[config.num_hidden_layers - 1].set_value(v.astype("float32"))
                 continue
-            elif k.startswith("transformer.final_layernorm.bias"):
+            elif k.startswith("chatglm.transformer.final_layernorm.bias"):
                 self.transformer_block.ffn_ln_biases[config.num_hidden_layers - 1].set_value(v.astype("float32"))
                 continue
             elif k.startswith("lm_head.weight"):
                 continue
             elif k.endswith("rotary_embeddings.inv_freq") or k.endswith("rotary_emb.inv_freq"):
                 continue
-            idx = int(k.split(".")[2])
+            idx = int(k.split(".")[3])
             if k.endswith("input_layernorm.weight"):
                 if idx == 0:
                     self.input_layernorm.weight.set_value(v.astype(dtype))
```
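The ChatGLM checkpoint keys now carry a leading `chatglm.` segment, which pushes the layer index one dot-separated position to the right; that is why the split index moves from 2 to 3. A minimal sketch of the shift (key names are illustrative of the pattern above):

```python
# Illustrative keys matching the prefixes handled in the hunk above.
old_key = "transformer.layers.0.attention.query_key_value.weight"
new_key = "chatglm." + old_key

# With the "chatglm." prefix, the layer index moves from position 2 to position 3.
assert int(old_key.split(".")[2]) == 0
assert int(new_key.split(".")[3]) == 0
```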
```diff
@@ -583,9 +584,7 @@

     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
-        # TODO: Support safetensors loading.
-        kwargs["use_safetensors"] = False
-        return super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
+        return infererence_model_from_pretrained(cls, pretrained_model_name_or_path, args, kwargs, return_numpy=False)

     @classmethod
     def get_cache_kvs_shape(
```
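Of the refactored models, ChatGLM is the only one that passes `return_numpy=False`, so the shared loader hands its `set_state_dict` paddle Tensors rather than numpy arrays; every other model keeps the helper's default of `return_numpy=True`.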
```diff
@@ -746,6 +745,6 @@
     @paddle.no_grad()
     def set_state_dict(self, state_dict):
         self.lm_head.weight.set_value(
-            state_dict["transformer.word_embeddings.weight"].astype(self.lm_head.weight.dtype)
+            state_dict["chatglm.transformer.word_embeddings.weight"].astype(self.lm_head.weight.dtype)
         )
         self.model.transformer.set_state_dict({k: state_dict[k] for k in state_dict.keys()})
```
5 changes: 2 additions & 3 deletions paddlenlp/experimental/transformers/gpt/modeling.py
```diff
@@ -26,6 +26,7 @@
 from paddlenlp.experimental.transformers.generation_utils import (
     GenerationInferenceModel,
 )
+from paddlenlp.experimental.transformers.utils import infererence_model_from_pretrained
 from paddlenlp.transformers import GPTConfig, GPTPretrainedModel
 from paddlenlp.transformers.gpt.modeling import GPTEmbeddings, parallel_matmul
 from paddlenlp.transformers.model_outputs import (
```
```diff
@@ -446,9 +447,7 @@

     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
-        # TODO: Support safetensors loading.
-        kwargs["use_safetensors"] = False
-        return super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
+        return infererence_model_from_pretrained(cls, pretrained_model_name_or_path, args, kwargs)

     @classmethod
     def get_cache_kvs_shape(
```
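Every refactored model now delegates to the same helper, so loading options travel through `from_pretrained` keyword arguments and are popped centrally. A hedged usage sketch follows; the class name, checkpoint path, and dtype are illustrative, and `config` is effectively required since the helper reads `config.dtype` and `config.quantization_config`:

```python
# Hypothetical call site; names and paths are illustrative, not from this PR.
config = GPTConfig.from_pretrained("./checkpoints/gpt-medium")
model = GPTForCausalLMInferenceModel.from_pretrained(  # assumed inference-model class
    "./checkpoints/gpt-medium",
    config=config,         # popped by the helper; config.dtype is the dtype fallback
    dtype="float16",       # optional override for the parameter dtype guard
    use_safetensors=None,  # None lets the helper auto-detect safetensors support
)
```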
59 changes: 4 additions & 55 deletions paddlenlp/experimental/transformers/llama/modeling.py
```diff
@@ -43,6 +43,7 @@
     GenerationBlockInferenceModel,
     GenerationInferenceModel,
 )
+from paddlenlp.experimental.transformers.utils import infererence_model_from_pretrained
 from paddlenlp.transformers import LlamaConfig, LlamaPretrainedModel
 from paddlenlp.transformers.conversion_utils import split_param_func
 from paddlenlp.transformers.llama.modeling import LlamaLMHead
```
```diff
@@ -1139,9 +1140,7 @@

     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
-        # TODO: Support safetensors loading.
-        kwargs["use_safetensors"] = False
-        return super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
+        return infererence_model_from_pretrained(cls, pretrained_model_name_or_path, args, kwargs)

     @classmethod
     def get_cache_kvs_shape(
```
```diff
@@ -1238,9 +1237,7 @@

     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
-        # TODO: Support safetensors loading.
-        kwargs["use_safetensors"] = False
-        return super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
+        return infererence_model_from_pretrained(cls, pretrained_model_name_or_path, args, kwargs)

     @classmethod
     def get_cache_kvs_shape(
```
```diff
@@ -1477,55 +1474,7 @@

     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
-        # TODO: Support safetensors loading.
-        kwargs["use_safetensors"] = False
-        from paddlenlp.transformers.utils import (
-            ContextManagers,
-            is_safetensors_available,
-        )
-
-        from_hf_hub = kwargs.pop("from_hf_hub", False)
-        config = kwargs.pop("config", None)
-        from_aistudio = kwargs.get("from_aistudio", False)
-        subfolder = kwargs.get("subfolder", None)
-        variant = kwargs.pop("variant", None)
-        use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False)
-        convert_from_torch = kwargs.pop("convert_from_torch", None)
-        cache_dir = kwargs.pop("cache_dir", None)
-
-        init_contexts = []
-        with ContextManagers(init_contexts):
-            model = cls(config)
-
-        if not config.single_card_ptq:
-            resolved_archive_file = pretrained_model_name_or_path
-        else:
-            resolved_archive_file = cls._resolve_model_file_path(
-                pretrained_model_name_or_path,
-                cache_dir=cache_dir,
-                subfolder=subfolder,
-                from_hf_hub=from_hf_hub,
-                from_aistudio=from_aistudio,
-                config=config,
-                convert_from_torch=convert_from_torch,
-                use_safetensors=use_safetensors,
-                variant=variant,
-            )[0]
-        logger.info(f"Load model form {resolved_archive_file}")
-
-        if config.tensor_parallel_degree > 1 and config.single_card_ptq:
-            logger.info(f"convert_tensor_parallel {config.tensor_parallel_degree}")
-            model.state_dict = model.convert_tensor_parallel(resolved_archive_file, config)
-        elif config.tensor_parallel_degree > 1:
-            resolved_archive_file = os.path.join(
-                resolved_archive_file, f"mp_{config.tensor_parallel_rank:0>2d}_sharding_00_pp_00", "model.pdparams"
-            )
-            model.state_dict = paddle.load(resolved_archive_file, return_numpy=True)
-        else:
-            model.state_dict = paddle.load(resolved_archive_file, return_numpy=True)
-        model.set_state_dict(model.state_dict)
-
-        return model
+        return infererence_model_from_pretrained(cls, pretrained_model_name_or_path, args, kwargs)

     @classmethod
     def get_cache_kvs_shape(
```
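The roughly fifty lines removed here were Llama-only copies of checkpoint resolution, `single_card_ptq` handling, and tensor-parallel loading; `load_tp_checkpoint` inside the shared helper now covers the same cases. For reference, the deleted tensor-parallel branch located each rank's shard as in this sketch of the removed path logic (directory values illustrative):

```python
import os

# Reconstruction of the deleted branch: each tensor-parallel rank loaded its
# own shard from a zero-padded, rank-specific subdirectory.
tensor_parallel_rank = 1
shard_path = os.path.join(
    "./llama-checkpoint",
    f"mp_{tensor_parallel_rank:0>2d}_sharding_00_pp_00",
    "model.pdparams",
)
print(shard_path)  # ./llama-checkpoint/mp_01_sharding_00_pp_00/model.pdparams
```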
5 changes: 2 additions & 3 deletions paddlenlp/experimental/transformers/opt/modeling.py
```diff
@@ -26,6 +26,7 @@
 from paddlenlp.experimental.transformers.generation_utils import (
     GenerationInferenceModel,
 )
+from paddlenlp.experimental.transformers.utils import infererence_model_from_pretrained
 from paddlenlp.transformers import OPTPretrainedModel
 from paddlenlp.transformers.model_utils import (
     dy2st_nocheck_guard_context,
```
```diff
@@ -329,9 +330,7 @@

     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
-        # TODO: Support safetensors loading.
-        kwargs["use_safetensors"] = kwargs.get("use_safetensors", False)
-        return super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
+        return infererence_model_from_pretrained(cls, pretrained_model_name_or_path, args, kwargs)

     @classmethod
     def get_cache_kvs_shape(
```
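OPT was the only one of these models that previously honored a caller-supplied `use_safetensors` (defaulting to False); the others forced it off. After this change, every model goes through the helper's shared default, `None if is_safetensors_available() else False`, which enables safetensors loading whenever the library is present.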
9 changes: 3 additions & 6 deletions paddlenlp/experimental/transformers/qwen/modeling.py
```diff
@@ -27,6 +27,7 @@
 from paddlenlp.experimental.transformers.generation_utils import (
     GenerationInferenceModel,
 )
+from paddlenlp.experimental.transformers.utils import infererence_model_from_pretrained
 from paddlenlp.transformers import QWenConfig, QWenPretrainedModel
 from paddlenlp.transformers.model_outputs import (
     BaseModelOutputWithPast,
```
```diff
@@ -377,12 +378,8 @@
         self.lm_head = new_embeddings

     @classmethod
-    def from_pretrained(
-        cls, pretrained_model_name_or_path, from_hf_hub: bool = False, subfolder: str | None = None, *args, **kwargs
-    ):
-        # TODO: Support safetensors loading.
-        kwargs["use_safetensors"] = False
-        return super().from_pretrained(pretrained_model_name_or_path, from_hf_hub, subfolder, *args, **kwargs)
+    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
+        return infererence_model_from_pretrained(cls, pretrained_model_name_or_path, args, kwargs)

     @classmethod
     def get_cache_kvs_shape(
```
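QWen's override previously exposed `from_hf_hub` and `subfolder` as explicit positional parameters; the new signature folds everything into `*args, **kwargs`, matching the other models. Note that `subfolder` is popped from kwargs by the shared helper, while file resolution is hardcoded to `from_hf_hub=False`, so Hub loading is not routed through this path.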
77 changes: 77 additions & 0 deletions paddlenlp/experimental/transformers/utils.py
```diff
@@ -0,0 +1,77 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+import os
+
+import paddle
+
+from paddlenlp.transformers.model_utils import (
+    dtype_guard,
+    load_tp_checkpoint,
+    no_init_weights,
+)
+from paddlenlp.transformers.utils import (
+    ContextManagers,
+    is_paddle_support_lazy_init,
+    is_safetensors_available,
+)
+
+
+def infererence_model_from_pretrained(cls, pretrained_model_name_or_path, args, kwargs, return_numpy=True):
+    r"""
+    Instantiate a pretrained model configuration from a pre-trained model name or path.
+    """
+    config = kwargs.pop("config", None)
+    cache_dir = kwargs.pop("cache_dir", None)
+    dtype = kwargs.pop("dtype", None)
+    if dtype is None:
+        dtype = config.dtype
+    subfolder = kwargs.pop("subfolder", None)
+    if subfolder is None:
+        subfolder = ""
+    variant = kwargs.pop("variant", None)
+    use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False)
+    low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", False)
+
+    init_contexts = []
+    if low_cpu_mem_usage or config.quantization_config.is_weight_quantize():
+        # Instantiate model.
+        init_contexts.append(no_init_weights(_enable=True))
+        if is_paddle_support_lazy_init():
+            init_contexts.append(paddle.LazyGuard())
+    if dtype:
+        init_contexts.append(dtype_guard(dtype))
+
+    # init the model
+    with ContextManagers(init_contexts):
+        model = cls(config)
+
+    resolved_archive_file, _, _, _ = cls._resolve_model_file_path(
+        pretrained_model_name_or_path,
+        cache_dir=cache_dir,
+        subfolder=subfolder,
+        from_hf_hub=False,
+        from_aistudio=False,
+        config=config,
+        convert_from_torch=False,
+        use_safetensors=use_safetensors,
+        variant=variant,
+    )
+
+    model_path = os.path.dirname(resolved_archive_file)
+    state_dict = load_tp_checkpoint(model_path, cls, config, return_numpy=return_numpy)
+    model.set_state_dict(state_dict)
+
+    return model
```
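In summary, the new helper instantiates the model under optional `no_init_weights`, `paddle.LazyGuard`, and `dtype_guard` contexts, resolves the weight file, and loads a possibly tensor-parallel checkpoint via `load_tp_checkpoint`. A hedged sketch of calling it directly (normally it is reached through a model's `from_pretrained`; the class name and path are illustrative):

```python
# Hypothetical direct invocation; args/kwargs mirror what from_pretrained forwards.
from paddlenlp.experimental.transformers.utils import (
    infererence_model_from_pretrained,
)
from paddlenlp.transformers import AutoConfig

config = AutoConfig.from_pretrained("./checkpoints/llama-7b")  # illustrative path
model = infererence_model_from_pretrained(
    LlamaForCausalLMInferenceModel,  # assumed inference-model class name
    "./checkpoints/llama-7b",
    (),                              # positional args forwarded from from_pretrained
    {"config": config, "dtype": "float16"},  # helper consumes this dict via pop()
)
```

One design caveat worth noting: the helper mutates the `kwargs` dict it receives, so callers should not reuse that dict across calls.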