[Cherry-pick] Fix safetensors shape #8703

Closed · wants to merge 27 commits

Commits (27)
0da8c5d  cherry-pick add scaling (#8264)  (lugimzzz, Apr 12, 2024)
4749af3  Upgrade paddlenlp to 2.8.0 (#8266)  (w5688414, Apr 12, 2024)
6c1f449  [BugFix] Try except sequence parallel utils (#8189) (#8274)  (DesmonDay, Apr 15, 2024)
dc5a6af  save_model: checkpoint_done --> model_done  (gongel, Apr 22, 2024)
7314063  fix import  (JunnYu, Apr 23, 2024)
d4062e5  Revert "fix import"  (JunnYu, Apr 23, 2024)
590cee9  Support Llama3 (#8315)  (ZHUI, Apr 23, 2024)
871070d  bug fixer (#8314) (#8318)  (FeixLiu, Apr 24, 2024)
0f428bb  [Distributed] [CustomDevices] Adapt SP on lora && polish MC2 APIs (#8…  (SylarTiaNII, Apr 23, 2024)
3105c18  fix 0f428bbe47daed3cd861f7047c3e9acbec4ea0b1 try import  (JunnYu, Apr 24, 2024)
89daaa3  [Trainer] Fix sharding overlap bug (#8334)  (DesmonDay, Apr 26, 2024)
27d0e60  Remove truncate (#8375)  (KB-Ding, May 7, 2024)
9e4a4f4  Fix llama3 eot id. (#8373)  (ZHUI, May 9, 2024)
debb2ad  [Trainer] update distributed dataloader (#8426)  (DesmonDay, May 13, 2024)
fc860a3  Fix load RNG compatibility. (#8451)  (ZHUI, May 16, 2024)
08898bf  Cherry-Pick fast_safe_open (#8458)  (ZHUI, May 20, 2024)
7a24bcc  Cherry pick type promotion fix. (#8463)  (zxcd, May 21, 2024)
8879f79  quick fix from pretrained. (#8487)  (ZHUI, May 23, 2024)
bbf945b  Release/2.8 (#8437)  (Galaxy1458, May 24, 2024)
82a7177  quick fix os.path.split (#8508)  (DesmonDay, May 29, 2024)
4d33655  [fea] Cherry-picked MOE updates from develop (#8531)  (bo-ke, Jun 3, 2024)
6757ff9  [LLM] relocate tensor_parallel_output to avoid conflict (#8419) (#8533)  (DesmonDay, Jun 3, 2024)
7c8d713  Update sequence_parallel for predict (#8547)  (DesmonDay, Jun 5, 2024)
c628f12  Cp/fix (#8569)  (ZHUI, Jun 7, 2024)
5b027c8  Don't save moe_group (#8570)  (DesmonDay, Jun 7, 2024)
db99efd  release 2.8.1 (#8636)  (ZHUI, Jun 20, 2024)
ad271a6  [Safetensors] Fix safetensors shape (#8702)  (DesmonDay, Jul 3, 2024)

4 changes: 4 additions & 0 deletions docs/trainer.md
@@ -705,4 +705,8 @@ Trainer is a simple but feature-complete Paddle training and evaluation module, and
     Whether use flatten_param_grads method in optimizer,
     only used on NPU devices.(default:False)
 
+--use_expert_parallel
+    Whether to enable MoE (Mixture of Experts) expert parallel training.
+    (default: False)
+
 ```
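
As a usage note (not part of the diff): the flag is parsed into `TrainingArguments` like any other trainer option. A minimal sketch, assuming PaddleNLP's `PdArgumentParser` handles bool fields the way its Hugging Face counterpart does:

```python
from paddlenlp.trainer import PdArgumentParser, TrainingArguments

# Hypothetical invocation: parse trainer options from an argv-style list.
# --use_expert_parallel is the flag documented above (default: False).
parser = PdArgumentParser(TrainingArguments)
(training_args,) = parser.parse_args_into_dataclasses(
    ["--output_dir", "./checkpoints", "--use_expert_parallel", "true"]
)
print(training_args.use_expert_parallel)  # True
```
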
14 changes: 10 additions & 4 deletions llm/finetune_generation.py
@@ -16,6 +16,7 @@
 import sys
 from dataclasses import dataclass, field
 from functools import partial
+from typing import Optional
 
 import paddle
 from argument import (
@@ -45,6 +46,7 @@
     AutoConfig,
     AutoModelForCausalLM,
     AutoTokenizer,
+    Llama3Tokenizer,
     LlamaTokenizer,
 )
 from paddlenlp.utils.log import logger
@@ -65,6 +67,10 @@ class FinetuneArguments(TrainingArguments):
         default=0,
         metadata={"help": "The steps use to control the learing rate."},
     )
+    tensor_parallel_output: Optional[bool] = field(
+        default=False,
+        metadata={"help": "whether to output logits in distributed status"},
+    )
 
 
 def read_local_dataset(path):
@@ -139,7 +145,7 @@ def main():
     if not training_args.autotuner_benchmark:
         model = AutoModelForCausalLMPipe.from_pretrained(
             model_args.model_name_or_path,
-            tensor_parallel_output=False,
+            tensor_parallel_output=training_args.tensor_parallel_output,
             tensor_parallel_degree=training_args.tensor_parallel_degree,
             tensor_parallel_rank=training_args.tensor_parallel_rank,
             use_flash_attention=model_args.use_flash_attention,
@@ -151,7 +157,7 @@
         # NOTE(gongenlei): new add autotuner_benchmark
         model_config = AutoConfig.from_pretrained(
             model_args.model_name_or_path,
-            tensor_parallel_output=False,
+            tensor_parallel_output=training_args.tensor_parallel_output,
             tensor_parallel_degree=training_args.tensor_parallel_degree,
             tensor_parallel_rank=training_args.tensor_parallel_rank,
             dtype=dtype,
@@ -162,7 +168,7 @@
     else:
         model_config = AutoConfig.from_pretrained(
             model_args.model_name_or_path,
-            tensor_parallel_output=False,
+            tensor_parallel_output=training_args.tensor_parallel_output,
             tensor_parallel_degree=training_args.tensor_parallel_degree,
             tensor_parallel_rank=training_args.tensor_parallel_rank,
             dtype=dtype,
@@ -232,7 +238,7 @@ def neft_post_hook(module, input, output):
     if tokenizer.chat_template is not None:
         data_args.eval_with_do_generation = False
 
-    if isinstance(tokenizer, LlamaTokenizer):
+    if isinstance(tokenizer, LlamaTokenizer) or isinstance(tokenizer, Llama3Tokenizer):
        tokenizer.pad_token_id = tokenizer.eos_token_id
 
     if data_args.dataset_name_or_path is None:
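
The recurring edit above swaps the hard-coded `tensor_parallel_output=False` for the new `FinetuneArguments.tensor_parallel_output` field. A hedged sketch of the call path it unlocks (model id and parallel degree are placeholders, not values from the PR):

```python
from paddlenlp.transformers import AutoConfig, AutoModelForCausalLM

# With tensor_parallel_output=True, each tensor-parallel rank keeps its local
# shard of the logits (the vocab dimension is split across ranks), so the
# in-model all-gather is skipped and callers must gather shards themselves.
config = AutoConfig.from_pretrained(
    "meta-llama/Meta-Llama-3-8B",  # placeholder model id
    tensor_parallel_output=True,
    tensor_parallel_degree=2,      # placeholder degree
    tensor_parallel_rank=0,
)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B", config=config
)
```
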
11 changes: 11 additions & 0 deletions llm/run_pretrain.py
@@ -46,6 +46,7 @@
 )
 from paddlenlp.utils.batch_sampler import DistributedBatchSampler
 from paddlenlp.utils.log import logger
+from paddlenlp.utils.tools import get_env_device
 
 
 def add_start_docstrings(*docstr):
@@ -483,6 +484,16 @@ def main():
         config.num_attention_heads % config.sep_parallel_degree == 0
     ), f"num_attention_heads:{config.num_attention_heads} must be divisible by sep_parallel_degree {config.sep_parallel_degree}"
 
+    if get_env_device() == "xpu" and training_args.gradient_accumulation_steps > 1:
+        try:
+            from paddle_xpu.layers.nn.linear import LinearConfig  # noqa: F401
+
+            LinearConfig.enable_accumulate_steps_opt()
+            LinearConfig.set_accumulate_steps(training_args.gradient_accumulation_steps)
+        except ImportError:
+            # It's OK, not use accumulate_steps optimization
+            pass
+
     print("Final pre-training config:", config)
 
     # Set the dtype for loading model
9 changes: 9 additions & 0 deletions llm/utils.py
@@ -125,9 +125,11 @@ def get_lora_target_modules(model):
             ".*v_proj.*",
             ".*k_proj.*",
             ".*o_proj.*",
+            ".*qkv_proj.*",
             ".*gate_proj.*",
             ".*down_proj.*",
             ".*up_proj.*",
+            ".*gate_up_fused_proj.*",
         ]
     elif model.base_model_prefix == "opt":
         target_modules = [
@@ -209,6 +211,13 @@ def prediction_step(
         # keepdim in order to maintain the same shape as logits
         if isinstance(logits, (list, tuple)):
             logits = logits[0]
+        # all gather logits when enabling tensor_parallel_output
+        if self.args.tensor_parallel_degree > 1 and getattr(self.args, "tensor_parallel_output", False):
+            hcg = fleet.get_hybrid_communicate_group()
+            model_parallel_group = hcg.get_model_parallel_group()
+            gathered_logits = []
+            dist.all_gather(gathered_logits, logits, group=model_parallel_group)
+            logits = paddle.concat(gathered_logits, axis=-1)
         return (loss, logits.argmax(axis=-1, keepdim=True), labels)
 
         loss = None
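
The new `prediction_step` branch is the consumer of that flag: with `tensor_parallel_output` enabled, each rank's logits cover only a slice of the vocabulary, so they must be gathered before the `argmax`. The same idea as a standalone helper (a sketch; process-group setup is elided):

```python
import paddle
import paddle.distributed as dist


def gather_sharded_logits(local_logits, group=None):
    # Each tensor-parallel rank holds [batch, seq, vocab // world_size].
    # Concatenating the gathered shards on the last axis restores
    # [batch, seq, vocab], so argmax ranges over every token id.
    gathered = []
    dist.all_gather(gathered, local_logits, group=group)
    return paddle.concat(gathered, axis=-1)
```
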
@@ -48,13 +48,16 @@
     MinLengthLogitsProcessor,
     RepetitionPenaltyLogitsProcessor,
 )
-from paddle.distributed.fleet.utils.sequence_parallel_utils import (
-    ColumnSequenceParallelLinear,
-    GatherOp,
-    RowSequenceParallelLinear,
-    ScatterOp,
-    mark_as_sequence_parallel_parameter,
-)
+try:
+    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+        ColumnSequenceParallelLinear,
+        GatherOp,
+        RowSequenceParallelLinear,
+        ScatterOp,
+        mark_as_sequence_parallel_parameter,
+    )
+except:
+    pass
 
 from paddlenlp.transformers.segment_parallel_utils import ReshardLayer
 
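
The bare `try/except` keeps this module importable on Paddle builds that predate `sequence_parallel_utils`, at the cost of deferring failures to first use. A common variant (a sketch, not the PR's code) records availability so call sites can fail with a clear message:

```python
try:
    from paddle.distributed.fleet.utils.sequence_parallel_utils import GatherOp

    HAS_SEQ_PARALLEL = True
except ImportError:
    GatherOp = None
    HAS_SEQ_PARALLEL = False


def require_seq_parallel():
    # Raise a descriptive error instead of a NameError at call time.
    if not HAS_SEQ_PARALLEL:
        raise RuntimeError("This Paddle build lacks sequence_parallel_utils.")
```
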
@@ -24,9 +24,12 @@
 from ppfleetx.core.module.basic_module import BasicModule
 from ppfleetx.data.tokenizers import GPTTokenizer
 from ppfleetx.distributed.apis import env
-from paddle.distributed.fleet.utils.sequence_parallel_utils import (
-    register_sequence_parallel_allreduce_hooks,
-)
+try:
+    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+        register_sequence_parallel_allreduce_hooks,
+    )
+except:
+    pass
 from ppfleetx.utils.log import logger
 
 # TODO(haohongxiang): to solve the problem of cross-reference
2 changes: 1 addition & 1 deletion paddlenlp/__init__.py
@@ -18,7 +18,7 @@
 PADDLENLP_STABLE_VERSION = "PADDLENLP_STABLE_VERSION"
 
 
-__version__ = "2.7.1.post"
+__version__ = "2.8.1.post"
 if os.getenv(PADDLENLP_STABLE_VERSION):
     __version__ = __version__.replace(".post", "")
 