Commit db80bdd

[Models] Add Llama-3.2 (#9199)
* add llama3.2
1 parent 1694d7c commit db80bdd

File tree: 3 files changed (+96, -29 lines)


README.md

Lines changed: 4 additions & 6 deletions
@@ -70,6 +70,7 @@ Unified Checkpoint 大模型存储格式在模型参数分布上支持动态扩
 | [LLama2](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/config/llama) | meta-llama/Llama-2-7b, meta-llama/Llama-2-7b-chat, meta-llama/Llama-2-13b, meta-llama/Llama-2-13b-chat, meta-llama/Llama-2-70b, meta-llama/Llama-2-70b-chat |
 | [LLama3](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/config/llama) | meta-llama/Meta-Llama-3-8B, meta-llama/Meta-Llama-3-8B-Instruct, meta-llama/Meta-Llama-3-70B, meta-llama/Meta-Llama-3-70B-Instruct |
 | [LLama3.1](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/config/llama) | meta-llama/Meta-Llama-3.1-8B, meta-llama/Meta-Llama-3.1-8B-Instruct, meta-llama/Meta-Llama-3.1-70B, meta-llama/Meta-Llama-3.1-70B-Instruct, meta-llama/Meta-Llama-3.1-405B, meta-llama/Meta-Llama-3.1-405B-Instruct, meta-llama/Llama-Guard-3-8B |
+| [LLama3.2](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/config/llama) | meta-llama/Llama-3.2-1B, meta-llama/Llama-3.2-1B-Instruct, meta-llama/Llama-3.2-3B, meta-llama/Llama-3.2-3B-Instruct, meta-llama/Llama-Guard-3-1B |
 | [Baichuan](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/config/baichuan) | baichuan-inc/Baichuan-7B, baichuan-inc/Baichuan-13B-Base, baichuan-inc/Baichuan-13B-Chat |
 | [Baichuan2](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/config/baichuan) | baichuan-inc/Baichuan2-7B-Base, baichuan-inc/Baichuan2-7B-Chat, baichuan-inc/Baichuan2-13B-Base, baichuan-inc/Baichuan2-13B-Chat |
 | [Bloom](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/config/bloom) | bigscience/bloom-560m, bigscience/bloom-560m-bf16, bigscience/bloom-1b1, bigscience/bloom-3b, bigscience/bloom-7b1, bigscience/bloomz-560m, bigscience/bloomz-1b1, bigscience/bloomz-3b, bigscience/bloomz-7b1-mt, bigscience/bloomz-7b1-p3, bigscience/bloomz-7b1, bellegroup/belle-7b-2m |
@@ -85,7 +86,7 @@ Unified Checkpoint 大模型存储格式在模型参数分布上支持动态扩
 | [Qwen2](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/config/qwen/) | Qwen/Qwen2-0.5B, Qwen/Qwen2-0.5B-Instruct, Qwen/Qwen2-1.5B, Qwen/Qwen2-1.5B-Instruct, Qwen/Qwen2-7B, Qwen/Qwen2-7B-Instruct, Qwen/Qwen2-72B, Qwen/Qwen2-72B-Instruct, Qwen/Qwen2-57B-A14B, Qwen/Qwen2-57B-A14B-Instruct |
 | [Qwen2-Math](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/config/qwen/) | Qwen/Qwen2-Math-1.5B, Qwen/Qwen2-Math-1.5B-Instruct, Qwen/Qwen2-Math-7B, Qwen/Qwen2-Math-7B-Instruct, Qwen/Qwen2-Math-72B, Qwen/Qwen2-Math-72B-Instruct, Qwen/Qwen2-Math-RM-72B |
 | [Qwen2.5](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/config/qwen/) | Qwen/Qwen2.5-0.5B, Qwen/Qwen2.5-0.5B-Instruct, Qwen/Qwen2.5-1.5B, Qwen/Qwen2.5-1.5B-Instruct, Qwen/Qwen2.5-3B, Qwen/Qwen2.5-3B-Instruct, Qwen/Qwen2.5-7B, Qwen/Qwen2.5-7B-Instruct, Qwen/Qwen2.5-14B, Qwen/Qwen2.5-14B-Instruct, Qwen/Qwen2.5-32B, Qwen/Qwen2.5-32B-Instruct, Qwen/Qwen2.5-72B, Qwen/Qwen2.5-72B-Instruct |
-| [Qwen2.5-Math](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/config/qwen/) | Qwen/Qwen2.5-Math-1.5B, Qwen/Qwen2.5-Math-1.5B-Instruct, Qwen/Qwen2.5-Math-7B, Qwen/Qwen2.5-Math-7B-Instruct, Qwen/Qwen2.5-Math-72B, Qwen/Qwen2.5-Math-72B-Instruct, Qwen/Qwen2.5-Math-RM-72B |
+| [Qwen2.5-Math](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/config/qwen/) | Qwen/Qwen2.5-Math-1.5B, Qwen/Qwen2.5-Math-1.5B-Instruct, Qwen/Qwen2.5-Math-7B, Qwen/Qwen2.5-Math-7B-Instruct, Qwen/Qwen2.5-Math-72B, Qwen/Qwen2.5-Math-72B-Instruct, Qwen/Qwen2.5-Math-RM-72B |
 | [Qwen2.5-Coder](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/config/qwen/) | Qwen/Qwen2.5-Coder-1.5B, Qwen/Qwen2.5-Coder-1.5B-Instruct, Qwen/Qwen2.5-Coder-7B, Qwen/Qwen2.5-Coder-7B-Instruct |
 | [Yuan2](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/config/yuan/) | IEITYuan/Yuan2-2B, IEITYuan/Yuan2-51B, IEITYuan/Yuan2-102B |
 
@@ -96,9 +97,6 @@ Unified Checkpoint 大模型存储格式在模型参数分布上支持动态扩
 |:---------------------:|:--------:|:------------:|:--------:|:------------:|:------:|:------:|:----------:|
 | | | 基础能力 | 序列并行 | stage1 | stage2 | stage3 | |
 | Llama ||||||||
-| Llama2 ||||||||
-| Llama3 ||||||||
-| Llama3.1 ||||||||
 | Qwen ||||||||
 | Qwen1.5 ||||||||
 | Qwen2 ||||||||
@@ -119,7 +117,7 @@ Unified Checkpoint 大模型存储格式在模型参数分布上支持动态扩
 
 | 模型名称/能力支持 | Pretrain | SFT | LoRA | Prefix Tuning | DPO | RLHF | Quantization | Torch convert |
 |:------------------:|:--------:|:---:|:----:|:-------------:|:---:|:----:|:------------:|:-------------:|
-| LLaMA |||||||||
+| Llama |||||||||
 | Qwen |||||| 🚧 | 🚧 ||
 | Mixtral ||||| 🚧 | 🚧 | 🚧 | 🚧 |
 | Mistral |||||| 🚧 | 🚧 ||
@@ -151,7 +149,7 @@ Unified Checkpoint 大模型存储格式在模型参数分布上支持动态扩
 * python >= 3.8
 * paddlepaddle >= 3.0.0b0
 
-如果您尚未安装PaddlePaddle,请参考 [飞桨官网](https://www.paddlepaddle.org.cn/) 进行安装。
+如果您尚未安装 PaddlePaddle,请参考 [飞桨官网](https://www.paddlepaddle.org.cn/) 进行安装。
 
 ### pip 安装
 
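The README change above only registers the new checkpoints. For orientation, here is a hedged usage sketch (not part of this commit) for loading one of the newly listed Llama-3.2 models with PaddleNLP's auto classes; the exact generation arguments may differ between PaddleNLP versions.

```python
from paddlenlp.transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Llama-3.2-1B"  # one of the IDs added to the table above
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, dtype="bfloat16")

inputs = tokenizer("PaddleNLP now supports Llama 3.2, which means", return_tensors="pd")
# generate() returns (generated_ids, scores) in PaddleNLP; argument names can vary by version.
output_ids, _ = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True))
```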

paddlenlp/transformers/llama/modeling.py

Lines changed: 44 additions & 17 deletions
@@ -173,7 +173,7 @@ def assign_kv_heads(num_kv_heads: int, num_gpus: int):
     return assignment_list
 
 
-def parallel_matmul(x: Tensor, y: Tensor, tensor_parallel_output=True):
+def parallel_matmul(x: Tensor, y: Tensor, transpose_y=False, tensor_parallel_output=True):
     is_fleet_init = True
     tensor_parallel_degree = 1
     try:
@@ -191,15 +191,15 @@ def parallel_matmul(x: Tensor, y: Tensor, tensor_parallel_output=True):
     if is_fleet_init and tensor_parallel_degree > 1 and y_is_distributed:
         # if not running under distributed.launch, it will raise AttributeError: 'Fleet' object has no attribute '_hcg'
         input_parallel = paddle.distributed.collective._c_identity(x, group=model_parallel_group)
-        logits = paddle.matmul(input_parallel, y, transpose_y=False)
+        logits = paddle.matmul(input_parallel, y, transpose_y=transpose_y)
 
         if tensor_parallel_output:
             return logits
 
         return paddle.distributed.collective._c_concat(logits, group=model_parallel_group)
 
     else:
-        logits = paddle.matmul(x, y, transpose_y=False)
+        logits = paddle.matmul(x, y, transpose_y=transpose_y)
         return logits
 
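The only functional change to `parallel_matmul` is the new `transpose_y` flag: when the LM head reuses the embedding table, the weight is stored as `[vocab_size, hidden_size]` rather than `[hidden_size, vocab_size]`, so the projection has to multiply against its transpose. A small illustrative sketch (plain Paddle, no distributed setup; the toy sizes are made up):

```python
import paddle

hidden_size, vocab_size = 16, 100
x = paddle.randn([2, hidden_size])

w_head = paddle.randn([hidden_size, vocab_size])    # untied lm_head layout
w_tied = paddle.transpose(w_head, perm=[1, 0])      # tied layout: embedding is [vocab, hidden]

# Same logits either way; only the storage layout (and hence transpose_y) differs.
logits_a = paddle.matmul(x, w_head, transpose_y=False)
logits_b = paddle.matmul(x, w_tied, transpose_y=True)
print(bool(paddle.allclose(logits_a, logits_b)))    # True
```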

@@ -1267,7 +1267,8 @@ def _get_name_mappings(cls, config: LlamaConfig) -> list[StateDictNameMapping]:
         for mapping in model_mappings:
             mapping[0] = "model." + mapping[0]
             mapping[1] = "llama." + mapping[1]
-        model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"])
+        if not config.tie_word_embeddings:
+            model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"])
 
         mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)]
         return mappings
@@ -1288,13 +1289,17 @@ def get_tensor_parallel_split_mappings(num_layers):
             final_actions = {}
 
             base_actions = {
-                "lm_head.weight": partial(fn, is_column=True),
                 # Row Linear
                 "embed_tokens.weight": partial(fn, is_column=False),
                 "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False),
                 "layers.0.mlp.down_proj.weight": partial(fn, is_column=False),
             }
 
+            if config.tie_word_embeddings:
+                base_actions["lm_head.weight"] = partial(fn, is_column=False)
+            else:
+                base_actions["lm_head.weight"] = partial(fn, is_column=True)
+
             if not config.vocab_size % config.tensor_parallel_degree == 0:
                 base_actions.pop("lm_head.weight")
                 base_actions.pop("embed_tokens.weight")
@@ -1842,29 +1847,40 @@ def backward(ctx, grad):
 
 
 class LlamaLMHead(nn.Layer):
-    def __init__(self, config: LlamaConfig):
+    def __init__(self, config: LlamaConfig, embedding_weights=None, transpose_y=False):
         super(LlamaLMHead, self).__init__()
         self.config = config
         if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0:
             vocab_size = config.vocab_size // config.tensor_parallel_degree
         else:
             vocab_size = config.vocab_size
 
-        if vocab_size != config.vocab_size:
-            with get_rng_state_tracker().rng_state():
+        self.transpose_y = transpose_y
+        if transpose_y:
+            if embedding_weights is not None:
+                self.weight = embedding_weights
+            else:
                 self.weight = self.create_parameter(
-                    shape=[config.hidden_size, vocab_size],
+                    shape=[vocab_size, config.hidden_size],
                     dtype=paddle.get_default_dtype(),
                 )
         else:
-            self.weight = self.create_parameter(
-                shape=[config.hidden_size, vocab_size],
-                dtype=paddle.get_default_dtype(),
-            )
+            if vocab_size != config.vocab_size:
+                with get_rng_state_tracker().rng_state():
+                    self.weight = self.create_parameter(
+                        shape=[config.hidden_size, vocab_size],
+                        dtype=paddle.get_default_dtype(),
+                    )
+            else:
+                self.weight = self.create_parameter(
+                    shape=[config.hidden_size, vocab_size],
+                    dtype=paddle.get_default_dtype(),
+                )
         # Must set distributed attr for Tensor Parallel !
         self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False
         if self.weight.is_distributed:
-            self.weight.split_axis = 1
+            # for tie_word_embeddings
+            self.weight.split_axis = 0 if self.transpose_y else 1
         if get_env_device() == "xpu":
             try:
                 from paddle_xpu.layers.nn import (  # noqa: F401
@@ -1892,22 +1908,33 @@ def forward(self, hidden_states, tensor_parallel_output=None):
 
         if get_env_device() == "xpu" and self.xpu_parallel_matmul is not None:
             logits = self.xpu_parallel_matmul(
-                hidden_states, self.weight, tensor_parallel_output=tensor_parallel_output, training=self.training
+                hidden_states,
+                self.weight,
+                transpose_y=self.transpose_y,
+                tensor_parallel_output=tensor_parallel_output,
+                training=self.training,
             )
         else:
-            logits = parallel_matmul(hidden_states, self.weight, tensor_parallel_output=tensor_parallel_output)
+            logits = parallel_matmul(
+                hidden_states, self.weight, transpose_y=self.transpose_y, tensor_parallel_output=tensor_parallel_output
+            )
         return logits
 
 
 class LlamaForCausalLM(LlamaPretrainedModel):
     enable_to_static_method = True
+    _tied_weights_keys = ["lm_head.weight"]
 
     def __init__(self, config):
         super().__init__(config)
         self.config = config
 
         self.llama = LlamaModel(config)
-        self.lm_head = LlamaLMHead(config)
+        if config.tie_word_embeddings:
+            self.lm_head = LlamaLMHead(config, embedding_weights=self.llama.embed_tokens.weight, transpose_y=True)
+            self.tie_weights()
+        else:
+            self.lm_head = LlamaLMHead(config)
         self.criterion = LlamaPretrainingCriterion(config)
 
     def get_input_embeddings(self):
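Taken together, the `LlamaLMHead` and `LlamaForCausalLM` changes mean that a config with `tie_word_embeddings=True` (as the small Llama-3.2 checkpoints use) ends up with a single shared `[vocab_size, hidden_size]` matrix instead of a separate output projection. A minimal sketch, assuming a toy config (all sizes below are invented for illustration):

```python
from paddlenlp.transformers import LlamaConfig, LlamaForCausalLM

config = LlamaConfig(
    vocab_size=1024,
    hidden_size=64,
    intermediate_size=128,
    num_hidden_layers=2,
    num_attention_heads=4,
    num_key_value_heads=4,
    tie_word_embeddings=True,  # Llama-3.2-1B/3B style: reuse the embedding as the output head
)
model = LlamaForCausalLM(config)

# The head holds the very same parameter object as the embedding, stored as
# [vocab_size, hidden_size], and forward() multiplies with transpose_y=True.
print(model.lm_head.weight is model.llama.embed_tokens.weight)  # True
print(model.lm_head.weight.shape)                               # [1024, 64]
```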

paddlenlp/transformers/llama/modeling_pp.py

Lines changed: 48 additions & 6 deletions
@@ -17,7 +17,11 @@
 import paddle
 import paddle.distributed.fleet as fleet
 import paddle.nn as nn
-from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer
+from paddle.distributed.fleet.meta_parallel import (
+    LayerDesc,
+    PipelineLayer,
+    SharedLayerDesc,
+)
 from paddle.distributed.fleet.utils import recompute
 
 from paddlenlp.transformers.model_utils import PipelinePretrainedModel
@@ -102,6 +106,13 @@ def return_args(
     return ret
 
 
+def get_attr(layer, name):
+    if getattr(layer, name, None) is not None:
+        return getattr(layer, name, None)
+    else:
+        return get_attr(layer._layer, name)
+
+
 class LlamaEmbeddingPipe(nn.Layer):
     """Extends LlamaEmbeddings to forward attention_mask through the pipeline."""
 
@@ -119,6 +130,10 @@ def __init__(self, config):
         else:
             self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
 
+    @property
+    def embedding_weight(self):
+        return get_attr(self.embed_tokens, "weight")
+
     def forward(self, args):
         """_summary_
 
@@ -269,6 +284,15 @@ def forward(self, args):
         return self.norm(hidden_states)
 
 
+class LlamaLMHeadPipe(LlamaLMHead):
+    def __init__(self, config, transpose_y=False):
+        super(LlamaLMHeadPipe, self).__init__(config, transpose_y=transpose_y)
+
+    @property
+    def embedding_weight(self):
+        return get_attr(self, "weight")
+
+
 class LlamaForCausalLMPipe(PipelinePretrainedModel, PipelineLayer):
     """LlamaForPretraining adapted for pipeline parallelism.
 
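The small `get_attr` helper exists because the embedding parameter is not always a direct attribute: parallel wrappers can keep the real tensor on an inner `_layer`, and both pipe classes expose it through the same `embedding_weight` property so the shared-weight machinery can find it. An illustrative sketch of the lookup (the `Wrapper` class is a made-up stand-in; `get_attr` is copied from the diff above):

```python
import paddle.nn as nn


class Wrapper:
    """Stand-in for a layer that keeps its parameter one level down on `_layer`."""

    def __init__(self, inner):
        self._layer = inner


def get_attr(layer, name):
    if getattr(layer, name, None) is not None:
        return getattr(layer, name, None)
    else:
        return get_attr(layer._layer, name)


embed = nn.Embedding(100, 16)
print(get_attr(embed, "weight") is embed.weight)           # direct hit
print(get_attr(Wrapper(embed), "weight") is embed.weight)  # resolved through _layer
```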
@@ -332,14 +356,35 @@ def get_hcg():
         config.tensor_parallel_degree = tensor_parallel_degree
         config.tensor_parallel_rank = tensor_parallel_rank
 
-        self.add_sequential_layer(LayerDesc(LlamaEmbeddingPipe, config=config), "llama")
+        if config.tie_word_embeddings:
+            self.add_sequential_layer(
+                SharedLayerDesc(
+                    "llama_shared_weight", LlamaEmbeddingPipe, shared_weight_attr="embedding_weight", config=config
+                ),
+                "llama",
+            )
+        else:
+            self.add_sequential_layer(LayerDesc(LlamaEmbeddingPipe, config=config), "llama")
+
         for i in range(config.num_hidden_layers):
             self.add_sequential_layer(
                 LayerDesc(LlamaDecoderLayerPipe, config=config, layerwise_recompute=i not in self.no_recompute_layers),
                 f"llama.layers.{i}",
             )
         self.add_sequential_layer(LayerDesc(LlamaRMSNormPipe, config=config), "llama")
-        self.add_head(config)
+        if config.tie_word_embeddings:
+            self.add_sequential_layer(
+                SharedLayerDesc(
+                    "llama_shared_weight",
+                    LlamaLMHeadPipe,
+                    shared_weight_attr="embedding_weight",
+                    config=config,
+                    **{"transpose_y": True},
+                ),
+                "lm_head",
+            )
+        else:
+            self.add_sequential_layer(LayerDesc(LlamaLMHeadPipe, config=config), "lm_head")
 
         recompute_interval = 0
 
@@ -366,8 +411,5 @@ def get_hcg():
         # DON'T init PipelinePretrainedModel
         # PipelinePretrainedModel.__init__(self.super(), config=config)
 
-    def add_head(self, config):
-        self.add_sequential_layer(LayerDesc(LlamaLMHead, config=config), "lm_head")
-
     def get_loss_fn(self, config):
         return LlamaPretrainingCriterion(config)
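Under pipeline parallelism the embedding and the LM head usually live on different stages, so simply handing the head the embedding tensor (as the non-pipeline model does) is not enough. Instead, both stages register a `SharedLayerDesc` under the same key, `llama_shared_weight`, exposing the tensor via `embedding_weight`, and Paddle's `PipelineLayer` keeps the two copies synchronized. Within a single process the tied computation itself is just one parameter used twice, which the following sketch illustrates with plain Paddle layers (illustrative only, not the pipeline classes):

```python
import paddle
import paddle.nn as nn

vocab_size, hidden_size = 100, 16
embed = nn.Embedding(vocab_size, hidden_size)

token_ids = paddle.randint(0, vocab_size, shape=[2, 5])
hidden = embed(token_ids)                                        # [2, 5, hidden_size]
logits = paddle.matmul(hidden, embed.weight, transpose_y=True)   # reuse the embedding as the head

loss = logits.mean()
loss.backward()
# The single shared weight accumulates gradients from both the lookup and the projection.
print(logits.shape, embed.weight.grad.shape)
```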
