
Commit e869c1c
Merge branch 'develop' into sd3_dreambooth
2 parents: e9dea72 + 907a798

115 files changed: +15,338 −602 lines


.gitignore
Lines changed: 4 additions & 0 deletions

@@ -72,6 +72,10 @@ coverage.xml
 # NPU meta folder
 kernel_meta/
 
+pretrained/
+playground/
+log/
+
 # MAC
 *.DS_Store
 

README.md
Lines changed: 15 additions & 7 deletions

@@ -19,10 +19,13 @@ PaddleMIX is a multimodal large-model development suite built on PaddlePaddle, aggregating image, te…
 
 ## Latest Updates
 
-📚 "PaddleMIX 2.0, PaddlePaddle's Multimodal Large-Model Development Suite, Is Here": full coverage of image, text, audio, and video scenarios, with multimodality efficiently powering industry innovation. Ultra-large-scale training support, covering image-text pre-training, text-to-image generation, and cross-modal vision tasks, across industry scenarios such as finance, education, e-commerce, and healthcare. Live on Thursday, August 8 at 20:00: the latest multimodal model architectures, a deep dive into the high-performance PaddleMIX model library, and a hands-on walkthrough of the full LLaVA training and inference pipeline. [Sign-up link](https://www.wjx.top/vm/wKqysjx.aspx?udsid=449688)
+📚 "PaddleMIX 2.0, PaddlePaddle's Multimodal Large-Model Development Suite, Is Here": full coverage of image, text, audio, and video scenarios, with multimodality efficiently powering industry innovation. Ultra-large-scale training support, covering image-text pre-training, text-to-image generation, and cross-modal vision tasks, across industry scenarios such as finance, education, e-commerce, and healthcare. Live on Thursday, August 8 at 20:00: the latest multimodal model architectures, a deep dive into the high-performance PaddleMIX model library, and a hands-on walkthrough of the full LLaVA training and inference pipeline. [Sign-up link](https://www.wjx.top/vm/wKqysjx.aspx?udsid=449688)
+
+**2024.09.11 update**
+* Added Qwen2-VL, InternVL2, SD3, and other models
 
 **2024.07.25 PaddleMIX v2.0 released**
-* Multimodal understanding: added the LLaVA series, Qwen-VL, and more; added an Auto module that unifies the SFT training workflow; added the mixtoken training strategy, raising SFT throughput 5.6x.
+* Multimodal understanding: added the LLaVA series, Qwen-VL, and more; added an Auto module that unifies the SFT training workflow; added the mixtoken training strategy, raising SFT throughput 5.6x.
 * Multimodal generation: released [PPDiffusers 0.24.1](./ppdiffusers/README.md) with video generation support and LCM added to the text-to-image models; added PaddlePaddle ports of the peft and accelerate backends; ships a ComfyUI plugin built on PaddlePaddle.
 * Multimodal data-processing toolbox [DataCopilot](./paddlemix/datacopilot/): supports custom data structures, data conversion, and offline format checks; provides basic statistics and data visualization.
 
@@ -104,10 +107,10 @@ pip install -e .
 <tbody>
 <tr align="center" valign="center">
 <td>
-<b>Multimodal Pre-training</b>
+<b>Multimodal Understanding</b>
 </td>
 <td>
-<b>Diffusion Models</b>
+<b>Multimodal Generation</b>
 </td>
 </tr>
 <tr valign="top">
@@ -116,14 +119,19 @@ pip install -e .
 </ul>
 <li><b>Image-Text Pre-training</b></li>
 <ul>
+<li><a href="paddlemix/examples/clip">CLIP</a></li>
 <li><a href="paddlemix/examples/evaclip">EVA-CLIP</a></li>
+<li><a href="paddlemix/examples/llava">LLaVA</a></li>
+<li><a href="paddlemix/examples/llava">LLaVA-1.5</a></li>
+<li><a href="paddlemix/examples/llava">LLaVA-NeXT</a></li>
+<li><a href="paddlemix/examples/qwen_vl">Qwen-VL</a></li>
+<li><a href="paddlemix/examples/qwen2_vl">Qwen2-VL</a></li>
+<li><a href="paddlemix/examples/internvl2">InternVL2</a></li>
+<li><a href="paddlemix/examples/minimonkey">Mini-Monkey</a></li>
 <li><a href="paddlemix/examples/coca">CoCa</a></li>
-<li><a href="paddlemix/examples/clip">CLIP</a></li>
 <li><a href="paddlemix/examples/blip2">BLIP-2</a></li>
 <li><a href="paddlemix/examples/minigpt4">miniGPT-4</a></li>
 <li><a href="paddlemix/examples/visualglm">VIsualGLM</a></li>
-<li><a href="paddlemix/examples/qwen_vl">Qwen_VL</a></li>
-<li><a href="paddlemix/examples/llava">LLaVA</a></li>
 <li><a href="paddlemix/examples/cogvlm">CogVLM && CogAgent</a></li>
 <li><a href="paddlemix/examples/internlm_xcomposer2">InternLM-XComposer2</a></li>
 </ul>

README_EN.md
Lines changed: 9 additions & 4 deletions

@@ -100,7 +100,7 @@ pip install -e .
 <tbody>
 <tr align="center" valign="center">
 <td>
-<b>Multi-modal Pre-training</b>
+<b>Multi-modal Large Language Models</b>
 </td>
 <td>
 <b>Diffusion-based Models</b>
@@ -112,14 +112,19 @@ pip install -e .
 </ul>
 <li><b>Image-Text Pre-training</b></li>
 <ul>
+<li><a href="paddlemix/examples/clip">CLIP</a></li>
 <li><a href="paddlemix/examples/evaclip">EVA-CLIP</a></li>
+<li><a href="paddlemix/examples/llava">LLaVA</a></li>
+<li><a href="paddlemix/examples/llava">LLaVA-1.5</a></li>
+<li><a href="paddlemix/examples/llava">LLaVA-NeXT</a></li>
+<li><a href="paddlemix/examples/qwen_vl">Qwen-VL</a></li>
+<li><a href="paddlemix/examples/qwen2_vl">Qwen2-VL</a></li>
+<li><a href="paddlemix/examples/internvl2">InternVL2</a></li>
+<li><a href="paddlemix/examples/minimonkey">Mini-Monkey</a></li>
 <li><a href="paddlemix/examples/coca">CoCa</a></li>
-<li><a href="paddlemix/examples/clip">CLIP</a></li>
 <li><a href="paddlemix/examples/blip2">BLIP-2</a></li>
 <li><a href="paddlemix/examples/minigpt4">miniGPT-4</a></li>
 <li><a href="paddlemix/examples/visualglm">VIsualGLM</a></li>
-<li><a href="paddlemix/examples/qwen_vl">Qwen_VL</a></li>
-<li><a href="paddlemix/examples/llava">LLaVA</a></li>
 <li><a href="paddlemix/examples/cogvlm">CogVLM && CogAgent</a></li>
 <li><a href="paddlemix/examples/internlm_xcomposer2">InternLM-XComposer2</a></li>
 </ul>

comfyui/ComfyUI_ppdiffusers/basic_nodes.py
Lines changed: 10 additions & 3 deletions

@@ -36,7 +36,11 @@ def __init__(self):
     @classmethod
     def INPUT_TYPES(s):
         return {
-            "required": {"images": ("IMAGE",), "filename_prefix": ("STRING", {"default": "ComfyUI"})},
+            "required": {
+                "images": ("IMAGE",),
+                "filename_prefix": ("STRING", {"default": "ComfyUI"}),
+                "censor": ("BOOLEAN", {"default": True})
+            },
             "hidden": {"prompt": "PROMPT", "extra_pnginfo": "EXTRA_PNGINFO"},
         }

@@ -59,15 +63,18 @@ def censor_image(self, image):
         print(response)
         return response["result"]["pass"]

-    def save_images(self, images, filename_prefix="ComfyUI", prompt=None, extra_pnginfo=None):
+    def save_images(self, images, censor=True, filename_prefix="ComfyUI", prompt=None, extra_pnginfo=None):
         filename_prefix += self.prefix_append
         full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(
             filename_prefix, self.output_dir, images[0].shape[1], images[0].shape[0]
         )
         results = list()
         for (batch_number, image) in enumerate(images):
             img = Image.fromarray(image)
-            pass_censor = self.censor_image(img)
+            if censor:
+                pass_censor = self.censor_image(img)
+            else:
+                pass_censor = True
             # breakpoint()
             if pass_censor:
                 metadata = None
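In effect, the new `censor` input short-circuits the moderation round-trip: when the toggle is off, `censor_image` is never called and every image is treated as passing. A minimal, self-contained sketch of that control flow (the `moderate` callable here stands in for the node's real moderation request; the array-to-image step mirrors the loop above):

import numpy as np
from PIL import Image

def save_images_sketch(images, censor=True, moderate=lambda img: True):
    # `moderate` stands in for censor_image; it is only invoked when the
    # `censor` toggle is on, otherwise every image passes unconditionally.
    saved = []
    for image in images:
        img = Image.fromarray(image)
        pass_censor = moderate(img) if censor else True
        if pass_censor:
            saved.append(img)
    return saved

# Example: two blank RGB frames with moderation disabled are both kept.
frames = [np.zeros((64, 64, 3), dtype=np.uint8)] * 2
assert len(save_images_sketch(frames, censor=False)) == 2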

comfyui/ComfyUI_ppdiffusers/sd_pipe_nodes.py
Lines changed: 0 additions & 1 deletion

@@ -40,7 +40,6 @@ def INPUT_TYPES(cls):
     def load_checkpoint(self, ckpt_name):
         ckpt_path = folder_paths.get_full_path("checkpoints", ckpt_name)
         pipe = StableDiffusionPipeline.from_single_file(ckpt_path)
-        pipe = pipe.to(dtype=paddle.float16)
        return (pipe,)
 
 
comfyui/ComfyUI_ppdiffusers/sdxl_pipe_nodes.py
Lines changed: 0 additions & 1 deletion

@@ -41,7 +41,6 @@ def INPUT_TYPES(cls):
     def load_checkpoint(self, ckpt_name):
         ckpt_path = folder_paths.get_full_path("checkpoints", ckpt_name)
         pipe = StableDiffusionXLPipeline.from_single_file(ckpt_path)
-        pipe = pipe.to(dtype=paddle.float16)
         return (pipe,)
 
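With the forced cast removed from both loaders, the pipelines now stay in whatever dtype the checkpoint was saved in. A caller that still wants half precision can opt in after loading; a minimal sketch, assuming the same `ppdiffusers` pipeline class the node files use and a placeholder checkpoint path:

import paddle
from ppdiffusers import StableDiffusionPipeline

# "checkpoints/model.safetensors" is a placeholder path for illustration.
pipe = StableDiffusionPipeline.from_single_file("checkpoints/model.safetensors")
# Opt into float16 explicitly, reproducing the cast the loaders no longer force.
pipe = pipe.to(dtype=paddle.float16)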

deploy/llava/export_model.py
Lines changed: 4 additions & 3 deletions

@@ -30,13 +30,13 @@ def export_encode_text(model, config, compute_dtype):
 
 
 def export_encode_image(model, compute_dtype):
-
+    paddle.save(model.llama.image_newline,args.save_path + "/encode_image/clip/image_newline.pdparams")
     # convert to static graph with specific input description
     model = paddle.jit.to_static(
         model.encode_images,
         input_spec=[
-            paddle.static.InputSpec(shape=[None, 3, 336, 336], dtype=compute_dtype),  # images
-        ],
+            paddle.static.InputSpec(shape=[None,3, 336, 336], dtype=compute_dtype), # images
+        ]
     )
 
     # save to static model

@@ -76,6 +76,7 @@ def export_encode_image(model, compute_dtype):
     vision_tower = model.get_vision_tower()
     vision_tower.load_model()
     model.eval()
+
     export_encode_image(model, compute_dtype)
 
 elif args.encode_text:
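Two things happen in this export path: the learned `image_newline` separator is saved as raw parameters (the static predictor in `run_static_predict.py` below reloads it for anyres patch merging), and `encode_images` is traced into a static graph whose batch dimension is left open. A hedged sketch of the same `to_static` pattern in isolation (the function handle and save path are placeholders, not the script's actual arguments):

import paddle

def export_encoder(encode_fn, save_path, compute_dtype="float16"):
    # Trace the dynamic-graph function; shape=[None, 3, 336, 336] accepts any
    # batch of 336x336 RGB crops, matching the CLIP vision tower input.
    static_fn = paddle.jit.to_static(
        encode_fn,
        input_spec=[paddle.static.InputSpec(shape=[None, 3, 336, 336], dtype=compute_dtype)],
    )
    # Serialize the traced program and parameters for the inference predictor.
    paddle.jit.save(static_fn, save_path)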

deploy/llava/run_static_predict.py
Lines changed: 88 additions & 11 deletions

@@ -26,6 +26,8 @@
     IMAGE_TOKEN_INDEX,
 )
 from paddlemix.models.llava.conversation import conv_templates
+from paddlemix.models.llava.mm_utils import load_image,get_anyres_image_grid_shape
+from paddlemix.models.llava.base_model import unpad_image
 from paddlemix.utils.log import logger
 
 
@@ -39,15 +41,20 @@ def __init__(self, args):
 
         self.args = args
         self.config = AutoConfigMIX.from_pretrained(args.model_name_or_path)
+        self.clip_config = AutoConfigMIX.from_pretrained(self.config.mm_vision_tower)
+
 
         self.tokenizer = AutoTokenizerMIX.from_pretrained(args.model_name_or_path)
-        self.processor, _ = AutoProcessorMIX.from_pretrained(args.model_name_or_path, eval="eval")
+        self.processor, _ = AutoProcessorMIX.from_pretrained(args.model_name_or_path, image_aspect_ratio=self.config.image_aspect_ratio,eval="eval")
 
         self.first_predictor = self.create_predictor(args.first_model_path)
         print(f"first_model_path: {args.first_model_path}, {self.first_predictor}")
+
         self.second_predictor = self.create_predictor(args.second_model_path)
         print(f"second_model_path: {args.second_model_path}, {self.second_predictor}")
 
+        self.image_newline = paddle.load(os.path.join(args.first_model_path, "image_newline.pdparams"))
+
     def create_predictor(self, model_path):
 
         from paddlenlp.utils.import_utils import import_module
@@ -77,9 +84,79 @@ def create_predictor(self, model_path):
         return predictor
 
     @paddle.no_grad()
-    def encode_images(self, pixel_values):
-        language_model_inputs = self.first_predictor.run(pixel_values)
-        return language_model_inputs
+    def encode_images(self, images, image_sizes):
+        if type(images) is list or images.ndim == 5:
+            if type(images) is list:
+                images = [(x.unsqueeze(axis=0) if x.ndim == 3 else x) for x in images]
+            concat_images = paddle.concat(x=[image for image in images], axis=0)
+
+            image_features = self.first_predictor.run(concat_images)[0]
+
+            split_sizes = [image.shape[0] for image in images]
+            image_features = paddle.split(image_features, split_sizes, axis=0)
+            mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "flat")
+            image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square")
+            if mm_patch_merge_type == "flat":
+                image_features = [x.flatten(start_axis=0, stop_axis=1) for x in image_features]
+            elif mm_patch_merge_type.startswith("spatial"):
+                new_image_features = []
+                for image_idx, image_feature in enumerate(image_features):
+                    if image_feature.shape[0] > 1:
+                        base_image_feature = image_feature[0]
+                        image_feature = image_feature[1:]
+                        height = width = self.clip_config.image_resolution // self.clip_config.vision_patch_size
+                        assert height * width == base_image_feature.shape[0]
+                        if image_aspect_ratio == "anyres":
+                            num_patch_width, num_patch_height = get_anyres_image_grid_shape(
+                                image_sizes[image_idx],
+                                self.config.image_grid_pinpoints,
+                                self.clip_config.image_resolution,
+                            )
+
+                            image_feature = paddle.reshape(
+                                image_feature, (num_patch_height, num_patch_width, height, width, -1)
+                            )
+                        else:
+                            raise NotImplementedError
+                        if "unpad" in mm_patch_merge_type:
+                            image_feature = image_feature.transpose(perm=[4, 0, 2, 1, 3])
+                            image_feature = image_feature.flatten(start_axis=1, stop_axis=2).flatten(
+                                start_axis=2, stop_axis=3
+                            )
+                            image_feature = unpad_image(image_feature, image_sizes[image_idx])
+                            image_feature = paddle.concat(
+                                x=(
+                                    image_feature,
+                                    self.image_newline[:, (None), (None)].expand(
+                                        shape=[*image_feature.shape[:-1], 1]
+                                    ).astype(image_feature.dtype),
+                                ),
+                                axis=-1,
+                            )
+                            x = image_feature.flatten(start_axis=1, stop_axis=2)
+                            perm_12 = list(range(x.ndim))
+                            perm_12[0] = 1
+                            perm_12[1] = 0
+                            image_feature = x.transpose(perm=perm_12)
+                        else:
+                            image_feature = image_feature.transpose(perm=[0, 2, 1, 3, 4])
+                            image_feature = image_feature.flatten(start_axis=0, stop_axis=3)
+                        image_feature = paddle.concat(x=(base_image_feature, image_feature), axis=0)
+                    else:
+                        image_feature = image_feature[0]
+                        if "unpad" in mm_patch_merge_type:
+                            image_feature = paddle.concat(
+                                x=(image_feature, self.image_newline[None].to(image_feature.place)), axis=0
+                            )
+                    new_image_features.append(image_feature)
+                image_features = new_image_features
+                image_features = paddle.stack(x=image_features, axis=0)
+            else:
+                raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}")
+        else:
+            image_features = self.first_predictor.run(images)[0]
+
+        return image_features
 
     @paddle.no_grad()
     def generate_with_image_features(self, image_features, input_ids):
@@ -225,9 +302,9 @@ def pre_processing(self, inp, first_message):
         conv.append_message(conv.roles[1], None)
         prompt = conv.get_prompt()
         record = {"image": self.args.image_file, "conversations": prompt}
-
+        image_size = load_image(args.image_file).size
         data_dict = self.processor(record=record, image_aspect_ratio=self.config.image_aspect_ratio)
-
+        data_dict['image_size'] = [image_size]
         return data_dict
 
     def post_processing(self, generate_ids):
@@ -245,8 +322,8 @@ def run_benchmark(self):
         inp = "user: Generate the caption in English with grounding"
         data_dict = self.pre_processing(inp, first_message)
         image = paddle.cast(data_dict["images"], self.compute_dtype)
-
-        image_features = self.encode_images(image)[0]
+
+        image_features = self.encode_images(image,data_dict['image_size'])
 
         generate_ids, _ = self.generate_with_image_features(
             image_features,
@@ -277,9 +354,9 @@ def predict(self):
         print(f"{roles[1]}: ", end="")
         data_dict = self.pre_processing(inp, first_message)
         image = paddle.cast(data_dict["images"], self.compute_dtype)
-
-        image_features = self.encode_images(image)[0]
-
+
+        image_features = self.encode_images(image,data_dict['image_size'])
+
         generate_ids, _ = self.generate_with_image_features(
             image_features,
             data_dict["input_ids"],
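For orientation in the anyres branch above: `get_anyres_image_grid_shape` recovers how many 336-pixel tiles the preprocessor cut the image into, by picking the grid pinpoint that best fits the original image size and dividing it by the tile size. A hedged, self-contained sketch of that selection logic (the real helper lives in `paddlemix.models.llava.mm_utils` and may differ in detail):

def select_best_resolution(original_size, possible_resolutions):
    # Choose the candidate that keeps the most image area after an
    # aspect-preserving resize while wasting the least padding.
    ow, oh = original_size
    best, max_eff, min_waste = None, 0, float("inf")
    for w, h in possible_resolutions:
        scale = min(w / ow, h / oh)
        eff = min(int(ow * scale) * int(oh * scale), ow * oh)
        waste = w * h - eff
        if eff > max_eff or (eff == max_eff and waste < min_waste):
            best, max_eff, min_waste = (w, h), eff, waste
    return best

def anyres_grid_shape(image_size, grid_pinpoints, tile=336):
    w, h = select_best_resolution(image_size, grid_pinpoints)
    return w // tile, h // tile  # (num_patch_width, num_patch_height)

# A tall 500x900 photo fits the 672x1008 pinpoint best: a 2x3 tile grid.
assert anyres_grid_shape((500, 900), [(672, 672), (672, 1008)]) == (2, 3)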

paddlemix/auto/modeling.py
Lines changed: 5 additions & 4 deletions

@@ -28,11 +28,11 @@
     url_file_exists,
 )
 from paddlenlp.utils.env import HF_CACHE_HOME as PPNLP_HF_CACHE_HOME
-from paddlenlp.utils.env import MODEL_HOME as PPNLP_MODEL_HOME
 from paddlenlp.utils.import_utils import import_module
 from paddlenlp.utils.log import logger
 
 from paddlemix.utils.env import MODEL_HOME as PPMIX_MODEL_HOME
+
 from .configuration import get_configurations
 
 __all__ = [
@@ -57,6 +57,8 @@
     "qwen_vl": "QWenLMHeadModel",
     "sam": "SamModel",
     "visualglm": "VisualGLMForConditionalGeneration",
+    "llava_qwen": "LlavaQwenForCausalLM",
+    "internvl2": "InternVLChatModel",
 }
 
 
@@ -180,7 +182,6 @@ def _from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         subfolder = kwargs.get("subfolder", "")
         cache_dir = resolve_cache_dir(from_hf_hub, from_aistudio, cache_dir)
         kwargs["cache_dir"] = cache_dir
-
 
         if from_hf_hub:
             if hf_file_exists(repo_id=pretrained_model_name_or_path, filename=cls.model_config_file):
@@ -230,7 +231,7 @@ def _from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                 [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.legacy_model_config_file]
             )
             cache_dir = os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)
-
+
             try:
                 if url_file_exists(standard_community_url):
                     resolved_vocab_file = get_path_from_url_with_filelock(standard_community_url, cache_dir)
@@ -248,7 +249,7 @@ def _from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                 "- or a correct model-identifier of community-contributed pretrained models,\n"
                 "- or the correct path to a directory containing relevant modeling files(model_weights and model_config).\n"
             )
-
+
             if os.path.exists(resolved_vocab_file):
                 model_class = cls._get_model_class_from_config(pretrained_model_name_or_path, resolved_vocab_file)
                 logger.info(f"We are using {model_class} to load '{pretrained_model_name_or_path}'.")
paddlemix/auto/processing.py
Lines changed: 6 additions & 1 deletion

@@ -51,6 +51,10 @@ def get_processor_mapping():
             model_name = "qwen_vl"
         elif "internlm_xcomposer2" in file_name:
             model_name = "internlm_xcomposer2"
+        elif "llava_next" in file_name:
+            model_name = "llava_next"
+        elif "internvl2" in file_name:
+            model_name = "internvl2"
         else:
             model_name = file_name.split("_")[0]
 
@@ -91,7 +95,6 @@ def __init__(self, *args, **kwargs):
 
     @classmethod
     def _get_processor_class(cls, pretrained_model_name_or_path, text_model_name_or_path=None, **kwargs):
-
         name_or_path = None
         processor = None
         tokenizer = None
@@ -120,7 +123,9 @@ def _get_processor_class(cls, pretrained_model_name_or_path, text_model_name_or_
             text_model_name_or_path = pretrained_model_name_or_path
 
         for names, processor_class in cls._processor_mapping.items():
+
             if names.lower() in pretrained_model_name_or_path.lower().replace("-", "_").replace("vicuna", "llava"):
+
                 attributes = processor_class["processor"].attributes
                 attributes_dict = {}
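Branch order matters here: the explicit cases must run before the generic fallback `file_name.split("_")[0]`, which would otherwise map a `llava_next_...` processing file to `llava`. A minimal sketch of the dispatch (file names are illustrative):

def processor_name_from_file(file_name):
    # Specific multi-token names first, then the generic prefix fallback,
    # mirroring the elif chain in get_processor_mapping().
    if "llava_next" in file_name:
        return "llava_next"
    if "internvl2" in file_name:
        return "internvl2"
    return file_name.split("_")[0]

assert processor_name_from_file("llava_next_processing") == "llava_next"
assert processor_name_from_file("blip_processing") == "blip"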
