
Commit e869c1c
Merge branch 'develop' into sd3_dreambooth
2 parents: e9dea72 + 907a798

115 files changed: +15,338 −602 lines


.gitignore
Lines changed: 4 additions & 0 deletions

@@ -72,6 +72,10 @@ coverage.xml
 # NPU meta folder
 kernel_meta/
 
+pretrained/
+playground/
+log/
+
 # MAC
 *.DS_Store
 

README.md
Lines changed: 15 additions & 7 deletions

@@ -19,10 +19,13 @@ PaddleMIX is a multimodal large-model development suite built on PaddlePaddle, aggregating image, te…
 
 ## Latest Updates
 
-📚 "PaddleMIX 2.0, PaddlePaddle's Multimodal Large-Model Development Suite, Is Here": full coverage of image, text, audio, and video scenarios, with multimodality efficiently powering industry innovation. Ultra-large-scale training support, covering image-text pre-training, text-to-image generation, and cross-modal vision tasks, across industry scenarios such as finance, education, e-commerce, and healthcare. Live on Thursday, August 8 at 20:00: the latest multimodal model architectures, a deep dive into the high-performance PaddleMIX model library, and a hands-on walkthrough of the full LLaVA training and inference pipeline. [Sign-up link](https://www.wjx.top/vm/wKqysjx.aspx?udsid=449688)
+📚 "PaddleMIX 2.0, PaddlePaddle's Multimodal Large-Model Development Suite, Is Here": full coverage of image, text, audio, and video scenarios, with multimodality efficiently powering industry innovation. Ultra-large-scale training support, covering image-text pre-training, text-to-image generation, and cross-modal vision tasks, across industry scenarios such as finance, education, e-commerce, and healthcare. Live on Thursday, August 8 at 20:00: the latest multimodal model architectures, a deep dive into the high-performance PaddleMIX model library, and a hands-on walkthrough of the full LLaVA training and inference pipeline. [Sign-up link](https://www.wjx.top/vm/wKqysjx.aspx?udsid=449688)
+
+**2024.09.11 update**
+* Added Qwen2-VL, InternVL2, SD3, and other models
 
 **2024.07.25 PaddleMIX v2.0 released**
-* Multimodal understanding: added the LLaVA series, Qwen-VL, and more; added an Auto module that unifies the SFT training workflow; added the mixtoken training strategy, raising SFT throughput 5.6x.
+* Multimodal understanding: added the LLaVA series, Qwen-VL, and more; added an Auto module that unifies the SFT training workflow; added the mixtoken training strategy, raising SFT throughput 5.6x.
 * Multimodal generation: released [PPDiffusers 0.24.1](./ppdiffusers/README.md) with video generation support and LCM added to the text-to-image models; added PaddlePaddle ports of the peft and accelerate backends; ships a ComfyUI plugin built on PaddlePaddle.
 * Multimodal data-processing toolbox [DataCopilot](./paddlemix/datacopilot/): supports custom data structures, data conversion, and offline format checks; provides basic statistics and data visualization.
 
@@ -104,10 +107,10 @@ pip install -e .
 <tbody>
 <tr align="center" valign="center">
 <td>
-<b>Multimodal Pre-training</b>
+<b>Multimodal Understanding</b>
 </td>
 <td>
-<b>Diffusion Models</b>
+<b>Multimodal Generation</b>
 </td>
 </tr>
 <tr valign="top">
@@ -116,14 +119,19 @@ pip install -e .
 </ul>
 <li><b>Image-Text Pre-training</b></li>
 <ul>
+<li><a href="paddlemix/examples/clip">CLIP</a></li>
 <li><a href="paddlemix/examples/evaclip">EVA-CLIP</a></li>
+<li><a href="paddlemix/examples/llava">LLaVA</a></li>
+<li><a href="paddlemix/examples/llava">LLaVA-1.5</a></li>
+<li><a href="paddlemix/examples/llava">LLaVA-NeXT</a></li>
+<li><a href="paddlemix/examples/qwen_vl">Qwen-VL</a></li>
+<li><a href="paddlemix/examples/qwen2_vl">Qwen2-VL</a></li>
+<li><a href="paddlemix/examples/internvl2">InternVL2</a></li>
+<li><a href="paddlemix/examples/minimonkey">Mini-Monkey</a></li>
 <li><a href="paddlemix/examples/coca">CoCa</a></li>
-<li><a href="paddlemix/examples/clip">CLIP</a></li>
 <li><a href="paddlemix/examples/blip2">BLIP-2</a></li>
 <li><a href="paddlemix/examples/minigpt4">miniGPT-4</a></li>
 <li><a href="paddlemix/examples/visualglm">VIsualGLM</a></li>
-<li><a href="paddlemix/examples/qwen_vl">Qwen_VL</a></li>
-<li><a href="paddlemix/examples/llava">LLaVA</a></li>
 <li><a href="paddlemix/examples/cogvlm">CogVLM && CogAgent</a></li>
 <li><a href="paddlemix/examples/internlm_xcomposer2">InternLM-XComposer2</a></li>
 </ul>

README_EN.md
Lines changed: 9 additions & 4 deletions

@@ -100,7 +100,7 @@ pip install -e .
 <tbody>
 <tr align="center" valign="center">
 <td>
-<b>Multi-modal Pre-training</b>
+<b>Multi-modal Large Language Models</b>
 </td>
 <td>
 <b>Diffusion-based Models</b>
@@ -112,14 +112,19 @@ pip install -e .
 </ul>
 <li><b>Image-Text Pre-training</b></li>
 <ul>
+<li><a href="paddlemix/examples/clip">CLIP</a></li>
 <li><a href="paddlemix/examples/evaclip">EVA-CLIP</a></li>
+<li><a href="paddlemix/examples/llava">LLaVA</a></li>
+<li><a href="paddlemix/examples/llava">LLaVA-1.5</a></li>
+<li><a href="paddlemix/examples/llava">LLaVA-NeXT</a></li>
+<li><a href="paddlemix/examples/qwen_vl">Qwen-VL</a></li>
+<li><a href="paddlemix/examples/qwen2_vl">Qwen2-VL</a></li>
+<li><a href="paddlemix/examples/internvl2">InternVL2</a></li>
+<li><a href="paddlemix/examples/minimonkey">Mini-Monkey</a></li>
 <li><a href="paddlemix/examples/coca">CoCa</a></li>
-<li><a href="paddlemix/examples/clip">CLIP</a></li>
 <li><a href="paddlemix/examples/blip2">BLIP-2</a></li>
 <li><a href="paddlemix/examples/minigpt4">miniGPT-4</a></li>
 <li><a href="paddlemix/examples/visualglm">VIsualGLM</a></li>
-<li><a href="paddlemix/examples/qwen_vl">Qwen_VL</a></li>
-<li><a href="paddlemix/examples/llava">LLaVA</a></li>
 <li><a href="paddlemix/examples/cogvlm">CogVLM && CogAgent</a></li>
 <li><a href="paddlemix/examples/internlm_xcomposer2">InternLM-XComposer2</a></li>
 </ul>

comfyui/ComfyUI_ppdiffusers/basic_nodes.py
Lines changed: 10 additions & 3 deletions

@@ -36,7 +36,11 @@ def __init__(self):
     @classmethod
     def INPUT_TYPES(s):
         return {
-            "required": {"images": ("IMAGE",), "filename_prefix": ("STRING", {"default": "ComfyUI"})},
+            "required": {
+                "images": ("IMAGE",),
+                "filename_prefix": ("STRING", {"default": "ComfyUI"}),
+                "censor": ("BOOLEAN", {"default": True})
+            },
             "hidden": {"prompt": "PROMPT", "extra_pnginfo": "EXTRA_PNGINFO"},
         }

@@ -59,15 +63,18 @@ def censor_image(self, image):
         print(response)
         return response["result"]["pass"]

-    def save_images(self, images, filename_prefix="ComfyUI", prompt=None, extra_pnginfo=None):
+    def save_images(self, images, censor=True, filename_prefix="ComfyUI", prompt=None, extra_pnginfo=None):
         filename_prefix += self.prefix_append
         full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(
             filename_prefix, self.output_dir, images[0].shape[1], images[0].shape[0]
         )
         results = list()
         for (batch_number, image) in enumerate(images):
             img = Image.fromarray(image)
-            pass_censor = self.censor_image(img)
+            if censor:
+                pass_censor = self.censor_image(img)
+            else:
+                pass_censor = True
             # breakpoint()
             if pass_censor:
                 metadata = None
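In effect, the new `censor` input short-circuits the moderation round-trip: when the toggle is off, `censor_image` is never called and every image is treated as passing. A minimal, self-contained sketch of that control flow (the `moderate` callable here stands in for the node's real moderation request; the array-to-image step mirrors the loop above):

import numpy as np
from PIL import Image

def save_images_sketch(images, censor=True, moderate=lambda img: True):
    # `moderate` stands in for censor_image; it is only invoked when the
    # `censor` toggle is on, otherwise every image passes unconditionally.
    saved = []
    for image in images:
        img = Image.fromarray(image)
        pass_censor = moderate(img) if censor else True
        if pass_censor:
            saved.append(img)
    return saved

# Example: two blank RGB frames with moderation disabled are both kept.
frames = [np.zeros((64, 64, 3), dtype=np.uint8)] * 2
assert len(save_images_sketch(frames, censor=False)) == 2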

comfyui/ComfyUI_ppdiffusers/sd_pipe_nodes.py
Lines changed: 0 additions & 1 deletion

@@ -40,7 +40,6 @@ def INPUT_TYPES(cls):
     def load_checkpoint(self, ckpt_name):
         ckpt_path = folder_paths.get_full_path("checkpoints", ckpt_name)
         pipe = StableDiffusionPipeline.from_single_file(ckpt_path)
-        pipe = pipe.to(dtype=paddle.float16)
        return (pipe,)
 
 
comfyui/ComfyUI_ppdiffusers/sdxl_pipe_nodes.py
Lines changed: 0 additions & 1 deletion

@@ -41,7 +41,6 @@ def INPUT_TYPES(cls):
     def load_checkpoint(self, ckpt_name):
         ckpt_path = folder_paths.get_full_path("checkpoints", ckpt_name)
         pipe = StableDiffusionXLPipeline.from_single_file(ckpt_path)
-        pipe = pipe.to(dtype=paddle.float16)
         return (pipe,)
 
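With the forced cast removed from both loaders, the pipelines now stay in whatever dtype the checkpoint was saved in. A caller that still wants half precision can opt in after loading; a minimal sketch, assuming the same `ppdiffusers` pipeline class the node files use and a placeholder checkpoint path:

import paddle
from ppdiffusers import StableDiffusionPipeline

# "checkpoints/model.safetensors" is a placeholder path for illustration.
pipe = StableDiffusionPipeline.from_single_file("checkpoints/model.safetensors")
# Opt into float16 explicitly, reproducing the cast the loaders no longer force.
pipe = pipe.to(dtype=paddle.float16)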

deploy/llava/export_model.py
Lines changed: 4 additions & 3 deletions

@@ -30,13 +30,13 @@ def export_encode_text(model, config, compute_dtype):
 
 
 def export_encode_image(model, compute_dtype):
-
+    paddle.save(model.llama.image_newline,args.save_path + "/encode_image/clip/image_newline.pdparams")
     # convert to static graph with specific input description
     model = paddle.jit.to_static(
         model.encode_images,
         input_spec=[
-            paddle.static.InputSpec(shape=[None, 3, 336, 336], dtype=compute_dtype),  # images
-        ],
+            paddle.static.InputSpec(shape=[None,3, 336, 336], dtype=compute_dtype), # images
+        ]
     )
 
     # save to static model

@@ -76,6 +76,7 @@ def export_encode_image(model, compute_dtype):
     vision_tower = model.get_vision_tower()
     vision_tower.load_model()
     model.eval()
+
     export_encode_image(model, compute_dtype)
 
 elif args.encode_text:
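Two things happen in this export path: the learned `image_newline` separator is saved as raw parameters (the static predictor in `run_static_predict.py` below reloads it for anyres patch merging), and `encode_images` is traced into a static graph whose batch dimension is left open. A hedged sketch of the same `to_static` pattern in isolation (the function handle and save path are placeholders, not the script's actual arguments):

import paddle

def export_encoder(encode_fn, save_path, compute_dtype="float16"):
    # Trace the dynamic-graph function; shape=[None, 3, 336, 336] accepts any
    # batch of 336x336 RGB crops, matching the CLIP vision tower input.
    static_fn = paddle.jit.to_static(
        encode_fn,
        input_spec=[paddle.static.InputSpec(shape=[None, 3, 336, 336], dtype=compute_dtype)],
    )
    # Serialize the traced program and parameters for the inference predictor.
    paddle.jit.save(static_fn, save_path)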

deploy/llava/run_static_predict.py
Lines changed: 88 additions & 11 deletions

@@ -26,6 +26,8 @@
     IMAGE_TOKEN_INDEX,
 )
 from paddlemix.models.llava.conversation import conv_templates
+from paddlemix.models.llava.mm_utils import load_image,get_anyres_image_grid_shape
+from paddlemix.models.llava.base_model import unpad_image
 from paddlemix.utils.log import logger
 
 
@@ -39,15 +41,20 @@ def __init__(self, args):
 
         self.args = args
         self.config = AutoConfigMIX.from_pretrained(args.model_name_or_path)
+        self.clip_config = AutoConfigMIX.from_pretrained(self.config.mm_vision_tower)
+
 
         self.tokenizer = AutoTokenizerMIX.from_pretrained(args.model_name_or_path)
-        self.processor, _ = AutoProcessorMIX.from_pretrained(args.model_name_or_path, eval="eval")
+        self.processor, _ = AutoProcessorMIX.from_pretrained(args.model_name_or_path, image_aspect_ratio=self.config.image_aspect_ratio,eval="eval")
 
         self.first_predictor = self.create_predictor(args.first_model_path)
         print(f"first_model_path: {args.first_model_path}, {self.first_predictor}")
+
         self.second_predictor = self.create_predictor(args.second_model_path)
         print(f"second_model_path: {args.second_model_path}, {self.second_predictor}")
 
+        self.image_newline = paddle.load(os.path.join(args.first_model_path, "image_newline.pdparams"))
+
     def create_predictor(self, model_path):
 
         from paddlenlp.utils.import_utils import import_module
@@ -77,9 +84,79 @@ def create_predictor(self, model_path):
         return predictor
 
     @paddle.no_grad()
-    def encode_images(self, pixel_values):
-        language_model_inputs = self.first_predictor.run(pixel_values)
-        return language_model_inputs
+    def encode_images(self, images, image_sizes):
+        if type(images) is list or images.ndim == 5:
+            if type(images) is list:
+                images = [(x.unsqueeze(axis=0) if x.ndim == 3 else x) for x in images]
+            concat_images = paddle.concat(x=[image for image in images], axis=0)
+
+            image_features = self.first_predictor.run(concat_images)[0]
+
+            split_sizes = [image.shape[0] for image in images]
+            image_features = paddle.split(image_features, split_sizes, axis=0)
+            mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "flat")
+            image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square")
+            if mm_patch_merge_type == "flat":
+                image_features = [x.flatten(start_axis=0, stop_axis=1) for x in image_features]
+            elif mm_patch_merge_type.startswith("spatial"):
+                new_image_features = []
+                for image_idx, image_feature in enumerate(image_features):
+                    if image_feature.shape[0] > 1:
+                        base_image_feature = image_feature[0]
+                        image_feature = image_feature[1:]
+                        height = width = self.clip_config.image_resolution // self.clip_config.vision_patch_size
+                        assert height * width == base_image_feature.shape[0]
+                        if image_aspect_ratio == "anyres":
+                            num_patch_width, num_patch_height = get_anyres_image_grid_shape(
+                                image_sizes[image_idx],
+                                self.config.image_grid_pinpoints,
+                                self.clip_config.image_resolution,
+                            )
+
+                            image_feature = paddle.reshape(
+                                image_feature, (num_patch_height, num_patch_width, height, width, -1)
+                            )
+                        else:
+                            raise NotImplementedError
+                        if "unpad" in mm_patch_merge_type:
+                            image_feature = image_feature.transpose(perm=[4, 0, 2, 1, 3])
+                            image_feature = image_feature.flatten(start_axis=1, stop_axis=2).flatten(
+                                start_axis=2, stop_axis=3
+                            )
+                            image_feature = unpad_image(image_feature, image_sizes[image_idx])
+                            image_feature = paddle.concat(
+                                x=(
+                                    image_feature,
+                                    self.image_newline[:, (None), (None)].expand(
+                                        shape=[*image_feature.shape[:-1], 1]
+                                    ).astype(image_feature.dtype),
+                                ),
+                                axis=-1,
+                            )
+                            x = image_feature.flatten(start_axis=1, stop_axis=2)
+                            perm_12 = list(range(x.ndim))
+                            perm_12[0] = 1
+                            perm_12[1] = 0
+                            image_feature = x.transpose(perm=perm_12)
+                        else:
+                            image_feature = image_feature.transpose(perm=[0, 2, 1, 3, 4])
+                            image_feature = image_feature.flatten(start_axis=0, stop_axis=3)
+                        image_feature = paddle.concat(x=(base_image_feature, image_feature), axis=0)
+                    else:
+                        image_feature = image_feature[0]
+                        if "unpad" in mm_patch_merge_type:
+                            image_feature = paddle.concat(
+                                x=(image_feature, self.image_newline[None].to(image_feature.place)), axis=0
+                            )
+                    new_image_features.append(image_feature)
+                image_features = new_image_features
+                image_features = paddle.stack(x=image_features, axis=0)
+            else:
+                raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}")
+        else:
+            image_features = self.first_predictor.run(images)[0]
+
+        return image_features
 
     @paddle.no_grad()
     def generate_with_image_features(self, image_features, input_ids):
@@ -225,9 +302,9 @@ def pre_processing(self, inp, first_message):
         conv.append_message(conv.roles[1], None)
         prompt = conv.get_prompt()
         record = {"image": self.args.image_file, "conversations": prompt}
-
+        image_size = load_image(args.image_file).size
         data_dict = self.processor(record=record, image_aspect_ratio=self.config.image_aspect_ratio)
-
+        data_dict['image_size'] = [image_size]
         return data_dict
 
     def post_processing(self, generate_ids):
@@ -245,8 +322,8 @@ def run_benchmark(self):
         inp = "user: Generate the caption in English with grounding"
         data_dict = self.pre_processing(inp, first_message)
         image = paddle.cast(data_dict["images"], self.compute_dtype)
-
-        image_features = self.encode_images(image)[0]
+
+        image_features = self.encode_images(image,data_dict['image_size'])
 
         generate_ids, _ = self.generate_with_image_features(
             image_features,
@@ -277,9 +354,9 @@ def predict(self):
         print(f"{roles[1]}: ", end="")
         data_dict = self.pre_processing(inp, first_message)
         image = paddle.cast(data_dict["images"], self.compute_dtype)
-
-        image_features = self.encode_images(image)[0]
-
+
+        image_features = self.encode_images(image,data_dict['image_size'])
+
         generate_ids, _ = self.generate_with_image_features(
             image_features,
             data_dict["input_ids"],
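For orientation in the anyres branch above: `get_anyres_image_grid_shape` recovers how many 336-pixel tiles the preprocessor cut the image into, by picking the grid pinpoint that best fits the original image size and dividing it by the tile size. A hedged, self-contained sketch of that selection logic (the real helper lives in `paddlemix.models.llava.mm_utils` and may differ in detail):

def select_best_resolution(original_size, possible_resolutions):
    # Choose the candidate that keeps the most image area after an
    # aspect-preserving resize while wasting the least padding.
    ow, oh = original_size
    best, max_eff, min_waste = None, 0, float("inf")
    for w, h in possible_resolutions:
        scale = min(w / ow, h / oh)
        eff = min(int(ow * scale) * int(oh * scale), ow * oh)
        waste = w * h - eff
        if eff > max_eff or (eff == max_eff and waste < min_waste):
            best, max_eff, min_waste = (w, h), eff, waste
    return best

def anyres_grid_shape(image_size, grid_pinpoints, tile=336):
    w, h = select_best_resolution(image_size, grid_pinpoints)
    return w // tile, h // tile  # (num_patch_width, num_patch_height)

# A tall 500x900 photo fits the 672x1008 pinpoint best: a 2x3 tile grid.
assert anyres_grid_shape((500, 900), [(672, 672), (672, 1008)]) == (2, 3)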

paddlemix/auto/modeling.py
Lines changed: 5 additions & 4 deletions

@@ -28,11 +28,11 @@
     url_file_exists,
 )
 from paddlenlp.utils.env import HF_CACHE_HOME as PPNLP_HF_CACHE_HOME
-from paddlenlp.utils.env import MODEL_HOME as PPNLP_MODEL_HOME
 from paddlenlp.utils.import_utils import import_module
 from paddlenlp.utils.log import logger
 
 from paddlemix.utils.env import MODEL_HOME as PPMIX_MODEL_HOME
+
 from .configuration import get_configurations
 
 __all__ = [
@@ -57,6 +57,8 @@
     "qwen_vl": "QWenLMHeadModel",
     "sam": "SamModel",
     "visualglm": "VisualGLMForConditionalGeneration",
+    "llava_qwen": "LlavaQwenForCausalLM",
+    "internvl2": "InternVLChatModel",
 }
 
 
@@ -180,7 +182,6 @@ def _from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         subfolder = kwargs.get("subfolder", "")
         cache_dir = resolve_cache_dir(from_hf_hub, from_aistudio, cache_dir)
         kwargs["cache_dir"] = cache_dir
-
 
         if from_hf_hub:
             if hf_file_exists(repo_id=pretrained_model_name_or_path, filename=cls.model_config_file):
@@ -230,7 +231,7 @@ def _from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                 [COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, cls.legacy_model_config_file]
             )
             cache_dir = os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)
-
+
             try:
                 if url_file_exists(standard_community_url):
                     resolved_vocab_file = get_path_from_url_with_filelock(standard_community_url, cache_dir)
@@ -248,7 +249,7 @@ def _from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                 "- or a correct model-identifier of community-contributed pretrained models,\n"
                 "- or the correct path to a directory containing relevant modeling files(model_weights and model_config).\n"
             )
-
+
             if os.path.exists(resolved_vocab_file):
                 model_class = cls._get_model_class_from_config(pretrained_model_name_or_path, resolved_vocab_file)
                 logger.info(f"We are using {model_class} to load '{pretrained_model_name_or_path}'.")
paddlemix/auto/processing.py
Lines changed: 6 additions & 1 deletion

@@ -51,6 +51,10 @@ def get_processor_mapping():
             model_name = "qwen_vl"
         elif "internlm_xcomposer2" in file_name:
             model_name = "internlm_xcomposer2"
+        elif "llava_next" in file_name:
+            model_name = "llava_next"
+        elif "internvl2" in file_name:
+            model_name = "internvl2"
         else:
             model_name = file_name.split("_")[0]
 
@@ -91,7 +95,6 @@ def __init__(self, *args, **kwargs):
 
     @classmethod
     def _get_processor_class(cls, pretrained_model_name_or_path, text_model_name_or_path=None, **kwargs):
-
         name_or_path = None
         processor = None
         tokenizer = None
@@ -120,7 +123,9 @@ def _get_processor_class(cls, pretrained_model_name_or_path, text_model_name_or_
             text_model_name_or_path = pretrained_model_name_or_path
 
         for names, processor_class in cls._processor_mapping.items():
+
             if names.lower() in pretrained_model_name_or_path.lower().replace("-", "_").replace("vicuna", "llava"):
+
                 attributes = processor_class["processor"].attributes
                 attributes_dict = {}
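Branch order matters here: the explicit cases must run before the generic fallback `file_name.split("_")[0]`, which would otherwise map a `llava_next_...` processing file to `llava`. A minimal sketch of the dispatch (file names are illustrative):

def processor_name_from_file(file_name):
    # Specific multi-token names first, then the generic prefix fallback,
    # mirroring the elif chain in get_processor_mapping().
    if "llava_next" in file_name:
        return "llava_next"
    if "internvl2" in file_name:
        return "internvl2"
    return file_name.split("_")[0]

assert processor_name_from_file("llava_next_processing") == "llava_next"
assert processor_name_from_file("blip_processing") == "blip"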
