PaddlePaddle
diff --git a/‎paddlemix/examples/qwen2_vl/README.md
Lines changed: 42 additions & 0 deletions b/‎paddlemix/examples/qwen2_vl/README.md
Lines changed: 42 additions & 0 deletions
diff --git a/‎paddlemix/examples/qwen2_vl/multi_image_infer.py
Lines changed: 65 additions & 0 deletions b/‎paddlemix/examples/qwen2_vl/multi_image_infer.py
Lines changed: 65 additions & 0 deletions
diff --git a/‎paddlemix/examples/qwen2_vl/single_image_infer.py
Lines changed: 67 additions & 0 deletions b/‎paddlemix/examples/qwen2_vl/single_image_infer.py
Lines changed: 67 additions & 0 deletions
diff --git a/‎paddlemix/examples/qwen2_vl/video_infer.py
Lines changed: 94 additions & 0 deletions b/‎paddlemix/examples/qwen2_vl/video_infer.py
Lines changed: 94 additions & 0 deletions
diff --git a/‎paddlemix/models/qwen2_vl/__init__.py
Lines changed: 22 additions & 0 deletions b/‎paddlemix/models/qwen2_vl/__init__.py
Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,42 @@
+# Qwen2-VL
+
+## 1. 模型介绍
+
+[Qwen2-VL](https://qwenlm.github.io/blog/qwen2-vl/) 是大规模视觉语言模型。可以以图像、文本、检测框、视频作为输入，并以文本和检测框作为输出。
+本仓库提供paddle版本的Qwen2-VL-2B-Instruct和Qwen2-VL-7B-Instruct模型。
+
+
+## 2. 环境准备
+- **python >= 3.10**
+- tiktoken
+  > 注：tiktoken 要求 python >= 3.8
+- paddlepaddle-gpu >= 2.6.1
+- paddlenlp >= 3.0.0
+
+> 注：请确保已安装以上依赖，否则无法运行。同时，需要安装 paddlemix/external_ops 下的自定义 OP（在该目录下执行 `python setup.py install`）。如果安装后仍然找不到算子，需要额外设置 PYTHONPATH。
+
+## 3. 快速开始
+
+### a. 单图预测
+```bash
+python paddlemix/examples/qwen2_vl/single_image_infer.py
+```
+
+### b. 多图预测
+```bash
+python paddlemix/examples/qwen2_vl/multi_image_infer.py
+```
+
+### c. 视频预测
+```bash
+python paddlemix/examples/qwen2_vl/video_infer.py
+```
+
+## 参考文献
+```BibTeX
+@article{Qwen2-VL,
+  title={Qwen2-VL},
+  author={Qwen team},
+  year={2024}
+}
+```
@@ -0,0 +1,65 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddlenlp.transformers import Qwen2Tokenizer
+
+from paddlemix.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLForConditionalGeneration
+from paddlemix.processors.qwen2_vl_processing import (
+    Qwen2VLImageProcessor,
+    Qwen2VLProcessor,
+    process_vision_info,
+)
+
+MODEL_NAME = "Qwen/Qwen2-VL-2B-Instruct"
+model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_NAME, dtype="bfloat16")
+
+image_processor = Qwen2VLImageProcessor.from_pretrained(MODEL_NAME)
+tokenizer = Qwen2Tokenizer.from_pretrained(MODEL_NAME)
+processor = Qwen2VLProcessor(image_processor, tokenizer)
+
+# min_pixels = 256*28*28 # 200704
+# max_pixels = 1280*28*28 # 1003520
+# processor = Qwen2VLProcessor(image_processor, tokenizer, min_pixels=min_pixels, max_pixels=max_pixels)
+
+
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "image": "./image1.jpg"},
+            {"type": "image", "image": "./image2.jpg"},
+            {"type": "text", "text": "Identify the similarities between these images."},
+        ],
+    }
+]
+
+# Preparation for inference
+image_inputs, video_inputs = process_vision_info(messages)
+
+question = "Identify the similarities between these images."
+image_pad_tokens = '<|vision_start|><|image_pad|><|vision_end|>' * len(image_inputs)
+text = f'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{image_pad_tokens}{question}<|im_end|>\n<|im_start|>assistant\n'
+
+inputs = processor(
+    text=[text],
+    images=image_inputs,
+    videos=video_inputs,
+    padding=True,
+    return_tensors="pd",
+)
+
+# Inference: Generation of the output
+generated_ids = model.generate(**inputs, max_new_tokens=128)  # already trimmed in paddle
+output_text = processor.batch_decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
+print("output_text:\n", output_text[0])
@@ -0,0 +1,67 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddlenlp.transformers import Qwen2Tokenizer
+
+from paddlemix.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLForConditionalGeneration
+from paddlemix.processors.qwen2_vl_processing import (
+    Qwen2VLImageProcessor,
+    Qwen2VLProcessor,
+    process_vision_info,
+)
+
+MODEL_NAME = "Qwen/Qwen2-VL-2B-Instruct"
+model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_NAME, dtype="bfloat16")
+
+image_processor = Qwen2VLImageProcessor.from_pretrained(MODEL_NAME)
+tokenizer = Qwen2Tokenizer.from_pretrained(MODEL_NAME)
+processor = Qwen2VLProcessor(image_processor, tokenizer)
+
+# min_pixels = 256*28*28 # 200704
+# max_pixels = 1280*28*28 # 1003520
+# processor = Qwen2VLProcessor(image_processor, tokenizer, min_pixels=min_pixels, max_pixels=max_pixels)
+
+
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "image",
+                "image": "./image1.jpg",
+            },
+            {"type": "text", "text": "Describe this image."},
+        ],
+    }
+]
+
+# Preparation for inference
+image_inputs, video_inputs = process_vision_info(messages)
+
+question = "Describe this image."
+image_pad_token = '<|vision_start|><|image_pad|><|vision_end|>'
+text = f'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{image_pad_token}{question}<|im_end|>\n<|im_start|>assistant\n'
+
+inputs = processor(
+    text=[text],
+    images=image_inputs,
+    videos=video_inputs,
+    padding=True,
+    return_tensors="pd",
+)
+
+# Inference: Generation of the output
+generated_ids = model.generate(**inputs, max_new_tokens=128)  # already trimmed in paddle
+output_text = processor.batch_decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
+print("output_text:\n", output_text[0])
@@ -0,0 +1,94 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddlenlp.transformers import Qwen2Tokenizer
+
+from paddlemix.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLForConditionalGeneration
+from paddlemix.processors.qwen2_vl_processing import (
+    Qwen2VLImageProcessor,
+    Qwen2VLProcessor,
+    process_vision_info,
+)
+
+MODEL_NAME = "Qwen/Qwen2-VL-2B-Instruct"
+model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_NAME, dtype="bfloat16")
+
+image_processor = Qwen2VLImageProcessor.from_pretrained(MODEL_NAME)
+tokenizer = Qwen2Tokenizer.from_pretrained(MODEL_NAME)
+processor = Qwen2VLProcessor(image_processor, tokenizer)
+
+# min_pixels = 256*28*28 # 200704
+# max_pixels = 1280*28*28 # 1003520
+# processor = Qwen2VLProcessor(image_processor, tokenizer, min_pixels=min_pixels, max_pixels=max_pixels)
+
+
+# # Messages containing a images list as a video and a text query
+# messages = [
+#     {
+#         "role": "user",
+#         "content": [
+#             {
+#                 "type": "video",
+#                 "video": [
+#                     "file:///path/to/frame1.jpg",
+#                     "file:///path/to/frame2.jpg",
+#                     "file:///path/to/frame3.jpg",
+#                     "file:///path/to/frame4.jpg",
+#                 ],
+#                 "fps": 1.0,
+#             },
+#             {"type": "text", "text": "Describe this video."},
+#         ],
+#     }
+# ]
+
+
+# Messages containing a video and a text query
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "video",
+                "video": "./video1.mp4",
+                "max_pixels": 360 * 420,
+                "fps": 1.0,
+            },
+            {"type": "text", "text": "Describe this video."},
+        ],
+    }
+]
+
+
+# Preparation for inference
+image_inputs, video_inputs = process_vision_info(messages)
+
+question = "Describe this video."
+video_pad_token = '<|vision_start|><|video_pad|><|vision_end|>'
+text = f'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{video_pad_token}{question}<|im_end|>\n<|im_start|>assistant\n'
+
+
+inputs = processor(
+    text=[text],
+    images=image_inputs,
+    videos=video_inputs,
+    padding=True,
+    return_tensors="pd",
+)
+
+# Inference: Generation of the output
+generated_ids = model.generate(**inputs, max_new_tokens=128)  # already trimmed in paddle
+print("generated_ids:\n", generated_ids)
+output_text = processor.batch_decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
+print("output_text:\n", output_text[0])
@@ -0,0 +1,22 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .configuration_qwen2_vl import Qwen2VLConfig
+from .modeling_qwen2_vl import (
+    Qwen2VLForConditionalGeneration,
+    Qwen2VLModel,
+    Qwen2VLPreTrainedModel,
+)
+
+__all__ = ["Qwen2VLConfig", "Qwen2VLForConditionalGeneration", "Qwen2VLModel", "Qwen2VLPreTrainedModel"]