
support Qwen2-VL sft training #739


Merged
10 commits merged on Oct 17, 2024
44 changes: 37 additions & 7 deletions paddlemix/examples/qwen2_vl/README.md
@@ -2,28 +2,24 @@

## 1. Model Introduction

[Qwen2-VL](https://qwenlm.github.io/blog/qwen2-vl/) is a large-scale vision-language model. It takes images, text, bounding boxes, and videos as input, and produces text and bounding boxes as output. This repository provides Paddle versions of the `Qwen2-VL-2B-Instruct` and `Qwen2-VL-7B-Instruct` models.


## 2 Environment Setup
- **python >= 3.10**
- tiktoken
> Note: tiktoken requires python >= 3.8
- **paddlepaddle-gpu: the develop version is required**
```bash
# Installation example
python -m pip install paddlepaddle-gpu==0.0.0.post118 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html
```


- paddlenlp >= 3.0.0 (flash_attn is enabled by default; installing from source is recommended)

> Note:
* Make sure all of the dependencies above are installed, otherwise the examples will not run. You also need to install the custom ops under paddlemix/external_ops via `python setup.py install`. If the operators still cannot be found after installation, additionally set PYTHONPATH (see the sketch after this note).
* flash_attn requires an A100/A800 or H20 GPU. Enabling it changes inference memory usage as follows: 2B model: 49G -> 13G | 7B model: 61G -> 25G
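
For concreteness, a minimal sketch of the custom-op install; the checkout location `/path/to/PaddleMIX` is a placeholder for wherever you cloned the repository:

```bash
# Build and install the custom ops shipped under paddlemix/external_ops.
cd /path/to/PaddleMIX/paddlemix/external_ops
python setup.py install

# If the operators are still not found afterwards, expose them explicitly:
export PYTHONPATH=/path/to/PaddleMIX/paddlemix/external_ops:$PYTHONPATH
```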

## 3 Inference

### a. Single-image inference
```bash
python paddlemix/examples/qwen2_vl/single_image_infer.py
```

### b. Multi-image inference
```bash
python paddlemix/examples/qwen2_vl/multi_image_infer.py
```

### c. Video inference
```bash
python paddlemix/examples/qwen2_vl/video_infer.py
```

## 4 Model Fine-tuning

### 4.1 Fine-tuning Data Preparation

The SFT data combines six public datasets: `dvqa`, `chartqa`, `ai2d`, `docvqa`, `geoqa+`, and `synthdog_en`; see `paddlemix/examples/qwen2_vl/configs/baseline_6data_330k.json` for the full list. An example entry is shown below.
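
Each entry in that config maps a dataset name to its annotation file and a `repeat_time` sampling multiplier; for example, the first entry of the baseline config:

```json
{
  "dvqa_train_200k": {
    "annotation": "playground/opensource_json/dvqa_train_200k.json",
    "repeat_time": 1
  }
}
```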

The download links for the data curated by the PaddleMIX team are:
```bash
wget https://paddlenlp.bj.bcebos.com/datasets/paddlemix/playground.tar
wget https://paddlenlp.bj.bcebos.com/datasets/paddlemix/playground/opensource_json.tar
```
`opensource_json.tar` must be downloaded and extracted into the `playground/` directory; the extracted folder contains the dataset annotation JSON files (see the layout sketch below).
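
A sketch of the download-and-extract steps and the resulting layout, assuming both archives are fetched into the working directory (the tree below is abridged):

```bash
wget https://paddlenlp.bj.bcebos.com/datasets/paddlemix/playground.tar
wget https://paddlenlp.bj.bcebos.com/datasets/paddlemix/playground/opensource_json.tar

tar -xf playground.tar                      # creates playground/
tar -xf opensource_json.tar -C playground/  # creates playground/opensource_json/

# Expected layout (abridged):
# playground/
# └── opensource_json/
#     ├── dvqa_train_200k.json
#     ├── chartqa_train_18k.json
#     ├── ai2d_train_12k.json
#     ├── docvqa_train_10k.json
#     ├── geoqa+.json
#     └── synthdog_en.json
```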

### 4.2 Fine-tuning Commands

Note: this fine-tuning recipe freezes the vision encoder and trains only the LLM (sketched after the commands below). Fine-tuning takes roughly 30G of GPU memory for the 2B model and roughly 75G for the 7B model.

```bash
# 2B
sh paddlemix/examples/qwen2_vl/shell/basline_2b_bs32_1e8.sh

# 7B
sh paddlemix/examples/qwen2_vl/shell/basline_7b_bs32_1e8.sh
```
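
The freezing scheme described above can be pictured with a short Paddle sketch. This is illustrative only, not the shell scripts' actual implementation; it assumes the imports used in the repo's infer scripts, and the `visual.` parameter-name prefix for the vision tower is an assumption:

```python
# Qwen2VLForConditionalGeneration imported as in multi_image_infer.py.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", dtype="bfloat16"
)

# Freeze the vision encoder; leave the LLM trainable.
for name, param in model.named_parameters():
    if name.startswith("visual."):  # assumed prefix for vision-tower weights
        param.stop_gradient = True  # Paddle's analogue of requires_grad=False

# Only the LLM parameters remain trainable.
trainable = [p for _, p in model.named_parameters() if not p.stop_gradient]
print(f"trainable tensors: {len(trainable)}")
```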

### 4.3 Using the Fine-tuned Model

Inference works the same as in Section 3: simply change the `MODEL_NAME` variable in `paddlemix/examples/qwen2_vl/single_image_infer.py` to the path of the fine-tuned model (a sketch of this edit follows the command below).

```bash
python paddlemix/examples/qwen2_vl/single_image_infer.py
```
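
A minimal sketch of that edit, assuming the loading code shown in the repo's infer scripts; the checkpoint directory is a hypothetical placeholder for your own fine-tuning output:

```python
# In single_image_infer.py, point MODEL_NAME at the fine-tuned checkpoint.
# "work_dirs/qwen2_vl_2b_sft" is a hypothetical path; use your own output dir.
MODEL_NAME = "work_dirs/qwen2_vl_2b_sft"
model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_NAME, dtype="bfloat16")
```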


## References
```BibTeX
@article{Qwen2-VL,
  title={Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution},
  journal={arXiv preprint arXiv:2409.12191},
  year={2024}
}
```
128 changes: 128 additions & 0 deletions paddlemix/examples/qwen2_vl/configs/add_llavaov_doc_ocr.json
@@ -0,0 +1,128 @@
{
"dvqa_train_200k": {
"annotation": "playground/opensource_json/dvqa_train_200k.json",
"repeat_time": 1
},
"chartqa_train_18k": {
"annotation": "playground/opensource_json/chartqa_train_18k.json",
"repeat_time": 1
},
"ai2d_train_12k": {
"annotation": "playground/opensource_json/ai2d_train_12k.json",
"repeat_time": 1
},
"docvqa_train_10k": {
"annotation": "playground/opensource_json/docvqa_train_10k.json",
"repeat_time": 1
},
"geoqa+": {
"annotation": "playground/opensource_json/geoqa+.json",
"repeat_time": 1
},
"synthdog_en": {
"annotation": "playground/opensource_json/synthdog_en.json",
"repeat_time": 1
},

"updated_ChromeWriting_8k": {
"file_name": "LLaVA-OneVision-Data_OCR/updated_ChromeWriting_8825.json",
"repeat_time": 1
},
"updated_K12_Printing_256k": {
"file_name": "LLaVA-OneVision-Data_OCR/updated_K12_Printing_256636.json",
"repeat_time": 1
},
"updated_Rendered_Text_9k": {
"file_name": "LLaVA-OneVision-Data_OCR/updated_Rendered_Text_9995.json",
"repeat_time": 1
},
"updated_TextOCR_29k": {
"file_name": "LLaVA-OneVision-Data_OCR/updated_TextOCR_29288.json",
"repeat_time": 1
},

"ai2d(cauldron_llava_format)_filter_sptoken_2429": {
"file_name": "LLaVA-OneVision-Data_doc/ai2d(cauldron_llava_format)_filter_sptoken_2429.json",
"repeat_time": 1
},
"ai2d(gpt4v)_filter_sptoken_4864": {
"file_name": "LLaVA-OneVision-Data_doc/ai2d(gpt4v)_filter_sptoken_4864.json",
"repeat_time": 1
},
"ai2d(internvl)_filter_sptoken_12403": {
"file_name": "LLaVA-OneVision-Data_doc/ai2d(internvl)_filter_sptoken_12403.json",
"repeat_time": 1
},
"chart2text(cauldron)_filter_sptoken_26956": {
"file_name": "LLaVA-OneVision-Data_doc/chart2text(cauldron)_filter_sptoken_26956.json",
"repeat_time": 1
},
"chartqa(cauldron_llava_format)_filter_sptoken_18260": {
"file_name": "LLaVA-OneVision-Data_doc/chartqa(cauldron_llava_format)_filter_sptoken_18260.json",
"repeat_time": 1
},
"diagram_image_to_text(cauldron)_filter_sptoken_295": {
"file_name": "LLaVA-OneVision-Data_doc/diagram_image_to_text(cauldron)_filter_sptoken_295.json",
"repeat_time": 1
},
"dvqa(cauldron_llava_format)_filter_sptoken_199995": {
"file_name": "LLaVA-OneVision-Data_doc/dvqa(cauldron_llava_format)_filter_sptoken_199995.json",
"repeat_time": 1
},
"FigureQA(MathV360K)_filter_sptoken_17587": {
"file_name": "LLaVA-OneVision-Data_doc/FigureQA(MathV360K)_filter_sptoken_17587.json",
"repeat_time": 1
},
"hitab(cauldron_llava_format)_filter_sptoken_2495": {
"file_name": "LLaVA-OneVision-Data_doc/hitab(cauldron_llava_format)_filter_sptoken_2495.json",
"repeat_time": 1
},
"infographic_vqa_llava_format_filter_sptoken_2113": {
"file_name": "LLaVA-OneVision-Data_doc/infographic_vqa_llava_format_filter_sptoken_2113.json",
"repeat_time": 1
},
"lrv_filter_sptoken_1776": {
"file_name": "LLaVA-OneVision-Data_doc/lrv_filter_sptoken_1776.json",
"repeat_time": 1
},
"robut_sqa_filter_sptoken_8509": {
"file_name": "LLaVA-OneVision-Data_doc/robut_sqa_filter_sptoken_8509.json",
"repeat_time": 1
},
"robut_wikisql_filter_sptoken_74984": {
"file_name": "LLaVA-OneVision-Data_doc/robut_wikisql_filter_sptoken_74984.json",
"repeat_time": 1
},
"robut_wtq_filter_sptoken_38241": {
"file_name": "LLaVA-OneVision-Data_doc/robut_wtq_filter_sptoken_38241.json",
"repeat_time": 1
},
"screen2word_filter_sptoken_15725": {
"file_name": "LLaVA-OneVision-Data_doc/screen2word_filter_sptoken_15725.json",
"repeat_time": 1
},
"tqa_filter_sptoken_27272": {
"file_name": "LLaVA-OneVision-Data_doc/tqa_filter_sptoken_27272.json",
"repeat_time": 1
},
"ureader_cap_filter_sptoken_91434": {
"file_name": "LLaVA-OneVision-Data_doc/ureader_cap_filter_sptoken_91434.json",
"repeat_time": 1
},
"ureader_ie_filter_sptoken_17322": {
"file_name": "LLaVA-OneVision-Data_doc/ureader_ie_filter_sptoken_17322.json",
"repeat_time": 1
},
"ureader_kg_filter_sptoken_37550": {
"file_name": "LLaVA-OneVision-Data_doc/ureader_kg_filter_sptoken_37550.json",
"repeat_time": 1
},
"ureader_qa_filter_sptoken_252954": {
"file_name": "LLaVA-OneVision-Data_doc/ureader_qa_filter_sptoken_252954.json",
"repeat_time": 1
},
"visualmrc_filter_sptoken_3022": {
"file_name": "LLaVA-OneVision-Data_doc/visualmrc_filter_sptoken_3022.json",
"repeat_time": 1
}
}
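
For illustration, a minimal sketch (not the repository's actual loader) of how a mixture config in this format could be consumed; it assumes each referenced file is a JSON annotation list and handles both the `annotation` and `file_name` key variants that appear above:

```python
import json

def load_mixture(config_path: str) -> list[str]:
    """Expand a dataset-mixture config into a weighted list of annotation paths."""
    with open(config_path, "r", encoding="utf-8") as f:
        config = json.load(f)
    paths: list[str] = []
    for spec in config.values():
        # Entries use either "annotation" or "file_name" for the JSON path.
        path = spec.get("annotation") or spec.get("file_name")
        # repeat_time acts as a sampling multiplier for the whole dataset.
        paths.extend([path] * int(spec.get("repeat_time", 1)))
    return paths

print(len(load_mixture("paddlemix/examples/qwen2_vl/configs/add_llavaov_doc_ocr.json")))
```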
26 changes: 26 additions & 0 deletions paddlemix/examples/qwen2_vl/configs/baseline_6data_330k.json
@@ -0,0 +1,26 @@
{
"dvqa_train_200k": {
"annotation": "playground/opensource_json/dvqa_train_200k.json",
"repeat_time": 1
},
"chartqa_train_18k": {
"annotation": "playground/opensource_json/chartqa_train_18k.json",
"repeat_time": 1
},
"ai2d_train_12k": {
"annotation": "playground/opensource_json/ai2d_train_12k.json",
"repeat_time": 1
},
"docvqa_train_10k": {
"annotation": "playground/opensource_json/docvqa_train_10k.json",
"repeat_time": 1
},
"geoqa+": {
"annotation": "playground/opensource_json/geoqa+.json",
"repeat_time": 1
},
"synthdog_en": {
"annotation": "playground/opensource_json/synthdog_en.json",
"repeat_time": 1
}
}
6 changes: 3 additions & 3 deletions paddlemix/examples/qwen2_vl/multi_image_infer.py
@@ -24,7 +24,7 @@
MODEL_NAME = "Qwen/Qwen2-VL-2B-Instruct"
model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_NAME, dtype="bfloat16")

image_processor = Qwen2VLImageProcessor()
tokenizer = Qwen2Tokenizer.from_pretrained(MODEL_NAME)
processor = Qwen2VLProcessor(image_processor, tokenizer)

@@ -48,8 +48,8 @@
image_inputs, video_inputs = process_vision_info(messages)

question = "Identify the similarities between these images."
image_pad_tokens = "<|vision_start|><|image_pad|><|vision_end|>" * len(image_inputs)
text = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{image_pad_tokens}{question}<|im_end|>\n<|im_start|>assistant\n"

inputs = processor(
text=[text],