
Commit 5b7c997

support Qwen2-VL sft training (#739)
1 parent 2e95929 commit 5b7c997

16 files changed, +1945 -50 lines changed

paddlemix/examples/qwen2_vl/README.md

Lines changed: 37 additions & 7 deletions
@@ -2,28 +2,24 @@
 
 ## 1. Model Introduction
 
-[Qwen2-VL](https://qwenlm.github.io/blog/qwen2-vl/) is a large-scale vision-language model. It takes images, text, bounding boxes, and video as input and produces text and bounding boxes as output.
-This repository provides Paddle versions of the Qwen2-VL-2B-Instruct and Qwen2-VL-7B-Instruct models.
+[Qwen2-VL](https://qwenlm.github.io/blog/qwen2-vl/) is a large-scale vision-language model. It takes images, text, bounding boxes, and video as input and produces text and bounding boxes as output. This repository provides Paddle versions of the `Qwen2-VL-2B-Instruct` and `Qwen2-VL-7B-Instruct` models.
 
 
 ## 2 Environment Setup
 - **python >= 3.10**
-- tiktoken
-> Note: tiktoken requires python >= 3.8
 - **paddlepaddle-gpu (develop version required)**
 ```
 # installation example
 python -m pip install paddlepaddle-gpu==0.0.0.post118 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html
 ```
 
-
 - paddlenlp >= 3.0.0 (flash_attn is enabled by default; building and installing from source is recommended)
 
 > Note:
 * Make sure the dependencies above are installed, otherwise the examples cannot run. You also need to install the custom ops under paddlemix/external_ops with `python setup.py install`; if the ops are still not found after installation, set PYTHONPATH as well.
-* Using flash_attn requires an H-series or A-series GPU. With it enabled, GPU memory usage changes as follows: 2B model: 49G -> 13G | 7B model: 61G -> 25G
+* Using flash_attn requires an A100/A800 or H20 GPU. With it enabled, inference GPU memory usage changes as follows: 2B model: 49G -> 13G | 7B model: 61G -> 25G
 
-## 3 Quick Start
+## 3 Inference
 
 ### a. Single-image prediction
 ```bash
@@ -40,6 +36,40 @@ python paddlemix/examples/qwen2_vl/multi_image_infer.py
 python paddlemix/examples/qwen2_vl/video_infer.py
 ```
 
+## 4 Model Fine-Tuning
+
+### 4.1 Fine-Tuning Data Preparation
+
+The SFT data mix uses six public datasets: `dvqa`, `chartqa`, `ai2d`, `docvqa`, `geoqa+`, and `synthdog_en`; see `paddlemix/examples/qwen2_vl/configs/baseline_6data_330k.json` for details.
+
+Download links for the data repackaged by the PaddleMIX team:
+```
+wget https://paddlenlp.bj.bcebos.com/datasets/paddlemix/playground.tar
+wget https://paddlenlp.bj.bcebos.com/datasets/paddlemix/playground/opensource_json.tar
+```
+opensource_json.tar must be downloaded and extracted under the playground/ directory; the extracted opensource_json directory holds the annotation json files.
+
+### 4.2 Fine-Tuning Commands
+
+Note: this fine-tuning recipe freezes the vision encoder and trains the LLM. Fine-tuning takes roughly 30G of GPU memory for the 2B model and roughly 75G for the 7B model.
+
+```bash
+# 2B
+sh paddlemix/examples/qwen2_vl/shell/basline_2b_bs32_1e8.sh
+
+# 7B
+sh paddlemix/examples/qwen2_vl/shell/basline_7b_bs32_1e8.sh
+```
+
+### 4.3 Using the Fine-Tuned Model
+
+Run inference the same way as in step 3; just change the `MODEL_NAME` parameter in `paddlemix/examples/qwen2_vl/single_image_infer.py` to the path of the fine-tuned model.
+
+```bash
+python paddlemix/examples/qwen2_vl/single_image_infer.py
+```
+
+
 ## References
 ```BibTeX
 @article{Qwen2-VL,
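
Below is a minimal standard-library Python sketch of the data-preparation step described in section 4.1 of the README diff above. The README itself uses wget on the shell; the assumption that playground.tar unpacks to a top-level playground/ directory, into which opensource_json.tar is then extracted, follows from the README's wording and is marked in the comments.

```python
# Sketch of README section 4.1 (standard-library equivalent of the wget commands above).
# Assumption: playground.tar unpacks to a top-level playground/ directory, and the
# annotation archive opensource_json.tar is extracted inside it, as the README describes.
import tarfile
import urllib.request

URLS = [
    "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/playground.tar",
    "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/playground/opensource_json.tar",
]

for url in URLS:
    filename = url.rsplit("/", 1)[-1]
    urllib.request.urlretrieve(url, filename)  # large downloads; resuming is not handled here

with tarfile.open("playground.tar") as tar:
    tar.extractall(".")            # creates playground/ with the image data
with tarfile.open("opensource_json.tar") as tar:
    tar.extractall("playground/")  # annotation jsons land under playground/opensource_json/
```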
Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
{
    "dvqa_train_200k": {
        "annotation": "playground/opensource_json/dvqa_train_200k.json",
        "repeat_time": 1
    },
    "chartqa_train_18k": {
        "annotation": "playground/opensource_json/chartqa_train_18k.json",
        "repeat_time": 1
    },
    "ai2d_train_12k": {
        "annotation": "playground/opensource_json/ai2d_train_12k.json",
        "repeat_time": 1
    },
    "docvqa_train_10k": {
        "annotation": "playground/opensource_json/docvqa_train_10k.json",
        "repeat_time": 1
    },
    "geoqa+": {
        "annotation": "playground/opensource_json/geoqa+.json",
        "repeat_time": 1
    },
    "synthdog_en": {
        "annotation": "playground/opensource_json/synthdog_en.json",
        "repeat_time": 1
    },

    "updated_ChromeWriting_8k": {
        "file_name": "LLaVA-OneVision-Data_OCR/updated_ChromeWriting_8825.json",
        "repeat_time": 1
    },
    "updated_K12_Printing_256k": {
        "file_name": "LLaVA-OneVision-Data_OCR/updated_K12_Printing_256636.json",
        "repeat_time": 1
    },
    "updated_Rendered_Text_9k": {
        "file_name": "LLaVA-OneVision-Data_OCR/updated_Rendered_Text_9995.json",
        "repeat_time": 1
    },
    "updated_TextOCR_29k": {
        "file_name": "LLaVA-OneVision-Data_OCR/updated_TextOCR_29288.json",
        "repeat_time": 1
    },

    "ai2d(cauldron_llava_format)_filter_sptoken_2429": {
        "file_name": "LLaVA-OneVision-Data_doc/ai2d(cauldron_llava_format)_filter_sptoken_2429.json",
        "repeat_time": 1
    },
    "ai2d(gpt4v)_filter_sptoken_4864": {
        "file_name": "LLaVA-OneVision-Data_doc/ai2d(gpt4v)_filter_sptoken_4864.json",
        "repeat_time": 1
    },
    "ai2d(internvl)_filter_sptoken_12403": {
        "file_name": "LLaVA-OneVision-Data_doc/ai2d(internvl)_filter_sptoken_12403.json",
        "repeat_time": 1
    },
    "chart2text(cauldron)_filter_sptoken_26956": {
        "file_name": "LLaVA-OneVision-Data_doc/chart2text(cauldron)_filter_sptoken_26956.json",
        "repeat_time": 1
    },
    "chartqa(cauldron_llava_format)_filter_sptoken_18260": {
        "file_name": "LLaVA-OneVision-Data_doc/chartqa(cauldron_llava_format)_filter_sptoken_18260.json",
        "repeat_time": 1
    },
    "diagram_image_to_text(cauldron)_filter_sptoken_295": {
        "file_name": "LLaVA-OneVision-Data_doc/diagram_image_to_text(cauldron)_filter_sptoken_295.json",
        "repeat_time": 1
    },
    "dvqa(cauldron_llava_format)_filter_sptoken_199995": {
        "file_name": "LLaVA-OneVision-Data_doc/dvqa(cauldron_llava_format)_filter_sptoken_199995.json",
        "repeat_time": 1
    },
    "FigureQA(MathV360K)_filter_sptoken_17587": {
        "file_name": "LLaVA-OneVision-Data_doc/FigureQA(MathV360K)_filter_sptoken_17587.json",
        "repeat_time": 1
    },
    "hitab(cauldron_llava_format)_filter_sptoken_2495": {
        "file_name": "LLaVA-OneVision-Data_doc/hitab(cauldron_llava_format)_filter_sptoken_2495.json",
        "repeat_time": 1
    },
    "infographic_vqa_llava_format_filter_sptoken_2113": {
        "file_name": "LLaVA-OneVision-Data_doc/infographic_vqa_llava_format_filter_sptoken_2113.json",
        "repeat_time": 1
    },
    "lrv_filter_sptoken_1776": {
        "file_name": "LLaVA-OneVision-Data_doc/lrv_filter_sptoken_1776.json",
        "repeat_time": 1
    },
    "robut_sqa_filter_sptoken_8509": {
        "file_name": "LLaVA-OneVision-Data_doc/robut_sqa_filter_sptoken_8509.json",
        "repeat_time": 1
    },
    "robut_wikisql_filter_sptoken_74984": {
        "file_name": "LLaVA-OneVision-Data_doc/robut_wikisql_filter_sptoken_74984.json",
        "repeat_time": 1
    },
    "robut_wtq_filter_sptoken_38241": {
        "file_name": "LLaVA-OneVision-Data_doc/robut_wtq_filter_sptoken_38241.json",
        "repeat_time": 1
    },
    "screen2word_filter_sptoken_15725": {
        "file_name": "LLaVA-OneVision-Data_doc/screen2word_filter_sptoken_15725.json",
        "repeat_time": 1
    },
    "tqa_filter_sptoken_27272": {
        "file_name": "LLaVA-OneVision-Data_doc/tqa_filter_sptoken_27272.json",
        "repeat_time": 1
    },
    "ureader_cap_filter_sptoken_91434": {
        "file_name": "LLaVA-OneVision-Data_doc/ureader_cap_filter_sptoken_91434.json",
        "repeat_time": 1
    },
    "ureader_ie_filter_sptoken_17322": {
        "file_name": "LLaVA-OneVision-Data_doc/ureader_ie_filter_sptoken_17322.json",
        "repeat_time": 1
    },
    "ureader_kg_filter_sptoken_37550": {
        "file_name": "LLaVA-OneVision-Data_doc/ureader_kg_filter_sptoken_37550.json",
        "repeat_time": 1
    },
    "ureader_qa_filter_sptoken_252954": {
        "file_name": "LLaVA-OneVision-Data_doc/ureader_qa_filter_sptoken_252954.json",
        "repeat_time": 1
    },
    "visualmrc_filter_sptoken_3022": {
        "file_name": "LLaVA-OneVision-Data_doc/visualmrc_filter_sptoken_3022.json",
        "repeat_time": 1
    }
}
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
{
    "dvqa_train_200k": {
        "annotation": "playground/opensource_json/dvqa_train_200k.json",
        "repeat_time": 1
    },
    "chartqa_train_18k": {
        "annotation": "playground/opensource_json/chartqa_train_18k.json",
        "repeat_time": 1
    },
    "ai2d_train_12k": {
        "annotation": "playground/opensource_json/ai2d_train_12k.json",
        "repeat_time": 1
    },
    "docvqa_train_10k": {
        "annotation": "playground/opensource_json/docvqa_train_10k.json",
        "repeat_time": 1
    },
    "geoqa+": {
        "annotation": "playground/opensource_json/geoqa+.json",
        "repeat_time": 1
    },
    "synthdog_en": {
        "annotation": "playground/opensource_json/synthdog_en.json",
        "repeat_time": 1
    }
}
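
The two json files above share the same dataset-mixture schema: each entry names an annotation file (under the key `annotation` or `file_name`) and a `repeat_time` oversampling factor. As a rough illustration only, not the PaddleMIX trainer's actual loader, such a config could be expanded like this:

```python
# Illustrative sketch -- not the loader used by the PaddleMIX trainer.
# Expands a dataset-mixture config (like the two json files above) into a flat
# list of annotation-file paths, repeating each entry repeat_time times.
import json


def expand_mixture(config_path):
    with open(config_path, "r", encoding="utf-8") as f:
        config = json.load(f)

    annotation_files = []
    for entry in config.values():
        # Entries use either "annotation" or "file_name" for the annotation path.
        path = entry.get("annotation") or entry.get("file_name")
        repeat = int(entry.get("repeat_time", 1))
        annotation_files.extend([path] * repeat)
    return annotation_files


# Example with the config referenced in the README (path taken from the README):
# files = expand_mixture("paddlemix/examples/qwen2_vl/configs/baseline_6data_330k.json")
```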

paddlemix/examples/qwen2_vl/multi_image_infer.py

Lines changed: 3 additions & 3 deletions
@@ -24,7 +24,7 @@
 MODEL_NAME = "Qwen/Qwen2-VL-2B-Instruct"
 model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_NAME, dtype="bfloat16")
 
-image_processor = Qwen2VLImageProcessor.from_pretrained(MODEL_NAME)
+image_processor = Qwen2VLImageProcessor()
 tokenizer = Qwen2Tokenizer.from_pretrained(MODEL_NAME)
 processor = Qwen2VLProcessor(image_processor, tokenizer)
 
@@ -48,8 +48,8 @@
 image_inputs, video_inputs = process_vision_info(messages)
 
 question = "Identify the similarities between these images."
-image_pad_tokens = '<|vision_start|><|image_pad|><|vision_end|>' * len(image_inputs)
-text = f'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{image_pad_tokens}{question}<|im_end|>\n<|im_start|>assistant\n'
+image_pad_tokens = "<|vision_start|><|image_pad|><|vision_end|>" * len(image_inputs)
+text = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{image_pad_tokens}{question}<|im_end|>\n<|im_start|>assistant\n"
 
 inputs = processor(
     text=[text],
0 commit comments

Comments
 (0)