Commit 2af80b1

add pp-inscaptagger (PaddlePaddle#727)

pkhk-1 and lyuwenyu authored
Co-authored-by: lyuwenyu <wenyu.lyu@gmail.com>
1 parent 249d537 commit 2af80b1
File tree: 7 files changed, +508 −1 lines changed

paddlemix/auto/processing.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -124,7 +124,7 @@ def _get_processor_class(cls, pretrained_model_name_or_path, text_model_name_or_
         for names, processor_class in cls._processor_mapping.items():
-            if names.lower() in pretrained_model_name_or_path.lower().replace("-", "_").replace("vicuna", "llava"):
+            if names.lower() in pretrained_model_name_or_path.lower().replace("-", "_").replace("vicuna", "llava").replace("inscaptagger", "llava"):
                 attributes = processor_class["processor"].attributes
                 attributes_dict = {}
```
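For context, a small sketch (not part of the commit) of what this normalization chain does to the new checkpoint name, so the existing llava processor mapping matches it:

```python
# Hypothetical walk-through of the name normalization in _get_processor_class.
name = "paddlemix/PP-InsCapTagger"
normalized = (
    name.lower()                       # "paddlemix/pp-inscaptagger"
    .replace("-", "_")                 # "paddlemix/pp_inscaptagger"
    .replace("vicuna", "llava")        # no-op here
    .replace("inscaptagger", "llava")  # "paddlemix/pp_llava"
)
print("llava" in normalized)  # True -> the llava processor entry is selected
```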
paddlemix/datacopilot/example/pp_inscaptagger/inference.py (new file: 73 additions, 0 deletions)
```python
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
from pathlib import Path

import paddle

from paddlemix.datacopilot.core import MMDataset
from paddlemix.datacopilot.misc import enumerate_chunk
from paddlemix.datacopilot.nn import PPInsCapTagger


class QAschema(argparse.Action):
    """Pair up a flat QA list: [q1, a1, q2, a2, ...] -> [(q1, a1), (q2, a2), ...]."""

    def __call__(self, parser, namespace, values, option_string=None):
        assert len(values) % 2 == 0, "QA content must be a list of question/answer pairs"
        values = list(zip(values[0::2], values[1::2]))
        setattr(namespace, self.dest, values)


if __name__ == '__main__':
    # Options shared by both inference modes.
    base = argparse.ArgumentParser(add_help=False)
    base.add_argument('-m', '--model-name-or-path', type=str, default='paddlemix/PP-InsCapTagger')
    base.add_argument('-t', '--dtype', type=str, default='float16')
    base.add_argument('-k', '--k-start', type=int, default=0)
    base.add_argument('-o', '--output-dir', default='SFT_tag_output_test')
    base.add_argument('--seed', type=int, default=0)

    parser = argparse.ArgumentParser()
    subs = parser.add_subparsers(help='mode of data: json_data/single_data', dest='mod')
    json_parser = subs.add_parser('json_data', parents=[base])
    json_parser.add_argument('-d', '--dataset-path', type=str, required=True)

    single_parser = subs.add_parser('single_data', parents=[base])
    single_parser.add_argument('-image', '--image-path', type=str, required=True)
    single_parser.add_argument('-qa', '--qa-content', nargs='+', type=str, required=True, action=QAschema)

    args = parser.parse_args()
    paddle.seed(seed=args.seed)

    if args.mod == 'json_data':
        # Batch mode: tag the dataset in chunks of 1000 samples, resuming from chunk k_start.
        Path(args.output_dir).mkdir(parents=True, exist_ok=True)
        m = PPInsCapTagger(args.model_name_or_path)
        dataset = MMDataset.from_auto(args.dataset_path)
        print('loading dataset...')
        print('data size==', len(dataset))
        for i, subdata in enumerate_chunk(dataset, chunk_size=1000, start=args.k_start):
            print(f'processing chunk {i} (chunk size 1000)')
            subdata: MMDataset
            subdata = subdata.map(m.inference, max_workers=1)
            subdata.export_json(f'{args.output_dir}/tagger_{i:05}.json')
            print(f'chunk {i} saved to {args.output_dir}/tagger_{i:05}.json')

    if args.mod == 'single_data':
        # Single-sample mode: build one item from the image path and QA pairs, then tag it.
        item = {}
        item["image"] = args.image_path
        item['conversations'] = args.qa_content
        m = PPInsCapTagger(args.model_name_or_path)
        tag_item = m(item)
        print(tag_item)
```
New file: 112 additions, 0 deletions
# PP-InsCapTagger

## Overview

PP-InsCapTagger (Instance Capability Tagger) is an instance-capability tagging model for datasets, implemented by DataCopilot on top of PaddleMix. It tags the instance capabilities of multimodal data samples; optimizing a dataset according to its instance-capability distribution can improve model training efficiency and provides an effective approach to dataset analysis and evaluation.
Using the model's tagging results to optimize the LLaVA SFT dataset can **improve the training efficiency of the LLaVA model's SFT stage by 50%.**

Instance capability tags: in multimodal tasks, each data sample can be abstracted into one or more capabilities; during training, the model learns from these samples and strengthens the corresponding capabilities, as illustrated below. To evaluate and optimize a dataset, we can use a model to tag the instance capabilities that each multimodal sample contributes during training, and then optimize the dataset based on the distribution of these tags, thereby improving training efficiency.

<p align="center">
  <img src="https://github.com/user-attachments/assets/e2a8931f-ce24-47c5-9970-b42031bb28c5" align="middle" width = "800" />
</p>

PP-InsCapTagger is trained with PaddleMix, using `llava-v1.6-7b` as the `base` model. Its training data uses part of the images and multi-turn conversations from the multimodal dataset [LLaVA-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K): GPT-4o tags the instance capabilities of each sample, the result is stored as that sample's `tags` attribute, and DataCopilot then performs efficient dataset preprocessing, combining the original multi-turn conversations with the `tags` results to reconstruct the dataset's `question` and `answer` fields.

Some details of PP-InsCapTagger training and inference are covered in the AI Studio project: [基于PaddleMIX的数据集行为标签分类器训推实例](https://aistudio.baidu.com/projectdetail/7917712)
## Usage Examples

This project provides the PP-InsCapTagger inference script `inference.py`. Its two modes, `single_data` and `json_data`, support single-sample inference with an image-text pair as input and batch inference with a `json` file as input, respectively.

### Single-sample inference

Input image: <center><img src="https://github.com/user-attachments/assets/1c2fec64-3c94-4782-bc85-ccb083c1f4b2" width = "250"/></center>

Input multi-turn conversation:

```
Q: What animal is in the image? A: The image features a dog.
Q: What color are the dog's eyes? A: The dog has blue eyes.
Q: Where is the dog situated in the image? A: The dog is situated inside a vehicle, on a front passenger seat.
```

```bash
# Run from the PaddleMIX root directory
python paddlemix/datacopilot/example/pp_inscaptagger/inference.py \
    single_data \
    -m paddlemix/PP-InsCapTagger \
    -image https://paddlenlp.bj.bcebos.com/models/community/paddlemix/PP-InsCapTagger/demo.jpg \
    -qa "What animal is in the image?" "The image features a dog." \
        "What color are the dog's eyes?" "The dog has blue eyes." \
        "Where is the dog situated in the image?" "The dog is situated inside a vehicle, on a front passenger seat."
```

Here `-m` is the path of the model weights; when set to `paddlemix/PP-InsCapTagger`, the `PP-InsCapTagger` model is downloaded automatically. `-image` is the input image address (a local path or an http link), and `-qa` is the multi-turn conversation content, with questions and answers separated by spaces.
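For reference, `inference.py` pairs the flat `-qa` argument list into (question, answer) tuples via its `QAschema` action; a minimal sketch of that pairing with the values above:

```python
# Flat CLI list as argparse receives it: [q1, a1, q2, a2, ...]
qa = [
    "What animal is in the image?", "The image features a dog.",
    "What color are the dog's eyes?", "The dog has blue eyes.",
]
# zip even and odd positions -> [(q1, a1), (q2, a2), ...],
# which becomes the item's "conversations" field.
conversations = list(zip(qa[0::2], qa[1::2]))
```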
### Batch inference

```bash
# Run from the PaddleMIX root directory
python paddlemix/datacopilot/example/pp_inscaptagger/inference.py \
    json_data \
    -m paddlemix/PP-InsCapTagger \
    -d path/to/your/data.json \
    -k 0 \
    -o path/to/your/output-dir
```

Here `path/to/your/data.json` is the path of the input batch data file, in the following format:

```json
[
    {
        "image": "http://ecx.images-amazon.com/images/I/51ntbts0gmL.jpg",
        "conversations": [
            [
                "<image>\nWhat is the genre of this book?",
                "Literature & Fiction"
            ],
            [
                "What is the color of this book?",
                "Red and black"
            ]
        ]
    },
    {
        "image": "http://ecx.images-amazon.com/images/I/51cc3XrLevL.jpg",
        "conversations": [
            [
                "<image>\nWhat is the title of this book?",
                "Beyond Bigger Leaner Stronger: The Advanced Guide to Building Muscle, Staying Lean, and Getting Strong (The Build Muscle, Get Lean, and Stay Healthy Series)"
            ]
        ]
    }
]
```

`-k` sets the chunk index at which batch processing starts (default 0); if processing is interrupted, it can be changed to resume from a later position. `path/to/your/output-dir` is where the result json files are saved; the results of each chunk are saved in a separate json file named `tagger_{i:05}.json`.
## Using the Tags

The instruction-tuning dataset used in the SFT stage of the LLaVA v1.5 model is llava_v1_5_mix665k from [LLaVA-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K), a mixture of several datasets. Compared with the pre-training data, it is larger in scale and much more uneven in its instance-capability distribution. To optimize this distribution and thus improve training efficiency, we tagged the dataset with PP-InsCapTagger and analyzed the tag statistics.

Tagging llava_v1_5_mix665k with PP-InsCapTagger yields 7913 tags. Visualizing the distribution of the 100 most frequent tags shows large differences across tags, as in the figure below:

<details>
<summary>See</summary>
<center><img src="https://github.com/user-attachments/assets/48e30848-fe18-4e1a-a9a5-6c6f18ad9029" width = "300"/></center>
</details>


To optimize llava_v1_5_mix665k, we filter it using the PP-InsCapTagger tags: **first determine N, the per-sample tag count that covers 80% of the data; then take the tags whose frequency puts them in the top 0.7% of the tag set as a filter set R; for each sample in llava_v1_5_mix665k, if it has fewer than N tags and all of its tags are in R, delete it, otherwise keep it.** With this strategy, the retained dataset is about 50% the size of the original.
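A minimal sketch of this filtering rule (a hypothetical helper, assuming each sample carries a Python list of tags and that N has been computed beforehand from the tag-count statistics):

```python
from collections import Counter

def filter_by_tags(samples, N, ratio=0.007):
    """Keep a sample unless it has fewer than N tags and all of them fall in R."""
    # R: tags whose frequency puts them in the top `ratio` fraction of the tag set.
    freq = Counter(tag for s in samples for tag in s["tags"])
    top_k = int(len(freq) * ratio)
    R = {tag for tag, _ in freq.most_common(top_k)}
    return [s for s in samples
            if not (len(s["tags"]) < N and set(s["tags"]) <= R)]
```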
We trained llava-1.5-7b through the SFT stage with the full llava_v1_5_mix665k dataset and with the filtered dataset, respectively. The comparison is shown below:

| Version | ScienceQA | TextVQA | VQAv2 | GQA | MMMU | MME |
|:----------------------:|:-----------:|:---------:|:-------:|:-------:|:-------:|:----------------:|
| llava-1.5-7b <br> (paper) | 66.8 | 58.2 | 78.5 | 62.0 | - | - |
| llava-1.5-7b <br> (rerun) | 69.01 | 57.6 | 79.0 | 62.95 | 36.89 | 1521 <br> 323 |
| llava-1.5-7b <br> (tag 50% / ours) | 70.24 | 57.12 | 78.32 | 62.14 | 37.11 | 1476 <br> 338 |

With PP-InsCapTagger tagging and filtering, training on 50% of the data is essentially on par with the full dataset, greatly improving training efficiency.
New file: 117 additions, 0 deletions
```python
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import ast
import glob
import json
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np

from paddlemix.datacopilot.core import MMDataset


def merge_json_files(folder_path):
    """Merge all per-chunk json files in a folder into a single dataset file."""
    newdataset = MMDataset()
    paths = sorted(glob.glob(f'{folder_path}/*.json'))
    file_count = len(paths)
    for path in paths:
        newdataset += MMDataset.from_json(path)
    output_file = f'merged_{file_count}.json'
    newdataset.export_json(output_file)
    return output_file


def all_tag_count(data_json):
    """Count the frequency of every tag in a tagged dataset and save a sorted list."""
    data = json.load(open(data_json, encoding='utf-8'))
    tag_counts = {}
    n = 0
    for item in data:
        try:
            tags = ast.literal_eval(item["tag"])['tags']
        except Exception:
            n += 1
            continue
        for tag in set(tags):
            # Some tags are comma-separated strings; split those into
            # individual tags and strip surrounding whitespace.
            subtags = [s.strip() for s in tag.split(',')] if ',' in tag else [tag]
            for subtag in subtags:
                tag_counts[subtag] = tag_counts.get(subtag, 0) + 1
    print('samples with invalid tags:', n)
    print('total number of samples:', len(data))
    print('number of distinct tags:', len(tag_counts))
    sorted_tag_counts = sorted(tag_counts.items(), key=lambda item: item[1], reverse=True)
    output_file = data_json.replace('.json', '_tag_count.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(sorted_tag_counts, f, ensure_ascii=False, indent=4)
    return sorted_tag_counts


def one_data_tag_count(data_json):
    """Compute per-sample tag counts and the minimal counts covering most of the data."""
    data = json.load(open(data_json, encoding='utf-8'))
    # Number of tags attached to each sample.
    tag_counts = []
    for item in data:
        try:
            tags = ast.literal_eval(item["tag"])['tags']
            tag_counts.append(len(tags))
        except Exception:
            print(item["tag"])

    # Frequency of each per-sample tag count.
    tag_count_freq = Counter(tag_counts)
    # Sort by tag count (descending) to accumulate data coverage.
    sorted_tag_counts = sorted(tag_count_freq.items(), key=lambda x: x[0], reverse=True)
    tag_count_freq_dict = dict(sorted_tag_counts)
    # Save the statistics to a JSON file.
    output_file = data_json.replace('.json', '_tag_count_statistics.json')
    with open(output_file, 'w') as f:
        json.dump(tag_count_freq_dict, f, indent=4)

    cumulative_coverage = np.cumsum([count for _, count in sorted_tag_counts])
    # Smallest per-sample tag count whose cumulative coverage reaches 80% (resp. 60%) of the data.
    total_data = len(tag_counts)
    cover_80_percent = next(tag for tag, cum_cov in zip([tag for tag, _ in sorted_tag_counts], cumulative_coverage) if cum_cov >= 0.8 * total_data)
    cover_60_percent = next(tag for tag, cum_cov in zip([tag for tag, _ in sorted_tag_counts], cumulative_coverage) if cum_cov >= 0.6 * total_data)
    print(f"per-sample tag count covering 80% of the data: {cover_80_percent}")
    print(f"per-sample tag count covering 60% of the data: {cover_60_percent}")


def tag_count_freq_plot(tag_count_file, topn):
    """Plot a horizontal bar chart of the topn most frequent tags."""
    # Use a font with CJK support in case tags contain Chinese
    # (e.g. SimHei, Microsoft YaHei, WenQuanYi Zen Hei).
    plt.rcParams['font.sans-serif'] = ['WenQuanYi Zen Hei']
    plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
    # Read the JSON file as UTF-8.
    data = json.load(open(tag_count_file, encoding='utf-8'))
    data = data[:topn]  # plot only the topn most frequent tags
    categories, values = zip(*data)
    plt.figure(figsize=(10, 30))
    plt.barh(categories, values, color='skyblue')
    plt.xlabel('Count')
    plt.title('Tag distribution')
    plt.yticks(fontsize=8)  # shrink labels so they fit
    # Save the figure next to the input file.
    im_path = tag_count_file.replace('.json', '_plot.png')
    plt.savefig(im_path, bbox_inches='tight')
```
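A possible end-to-end use of these helpers (a sketch; `SFT_tag_output_test` is the inference script's default output directory, and `topn=100` mirrors the top-100 plot in the README):

```python
# Merge the per-chunk tagger_*.json files, then compute and plot tag statistics.
merged = merge_json_files('SFT_tag_output_test')  # -> 'merged_<k>.json'
all_tag_count(merged)                             # writes merged_<k>_tag_count.json
one_data_tag_count(merged)                        # writes merged_<k>_tag_count_statistics.json
tag_count_freq_plot(merged.replace('.json', '_tag_count.json'), topn=100)
```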
New file: 69 additions, 0 deletions
```python
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import ast
from functools import partial

from paddlemix.datacopilot.core import MMDataset

# Threshold N: samples with fewer than tag_num tags whose tags all fall in the
# filter set are removed (see the filtering strategy in the README).
tag_num = 3


def process(item, all_tags):
    """Return None (drop) if the item has fewer than tag_num tags, all of them in all_tags."""
    try:
        tags = set(ast.literal_eval(item["tag"])['tags'])
        if len(tags) < tag_num and tags <= all_tags:
            return None
        return item
    except Exception:
        # Keep samples whose tag field cannot be parsed.
        return item


if __name__ == '__main__':
    tag_most_ratio = 0.007
    all_tags = set()
    path = 'path/to/your/tag_file.json'
    tag_path = 'path/to/your/tag_file_tag_count.json'
    tag_count_list = MMDataset.from_json(tag_path)
    total_tag_num = len(tag_count_list)
    print(f'total number of distinct tags in {path}:', total_tag_num)

    tag_used_num = int(total_tag_num * tag_most_ratio)
    print(f'number of tags in the top {tag_most_ratio:.1%} by frequency:', tag_used_num)
    for t, n in tag_count_list[:tag_used_num]:
        all_tags.add(t)
    print(f'top {tag_most_ratio:.1%} tags used for filtering:', all_tags)

    dataset = MMDataset.from_json(path)
    data_len = len(dataset)
    print('original dataset size:', data_len)
    func = partial(
        process,
        all_tags=all_tags
    )
    dataset = dataset.map(func)
    newdataset = dataset.nonempty()
    out_data_len = len(newdataset)
    print('filtered dataset size:', out_data_len)
    print('ratio of filtered to original dataset:', out_data_len / data_len)

    newdataset.export_json(path.replace('.json', f'_filter_{out_data_len}_tag.json'))
```

paddlemix/datacopilot/nn/__init__.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -14,4 +14,5 @@

 from ._lid import FastTextLIDModel
+from .inscaptagger import PPInsCapTagger
```