Skip to content

Commit 82a867f

Browse files
authored
[wip] add mix scheme (#664)
## PaddleMIX统一多模数据格式 1. [x] 纯文 2. [x] 单图 3. [x] 多图 4. [x] interleaved 5. [ ] 音频 6. [ ] 视频 ## 功能 1. [x] `MIX`格式定义和检查 2. [x] `MM`格式到`MIX`格式转换Op --- ## 特殊字段 1. [x] `images <-> <image>id</image>` 2. [ ] `audios <-> <audio>id</audio>` 3. [ ] `videos <-> <video>id</video>` ``` [ { 'id': '000002b66c9c498e', 'images': [ { 'id': 0, 'url': 'train/000002b66c9c498e.jpg', 'heigh': 100, 'width': 100, }, { 'id': 1, 'url': 'train/000002b66c9c498e.jpg', 'heigh': 100, 'width': 100, }, ], 'conversations': [ { 'from': 'user', 'value': '<image>id</image><image>id</image> xxxx' }, { 'from': 'assistant', 'value': 'xxx' }, { 'from': 'user', 'value': 'xxxx <image>id</image>' }, { 'from': 'assistant', 'value': 'xxx' } ], }, ] ```
1 parent 4b86c51 commit 82a867f

File tree

9 files changed

+194
-15
lines changed

9 files changed

+194
-15
lines changed

paddlemix/datacopilot/core/__init__.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,13 @@
1313
# limitations under the License.
1414

1515

16-
from .schema import (
17-
T,
18-
SCHEMA,
19-
is_valid_schema,
20-
)
16+
from .schema import T, SCHEMA, is_valid_schema
17+
from .dataset import MMDataset, ParallelMode
18+
from .register import register
2119

22-
from .dataset import (
23-
MMDataset,
24-
ParallelMode
25-
)
2620

27-
from .register import register
21+
# Regular-expression patterns for the inline placeholders that reference a
# media entry by its integer id inside a conversation value, e.g.
# ``<image>0</image>``.
# Raw strings are used so that ``\d`` reaches the regex engine verbatim instead
# of being parsed as an (invalid) string escape sequence.
# NOTE: the name is a misspelling of "MODALITY"; it is kept unchanged because
# other modules may already import it.
MODILATY_TOKENS = {
    'image': r'<image>\d+</image>',
    'audio': r'<audio>\d+</audio>',
    'video': r'<video>\d+</video>',
}

paddlemix/datacopilot/core/dataset.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,11 @@ def shuffle(self, seed: Optional[int]=None) -> 'MMDataset':
122122
random.shuffle(self._items)
123123
return self
124124

125+
def sample(self, k: int) -> 'MMDataset':
    """Return a new dataset made of ``k`` randomly chosen items.

    Positions are drawn with ``random.sample``, i.e. without replacement,
    so no position is picked twice. The new dataset holds references to
    the selected items; they are not copied here.

    Args:
        k: Number of items to draw.

    Returns:
        A new ``MMDataset`` built from the selected items.

    Raises:
        ValueError: Propagated from ``random.sample`` when ``k`` is
            negative or larger than ``len(self)``.
    """
    population = range(len(self))
    picked = random.sample(population, k)
    return MMDataset([self.items[idx] for idx in picked])
129+
125130
@classmethod
126131
def from_json(cls, path: str, schema: SCHEMA=SCHEMA.MM) -> 'MMDataset':
127132
with open(path, 'r') as f:

paddlemix/datacopilot/core/schema.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,74 @@ class SCHEMA(Enum):
7474
required: ['id', 'image', 'conversations']
7575
"""
7676

77+
MIX = \
78+
"""
79+
$id: 'https://example.com/schemas/multimodal_mix'
80+
$schema: 'https://json-schema.org/draft/2020-12/schema'
81+
82+
type: object
83+
properties:
84+
id:
85+
anyOf:
86+
-
87+
type: string
88+
pattern: '\S{1,}'
89+
-
90+
type: integer
91+
minimum: 0
92+
93+
images:
94+
anyOf:
95+
-
96+
type: 'null'
97+
-
98+
type: array
99+
minItems: 1
100+
items:
101+
type: object
102+
properties:
103+
id:
104+
type: integer
105+
minimum: 0
106+
url:
107+
type: string
108+
pattern: '\.(jpg|jpeg|png|webp|JPG|JPEG|PNG)$'
109+
description: '.png or .jpg or .jpeg or .webp'
110+
heigh:
111+
type: integer
112+
minimum: 0
113+
width:
114+
type: integer
115+
minimum: 0
116+
required:
117+
- id
118+
- url
119+
120+
conversations:
121+
type: array
122+
minItems: 1
123+
items:
124+
type: object
125+
properties:
126+
from:
127+
type: string
128+
description: 'user or assistant'
129+
enum:
130+
- user
131+
- assistant
132+
value:
133+
anyOf:
134+
- type: string
135+
- type: 'null'
136+
required:
137+
- from
138+
- value
139+
required:
140+
- id
141+
- images
142+
- conversations
143+
"""
144+
77145
SCHEMA_VALIDATORS = {
78146
k: JsonSchemaValidator.from_string(k.value) for k in SCHEMA
79147
}

paddlemix/datacopilot/ops/__init__.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,6 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
from ._info import info, head
16-
from ._h5 import from_h5, check_h5, export_h5
15+
16+
from .analysis import *
17+
from .convert import *
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
from ._info import info, head
17+

paddlemix/datacopilot/ops/_info.py renamed to paddlemix/datacopilot/ops/analysis/_info.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from rich.table import Table
1818
from rich.columns import Columns
1919

20-
from ..core import register, MMDataset
20+
from ...core import register, MMDataset
2121

2222

2323
@register(force=True)
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
from ._h5 import from_h5, check_h5, export_h5
17+
from ._schema import convert_schema

paddlemix/datacopilot/ops/_h5.py renamed to paddlemix/datacopilot/ops/convert/_h5.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@
2424

2525
from typing import List, Union
2626

27-
from ..core import MMDataset, SCHEMA
28-
from ..misc import ParallelMode, parallel_map, freeze_rng_state, enumerate_chunk
27+
from ...core import MMDataset, SCHEMA
28+
from ...misc import ParallelMode, parallel_map, freeze_rng_state, enumerate_chunk
2929

3030
__all__ = ['export_h5', 'check_h5', 'from_h5']
3131

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
from ...core import T
17+
from ...core import SCHEMA, is_valid_schema
18+
19+
20+
def convert_schema(
    item: T,
    in_schema: SCHEMA=SCHEMA.MM,
    out_schema: SCHEMA=SCHEMA.MIX
) -> T:
    """Convert a single item from one schema to another.

    Args:
        item: The record to convert.
        in_schema: Schema that ``item`` currently follows.
        out_schema: Schema the returned record should follow.

    Returns:
        ``item`` itself (the same object) when both schemas are equal,
        otherwise a newly built record in ``out_schema``.

    Raises:
        NotImplementedError: If the requested conversion is not supported.
            Only ``SCHEMA.MM`` -> ``SCHEMA.MIX`` is implemented.
    """
    if in_schema == out_schema:
        return item

    # MM -> MIX is the only conversion implemented so far.
    if in_schema == SCHEMA.MM and out_schema == SCHEMA.MIX:
        return _convert_mm_mix(item)

    # Name the unsupported pair so the caller can see what was requested.
    raise NotImplementedError(
        f'Converting from {in_schema} to {out_schema} is not supported.'
    )
36+
37+
38+
def _convert_mm_mix(item):
39+
if 'image' in item:
40+
images = [{
41+
'id': 0,
42+
'url': item['image'],
43+
}]
44+
else:
45+
images = None
46+
47+
conversations = []
48+
for conv in item['conversations']:
49+
if conv['from'] == 'human':
50+
role = 'user'
51+
if 'image' in item:
52+
if '<image>' in conv['value']:
53+
value = conv['value'].replace('<image>', '<image>0</image>')
54+
else:
55+
value = '<image>0</image>\n' + conv['value']
56+
else:
57+
value = conv['value']
58+
else:
59+
role = 'assistant'
60+
value = conv['value']
61+
62+
conversations.append({
63+
'from': role,
64+
'value': value,
65+
})
66+
67+
newitem = {
68+
'id': item['id'],
69+
'images': images,
70+
'conversations': conversations
71+
}
72+
return newitem
73+

0 commit comments

Comments
 (0)