Skip to content

Add Image Text Retrieval taskflow&pipelines API #4516

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Feb 14, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions paddlenlp/taskflow/taskflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
TextToImageGenerationTask,
TextToImageStableDiffusionTask,
)
from .vision_language_embedding import VisionLanguageTask
from .word_segmentation import SegJiebaTask, SegLACTask, SegWordTagTask
from .zero_shot_text_classification import ZeroShotTextClassificationTask

Expand Down Expand Up @@ -486,6 +487,15 @@
},
"default": {"model": "utc-large"},
},
"vision_language": {
"models": {
"PaddlePaddle/ernie_vil-2.0-base-zh": {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ERNIE系列模型这里是否可以简化为ernie_vil-2.0-base-zh

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

已跟余军沟通,ernie_vil 2.0上传的时候当成社区模型了

"task_class": VisionLanguageTask,
"task_flag": "vision_language_embeddings-2.0-base-zh",
},
},
"default": {"model": "PaddlePaddle/ernie_vil-2.0-base-zh"},
},
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

感觉除了ernie_vil应该也可以接一下clip和chineseclip吧

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

已经添加

}

support_schema_list = [
Expand Down
116 changes: 116 additions & 0 deletions paddlenlp/taskflow/vision_language_embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
from PIL import Image

from ..transformers import ErnieViLModel, ErnieViLProcessor
from .task import Task


class VisionLanguageTask(Task):
    """
    Vision-language embedding task: encodes texts and/or images into a shared
    embedding space with an ERNIE-ViL model, for image-text retrieval.

    Args:
        task (string): The name of task.
        model (string): The model name in the task.
        kwargs (dict, optional): Additional keyword arguments passed along to the specific task.
    """

    def __init__(self, task, model, **kwargs):
        super().__init__(task=task, model=model, **kwargs)
        self._seed = None
        # Batching is not used: each input dict is processed as one batch.
        self._batch_size = 1
        self._construct_tokenizer(image_model=model, text_model="ernie_vil-2.0-base-zh")
        self._construct_model(model)

    def _construct_model(self, model):
        """
        Construct the inference model for the predictor.
        """
        self._model = ErnieViLModel.from_pretrained(model)
        self._model.eval()

    def _construct_tokenizer(self, image_model, text_model):
        """
        Construct the processor (tokenizer + image feature extractor) for the predictor.

        NOTE(review): ``text_model`` is currently unused — the processor loaded
        from ``image_model`` handles both modalities. Kept for interface
        compatibility with the call in ``__init__``.
        """
        self._processor = ErnieViLProcessor.from_pretrained(image_model)

    def _batchify(self, data, batch_size):
        """
        Generate model-ready input batches.

        Each element of ``data`` is expected to be a dict with keys
        ``"texts"`` (list of str) and ``"images"`` (list of image file
        paths openable by PIL — TODO confirm against callers).
        ``batch_size`` is accepted for interface compatibility, but every
        input dict is processed as a single batch.
        """

        def _parse_batch(batch_examples):
            batch_texts = batch_examples["texts"]
            batch_images = [Image.open(item) for item in batch_examples["images"]]

            tokenized_inputs = self._processor(
                text=batch_texts, images=batch_images, return_tensors="pd", padding="max_length", truncation=True
            )

            return tokenized_inputs

        # Previously only data[0] was consumed; iterate over every example so
        # multi-example inputs are not silently dropped. Backward compatible:
        # identical output for the single-example case.
        for example in data:
            yield _parse_batch(example)

    def _preprocess(self, inputs):
        """
        Transform the raw text to the model inputs, two steps involved:
        1) Transform the raw text to token ids.
        2) Generate the other model inputs from the raw text and token ids.
        """
        batches = self._batchify(inputs, self._batch_size)
        outputs = {"batches": batches, "text": inputs}
        return outputs

    def _run_model(self, inputs):
        """
        Run the task model from the outputs of the `_preprocess` function.

        Collects per-batch text and image feature tensors into
        ``inputs["text_features"]`` and ``inputs["image_features"]``.
        """
        all_texts = []
        all_images = []
        for batch_inputs in inputs["batches"]:
            # Empty modality lists produce empty tensors; skip those.
            if len(batch_inputs["input_ids"]) > 0:
                text_features = self._model.get_text_features(input_ids=batch_inputs["input_ids"])
                all_texts.append(text_features)
            if len(batch_inputs["pixel_values"]) > 0:
                image_features = self._model.get_image_features(pixel_values=batch_inputs["pixel_values"])
                all_images.append(image_features)
        inputs.update({"text_features": all_texts})
        inputs.update({"image_features": all_images})
        return inputs

    def _postprocess(self, inputs):
        """
        Return the raw feature dict unchanged; callers consume
        ``text_features`` / ``image_features`` directly.
        """
        return inputs

    def _construct_input_spec(self):
        """
        Construct the input spec for the convert dygraph model to static model.
        """
        self._input_spec = [
            paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"),
        ]