from ..transformers import UIE, UIEM, UIEX, AutoModel, AutoTokenizer
from ..utils.doc_parser import DocParser
from ..utils.ie_utils import map_offset, pad_image_data
+ from ..utils.log import logger
from ..utils.tools import get_bool_ids_greater_than, get_span
from .task import Task
from .utils import DataCollatorGP, SchemaTree, dbc2sbc, get_id_and_prob, gp_decode
@@ -376,7 +377,7 @@ class UIETask(Task):
        },
    }

-     def __init__(self, task, model, schema, **kwargs):
+     def __init__(self, task, model, schema=None, **kwargs):
        super().__init__(task=task, model=model, **kwargs)

        self._max_seq_len = kwargs.get("max_seq_len", 512)
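Note on the signature change above: schema is now optional, so an extractor can be constructed first and given its schema afterwards. A minimal usage sketch, assuming the Taskflow wrapper forwards set_schema() to the underlying UIETask instance (the model name and schema below are purely illustrative):

from paddlenlp import Taskflow

# Sketch only: construct the extractor without a schema, then set one later.
ie = Taskflow("information_extraction", model="uie-base")
ie.set_schema(["person", "organization", "time"])  # entities to extract
print(ie("Bill Gates co-founded Microsoft with Paul Allen in 1975."))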
@@ -385,7 +386,7 @@ def __init__(self, task, model, schema, **kwargs):
        self._position_prob = kwargs.get("position_prob", 0.5)
        self._lazy_load = kwargs.get("lazy_load", False)
        self._num_workers = kwargs.get("num_workers", 0)
-         self.use_fast = kwargs.get("use_fast", False)
+         self._use_fast = kwargs.get("use_fast", False)
        self._layout_analysis = kwargs.get("layout_analysis", False)
        self._ocr_lang = kwargs.get("ocr_lang", "ch")
        self._schema_lang = kwargs.get("schema_lang", "ch")
@@ -415,14 +416,31 @@ def __init__(self, task, model, schema, **kwargs):
        else:
            self._summary_token_num = 3  # [CLS] prompt [SEP] text [SEP]

-         self._doc_parser = None
-         self._schema_tree = None
-         self.set_schema(schema)
+         self._parser_map = {
+             "ch": None,  # OCR-CH
+             "en": None,  # OCR-EN
+             "ch-layout": None,  # Layout-CH
+             "en-layout": None,  # Layout-EN
+         }
+         if not schema:
+             logger.warning(
+                 "The schema has not been set yet, please set a schema via set_schema(). "
+                 "More details about the setting of schema please refer to https://github.com/PaddlePaddle/PaddleNLP/blob/develop/applications/information_extraction/taskflow_text.md"
+             )
+             self._schema_tree = None
+         else:
+             self.set_schema(schema)
        self._check_predictor_type()
        self._get_inference_model()
        self._usage = usage
        self._construct_tokenizer()

+     def set_argument(self, argument: dict):
+         for k, v in argument.items():
+             if k == "input":
+                 continue
+             setattr(self, f"_{k}", v)
+
    def set_schema(self, schema):
        if isinstance(schema, dict) or isinstance(schema, str):
            schema = [schema]
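For readers unfamiliar with the pattern, the new set_argument hook simply mirrors a dict of runtime arguments onto underscore-prefixed attributes, skipping the raw input payload. A standalone sketch of the same idea (the class and argument names here are illustrative, not part of the change):

class ArgHolder:
    """Stand-in object demonstrating setattr-based argument mirroring."""

    def set_argument(self, argument: dict):
        for k, v in argument.items():
            if k == "input":  # the input payload is handled elsewhere, not stored
                continue
            setattr(self, f"_{k}", v)


holder = ArgHolder()
holder.set_argument({"input": "raw text", "schema_lang": "en", "position_prob": 0.6})
assert holder._schema_lang == "en" and holder._position_prob == 0.6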
@@ -467,7 +485,7 @@ def _construct_tokenizer(self):
        Construct the tokenizer for the predictor.
        """
        self._tokenizer = AutoTokenizer.from_pretrained(
-             self._task_path, use_fast=self.use_fast, from_hf_hub=self.from_hf_hub
+             self._task_path, use_fast=self._use_fast, from_hf_hub=self.from_hf_hub
        )

    def _preprocess(self, inputs):
@@ -485,6 +503,7 @@ def _check_input_text(self, inputs):
        """
        Check whether the input meet the requirement.
        """
+         self._ocr_lang_choice = (self._ocr_lang + "-layout") if self._layout_analysis else self._ocr_lang
        inputs = inputs[0]
        if isinstance(inputs, dict) or isinstance(inputs, str):
            inputs = [inputs]
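The key added above combines the OCR language with the layout-analysis flag; a quick illustration of how the four possible cache keys are derived:

def parser_key(ocr_lang: str, layout_analysis: bool) -> str:
    # Same conditional expression as in _check_input_text above.
    return (ocr_lang + "-layout") if layout_analysis else ocr_lang


assert parser_key("ch", False) == "ch"
assert parser_key("en", False) == "en"
assert parser_key("ch", True) == "ch-layout"
assert parser_key("en", True) == "en-layout"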
@@ -494,17 +513,17 @@ def _check_input_text(self, inputs):
            data = {}
            if isinstance(example, dict):
                if "doc" in example.keys():
-                     if not self._doc_parser:
-                         self._doc_parser = DocParser(
+                     if not self._parser_map[self._ocr_lang_choice]:
+                         self._parser_map[self._ocr_lang_choice] = DocParser(
                            ocr_lang=self._ocr_lang, layout_analysis=self._layout_analysis
                        )
                    if "layout" in example.keys():
-                         data = self._doc_parser.parse(
+                         data = self._parser_map[self._ocr_lang_choice].parse(
                            {"doc": example["doc"]}, do_ocr=False, expand_to_a4_size=self._expand_to_a4_size
                        )
                        data["layout"] = example["layout"]
                    else:
-                         data = self._doc_parser.parse(
+                         data = self._parser_map[self._ocr_lang_choice].parse(
                            {"doc": example["doc"]}, expand_to_a4_size=self._expand_to_a4_size
                        )
                elif "text" in example.keys():
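Taken together, the edits above turn the single _doc_parser attribute into a small lazy cache: one DocParser per language/layout combination, created on first use and reused afterwards. A minimal sketch of the pattern with a stand-in parser class (the assumption being that DocParser construction is expensive enough to be worth caching per configuration):

class FakeParser:
    """Stand-in for DocParser; records how it was configured."""

    def __init__(self, ocr_lang, layout_analysis):
        self.ocr_lang = ocr_lang
        self.layout_analysis = layout_analysis


parser_map = {"ch": None, "en": None, "ch-layout": None, "en-layout": None}


def get_parser(ocr_lang: str, layout_analysis: bool) -> FakeParser:
    key = (ocr_lang + "-layout") if layout_analysis else ocr_lang
    if not parser_map[key]:
        # Built once per key; later calls with the same settings reuse it.
        parser_map[key] = FakeParser(ocr_lang, layout_analysis)
    return parser_map[key]


assert get_parser("en", True) is get_parser("en", True)
assert get_parser("ch", False) is not get_parser("ch", True)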
@@ -931,7 +950,7 @@ def _parse_inputs(self, inputs):
                        org_box[2] + offset_x,
                        org_box[3] + offset_y,
                    ]
-                     box = self._doc_parser._normalize_box(box, [img_w, img_h], [1000, 1000])
+                     box = self._parser_map[self._ocr_lang_choice]._normalize_box(box, [img_w, img_h], [1000, 1000])
                    text += segment[1]
                    bbox.extend([box] * len(segment[1]))
                _inputs.append({"text": text, "bbox": bbox, "image": d["image"], "layout": d["layout"]})
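_normalize_box is assumed here to rescale a pixel-space box into the fixed 1000x1000 coordinate grid that layout-aware models such as UIE-X expect; the helper below is only an illustrative reconstruction of that behaviour, not the DocParser implementation itself:

def normalize_box(box, image_size, target_size=(1000, 1000)):
    # Rescale [x0, y0, x1, y1] from pixel coordinates into target_size units.
    # Assumption: this mirrors what DocParser._normalize_box does.
    img_w, img_h = image_size
    tgt_w, tgt_h = target_size
    x0, y0, x1, y1 = box
    return [
        int(x0 * tgt_w / img_w),
        int(y0 * tgt_h / img_h),
        int(x1 * tgt_w / img_w),
        int(y1 * tgt_h / img_h),
    ]


assert normalize_box([100, 50, 200, 150], (1000, 500)) == [100, 100, 200, 300]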