Optimize code for DuUIE Baseline (PaddlePaddle#2120)

* Clean code for the DuUIE Baseline
    * Remove `DynamicMultiTaskSSIGenerator`; use `DynamicSSIGenerator` for all data collators
    * Rename `negative_sampler` to `ssi_generator` and rewrite `__call__` for each data collator (a sketch follows below), reducing GPU memory usage to 50%
    * Read training instances from the train config and add negative instances for DuIE
    * Add more comments and rename variables in seq2struct/utils.py and evaluation/seq2record.py for better readability
    * Remove unused arguments and tqdm code in run_seq2struct.py, add grad_clip, and change some argument defaults to match README.md
    * Change BaseStructureMarker into a dataclass

* Fix comment in uie/seq2struct/utils.py
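
As a rough sketch of the collator change above (all class, method, and field names here are hypothetical illustrations, not the baseline's actual API):

``` python
# Minimal sketch of the refactored collator pattern. The `sample` method and
# the dict fields are assumptions for illustration only.
from dataclasses import dataclass
from typing import Any, Dict, List


@dataclass
class SketchDataCollator:
    ssi_generator: Any  # shared SSI sampler, formerly named `negative_sampler`

    def __call__(self, batch: List[Dict]) -> List[Dict]:
        features = []
        for instance in batch:
            # Build the Structural Schema Instructor (SSI) prefix from the
            # instance's spot/asoc types plus sampled negatives, then prepend
            # it to the encoded input.
            ssi_ids = self.ssi_generator.sample(instance["spot"],
                                                instance["asoc"])
            features.append({"input_ids": ssi_ids + instance["input_ids"]})
        return features
```

Keeping a single `DynamicSSIGenerator` shared by every collator means one sampling code path regardless of task, rather than a separate multi-task generator.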

Co-authored-by: Linjie Chen <40840292+linjieccc@users.noreply.github.com>
luyaojie and linjieccc committed May 12, 2022
1 parent d83be59 commit 2730d3c
Showing 7 changed files with 764 additions and 673 deletions.
18 changes: 4 additions & 14 deletions examples/information_extraction/DuUIE/README.md
@@ -120,40 +120,30 @@ python process_data.py preprocess

The processed data will be generated automatically under data/DuUIE_pre; each instance gains three new fields: `spot`, `asoc`, and `spot_asoc`.

In each example of the multi-task training set `train.json`, spot/asoc list all Spot/Asoc types of the corresponding task and are used to generate that task's SSI (Structural Schema Instructor).
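
To make the three fields concrete, a preprocessed instance might look like the sketch below (all values are invented for illustration; only the field layout follows the description above):

``` python
# Hypothetical preprocessed training instance (values invented).
example_instance = {
    "text": "张三毕业于某大学。",
    "spot": ["人物", "组织机构"],  # all Spot types of the task
    "asoc": ["毕业院校"],  # all Asoc types of the task
    "spot_asoc": [  # structure annotation for this text
        {
            "span": "张三",
            "label": "人物",
            "asoc": [["毕业院校", "某大学"]],
        },
    ],
}
```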

### Quick Baseline, Step 2: Multi-Task Model Training

The baseline uses a character-level Chinese pretrained model, [uie-char-small](https://paddlenlp.bj.bcebos.com/models/ccks2022/uie-char-small.zip), built in two training stages: Span Corruption pretraining on 100G of Chinese data, followed by structure-generation pretraining on text-structure data produced by distant supervision.
Download and extract the model, then start multi-task training.

#### Multi-Task Configuration

This example uses a YAML configuration file to specify each task's data sources and evaluation settings; see `config/multi-task-duuie.yaml`.

This example uses a YAML configuration file to specify each task's data sources and evaluation settings; see the multi-task configuration file `config/multi-task-duuie.yaml`.
Training automatically reads each task's required training data according to this file, then runs validation for every task and reports the results.
``` bash
python3 run_seq2struct.py \
--multi_task \
--multi_task_config config/multi-task-duuie.yaml \
--negative_keep 1.0 \
--do_train \
--metric_for_best_model=all-task-ave \
--model_name_or_path=./uie-char-small \
--max_source_length=384 \
--max_prefix_length=-1 \
--max_target_length=192 \
--num_train_epochs=10 \
--train_file=data/duuie_pre/train.json \
--validation_file=data/duuie_pre/val.json \
--record_schema=data/duuie_pre/record.schema \
--per_device_train_batch_size=16 \
--per_device_train_batch_size=32 \
--per_device_eval_batch_size=256 \
--output_dir=output/duuie_multi_task_b32_lr5e-4 \
--logging_dir=output/duuie_multi_task_b32_lr5e-4_log \
--learning_rate=5e-4 \
--seed=42 \
--overwrite_output_dir \
--gradient_accumulation_steps 2
--gradient_accumulation_steps 1
```
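
As a quick sanity check on the batch-size flags (a sketch assuming a single-GPU run):

``` python
# Effective global batch size implied by the updated flags.
per_device_train_batch_size = 32
gradient_accumulation_steps = 1
num_devices = 1  # assumption: single-GPU training
print(per_device_train_batch_size * gradient_accumulation_steps * num_devices)
# -> 32, matching the `b32` in the output directory name
```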

After training completes, the corresponding folder `output/duuie_multi_task_b32_lr5e-4` will be generated.
150 changes: 121 additions & 29 deletions examples/information_extraction/DuUIE/process_data.py
@@ -1,5 +1,6 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
import copy
from typing import List, Dict
from collections import defaultdict
import yaml
@@ -23,6 +24,8 @@ def load_jsonlines_file(filename):


def convert_entity_schema(entity_schema):
""" Convert entity schmea to record schema
"""
spots = list()
asocs = list()
spot_asoc_map = dict()
@@ -33,6 +36,8 @@ def convert_entity_schema(entity_schema):


def convert_entity_relation_schema(entity_schema, relation_schema):
""" Convert entity and relation chmea to record schema
"""
spots = list()
asocs = list()
spot_asoc_map = dict()
@@ -50,6 +55,8 @@ def convert_entity_relation_schema(entity_schema, relation_schema):


def convert_event_schema(schema):
""" Convert event schmea to record schema
"""
spots = list()
asocs = set()
spot_asoc_map = dict()
@@ -373,8 +380,6 @@ def add_spot_asoc_to_single_file(filename):

def convert_duuie_to_spotasoc(data_folder, ignore_datasets):

train_instances = list()
val_instances = list()
schema_list = list()

for task_folder in os.listdir(data_folder):
@@ -405,7 +410,6 @@ def convert_duuie_to_spotasoc(data_folder, ignore_datasets):
new_instance['spot'] = record_schema.type_list
# Add all Asoc types of the task
new_instance['asoc'] = record_schema.role_list
train_instances += [new_instance]

for line in open(
os.path.join(data_folder, task_folder, 'val.json'),
@@ -415,26 +419,12 @@ def convert_duuie_to_spotasoc(data_folder, ignore_datasets):
new_instance['spot'] = record_schema.type_list
# Add all Asoc types of the task
new_instance['asoc'] = record_schema.role_list
val_instances += [new_instance]

# Merge schemas from different tasks
multi_schema = merge_schema(schema_list)
dump_instances(train_instances, os.path.join(data_folder, 'train.json'))
dump_instances(val_instances, os.path.join(data_folder, 'val.json'))
multi_schema.write_to_file(os.path.join(data_folder, 'record.schema'))


def add_spotasoc_to_train(options):
""" Add spot asoc annotation
添加 spot asoc 标注信息
"""
import shutil
shutil.rmtree(options.output_folder) if os.path.exists(
options.output_folder) else None
shutil.copytree(options.train_data, options.output_folder)
convert_duuie_to_spotasoc(options.output_folder, options.ignore_datasets)


def dump_instances(instances, output_filename):
with open(output_filename, 'w', encoding='utf8') as output:
for instance in instances:
@@ -492,42 +482,139 @@ def filter_event(data_folder, event_types, output_folder):
os.path.join(output_folder, f"{split}.json"))


def preprocess_event():
def preprocess_event(data_folder, schema_folder):
""" Preprocessing event dataset for CCKS 2022
针对 CCKS 2022 竞赛数据进行预处理
"""

# Filter event annotations in the raw data: keep only the events required by
# CCKS 2022, i.e. `灾害意外` and `体育竞赛`
for schema in ['灾害意外', '体育竞赛']:
print(f'Building {schema} dataset ...')
data_folder = os.path.join('data', 'duuie', 'DUEE')
schema_file = os.path.join('data', 'seen_schema', f'{schema}.yaml')
output_folder = os.path.join('data', 'duuie', schema)
duee_folder = os.path.join(data_folder, 'DUEE')
schema_file = os.path.join(schema_folder, f'{schema}.yaml')
output_folder = os.path.join(data_folder, schema)
schema = load_definition_schema_file(schema_file)
filter_event(
data_folder=data_folder,
data_folder=duee_folder,
event_types=schema['事件'],
output_folder=output_folder, )

for schema in ['金融信息']:
print(f'Building {schema} dataset ...')
data_folder = os.path.join('data', 'duuie', 'DUEE_FIN_LITE')
schema_file = os.path.join('data', 'seen_schema', f'{schema}.yaml')
output_folder = os.path.join('data', 'duuie', schema)
duee_fin_folder = os.path.join(data_folder, 'DUEE_FIN_LITE')
schema_file = os.path.join(schema_folder, f'{schema}.yaml')
output_folder = os.path.join(data_folder, schema)
schema = load_definition_schema_file(schema_file)
# Split multi-event extraction into multiple single-event-type extractions,
# one per event type
for event_type in schema['事件']:
filter_event(
data_folder=data_folder,
data_folder=duee_fin_folder,
event_types={event_type: schema['事件'][event_type]},
output_folder=output_folder + '_' + event_type, )


def merge_instance(instance_list):
"""Merge instances with same text but different annotation
合并文本相同标记不同的实例
"""

def all_equal(_x):
for __x in _x:
if __x != _x[0]:
return False
return True

def entity_key(_x):
return (tuple(_x['offset']), _x['type'])

def relation_key(_x):
return (
tuple(_x['type']),
tuple(_x['args'][0]['offset']),
_x['args'][0]['type'],
tuple(_x['args'][1]['offset']),
_x['args'][1]['type'], )

def event_key(_x):
return (tuple(_x['offset']), _x['type'])

assert all_equal([x['text'] for x in instance_list])

element_dict = {
'entity': dict(),
'relation': dict(),
'event': dict(),
}
instance_id_list = list()
for x in instance_list:
instance_id_list += [x['id']]
for entity in x.get('entity', list()):
element_dict['entity'][entity_key(entity)] = entity
for relation in x.get('relation', list()):
element_dict['relation'][relation_key(relation)] = relation
for event in x.get('event', list()):
element_dict['event'][event_key(event)] = event

return {
'id': '-'.join(instance_id_list),
'text': instance_list[0]['text'],
'tokens': instance_list[0]['tokens'],
'entity': list(element_dict['entity'].values()),
'relation': list(element_dict['relation'].values()),
'event': list(element_dict['event'].values())
}
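
# Illustrative usage of merge_instance (hypothetical data, not from the repo):
# two instances sharing the same text but with different annotations collapse
# into a single record whose id joins the source ids and whose entity /
# relation / event lists are deduplicated by the key functions above, e.g.
#   a = {'id': '1', 'text': 't', 'tokens': ['t'], 'relation': [r]}
#   b = {'id': '2', 'text': 't', 'tokens': ['t'], 'relation': [r]}
#   merge_instance([a, b])  # -> {'id': '1-2', ..., 'relation': [r]}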


def preprocess_duie(data_folder):
life_folder = os.path.join(data_folder, 'DUIE_LIFE_SPO')
org_folder = os.path.join(data_folder, 'DUIE_ORG_SPO')
life_train_instances = load_jsonlines_file(f"{life_folder}/train.json")
org_train_instances = load_jsonlines_file(f"{org_folder}/train.json")
life_relation = RecordSchema.read_from_file(
f"{life_folder}/record.schema").role_list
org_relation = RecordSchema.read_from_file(
f"{org_folder}/record.schema").role_list

instance_dict = defaultdict(list)
for instance in life_train_instances + org_train_instances:
instance_dict[instance['text']] += [instance]

for text in instance_dict:
instance_dict[text] = merge_instance(instance_dict[text])

with open(f"{life_folder}/train.json", 'w') as output:
for instance in instance_dict.values():
new_instance = copy.deepcopy(instance)
new_instance['relation'] = list(
filter(lambda x: x['type'] in life_relation, instance[
'relation']))
output.write(json.dumps(new_instance) + '\n')

with open(f"{org_folder}/train.json", 'w') as output:
for instance in instance_dict.values():
new_instance = copy.deepcopy(instance)
new_instance['relation'] = list(
filter(lambda x: x['type'] in org_relation, instance[
'relation']))
output.write(json.dumps(new_instance) + '\n')


def preprocess(options):
preprocess_event()
add_spotasoc_to_train(options)
""" Preprocessing event dataset for CCKS 2022
针对 CCKS 2022 竞赛数据进行预处理
"""
import shutil
shutil.rmtree(options.output_folder) if os.path.exists(
options.output_folder) else None
shutil.copytree(options.train_data, options.output_folder)

preprocess_duie(data_folder=options.output_folder)
preprocess_event(
data_folder=options.output_folder, schema_folder=options.schema_folder)
convert_duuie_to_spotasoc(
data_folder=options.output_folder,
ignore_datasets=options.ignore_datasets)


if __name__ == "__main__":
@@ -547,7 +634,12 @@ def preprocess(options):
parser_t.add_argument(
'--ignore_datasets',
default=['DUEE', 'DUEE_FIN_LITE'],
nargs='+',
help='Ignore dataset in `output_folder` for training')
parser_t.add_argument(
'--schema_folder',
default='data/seen_schema',
help='Path for seen schema folder')
parser_t.set_defaults(func=preprocess)

parser_a = subparsers.add_parser(
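
The preprocessing code above leans on `RecordSchema` (`read_from_file`, `write_to_file`, `type_list`, `role_list`). Below is a minimal sketch of the shape those call sites imply; the three-JSON-lines file layout is an assumption, not something this diff confirms:

``` python
# Minimal RecordSchema sketch inferred from the call sites above. The on-disk
# format (three JSON lines: types, roles, type->role map) is an assumption.
import json


class RecordSchema:
    def __init__(self, type_list, role_list, type_role_dict):
        self.type_list = type_list            # Spot types
        self.role_list = role_list            # Asoc types
        self.type_role_dict = type_role_dict  # Spot type -> Asoc roles

    @classmethod
    def read_from_file(cls, filename):
        with open(filename, encoding='utf8') as schema_file:
            lines = schema_file.readlines()
        return cls(
            json.loads(lines[0]), json.loads(lines[1]), json.loads(lines[2]))

    def write_to_file(self, filename):
        with open(filename, 'w', encoding='utf8') as output:
            for field in (self.type_list, self.role_list, self.type_role_dict):
                output.write(json.dumps(field, ensure_ascii=False) + '\n')
```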
