Skip to content

Commit

Permalink
optimize model initialization for UIE (PaddlePaddle#2119)
Browse files Browse the repository at this point in the history
* optimize model initialization for UIE

* Update taskflow.md

* Update taskflow.md

* Update taskflow.md

* Update README.md

* Update README.md

* Update README.md

* Update evaluation

* Update evaluation

* delete unused code
  • Loading branch information
linjieccc committed May 12, 2022
1 parent 5ec8d13 commit 8e29ffb
Show file tree
Hide file tree
Showing 12 changed files with 183 additions and 96 deletions.
33 changes: 14 additions & 19 deletions applications/speech-cmd-analysis/finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,22 +36,20 @@ def do_train():
set_seed(args.seed)

encoding_model = MODEL_MAP[args.model]['encoding_model']
hidden_size = MODEL_MAP[args.model]['hidden_size']
url = MODEL_MAP[args.model]['url']
resource_file_urls = MODEL_MAP[args.model]['resource_file_urls']

for key, val in resource_file_urls.items():
file_path = os.path.join(args.model, key)
if not os.path.exists(file_path):
get_path_from_url(val, args.model)

tokenizer = AutoTokenizer.from_pretrained(encoding_model)
model = UIE(encoding_model, hidden_size)

if args.init_from_ckpt is not None:
pretrained_model_path = args.init_from_ckpt
else:
pretrained_model_path = os.path.join(args.model, "model_state.pdparams")
if not os.path.exists(pretrained_model_path):
get_path_from_url(url, args.model)

state_dict = paddle.load(pretrained_model_path)
model.set_dict(state_dict)
print("Init from: {}".format(pretrained_model_path))
model = UIE.from_pretrained(args.model)

if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
state_dict = paddle.load(args.init_from_ckpt)
model.set_dict(state_dict)

if paddle.distributed.get_world_size() > 1:
model = paddle.DataParallel(model)

Expand Down Expand Up @@ -121,8 +119,7 @@ def do_train():
save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
if not os.path.exists(save_dir):
os.makedirs(save_dir)
save_param_path = os.path.join(save_dir, "model_state.pdparams")
paddle.save(model.state_dict(), save_param_path)
model.save_pretrained(save_dir)

precision, recall, f1 = evaluate(model, metric, dev_data_loader)
print("Evaluation precision: %.5f, recall: %.5f, F1: %.5f" %
Expand All @@ -133,9 +130,7 @@ def do_train():
)
best_f1 = f1
save_dir = os.path.join(args.save_dir, "model_best")
save_best_param_path = os.path.join(save_dir,
"model_state.pdparams")
paddle.save(model.state_dict(), save_best_param_path)
model.save_pretrained(save_dir)
tic_train = time.time()


Expand Down
9 changes: 5 additions & 4 deletions applications/speech-cmd-analysis/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,14 @@

import paddle
import paddle.nn as nn
from paddlenlp.transformers import AutoModel
from paddlenlp.transformers import ErniePretrainedModel


class UIE(nn.Layer):
def __init__(self, encoding_model, hidden_size):
class UIE(ErniePretrainedModel):
def __init__(self, encoding_model):
super(UIE, self).__init__()
self.encoder = AutoModel.from_pretrained(encoding_model)
self.encoder = encoding_model
hidden_size = self.encoder.config["hidden_size"]
self.linear_start = paddle.nn.Linear(hidden_size, 1)
self.linear_end = paddle.nn.Linear(hidden_size, 1)
self.sigmoid = nn.Sigmoid()
Expand Down
18 changes: 12 additions & 6 deletions applications/speech-cmd-analysis/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,21 @@
MODEL_MAP = {
"uie-base": {
"encoding_model": "ernie-3.0-base-zh",
"hidden_size": 768,
"url":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/model_state.pdparams"
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/model_config.json"
}
},
"uie-tiny": {
"encoding_model": "ernie-3.0-medium-zh",
"hidden_size": 768,
"url":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny/model_state.pdparams"
"resource_file_urls": {
"model_state.pdparams":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny/model_state.pdparams",
"model_config.json":
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny/model_config.json"
}
},
}

Expand Down
44 changes: 35 additions & 9 deletions docs/model_zoo/taskflow.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,13 @@ PaddleNLP提供**开箱即用**的产业级NLP预置任务能力,无需训练
| :--------------------------------- | -------------------------------- | -------- | -------- | -------- | ---------- | ---------- | ------------------------------------------------------ |
| [中文分词](#中文分词) | `Taskflow("word_segmentation")` |||||| 多种分词模式,满足快速切分和实体粒度精准切分 |
| [词性标注](#词性标注) | `Taskflow("pos_tagging")` |||||| 基于百度前沿词法分析工具LAC |
| [命名实体识别](#命名实体识别) | `Taskflow("ner")` |||||| 覆盖最全中文实体标签 |
| [依存句法分析](#依存句法分析) | `Taskflow("dependency_parsing")` |||| || 基于最大规模中文依存句法树库研发的DDParser |
| [信息抽取](#信息抽取) | `Taskflow("information_extraction")` |||||| 适配多场景的开放域通用信息抽取工具 |
| [『解语』-知识标注](#解语知识标注) | `Taskflow("knowledge_mining")` |||||| 覆盖所有中文词汇的知识标注工具 |
| [命名实体识别](#命名实体识别) | `Taskflow("ner")` |||||| 覆盖最全中文实体标签 |
| [依存句法分析](#依存句法分析) | `Taskflow("dependency_parsing")` |||| || 基于最大规模中文依存句法树库研发的DDParser |
| [信息抽取](#信息抽取) | `Taskflow("information_extraction")`|||||| 适配多场景的开放域通用信息抽取工具 |
| [『解语』-知识标注](#解语知识标注) | `Taskflow("knowledge_mining")` |||||| 覆盖所有中文词汇的知识标注工具 |
| [文本纠错](#文本纠错) | `Taskflow("text_correction")` |||||| 融合拼音特征的端到端文本纠错模型ERNIE-CSC |
| [文本相似度](#文本相似度) | `Taskflow("text_similarity")` |||| | | 基于百度知道2200万对相似句组训练 |
| [情感倾向分析](#情感倾向分析) | `Taskflow("sentiment_analysis")` |||| || 基于情感知识增强预训练模型SKEP达到业界SOTA |
| [情感倾向分析](#情感倾向分析) | `Taskflow("sentiment_analysis")` |||| || 基于情感知识增强预训练模型SKEP达到业界SOTA |
| [生成式问答](#生成式问答) | `Taskflow("question_answering")` |||| | | 使用最大中文开源CPM模型完成问答 |
| [智能写诗](#智能写诗) | `Taskflow("poetry_generation")` |||| | | 使用最大中文开源CPM模型完成写诗 |
| [开放域对话](#开放域对话) | `Taskflow("dialogue")` |||| | | 十亿级语料训练最强中文闲聊模型PLATO-Mini,支持多轮对话 |
Expand Down Expand Up @@ -415,12 +415,24 @@ from paddlenlp import Taskflow
预测:

```python
>>> from pprint import pprint
>>> from paddlenlp import Taskflow

>>> schema = ['时间', '选手', '赛事名称'] # Define the schema for entity extraction
>>> ie = Taskflow('information_extraction', schema=schema)
>>> ie("2月8日上午北京冬奥会自由式滑雪女子大跳台决赛中中国选手谷爱凌以188.25分获得金牌!")
[{'时间': [{'text': '2月8日上午', 'start': 0, 'end': 6, 'probability': 0.9907337794563702}], '选手': [{'text': '谷爱凌', 'start': 28, 'end': 31, 'probability': 0.8914310308098763}], '赛事名称': [{'text': '北京冬奥会自由式滑雪女子大跳台决赛', 'start': 6, 'end': 23, 'probability': 0.8944207860063003}]}]
>>> pprint(ie("2月8日上午北京冬奥会自由式滑雪女子大跳台决赛中中国选手谷爱凌以188.25分获得金牌!")) # Better print results using pprint
[{'时间': [{'end': 6,
'probability': 0.9907337794563702,
'start': 0,
'text': '2月8日上午'}],
'赛事名称': [{'end': 23,
'probability': 0.8944205558197353,
'start': 6,
'text': '北京冬奥会自由式滑雪女子大跳台决赛'}],
'选手': [{'end': 31,
'probability': 0.8914297225026147,
'start': 28,
'text': '谷爱凌'}]}]
```

- 关系抽取
Expand Down Expand Up @@ -486,6 +498,7 @@ from paddlenlp import Taskflow
[{'评价维度': [{'text': '管理', 'start': 4, 'end': 6, 'probability': 0.8902373594544031, 'relations': {'观点词': [{'text': '混乱', 'start': 7, 'end': 9, 'probability': 0.9993566520321409}]}}, {'text': '票价', 'start': 11, 'end': 13, 'probability': 0.9856116411308662, 'relations': {'观点词': [{'text': '', 'start': 14, 'end': 15, 'probability': 0.995628420935013}]}}]}]
```


- 情感倾向分类

句子级情感倾向分类,即判断句子的情感倾向是“正向”还是“负向”,schema构造如下:
Expand All @@ -511,11 +524,24 @@ from paddlenlp import Taskflow
['寺庙', {'丈夫': '妻子'}]
```

预测:

```python
>>> schema = ['寺庙', {'丈夫': '妻子'}]
>>> ie.set_schema(schema)
>>> ie('李治即位后,让身在感业寺的武则天续起头发,重新纳入后宫。')
[{'寺庙': [{'text': '感业寺', 'start': 9, 'end': 12, 'probability': 0.998334669586864}], '丈夫': [{'text': '李治', 'start': 0, 'end': 2, 'probability': 0.993496447299993, 'relations': {'妻子': [{'text': '武则天', 'start': 13, 'end': 16, 'probability': 0.9994008822614759}]}}]}]
>>> pprint(ie('李治即位后,让身在感业寺的武则天续起头发,重新纳入后宫。')) # Better print results using pprint
[{'丈夫': [{'end': 2,
'probability': 0.993496447299993,
'relations': {'妻子': [{'end': 16,
'probability': 0.9994008822614759,
'start': 13,
'text': '武则天'}]},
'start': 0,
'text': '李治'}],
'寺庙': [{'end': 12,
'probability': 0.998334669586864,
'start': 9,
'text': '感业寺'}]}]
```


Expand Down
51 changes: 42 additions & 9 deletions examples/information_extraction/uie/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,24 @@ UIE可以从自然语言文本中,抽取出结构化的关键字段信息,
```paddlenlp.Taskflow```提供通用信息抽取、评价观点抽取等能力,可抽取多种类型的信息,包括但不限于命名实体识别(如人名、地名、机构名等)、关系(如电影的导演、歌曲的发行时间等)、事件(如某路口发生车祸、某地发生地震等)、以及评价维度、观点词、情感倾向等信息。用户可以使用自然语言自定义抽取目标,无需训练即可统一抽取输入文本中的对应信息。**实现开箱即用,并满足各类信息抽取需求**

```python
>>> from pprint import pprint
>>> from paddlenlp import Taskflow

>>> schema = ['时间', '选手', '赛事名称'] # Define the schema for entity extraction
>>> ie = Taskflow('information_extraction', schema=schema)
>>> ie("2月8日上午北京冬奥会自由式滑雪女子大跳台决赛中中国选手谷爱凌以188.25分获得金牌!")
[{'时间': [{'text': '2月8日上午', 'start': 0, 'end': 6, 'probability': 0.9907337794563702}], '选手': [{'text': '谷爱凌', 'start': 28, 'end': 31, 'probability': 0.8914310308098763}], '赛事名称': [{'text': '北京冬奥会自由式滑雪女子大跳台决赛', 'start': 6, 'end': 23, 'probability': 0.8944207860063003}]}]
>>> pprint(ie("2月8日上午北京冬奥会自由式滑雪女子大跳台决赛中中国选手谷爱凌以188.25分获得金牌!"))
[{'时间': [{'end': 6,
'probability': 0.9907337794563702,
'start': 0,
'text': '2月8日上午'}],
'赛事名称': [{'end': 23,
'probability': 0.8944205558197353,
'start': 6,
'text': '北京冬奥会自由式滑雪女子大跳台决赛'}],
'选手': [{'end': 31,
'probability': 0.8914297225026147,
'start': 28,
'text': '谷爱凌'}]}]
```

更多不同任务的使用方法请参考[Taskflow信息抽取](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/docs/model_zoo/taskflow.md#%E4%BF%A1%E6%81%AF%E6%8A%BD%E5%8F%96)
Expand Down Expand Up @@ -168,45 +180,66 @@ python finetune.py \
- `batch_size`: 批处理大小,请结合显存情况进行调整,若出现显存不足,请适当调低这一参数,默认为16。
- `max_seq_len`: 文本最大切分长度,输入超过最大长度时会对输入文本进行自动切分,默认为512。
- `num_epochs`: 训练轮数,默认为100。
- `model`: 选择模型,程序会基于选择的模型进行模型微调,可选有`uie-base``uie-tiny`
- `model`: 选择模型,程序会基于选择的模型进行模型微调,可选有`uie-base``uie-tiny`,默认为`uie-base`
- `seed`: 随机种子,默认为1000。
- `logging_steps`: 日志打印的间隔steps数,默认10。
- `valid_steps`: evaluate的间隔steps数,默认100。
- `device`: 选用什么设备进行训练,可选cpu或gpu。

模型选择:

| 模型 | 结构 |
| :---: | :--------: |
| `uie-tiny`| 6-layers, 768-hidden, 12-heads |
| `uie-base` (默认)| 12-layers, 768-hidden, 12-heads |

#### 模型评估

通过运行以下命令进行模型评估:

```shell
python evaluate.py \
--model_path "./checkpoint/model_best/model_state.pdparams" \
--model_path "./checkpoint/model_best" \
--test_path "./data/dev.txt" \
--model "uie-base" \
--batch_size 16 \
--max_seq_len 512
```

可配置参数说明:

- `model_path`: 进行评估的模型权重文件
- `model_path`: 进行评估的模型文件夹路径,路径下需包含模型权重文件`model_state.pdparams`及配置文件`model_config.json`
- `test_path`: 进行评估的测试集文件。
- `batch_size`: 批处理大小,请结合显存情况进行调整,若出现显存不足,请适当调低这一参数,默认为16。
- `max_seq_len`: 文本最大切分长度,输入超过最大长度时会对输入文本进行自动切分,默认为512。
- `model`: 进行效果评估的模型类型,可选有`uie-base``uie-tiny`

#### 定制模型一键预测

`paddlenlp.Taskflow`装载定制模型,通过`task_path`指定模型权重文件的路径,路径下需包含训练好的模型权重文件`model_state.pdparams`及配置文件`model_config.json`。

```python
>>> from pprint import pprint
>>> from paddlenlp import Taskflow

>>> schema = ['出发地', '目的地', '费用', '时间']
# 设定抽取目标和定制化模型权重路径
>>> my_ie = Taskflow("information_extraction", schema=schema, task_path='./checkpoint/model_best')
>>> my_ie("城市内交通费7月5日金额114广州至佛山")
[{'出发地': [{'text': '广州', 'start': 15, 'end': 17, 'probability': 0.9975287467835301}], '目的地': [{'text': '佛山', 'start': 18, 'end': 20, 'probability': 0.9998511131226735}], '费用': [{'text': '114', 'start': 12, 'end': 15, 'probability': 0.9994474579292856}], '时间': [{'text': '7月5日', 'start': 6, 'end': 10, 'probability': 0.9999476678061399}]}]
>>> pprint(my_ie("城市内交通费7月5日金额114广州至佛山"))
[{'出发地': [{'end': 17,
'probability': 0.9975287467835301,
'start': 15,
'text': '广州'}],
'时间': [{'end': 10,
'probability': 0.9999476678061399,
'start': 6,
'text': '7月5日'}],
'目的地': [{'end': 20,
'probability': 0.9998511131226735,
'start': 18,
'text': '佛山'}],
'费用': [{'end': 15,
'probability': 0.9994474579292856,
'start': 12,
'text': '114'}]}]
```

#### Few-Shot实验
Expand Down
7 changes: 2 additions & 5 deletions examples/information_extraction/uie/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,10 @@ def evaluate(model, metric, data_loader):

def do_eval():
encoding_model = MODEL_MAP[args.model]['encoding_model']
hidden_size = MODEL_MAP[args.model]['hidden_size']
resource_file_urls = MODEL_MAP[args.model]['resource_file_urls']

tokenizer = AutoTokenizer.from_pretrained(encoding_model)
model = UIE(encoding_model, hidden_size)
state_dict = paddle.load(args.model_path)
model.load_dict(state_dict)
model = UIE.from_pretrained(args.model_path)

test_ds = load_dataset(
reader,
Expand Down Expand Up @@ -87,7 +85,6 @@ def do_eval():
parser.add_argument('--test_path', type=str, default=None, help="The path of test set.")
parser.add_argument("--batch_size", type=int, default=16, help="Batch size per GPU/CPU for training.")
parser.add_argument("--max_seq_len", type=int, default=512, help="The maximum total input sequence length after tokenization.")
parser.add_argument("--model", choices=["uie-base", "uie-tiny"], default="uie-base", type=str, help="Select the pretrained model for few-shot learning.")

args = parser.parse_args()
# yapf: enable
Expand Down
Loading

0 comments on commit 8e29ffb

Please sign in to comment.