Add model compression API #2777

Merged · 5 commits · Aug 10, 2022
75 changes: 23 additions & 52 deletions model_zoo/ernie-3.0/compress_qa.py
@@ -14,69 +14,45 @@

import os
import sys
import yaml
from functools import partial
import distutils.util
import os.path as osp
from typing import Optional

import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import paddlenlp
from paddlenlp.data import DataCollatorWithPadding

from paddlenlp.trainer import (
    PdArgumentParser,
    TrainingArguments,
    Trainer,
)
from paddlenlp.data import DataCollatorWithPadding
from paddlenlp.trainer import PdArgumentParser, CompressionArguments, Trainer
from paddlenlp.trainer import EvalPrediction, get_last_checkpoint
from paddlenlp.transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
)
from compress_trainer import CompressConfig, PTQConfig
from paddlenlp.transformers import AutoTokenizer, AutoModelForQuestionAnswering
from paddlenlp.utils.log import logger
from datasets import load_metric, load_dataset

sys.path.append("../ernie-1.0/finetune")

Review comment: Why is there an ernie-1.0 path here?

from question_answering import (
    QuestionAnsweringTrainer,
    CrossEntropyLossForSQuAD,
    prepare_train_features,
    prepare_validation_features,
)
from utils import (
    ALL_DATASETS,
    DataArguments,
    ModelArguments,
)
from question_answering import QuestionAnsweringTrainer, CrossEntropyLossForSQuAD, prepare_train_features, prepare_validation_features
@tianxin1860 · Jul 12, 2022:

A question here: why hasn't a downstream-task Trainer implementation such as QuestionAnsweringTrainer been moved into the framework, instead of being kept under the model_zoo/ernie-1.0 directory? @wawltor

Collaborator:

It could be moved into the framework; the main reason it was not placed there is that we followed huggingface's approach.

The likely considerations at the time were:

  1. It gives users an example of how to adapt the Trainer code for their own tasks.
  2. Whether it is general enough to belong in the framework.

from utils import ALL_DATASETS, DataArguments, ModelArguments

A question: why haven't these three data types, ALL_DATASETS, DataArguments, and ModelArguments, been moved into the Trainer framework, instead of staying in ernie1.0/finetune/utils? @ZHUI

Collaborator:

These are custom, user-defined things.

Collaborator:

They depend heavily on the data and the task type. Here the ernie-3.0 and ernie-1.0 tasks are similar enough to share them, but for other models they may not apply.

OK, got it.



def main():
    parser = PdArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
        (ModelArguments, DataArguments, CompressionArguments))
    model_args, data_args, compression_args = parser.parse_args_into_dataclasses(
    )

    paddle.set_device(training_args.device)
    paddle.set_device(compression_args.device)
    data_args.dataset = data_args.dataset.strip()

    if data_args.dataset in ALL_DATASETS:
        # if you customize your hyper-parameters in the yaml config, it will overwrite all args.
        config = ALL_DATASETS[data_args.dataset]
        for args in (model_args, data_args, training_args):
        for args in (model_args, data_args, compression_args):
            for arg in vars(args):
                if arg in config.keys():
                    setattr(args, arg, config[arg])
Comment on lines 42 to 48:

Could this block of logic get an explanatory comment?

Contributor (Author):

There is already a comment on line 43: if a custom yaml config file is provided, it overwrites the arguments passed in via args.
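For anyone else reading this thread: below is a minimal, self-contained sketch of that override behaviour. The toy dataclass and the config values are illustrative placeholders only; the real entries live in the ALL_DATASETS mapping imported from the ernie-1.0 finetune utils.

```python
# A toy reproduction of the override loop in this diff. The dataclass and the
# config dict are illustrative placeholders, not real ALL_DATASETS entries.
from dataclasses import dataclass


@dataclass
class ToyCompressionArguments:
    learning_rate: float = 5e-5
    num_train_epochs: int = 3


# Pretend this came from a yaml entry in ALL_DATASETS for the chosen dataset.
config = {"learning_rate": 3e-5, "num_train_epochs": 8, "batch_size": 32}
args = ToyCompressionArguments()

# Same pattern as the diff: every attribute of the parsed argument object that
# also appears as a key in the yaml-derived config is overwritten.
for arg in vars(args):
    if arg in config.keys():
        setattr(args, arg, config[arg])

print(args)  # ToyCompressionArguments(learning_rate=3e-05, num_train_epochs=8)
```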


        training_args.per_device_train_batch_size = config["batch_size"]
        training_args.per_device_eval_batch_size = config["batch_size"]
        compression_args.per_device_train_batch_size = config["batch_size"]
        compression_args.per_device_eval_batch_size = config["batch_size"]

    # Log model and data config
    training_args.print_config(model_args, "Model")
    training_args.print_config(data_args, "Data")
    compression_args.print_config(model_args, "Model")
    compression_args.print_config(data_args, "Data")

    dataset_config = data_args.dataset.split(" ")
    raw_datasets = load_dataset(
@@ -102,7 +78,7 @@ def main():

    train_dataset = raw_datasets["train"]
    # Create train feature from dataset
    with training_args.main_process_first(
    with compression_args.main_process_first(
            desc="train dataset map pre-processing"):
        # Dataset pre-process
        train_dataset = train_dataset.map(
@@ -115,7 +91,7 @@ def main():
            desc="Running tokenizer on train dataset",
        )
    eval_examples = raw_datasets["validation"]
    with training_args.main_process_first(
    with compression_args.main_process_first(
            desc="evaluate dataset map pre-processing"):
        eval_dataset = eval_examples.map(
            partial(prepare_validation_features,
@@ -151,25 +127,20 @@ def post_processing_function(examples, features, predictions, stage="eval"):

    trainer = QuestionAnsweringTrainer(
        model=model,
        args=training_args,
        args=compression_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        eval_examples=eval_examples,
        data_collator=data_collator,
        post_process_function=post_processing_function,
        tokenizer=tokenizer)

    output_dir = os.path.join(model_args.model_name_or_path, "compress")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    prune = True
    compress_config = CompressConfig(quantization_config=PTQConfig(
        algo_list=['hist', 'mse'], batch_size_list=[4, 8, 16]))
    trainer.compress(output_dir,
                     pruning=prune,
                     quantization=True,
                     compress_config=compress_config)
    if not os.path.exists(compression_args.output_dir):
        os.makedirs(compression_args.output_dir)

    compression_args.print_config()

    trainer.compress()


if __name__ == "__main__":
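Taken together, the change in this file amounts to replacing the in-code CompressConfig / PTQConfig objects with CompressionArguments parsed from the command line. The juxtaposition below is assembled only from the removed and added lines shown above; it is a summary of the diff, not a standalone script.

```python
# Before this PR (removed lines): compression was configured in code.
output_dir = os.path.join(model_args.model_name_or_path, "compress")
compress_config = CompressConfig(quantization_config=PTQConfig(
    algo_list=['hist', 'mse'], batch_size_list=[4, 8, 16]))
trainer.compress(output_dir,
                 pruning=True,
                 quantization=True,
                 compress_config=compress_config)

# After this PR (added lines): compression options travel in CompressionArguments,
# so the call site collapses to a single parameterless call.
model_args, data_args, compression_args = parser.parse_args_into_dataclasses()
compression_args.print_config()
trainer.compress()
```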
82 changes: 31 additions & 51 deletions model_zoo/ernie-3.0/compress_seq_cls.py
@@ -14,61 +14,46 @@

import os
import sys
import yaml
from functools import partial

import paddle

from paddlenlp.data import DataCollatorWithPadding
from paddlenlp.datasets import load_dataset
from paddlenlp.trainer import (
    PdArgumentParser,
    TrainingArguments,
    Trainer,
)

from paddlenlp.transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
)
from paddlenlp.trainer import PdArgumentParser, Trainer, CompressionArguments
from paddlenlp.transformers import AutoTokenizer, AutoModelForSequenceClassification
from paddlenlp.utils.log import logger

from compress_trainer import CompressConfig, PTQConfig

sys.path.append("../ernie-1.0/finetune")

from sequence_classification import seq_trans_fn, clue_trans_fn
from utils import (
    ALL_DATASETS,
    DataArguments,
    ModelArguments,
)
from utils import ALL_DATASETS, DataArguments, ModelArguments


def main():
    parser = PdArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
        (ModelArguments, DataArguments, CompressionArguments))
    model_args, data_args, compression_args = parser.parse_args_into_dataclasses(
    )

    paddle.set_device(training_args.device)
    paddle.set_device(compression_args.device)

    data_args.dataset = data_args.dataset.strip()

    if data_args.dataset in ALL_DATASETS:
        # if you customize your hyper-parameters in the yaml config, it will overwrite all args.
        # If you customize your hyper-parameters in the yaml config, it will overwrite all args.
        config = ALL_DATASETS[data_args.dataset]
        logger.info("Over-writing training config by yaml config!")
        for args in (model_args, data_args, training_args):
        logger.info("Over-writing compression config by yaml config!")
        for args in (model_args, data_args, compression_args):
            for arg in vars(args):
                if arg in config.keys():
                    setattr(args, arg, config[arg])

        training_args.per_device_train_batch_size = config["batch_size"]
        training_args.per_device_eval_batch_size = config["batch_size"]
        compression_args.per_device_train_batch_size = config["batch_size"]
        compression_args.per_device_eval_batch_size = config["batch_size"]

    # Log model and data config
    training_args.print_config(model_args, "Model")
    training_args.print_config(data_args, "Data")
    compression_args.print_config(model_args, "Model")
    compression_args.print_config(data_args, "Data")

    dataset_config = data_args.dataset.split(" ")
    raw_datasets = load_dataset(
@@ -81,42 +66,37 @@ def main():
        raw_datasets['train'].label_list)

    criterion = paddle.nn.CrossEntropyLoss()
    # Define tokenizer, model, loss function.
    # Defines tokenizer, model, loss function.
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path, num_classes=num_classes)

    # Define dataset pre-process function
    # Defines dataset pre-process function
    if "clue" in data_args.dataset:
        trans_fn = partial(clue_trans_fn, tokenizer=tokenizer, args=data_args)
    else:
        trans_fn = partial(seq_trans_fn, tokenizer=tokenizer, args=data_args)

    # Define data collector
    # Defines data collector
    data_collator = DataCollatorWithPadding(tokenizer)

    train_dataset = raw_datasets["train"].map(trans_fn)
    eval_dataset = raw_datasets["dev"].map(trans_fn)

    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      tokenizer=tokenizer,
                      criterion=criterion)

    output_dir = os.path.join(model_args.model_name_or_path, "compress")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    compress_config = CompressConfig(quantization_config=PTQConfig(
        algo_list=['hist', 'mse'], batch_size_list=[4, 8, 16]))

    trainer.compress(output_dir,
                     pruning=True,
                     quantization=True,
                     compress_config=compress_config)
    trainer = Trainer(
        model=model,
        args=compression_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        criterion=criterion)  # The `dynabert` strategy requires the `criterion` argument

    compression_args.print_config()

    if not os.path.exists(compression_args.output_dir):
        os.makedirs(compression_args.output_dir)

    trainer.compress()


if __name__ == "__main__":
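With the configuration moved into CompressionArguments, the script is now driven entirely from the command line. Below is a hypothetical launch sketch: the flag names mirror the dataclass fields used above (dataset, model_name_or_path, output_dir), but the values, and any additional CompressionArguments flags, are assumptions to verify against the dataclass definitions.

```python
# Hypothetical invocation of the updated script; all values are placeholders.
import subprocess

subprocess.run(
    [
        "python", "compress_seq_cls.py",
        "--dataset", "clue tnews",                      # assumed CLUE sub-task, cf. clue_trans_fn above
        "--model_name_or_path", "./best_tnews_model",   # assumed fine-tuned checkpoint directory
        "--output_dir", "./compressed_tnews_model",     # where trainer.compress() writes its output
    ],
    check=True,
)
```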