This repository has been archived by the owner on Aug 10, 2023. It is now read-only.

August 2023 update
hfxunlp committed Aug 10, 2023
1 parent 5494a8b commit 635c75c
Showing 404 changed files with 7,343 additions and 3,372 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -100,7 +100,7 @@ Tokenized case-sensitive BLEU measured with [multi-bleu.perl](https://github.com

## Acknowledgments

-Hongfei Xu enjoys a doctoral grant from [China Scholarship Council](https://www.csc.edu.cn/) ([2018]3101, 201807040056) while maintaining this project.
+Hongfei Xu is partially supported by the Education Department of Henan Province (Grant No. 232300421386) while maintaining this project.

Details of this project can be found [here](https://arxiv.org/abs/1903.07402), and please cite it if you enjoy the implementation :)

29 changes: 15 additions & 14 deletions adv/eva/eva_probe.py
@@ -1,24 +1,22 @@
#encoding: utf-8

import sys

import torch

-from utils.tqdm import tqdm
-
-from utils.h5serial import h5File
-
-import cnfg.probe as cnfg
-from cnfg.ihyp import *
-
-from transformer.Probe.NMT import NMT
from loss.base import LabelSmoothingLoss
from parallel.base import DataParallelCriterion
from parallel.parallelMT import DataParallelMT
-
-from utils.base import *
-from utils.fmt.base4torch import parse_cuda
+from transformer.Probe.NMT import NMT
+from utils.base import set_random_seed
+from utils.fmt.base4torch import parse_cuda
+from utils.h5serial import h5File
+from utils.io import load_model_cpu
+from utils.torch.comp import torch_autocast, torch_compile, torch_inference_mode
+from utils.tqdm import tqdm
+
+import cnfg.probe as cnfg
+from cnfg.ihyp import *
+from cnfg.vocab.base import pad_id

probe_reorder = cnfg.probe_reorder
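Throughout this commit, raw `torch.no_grad()` / `autocast` calls are replaced by `torch_inference_mode` / `torch_autocast` from `utils.torch.comp`, and models are passed through `torch_compile`. Presumably this module centralizes PyTorch version checks so the scripts run unchanged on old and new releases; a minimal sketch of such a shim, assuming that design (the repository's actual module may differ):

```python
# Hedged sketch of a utils/torch/comp.py-style compatibility layer; the
# names match this commit's imports, but the bodies are assumptions.
import torch

# torch.inference_mode (PyTorch >= 1.9) is a stricter, faster variant of
# no_grad; fall back to no_grad on older releases.
torch_inference_mode = getattr(torch, "inference_mode", torch.no_grad)

if hasattr(torch, "autocast"):
	# device-generic AMP context manager (PyTorch >= 1.10)
	def torch_autocast(enabled=True, **kwargs):
		return torch.autocast("cuda", enabled=enabled, **kwargs)
else:
	from torch.cuda.amp import autocast as torch_autocast

# torch.compile exists only in PyTorch >= 2.0; degrade to a no-op.
torch_compile = getattr(torch, "compile", lambda model, *args, **kwargs: model)
```

With the shims in one place, the evaluation scripts need no version guards of their own.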

@@ -37,7 +35,7 @@ def eva(ed, nd, model, lossf, mv_device, multi_gpu, use_amp=False):
sum_loss = 0.0
model.eval()
src_grp, tgt_grp = ed["src"], ed["tgt"]
-with torch.no_grad():
+with torch_inference_mode():
for i in tqdm(range(nd), mininterval=tqdm_mininterval):
bid = str(i)
seq_batch = torch.from_numpy(src_grp[bid][()])
@@ -48,7 +46,7 @@ def eva(ed, nd, model, lossf, mv_device, multi_gpu, use_amp=False):
seq_o = seq_o.to(mv_device, non_blocking=True)
seq_batch, seq_o = seq_batch.long(), seq_o.long()
ot = seq_o.narrow(1, ind_shift, lo).contiguous()
-with autocast(enabled=use_amp):
+with torch_autocast(enabled=use_amp):
output = model(seq_batch, seq_o.narrow(1, 0, lo))
loss = lossf(output, ot)
if multi_gpu:
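In the loop above, `seq_o.narrow(1, 0, lo)` is the teacher-forcing decoder input and `ot = seq_o.narrow(1, ind_shift, lo)` the prediction target, i.e. the same gold sequence read at two offsets. A toy illustration of the split (a shift of 1 is assumed for clarity; `ind_shift` may differ in the probe setting):

```python
import torch

# narrow(dim, start, length) takes a contiguous slice along dim.
seq_o = torch.tensor([[0, 5, 9, 3, 2]])  # <s> w1 w2 w3 </s>
lo = seq_o.size(1) - 1
dec_input = seq_o.narrow(1, 0, lo)  # [[0, 5, 9, 3]]: history fed to the decoder
target = seq_o.narrow(1, 1, lo)     # [[5, 9, 3, 2]]: one-step-ahead gold tokens
```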
@@ -71,7 +69,7 @@ def eva(ed, nd, model, lossf, mv_device, multi_gpu, use_amp=False):
nword = td["nword"][()].tolist()
nwordi, nwordt = nword[0], nword[-1]

-mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes, cnfg.num_layer_fwd)
+mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.act_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes, cnfg.num_layer_fwd)

mymodel = load_model_cpu(sys.argv[2], mymodel)
mymodel.apply(load_fixing)
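Nearly every constructor call in this commit gains an `act_drop` argument alongside `attn_drop`, wiring a separate dropout rate for the feed-forward activations through `cnfg`. A hedged sketch of where such a rate usually sits in a Transformer position-wise FFN (an illustrative module, not the repository's own):

```python
from torch import nn

class PositionwiseFF(nn.Module):

	def __init__(self, isize, hsize, dropout=0.1, act_drop=0.1):
		super().__init__()
		self.net = nn.Sequential(
			nn.Linear(isize, hsize),
			nn.ReLU(inplace=True),
			nn.Dropout(act_drop),  # the new knob: dropout on the hidden activation
			nn.Linear(hsize, isize),
			nn.Dropout(dropout),   # the pre-existing dropout on the sublayer output
		)

	def forward(self, x):
		return x + self.net(x)
```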
@@ -100,6 +98,9 @@ def eva(ed, nd, model, lossf, mv_device, multi_gpu, use_amp=False):
mymodel = DataParallelMT(mymodel, device_ids=cuda_devices, output_device=cuda_device.index, host_replicate=True, gather_output=False)
lossf = DataParallelCriterion(lossf, device_ids=cuda_devices, output_device=cuda_device.index, replicate_once=True)

+mymodel = torch_compile(mymodel, *torch_compile_args, **torch_compile_kwargs)
+lossf = torch_compile(lossf, *torch_compile_args, **torch_compile_kwargs)
+
use_amp = cnfg.use_amp and use_cuda

vloss, vprec = eva(td, ntest, mymodel, lossf, cuda_device, multi_gpu, use_amp)
39 changes: 18 additions & 21 deletions adv/eva/prompt/roberta/eva_single.py
@@ -1,25 +1,23 @@
#encoding: utf-8

import sys

import torch

-from utils.tqdm import tqdm
-
-from utils.h5serial import h5File
-
-import cnfg.prompt.roberta.base as cnfg
-from cnfg.prompt.roberta.ihyp import *
-from cnfg.vocab.plm.roberta import vocab_size
-
-from transformer.Prompt.RoBERTa.NMT import NMT
from loss.base import NLLLoss
from parallel.base import DataParallelCriterion
from parallel.parallelMT import DataParallelMT
-
-from utils.base import *
+from transformer.Prompt.RoBERTa.NMT import NMT
+from utils.base import set_random_seed
from utils.fmt.base4torch import parse_cuda
from utils.fmt.plm.base import fix_parameter_name
+from utils.h5serial import h5File
+from utils.io import load_model_cpu
+from utils.torch.comp import torch_autocast, torch_compile, torch_inference_mode
+from utils.tqdm import tqdm
+
+import cnfg.prompt.roberta.base as cnfg
+from cnfg.prompt.roberta.ihyp import *
+from cnfg.vocab.plm.roberta import vocab_size

def load_fixing(module):

@@ -31,7 +29,7 @@ def eva(ed, nd, model, lossf, mv_device, multi_gpu, use_amp=False):
sum_loss = 0.0
model.eval()
src_grp, tgt_grp = ed["src"], ed["tgt"]
-with torch.no_grad():
+with torch_inference_mode():
for i in tqdm(range(nd), mininterval=tqdm_mininterval):
bid = str(i)
seq_batch = torch.from_numpy(src_grp[bid][()])
@@ -40,7 +38,7 @@ def eva(ed, nd, model, lossf, mv_device, multi_gpu, use_amp=False):
seq_batch = seq_batch.to(mv_device, non_blocking=True)
seq_o = seq_o.to(mv_device, non_blocking=True)
seq_batch, seq_o = seq_batch.long(), seq_o.long()
-with autocast(enabled=use_amp):
+with torch_autocast(enabled=use_amp):
output = model(seq_batch)
loss = lossf(output, seq_o)
if multi_gpu:
@@ -55,12 +53,9 @@ def eva(ed, nd, model, lossf, mv_device, multi_gpu, use_amp=False):
w = float(w)
return sum_loss / w, (w - r) / w * 100.0

-td = h5File(sys.argv[1], "r")
-
-ntest = td["ndata"][()].item()
nwordi = nwordt = vocab_size

-mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, fhsize=cnfg.ff_hsize, dropout=cnfg.drop, attn_drop=cnfg.attn_drop, global_emb=cnfg.share_emb, num_head=cnfg.nhead, xseql=cache_len_default, ahsize=cnfg.attn_hsize, norm_output=cnfg.norm_output, bindDecoderEmb=cnfg.bindDecoderEmb, forbidden_index=cnfg.forbidden_indexes, model_name=cnfg.model_name)
+mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, fhsize=cnfg.ff_hsize, dropout=cnfg.drop, attn_drop=cnfg.attn_drop, act_drop=cnfg.act_drop, global_emb=cnfg.share_emb, num_head=cnfg.nhead, xseql=cache_len_default, ahsize=cnfg.attn_hsize, norm_output=cnfg.norm_output, bindDecoderEmb=cnfg.bindDecoderEmb, forbidden_index=cnfg.forbidden_indexes, model_name=cnfg.model_name)

# it is important to load the pre-trained model here: load_plm not only loads parameters, it may also introduce new ones, which affects the parameter alignment.
pre_trained_m = cnfg.pre_trained_m
@@ -93,10 +88,12 @@ def eva(ed, nd, model, lossf, mv_device, multi_gpu, use_amp=False):
mymodel = DataParallelMT(mymodel, device_ids=cuda_devices, output_device=cuda_device.index, host_replicate=True, gather_output=False)
lossf = DataParallelCriterion(lossf, device_ids=cuda_devices, output_device=cuda_device.index, replicate_once=True)

+mymodel = torch_compile(mymodel, *torch_compile_args, **torch_compile_kwargs)
+lossf = torch_compile(lossf, *torch_compile_args, **torch_compile_kwargs)
+
use_amp = cnfg.use_amp and use_cuda

-vloss, vprec = eva(td, ntest, mymodel, lossf, cuda_device, multi_gpu, use_amp)
-
-td.close()
+with h5File(sys.argv[1], "r") as td:
+	vloss, vprec = eva(td, td["ndata"][()].item(), mymodel, lossf, cuda_device, multi_gpu, use_amp)

print("loss/error: %.3f %.2f" % (vloss, vprec,))
12 changes: 5 additions & 7 deletions adv/examples/plm/bart.py
@@ -1,26 +1,24 @@
#encoding: utf-8

import torch
+from transformers import BartModel

+from transformer.PLM.BART.NMT import NMT
from utils.fmt.plm.base import fix_parameter_name
from utils.fmt.plm.roberta.base import ldvocab
+from utils.torch.comp import torch_inference_mode

import cnfg.plm.bart.base as cnfg
from cnfg.plm.bart.ihyp import *
from cnfg.vocab.plm.roberta import vocab_size
-
-from transformer.PLM.BART.NMT import NMT
-
-from transformers import BartModel

def init_fixing(module):

if hasattr(module, "fix_init"):
module.fix_init()

print("load pre-trained models")

-tmod = NMT(cnfg.isize, vocab_size, vocab_size, cnfg.nlayer, fhsize=cnfg.ff_hsize, dropout=cnfg.drop, attn_drop=cnfg.attn_drop, global_emb=cnfg.share_emb, num_head=cnfg.nhead, xseql=cache_len_default, ahsize=cnfg.attn_hsize, norm_output=cnfg.norm_output, bindDecoderEmb=cnfg.bindDecoderEmb, forbidden_index=cnfg.forbidden_indexes, model_name=cnfg.model_name)
+tmod = NMT(cnfg.isize, vocab_size, vocab_size, cnfg.nlayer, fhsize=cnfg.ff_hsize, dropout=cnfg.drop, attn_drop=cnfg.attn_drop, act_drop=cnfg.act_drop, global_emb=cnfg.share_emb, num_head=cnfg.nhead, xseql=cache_len_default, ahsize=cnfg.attn_hsize, norm_output=cnfg.norm_output, bindDecoderEmb=cnfg.bindDecoderEmb, forbidden_index=cnfg.forbidden_indexes, model_name=cnfg.model_name)
tmod.apply(init_fixing)
tmod.load_plm(fix_parameter_name(torch.load("plm/bart-base/pytorch_model.bin", map_location="cpu")))

@@ -34,7 +32,7 @@ def init_fixing(module):
tde = torch.as_tensor([0, 100, 50264, 15162, 4, 2], dtype=torch.long).unsqueeze(0)
tdo = torch.as_tensor([2, 100, 33, 41, 15162, 4, 2], dtype=torch.long).unsqueeze(0)

-with torch.no_grad():
+with torch_inference_mode():
ers = smod(input_ids=tde, decoder_input_ids=tdo).last_hidden_state
print("forward for test")
trs = tmod(tde, tdo)
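Each PLM example routes `init_fixing` through `nn.Module.apply`, which visits every submodule recursively, so any layer that defines a `fix_init` hook is re-initialized before the pre-trained weights are loaded. A minimal, self-contained illustration of the mechanism:

```python
from torch import nn

def init_fixing(module):
	# Module.apply calls this once per submodule (children first, then self),
	# so every module exposing fix_init gets the hook invoked.
	if hasattr(module, "fix_init"):
		module.fix_init()

model = nn.Sequential(nn.Linear(4, 4), nn.ReLU())
model.apply(init_fixing)  # a no-op here: stock torch modules have no fix_init
```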
12 changes: 5 additions & 7 deletions adv/examples/plm/bert.py
@@ -1,26 +1,24 @@
#encoding: utf-8

import torch
+from transformers import BertModel

+from transformer.PLM.BERT.NMT import NMT
from utils.fmt.plm.base import fix_parameter_name
from utils.fmt.plm.bert.base import ldvocab
+from utils.torch.comp import torch_inference_mode

import cnfg.plm.bert.base as cnfg
from cnfg.plm.bert.ihyp import *
from cnfg.vocab.plm.bert import vocab_size
-
-from transformer.PLM.BERT.NMT import NMT
-
-from transformers import BertModel

def init_fixing(module):

if hasattr(module, "fix_init"):
module.fix_init()

print("load pre-trained models")

-tmod = NMT(cnfg.isize, vocab_size, vocab_size, cnfg.nlayer, fhsize=cnfg.ff_hsize, dropout=cnfg.drop, attn_drop=cnfg.attn_drop, global_emb=cnfg.share_emb, num_head=cnfg.nhead, xseql=cache_len_default, ahsize=cnfg.attn_hsize, norm_output=cnfg.norm_output, bindDecoderEmb=cnfg.bindDecoderEmb, forbidden_index=cnfg.forbidden_indexes, model_name=cnfg.model_name)
+tmod = NMT(cnfg.isize, vocab_size, vocab_size, cnfg.nlayer, fhsize=cnfg.ff_hsize, dropout=cnfg.drop, attn_drop=cnfg.attn_drop, act_drop=cnfg.act_drop, global_emb=cnfg.share_emb, num_head=cnfg.nhead, xseql=cache_len_default, ahsize=cnfg.attn_hsize, norm_output=cnfg.norm_output, bindDecoderEmb=cnfg.bindDecoderEmb, forbidden_index=cnfg.forbidden_indexes, model_name=cnfg.model_name)
tmod.apply(init_fixing)
tmod.load_plm(fix_parameter_name(torch.load("plm/bert-base-cased/pytorch_model.bin", map_location="cpu")))

@@ -33,7 +31,7 @@ def init_fixing(module):
print("forward with transformers")
td = torch.as_tensor([101, 146, 1138, 1126, 12075, 119, 102], dtype=torch.long).unsqueeze(0)

-with torch.no_grad():
+with torch_inference_mode():
ers = smod(td).last_hidden_state
print("forward for test")
trs = tmod(td)
47 changes: 47 additions & 0 deletions adv/examples/plm/mbart.py
@@ -0,0 +1,47 @@
#encoding: utf-8

import torch
from transformers import MBartForConditionalGeneration, MBartTokenizerFast as Tokenizer

from transformer.PLM.MBART.NMT import NMT
from utils.fmt.plm.base import fix_parameter_name
from utils.torch.comp import torch_inference_mode

import cnfg.plm.mbart.base as cnfg
from cnfg.plm.mbart.ihyp import *
from cnfg.vocab.plm.mbart import vocab_size

def init_fixing(module):

if hasattr(module, "fix_init"):
module.fix_init()

print("load pre-trained models")
tokenizer = Tokenizer(tokenizer_file="plm/mbart-large-cc25/tokenizer.json")

tmod = NMT(cnfg.isize, vocab_size, vocab_size, cnfg.nlayer, fhsize=cnfg.ff_hsize, dropout=cnfg.drop, attn_drop=cnfg.attn_drop, act_drop=cnfg.act_drop, global_emb=cnfg.share_emb, num_head=cnfg.nhead, xseql=cache_len_default, ahsize=cnfg.attn_hsize, norm_output=cnfg.norm_output, bindDecoderEmb=cnfg.bindDecoderEmb, forbidden_index=cnfg.forbidden_indexes, model_name=cnfg.model_name)
tmod.apply(init_fixing)
tmod.load_plm(fix_parameter_name(torch.load("plm/mbart-large-cc25/pytorch_model.bin", map_location="cpu")))
tmod.eval()

print("load models with transformers")
smod = MBartForConditionalGeneration.from_pretrained("plm/mbart-large-cc25")
smod.eval()

print("forward with transformers")
tde = torch.as_tensor([17, 765, 142, 108787, 5, 2, 250004], dtype=torch.long).unsqueeze(0)
tdo = torch.as_tensor([250004, 17, 765, 142, 108787, 5, 2], dtype=torch.long).unsqueeze(0)

print("forward for test")
with torch_inference_mode():
ers = smod(input_ids=tde, decoder_input_ids=tdo, output_hidden_states=True).decoder_hidden_states[-1]
trs = tmod(tde, tdo)

print(ers)
print(trs)

with torch_inference_mode():
ers = smod.generate(tde, decoder_start_token_id=250004)
trs = tmod.decode(tde, lang_id=250004)
print(tokenizer.convert_ids_to_tokens(ers.squeeze(0)))
print(tokenizer.convert_ids_to_tokens(trs.squeeze(0)))
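The new example prints both hidden-state tensors for visual comparison against `transformers`. For a programmatic parity check, a tolerance-based comparison is more reliable than eyeballing, since float32 round-off makes exact equality too strict; a possible helper (an addition of this write-up, not part of the commit):

```python
import torch

def check_parity(ref, out, atol=1e-4, rtol=1e-4):
	# ref/out: e.g. the ers/trs tensors computed above
	print("max abs diff: %.3e" % (ref - out).abs().max().item())
	return torch.allclose(ref, out, atol=atol, rtol=rtol)

assert check_parity(torch.ones(2, 3), torch.ones(2, 3) + 1e-6)
```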
12 changes: 5 additions & 7 deletions adv/examples/plm/roberta.py
@@ -1,26 +1,24 @@
#encoding: utf-8

import torch
+from transformers import RobertaModel

+from transformer.PLM.RoBERTa.NMT import NMT
from utils.fmt.plm.base import fix_parameter_name
from utils.fmt.plm.roberta.base import ldvocab
+from utils.torch.comp import torch_inference_mode

import cnfg.plm.roberta.base as cnfg
from cnfg.plm.roberta.ihyp import *
from cnfg.vocab.plm.roberta import vocab_size
-
-from transformer.PLM.RoBERTa.NMT import NMT
-
-from transformers import RobertaModel

def init_fixing(module):

if hasattr(module, "fix_init"):
module.fix_init()

print("load pre-trained models")

-tmod = NMT(cnfg.isize, vocab_size, vocab_size, cnfg.nlayer, fhsize=cnfg.ff_hsize, dropout=cnfg.drop, attn_drop=cnfg.attn_drop, global_emb=cnfg.share_emb, num_head=cnfg.nhead, xseql=cache_len_default, ahsize=cnfg.attn_hsize, norm_output=cnfg.norm_output, bindDecoderEmb=cnfg.bindDecoderEmb, forbidden_index=cnfg.forbidden_indexes, model_name=cnfg.model_name)
+tmod = NMT(cnfg.isize, vocab_size, vocab_size, cnfg.nlayer, fhsize=cnfg.ff_hsize, dropout=cnfg.drop, attn_drop=cnfg.attn_drop, act_drop=cnfg.act_drop, global_emb=cnfg.share_emb, num_head=cnfg.nhead, xseql=cache_len_default, ahsize=cnfg.attn_hsize, norm_output=cnfg.norm_output, bindDecoderEmb=cnfg.bindDecoderEmb, forbidden_index=cnfg.forbidden_indexes, model_name=cnfg.model_name)
tmod.apply(init_fixing)
tmod.load_plm(fix_parameter_name(torch.load("plm/roberta-base/pytorch_model.bin", map_location="cpu")))

@@ -33,7 +31,7 @@ def init_fixing(module):
print("forward with transformers")
td = torch.as_tensor([0, 100, 33, 41, 15162, 4, 2], dtype=torch.long).unsqueeze(0)

-with torch.no_grad():
+with torch_inference_mode():
ers = smod(td).last_hidden_state
print("forward for test")
trs = tmod(td)
13 changes: 6 additions & 7 deletions adv/examples/plm/t5.py
@@ -1,17 +1,16 @@
#encoding: utf-8

import torch
+from transformers import T5ForConditionalGeneration, T5TokenizerFast as Tokenizer

+from transformer.PLM.T5.NMT import NMT
from utils.fmt.plm.base import fix_parameter_name
+from utils.torch.comp import torch_inference_mode

import cnfg.plm.t5.base as cnfg
from cnfg.plm.t5.ihyp import *
from cnfg.vocab.plm.t5 import vocab_size
-
-from transformer.PLM.T5.NMT import NMT
-
-from transformers import T5ForConditionalGeneration, T5TokenizerFast as Tokenizer

def init_fixing(module):

if hasattr(module, "fix_init"):
@@ -20,7 +19,7 @@ def init_fixing(module):
print("load pre-trained models")
tokenizer = Tokenizer(tokenizer_file="plm/t5-base/tokenizer.json")

-tmod = NMT(cnfg.isize, vocab_size, vocab_size, cnfg.nlayer, fhsize=cnfg.ff_hsize, dropout=cnfg.drop, attn_drop=cnfg.attn_drop, global_emb=cnfg.share_emb, num_head=cnfg.nhead, xseql=cache_len_default, ahsize=cnfg.attn_hsize, norm_output=cnfg.norm_output, bindDecoderEmb=cnfg.bindDecoderEmb, forbidden_index=cnfg.forbidden_indexes, model_name=cnfg.model_name)
+tmod = NMT(cnfg.isize, vocab_size, vocab_size, cnfg.nlayer, fhsize=cnfg.ff_hsize, dropout=cnfg.drop, attn_drop=cnfg.attn_drop, act_drop=cnfg.act_drop, global_emb=cnfg.share_emb, num_head=cnfg.nhead, xseql=cache_len_default, ahsize=cnfg.attn_hsize, norm_output=cnfg.norm_output, bindDecoderEmb=cnfg.bindDecoderEmb, forbidden_index=cnfg.forbidden_indexes, model_name=cnfg.model_name)
tmod.apply(init_fixing)
tmod.load_plm(fix_parameter_name(torch.load("plm/t5-base/pytorch_model.bin", map_location="cpu")))

@@ -34,15 +33,15 @@ def init_fixing(module):
tde = torch.as_tensor([27, 43, 192, 16981, 5, 1], dtype=torch.long).unsqueeze(0)
tdo = torch.as_tensor([0, 531, 25, 241, 80, 58], dtype=torch.long).unsqueeze(0)

-with torch.no_grad():
+with torch_inference_mode():
ers = smod(input_ids=tde, decoder_input_ids=tdo, output_hidden_states=True).decoder_hidden_states[-1]
print("forward for test")
trs = tmod(tde, tdo)
print(ers)
print(trs)

tde = torch.as_tensor([27, 43, 32099, 16981, 5, 32098, 241, 80, 58, 1], dtype=torch.long).unsqueeze(0)
-with torch.no_grad():
+with torch_inference_mode():
ers = smod.generate(tde)
trs = tmod.decode(tde)
print(tokenizer.convert_ids_to_tokens(ers.squeeze(0)))
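The last T5 input mixes ordinary tokens with ids 32099 and 32098, which in the stock t5-base vocabulary are the span-corruption sentinels `<extra_id_0>` and `<extra_id_1>`; the decode call fills both spans in one pass. A quick check of that mapping (assuming the standard 32,100-entry vocabulary, with sentinels counting down from the top):

```python
from transformers import T5TokenizerFast

tok = T5TokenizerFast.from_pretrained("t5-base")
print(tok.convert_ids_to_tokens([32099, 32098]))  # ['<extra_id_0>', '<extra_id_1>']
print(tok.convert_tokens_to_ids("<extra_id_0>"))  # 32099
```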

