This repository has been archived by the owner on Aug 10, 2023. It is now read-only.

Commit 07250af: Hello World :-)
liuqiuhui2015 committed Apr 2, 2020
1 parent 6befd65 commit 07250af
Showing 66 changed files with 924 additions and 424 deletions.
55 changes: 4 additions & 51 deletions README.md
@@ -167,9 +167,6 @@ share_emb = False
# number of heads for multi-head attention.
nhead = max(1, isize // 64)
-# maximum steps cached for the positional embedding.
-cache_len = 256
# warm up steps for the training.
warm_step = 8000
# scalar of learning rate
@@ -182,6 +179,8 @@ attn_hsize = None
seed = 666666
```

+Configure advanced details with `cnfg/hyp.py`:
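The scripts changed below add `from cnfg.ihyp import *` and read `cache_len_default` where `cnfg.cache_len` used to be, so `cnfg/hyp.py` (with `cnfg/ihyp.py` apparently layered on top) centralizes advanced defaults. A minimal sketch of that layout; only `cache_len_default` is confirmed by this commit, the other names are placeholders:

```
# cnfg/hyp.py (sketch) -- advanced defaults shared across model configs.
# Only cache_len_default is grounded in this commit's diffs; the other
# entries are hypothetical examples of "advanced details".

# maximum steps cached for the positional embedding
cache_len_default = 256

# hypothetical further defaults
norm_output_default = True
seed_default = 666666
```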

## Training

Just execute the following command to launch the training:
@@ -227,7 +226,7 @@ where `rsf` is the result file and `h5f` is the HDF5-formatted input file of your corpus

Fundamental models needed for the construction of the transformer.

-### `loss.py`
+### `loss/`

Implementation of the label smoothing loss function required for training the transformer.
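A minimal sketch of label smoothing as usually formulated (Szegedy et al., 2016): the one-hot target is softened so the gold token gets `1 - smoothing` and the remaining mass is spread over the other classes. Class and method names here are illustrative, not the repository's `loss/` API:

```
import torch
import torch.nn as nn

class LabelSmoothingLossSketch(nn.Module):
	# illustrative re-implementation; the real loss/ package also handles
	# reduction modes and forbidden_index masking
	def __init__(self, nclass, smoothing=0.1, ignore_index=0):
		super().__init__()
		self.conf = 1.0 - smoothing
		self.smooth = smoothing / (nclass - 1)
		self.ignore_index = ignore_index

	def forward(self, logits, target):
		# logits: (batch, nclass), target: (batch,) of gold indices
		logp = torch.log_softmax(logits, dim=-1)
		true_dist = torch.full_like(logp, self.smooth)
		true_dist.scatter_(1, target.unsqueeze(1), self.conf)
		loss = -(true_dist * logp).sum(dim=-1)
		# zero out padding positions before reducing
		return loss.masked_fill(target == self.ignore_index, 0.0).sum()
```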

@@ -249,53 +248,7 @@ An example depends on Flask to provide simple Web service and REST API about how

### `transformer/`

-#### `NMT.py`
-
-The transformer model encapsulates encoder and decoder. Switch [the comment line](https://github.com/anoidgit/transformer/blob/master/transformer/NMT.py#L9-L11) to make a choice between the standard decoder and the average decoder.
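Though this commit deletes the per-file notes above in favor of a one-line summary, the documented switch is worth preserving as a sketch. Presumably lines 9-11 of `transformer/NMT.py` look something like this (exact imports assumed, not verified):

```
# transformer/NMT.py, around lines 9-11 (assumed layout):
# uncomment exactly one Decoder import to pick the decoder variant
from transformer.Decoder import Decoder	# standard decoder
#from transformer.AvgDecoder import Decoder	# average decoder
```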

-#### `Encoder.py`
-
-The encoder of transformer.
-
-#### `Decoder.py`
-
-The standard decoder of transformer.
-
-#### `AvgDecoder.py`
-
-The average decoder of transformer proposed by [Accelerating Neural Transformer via an Average Attention Network](https://www.aclweb.org/anthology/P18-1166/).
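The average decoder replaces masked self-attention with a cumulative average over previous positions, which makes each decoding step O(1) instead of O(seql). A sketch of the core operation from the paper, not of this repository's `AvgDecoder.py`:

```
import torch

def average_attention(x):
	# x: (batch, seql, isize); position j attends to the uniform mean of
	# steps 1..j, so no O(seql^2) attention matrix is needed
	csum = x.cumsum(dim=1)
	steps = torch.arange(1, x.size(1) + 1, dtype=x.dtype, device=x.device)
	return csum / steps.view(1, -1, 1)
```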

-#### `EnsembleNMT.py`
-
-A model encapsulates several NMT models to do ensemble decoding. Switch [the comment line](https://github.com/anoidgit/transformer/blob/master/transformer/EnsembleNMT.py#L9-L11) to make a choice between the standard decoder and the average decoder.
-
-#### `EnsembleEncoder.py`
-
-A model encapsulates several encoders for ensemble decoding.
-
-#### `EnsembleDecoder.py`
-
-A model encapsulates several standard decoders for ensemble decoding.
-
-#### `EnsembleAvgDecoder.py`
-
-A model encapsulates several average decoders proposed by [Accelerating Neural Transformer via an Average Attention Network](https://www.aclweb.org/anthology/P18-1166/) for ensemble decoding.
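The `Ensemble*` classes wire several trained models into one decoding pass. A common way to combine members, and a reasonable guess at the idea here (the actual combination rule is not shown in this diff), is to average their per-step distributions:

```
import torch

def ensemble_next_token_logprobs(step_logits):
	# step_logits: list of (batch, vocab) tensors, one per member model
	probs = torch.stack([torch.softmax(l, dim=-1) for l in step_logits], dim=0)
	return probs.mean(dim=0).log()	# combined scores for beam search
```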

-#### `AGG/`
-
-Implementation of aggregation models.
-
-##### `Hier*.py`
-
-Hierarchical aggregation proposed in [Exploiting Deep Representations for Neural Machine Translation](https://www.aclweb.org/anthology/D18-1457/).
-
-#### `TA/`
-
-Implementation of transparent attention proposed in [Training Deeper Neural Machine Translation Models with Transparent Attention](https://aclweb.org/anthology/D18-1338).
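Transparent attention lets the decoder attend to a learned mixture of all encoder layers rather than only the top one, easing gradient flow in deep encoders. A sketch of the paper's idea, not of the `TA/` code:

```
import torch
import torch.nn as nn

class TransparentMix(nn.Module):
	def __init__(self, nlayer):
		super().__init__()
		# one logit per encoder state (embeddings + nlayer layer outputs)
		self.weight = nn.Parameter(torch.zeros(nlayer + 1))

	def forward(self, layer_states):
		# layer_states: list of nlayer+1 tensors of shape (batch, seql, isize)
		a = torch.softmax(self.weight, dim=0)
		return sum(w * h for w, h in zip(a.unbind(0), layer_states))
```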

-#### `SC/`
-
-Implementation of sentential context proposed in [Exploiting Sentential Context for Neural Machine Translation](https://www.aclweb.org/anthology/P19-1624/).
+Implementations of seq2seq models.

### `parallel/`

6 changes: 4 additions & 2 deletions adv/predict/doc/para/predict_doc_para.py
@@ -9,6 +9,7 @@
import h5py

import cnfg.docpara as cnfg
+from cnfg.ihyp import *

from transformer.Doc.Para.Base.NMT import NMT
from transformer.EnsembleNMT import NMT as Ensemble
@@ -31,15 +32,15 @@ def load_fixing(module):
vcbt = reverse_dict(vcbt)

if len(sys.argv) == 4:
-mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes, cnfg.num_prev_sent, cnfg.num_layer_context)
+mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes, cnfg.num_prev_sent, cnfg.num_layer_context)

mymodel = load_model_cpu(sys.argv[3], mymodel)
mymodel.apply(load_fixing)

else:
models = []
for modelf in sys.argv[3:]:
-tmp = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes, cnfg.num_prev_sent, cnfg.num_layer_context)
+tmp = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes, cnfg.num_prev_sent, cnfg.num_layer_context)

tmp = load_model_cpu(modelf, tmp)
tmp.apply(load_fixing)
@@ -54,6 +55,7 @@ def load_fixing(module):

use_cuda, cuda_device, cuda_devices, multi_gpu = parse_cuda_decode(cnfg.use_cuda, cnfg.gpuid, cnfg.multi_gpu_decoding)

+# Important to make cudnn methods deterministic
set_random_seed(cnfg.seed, use_cuda)

if use_cuda:
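The added `# Important to make cudnn methods deterministic` comment flags what `set_random_seed` is for. A sketch of what such a helper typically does (the actual `utils.base` implementation may differ):

```
import random

import torch

def set_random_seed_sketch(seed, use_cuda):
	random.seed(seed)
	torch.manual_seed(seed)
	if use_cuda:
		torch.cuda.manual_seed_all(seed)
		# the part the comment calls "important": trade kernel-selection
		# speed for reproducible cudnn results
		torch.backends.cudnn.deterministic = True
		torch.backends.cudnn.benchmark = False
```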
10 changes: 5 additions & 5 deletions adv/rank/doc/para/rank_loss_doc_para.py
@@ -13,13 +13,14 @@
import h5py

import cnfg.docpara as cnfg
+from cnfg.ihyp import *

from transformer.Doc.Para.Base.NMT import NMT
from transformer.EnsembleNMT import NMT as Ensemble
from parallel.parallelMT import DataParallelMT
from parallel.base import DataParallelCriterion

-from loss import LabelSmoothingLoss
+from loss.base import LabelSmoothingLoss

from utils.base import *
from utils.fmt.base4torch import parse_cuda
@@ -38,15 +39,15 @@ def load_fixing(module):
cuda_device = torch.device(cnfg.gpuid)

if len(sys.argv) == 4:
-mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes, cnfg.num_prev_sent, cnfg.num_layer_context)
+mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes, cnfg.num_prev_sent, cnfg.num_layer_context)

mymodel = load_model_cpu(sys.argv[3], mymodel)
mymodel.apply(load_fixing)

else:
models = []
for modelf in sys.argv[3:]:
-tmp = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes, cnfg.num_prev_sent, cnfg.num_layer_context)
+tmp = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes, cnfg.num_prev_sent, cnfg.num_layer_context)

tmp = load_model_cpu(modelf, tmp)
tmp.apply(load_fixing)
@@ -59,9 +60,8 @@ def load_fixing(module):
lossf = LabelSmoothingLoss(nwordt, cnfg.label_smoothing, ignore_index=0, reduction='none', forbidden_index=cnfg.forbidden_indexes)

use_cuda, cuda_device, cuda_devices, multi_gpu = parse_cuda(cnfg.use_cuda, cnfg.gpuid)
-# disable multi_gpu, not supported
-multi_gpu, cuda_devices = False, None

+# Important to make cudnn methods deterministic
set_random_seed(cnfg.seed, use_cuda)

if use_cuda:
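Note the `reduction='none'` in the `LabelSmoothingLoss` call above: it keeps per-token losses so each sentence can be scored and ranked individually. A sketch of that pattern; the (batch, seql) shape is an assumption:

```
def sentence_scores(loss_tok):
	# loss_tok: (batch, seql) per-token losses, padding already zeroed
	# via ignore_index=0; a lower total means a better-ranked sentence
	return loss_tok.sum(dim=-1)
```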
8 changes: 5 additions & 3 deletions adv/rank/doc/rank_loss_sent.py
@@ -13,13 +13,14 @@
import h5py

import cnfg.base as cnfg
+from cnfg.ihyp import *

from transformer.NMT import NMT
from transformer.EnsembleNMT import NMT as Ensemble
from parallel.parallelMT import DataParallelMT
from parallel.base import DataParallelCriterion

-from loss import LabelSmoothingLoss
+from loss.base import LabelSmoothingLoss

from utils.base import *
from utils.fmt.base4torch import parse_cuda
@@ -38,15 +39,15 @@ def load_fixing(module):
cuda_device = torch.device(cnfg.gpuid)

if len(sys.argv) == 4:
-mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes)
+mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes)

mymodel = load_model_cpu(sys.argv[3], mymodel)
mymodel.apply(load_fixing)

else:
models = []
for modelf in sys.argv[3:]:
-tmp = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes)
+tmp = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes)

tmp = load_model_cpu(modelf, tmp)
tmp.apply(load_fixing)
@@ -60,6 +61,7 @@ def load_fixing(module):

use_cuda, cuda_device, cuda_devices, multi_gpu = parse_cuda(cnfg.use_cuda, cnfg.gpuid)

+# Important to make cudnn methods deterministic
set_random_seed(cnfg.seed, use_cuda)

if use_cuda: