diff --git a/.gitignore b/.gitignore
index 10353dea..168abd66 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,8 @@
 dea
 docs/_build
-tensorlaye/__pacache__
-tensorlaye/.DS_Store
+tensorlayer
+tensorlayer/__pycache__
+tensorlayer/.DS_Store
 .DS_Store
 dist
 build/
@@ -10,3 +11,8 @@ data/.DS_Store
 *.pyc
 *.gz
 .spyproject/
+.vscode/*
+model.npz
+env/
+venv/
+.idea/
diff --git a/README.md b/README.md
index fcfd81df..96bfeaf2 100644
--- a/README.md
+++ b/README.md
@@ -4,11 +4,15 @@ This is a 200 lines implementation of Twitter/Cornell-Movie Chatbot, please read
 
 - [Practical-Seq2Seq](http://suriyadeepan.github.io/2016-12-31-practical-seq2seq/)
 - [The Unreasonable Effectiveness of Recurrent Neural Networks](http://karpathy.github.io/2015/05/21/rnn-effectiveness/)
-- [Understanding LSTM Networks](http://colah.github.io/posts/2015-08-Understanding-LSTMs/) (option)
+- [Understanding LSTM Networks](http://colah.github.io/posts/2015-08-Understanding-LSTMs/) (optional)
 
-### Model
+### Prerequisites
 
+- Python 3.6
+- [TensorFlow](https://github.com/tensorflow/tensorflow) >= 2.0
+- [TensorLayer](https://github.com/zsdonghao/tensorlayer) >= 2.0
 
+### Model
 
 <table class="image">
 <div align="center">
@@ -18,9 +22,14 @@ This is a 200 lines implementation of Twitter/Cornell-Movie Chatbot, please read
 </div>
 </table>
 
-### Results
+### Training
+
+```
+python3 main.py
+```
 
-<!---#### Twitter-->
+
+### Results
 
 ```
 Query > happy birthday have a nice day
@@ -35,13 +44,3 @@ Query > donald trump won last nights presidential debate according to snap onlin
  > he is not a racist
  > he is a liar
  > trump needs to be president
-
-```
-<!---
-#### Cornell Moive
-
-
-```
-
-```
--->
diff --git a/data/__init__.py b/data/__init__.py
index c680e9ae..e0cce255 100644
--- a/data/__init__.py
+++ b/data/__init__.py
@@ -1,6 +1,6 @@
 
 from __future__ import absolute_import
 
-
+# from . import twitter
 # from . import imagenet_classes
 # from . import
diff --git a/main.py b/main.py
new file mode 100644
index 00000000..b5eab72f
--- /dev/null
+++ b/main.py
@@ -0,0 +1,142 @@
+#! /usr/bin/python
+# -*- coding: utf-8 -*-
+
+import tensorflow as tf
+import tensorlayer as tl
+import numpy as np
+from tensorlayer.cost import cross_entropy_seq, cross_entropy_seq_with_mask
+from tqdm import tqdm
+from sklearn.utils import shuffle
+from data.twitter import data
+from tensorlayer.models.seq2seq import Seq2seq
+from tensorlayer.models.seq2seq_with_attention import Seq2seqLuongAttention
+import os
+
+
+def initial_setup(data_corpus):
+    """Load ``data/<data_corpus>/`` and return its metadata plus the six un-padded splits.
+
+    Returns ``(metadata, trainX, trainY, testX, testY, validX, validY)``.
+    """
+    metadata, idx_q, idx_a = data.load_data(PATH='data/{}/'.format(data_corpus))
+    train, test, valid = data.split_dataset(idx_q, idx_a)
+    trainX, trainY, testX, testY, validX, validY = [
+        tl.prepro.remove_pad_sequences(part.tolist()) for pair in (train, test, valid) for part in pair]
+    return metadata, trainX, trainY, testX, testY, validX, validY
+
+
+
+if __name__ == "__main__":
+    data_corpus = "twitter"
+
+    # Load the corpus and its un-padded train/test/valid splits.
+    metadata, trainX, trainY, testX, testY, validX, validY = initial_setup(data_corpus)
+
+    # Dataset size: source and target sequences must pair up one-to-one.
+    src_len, tgt_len = len(trainX), len(trainY)
+    assert src_len == tgt_len
+
+    # Hyper-parameters
+    batch_size = 32
+    emb_dim = 1024
+    num_epochs = 50
+    n_step = src_len // batch_size  # number of full batches per epoch
+
+    # Vocabulary tables shipped with the corpus metadata.
+    word2idx = metadata['w2idx']   # dict: word -> index
+    idx2word = metadata['idx2w']   # list: index -> word
+    src_vocab_size = len(idx2word)  # 8002 (0~8001)
+
+    unk_id = word2idx['unk']   # 1
+    pad_id = word2idx['_']     # 0
+
+    # The two special tokens take the first two ids past the corpus vocabulary.
+    start_id = src_vocab_size  # 8002
+    end_id = start_id + 1  # 8003
+    word2idx.update({'start_id': start_id, 'end_id': end_id})
+    idx2word = idx2word + ['start_id', 'end_id']
+
+    # Source and target share one vocabulary, now two entries larger.
+    src_vocab_size += 2
+    tgt_vocab_size = src_vocab_size
+    vocabulary_size = src_vocab_size
+    
+
+
+    def inference(seed, top_n):
+        """Return the reply tokens the model generates for `seed`, cut at the first 'end_id'."""
+        model_.eval()
+        seed_id = [word2idx.get(token, unk_id) for token in seed.split(" ")]
+        sentence_id = model_(inputs=[[seed_id]], seq_length=20, start_token=start_id, top_n=top_n)
+        words = []
+        for w_id in sentence_id[0]:
+            if idx2word[w_id] == 'end_id':
+                break
+            words.append(idx2word[w_id])
+        return words
+
+    decoder_seq_length = 20  # also the maxlen used to pad target/decode batches in the training loop
+    model_ = Seq2seq(
+        decoder_seq_length = decoder_seq_length,
+        cell_enc=tf.keras.layers.GRUCell,
+        cell_dec=tf.keras.layers.GRUCell,
+        n_layer=3,
+        n_units=256,
+        embedding_layer=tl.layers.Embedding(vocabulary_size=vocabulary_size, embedding_size=emb_dim),
+        )
+    # vocabulary_size above already counts the 'start_id' and 'end_id' entries added earlier.
+
+    # Uncomment the statements below to start from weights previously saved to model.npz
+
+    # load_weights = tl.files.load_npz(name='model.npz')
+    # tl.files.assign_weights(load_weights, model_)
+
+    optimizer = tf.optimizers.Adam(learning_rate=0.001)
+    model_.train()  # put the model in training mode before the epoch loop
+
+    seeds = ["happy birthday have a nice day",
+             "donald trump won last nights presidential debate according to snap online polls"]
+    for epoch in range(num_epochs):
+        model_.train()  # inference() below switches the model to eval mode, so re-enable training
+        trainX, trainY = shuffle(trainX, trainY, random_state=0)
+        total_loss, n_iter = 0, 0
+        for X, Y in tqdm(tl.iterate.minibatches(inputs=trainX, targets=trainY, batch_size=batch_size, shuffle=False),
+                        total=n_step, desc='Epoch[{}/{}]'.format(epoch + 1, num_epochs), leave=False):
+            # Build the padded encoder input, decoder input (start_id + Y), target (Y + end_id) and mask.
+            X = tl.prepro.pad_sequences(X)
+            _target_seqs = tl.prepro.sequences_add_end_id(Y, end_id=end_id)
+            _target_seqs = tl.prepro.pad_sequences(_target_seqs, maxlen=decoder_seq_length)
+            _decode_seqs = tl.prepro.sequences_add_start_id(Y, start_id=start_id, remove_last=False)
+            _decode_seqs = tl.prepro.pad_sequences(_decode_seqs, maxlen=decoder_seq_length)
+            _target_mask = tl.prepro.sequences_get_mask(_target_seqs)
+
+            with tf.GradientTape() as tape:
+                ## compute outputs
+                output = model_(inputs = [X, _decode_seqs])
+                # collapse the leading dimensions so every row holds vocabulary_size logits
+                output = tf.reshape(output, [-1, vocabulary_size])
+                ## compute loss
+                loss = cross_entropy_seq_with_mask(logits=output, target_seqs=_target_seqs, input_mask=_target_mask)
+            ## update model: only the forward pass needs recording, so differentiate outside the tape context
+            grad = tape.gradient(loss, model_.all_weights)
+            optimizer.apply_gradients(zip(grad, model_.all_weights))
+
+            total_loss += loss
+            n_iter += 1
+
+        # printing average loss after every epoch
+        print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, num_epochs, total_loss / n_iter))
+
+        for seed in seeds:
+            print("Query >", seed)
+            top_n = 3  # used both as the number of replies printed and as the top_n passed to inference()
+            for _ in range(top_n):
+                sentence = inference(seed, top_n)
+                print(" >", ' '.join(sentence))
+
+        tl.files.save_npz(model_.all_weights, name='model.npz')  # overwrite the checkpoint after every epoch
+
+
+        
+    
+    
diff --git a/main_simple_seq2seq.py b/main_simple_seq2seq.py
deleted file mode 100644
index a679b9a1..00000000
--- a/main_simple_seq2seq.py
+++ /dev/null
@@ -1,221 +0,0 @@
-#! /usr/bin/python
-# -*- coding: utf8 -*-
-"""Sequence to Sequence Learning for Twitter/Cornell Chatbot.
-
-References
-----------
-http://suriyadeepan.github.io/2016-12-31-practical-seq2seq/
-"""
-import tensorflow as tf
-import tensorlayer as tl
-from tensorlayer.layers import *
-
-import tensorflow as tf
-import numpy as np
-import time
-
-###============= prepare data
-from data.twitter import data
-metadata, idx_q, idx_a = data.load_data(PATH='data/twitter/')                   # Twitter
-# from data.cornell_corpus import data
-# metadata, idx_q, idx_a = data.load_data(PATH='data/cornell_corpus/')          # Cornell Moive
-(trainX, trainY), (testX, testY), (validX, validY) = data.split_dataset(idx_q, idx_a)
-
-trainX = trainX.tolist()
-trainY = trainY.tolist()
-testX = testX.tolist()
-testY = testY.tolist()
-validX = validX.tolist()
-validY = validY.tolist()
-
-trainX = tl.prepro.remove_pad_sequences(trainX)
-trainY = tl.prepro.remove_pad_sequences(trainY)
-testX = tl.prepro.remove_pad_sequences(testX)
-testY = tl.prepro.remove_pad_sequences(testY)
-validX = tl.prepro.remove_pad_sequences(validX)
-validY = tl.prepro.remove_pad_sequences(validY)
-
-###============= parameters
-xseq_len = len(trainX)#.shape[-1]
-yseq_len = len(trainY)#.shape[-1]
-assert xseq_len == yseq_len
-batch_size = 32
-n_step = int(xseq_len/batch_size)
-xvocab_size = len(metadata['idx2w']) # 8002 (0~8001)
-emb_dim = 1024
-
-w2idx = metadata['w2idx']   # dict  word 2 index
-idx2w = metadata['idx2w']   # list index 2 word
-
-unk_id = w2idx['unk']   # 1
-pad_id = w2idx['_']     # 0
-
-start_id = xvocab_size  # 8002
-end_id = xvocab_size+1  # 8003
-
-w2idx.update({'start_id': start_id})
-w2idx.update({'end_id': end_id})
-idx2w = idx2w + ['start_id', 'end_id']
-
-xvocab_size = yvocab_size = xvocab_size + 2
-
-""" A data for Seq2Seq should look like this:
-input_seqs : ['how', 'are', 'you', '<PAD_ID'>]
-decode_seqs : ['<START_ID>', 'I', 'am', 'fine', '<PAD_ID'>]
-target_seqs : ['I', 'am', 'fine', '<END_ID>', '<PAD_ID'>]
-target_mask : [1, 1, 1, 1, 0]
-"""
-
-print("encode_seqs", [idx2w[id] for id in trainX[10]])
-target_seqs = tl.prepro.sequences_add_end_id([trainY[10]], end_id=end_id)[0]
-    # target_seqs = tl.prepro.remove_pad_sequences([target_seqs], pad_id=pad_id)[0]
-print("target_seqs", [idx2w[id] for id in target_seqs])
-decode_seqs = tl.prepro.sequences_add_start_id([trainY[10]], start_id=start_id, remove_last=False)[0]
-    # decode_seqs = tl.prepro.remove_pad_sequences([decode_seqs], pad_id=pad_id)[0]
-print("decode_seqs", [idx2w[id] for id in decode_seqs])
-target_mask = tl.prepro.sequences_get_mask([target_seqs])[0]
-print("target_mask", target_mask)
-print(len(target_seqs), len(decode_seqs), len(target_mask))
-
-###============= model
-def model(encode_seqs, decode_seqs, is_train=True, reuse=False):
-    with tf.variable_scope("model", reuse=reuse):
-        # for chatbot, you can use the same embedding layer,
-        # for translation, you may want to use 2 seperated embedding layers
-        with tf.variable_scope("embedding") as vs:
-            net_encode = EmbeddingInputlayer(
-                inputs = encode_seqs,
-                vocabulary_size = xvocab_size,
-                embedding_size = emb_dim,
-                name = 'seq_embedding')
-            vs.reuse_variables()
-            tl.layers.set_name_reuse(True)
-            net_decode = EmbeddingInputlayer(
-                inputs = decode_seqs,
-                vocabulary_size = xvocab_size,
-                embedding_size = emb_dim,
-                name = 'seq_embedding')
-        net_rnn = Seq2Seq(net_encode, net_decode,
-                cell_fn = tf.contrib.rnn.BasicLSTMCell,
-                n_hidden = emb_dim,
-                initializer = tf.random_uniform_initializer(-0.1, 0.1),
-                encode_sequence_length = retrieve_seq_length_op2(encode_seqs),
-                decode_sequence_length = retrieve_seq_length_op2(decode_seqs),
-                initial_state_encode = None,
-                dropout = (0.5 if is_train else None),
-                n_layer = 3,
-                return_seq_2d = True,
-                name = 'seq2seq')
-        net_out = DenseLayer(net_rnn, n_units=xvocab_size, act=tf.identity, name='output')
-    return net_out, net_rnn
-
-# model for training
-encode_seqs = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="encode_seqs")
-decode_seqs = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="decode_seqs")
-target_seqs = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="target_seqs")
-target_mask = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="target_mask") # tl.prepro.sequences_get_mask()
-net_out, _ = model(encode_seqs, decode_seqs, is_train=True, reuse=False)
-
-# model for inferencing
-encode_seqs2 = tf.placeholder(dtype=tf.int64, shape=[1, None], name="encode_seqs")
-decode_seqs2 = tf.placeholder(dtype=tf.int64, shape=[1, None], name="decode_seqs")
-net, net_rnn = model(encode_seqs2, decode_seqs2, is_train=False, reuse=True)
-y = tf.nn.softmax(net.outputs)
-
-# loss for training
-    # print(net_out.outputs)    # (?, 8004)
-    # print(target_seqs)    # (32, ?)
-    # loss_weights = tf.ones_like(target_seqs, dtype=tf.float32)
-    # loss = tf.contrib.legacy_seq2seq.sequence_loss(net_out.outputs, target_seqs, loss_weights, yvocab_size)
-loss = tl.cost.cross_entropy_seq_with_mask(logits=net_out.outputs, target_seqs=target_seqs, input_mask=target_mask, return_details=False, name='cost')
-
-net_out.print_params(False)
-
-lr = 0.0001
-train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)
-# Truncated Backpropagation for training (option)
-# max_grad_norm = 30
-# grads, _ = tf.clip_by_global_norm(tf.gradients(loss, net_out.all_params),max_grad_norm)
-# optimizer = tf.train.GradientDescentOptimizer(lr)
-# train_op = optimizer.apply_gradients(zip(grads, net_out.all_params))
-
-# sess = tf.InteractiveSession()
-sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False))
-tl.layers.initialize_global_variables(sess)
-tl.files.load_and_assign_npz(sess=sess, name='n.npz', network=net)
-
-###============= train
-n_epoch = 50
-for epoch in range(n_epoch):
-    epoch_time = time.time()
-    ## shuffle training data
-    from sklearn.utils import shuffle
-    trainX, trainY = shuffle(trainX, trainY, random_state=0)
-    ## train an epoch
-    total_err, n_iter = 0, 0
-    for X, Y in tl.iterate.minibatches(inputs=trainX, targets=trainY, batch_size=batch_size, shuffle=False):
-        step_time = time.time()
-
-        X = tl.prepro.pad_sequences(X)
-        _target_seqs = tl.prepro.sequences_add_end_id(Y, end_id=end_id)
-        _target_seqs = tl.prepro.pad_sequences(_target_seqs)
-
-        _decode_seqs = tl.prepro.sequences_add_start_id(Y, start_id=start_id, remove_last=False)
-        _decode_seqs = tl.prepro.pad_sequences(_decode_seqs)
-        _target_mask = tl.prepro.sequences_get_mask(_target_seqs)
-
-        ## you can view the data here
-        # for i in range(len(X)):
-        #     print(i, [idx2w[id] for id in X[i]])
-        #     # print(i, [idx2w[id] for id in Y[i]])
-        #     print(i, [idx2w[id] for id in _target_seqs[i]])
-        #     print(i, [idx2w[id] for id in _decode_seqs[i]])
-        #     print(i, _target_mask[i])
-        #     print(len(_target_seqs[i]), len(_decode_seqs[i]), len(_target_mask[i]))
-        # exit()
-
-        _, err = sess.run([train_op, loss],
-                        {encode_seqs: X,
-                        decode_seqs: _decode_seqs,
-                        target_seqs: _target_seqs,
-                        target_mask: _target_mask})
-
-        if n_iter % 200 == 0:
-            print("Epoch[%d/%d] step:[%d/%d] loss:%f took:%.5fs" % (epoch, n_epoch, n_iter, n_step, err, time.time() - step_time))
-
-        total_err += err; n_iter += 1
-
-        ###============= inference
-        if n_iter % 1000 == 0:
-            seeds = ["happy birthday have a nice day",
-                    "donald trump won last nights presidential debate according to snap online polls"]
-            for seed in seeds:
-                print("Query >", seed)
-                seed_id = [w2idx[w] for w in seed.split(" ")]
-                for _ in range(5):  # 1 Query --> 5 Reply
-                    # 1. encode, get state
-                    state = sess.run(net_rnn.final_state_encode,
-                                    {encode_seqs2: [seed_id]})
-                    # 2. decode, feed start_id, get first word
-                    #   ref https://github.com/zsdonghao/tensorlayer/blob/master/example/tutorial_ptb_lstm_state_is_tuple.py
-                    o, state = sess.run([y, net_rnn.final_state_decode],
-                                    {net_rnn.initial_state_decode: state,
-                                    decode_seqs2: [[start_id]]})
-                    w_id = tl.nlp.sample_top(o[0], top_k=3)
-                    w = idx2w[w_id]
-                    # 3. decode, feed state iteratively
-                    sentence = [w]
-                    for _ in range(30): # max sentence length
-                        o, state = sess.run([y, net_rnn.final_state_decode],
-                                        {net_rnn.initial_state_decode: state,
-                                        decode_seqs2: [[w_id]]})
-                        w_id = tl.nlp.sample_top(o[0], top_k=2)
-                        w = idx2w[w_id]
-                        if w_id == end_id:
-                            break
-                        sentence = sentence + [w]
-                    print(" >", ' '.join(sentence))
-
-    print("Epoch[%d/%d] averaged loss:%f took:%.5fs" % (epoch, n_epoch, total_err/n_iter, time.time()-epoch_time))
-
-    tl.files.save_npz(net.all_params, name='n.npz', sess=sess)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..291f69c6
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+scikit-learn
+tensorflow
+tensorlayer
+numpy
+click
+tqdm
+nltk
diff --git a/tensorlayer/__init__.py b/tensorlayer/__init__.py
deleted file mode 100644
index af277d2f..00000000
--- a/tensorlayer/__init__.py
+++ /dev/null
@@ -1,32 +0,0 @@
-"""
-Deep learning and Reinforcement learning library for Researchers and Engineers
-"""
-from __future__ import absolute_import
-
-
-try:
-    install_instr = "Please make sure you install a recent enough version of TensorFlow."
-    import tensorflow
-except ImportError:
-    raise ImportError("__init__.py : Could not import TensorFlow." + install_instr)
-
-from . import activation
-from . import cost
-from . import files
-from . import iterate
-from . import layers
-from . import ops
-from . import utils
-from . import visualize
-from . import prepro
-from . import nlp
-from . import rein
-
-# alias
-act = activation
-vis = visualize
-
-__version__ = "1.6.3rc"
-
-global_flag = {}
-global_dict = {}
diff --git a/tensorlayer/activation.py b/tensorlayer/activation.py
deleted file mode 100644
index 7b6b6402..00000000
--- a/tensorlayer/activation.py
+++ /dev/null
@@ -1,109 +0,0 @@
-#! /usr/bin/python
-# -*- coding: utf8 -*-
-
-
-
-import tensorflow as tf
-
-def identity(x, name=None):
-    """The identity activation function, Shortcut is ``linear``.
-
-    Parameters
-    ----------
-    x : a tensor input
-        input(s)
-
-
-    Returns
-    --------
-    A `Tensor` with the same type as `x`.
-    """
-    return x
-
-# Shortcut
-linear = identity
-
-def ramp(x=None, v_min=0, v_max=1, name=None):
-    """The ramp activation function.
-
-    Parameters
-    ----------
-    x : a tensor input
-        input(s)
-    v_min : float
-        if input(s) smaller than v_min, change inputs to v_min
-    v_max : float
-        if input(s) greater than v_max, change inputs to v_max
-    name : a string or None
-        An optional name to attach to this activation function.
-
-
-    Returns
-    --------
-    A `Tensor` with the same type as `x`.
-    """
-    return tf.clip_by_value(x, clip_value_min=v_min, clip_value_max=v_max, name=name)
-
-def leaky_relu(x=None, alpha=0.1, name="LeakyReLU"):
-    """The LeakyReLU, Shortcut is ``lrelu``.
-
-    Modified version of ReLU, introducing a nonzero gradient for negative
-    input.
-
-    Parameters
-    ----------
-    x : A `Tensor` with type `float`, `double`, `int32`, `int64`, `uint8`,
-        `int16`, or `int8`.
-    alpha : `float`. slope.
-    name : a string or None
-        An optional name to attach to this activation function.
-
-    Examples
-    ---------
-    >>> network = tl.layers.DenseLayer(network, n_units=100, name = 'dense_lrelu',
-    ...                 act= lambda x : tl.act.lrelu(x, 0.2))
-
-    References
-    ------------
-    - `Rectifier Nonlinearities Improve Neural Network Acoustic Models, Maas et al. (2013) <http://web.stanford.edu/~awni/papers/relu_hybrid_icml2013_final.pdf>`_
-    """
-    with tf.name_scope(name) as scope:
-        # x = tf.nn.relu(x)
-        # m_x = tf.nn.relu(-x)
-        # x -= alpha * m_x
-        x = tf.maximum(x, alpha * x)
-    return x
-
-#Shortcut
-lrelu = leaky_relu
-
-def pixel_wise_softmax(output, name='pixel_wise_softmax'):
-    """Return the softmax outputs of images, every pixels have multiple label, the sum of a pixel is 1.
-    Usually be used for image segmentation.
-
-    Parameters
-    ------------
-    output : tensor
-        - For 2d image, 4D tensor [batch_size, height, weight, channel], channel >= 2.
-        - For 3d image, 5D tensor [batch_size, depth, height, weight, channel], channel >= 2.
-
-    Examples
-    ---------
-    >>> outputs = pixel_wise_softmax(network.outputs)
-    >>> dice_loss = 1 - dice_coe(outputs, y_, epsilon=1e-5)
-
-    References
-    -----------
-    - `tf.reverse <https://www.tensorflow.org/versions/master/api_docs/python/array_ops.html#reverse>`_
-    """
-    with tf.name_scope(name) as scope:
-        return tf.nn.softmax(output)
-        ## old implementation
-        # exp_map = tf.exp(output)
-        # if output.get_shape().ndims == 4:   # 2d image
-        #     evidence = tf.add(exp_map, tf.reverse(exp_map, [False, False, False, True]))
-        # elif output.get_shape().ndims == 5: # 3d image
-        #     evidence = tf.add(exp_map, tf.reverse(exp_map, [False, False, False, False, True]))
-        # else:
-        #     raise Exception("output parameters should be 2d or 3d image, not %s" % str(output._shape))
-        # return tf.div(exp_map, evidence)
diff --git a/tensorlayer/cost.py b/tensorlayer/cost.py
deleted file mode 100644
index 05ed0a81..00000000
--- a/tensorlayer/cost.py
+++ /dev/null
@@ -1,635 +0,0 @@
-#! /usr/bin/python
-# -*- coding: utf8 -*-
-
-import logging
-import tensorflow as tf
-import numbers
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import standard_ops
-
-## Cost Functions
-
-def cross_entropy(output, target, name=None):
-    """It is a softmax cross-entropy operation, returns the TensorFlow expression of cross-entropy of two distributions, implement
-    softmax internally. See ``tf.nn.sparse_softmax_cross_entropy_with_logits``.
-
-    Parameters
-    ----------
-    output : Tensorflow variable
-        A distribution with shape: [batch_size, n_feature].
-    target : Tensorflow variable
-        A batch of index with shape: [batch_size, ].
-    name : string
-        Name of this loss.
-
-    Examples
-    --------
-    >>> ce = tl.cost.cross_entropy(y_logits, y_target_logits, 'my_loss')
-
-    References
-    -----------
-    - About cross-entropy: `wiki <https://en.wikipedia.org/wiki/Cross_entropy>`_.\n
-    - The code is borrowed from: `here <https://en.wikipedia.org/wiki/Cross_entropy>`_.
-    """
-    # try: # old
-    #     return tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=output, targets=target))
-    # except: # TF 1.0
-    assert name is not None, "Please give a unique name to tl.cost.cross_entropy for TF1.0+"
-    return tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output, name=name))
-
-def sigmoid_cross_entropy(output, target, name=None):
-    """It is a sigmoid cross-entropy operation, see ``tf.nn.sigmoid_cross_entropy_with_logits``.
-    """
-    # try: # TF 1.0
-    return tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output, name=name))
-    # except:
-    #     return tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=output, targets=target))
-
-
-def binary_cross_entropy(output, target, epsilon=1e-8, name='bce_loss'):
-    """Computes binary cross entropy given `output`.
-
-    For brevity, let `x = output`, `z = target`.  The binary cross entropy loss is
-
-        loss(x, z) = - sum_i (x[i] * log(z[i]) + (1 - x[i]) * log(1 - z[i]))
-
-    Parameters
-    ----------
-    output : tensor of type `float32` or `float64`.
-    target : tensor of the same type and shape as `output`.
-    epsilon : float
-        A small value to avoid output is zero.
-    name : string
-        An optional name to attach to this layer.
-
-    References
-    -----------
-    - `DRAW <https://github.com/ericjang/draw/blob/master/draw.py#L73>`_
-    """
-#     from tensorflow.python.framework import ops
-#     with ops.op_scope([output, target], name, "bce_loss") as name:
-#         output = ops.convert_to_tensor(output, name="preds")
-#         target = ops.convert_to_tensor(targets, name="target")
-    with tf.name_scope(name):
-        return tf.reduce_mean(tf.reduce_sum(-(target * tf.log(output + epsilon) +
-                              (1. - target) * tf.log(1. - output + epsilon)), axis=1))
-
-
-def mean_squared_error(output, target, is_mean=False):
-    """Return the TensorFlow expression of mean-square-error of two distributions.
-
-    Parameters
-    ----------
-    output : 2D or 4D tensor.
-    target : 2D or 4D tensor.
-    is_mean : boolean, if True, use ``tf.reduce_mean`` to compute the loss of one data, otherwise, use ``tf.reduce_sum`` (default).
-
-    References
-    ------------
-    - `Wiki Mean Squared Error <https://en.wikipedia.org/wiki/Mean_squared_error>`_
-    """
-    with tf.name_scope("mean_squared_error_loss"):
-        if output.get_shape().ndims == 2:   # [batch_size, n_feature]
-            if is_mean:
-                mse = tf.reduce_mean(tf.reduce_mean(tf.squared_difference(output, target), 1))
-            else:
-                mse = tf.reduce_mean(tf.reduce_sum(tf.squared_difference(output, target), 1))
-        elif output.get_shape().ndims == 4: # [batch_size, w, h, c]
-            if is_mean:
-                mse = tf.reduce_mean(tf.reduce_mean(tf.squared_difference(output, target), [1, 2, 3]))
-            else:
-                mse = tf.reduce_mean(tf.reduce_sum(tf.squared_difference(output, target), [1, 2, 3]))
-        return mse
-
-def normalized_mean_square_error(output, target):
-    """Return the TensorFlow expression of normalized mean-square-error of two distributions.
-
-    Parameters
-    ----------
-    output : 2D or 4D tensor.
-    target : 2D or 4D tensor.
-    """
-    with tf.name_scope("mean_squared_error_loss"):
-        if output.get_shape().ndims == 2:   # [batch_size, n_feature]
-            nmse_a = tf.sqrt(tf.reduce_sum(tf.squared_difference(output, target), axis=1))
-            nmse_b = tf.sqrt(tf.reduce_sum(tf.square(target), axis=1))
-        elif output.get_shape().ndims == 4: # [batch_size, w, h, c]
-            nmse_a = tf.sqrt(tf.reduce_sum(tf.squared_difference(output, target), axis=[1,2,3]))
-            nmse_b = tf.sqrt(tf.reduce_sum(tf.square(target), axis=[1,2,3]))
-        nmse = tf.reduce_mean(nmse_a / nmse_b)
-    return nmse
-
-
-
-def dice_coe(output, target, loss_type='jaccard', axis=[1,2,3], smooth=1e-5):
-    """Soft dice (Sørensen or Jaccard) coefficient for comparing the similarity
-    of two batch of data, usually be used for binary image segmentation
-    i.e. labels are binary. The coefficient between 0 to 1, 1 means totally match.
-
-    Parameters
-    -----------
-    output : tensor
-        A distribution with shape: [batch_size, ....], (any dimensions).
-    target : tensor
-        A distribution with shape: [batch_size, ....], (any dimensions).
-    loss_type : string
-        ``jaccard`` or ``sorensen``, default is ``jaccard``.
-    axis : list of integer
-        All dimensions are reduced, default ``[1,2,3]``.
-    smooth : float
-        This small value will be added to the numerator and denominator.
-        If both output and target are empty, it makes sure dice is 1.
-        If either output or target are empty (all pixels are background), dice = ```smooth/(small_value + smooth)``,
-        then if smooth is very small, dice close to 0 (even the image values lower than the threshold),
-        so in this case, higher smooth can have a higher dice.
-
-    Examples
-    ---------
-    >>> outputs = tl.act.pixel_wise_softmax(network.outputs)
-    >>> dice_loss = 1 - tl.cost.dice_coe(outputs, y_)
-
-    References
-    -----------
-    - `Wiki-Dice <https://en.wikipedia.org/wiki/Sørensen–Dice_coefficient>`_
-    """
-    inse = tf.reduce_sum(output * target, axis=axis)
-    if loss_type == 'jaccard':
-        l = tf.reduce_sum(output * output, axis=axis)
-        r = tf.reduce_sum(target * target, axis=axis)
-    elif loss_type == 'sorensen':
-        l = tf.reduce_sum(output, axis=axis)
-        r = tf.reduce_sum(target, axis=axis)
-    else:
-        raise Exception("Unknow loss_type")
-    ## old axis=[0,1,2,3]
-    # dice = 2 * (inse) / (l + r)
-    # epsilon = 1e-5
-    # dice = tf.clip_by_value(dice, 0, 1.0-epsilon) # if all empty, dice = 1
-    ## new haodong
-    dice = (2. * inse + smooth) / (l + r + smooth)
-    ##
-    dice = tf.reduce_mean(dice)
-    return dice
-
-
-def dice_hard_coe(output, target, threshold=0.5, axis=[1,2,3], smooth=1e-5):
-    """Non-differentiable Sørensen–Dice coefficient for comparing the similarity
-    of two batch of data, usually be used for binary image segmentation i.e. labels are binary.
-    The coefficient between 0 to 1, 1 if totally match.
-
-    Parameters
-    -----------
-    output : tensor
-        A distribution with shape: [batch_size, ....], (any dimensions).
-    target : tensor
-        A distribution with shape: [batch_size, ....], (any dimensions).
-    threshold : float
-        The threshold value to be true.
-    axis : list of integer
-        All dimensions are reduced, default ``[1,2,3]``.
-    smooth : float
-        This small value will be added to the numerator and denominator, see ``dice_coe``.
-
-    References
-    -----------
-    - `Wiki-Dice <https://en.wikipedia.org/wiki/Sørensen–Dice_coefficient>`_
-    """
-    output = tf.cast(output > threshold, dtype=tf.float32)
-    target = tf.cast(target > threshold, dtype=tf.float32)
-    inse = tf.reduce_sum(tf.multiply(output, target), axis=axis)
-    l = tf.reduce_sum(output, axis=axis)
-    r = tf.reduce_sum(target, axis=axis)
-    ## old axis=[0,1,2,3]
-    # hard_dice = 2 * (inse) / (l + r)
-    # epsilon = 1e-5
-    # hard_dice = tf.clip_by_value(hard_dice, 0, 1.0-epsilon)
-    ## new haodong
-    hard_dice = (2. * inse + smooth) / (l + r + smooth)
-    ##
-    hard_dice = tf.reduce_mean(hard_dice)
-    return hard_dice
-
-
-def iou_coe(output, target, threshold=0.5, axis=[1,2,3], smooth=1e-5):
-    """Non-differentiable Intersection over Union (IoU) for comparing the
-    similarity of two batch of data, usually be used for evaluating binary image segmentation.
-    The coefficient between 0 to 1, 1 means totally match.
-
-    Parameters
-    -----------
-    output : tensor
-        A distribution with shape: [batch_size, ....], (any dimensions).
-    target : tensor
-        A distribution with shape: [batch_size, ....], (any dimensions).
-    threshold : float
-        The threshold value to be true.
-    axis : list of integer
-        All dimensions are reduced, default ``[1,2,3]``.
-    smooth : float
-        This small value will be added to the numerator and denominator, see ``dice_coe``.
-
-    Notes
-    ------
-    - IoU cannot be used as training loss, people usually use dice coefficient for training, IoU and hard-dice for evaluating.
-    """
-    pre = tf.cast(output > threshold, dtype=tf.float32)
-    truth = tf.cast(target > threshold, dtype=tf.float32)
-    inse = tf.reduce_sum(tf.multiply(pre, truth), axis=axis) # AND
-    union = tf.reduce_sum(tf.cast(tf.add(pre, truth)>= 1, dtype=tf.float32), axis=axis) # OR
-    ## old axis=[0,1,2,3]
-    # epsilon = 1e-5
-    # batch_iou = inse / (union + epsilon)
-    ## new haodong
-    batch_iou = (inse + smooth) / (union + smooth)
-    iou = tf.reduce_mean(batch_iou)
-    return iou#, pre, truth, inse, union
-
-# ## test soft/hard dice and iou
-# import numpy as np
-# y = np.zeros((1,10,10,1))
-# # y[0,0:5,0:5]=1.0
-# o = np.zeros((1,10,10,1))
-# # o[:,:,:,:] = 0            # what we want: dice=0   iou=0  OK
-# # o[0,0:2,0:2]=0.3          # what we want: dice larger iou=0  OK
-# # o[0,0:2,0:2]=0.6          # what we want: dice larger  iou small  OK
-# # o[0,0:3,0:3]=0.6          # what we want: dice larger iou larger OK
-# # o[0,0:3,0:3]=1            # what we want: dice larger iou same OK
-# # o[0,0:5,0:5]=1            # what we want: dice=1 iou=1  OK
-# # o[0,0:5,0:5]=0.3          # what we want: dice smaller  iou=0  OK
-# # o[0,0:5,0:5]=1e-2           # what we want: dice≈0 iou=0  OK
-# # o[0,8:10,8:10]=1.0        # what we want: dice=0 iou=0  OK
-# # o[0,8:10,8:10]=1e-10        # what we want: dice=0 iou=0  OK
-# # y[:,:,:,:] = o[:,:,:,:] = 0 # what we want: dice=1 iou=1  OK
-# ## why in u-net, dice=1 hard-dice=1 iou=1 exist?? print bug?
-#
-# d = dice_coe(o, y, 'jaccard', smooth=1.)
-# hd = dice_hard_coe(o, y, smooth=1e-5)
-# i = iou_coe(o, y, smooth=1e-5)
-# sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
-# # sess.run(tf.local_variables_initializer())
-# print(sess.run([d,hd,i]))
-# # p, t, i, u = sess.run([pre, truth, inse, union])
-# # import pprint
-# # pprint.pprint(((y>0.5)*(o>0.5)).astype(int).tolist())
-# # pprint.pprint(p.tolist())
-# # pprint.pprint(t.tolist())
-# # pprint.pprint(i)
-# # pprint.pprint(u)
-# exit()
-
-
-def cross_entropy_seq(logits, target_seqs, batch_size=None):#, batch_size=1, num_steps=None):
-    """Returns the expression of cross-entropy of two sequences, implement
-    softmax internally. Normally be used for Fixed Length RNN outputs.
-
-    Parameters
-    ----------
-    logits : Tensorflow variable
-        2D tensor, ``network.outputs``, [batch_size*n_steps (n_examples), number of output units]
-    target_seqs : Tensorflow variable
-        target : 2D tensor [batch_size, n_steps], if the number of step is dynamic, please use ``cross_entropy_seq_with_mask`` instead.
-    batch_size : None or int.
-        If not None, the return cost will be divided by batch_size.
-
-    Examples
-    --------
-    >>> see PTB tutorial for more details
-    >>> input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
-    >>> targets = tf.placeholder(tf.int32, [batch_size, num_steps])
-    >>> cost = tl.cost.cross_entropy_seq(network.outputs, targets)
-    """
-    # try: # TF 1.0
-    sequence_loss_by_example_fn = tf.contrib.legacy_seq2seq.sequence_loss_by_example
-    # except:
-    #     sequence_loss_by_example_fn = tf.nn.seq2seq.sequence_loss_by_example
-
-    loss = sequence_loss_by_example_fn(
-        [logits],
-        [tf.reshape(target_seqs, [-1])],
-        [tf.ones_like(tf.reshape(target_seqs, [-1]), dtype=tf.float32)])
-        # [tf.ones([batch_size * num_steps])])
-    cost = tf.reduce_sum(loss) #/ batch_size
-    if batch_size is not None:
-        cost = cost / batch_size
-    return cost
-
-
-def cross_entropy_seq_with_mask(logits, target_seqs, input_mask, return_details=False, name=None):
-    """Returns the expression of cross-entropy of two sequences, implement
-    softmax internally. Normally be used for Dynamic RNN outputs.
-
-    Parameters
-    -----------
-    logits : network identity outputs
-        2D tensor, ``network.outputs``, [batch_size, number of output units].
-    target_seqs : int of tensor, like word ID.
-        [batch_size, ?]
-    input_mask : the mask to compute loss
-        The same size with target_seqs, normally 0 and 1.
-    return_details : boolean
-        - If False (default), only returns the loss.
-        - If True, returns the loss, losses, weights and targets (reshape to one vetcor).
-
-    Examples
-    --------
-    - see Image Captioning Example.
-    """
-    targets = tf.reshape(target_seqs, [-1])   # to one vector
-    weights = tf.to_float(tf.reshape(input_mask, [-1]))   # to one vector like targets
-    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets, name=name) * weights
-    #losses = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets, name=name)) # for TF1.0 and others
-
-    # try: ## TF1.0
-    loss = tf.divide(tf.reduce_sum(losses),   # loss from mask. reduce_sum before element-wise mul with mask !!
-                    tf.reduce_sum(weights),
-                    name="seq_loss_with_mask")
-    # except: ## TF0.12
-    #     loss = tf.div(tf.reduce_sum(losses),   # loss from mask. reduce_sum before element-wise mul with mask !!
-    #                     tf.reduce_sum(weights),
-    #                     name="seq_loss_with_mask")
-    if return_details:
-        return loss, losses, weights, targets
-    else:
-        return loss
-
-
-def cosine_similarity(v1, v2):
-    """Cosine similarity [-1, 1], `wiki <https://en.wikipedia.org/wiki/Cosine_similarity>`_.
-
-    Parameters
-    -----------
-    v1, v2 : tensor of [batch_size, n_feature], with the same number of features.
-
-    Returns
-    -----------
-    a tensor of [batch_size, ]
-    """
-    # try: ## TF1.0
-    cost = tf.reduce_sum(tf.multiply(v1, v2), 1) / (tf.sqrt(tf.reduce_sum(tf.multiply(v1, v1), 1)) * tf.sqrt(tf.reduce_sum(tf.multiply(v2, v2), 1)))
-    # except: ## TF0.12
-    #     cost = tf.reduce_sum(tf.mul(v1, v2), reduction_indices=1) / (tf.sqrt(tf.reduce_sum(tf.mul(v1, v1), reduction_indices=1)) * tf.sqrt(tf.reduce_sum(tf.mul(v2, v2), reduction_indices=1)))
-    return cost
-
-
-## Regularization Functions
-def li_regularizer(scale, scope=None):
-  """li regularization removes the neurons of previous layer, `i` represents `inputs`.\n
-  Returns a function that can be used to apply group li regularization to weights.\n
-  The implementation follows `TensorFlow contrib <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/layers/python/layers/regularizers.py>`_.
-
-  Parameters
-  ----------
-  scale : float
-    A scalar multiplier `Tensor`. 0.0 disables the regularizer.
-  scope: An optional scope name for TF12+.
-
-  Returns
-  --------
-  A function with signature `li(weights, name=None)` that apply Li regularization.
-
-  Raises
-  ------
-  ValueError : if scale is outside of the range [0.0, 1.0] or if scale is not a float.
-  """
-  import numbers
-  from tensorflow.python.framework import ops
-  from tensorflow.python.ops import standard_ops
-  # from tensorflow.python.platform import tf_logging as logging
-
-  if isinstance(scale, numbers.Integral):
-    raise ValueError('scale cannot be an integer: %s' % scale)
-  if isinstance(scale, numbers.Real):
-    if scale < 0.:
-      raise ValueError('Setting a scale less than 0 on a regularizer: %g' %
-                       scale)
-    if scale >= 1.:
-      raise ValueError('Setting a scale greater than 1 on a regularizer: %g' %
-                       scale)
-    if scale == 0.:
-      logging.info('Scale of 0 disables regularizer.')
-      return lambda _, name=None: None
-
-  def li(weights, name=None):
-    """Applies li regularization to weights."""
-    with tf.name_scope('li_regularizer') as scope:
-        my_scale = ops.convert_to_tensor(scale,
-                                           dtype=weights.dtype.base_dtype,
-                                           name='scale')
-        # if tf.__version__ <= '0.12':
-        #     standard_ops_fn = standard_ops.mul
-        # else:
-        standard_ops_fn = standard_ops.multiply
-        return standard_ops_fn(
-          my_scale,
-          standard_ops.reduce_sum(standard_ops.sqrt(standard_ops.reduce_sum(tf.square(weights), 1))),
-          name=scope)
-  return li
-
-
-
-def lo_regularizer(scale, scope=None):
-  """lo regularization removes the neurons of current layer, `o` represents `outputs`\n
-  Returns a function that can be used to apply group lo regularization to weights.\n
-  The implementation follows `TensorFlow contrib <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/layers/python/layers/regularizers.py>`_.
-
-  Parameters
-  ----------
-  scale : float
-    A scalar multiplier `Tensor`. 0.0 disables the regularizer.
-  scope: An optional scope name for TF12+.
-
-  Returns
-  -------
-  A function with signature `lo(weights, name=None)` that apply Lo regularization.
-
-  Raises
-  ------
-  ValueError : If scale is outside of the range [0.0, 1.0] or if scale is not a float.
-  """
-  import numbers
-  from tensorflow.python.framework import ops
-  from tensorflow.python.ops import standard_ops
-  # from tensorflow.python.platform import tf_logging as logging
-
-  if isinstance(scale, numbers.Integral):
-    raise ValueError('scale cannot be an integer: %s' % scale)
-  if isinstance(scale, numbers.Real):
-    if scale < 0.:
-      raise ValueError('Setting a scale less than 0 on a regularizer: %g' %
-                       scale)
-    if scale >= 1.:
-      raise ValueError('Setting a scale greater than 1 on a regularizer: %g' %
-                       scale)
-    if scale == 0.:
-      logging.info('Scale of 0 disables regularizer.')
-      return lambda _, name=None: None
-
-  def lo(weights, name='lo_regularizer'):
-    """Applies group column regularization to weights."""
-    with tf.name_scope(name) as scope:
-        my_scale = ops.convert_to_tensor(scale,
-                                       dtype=weights.dtype.base_dtype,
-                                       name='scale')
-        # if tf.__version__ <= '0.12':
-        #     standard_ops_fn = standard_ops.mul
-        # else:
-        standard_ops_fn = standard_ops.multiply
-        return standard_ops_fn(
-          my_scale,
-          standard_ops.reduce_sum(standard_ops.sqrt(standard_ops.reduce_sum(tf.square(weights), 0))),
-          name=scope)
-  return lo
-
-def maxnorm_regularizer(scale=1.0, scope=None):
-  """Max-norm regularization returns a function that can be used
-  to apply max-norm regularization to weights.
-  About max-norm: `wiki <https://en.wikipedia.org/wiki/Matrix_norm#Max_norm>`_.\n
-  The implementation follows `TensorFlow contrib <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/layers/python/layers/regularizers.py>`_.
-
-  Parameters
-  ----------
-  scale : float
-    A scalar multiplier `Tensor`. 0.0 disables the regularizer.
-  scope: An optional scope name.
-
-  Returns
-  ---------
-  A function with signature `mn(weights, name=None)` that apply Lo regularization.
-
-  Raises
-  --------
-  ValueError : If scale is outside of the range [0.0, 1.0] or if scale is not a float.
-  """
-  import numbers
-  from tensorflow.python.framework import ops
-  from tensorflow.python.ops import standard_ops
-
-  if isinstance(scale, numbers.Integral):
-    raise ValueError('scale cannot be an integer: %s' % scale)
-  if isinstance(scale, numbers.Real):
-    if scale < 0.:
-      raise ValueError('Setting a scale less than 0 on a regularizer: %g' %
-                       scale)
-    # if scale >= 1.:
-    #   raise ValueError('Setting a scale greater than 1 on a regularizer: %g' %
-    #                    scale)
-    if scale == 0.:
-      logging.info('Scale of 0 disables regularizer.')
-      return lambda _, name=None: None
-
-  def mn(weights, name='max_regularizer'):
-    """Applies max-norm regularization to weights."""
-    with tf.name_scope(name) as scope:
-          my_scale = ops.convert_to_tensor(scale,
-                                           dtype=weights.dtype.base_dtype,
-                                           name='scale')
-        #   if tf.__version__ <= '0.12':
-        #       standard_ops_fn = standard_ops.mul
-        #   else:
-          standard_ops_fn = standard_ops.multiply
-          return standard_ops_fn(my_scale, standard_ops.reduce_max(standard_ops.abs(weights)), name=scope)
-  return mn
-
-def maxnorm_o_regularizer(scale, scope):
-  """Max-norm output regularization removes the neurons of current layer.\n
-  Returns a function that can be used to apply max-norm regularization to each column of weight matrix.\n
-  The implementation follows `TensorFlow contrib <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/layers/python/layers/regularizers.py>`_.
-
-  Parameters
-  ----------
-  scale : float
-    A scalar multiplier `Tensor`. 0.0 disables the regularizer.
-  scope: An optional scope name.
-
-  Returns
-  ---------
-  A function with signature `mn_o(weights, name=None)` that apply Lo regularization.
-
-  Raises
-  ---------
-  ValueError : If scale is outside of the range [0.0, 1.0] or if scale is not a float.
-  """
-  import numbers
-  from tensorflow.python.framework import ops
-  from tensorflow.python.ops import standard_ops
-
-  if isinstance(scale, numbers.Integral):
-    raise ValueError('scale cannot be an integer: %s' % scale)
-  if isinstance(scale, numbers.Real):
-    if scale < 0.:
-      raise ValueError('Setting a scale less than 0 on a regularizer: %g' %
-                       scale)
-    # if scale >= 1.:
-    #   raise ValueError('Setting a scale greater than 1 on a regularizer: %g' %
-    #                    scale)
-    if scale == 0.:
-      logging.info('Scale of 0 disables regularizer.')
-      return lambda _, name=None: None
-
-  def mn_o(weights, name='maxnorm_o_regularizer'):
-     """Applies max-norm regularization to weights."""
-     with tf.name_scope(name) as scope:
-          my_scale = ops.convert_to_tensor(scale,
-                                           dtype=weights.dtype.base_dtype,
-                                                   name='scale')
-          if tf.__version__ <= '0.12':
-             standard_ops_fn = standard_ops.mul
-          else:
-             standard_ops_fn = standard_ops.multiply
-          return standard_ops_fn(my_scale, standard_ops.reduce_sum(standard_ops.reduce_max(standard_ops.abs(weights), 0)), name=scope)
-  return mn_o
-
-def maxnorm_i_regularizer(scale, scope=None):
-  """Max-norm input regularization removes the neurons of previous layer.\n
-  Returns a function that can be used to apply max-norm regularization to each row of weight matrix.\n
-  The implementation follows `TensorFlow contrib <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/layers/python/layers/regularizers.py>`_.
-
-  Parameters
-  ----------
-  scale : float
-    A scalar multiplier `Tensor`. 0.0 disables the regularizer.
-  scope: An optional scope name.
-
-  Returns
-  ---------
-  A function with signature `mn_i(weights, name=None)` that apply Lo regularization.
-
-  Raises
-  ---------
-  ValueError : If scale is outside of the range [0.0, 1.0] or if scale is not a float.
-  """
-  import numbers
-  from tensorflow.python.framework import ops
-  from tensorflow.python.ops import standard_ops
-
-  if isinstance(scale, numbers.Integral):
-    raise ValueError('scale cannot be an integer: %s' % scale)
-  if isinstance(scale, numbers.Real):
-    if scale < 0.:
-      raise ValueError('Setting a scale less than 0 on a regularizer: %g' %
-                       scale)
-    # if scale >= 1.:
-    #   raise ValueError('Setting a scale greater than 1 on a regularizer: %g' %
-    #                    scale)
-    if scale == 0.:
-      logging.info('Scale of 0 disables regularizer.')
-      return lambda _, name=None: None
-
-  def mn_i(weights, name='maxnorm_i_regularizer'):
-     """Applies max-norm regularization to weights."""
-     with tf.name_scope(name) as scope:
-          my_scale = ops.convert_to_tensor(scale,
-                                           dtype=weights.dtype.base_dtype,
-                                                   name='scale')
-          if tf.__version__ <= '0.12':
-             standard_ops_fn = standard_ops.mul
-          else:
-             standard_ops_fn = standard_ops.multiply
-          return standard_ops_fn(my_scale, standard_ops.reduce_sum(standard_ops.reduce_max(standard_ops.abs(weights), 1)), name=scope)
-  return mn_i
-
-
-
-
-
-#
diff --git a/tensorlayer/db.py b/tensorlayer/db.py
deleted file mode 100644
index af9f5ba7..00000000
--- a/tensorlayer/db.py
+++ /dev/null
@@ -1,552 +0,0 @@
-#! /usr/bin/python
-# -*- coding: utf8 -*-
-"""
-Experimental Database Management System.
-
-Latest Version
-"""
-
-
-import tensorflow as tf
-import tensorlayer as tl
-import numpy as np
-import time
-import math
-
-
-import uuid
-
-import pymongo
-import gridfs
-import pickle
-from pymongo import MongoClient
-from datetime import datetime
-
-import inspect
-
-def AutoFill(func):
-    def func_wrapper(self,*args,**kwargs):
-        d=inspect.getcallargs(func,self,*args,**kwargs)
-        d['args'].update({"studyID":self.studyID})
-        return  func(**d)
-    return func_wrapper
-
-
-
-
-
-
-class TensorDB(object):
-    """TensorDB is a MongoDB based manager that help you to manage data, network topology, parameters and logging.
-
-    Parameters
-    -------------
-    ip : string, localhost or IP address.
-    port : int, port number.
-    db_name : string, database name.
-    user_name : string, set to None if it donnot need authentication.
-    password : string.
-
-    Properties
-    ------------
-    db : ``pymongo.MongoClient[db_name]``, xxxxxx
-    datafs : ``gridfs.GridFS(self.db, collection="datafs")``, xxxxxxxxxx
-    modelfs : ``gridfs.GridFS(self.db, collection="modelfs")``,
-    paramsfs : ``gridfs.GridFS(self.db, collection="paramsfs")``,
-    db.Params : Collection for
-    db.TrainLog : Collection for
-    db.ValidLog : Collection for
-    db.TestLog : Collection for
-    studyID : string, unique ID, if None random generate one.
-
-    Dependencies
-    -------------
-    1 : MongoDB, as TensorDB is based on MongoDB, you need to install it in your
-       local machine or remote machine.
-    2 : pip install pymongo, for MongoDB python API.
-
-    Optional Tools
-    ----------------
-    1 : You may like to install MongoChef or Mongo Management Studo APP for
-       visualizing or testing your MongoDB.
-    """
-    def __init__(
-        self,
-        ip = 'localhost',
-        port = 27017,
-        db_name = 'db_name',
-        user_name = None,
-        password = 'password',
-        studyID=None
-    ):
-        ## connect mongodb
-        client = MongoClient(ip, port)
-        self.db = client[db_name]
-        if user_name != None:
-            self.db.authenticate(user_name, password)
-
-
-        if studyID is None:
-            self.studyID=str(uuid.uuid1())
-        else:
-            self.studyID=studyID
-
-        ## define file system (Buckets)
-        self.datafs = gridfs.GridFS(self.db, collection="datafs")
-        self.modelfs = gridfs.GridFS(self.db, collection="modelfs")
-        self.paramsfs = gridfs.GridFS(self.db, collection="paramsfs")
-        self.archfs=gridfs.GridFS(self.db,collection="ModelArchitecture")
-        ##
-        print("[TensorDB] Connect SUCCESS {}:{} {} {} {}".format(ip, port, db_name, user_name, studyID))
-
-        self.ip = ip
-        self.port = port
-        self.db_name = db_name
-        self.user_name = user_name
-
-    def __autofill(self,args):
-        return args.update({'studyID':self.studyID})
-
-    def __serialization(self,ps):
-        return pickle.dumps(ps, protocol=2)
-
-    def __deserialization(self,ps):
-        return pickle.loads(ps)
-
-    def save_params(self, params=[], args={}):#, file_name='parameters'):
-        """ Save parameters into MongoDB Buckets, and save the file ID into Params Collections.
-
-        Parameters
-        ----------
-        params : a list of parameters
-        args : dictionary, item meta data.
-
-        Returns
-        ---------
-        f_id : the Buckets ID of the parameters.
-        """
-        self.__autofill(args)
-        s = time.time()
-        f_id = self.paramsfs.put(self.__serialization(params))#, file_name=file_name)
-        args.update({'f_id': f_id, 'time': datetime.utcnow()})
-        self.db.Params.insert_one(args)
-        # print("[TensorDB] Save params: {} SUCCESS, took: {}s".format(file_name, round(time.time()-s, 2)))
-        print("[TensorDB] Save params: SUCCESS, took: {}s".format(round(time.time()-s, 2)))
-        return f_id
-
-    @AutoFill
-    def find_one_params(self, args={},sort=None):
-        """ Find one parameter from MongoDB Buckets.
-
-        Parameters
-        ----------
-        args : dictionary, find items.
-
-        Returns
-        --------
-        params : the parameters, return False if nothing found.
-        f_id : the Buckets ID of the parameters, return False if nothing found.
-        """
-
-        s = time.time()
-        # print(args)
-        d = self.db.Params.find_one(filter=args,sort=sort)
-
-        if d is not None:
-            f_id = d['f_id']
-        else:
-            print("[TensorDB] FAIL! Cannot find: {}".format(args))
-            return False, False
-        try:
-            params = self.__deserialization(self.paramsfs.get(f_id).read())
-            print("[TensorDB] Find one params SUCCESS, {} took: {}s".format(args, round(time.time()-s, 2)))
-            return params, f_id
-        except:
-            return False, False
-
-    @AutoFill
-    def find_all_params(self, args={}):
-        """ Find all parameter from MongoDB Buckets
-
-        Parameters
-        ----------
-        args : dictionary, find items
-
-        Returns
-        --------
-        params : the parameters, return False if nothing found.
-
-        """
-
-        s = time.time()
-        pc = self.db.Params.find(args)
-
-        if pc is not None:
-            f_id_list = pc.distinct('f_id')
-            params = []
-            for f_id in f_id_list: # you may have multiple Buckets files
-                tmp = self.paramsfs.get(f_id).read()
-                params.append(self.__deserialization(tmp))
-        else:
-            print("[TensorDB] FAIL! Cannot find any: {}".format(args))
-            return False
-
-        print("[TensorDB] Find all params SUCCESS, took: {}s".format(round(time.time()-s, 2)))
-        return params
-
-    @AutoFill
-    def del_params(self, args={}):
-        """ Delete params in MongoDB uckets.
-
-        Parameters
-        -----------
-        args : dictionary, find items to delete, leave it empty to delete all parameters.
-        """
-
-        pc = self.db.Params.find(args)
-        f_id_list = pc.distinct('f_id')
-        # remove from Buckets
-        for f in f_id_list:
-            self.paramsfs.delete(f)
-        # remove from Collections
-        self.db.Params.remove(args)
-
-        print("[TensorDB] Delete params SUCCESS: {}".format(args))
-
-    def _print_dict(self, args):
-        # return " / ".join(str(key) + ": "+ str(value) for key, value in args.items())
-
-        string = ''
-        for key, value in args.items():
-            if key is not '_id':
-                string += str(key) + ": "+ str(value) + " / "
-        return string
-
-    ## =========================== LOG =================================== ##
-    @AutoFill
-    def train_log(self, args={}):
-        """Save the training log.
-
-        Parameters
-        -----------
-        args : dictionary, items to save.
-
-        Examples
-        ---------
-        >>> db.train_log(time=time.time(), {'loss': loss, 'acc': acc})
-        """
-
-        _result = self.db.TrainLog.insert_one(args)
-        _log = self._print_dict(args)
-        #print("[TensorDB] TrainLog: " +_log)
-        return _result
-
-    @AutoFill
-    def del_train_log(self, args={}):
-        """ Delete train log.
-
-        Parameters
-        -----------
-        args : dictionary, find items to delete, leave it empty to delete all log.
-        """
-
-        self.db.TrainLog.delete_many(args)
-        print("[TensorDB] Delete TrainLog SUCCESS")
-
-    @AutoFill
-    def valid_log(self, args={}):
-        """Save the validating log.
-
-        Parameters
-        -----------
-        args : dictionary, items to save.
-
-        Examples
-        ---------
-        >>> db.valid_log(time=time.time(), {'loss': loss, 'acc': acc})
-        """
-
-        _result = self.db.ValidLog.insert_one(args)
-        # _log = "".join(str(key) + ": " + str(value) for key, value in args.items())
-        _log = self._print_dict(args)
-        print("[TensorDB] ValidLog: " +_log)
-        return _result
-
-    @AutoFill
-    def del_valid_log(self, args={}):
-        """ Delete validation log.
-
-        Parameters
-        -----------
-        args : dictionary, find items to delete, leave it empty to delete all log.
-        """
-        self.db.ValidLog.delete_many(args)
-        print("[TensorDB] Delete ValidLog SUCCESS")
-
-    @AutoFill
-    def test_log(self, args={}):
-        """Save the testing log.
-
-        Parameters
-        -----------
-        args : dictionary, items to save.
-
-        Examples
-        ---------
-        >>> db.test_log(time=time.time(), {'loss': loss, 'acc': acc})
-        """
-
-        _result = self.db.TestLog.insert_one(args)
-        # _log = "".join(str(key) + str(value) for key, value in args.items())
-        _log = self._print_dict(args)
-        print("[TensorDB] TestLog: " +_log)
-        return _result
-
-    @AutoFill
-    def del_test_log(self, args={}):
-        """ Delete test log.
-
-        Parameters
-        -----------
-        args : dictionary, find items to delete, leave it empty to delete all log.
-        """
-
-        self.db.TestLog.delete_many(args)
-        print("[TensorDB] Delete TestLog SUCCESS")
-
-    ## =========================== Network Architecture ================== ##
-    @AutoFill
-    def save_model_architecture(self,s,args={}):
-        self.__autofill(args)
-        fid=self.archfs.put(s,filename="modelarchitecture")
-        args.update({"fid":fid})
-        self.db.march.insert_one(args)
-
-    @AutoFill
-    def load_model_architecture(self,args={}):
-
-        d = self.db.march.find_one(args)
-        if d is not None:
-            fid = d['fid']
-            print(d)
-            print(fid)
-            # "print find"
-        else:
-            print("[TensorDB] FAIL! Cannot find: {}".format(args))
-            print ("no idtem")
-            return False, False
-        try:
-            archs = self.archfs.get(fid).read()
-            '''print("[TensorDB] Find one params SUCCESS, {} took: {}s".format(args, round(time.time()-s, 2)))'''
-            return archs, fid
-        except Exception as e:
-            print("exception")
-            print(e)
-            return False, False
-
-    @AutoFill
-    def save_job(self, script=None, args={}):
-        """Save the job.
-
-        Parameters
-        -----------
-        script : a script file name or None.
-        args : dictionary, items to save.
-
-        Examples
-        ---------
-        >>> # Save your job
-        >>> db.save_job('your_script.py', {'job_id': 1, 'learning_rate': 0.01, 'n_units': 100})
-        >>> # Run your job
-        >>> temp = db.find_one_job(args={'job_id': 1})
-        >>> print(temp['learning_rate'])
-        ... 0.01
-        >>> import _your_script
-        ... running your script
-        """
-        self.__autofill(args)
-        if script is not None:
-            _script = open(script, 'rb').read()
-            args.update({'script': _script, 'script_name': script})
-        # _result = self.db.Job.insert_one(args)
-        _result = self.db.Job.replace_one(args, args, upsert=True)
-        _log = self._print_dict(args)
-        print("[TensorDB] Save Job: script={}, args={}".format(script, args))
-        return _result
-
-    @AutoFill
-    def find_one_job(self, args={}):
-        """ Find one job from MongoDB Job Collections.
-
-        Parameters
-        ----------
-        args : dictionary, find items.
-
-        Returns
-        --------
-        dictionary : contains all meta data and script.
-        """
-
-
-        temp = self.db.Job.find_one(args)
-
-        if temp is not None:
-            if 'script_name' in temp.keys():
-                f = open('_' + temp['script_name'], 'wb')
-                f.write(temp['script'])
-                f.close()
-            print("[TensorDB] Find Job: {}".format(args))
-        else:
-            print("[TensorDB] FAIL! Cannot find any: {}".format(args))
-            return False
-
-        return temp
-
-    def push_job(self,margs, wargs,dargs,epoch):
-
-        ms,mid=self.load_model_architecture(margs)
-        weight,wid=self.find_one_params(wargs)
-        args={"weight":wid,"model":mid,"dargs":dargs,"epoch":epoch,"time":datetime.utcnow(),"Running":False}
-        self.__autofill(args)
-        self.db.JOBS.insert_one(args)
-
-    def peek_job(self):
-        args={'Running':False}
-        self.__autofill(args)
-        m=self.db.JOBS.find_one(args)
-        print(m)
-        if m is None:
-            return False
-
-        s=self.paramsfs.get(m['weight']).read()
-        w=self.__deserialization(s)
-
-        ach=self.archfs.get(m['model']).read()
-
-        return m['_id'], ach,w,m["dargs"],m['epoch']
-
-    def run_job(self,jid):
-        self.db.JOBS.find_one_and_update({'_id':jid},{'$set': {'Running': True,"Since":datetime.utcnow()}})
-
-    def del_job(self,jid):
-        self.db.JOBS.find_one_and_update({'_id':jid},{'$set': {'Running': True,"Finished":datetime.utcnow()}})
-
-    def __str__(self):
-        _s = "[TensorDB] Info:\n"
-        _t = _s + "    " + str(self.db)
-        return _t
-
-    # def save_bulk_data(self, data=None, filename='filename'):
-    #     """ Put bulk data into TensorDB.datafs, return file ID.
-    #     When you have a very large data, you may like to save it into GridFS Buckets
-    #     instead of Collections, then when you want to load it, XXXX
-    #
-    #     Parameters
-    #     -----------
-    #     data : serialized data.
-    #     filename : string, GridFS Buckets.
-    #
-    #     References
-    #     -----------
-    #     - MongoDB find, xxxxx
-    #     """
-    #     s = time.time()
-    #     f_id = self.datafs.put(data, filename=filename)
-    #     print("[TensorDB] save_bulk_data: {} took: {}s".format(filename, round(time.time()-s, 2)))
-    #     return f_id
-    #
-    # def save_collection(self, data=None, collect_name='collect_name'):
-    #     """ Insert data into MongoDB Collections, return xx.
-    #
-    #     Parameters
-    #     -----------
-    #     data : serialized data.
-    #     collect_name : string, MongoDB collection name.
-    #
-    #     References
-    #     -----------
-    #     - MongoDB find, xxxxx
-    #     """
-    #     s = time.time()
-    #     rl = self.db[collect_name].insert_many(data)
-    #     print("[TensorDB] save_collection: {} took: {}s".format(collect_name, round(time.time()-s, 2)))
-    #     return rl
-    #
-    # def find(self, args={}, collect_name='collect_name'):
-    #     """ Find data from MongoDB Collections.
-    #
-    #     Parameters
-    #     -----------
-    #     args : dictionary, arguments for finding.
-    #     collect_name : string, MongoDB collection name.
-    #
-    #     References
-    #     -----------
-    #     - MongoDB find, xxxxx
-    #     """
-    #     s = time.time()
-    #
-    #     pc = self.db[collect_name].find(args)  # pymongo.cursor.Cursor object
-    #     flist = pc.distinct('f_id')
-    #     fldict = {}
-    #     for f in flist: # you may have multiple Buckets files
-    #         # fldict[f] = pickle.loads(self.datafs.get(f).read())
-    #         # s2 = time.time()
-    #         tmp = self.datafs.get(f).read()
-    #         # print(time.time()-s2)
-    #         fldict[f] = pickle.loads(tmp)
-    #         # print(time.time()-s2)
-    #         # exit()
-    #     # print(round(time.time()-s, 2))
-    #     data = [fldict[x['f_id']][x['id']] for x in pc]
-    #     data = np.asarray(data)
-    #     print("[TensorDB] find: {} get: {} took: {}s".format(collect_name, pc.count(), round(time.time()-s, 2)))
-    #     return data
-
-
-
-class DBLogger:
-    """ """
-    def __init__(self,db,model):
-        self.db=db
-        self.model=model
-
-    def on_train_begin(self,logs={}):
-        print("start")
-
-    def on_train_end(self,logs={}):
-        print("end")
-
-    def on_epoch_begin(self,epoch,logs={}):
-        self.epoch=epoch
-        self.et=time.time()
-        return
-
-    def on_epoch_end(self, epoch, logs={}):
-        self.et=time.time()-self.et
-        print("ending")
-        print(epoch)
-        logs['epoch']=epoch
-        logs['time']=datetime.utcnow()
-        logs['stepTime']=self.et
-        logs['acc']=np.asscalar(logs['acc'])
-        print(logs)
-
-        w=self.model.Params
-        fid=self.db.save_params(w,logs)
-        logs.update({'params':fid})
-        self.db.valid_log(logs)
-    def on_batch_begin(self, batch,logs={}):
-        self.t=time.time()
-        self.losses = []
-        self.batch=batch
-
-    def on_batch_end(self, batch, logs={}):
-        self.t2=time.time()-self.t
-        logs['acc']=np.asscalar(logs['acc'])
-        #logs['loss']=np.asscalar(logs['loss'])
-        logs['step_time']=self.t2
-        logs['time']=datetime.utcnow()
-        logs['epoch']=self.epoch
-        logs['batch']=self.batch
-        self.db.train_log(logs)
diff --git a/tensorlayer/files.py b/tensorlayer/files.py
deleted file mode 100644
index a79b5c6e..00000000
--- a/tensorlayer/files.py
+++ /dev/null
@@ -1,1246 +0,0 @@
-#! /usr/bin/python
-# -*- coding: utf8 -*-
-
-
-import tensorflow as tf
-import os
-import numpy as np
-import re
-import sys
-import tarfile
-import gzip
-import zipfile
-from . import visualize
-from . import nlp
-import pickle
-from six.moves import urllib
-from six.moves import cPickle
-from six.moves import zip
-from tensorflow.python.platform import gfile
-
-
-## Load dataset functions
-def load_mnist_dataset(shape=(-1,784), path="data/mnist/"):
-    """Automatically download MNIST dataset
-    and return the training, validation and test set with 50000, 10000 and 10000
-    digit images respectively.
-
-    Parameters
-    ----------
-    shape : tuple
-        The shape of digit images, defaults is (-1,784)
-    path : string
-        The path that the data is downloaded to, defaults is ``data/mnist/``.
-
-    Examples
-    --------
-    >>> X_train, y_train, X_val, y_val, X_test, y_test = tl.files.load_mnist_dataset(shape=(-1,784))
-    >>> X_train, y_train, X_val, y_val, X_test, y_test = tl.files.load_mnist_dataset(shape=(-1, 28, 28, 1))
-    """
-    # We first define functions for loading MNIST images and labels.
-    # For convenience, they also download the requested files if needed.
-    def load_mnist_images(path, filename):
-        filepath = maybe_download_and_extract(filename, path, 'http://yann.lecun.com/exdb/mnist/')
-
-        print(filepath)
-        # Read the inputs in Yann LeCun's binary format.
-        with gzip.open(filepath, 'rb') as f:
-            data = np.frombuffer(f.read(), np.uint8, offset=16)
-        # The inputs are vectors now, we reshape them to monochrome 2D images,
-        # following the shape convention: (examples, channels, rows, columns)
-        data = data.reshape(shape)
-        # The inputs come as bytes, we convert them to float32 in range [0,1].
-        # (Actually to range [0, 255/256], for compatibility to the version
-        # provided at http://deeplearning.net/data/mnist/mnist.pkl.gz.)
-        return data / np.float32(256)
-
-    def load_mnist_labels(path, filename):
-        filepath = maybe_download_and_extract(filename, path, 'http://yann.lecun.com/exdb/mnist/')
-        # Read the labels in Yann LeCun's binary format.
-        with gzip.open(filepath, 'rb') as f:
-            data = np.frombuffer(f.read(), np.uint8, offset=8)
-        # The labels are vectors of integers now, that's exactly what we want.
-        return data
-
-    # Download and read the training and test set images and labels.
-    print("Load or Download MNIST > {}".format(path))
-    X_train = load_mnist_images(path, 'train-images-idx3-ubyte.gz')
-    y_train = load_mnist_labels(path, 'train-labels-idx1-ubyte.gz')
-    X_test = load_mnist_images(path, 't10k-images-idx3-ubyte.gz')
-    y_test = load_mnist_labels(path, 't10k-labels-idx1-ubyte.gz')
-
-    # We reserve the last 10000 training examples for validation.
-    X_train, X_val = X_train[:-10000], X_train[-10000:]
-    y_train, y_val = y_train[:-10000], y_train[-10000:]
-
-    # We just return all the arrays in order, as expected in main().
-    # (It doesn't matter how we do this as long as we can read them again.)
-    X_train = np.asarray(X_train, dtype=np.float32)
-    y_train = np.asarray(y_train, dtype=np.int32)
-    X_val = np.asarray(X_val, dtype=np.float32)
-    y_val = np.asarray(y_val, dtype=np.int32)
-    X_test = np.asarray(X_test, dtype=np.float32)
-    y_test = np.asarray(y_test, dtype=np.int32)
-    return X_train, y_train, X_val, y_val, X_test, y_test
-
-def load_cifar10_dataset(shape=(-1, 32, 32, 3), path='data/cifar10/', plotable=False, second=3):
-    """The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, with
-    6000 images per class. There are 50000 training images and 10000 test images.
-
-    The dataset is divided into five training batches and one test batch, each with
-    10000 images. The test batch contains exactly 1000 randomly-selected images from
-    each class. The training batches contain the remaining images in random order,
-    but some training batches may contain more images from one class than another.
-    Between them, the training batches contain exactly 5000 images from each class.
-
-    Parameters
-    ----------
-    shape : tupe
-        The shape of digit images: e.g. (-1, 3, 32, 32) , (-1, 32, 32, 3) , (-1, 32, 32, 3)
-    plotable : True, False
-        Whether to plot some image examples.
-    second : int
-        If ``plotable`` is True, ``second`` is the display time.
-    path : string
-        The path that the data is downloaded to, defaults is ``data/cifar10/``.
-
-    Examples
-    --------
-    >>> X_train, y_train, X_test, y_test = tl.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=True)
-
-    References
-    ----------
-    - `CIFAR website <https://www.cs.toronto.edu/~kriz/cifar.html>`_
-    - `Data download link <https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz>`_
-    - `Code references <https://teratail.com/questions/28932>`_
-    """
-
-    print("Load or Download cifar10 > {}".format(path))
-
-    #Helper function to unpickle the data
-    def unpickle(file):
-        fp = open(file, 'rb')
-        if sys.version_info.major == 2:
-            data = pickle.load(fp)
-        elif sys.version_info.major == 3:
-            data = pickle.load(fp, encoding='latin-1')
-        fp.close()
-        return data
-
-    filename = 'cifar-10-python.tar.gz'
-    url = 'https://www.cs.toronto.edu/~kriz/'
-    #Download and uncompress file
-    maybe_download_and_extract(filename, path, url, extract=True)
-
-    #Unpickle file and fill in data
-    X_train = None
-    y_train = []
-    for i in range(1,6):
-        data_dic = unpickle(os.path.join(path, 'cifar-10-batches-py/', "data_batch_{}".format(i)))
-        if i == 1:
-            X_train = data_dic['data']
-        else:
-            X_train = np.vstack((X_train, data_dic['data']))
-        y_train += data_dic['labels']
-
-    test_data_dic = unpickle(os.path.join(path,  'cifar-10-batches-py/', "test_batch"))
-    X_test = test_data_dic['data']
-    y_test = np.array(test_data_dic['labels'])
-
-    if shape == (-1, 3, 32, 32):
-        X_test = X_test.reshape(shape)
-        X_train = X_train.reshape(shape)
-    elif shape == (-1, 32, 32, 3):
-        X_test = X_test.reshape(shape, order='F')
-        X_train = X_train.reshape(shape, order='F')
-        X_test = np.transpose(X_test, (0, 2, 1, 3))
-        X_train = np.transpose(X_train, (0, 2, 1, 3))
-    else:
-        X_test = X_test.reshape(shape)
-        X_train = X_train.reshape(shape)
-
-    y_train = np.array(y_train)
-
-    if plotable == True:
-        print('\nCIFAR-10')
-        import matplotlib.pyplot as plt
-        fig = plt.figure(1)
-
-        print('Shape of a training image: X_train[0]',X_train[0].shape)
-
-        plt.ion()       # interactive mode
-        count = 1
-        for row in range(10):
-            for col in range(10):
-                a = fig.add_subplot(10, 10, count)
-                if shape == (-1, 3, 32, 32):
-                    # plt.imshow(X_train[count-1], interpolation='nearest')
-                    plt.imshow(np.transpose(X_train[count-1], (1, 2, 0)), interpolation='nearest')
-                    # plt.imshow(np.transpose(X_train[count-1], (2, 1, 0)), interpolation='nearest')
-                elif shape == (-1, 32, 32, 3):
-                    plt.imshow(X_train[count-1], interpolation='nearest')
-                    # plt.imshow(np.transpose(X_train[count-1], (1, 0, 2)), interpolation='nearest')
-                else:
-                    raise Exception("Do not support the given 'shape' to plot the image examples")
-                plt.gca().xaxis.set_major_locator(plt.NullLocator())    # 不显示刻度(tick)
-                plt.gca().yaxis.set_major_locator(plt.NullLocator())
-                count = count + 1
-        plt.draw()      # interactive mode
-        plt.pause(3)   # interactive mode
-
-        print("X_train:",X_train.shape)
-        print("y_train:",y_train.shape)
-        print("X_test:",X_test.shape)
-        print("y_test:",y_test.shape)
-
-    X_train = np.asarray(X_train, dtype=np.float32)
-    X_test = np.asarray(X_test, dtype=np.float32)
-    y_train = np.asarray(y_train, dtype=np.int32)
-    y_test = np.asarray(y_test, dtype=np.int32)
-
-    return X_train, y_train, X_test, y_test
-
-def load_ptb_dataset(path='data/ptb/'):
-    """Penn TreeBank (PTB) dataset is used in many LANGUAGE MODELING papers,
-    including "Empirical Evaluation and Combination of Advanced Language
-    Modeling Techniques", "Recurrent Neural Network Regularization".
-    It consists of 929k training words, 73k validation words, and 82k test
-    words. It has 10k words in its vocabulary.
-
-    Parameters
-    ----------
-    path : : string
-        The path that the data is downloaded to, defaults is ``data/ptb/``.
-
-    Returns
-    --------
-    train_data, valid_data, test_data, vocabulary size
-
-    Examples
-    --------
-    >>> train_data, valid_data, test_data, vocab_size = tl.files.load_ptb_dataset()
-
-    Code References
-    ---------------
-    - ``tensorflow.models.rnn.ptb import reader``
-
-    Download Links
-    ---------------
-    - `Manual download <http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz>`_
-    """
-    print("Load or Download Penn TreeBank (PTB) dataset > {}".format(path))
-
-    #Maybe dowload and uncompress tar, or load exsisting files
-    filename = 'simple-examples.tgz'
-    url = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/'
-    maybe_download_and_extract(filename, path, url, extract=True)
-
-    data_path = os.path.join(path, 'simple-examples', 'data')
-    train_path = os.path.join(data_path, "ptb.train.txt")
-    valid_path = os.path.join(data_path, "ptb.valid.txt")
-    test_path = os.path.join(data_path, "ptb.test.txt")
-
-    word_to_id = nlp.build_vocab(nlp.read_words(train_path))
-
-    train_data = nlp.words_to_word_ids(nlp.read_words(train_path), word_to_id)
-    valid_data = nlp.words_to_word_ids(nlp.read_words(valid_path), word_to_id)
-    test_data = nlp.words_to_word_ids(nlp.read_words(test_path), word_to_id)
-    vocabulary = len(word_to_id)
-
-    # print(nlp.read_words(train_path))     # ... 'according', 'to', 'mr.', '<unk>', '<eos>']
-    # print(train_data)                 # ...  214,         5,    23,    1,       2]
-    # print(word_to_id)                 # ... 'beyond': 1295, 'anti-nuclear': 9599, 'trouble': 1520, '<eos>': 2 ... }
-    # print(vocabulary)                 # 10000
-    # exit()
-    return train_data, valid_data, test_data, vocabulary
-
-def load_matt_mahoney_text8_dataset(path='data/mm_test8/'):
-    """Download a text file from Matt Mahoney's website
-    if not present, and make sure it's the right size.
-    Extract the first file enclosed in a zip file as a list of words.
-    This dataset can be used for Word Embedding.
-
-    Parameters
-    ----------
-    path : : string
-        The path that the data is downloaded to, defaults is ``data/mm_test8/``.
-
-    Returns
-    --------
-    word_list : a list
-        a list of string (word).\n
-        e.g. [.... 'their', 'families', 'who', 'were', 'expelled', 'from', 'jerusalem', ...]
-
-    Examples
-    --------
-    >>> words = tl.files.load_matt_mahoney_text8_dataset()
-    >>> print('Data size', len(words))
-    """
-
-    print("Load or Download matt_mahoney_text8 Dataset> {}".format(path))
-
-    filename = 'text8.zip'
-    url = 'http://mattmahoney.net/dc/'
-    maybe_download_and_extract(filename, path, url, expected_bytes=31344016)
-
-    with zipfile.ZipFile(os.path.join(path, filename)) as f:
-        word_list = f.read(f.namelist()[0]).split()
-        for idx, word in enumerate(word_list):
-            word_list[idx] = word_list[idx].decode()
-    return word_list
-
-def load_imdb_dataset(path='data/imdb/', nb_words=None, skip_top=0,
-              maxlen=None, test_split=0.2, seed=113,
-              start_char=1, oov_char=2, index_from=3):
-    """Load IMDB dataset
-
-    Parameters
-    ----------
-    path : : string
-        The path that the data is downloaded to, defaults is ``data/imdb/``.
-
-    Examples
-    --------
-    >>> X_train, y_train, X_test, y_test = tl.files.load_imdb_dataset(
-    ...                                 nb_words=20000, test_split=0.2)
-    >>> print('X_train.shape', X_train.shape)
-    ... (20000,)  [[1, 62, 74, ... 1033, 507, 27],[1, 60, 33, ... 13, 1053, 7]..]
-    >>> print('y_train.shape', y_train.shape)
-    ... (20000,)  [1 0 0 ..., 1 0 1]
-
-    References
-    -----------
-    - `Modified from keras. <https://github.com/fchollet/keras/blob/master/keras/datasets/imdb.py>`_
-    """
-
-    filename = "imdb.pkl"
-    url = 'https://s3.amazonaws.com/text-datasets/'
-    maybe_download_and_extract(filename, path, url)
-
-    if filename.endswith(".gz"):
-        f = gzip.open(os.path.join(path, filename), 'rb')
-    else:
-        f = open(os.path.join(path, filename), 'rb')
-
-    X, labels = cPickle.load(f)
-    f.close()
-
-    np.random.seed(seed)
-    np.random.shuffle(X)
-    np.random.seed(seed)
-    np.random.shuffle(labels)
-
-    if start_char is not None:
-        X = [[start_char] + [w + index_from for w in x] for x in X]
-    elif index_from:
-        X = [[w + index_from for w in x] for x in X]
-
-    if maxlen:
-        new_X = []
-        new_labels = []
-        for x, y in zip(X, labels):
-            if len(x) < maxlen:
-                new_X.append(x)
-                new_labels.append(y)
-        X = new_X
-        labels = new_labels
-    if not X:
-        raise Exception('After filtering for sequences shorter than maxlen=' +
-                        str(maxlen) + ', no sequence was kept. '
-                        'Increase maxlen.')
-    if not nb_words:
-        nb_words = max([max(x) for x in X])
-
-    # by convention, use 2 as OOV word
-    # reserve 'index_from' (=3 by default) characters: 0 (padding), 1 (start), 2 (OOV)
-    if oov_char is not None:
-        X = [[oov_char if (w >= nb_words or w < skip_top) else w for w in x] for x in X]
-    else:
-        nX = []
-        for x in X:
-            nx = []
-            for w in x:
-                if (w >= nb_words or w < skip_top):
-                    nx.append(w)
-            nX.append(nx)
-        X = nX
-
-    X_train = np.array(X[:int(len(X) * (1 - test_split))])
-    y_train = np.array(labels[:int(len(X) * (1 - test_split))])
-
-    X_test = np.array(X[int(len(X) * (1 - test_split)):])
-    y_test = np.array(labels[int(len(X) * (1 - test_split)):])
-
-    return X_train, y_train, X_test, y_test
-
-def load_nietzsche_dataset(path='data/nietzsche/'):
-    """Load Nietzsche dataset.
-    Returns a string.
-
-    Parameters
-    ----------
-    path : string
-        The path that the data is downloaded to, defaults is ``data/nietzsche/``.
-
-    Examples
-    --------
-    >>> see tutorial_generate_text.py
-    >>> words = tl.files.load_nietzsche_dataset()
-    >>> words = basic_clean_str(words)
-    >>> words = words.split()
-    """
-    print("Load or Download nietzsche dataset > {}".format(path))
-
-    filename = "nietzsche.txt"
-    url = 'https://s3.amazonaws.com/text-datasets/'
-    filepath = maybe_download_and_extract(filename, path, url)
-
-    with open(filepath, "r") as f:
-        words = f.read()
-        return words
-
-def load_wmt_en_fr_dataset(path='data/wmt_en_fr/'):
-    """It will download English-to-French translation data from the WMT'15
-    Website (10^9-French-English corpus), and the 2013 news test from
-    the same site as development set.
-    Returns the directories of training data and test data.
-
-    Parameters
-    ----------
-    path : string
-        The path that the data is downloaded to, defaults is ``data/wmt_en_fr/``.
-
-    References
-    ----------
-    - Code modified from /tensorflow/models/rnn/translation/data_utils.py
-
-    Notes
-    -----
-    Usually, it will take a long time to download this dataset.
-    """
-    # URLs for WMT data.
-    _WMT_ENFR_TRAIN_URL = "http://www.statmt.org/wmt10/"
-    _WMT_ENFR_DEV_URL = "http://www.statmt.org/wmt15/"
-
-    def gunzip_file(gz_path, new_path):
-        """Unzips from gz_path into new_path."""
-        print("Unpacking %s to %s" % (gz_path, new_path))
-        with gzip.open(gz_path, "rb") as gz_file:
-            with open(new_path, "wb") as new_file:
-                for line in gz_file:
-                    new_file.write(line)
-
-    def get_wmt_enfr_train_set(path):
-        """Download the WMT en-fr training corpus to directory unless it's there."""
-        filename = "training-giga-fren.tar"
-        maybe_download_and_extract(filename, path, _WMT_ENFR_TRAIN_URL, extract=True)
-        train_path = os.path.join(path, "giga-fren.release2.fixed")
-        gunzip_file(train_path + ".fr.gz", train_path + ".fr")
-        gunzip_file(train_path + ".en.gz", train_path + ".en")
-        return train_path
-
-    def get_wmt_enfr_dev_set(path):
-        """Download the WMT en-fr training corpus to directory unless it's there."""
-        filename = "dev-v2.tgz"
-        dev_file = maybe_download_and_extract(filename, path, _WMT_ENFR_DEV_URL, extract=False)
-        dev_name = "newstest2013"
-        dev_path = os.path.join(path, "newstest2013")
-        if not (gfile.Exists(dev_path + ".fr") and gfile.Exists(dev_path + ".en")):
-            print("Extracting tgz file %s" % dev_file)
-            with tarfile.open(dev_file, "r:gz") as dev_tar:
-              fr_dev_file = dev_tar.getmember("dev/" + dev_name + ".fr")
-              en_dev_file = dev_tar.getmember("dev/" + dev_name + ".en")
-              fr_dev_file.name = dev_name + ".fr"  # Extract without "dev/" prefix.
-              en_dev_file.name = dev_name + ".en"
-              dev_tar.extract(fr_dev_file, path)
-              dev_tar.extract(en_dev_file, path)
-        return dev_path
-
-    print("Load or Download WMT English-to-French translation > {}".format(path))
-
-    train_path = get_wmt_enfr_train_set(path)
-    dev_path = get_wmt_enfr_dev_set(path)
-
-    return train_path, dev_path
-
-def load_flickr25k_dataset(tag='sky', path="data/flickr25k", n_threads=50, printable=False):
-    """Returns a list of images by a given tag from Flick25k dataset,
-    it will download Flickr25k from `the official website <http://press.liacs.nl/mirflickr/mirdownload.html>`_
-    at the first time you use it.
-
-    Parameters
-    ------------
-    tag : string or None
-        If you want to get images with tag, use string like 'dog', 'red', see `Flickr Search <https://www.flickr.com/search/>`_.
-        If you want to get all images, set to ``None``.
-    path : string
-        The path that the data is downloaded to, defaults is ``data/flickr25k/``.
-    n_threads : int, number of thread to read image.
-    printable : bool, print infomation when reading images, default is ``False``.
-
-    Examples
-    -----------
-    - Get images with tag of sky
-    >>> images = tl.files.load_flickr25k_dataset(tag='sky')
-
-    - Get all images
-    >>> images = tl.files.load_flickr25k_dataset(tag=None, n_threads=100, printable=True)
-    """
-    filename = 'mirflickr25k.zip'
-    url = 'http://press.liacs.nl/mirflickr/mirflickr25k/'
-    ## download dataset
-    if folder_exists(path+"/mirflickr") is False:
-        print("[*] Flickr25k is nonexistent in {}".format(path))
-        maybe_download_and_extract(filename, path, url, extract=True)
-        del_file(path+'/'+filename)
-    ## return images by the given tag.
-    # 1. image path list
-    folder_imgs = path+"/mirflickr"
-    path_imgs = load_file_list(path=folder_imgs, regx='\\.jpg', printable=False)
-    path_imgs.sort(key=natural_keys)
-    # print(path_imgs[0:10])
-    # 2. tag path list
-    folder_tags = path+"/mirflickr/meta/tags"
-    path_tags = load_file_list(path=folder_tags, regx='\\.txt', printable=False)
-    path_tags.sort(key=natural_keys)
-    # print(path_tags[0:10])
-    # 3. select images
-    if tag is None:
-        print("[Flickr25k] reading all images")
-    else:
-        print("[Flickr25k] reading images with tag: {}".format(tag))
-    images_list = []
-    for idx in range(0, len(path_tags)):
-        tags = read_file(folder_tags+'/'+path_tags[idx]).split('\n')
-        # print(idx+1, tags)
-        if tag is None or tag in tags:
-            images_list.append(path_imgs[idx])
-
-    images = visualize.read_images(images_list, folder_imgs, n_threads=n_threads, printable=printable)
-    return images
-
-def load_flickr1M_dataset(tag='sky', size=10, path="data/flickr1M", n_threads=50, printable=False):
-    """Returns a list of images by a given tag from Flickr1M dataset,
-    it will download Flickr1M from `the official website <http://press.liacs.nl/mirflickr/mirdownload.html>`_
-    at the first time you use it.
-
-    Parameters
-    ------------
-    tag : string or None
-        If you want to get images with tag, use string like 'dog', 'red', see `Flickr Search <https://www.flickr.com/search/>`_.
-        If you want to get all images, set to ``None``.
-    size : int 1 to 10.
-        1 means 100k images ... 5 means 500k images, 10 means all 1 million images. Default is 10.
-    path : string
-        The path that the data is downloaded to, defaults is ``data/flickr25k/``.
-    n_threads : int, number of thread to read image.
-    printable : bool, print infomation when reading images, default is ``False``.
-
-    Examples
-    ----------
-    - Use 200k images
-    >>> images = tl.files.load_flickr1M_dataset(tag='zebra', size=2)
-
-    - Use 1 Million images
-    >>> images = tl.files.load_flickr1M_dataset(tag='zebra')
-    """
-    print("[Flickr1M] using {}% of images = {}".format(size*10, size*100000))
-    images_zip = ['images0.zip', 'images1.zip', 'images2.zip', 'images3.zip',
-             'images4.zip',  'images5.zip', 'images6.zip', 'images7.zip',
-             'images8.zip',  'images9.zip']
-    tag_zip = 'tags.zip'
-    url = 'http://press.liacs.nl/mirflickr/mirflickr1m/'
-    ## download dataset
-    for image_zip in images_zip[0:size]:
-        image_folder = image_zip.split(".")[0]
-        # print(path+"/"+image_folder)
-        if folder_exists(path+"/"+image_folder) is False:
-            # print(image_zip)
-            print("[Flickr1M] {} is missing in {}".format(image_folder, path))
-            maybe_download_and_extract(image_zip, path, url, extract=True)
-            del_file(path+'/'+image_zip)
-            os.system("mv {} {}".format(path+'/images',path+'/'+image_folder))
-        else:
-            print("[Flickr1M] {} exists in {}".format(image_folder, path))
-    ## download tag
-    if folder_exists(path+"/tags") is False:
-        print("[Flickr1M] tag files is nonexistent in {}".format(path))
-        maybe_download_and_extract(tag_zip, path, url, extract=True)
-        del_file(path+'/'+tag_zip)
-    else:
-        print("[Flickr1M] tags exists in {}".format(path))
-
-    ## 1. image path list
-    images_list = []
-    images_folder_list = []
-    for i in range(0, size):
-        images_folder_list += load_folder_list(path=path+'/images%d'%i)
-    images_folder_list.sort(key=lambda s : int(s.split('/')[-1]))   # folder/images/ddd
-    # print(images_folder_list)
-    # exit()
-    for folder in images_folder_list[0:size*10]:
-        tmp = load_file_list(path=folder, regx='\\.jpg', printable=False)
-        tmp.sort(key=lambda s : int(s.split('.')[-2]))  # ddd.jpg
-        # print(tmp[0::570])
-        images_list.extend([folder+'/'+x for x in tmp])
-    # print('IM', len(images_list), images_list[0::6000])
-    ## 2. tag path list
-    tag_list = []
-    tag_folder_list = load_folder_list(path+"/tags")
-    tag_folder_list.sort(key=lambda s : int(s.split('/')[-1]))  # folder/images/ddd
-
-    for folder in tag_folder_list[0:size*10]:
-        # print(folder)
-        tmp = load_file_list(path=folder, regx='\\.txt', printable=False)
-        tmp.sort(key=lambda s : int(s.split('.')[-2])) # ddd.txt
-        tmp = [folder+'/'+s for s in tmp]
-        tag_list += tmp
-    # print('T', len(tag_list), tag_list[0::6000])
-    # exit()
-    ## 3. select images
-    print("[Flickr1M] searching tag: {}".format(tag))
-    select_images_list = []
-    for idx in range(0, len(tag_list)):
-        tags = read_file(tag_list[idx]).split('\n')
-        if tag in tags:
-            select_images_list.append(images_list[idx])
-            # print(idx, tags, tag_list[idx], images_list[idx])
-    print("[Flickr1M] reading images with tag: {}".format(tag))
-    images = visualize.read_images(select_images_list, '', n_threads=n_threads, printable=printable)
-    return images
-
-def load_cyclegan_dataset(filename='summer2winter_yosemite', path='data/cyclegan'):
-    """Load image data from CycleGAN's database, see `this link <https://people.eecs.berkeley.edu/~taesung_park/CycleGAN/datasets/>`_.
-
-    Parameters
-    ------------
-    filename : string
-        The dataset you want, see `this link <https://people.eecs.berkeley.edu/~taesung_park/CycleGAN/datasets/>`_.
-    path : string
-        The path that the data is downloaded to, defaults is `data/cyclegan`
-
-    Examples
-    ---------
-    >>> im_train_A, im_train_B, im_test_A, im_test_B = load_cyclegan_dataset(filename='summer2winter_yosemite')
-    """
-    url = 'https://people.eecs.berkeley.edu/~taesung_park/CycleGAN/datasets/'
-
-    if folder_exists(path+"/"+filename) is False:
-        print("[*] {} is nonexistent in {}".format(filename, path))
-        maybe_download_and_extract(filename+'.zip', path, url, extract=True)
-        del_file(path+'/'+filename+'.zip')
-
-    def load_image_from_folder(path):
-        path_imgs = load_file_list(path=path, regx='\\.jpg', printable=False)
-        return visualize.read_images(path_imgs, path=path, n_threads=10, printable=False)
-    im_train_A = load_image_from_folder(path+"/"+filename+"/trainA")
-    im_train_B = load_image_from_folder(path+"/"+filename+"/trainB")
-    im_test_A = load_image_from_folder(path+"/"+filename+"/testA")
-    im_test_B = load_image_from_folder(path+"/"+filename+"/testB")
-
-    return im_train_A, im_train_B, im_test_A, im_test_B
-
-
-## Load and save network list npz
-def save_npz(save_list=[], name='model.npz', sess=None):
-    """Input parameters and the file name, save parameters into .npz file. Use tl.utils.load_npz() to restore.
-
-    Parameters
-    ----------
-    save_list : a list
-        Parameters want to be saved.
-    name : a string or None
-        The name of the .npz file.
-    sess : None or Session
-
-    Examples
-    --------
-    >>> tl.files.save_npz(network.all_params, name='model_test.npz', sess=sess)
-    ... File saved to: model_test.npz
-    >>> load_params = tl.files.load_npz(name='model_test.npz')
-    ... Loading param0, (784, 800)
-    ... Loading param1, (800,)
-    ... Loading param2, (800, 800)
-    ... Loading param3, (800,)
-    ... Loading param4, (800, 10)
-    ... Loading param5, (10,)
-    >>> put parameters into a TensorLayer network, please see assign_params()
-
-    Notes
-    -----
-    If you got session issues, you can change the value.eval() to value.eval(session=sess)
-
-    References
-    ----------
-    - `Saving dictionary using numpy <http://stackoverflow.com/questions/22315595/saving-dictionary-of-header-information-using-numpy-savez>`_
-    """
-    ## save params into a list
-    save_list_var = []
-    if sess:
-        save_list_var = sess.run(save_list)
-    else:
-        try:
-            for k, value in enumerate(save_list):
-                save_list_var.append(value.eval())
-        except:
-            print(" Fail to save model, Hint: pass the session into this function, save_npz(network.all_params, name='model.npz', sess=sess)")
-    np.savez(name, params=save_list_var)
-    save_list_var = None
-    del save_list_var
-    print("[*] %s saved" % name)
-
-    ## save params into a dictionary
-    # rename_dict = {}
-    # for k, value in enumerate(save_dict):
-    #     rename_dict.update({'param'+str(k) : value.eval()})
-    # np.savez(name, **rename_dict)
-    # print('Model is saved to: %s' % name)
-
-def load_npz(path='', name='model.npz'):
-    """Load the parameters of a Model saved by tl.files.save_npz().
-
-    Parameters
-    ----------
-    path : a string
-        Folder path to .npz file.
-    name : a string or None
-        The name of the .npz file.
-
-    Returns
-    --------
-    params : list
-        A list of parameters in order.
-
-    Examples
-    --------
-    - See save_npz and assign_params
-
-    References
-    ----------
-    - `Saving dictionary using numpy <http://stackoverflow.com/questions/22315595/saving-dictionary-of-header-information-using-numpy-savez>`_
-    """
-    ## if save_npz save params into a dictionary
-    # d = np.load( path+name )
-    # params = []
-    # print('Load Model')
-    # for key, val in sorted( d.items() ):
-    #     params.append(val)
-    #     print('Loading %s, %s' % (key, str(val.shape)))
-    # return params
-    ## if save_npz save params into a list
-    d = np.load( path+name )
-    # for val in sorted( d.items() ):
-    #     params = val
-    #     return params
-    return d['params']
-    # print(d.items()[0][1]['params'])
-    # exit()
-    # return d.items()[0][1]['params']
-
-def assign_params(sess, params, network):
-    """Assign the given parameters to the TensorLayer network.
-
-    Parameters
-    ----------
-    sess : TensorFlow Session. Automatically run when sess is not None.
-    params : a list
-        A list of parameters in order.
-    network : a :class:`Layer` class
-        The network to be assigned
-
-    Returns
-    --------
-    ops : list
-        A list of tf ops in order that assign params. Support sess.run(ops) manually.
-
-    Examples
-    --------
-    >>> Save your network as follow:
-    >>> tl.files.save_npz(network.all_params, name='model_test.npz')
-    >>> network.print_params()
-    ...
-    ... Next time, load and assign your network as follow:
-    >>> tl.layers.initialize_global_variables(sess)
-    >>> load_params = tl.files.load_npz(name='model_test.npz')
-    >>> tl.files.assign_params(sess, load_params, network)
-    >>> network.print_params()
-
-    References
-    ----------
-    - `Assign value to a TensorFlow variable <http://stackoverflow.com/questions/34220532/how-to-assign-value-to-a-tensorflow-variable>`_
-    """
-    ops = []
-    for idx, param in enumerate(params):
-        ops.append(network.all_params[idx].assign(param))
-    if sess is not None:
-        sess.run(ops)
-    return ops
-
-def load_and_assign_npz(sess=None, name=None, network=None):
-    """Load model from npz and assign to a network.
-
-    Parameters
-    -------------
-    sess : TensorFlow Session
-    name : string
-        Model path.
-    network : a :class:`Layer` class
-        The network to be assigned
-
-    Returns
-    --------
-    Returns False if faild to model is not exist.
-
-    Examples
-    ---------
-    >>> tl.files.load_and_assign_npz(sess=sess, name='net.npz', network=net)
-    """
-    assert network is not None
-    assert sess is not None
-    if not os.path.exists(name):
-        print("[!] Load {} failed!".format(name))
-        return False
-    else:
-        params = load_npz(name=name)
-        assign_params(sess, params, network)
-        print("[*] Load {} SUCCESS!".format(name))
-        return network
-
-## Load and save network dict npz
-def save_npz_dict(save_list=[], name='model.npz', sess=None):
-    """Input parameters and the file name, save parameters as a dictionary into .npz file.
-    Use ``tl.files.load_and_assign_npz_dict()`` to restore.
-
-    Parameters
-    ----------
-    save_list : a list to tensor for parameters
-        Parameters want to be saved.
-    name : a string
-        The name of the .npz file.
-    sess : Session
-    """
-    assert sess is not None
-    save_list_names = [tensor.name for tensor in save_list]
-    save_list_var = sess.run(save_list)
-    save_var_dict = {save_list_names[idx]: val for idx, val in enumerate(save_list_var)}
-    np.savez(name, **save_var_dict)
-    save_list_var = None
-    save_var_dict = None
-    del save_list_var
-    del save_var_dict
-    print("[*] Model saved in npz_dict %s" % name)
-
-def load_and_assign_npz_dict(name='model.npz', sess=None):
-    """Restore the parameters saved by ``tl.files.save_npz_dict()``.
-
-    Parameters
-    ----------
-    name : a string
-        The name of the .npz file.
-    sess : Session
-    """
-    assert sess is not None
-    if not os.path.exists(name):
-        print("[!] Load {} failed!".format(name))
-        return False
-
-    params = np.load(name)
-    if len(params.keys()) != len(set(params.keys())):
-        raise Exception("Duplication in model npz_dict %s" % name)
-    ops = list()
-    for key in params.keys():
-        try:
-            # tensor = tf.get_default_graph().get_tensor_by_name(key)
-            # varlist = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=key)
-            varlist = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=key)
-            if len(varlist) > 1:
-                raise Exception("[!] Multiple candidate variables to be assigned for name %s" % key)
-            elif len(varlist) == 0:
-                raise KeyError
-            else:
-                ops.append(varlist[0].assign(params[key]))
-                print("[*] params restored: %s" % key)
-        except KeyError:
-            print("[!] Warning: Tensor named %s not found in network." % key)
-
-    sess.run(ops)
-    print("[*] Model restored from npz_dict %s" % name)
-
-# def save_npz_dict(save_list=[], name='model.npz', sess=None):
-#     """Input parameters and the file name, save parameters as a dictionary into .npz file. Use tl.utils.load_npz_dict() to restore.
-#
-#     Parameters
-#     ----------
-#     save_list : a list
-#         Parameters want to be saved.
-#     name : a string or None
-#         The name of the .npz file.
-#     sess : None or Session
-#
-#     Notes
-#     -----
-#     This function tries to avoid a potential broadcasting error raised by numpy.
-#
-#     """
-#     ## save params into a list
-#     save_list_var = []
-#     if sess:
-#         save_list_var = sess.run(save_list)
-#     else:
-#         try:
-#             for k, value in enumerate(save_list):
-#                 save_list_var.append(value.eval())
-#         except:
-#             print(" Fail to save model, Hint: pass the session into this function, save_npz_dict(network.all_params, name='model.npz', sess=sess)")
-#     save_var_dict = {str(idx):val for idx, val in enumerate(save_list_var)}
-#     np.savez(name, **save_var_dict)
-#     save_list_var = None
-#     save_var_dict = None
-#     del save_list_var
-#     del save_var_dict
-#     print("[*] %s saved" % name)
-#
-# def load_npz_dict(path='', name='model.npz'):
-#     """Load the parameters of a Model saved by tl.files.save_npz_dict().
-#
-#     Parameters
-#     ----------
-#     path : a string
-#         Folder path to .npz file.
-#     name : a string or None
-#         The name of the .npz file.
-#
-#     Returns
-#     --------
-#     params : list
-#         A list of parameters in order.
-#     """
-#     d = np.load( path+name )
-#     saved_list_var = [val[1] for val in sorted(d.items(), key=lambda tup: int(tup[0]))]
-#     return saved_list_var
-
-
-
-## Load and save network ckpt
-def save_ckpt(sess=None, mode_name='model.ckpt', save_dir='checkpoint', var_list=[], global_step=None, printable=False):
-    """Save parameters into ckpt file.
-
-    Parameters
-    ------------
-    sess : Session.
-    mode_name : string, name of the model, default is ``model.ckpt``.
-    save_dir : string, path / file directory to the ckpt, default is ``checkpoint``.
-    var_list : list of variables, if not given, save all global variables.
-    global_step : int or None, step number.
-    printable : bool, if True, print all params info.
-
-    Examples
-    ---------
-    - see ``tl.files.load_ckpt()``.
-    """
-    assert sess is not None
-    ckpt_file = os.path.join(save_dir, mode_name)
-    if var_list == []:
-        var_list = tf.global_variables()
-
-    print("[*] save %s n_params: %d" % (ckpt_file, len(var_list)))
-
-    if printable:
-        for idx, v in enumerate(var_list):
-            print("  param {:3}: {:15}   {}".format(idx, v.name, str(v.get_shape())))
-
-    saver = tf.train.Saver(var_list)
-    saver.save(sess, ckpt_file, global_step=global_step)
-
-def load_ckpt(sess=None, mode_name='model.ckpt', save_dir='checkpoint', var_list=[], is_latest=True, printable=False):
-    """Load parameters from ckpt file.
-
-    Parameters
-    ------------
-    sess : Session.
-    mode_name : string, name of the model, default is ``model.ckpt``.
-        Note that if ``is_latest`` is True, this function will get the ``mode_name`` automatically.
-    save_dir : string, path / file directory to the ckpt, default is ``checkpoint``.
-    var_list : list of variables, if not given, save all global variables.
-    is_latest : bool, if True, load the latest ckpt, if False, load the ckpt with the name of ```mode_name``.
-    printable : bool, if True, print all params info.
-
-    Examples
-    ----------
-    - Save all global parameters.
-    >>> tl.files.save_ckpt(sess=sess, mode_name='model.ckpt', save_dir='model', printable=True)
-    - Save specific parameters.
-    >>> tl.files.save_ckpt(sess=sess, mode_name='model.ckpt', var_list=net.all_params, save_dir='model', printable=True)
-    - Load latest ckpt.
-    >>> tl.files.load_ckpt(sess=sess, var_list=net.all_params, save_dir='model', printable=True)
-    - Load specific ckpt.
-    >>> tl.files.load_ckpt(sess=sess, mode_name='model.ckpt', var_list=net.all_params, save_dir='model', is_latest=False, printable=True)
-    """
-    assert sess is not None
-
-    if is_latest:
-        ckpt_file = tf.train.latest_checkpoint(save_dir)
-    else:
-        ckpt_file = os.path.join(save_dir, mode_name)
-
-    if var_list == []:
-        var_list = tf.global_variables()
-
-    print("[*] load %s n_params: %d" % (ckpt_file, len(var_list)))
-
-    if printable:
-        for idx, v in enumerate(var_list):
-            print("  param {:3}: {:15}   {}".format(idx, v.name, str(v.get_shape())))
-
-    try:
-        saver = tf.train.Saver(var_list)
-        saver.restore(sess, ckpt_file)
-    except Exception as e:
-        print(e)
-        print("[*] load ckpt fail ...")
-
-
-
-## Load and save variables
-def save_any_to_npy(save_dict={}, name='file.npy'):
-    """Save variables to .npy file.
-
-    Examples
-    ---------
-    >>> tl.files.save_any_to_npy(save_dict={'data': ['a','b']}, name='test.npy')
-    >>> data = tl.files.load_npy_to_any(name='test.npy')
-    >>> print(data)
-    ... {'data': ['a','b']}
-    """
-    np.save(name, save_dict)
-
-def load_npy_to_any(path='', name='file.npy'):
-    """Load .npy file.
-
-    Examples
-    ---------
-    - see save_any_to_npy()
-    """
-    file_path = os.path.join(path, name)
-    try:
-        npy = np.load(file_path).item()
-    except:
-        npy = np.load(file_path)
-    finally:
-        try:
-            return npy
-        except:
-            print("[!] Fail to load %s" % file_path)
-            exit()
-
-
-
-
-## Folder functions
-def file_exists(filepath):
-    """ Check whether a file exists by given file path. """
-    return os.path.isfile(filepath)
-
-def folder_exists(folderpath):
-    """ Check whether a folder exists by given folder path. """
-    return os.path.isdir(folderpath)
-
-def del_file(filepath):
-    """ Delete a file by given file path. """
-    os.remove(filepath)
-
-def del_folder(folderpath):
-    """ Delete a folder by given folder path. """
-    os.rmdir(folderpath)
-
-def read_file(filepath):
-    """ Read a file and return a string.
-
-    Examples
-    ---------
-    >>> data = tl.files.read_file('data.txt')
-    """
-    with open(filepath, 'r') as afile:
-        return afile.read()
-
-def load_file_list(path=None, regx='\.npz', printable=True):
-    """Return a file list in a folder by given a path and regular expression.
-
-    Parameters
-    ----------
-    path : a string or None
-        A folder path.
-    regx : a string
-        The regx of file name.
-    printable : boolean, whether to print the files infomation.
-
-    Examples
-    ----------
-    >>> file_list = tl.files.load_file_list(path=None, regx='w1pre_[0-9]+\.(npz)')
-    """
-    if path == False:
-        path = os.getcwd()
-    file_list = os.listdir(path)
-    return_list = []
-    for idx, f in enumerate(file_list):
-        if re.search(regx, f):
-            return_list.append(f)
-    # return_list.sort()
-    if printable:
-        print('Match file list = %s' % return_list)
-        print('Number of files = %d' % len(return_list))
-    return return_list
-
-def load_folder_list(path=""):
-    """Return a folder list in a folder by given a folder path.
-
-    Parameters
-    ----------
-    path : a string or None
-        A folder path.
-    """
-    return [os.path.join(path,o) for o in os.listdir(path) if os.path.isdir(os.path.join(path,o))]
-
-def exists_or_mkdir(path, verbose=True):
-    """Check a folder by given name, if not exist, create the folder and return False,
-    if directory exists, return True.
-
-    Parameters
-    ----------
-    path : a string
-        A folder path.
-    verbose : boolean
-        If True, prints results, deaults is True
-
-    Returns
-    --------
-    True if folder exist, otherwise, returns False and create the folder
-
-    Examples
-    --------
-    >>> tl.files.exists_or_mkdir("checkpoints/train")
-    """
-    if not os.path.exists(path):
-        if verbose:
-            print("[*] creates %s ..." % path)
-        os.makedirs(path)
-        return False
-    else:
-        if verbose:
-            print("[!] %s exists ..." % path)
-        return True
-
-def maybe_download_and_extract(filename, working_directory, url_source, extract=False, expected_bytes=None):
-    """Checks if file exists in working_directory otherwise tries to dowload the file,
-    and optionally also tries to extract the file if format is ".zip" or ".tar"
-
-    Parameters
-    -----------
-    filename : string
-        The name of the (to be) dowloaded file.
-    working_directory : string
-        A folder path to search for the file in and dowload the file to
-    url : string
-        The URL to download the file from
-    extract : bool, defaults is False
-        If True, tries to uncompress the dowloaded file is ".tar.gz/.tar.bz2" or ".zip" file
-    expected_bytes : int/None
-        If set tries to verify that the downloaded file is of the specified size, otherwise raises an Exception,
-        defaults is None which corresponds to no check being performed
-
-    Returns
-    ----------
-    filepath to dowloaded (uncompressed) file
-
-    Examples
-    --------
-    >>> down_file = tl.files.maybe_download_and_extract(filename = 'train-images-idx3-ubyte.gz',
-                                                        working_directory = 'data/',
-                                                        url_source = 'http://yann.lecun.com/exdb/mnist/')
-    >>> tl.files.maybe_download_and_extract(filename = 'ADEChallengeData2016.zip',
-                                            working_directory = 'data/',
-                                            url_source = 'http://sceneparsing.csail.mit.edu/data/',
-                                            extract=True)
-    """
-    # We first define a download function, supporting both Python 2 and 3.
-    def _download(filename, working_directory, url_source):
-        def _dlProgress(count, blockSize, totalSize):
-            if(totalSize != 0):
-                percent = float(count * blockSize) / float(totalSize) * 100.0
-                sys.stdout.write("\r" "Downloading " + filename + "...%d%%" % percent)
-                sys.stdout.flush()
-        if sys.version_info[0] == 2:
-            from urllib import urlretrieve
-        else:
-            from urllib.request import urlretrieve
-        filepath = os.path.join(working_directory, filename)
-        urlretrieve(url_source+filename, filepath, reporthook=_dlProgress)
-
-    exists_or_mkdir(working_directory, verbose=False)
-    filepath = os.path.join(working_directory, filename)
-
-    if not os.path.exists(filepath):
-        _download(filename, working_directory, url_source)
-        print()
-        statinfo = os.stat(filepath)
-        print('Succesfully downloaded', filename, statinfo.st_size, 'bytes.')
-        if(not(expected_bytes is None) and (expected_bytes != statinfo.st_size)):
-            raise Exception('Failed to verify ' + filename + '. Can you get to it with a browser?')
-        if(extract):
-            if tarfile.is_tarfile(filepath):
-                print('Trying to extract tar file')
-                tarfile.open(filepath, 'r').extractall(working_directory)
-                print('... Success!')
-            elif zipfile.is_zipfile(filepath):
-                print('Trying to extract zip file')
-                with zipfile.ZipFile(filepath) as zf:
-                    zf.extractall(working_directory)
-                print('... Success!')
-            else:
-                print("Unknown compression_format only .tar.gz/.tar.bz2/.tar and .zip supported")
-    return filepath
-
-
-## Sort
-def natural_keys(text):
-    """Sort list of string with number in human order.
-
-    Examples
-    ----------
-    >>> l = ['im1.jpg', 'im31.jpg', 'im11.jpg', 'im21.jpg', 'im03.jpg', 'im05.jpg']
-    >>> l.sort(key=tl.files.natural_keys)
-    ... ['im1.jpg', 'im03.jpg', 'im05', 'im11.jpg', 'im21.jpg', 'im31.jpg']
-    >>> l.sort() # that is what we dont want
-    ... ['im03.jpg', 'im05', 'im1.jpg', 'im11.jpg', 'im21.jpg', 'im31.jpg']
-
-    Reference
-    ----------
-    alist.sort(key=natural_keys) sorts in human order
-    http://nedbatchelder.com/blog/200712/human_sorting.html
-    (See Toothy's implementation in the comments)
-    """
-    def atoi(text):
-        return int(text) if text.isdigit() else text
-    return [ atoi(c) for c in re.split('(\d+)', text) ]
-
-# Visualizing npz files
-def npz_to_W_pdf(path=None, regx='w1pre_[0-9]+\.(npz)'):
-    """Convert the first weight matrix of .npz file to .pdf by using tl.visualize.W().
-
-    Parameters
-    ----------
-    path : a string or None
-        A folder path to npz files.
-    regx : a string
-        Regx for the file name.
-
-    Examples
-    --------
-    >>> Convert the first weight matrix of w1_pre...npz file to w1_pre...pdf.
-    >>> tl.files.npz_to_W_pdf(path='/Users/.../npz_file/', regx='w1pre_[0-9]+\.(npz)')
-    """
-    file_list = load_file_list(path=path, regx=regx)
-    for f in file_list:
-        W = load_npz(path, f)[0]
-        print("%s --> %s" % (f, f.split('.')[0]+'.pdf'))
-        visualize.W(W, second=10, saveable=True, name=f.split('.')[0], fig_idx=2012)
diff --git a/tensorlayer/iterate.py b/tensorlayer/iterate.py
deleted file mode 100644
index 9778fc6f..00000000
--- a/tensorlayer/iterate.py
+++ /dev/null
@@ -1,432 +0,0 @@
-#! /usr/bin/python
-# -*- coding: utf8 -*-
-
-
-
-import numpy as np
-from six.moves import xrange
-
-def minibatches(inputs=None, targets=None, batch_size=None, shuffle=False):
-    """Generate a generator that input a group of example in numpy.array and
-    their labels, return the examples and labels by the given batchsize.
-
-    Parameters
-    ----------
-    inputs : numpy.array
-        (X) The input features, every row is a example.
-    targets : numpy.array
-        (y) The labels of inputs, every row is a example.
-    batch_size : int
-        The batch size.
-    shuffle : boolean
-        Indicating whether to use a shuffling queue, shuffle the dataset before return.
-
-    Hints
-    -------
-    - If you have two inputs, e.g. X1 (1000, 100) and X2 (1000, 80), you can ``np.hstack((X1, X2))
-    into (1000, 180) and feed into ``inputs``, then you can split a batch of X1 and X2.
-
-    Examples
-    --------
-    >>> X = np.asarray([['a','a'], ['b','b'], ['c','c'], ['d','d'], ['e','e'], ['f','f']])
-    >>> y = np.asarray([0,1,2,3,4,5])
-    >>> for batch in tl.iterate.minibatches(inputs=X, targets=y, batch_size=2, shuffle=False):
-    >>>     print(batch)
-    ... (array([['a', 'a'],
-    ...        ['b', 'b']],
-    ...         dtype='<U1'), array([0, 1]))
-    ... (array([['c', 'c'],
-    ...        ['d', 'd']],
-    ...         dtype='<U1'), array([2, 3]))
-    ... (array([['e', 'e'],
-    ...        ['f', 'f']],
-    ...         dtype='<U1'), array([4, 5]))
-    """
-    assert len(inputs) == len(targets)
-    if shuffle:
-        indices = np.arange(len(inputs))
-        np.random.shuffle(indices)
-    for start_idx in range(0, len(inputs) - batch_size + 1, batch_size):
-        if shuffle:
-            excerpt = indices[start_idx:start_idx + batch_size]
-        else:
-            excerpt = slice(start_idx, start_idx + batch_size)
-        yield inputs[excerpt], targets[excerpt]
-
-def seq_minibatches(inputs, targets, batch_size, seq_length, stride=1):
-    """Generate a generator that return a batch of sequence inputs and targets.
-    If ``batch_size = 100, seq_length = 5``, one return will have ``500`` rows (examples).
-
-    Examples
-    --------
-    - Synced sequence input and output.
-    >>> X = np.asarray([['a','a'], ['b','b'], ['c','c'], ['d','d'], ['e','e'], ['f','f']])
-    >>> y = np.asarray([0, 1, 2, 3, 4, 5])
-    >>> for batch in tl.iterate.seq_minibatches(inputs=X, targets=y, batch_size=2, seq_length=2, stride=1):
-    >>>     print(batch)
-    ... (array([['a', 'a'],
-    ...        ['b', 'b'],
-    ...         ['b', 'b'],
-    ...         ['c', 'c']],
-    ...         dtype='<U1'), array([0, 1, 1, 2]))
-    ... (array([['c', 'c'],
-    ...         ['d', 'd'],
-    ...         ['d', 'd'],
-    ...         ['e', 'e']],
-    ...         dtype='<U1'), array([2, 3, 3, 4]))
-    ...
-    ...
-
-    - Many to One
-    >>> return_last = True
-    >>> num_steps = 2
-    >>> X = np.asarray([['a','a'], ['b','b'], ['c','c'], ['d','d'], ['e','e'], ['f','f']])
-    >>> Y = np.asarray([0,1,2,3,4,5])
-    >>> for batch in tl.iterate.seq_minibatches(inputs=X, targets=Y, batch_size=2, seq_length=num_steps, stride=1):
-    >>>     x, y = batch
-    >>>     if return_last:
-    >>>         tmp_y = y.reshape((-1, num_steps) + y.shape[1:])
-    >>>     y = tmp_y[:, -1]
-    >>>     print(x, y)
-    ... [['a' 'a']
-    ... ['b' 'b']
-    ... ['b' 'b']
-    ... ['c' 'c']] [1 2]
-    ... [['c' 'c']
-    ... ['d' 'd']
-    ... ['d' 'd']
-    ... ['e' 'e']] [3 4]
-    """
-    assert len(inputs) == len(targets)
-    n_loads = (batch_size * stride) + (seq_length - stride)
-    for start_idx in range(0, len(inputs) - n_loads + 1, (batch_size * stride)):
-        seq_inputs = np.zeros((batch_size, seq_length) + inputs.shape[1:],
-                              dtype=inputs.dtype)
-        seq_targets = np.zeros((batch_size, seq_length) + targets.shape[1:],
-                               dtype=targets.dtype)
-        for b_idx in xrange(batch_size):
-            start_seq_idx = start_idx + (b_idx * stride)
-            end_seq_idx = start_seq_idx + seq_length
-            seq_inputs[b_idx] = inputs[start_seq_idx:end_seq_idx]
-            seq_targets[b_idx] = targets[start_seq_idx:end_seq_idx]
-        flatten_inputs = seq_inputs.reshape((-1,) + inputs.shape[1:])
-        flatten_targets = seq_targets.reshape((-1,) + targets.shape[1:])
-        yield flatten_inputs, flatten_targets
-
-def seq_minibatches2(inputs, targets, batch_size, num_steps):
-    """Generate a generator that iterates on two list of words. Yields (Returns) the source contexts and
-    the target context by the given batch_size and num_steps (sequence_length),
-    see ``PTB tutorial``. In TensorFlow's tutorial, this generates the batch_size pointers into the raw
-    PTB data, and allows minibatch iteration along these pointers.
-
-    - Hint, if the input data are images, you can modify the code as follow.
-
-    .. code-block:: python
-
-        from
-        data = np.zeros([batch_size, batch_len)
-        to
-        data = np.zeros([batch_size, batch_len, inputs.shape[1], inputs.shape[2], inputs.shape[3]])
-
-    Parameters
-    ----------
-    inputs : a list
-            the context in list format; note that context usually be
-            represented by splitting by space, and then convert to unique
-            word IDs.
-    targets : a list
-            the context in list format; note that context usually be
-            represented by splitting by space, and then convert to unique
-            word IDs.
-    batch_size : int
-            the batch size.
-    num_steps : int
-            the number of unrolls. i.e. sequence_length
-
-    Yields
-    ------
-    Pairs of the batched data, each a matrix of shape [batch_size, num_steps].
-
-    Raises
-    ------
-    ValueError : if batch_size or num_steps are too high.
-
-    Examples
-    --------
-    >>> X = [i for i in range(20)]
-    >>> Y = [i for i in range(20,40)]
-    >>> for batch in tl.iterate.seq_minibatches2(X, Y, batch_size=2, num_steps=3):
-    ...     x, y = batch
-    ...     print(x, y)
-    ...
-    ... [[  0.   1.   2.]
-    ... [ 10.  11.  12.]]
-    ... [[ 20.  21.  22.]
-    ... [ 30.  31.  32.]]
-    ...
-    ... [[  3.   4.   5.]
-    ... [ 13.  14.  15.]]
-    ... [[ 23.  24.  25.]
-    ... [ 33.  34.  35.]]
-    ...
-    ... [[  6.   7.   8.]
-    ... [ 16.  17.  18.]]
-    ... [[ 26.  27.  28.]
-    ... [ 36.  37.  38.]]
-
-    Code References
-    ---------------
-    - ``tensorflow/models/rnn/ptb/reader.py``
-    """
-    assert len(inputs) == len(targets)
-    data_len = len(inputs)
-    batch_len = data_len // batch_size
-    # data = np.zeros([batch_size, batch_len])
-    data = np.zeros((batch_size, batch_len) + inputs.shape[1:],
-                          dtype=inputs.dtype)
-    data2 = np.zeros([batch_size, batch_len])
-
-    for i in range(batch_size):
-        data[i] = inputs[batch_len * i:batch_len * (i + 1)]
-        data2[i] = targets[batch_len * i:batch_len * (i + 1)]
-
-    epoch_size = (batch_len - 1) // num_steps
-
-    if epoch_size == 0:
-        raise ValueError("epoch_size == 0, decrease batch_size or num_steps")
-
-    for i in range(epoch_size):
-        x = data[:, i*num_steps:(i+1)*num_steps]
-        x2 = data2[:, i*num_steps:(i+1)*num_steps]
-        yield (x, x2)
-
-
-def ptb_iterator(raw_data, batch_size, num_steps):
-    """
-    Generate a generator that iterates on a list of words, see PTB tutorial. Yields (Returns) the source contexts and
-    the target context by the given batch_size and num_steps (sequence_length).\n
-    see ``PTB tutorial``.
-
-    e.g. x = [0, 1, 2]  y = [1, 2, 3] , when batch_size = 1, num_steps = 3,
-    raw_data = [i for i in range(100)]
-
-    In TensorFlow's tutorial, this generates batch_size pointers into the raw
-    PTB data, and allows minibatch iteration along these pointers.
-
-    Parameters
-    ----------
-    raw_data : a list
-            the context in list format; note that context usually be
-            represented by splitting by space, and then convert to unique
-            word IDs.
-    batch_size : int
-            the batch size.
-    num_steps : int
-            the number of unrolls. i.e. sequence_length
-
-    Yields
-    ------
-    Pairs of the batched data, each a matrix of shape [batch_size, num_steps].
-    The second element of the tuple is the same data time-shifted to the
-    right by one.
-
-    Raises
-    ------
-    ValueError : if batch_size or num_steps are too high.
-
-    Examples
-    --------
-    >>> train_data = [i for i in range(20)]
-    >>> for batch in tl.iterate.ptb_iterator(train_data, batch_size=2, num_steps=3):
-    >>>     x, y = batch
-    >>>     print(x, y)
-    ... [[ 0  1  2] <---x                       1st subset/ iteration
-    ...  [10 11 12]]
-    ... [[ 1  2  3] <---y
-    ...  [11 12 13]]
-    ...
-    ... [[ 3  4  5]  <--- 1st batch input       2nd subset/ iteration
-    ...  [13 14 15]] <--- 2nd batch input
-    ... [[ 4  5  6]  <--- 1st batch target
-    ...  [14 15 16]] <--- 2nd batch target
-    ...
-    ... [[ 6  7  8]                             3rd subset/ iteration
-    ...  [16 17 18]]
-    ... [[ 7  8  9]
-    ...  [17 18 19]]
-
-    Code References
-    ----------------
-    - ``tensorflow/models/rnn/ptb/reader.py``
-    """
-    raw_data = np.array(raw_data, dtype=np.int32)
-
-    data_len = len(raw_data)
-    batch_len = data_len // batch_size
-    data = np.zeros([batch_size, batch_len], dtype=np.int32)
-    for i in range(batch_size):
-        data[i] = raw_data[batch_len * i:batch_len * (i + 1)]
-
-    epoch_size = (batch_len - 1) // num_steps
-
-    if epoch_size == 0:
-        raise ValueError("epoch_size == 0, decrease batch_size or num_steps")
-
-    for i in range(epoch_size):
-        x = data[:, i*num_steps:(i+1)*num_steps]
-        y = data[:, i*num_steps+1:(i+1)*num_steps+1]
-        yield (x, y)
-
-
-
-# def minibatches_for_sequence2D(inputs, targets, batch_size, sequence_length, stride=1):
-#     """
-#     Input a group of example in 2D numpy.array and their labels.
-#     Return the examples and labels by the given batchsize, sequence_length.
-#     Use for RNN.
-#
-#     Parameters
-#     ----------
-#     inputs : numpy.array
-#         (X) The input features, every row is a example.
-#     targets : numpy.array
-#         (y) The labels of inputs, every row is a example.
-#     batchsize : int
-#         The batch size must be a multiple of sequence_length: int(batch_size % sequence_length) == 0
-#     sequence_length : int
-#         The sequence length
-#     stride : int
-#         The stride step
-#
-#     Examples
-#     --------
-#     >>> sequence_length = 2
-#     >>> batch_size = 4
-#     >>> stride = 1
-#     >>> X_train = np.asarray([[1,2,3],[4,5,6],[7,8,9],[10,11,12],[13,14,15],[16,17,18],[19,20,21],[22,23,24]])
-#     >>> y_train = np.asarray(['0','1','2','3','4','5','6','7'])
-#     >>> print('X_train = %s' % X_train)
-#     >>> print('y_train = %s' % y_train)
-#     >>> for batch in minibatches_for_sequence2D(X_train, y_train, batch_size=batch_size, sequence_length=sequence_length, stride=stride):
-#     >>>     inputs, targets = batch
-#     >>>     print(inputs)
-#     >>>     print(targets)
-#     ... [[ 1.  2.  3.]
-#     ... [ 4.  5.  6.]
-#     ... [ 4.  5.  6.]
-#     ... [ 7.  8.  9.]]
-#     ... [1 2]
-#     ... [[  4.   5.   6.]
-#     ... [  7.   8.   9.]
-#     ... [  7.   8.   9.]
-#     ... [ 10.  11.  12.]]
-#     ... [2 3]
-#     ... ...
-#     ... [[ 16.  17.  18.]
-#     ... [ 19.  20.  21.]
-#     ... [ 19.  20.  21.]
-#     ... [ 22.  23.  24.]]
-#     ... [6 7]
-#     """
-#     print('len(targets)=%d batch_size=%d sequence_length=%d stride=%d' % (len(targets), batch_size, sequence_length, stride))
-#     assert len(inputs) == len(targets), '1 feature vector have 1 target vector/value' #* sequence_length
-#     # assert int(batch_size % sequence_length) == 0, 'batch_size % sequence_length must == 0\
-#     # batch_size is number of examples rather than number of targets'
-#
-#     # print(inputs.shape, len(inputs), len(inputs[0]))
-#
-#     n_targets = int(batch_size/sequence_length)
-#     # n_targets = int(np.ceil(batch_size/sequence_length))
-#     X = np.empty(shape=(0,len(inputs[0])), dtype=np.float32)
-#     y = np.zeros(shape=(1, n_targets), dtype=np.int32)
-#
-#     for idx in range(sequence_length, len(inputs), stride):  # go through all example during 1 epoch
-#         for n in range(n_targets):   # for num of target
-#             X = np.concatenate((X, inputs[idx-sequence_length+n:idx+n]))
-#             y[0][n] = targets[idx-1+n]
-#             # y = np.vstack((y, targets[idx-1+n]))
-#         yield X, y[0]
-#         X = np.empty(shape=(0,len(inputs[0])))
-#         # y = np.empty(shape=(1,0))
-#
-#
-# def minibatches_for_sequence4D(inputs, targets, batch_size, sequence_length, stride=1): #
-#     """
-#     Input a group of example in 4D numpy.array and their labels.
-#     Return the examples and labels by the given batchsize, sequence_length.
-#     Use for RNN.
-#
-#     Parameters
-#     ----------
-#     inputs : numpy.array
-#         (X) The input features, every row is a example.
-#     targets : numpy.array
-#         (y) The labels of inputs, every row is a example.
-#     batchsize : int
-#         The batch size must be a multiple of sequence_length: int(batch_size % sequence_length) == 0
-#     sequence_length : int
-#         The sequence length
-#     stride : int
-#         The stride step
-#
-#     Examples
-#     --------
-#     >>> sequence_length = 2
-#     >>> batch_size = 2
-#     >>> stride = 1
-#     >>> X_train = np.asarray([[1,2,3],[4,5,6],[7,8,9],[10,11,12],[13,14,15],[16,17,18],[19,20,21],[22,23,24]])
-#     >>> y_train = np.asarray(['0','1','2','3','4','5','6','7'])
-#     >>> X_train = np.expand_dims(X_train, axis=1)
-#     >>> X_train = np.expand_dims(X_train, axis=3)
-#     >>> for batch in minibatches_for_sequence4D(X_train, y_train, batch_size=batch_size, sequence_length=sequence_length, stride=stride):
-#     >>>     inputs, targets = batch
-#     >>>     print(inputs)
-#     >>>     print(targets)
-#     ... [[[[ 1.]
-#     ...    [ 2.]
-#     ...    [ 3.]]]
-#     ... [[[ 4.]
-#     ...   [ 5.]
-#     ...   [ 6.]]]]
-#     ... [1]
-#     ... [[[[ 4.]
-#     ...    [ 5.]
-#     ...    [ 6.]]]
-#     ... [[[ 7.]
-#     ...   [ 8.]
-#     ...   [ 9.]]]]
-#     ... [2]
-#     ... ...
-#     ... [[[[ 19.]
-#     ...    [ 20.]
-#     ...    [ 21.]]]
-#     ... [[[ 22.]
-#     ...   [ 23.]
-#     ...   [ 24.]]]]
-#     ... [7]
-#     """
-#     print('len(targets)=%d batch_size=%d sequence_length=%d stride=%d' % (len(targets), batch_size, sequence_length, stride))
-#     assert len(inputs) == len(targets), '1 feature vector have 1 target vector/value' #* sequence_length
-#     # assert int(batch_size % sequence_length) == 0, 'in LSTM, batch_size % sequence_length must == 0\
-#     # batch_size is number of X_train rather than number of targets'
-#     assert stride >= 1, 'stride must be >=1, at least move 1 step for each iternation'
-#
-#     n_example, n_channels, width, height = inputs.shape
-#     print('n_example=%d n_channels=%d width=%d height=%d' % (n_example, n_channels, width, height))
-#
-#     n_targets = int(np.ceil(batch_size/sequence_length)) # 实际为 batchsize/sequence_length + 1
-#     print(n_targets)
-#     X = np.zeros(shape=(batch_size, n_channels, width, height), dtype=np.float32)
-#     # X = np.zeros(shape=(n_targets, sequence_length, n_channels, width, height), dtype=np.float32)
-#     y = np.zeros(shape=(1,n_targets), dtype=np.int32)
-#     # y = np.empty(shape=(0,1), dtype=np.float32)
-#     # time.sleep(2)
-#     for idx in range(sequence_length, n_example-n_targets+2, stride):  # go through all example during 1 epoch
-#         for n in range(n_targets):   # for num of target
-#             # print(idx+n, inputs[idx-sequence_length+n : idx+n].shape)
-#             X[n*sequence_length : (n+1)*sequence_length] = inputs[idx+n-sequence_length : idx+n]
-#             # X[n] = inputs[idx-sequence_length+n:idx+n]
-#             y[0][n] = targets[idx+n-1]
-#             # y = np.vstack((y, targets[idx-1+n]))
-#         # y = targets[idx: idx+n_targets]
-#         yield X, y[0]
diff --git a/tensorlayer/layers.py b/tensorlayer/layers.py
deleted file mode 100644
index 8cfbcb83..00000000
--- a/tensorlayer/layers.py
+++ /dev/null
@@ -1,6251 +0,0 @@
-#! /usr/bin/python
-# -*- coding: utf8 -*-
-
-
-
-import tensorflow as tf
-import time
-from . import visualize
-from . import utils
-from . import files
-from . import cost
-from . import iterate
-from . import ops
-import numpy as np
-from six.moves import xrange
-import random, warnings
-import copy
-import inspect
-# __all__ = [
-#     "Layer",
-#     "DenseLayer",
-# ]
-
-
-# set_keep = locals()
-set_keep = globals()
-set_keep['_layers_name_list'] =[]
-set_keep['name_reuse'] = False
-
-try:  # For TF12 and later
-    TF_GRAPHKEYS_VARIABLES = tf.GraphKeys.GLOBAL_VARIABLES
-except:  # For TF11 and before
-    TF_GRAPHKEYS_VARIABLES = tf.GraphKeys.VARIABLES
-
-## Variable Operation
-def flatten_reshape(variable, name=''):
-    """Reshapes high-dimension input to a vector.
-    [batch_size, mask_row, mask_col, n_mask] ---> [batch_size, mask_row * mask_col * n_mask]
-
-    Parameters
-    ----------
-    variable : a tensorflow variable
-    name : a string or None
-        An optional name to attach to this layer.
-
-    Examples
-    --------
-    >>> W_conv2 = weight_variable([5, 5, 100, 32])   # 64 features for each 5x5 patch
-    >>> b_conv2 = bias_variable([32])
-    >>> W_fc1 = weight_variable([7 * 7 * 32, 256])
-
-    >>> h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
-    >>> h_pool2 = max_pool_2x2(h_conv2)
-    >>> h_pool2.get_shape()[:].as_list() = [batch_size, 7, 7, 32]
-    ...         [batch_size, mask_row, mask_col, n_mask]
-    >>> h_pool2_flat = tl.layers.flatten_reshape(h_pool2)
-    ...         [batch_size, mask_row * mask_col * n_mask]
-    >>> h_pool2_flat_drop = tf.nn.dropout(h_pool2_flat, keep_prob)
-    ...
-    """
-    dim = 1
-    for d in variable.get_shape()[1:].as_list():
-        dim *= d
-    return tf.reshape(variable, shape=[-1, dim], name=name)
-
-def clear_layers_name():
-    """Clear all layer names in set_keep['_layers_name_list'],
-    enable layer name reuse.
-
-    Examples
-    ---------
-    >>> network = tl.layers.InputLayer(x, name='input_layer')
-    >>> network = tl.layers.DenseLayer(network, n_units=800, name='relu1')
-    ...
-    >>> tl.layers.clear_layers_name()
-    >>> network2 = tl.layers.InputLayer(x, name='input_layer')
-    >>> network2 = tl.layers.DenseLayer(network2, n_units=800, name='relu1')
-    ...
-    """
-    set_keep['_layers_name_list'] =[]
-
-def set_name_reuse(enable=True):
-    """Enable or disable reuse layer name. By default, each layer must has unique
-    name. When you want two or more input placeholder (inference) share the same
-    model parameters, you need to enable layer name reuse, then allow the
-    parameters have same name scope.
-
-    Parameters
-    ------------
-    enable : boolean, enable name reuse. (None means False).
-
-    Examples
-    ------------
-    >>> def embed_seq(input_seqs, is_train, reuse):
-    >>>    with tf.variable_scope("model", reuse=reuse):
-    >>>         tl.layers.set_name_reuse(reuse)
-    >>>         network = tl.layers.EmbeddingInputlayer(
-    ...                     inputs = input_seqs,
-    ...                     vocabulary_size = vocab_size,
-    ...                     embedding_size = embedding_size,
-    ...                     name = 'e_embedding')
-    >>>        network = tl.layers.DynamicRNNLayer(network,
-    ...                     cell_fn = tf.contrib.rnn.BasicLSTMCell,
-    ...                     n_hidden = embedding_size,
-    ...                     dropout = (0.7 if is_train else None),
-    ...                     initializer = w_init,
-    ...                     sequence_length = tl.layers.retrieve_seq_length_op2(input_seqs),
-    ...                     return_last = True,
-    ...                     name = 'e_dynamicrnn',)
-    >>>    return network
-    >>>
-    >>> net_train = embed_seq(t_caption, is_train=True, reuse=False)
-    >>> net_test = embed_seq(t_caption, is_train=False, reuse=True)
-
-    - see ``tutorial_ptb_lstm.py`` for example.
-    """
-    set_keep['name_reuse'] = enable
-
-def initialize_rnn_state(state):
-    """Return the initialized RNN state.
-    The input is LSTMStateTuple or State of RNNCells.
-
-    Parameters
-    -----------
-    state : a RNN state.
-    """
-    try: # TF1.0
-        LSTMStateTuple = tf.contrib.rnn.LSTMStateTuple
-    except:
-        LSTMStateTuple = tf.nn.rnn_cell.LSTMStateTuple
-
-    if isinstance(state, LSTMStateTuple):
-        c = state.c.eval()
-        h = state.h.eval()
-        return (c, h)
-    else:
-        new_state = state.eval()
-        return new_state
-
-def print_all_variables(train_only=False):
-    """Print all trainable and non-trainable variables
-    without tl.layers.initialize_global_variables(sess)
-
-    Parameters
-    ----------
-    train_only : boolean
-        If True, only print the trainable variables, otherwise, print all variables.
-    """
-    # tvar = tf.trainable_variables() if train_only else tf.all_variables()
-    if train_only:
-        t_vars = tf.trainable_variables()
-        print("  [*] printing trainable variables")
-    else:
-        try: # TF1.0
-            t_vars = tf.global_variables()
-        except: # TF0.12
-            t_vars = tf.all_variables()
-        print("  [*] printing global variables")
-    for idx, v in enumerate(t_vars):
-        print("  var {:3}: {:15}   {}".format(idx, str(v.get_shape()), v.name))
-
-def get_variables_with_name(name, train_only=True, printable=False):
-    """Get variable list by a given name scope.
-
-    Examples
-    ---------
-    >>> dense_vars = tl.layers.get_variable_with_name('dense', True, True)
-    """
-    print("  [*] geting variables with %s" % name)
-    # tvar = tf.trainable_variables() if train_only else tf.all_variables()
-    if train_only:
-        t_vars = tf.trainable_variables()
-    else:
-        try: # TF1.0
-            t_vars = tf.global_variables()
-        except: # TF0.12
-            t_vars = tf.all_variables()
-
-    d_vars = [var for var in t_vars if name in var.name]
-    if printable:
-        for idx, v in enumerate(d_vars):
-            print("  got {:3}: {:15}   {}".format(idx, v.name, str(v.get_shape())))
-    return d_vars
-
-def get_layers_with_name(network=None, name="", printable=False):
-    """Get layer list in a network by a given name scope.
-
-    Examples
-    ---------
-    >>> layers = tl.layers.get_layers_with_name(network, "CNN", True)
-    """
-    assert network is not None
-    print("  [*] geting layers with %s" % name)
-
-    layers = []
-    i = 0
-    for layer in network.all_layers:
-        # print(type(layer.name))
-        if name in layer.name:
-            layers.append(layer)
-            if printable:
-                # print(layer.name)
-                print("  got {:3}: {:15}   {}".format(i, layer.name, str(layer.get_shape())))
-                i = i + 1
-    return layers
-
-def list_remove_repeat(l=None):
-    """Remove the repeated items in a list, and return the processed list.
-    You may need it to create merged layer like Concat, Elementwise and etc.
-
-    Parameters
-    ----------
-    l : a list
-
-    Examples
-    ---------
-    >>> l = [2, 3, 4, 2, 3]
-    >>> l = list_remove_repeat(l)
-    ... [2, 3, 4]
-    """
-    l2 = []
-    [l2.append(i) for i in l if not i in l2]
-    return l2
-
-def initialize_global_variables(sess=None):
-    """Excute ``sess.run(tf.global_variables_initializer())`` for TF12+ or
-    sess.run(tf.initialize_all_variables()) for TF11.
-
-    Parameters
-    ----------
-    sess : a Session
-    """
-    assert sess is not None
-    # try:    # TF12+
-    sess.run(tf.global_variables_initializer())
-    # except: # TF11
-    #     sess.run(tf.initialize_all_variables())
-
-
-## Basic layer
-class Layer(object):
-    """
-    The :class:`Layer` class represents a single layer of a neural network. It
-    should be subclassed when implementing new types of layers.
-    Because each layer can keep track of the layer(s) feeding into it, a
-    network's output :class:`Layer` instance can double as a handle to the full
-    network.
-
-    Parameters
-    ----------
-    inputs : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    name : a string or None
-        An optional name to attach to this layer.
-    """
-    def __init__(
-        self,
-        inputs = None,
-        name ='layer'
-    ):
-        self.inputs = inputs
-        scope_name = tf.get_variable_scope().name
-        if scope_name:
-            name = scope_name + '/' + name
-        if (name in set_keep['_layers_name_list']) and set_keep['name_reuse'] == False:
-            raise Exception("Layer '%s' already exists, please choice other 'name' or reuse this layer\
-            \nHint : Use different name for different 'Layer' (The name is used to control parameter sharing)" % name)
-        else:
-            self.name = name
-            if name not in ['', None, False]:
-                set_keep['_layers_name_list'].append(name)
-
-    def print_params(self, details=True):
-        ''' Print all info of parameters in the network'''
-        for i, p in enumerate(self.all_params):
-            if details:
-                try:
-                    # print("  param {:3}: {:15} (mean: {:<18}, median: {:<18}, std: {:<18})   {}".format(i, str(p.eval().shape), p.eval().mean(), np.median(p.eval()), p.eval().std(), p.name))
-                    val = p.eval()
-                    print("  param {:3}: {:20} {:15}    {} (mean: {:<18}, median: {:<18}, std: {:<18})   ".format(i, p.name, str(val.shape), p.dtype.name, val.mean(), np.median(val), val.std()))
-                except Exception as e:
-                    print(str(e))
-                    raise Exception("Hint: print params details after tl.layers.initialize_global_variables(sess) or use network.print_params(False).")
-            else:
-                print("  param {:3}: {:20} {:15}    {}".format(i, p.name, str(p.get_shape()), p.dtype.name))
-        print("  num of params: %d" % self.count_params())
-
-    def print_layers(self):
-        ''' Print all info of layers in the network '''
-        for i, layer in enumerate(self.all_layers):
-            # print("  layer %d: %s" % (i, str(layer)))
-            print("  layer {:3}: {:20} {:15}    {}".format(i, layer.name, str(layer.get_shape()), layer.dtype.name))
-
-    def count_params(self):
-        ''' Return the number of parameters in the network '''
-        n_params = 0
-        for i, p in enumerate(self.all_params):
-            n = 1
-            # for s in p.eval().shape:
-            for s in p.get_shape():
-                try:
-                    s = int(s)
-                except:
-                    s = 1
-                if s:
-                    n = n * s
-            n_params = n_params + n
-        return n_params
-
-    def __str__(self):
-        # print("\nIt is a Layer class")
-        # self.print_params(False)
-        # self.print_layers()
-        return "  Last layer is: %s" % self.__class__.__name__
-
-## Input layer
-class InputLayer(Layer):
-    """
-    The :class:`InputLayer` class is the starting layer of a neural network.
-
-    Parameters
-    ----------
-    inputs : a placeholder or tensor
-        The input tensor data.
-    name : a string or None
-        An optional name to attach to this layer.
-    """
-    def __init__(
-        self,
-        inputs = None,
-        name ='input_layer'
-    ):
-        Layer.__init__(self, inputs=inputs, name=name)
-        print("  [TL] InputLayer  %s: %s" % (self.name, inputs.get_shape()))
-        self.outputs = inputs
-        self.all_layers = []
-        self.all_params = []
-        self.all_drop = {}
-
-## OneHot layer
-class OneHotInputLayer(Layer):
-    """
-    The :class:`OneHotInputLayer` class is the starting layer of a neural network, see ``tf.one_hot``.
-
-    Parameters
-    ----------
-    inputs : a placeholder or tensor
-        The input tensor data.
-    name : a string or None
-        An optional name to attach to this layer.
-    depth : If the input indices is rank N, the output will have rank N+1. The new axis is created at dimension axis (default: the new axis is appended at the end).
-    on_value : If on_value is not provided, it will default to the value 1 with type dtype.
-        default, None
-    off_value : If off_value is not provided, it will default to the value 0 with type dtype.
-        default, None
-    axis : default, None
-    dtype : default, None
-    """
-    def __init__(
-        self,
-        inputs = None,
-        depth = None,
-        on_value = None,
-        off_value = None,
-        axis = None,
-        dtype=None,
-        name ='input_layer'
-    ):
-        Layer.__init__(self, inputs=inputs, name=name)
-        assert depth != None, "depth is not given"
-        print("  [TL]:Instantiate OneHotInputLayer  %s: %s" % (self.name, inputs.get_shape()))
-        self.outputs = tf.one_hot(inputs, depth, on_value=on_value, off_value=off_value, axis=axis, dtype=dtype)
-        self.all_layers = []
-        self.all_params = []
-        self.all_drop = {}
-
-## Word Embedding Input layer
-class Word2vecEmbeddingInputlayer(Layer):
-    """
-    The :class:`Word2vecEmbeddingInputlayer` class is a fully connected layer,
-    for Word Embedding. Words are input as integer index.
-    The output is the embedded word vector.
-
-    Parameters
-    ----------
-    inputs : placeholder
-        For word inputs. integer index format.
-    train_labels : placeholder
-        For word labels. integer index format.
-    vocabulary_size : int
-        The size of vocabulary, number of words.
-    embedding_size : int
-        The number of embedding dimensions.
-    num_sampled : int
-        The Number of negative examples for NCE loss.
-    nce_loss_args : a dictionary
-        The arguments for tf.nn.nce_loss()
-    E_init : embedding initializer
-        The initializer for initializing the embedding matrix.
-    E_init_args : a dictionary
-        The arguments for embedding initializer
-    nce_W_init : NCE decoder biases initializer
-        The initializer for initializing the nce decoder weight matrix.
-    nce_W_init_args : a dictionary
-        The arguments for initializing the nce decoder weight matrix.
-    nce_b_init : NCE decoder biases initializer
-        The initializer for tf.get_variable() of the nce decoder bias vector.
-    nce_b_init_args : a dictionary
-        The arguments for tf.get_variable() of the nce decoder bias vector.
-    name : a string or None
-        An optional name to attach to this layer.
-
-    Variables
-    --------------
-    nce_cost : a tensor
-        The NCE loss.
-    outputs : a tensor
-        The outputs of embedding layer.
-    normalized_embeddings : tensor
-        Normalized embedding matrix
-
-    Examples
-    --------
-    - Without TensorLayer : see tensorflow/examples/tutorials/word2vec/word2vec_basic.py
-    >>> train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
-    >>> train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
-    >>> embeddings = tf.Variable(
-    ...     tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
-    >>> embed = tf.nn.embedding_lookup(embeddings, train_inputs)
-    >>> nce_weights = tf.Variable(
-    ...     tf.truncated_normal([vocabulary_size, embedding_size],
-    ...                    stddev=1.0 / math.sqrt(embedding_size)))
-    >>> nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
-    >>> cost = tf.reduce_mean(
-    ...    tf.nn.nce_loss(weights=nce_weights, biases=nce_biases,
-    ...               inputs=embed, labels=train_labels,
-    ...               num_sampled=num_sampled, num_classes=vocabulary_size,
-    ...               num_true=1))
-
-    - With TensorLayer : see tutorial_word2vec_basic.py
-    >>> train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
-    >>> train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
-    >>> emb_net = tl.layers.Word2vecEmbeddingInputlayer(
-    ...         inputs = train_inputs,
-    ...         train_labels = train_labels,
-    ...         vocabulary_size = vocabulary_size,
-    ...         embedding_size = embedding_size,
-    ...         num_sampled = num_sampled,
-    ...        name ='word2vec_layer',
-    ...    )
-    >>> cost = emb_net.nce_cost
-    >>> train_params = emb_net.all_params
-    >>> train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(
-    ...                                             cost, var_list=train_params)
-    >>> normalized_embeddings = emb_net.normalized_embeddings
-
-    References
-    ----------
-    - `tensorflow/examples/tutorials/word2vec/word2vec_basic.py <https://github.com/tensorflow/tensorflow/blob/r0.7/tensorflow/examples/tutorials/word2vec/word2vec_basic.py>`_
-    """
-    def __init__(
-        self,
-        inputs = None,
-        train_labels = None,
-        vocabulary_size = 80000,
-        embedding_size = 200,
-        num_sampled = 64,
-        nce_loss_args = {},
-        E_init = tf.random_uniform_initializer(minval=-1.0, maxval=1.0),
-        E_init_args = {},
-        nce_W_init = tf.truncated_normal_initializer(stddev=0.03),
-        nce_W_init_args = {},
-        nce_b_init = tf.constant_initializer(value=0.0),
-        nce_b_init_args = {},
-        name ='word2vec_layer',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = inputs
-        print("  [TL] Word2vecEmbeddingInputlayer %s: (%d, %d)" % (self.name, vocabulary_size, embedding_size))
-        # Look up embeddings for inputs.
-        # Note: a row of 'embeddings' is the vector representation of a word.
-        # for the sake of speed, it is better to slice the embedding matrix
-        # instead of transfering a word id to one-hot-format vector and then
-        # multiply by the embedding matrix.
-        # embed is the outputs of the hidden layer (embedding layer), it is a
-        # row vector with 'embedding_size' values.
-        with tf.variable_scope(name) as vs:
-            embeddings = tf.get_variable(name='embeddings',
-                                    shape=(vocabulary_size, embedding_size),
-                                    initializer=E_init,
-                                    **E_init_args)
-            embed = tf.nn.embedding_lookup(embeddings, self.inputs)
-            # Construct the variables for the NCE loss (i.e. negative sampling)
-            nce_weights = tf.get_variable(name='nce_weights',
-                                    shape=(vocabulary_size, embedding_size),
-                                    initializer=nce_W_init,
-                                    **nce_W_init_args)
-            nce_biases = tf.get_variable(name='nce_biases',
-                                    shape=(vocabulary_size),
-                                    initializer=nce_b_init,
-                                    **nce_b_init_args)
-
-        # Compute the average NCE loss for the batch.
-        # tf.nce_loss automatically draws a new sample of the negative labels
-        # each time we evaluate the loss.
-        self.nce_cost = tf.reduce_mean(
-            tf.nn.nce_loss(weights=nce_weights, biases=nce_biases,
-                           inputs=embed, labels=train_labels,
-                           num_sampled=num_sampled, num_classes=vocabulary_size,
-                           **nce_loss_args))
-
-        self.outputs = embed
-        self.normalized_embeddings = tf.nn.l2_normalize(embeddings, 1)
-
-        self.all_layers = [self.outputs]
-        self.all_params = [embeddings, nce_weights, nce_biases]
-        self.all_drop = {}
-
-class EmbeddingInputlayer(Layer):
-    """
-    The :class:`EmbeddingInputlayer` class is a fully connected layer,
-    for Word Embedding. Words are input as integer index.
-    The output is the embedded word vector.
-
-    If you have a pre-train matrix, you can assign the matrix into it.
-    To train a word embedding matrix, you can used class:`Word2vecEmbeddingInputlayer`.
-
-    Note that, do not update this embedding matrix.
-
-    Parameters
-    ----------
-    inputs : placeholder
-        For word inputs. integer index format.
-        a 2D tensor : [batch_size, num_steps(num_words)]
-    vocabulary_size : int
-        The size of vocabulary, number of words.
-    embedding_size : int
-        The number of embedding dimensions.
-    E_init : embedding initializer
-        The initializer for initializing the embedding matrix.
-    E_init_args : a dictionary
-        The arguments for embedding initializer
-    name : a string or None
-        An optional name to attach to this layer.
-
-    Variables
-    ------------
-    outputs : a tensor
-        The outputs of embedding layer.
-        the outputs 3D tensor : [batch_size, num_steps(num_words), embedding_size]
-
-    Examples
-    --------
-    >>> vocabulary_size = 50000
-    >>> embedding_size = 200
-    >>> model_file_name = "model_word2vec_50k_200"
-    >>> batch_size = None
-    ...
-    >>> all_var = tl.files.load_npy_to_any(name=model_file_name+'.npy')
-    >>> data = all_var['data']; count = all_var['count']
-    >>> dictionary = all_var['dictionary']
-    >>> reverse_dictionary = all_var['reverse_dictionary']
-    >>> tl.files.save_vocab(count, name='vocab_'+model_file_name+'.txt')
-    >>> del all_var, data, count
-    ...
-    >>> load_params = tl.files.load_npz(name=model_file_name+'.npz')
-    >>> x = tf.placeholder(tf.int32, shape=[batch_size])
-    >>> y_ = tf.placeholder(tf.int32, shape=[batch_size, 1])
-    >>> emb_net = tl.layers.EmbeddingInputlayer(
-    ...                inputs = x,
-    ...                vocabulary_size = vocabulary_size,
-    ...                embedding_size = embedding_size,
-    ...                name ='embedding_layer')
-    >>> tl.layers.initialize_global_variables(sess)
-    >>> tl.files.assign_params(sess, [load_params[0]], emb_net)
-    >>> word = b'hello'
-    >>> word_id = dictionary[word]
-    >>> print('word_id:', word_id)
-    ... 6428
-    ...
-    >>> words = [b'i', b'am', b'hao', b'dong']
-    >>> word_ids = tl.files.words_to_word_ids(words, dictionary)
-    >>> context = tl.files.word_ids_to_words(word_ids, reverse_dictionary)
-    >>> print('word_ids:', word_ids)
-    ... [72, 1226, 46744, 20048]
-    >>> print('context:', context)
-    ... [b'i', b'am', b'hao', b'dong']
-    ...
-    >>> vector = sess.run(emb_net.outputs, feed_dict={x : [word_id]})
-    >>> print('vector:', vector.shape)
-    ... (1, 200)
-    >>> vectors = sess.run(emb_net.outputs, feed_dict={x : word_ids})
-    >>> print('vectors:', vectors.shape)
-    ... (4, 200)
-
-    """
-    def __init__(
-        self,
-        inputs = None,
-        vocabulary_size = 80000,
-        embedding_size = 200,
-        E_init = tf.random_uniform_initializer(-0.1, 0.1),
-        E_init_args = {},
-        name ='embedding_layer',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = inputs
-        print("  [TL] EmbeddingInputlayer %s: (%d, %d)" % (self.name, vocabulary_size, embedding_size))
-
-        with tf.variable_scope(name) as vs:
-            embeddings = tf.get_variable(name='embeddings',
-                                    shape=(vocabulary_size, embedding_size),
-                                    initializer=E_init,
-                                    **E_init_args)
-            embed = tf.nn.embedding_lookup(embeddings, self.inputs)
-
-        self.outputs = embed
-
-        self.all_layers = [self.outputs]
-        self.all_params = [embeddings]
-        self.all_drop = {}
-
-## Dense layer
-class DenseLayer(Layer):
-    """
-    The :class:`DenseLayer` class is a fully connected layer.
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    n_units : int
-        The number of units of the layer.
-    act : activation function
-        The function that is applied to the layer activations.
-    W_init : weights initializer
-        The initializer for initializing the weight matrix.
-    b_init : biases initializer or None
-        The initializer for initializing the bias vector. If None, skip biases.
-    W_init_args : dictionary
-        The arguments for the weights tf.get_variable.
-    b_init_args : dictionary
-        The arguments for the biases tf.get_variable.
-    name : a string or None
-        An optional name to attach to this layer.
-
-    Examples
-    --------
-    >>> network = tl.layers.InputLayer(x, name='input_layer')
-    >>> network = tl.layers.DenseLayer(
-    ...                 network,
-    ...                 n_units=800,
-    ...                 act = tf.nn.relu,
-    ...                 W_init=tf.truncated_normal_initializer(stddev=0.1),
-    ...                 name ='relu_layer'
-    ...                 )
-
-    >>> Without TensorLayer, you can do as follow.
-    >>> W = tf.Variable(
-    ...     tf.random_uniform([n_in, n_units], -1.0, 1.0), name='W')
-    >>> b = tf.Variable(tf.zeros(shape=[n_units]), name='b')
-    >>> y = tf.nn.relu(tf.matmul(inputs, W) + b)
-
-    Notes
-    -----
-    If the input to this layer has more than two axes, it need to flatten the
-    input by using :class:`FlattenLayer` in this case.
-    """
-    def __init__(
-        self,
-        layer = None,
-        n_units = 100,
-        act = tf.identity,
-        W_init = tf.truncated_normal_initializer(stddev=0.1),
-        b_init = tf.constant_initializer(value=0.0),
-        W_init_args = {},
-        b_init_args = {},
-        name ='dense_layer',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-        if self.inputs.get_shape().ndims != 2:
-            raise Exception("The input dimension must be rank 2, please reshape or flatten it")
-
-        n_in = int(self.inputs.get_shape()[-1])
-        self.n_units = n_units
-        print("  [TL] DenseLayer  %s: %d %s" % (self.name, self.n_units, act.__name__))
-        with tf.variable_scope(name) as vs:
-            W = tf.get_variable(name='W', shape=(n_in, n_units), initializer=W_init, **W_init_args )
-            if b_init is not None:
-                try:
-                    b = tf.get_variable(name='b', shape=(n_units), initializer=b_init, **b_init_args )
-                except: # If initializer is a constant, do not specify shape.
-                    b = tf.get_variable(name='b', initializer=b_init, **b_init_args )
-                self.outputs = act(tf.matmul(self.inputs, W) + b)
-            else:
-                self.outputs = act(tf.matmul(self.inputs, W))
-
-        # Hint : list(), dict() is pass by value (shallow), without them, it is
-        # pass by reference.
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-        if b_init is not None:
-            self.all_params.extend( [W, b] )
-        else:
-            self.all_params.extend( [W] )
-
-class ReconLayer(DenseLayer):
-    """
-    The :class:`ReconLayer` class is a reconstruction layer `DenseLayer` which
-    use to pre-train a `DenseLayer`.
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    x_recon : tensorflow variable
-        The variables used for reconstruction.
-    name : a string or None
-        An optional name to attach to this layer.
-    n_units : int
-        The number of units of the layer, should be equal to x_recon
-    act : activation function
-        The activation function that is applied to the reconstruction layer.
-        Normally, for sigmoid layer, the reconstruction activation is sigmoid;
-        for rectifying layer, the reconstruction activation is softplus.
-
-    Examples
-    --------
-    >>> network = tl.layers.InputLayer(x, name='input_layer')
-    >>> network = tl.layers.DenseLayer(network, n_units=196,
-    ...                                 act=tf.nn.sigmoid, name='sigmoid1')
-    >>> recon_layer1 = tl.layers.ReconLayer(network, x_recon=x, n_units=784,
-    ...                                 act=tf.nn.sigmoid, name='recon_layer1')
-    >>> recon_layer1.pretrain(sess, x=x, X_train=X_train, X_val=X_val,
-    ...                         denoise_name=None, n_epoch=1200, batch_size=128,
-    ...                         print_freq=10, save=True, save_name='w1pre_')
-
-    Methods
-    -------
-    pretrain(self, sess, x, X_train, X_val, denoise_name=None, n_epoch=100, batch_size=128, print_freq=10, save=True, save_name='w1pre_')
-        Start to pre-train the parameters of previous DenseLayer.
-
-    Notes
-    -----
-    The input layer should be `DenseLayer` or a layer has only one axes.
-    You may need to modify this part to define your own cost function.
-    By default, the cost is implemented as follow:
-    - For sigmoid layer, the implementation can be `UFLDL <http://deeplearning.stanford.edu/wiki/index.php/UFLDL_Tutorial>`_
-    - For rectifying layer, the implementation can be `Glorot (2011). Deep Sparse Rectifier Neural Networks <http://doi.org/10.1.1.208.6449>`_
-    """
-    def __init__(
-        self,
-        layer = None,
-        x_recon = None,
-        name = 'recon_layer',
-        n_units = 784,
-        act = tf.nn.softplus,
-    ):
-        DenseLayer.__init__(self, layer=layer, n_units=n_units, act=act, name=name)
-        print("     [TL] %s is a ReconLayer" % self.name)
-
-        # y : reconstruction outputs; train_params : parameters to train
-        # Note that: train_params = [W_encoder, b_encoder, W_decoder, b_encoder]
-        y = self.outputs
-        self.train_params = self.all_params[-4:]
-
-        # =====================================================================
-        #
-        # You need to modify the below cost function and optimizer so as to
-        # implement your own pre-train method.
-        #
-        # =====================================================================
-        lambda_l2_w = 0.004
-        learning_rate = 0.0001
-        print("     lambda_l2_w: %f" % lambda_l2_w)
-        print("     learning_rate: %f" % learning_rate)
-
-        # Mean-square-error i.e. quadratic-cost
-        mse = tf.reduce_sum(tf.squared_difference(y, x_recon),  1)
-        mse = tf.reduce_mean(mse)            # in theano: mse = ((y - x) ** 2 ).sum(axis=1).mean()
-            # mse = tf.reduce_mean(tf.reduce_sum(tf.square(tf.sub(y, x_recon)),  1))
-            # mse = tf.reduce_mean(tf.squared_difference(y, x_recon)) # <haodong>: Error
-            # mse = tf.sqrt(tf.reduce_mean(tf.square(y - x_recon)))   # <haodong>: Error
-        # Cross-entropy
-            # ce = cost.cross_entropy(y, x_recon)                                               # <haodong>: list , list , Error (only be used for softmax output)
-            # ce = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, x_recon))          # <haodong>: list , list , Error (only be used for softmax output)
-            # ce = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(y, x_recon))   # <haodong>: list , index , Error (only be used for softmax output)
-        L2_w = tf.contrib.layers.l2_regularizer(lambda_l2_w)(self.train_params[0]) \
-                + tf.contrib.layers.l2_regularizer(lambda_l2_w)(self.train_params[2])           # faster than the code below
-            # L2_w = lambda_l2_w * tf.reduce_mean(tf.square(self.train_params[0])) + lambda_l2_w * tf.reduce_mean( tf.square(self.train_params[2]))
-        # DropNeuro
-        P_o = cost.lo_regularizer(0.03)(self.train_params[0])   # + cost.lo_regularizer(0.5)(self.train_params[2])    # <haodong>: if add lo on decoder, no neuron will be broken
-        P_i = cost.li_regularizer(0.03)(self.train_params[0])  # + cost.li_regularizer(0.001)(self.train_params[2])
-
-        # L1 of activation outputs
-        activation_out = self.all_layers[-2]
-        L1_a = 0.001 * tf.reduce_mean(activation_out)   # <haodong>:  theano: T.mean( self.a[i] )         # some neuron are broken, white and black
-            # L1_a = 0.001 * tf.reduce_mean( tf.reduce_sum(activation_out, 0) )         # <haodong>: some neuron are broken, white and black
-            # L1_a = 0.001 * 100 * tf.reduce_mean( tf.reduce_sum(activation_out, 1) )   # <haodong>: some neuron are broken, white and black
-        # KL Divergence
-        beta = 4
-        rho = 0.15
-        p_hat = tf.reduce_mean(activation_out, 0)   # theano: p_hat = T.mean( self.a[i], axis=0 )
-        try: ## TF1.0
-            KLD = beta * tf.reduce_sum( rho * tf.log(tf.divide(rho, p_hat)) + (1- rho) * tf.log((1- rho)/ (tf.subtract(float(1), p_hat))) )
-        except: ## TF0.12
-            KLD = beta * tf.reduce_sum( rho * tf.log(tf.div(rho, p_hat)) + (1- rho) * tf.log((1- rho)/ (tf.sub(float(1), p_hat))) )
-            # KLD = beta * tf.reduce_sum( rho * tf.log(rho/ p_hat) + (1- rho) * tf.log((1- rho)/(1- p_hat)) )
-            # theano: L1_a = l1_a[i] * T.sum( rho[i] * T.log(rho[i]/ p_hat) + (1- rho[i]) * T.log((1- rho[i])/(1- p_hat)) )
-        # Total cost
-        if act == tf.nn.softplus:
-            print('     use: mse, L2_w, L1_a')
-            self.cost = mse + L1_a + L2_w
-        elif act == tf.nn.sigmoid:
-            # ----------------------------------------------------
-            # Cross-entropy was used in Denoising AE
-            # print('     use: ce, L2_w, KLD')
-            # self.cost = ce + L2_w + KLD
-            # ----------------------------------------------------
-            # Mean-squared-error was used in Vanilla AE
-            print('     use: mse, L2_w, KLD')
-            self.cost = mse + L2_w + KLD
-            # ----------------------------------------------------
-            # Add DropNeuro penalty (P_o) can remove neurons of AE
-            # print('     use: mse, L2_w, KLD, P_o')
-            # self.cost = mse + L2_w + KLD + P_o
-            # ----------------------------------------------------
-            # Add DropNeuro penalty (P_i) can remove neurons of previous layer
-            #   If previous layer is InputLayer, it means remove useless features
-            # print('     use: mse, L2_w, KLD, P_i')
-            # self.cost = mse + L2_w + KLD + P_i
-        else:
-            raise Exception("Don't support the given reconstruct activation function")
-
-        self.train_op = tf.train.AdamOptimizer(learning_rate, beta1=0.9, beta2=0.999,
-                                        epsilon=1e-08, use_locking=False).minimize(self.cost, var_list=self.train_params)
-                # self.train_op = tf.train.GradientDescentOptimizer(1.0).minimize(self.cost, var_list=self.train_params)
-
-    def pretrain(self, sess, x, X_train, X_val, denoise_name=None, n_epoch=100, batch_size=128, print_freq=10,
-                  save=True, save_name='w1pre_'):
-        # ====================================================
-        #
-        # You need to modify the cost function in __init__() so as to
-        # get your own pre-train method.
-        #
-        # ====================================================
-        print("     [*] %s start pretrain" % self.name)
-        print("     batch_size: %d" % batch_size)
-        if denoise_name:
-            print("     denoising layer keep: %f" % self.all_drop[set_keep[denoise_name]])
-            dp_denoise = self.all_drop[set_keep[denoise_name]]
-        else:
-            print("     no denoising layer")
-
-        for epoch in range(n_epoch):
-            start_time = time.time()
-            for X_train_a, _ in iterate.minibatches(X_train, X_train, batch_size, shuffle=True):
-                dp_dict = utils.dict_to_one( self.all_drop )
-                if denoise_name:
-                    dp_dict[set_keep[denoise_name]] = dp_denoise
-                feed_dict = {x: X_train_a}
-                feed_dict.update(dp_dict)
-                sess.run(self.train_op, feed_dict=feed_dict)
-
-            if epoch + 1 == 1 or (epoch + 1) % print_freq == 0:
-                print("Epoch %d of %d took %fs" % (epoch + 1, n_epoch, time.time() - start_time))
-                train_loss, n_batch = 0, 0
-                for X_train_a, _ in iterate.minibatches(X_train, X_train, batch_size, shuffle=True):
-                    dp_dict = utils.dict_to_one( self.all_drop )
-                    feed_dict = {x: X_train_a}
-                    feed_dict.update(dp_dict)
-                    err = sess.run(self.cost, feed_dict=feed_dict)
-                    train_loss += err
-                    n_batch += 1
-                print("   train loss: %f" % (train_loss/ n_batch))
-                val_loss, n_batch = 0, 0
-                for X_val_a, _ in iterate.minibatches(X_val, X_val, batch_size, shuffle=True):
-                    dp_dict = utils.dict_to_one( self.all_drop )
-                    feed_dict = {x: X_val_a}
-                    feed_dict.update(dp_dict)
-                    err = sess.run(self.cost, feed_dict=feed_dict)
-                    val_loss += err
-                    n_batch += 1
-                print("   val loss: %f" % (val_loss/ n_batch))
-                if save:
-                    try:
-                        visualize.W(self.train_params[0].eval(), second=10, saveable=True, shape=[28,28], name=save_name+str(epoch+1), fig_idx=2012)
-                        files.save_npz([self.all_params[0]] , name=save_name+str(epoch+1)+'.npz')
-                    except:
-                        raise Exception("You should change the visualize.W() in ReconLayer.pretrain(), if you want to save the feature images for different dataset")
-
-## Noise layer
-class DropoutLayer(Layer):
-    """
-    The :class:`DropoutLayer` class is a noise layer which randomly set some
-    values to zero by a given keeping probability.
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    keep : float
-        The keeping probability, the lower more values will be set to zero.
-    is_fix : boolean
-        Default False, if True, the keeping probability is fixed and cannot be changed via feed_dict.
-    is_train : boolean
-        If False, skip this layer, default is True.
-    seed : int or None
-        An integer or None to create random seed.
-    name : a string or None
-        An optional name to attach to this layer.
-
-    Examples
-    --------
-    - Define network
-    >>> network = tl.layers.InputLayer(x, name='input_layer')
-    >>> network = tl.layers.DropoutLayer(network, keep=0.8, name='drop1')
-    >>> network = tl.layers.DenseLayer(network, n_units=800, act = tf.nn.relu, name='relu1')
-    >>> ...
-
-    - For training, enable dropout as follow.
-    >>> feed_dict = {x: X_train_a, y_: y_train_a}
-    >>> feed_dict.update( network.all_drop )     # enable noise layers
-    >>> sess.run(train_op, feed_dict=feed_dict)
-    >>> ...
-
-    - For testing, disable dropout as follow.
-    >>> dp_dict = tl.utils.dict_to_one( network.all_drop ) # disable noise layers
-    >>> feed_dict = {x: X_val_a, y_: y_val_a}
-    >>> feed_dict.update(dp_dict)
-    >>> err, ac = sess.run([cost, acc], feed_dict=feed_dict)
-    >>> ...
-
-    Notes
-    -------
-    - A frequent question regarding :class:`DropoutLayer` is that why it donot have `is_train` like :class:`BatchNormLayer`.
-    In many simple cases, user may find it is better to use one inference instead of two inferences for training and testing seperately, :class:`DropoutLayer`
-    allows you to control the dropout rate via `feed_dict`. However, you can fix the keeping probability by setting `is_fix` to True.
-    """
-    def __init__(
-        self,
-        layer = None,
-        keep = 0.5,
-        is_fix = False,
-        is_train = True,
-        seed = None,
-        name = 'dropout_layer',
-    ):
-        Layer.__init__(self, name=name)
-        if is_train is False:
-            print("  [TL] skip DropoutLayer")
-            self.outputs = layer.outputs
-            self.all_layers = list(layer.all_layers)
-            self.all_params = list(layer.all_params)
-            self.all_drop = dict(layer.all_drop)
-        else:
-            self.inputs = layer.outputs
-            print("  [TL] DropoutLayer %s: keep:%f is_fix:%s" % (self.name, keep, is_fix))
-
-            # The name of placeholder for keep_prob is the same with the name
-            # of the Layer.
-            if is_fix:
-                self.outputs = tf.nn.dropout(self.inputs, keep, seed=seed, name=name)
-            else:
-                set_keep[name] = tf.placeholder(tf.float32)
-                self.outputs = tf.nn.dropout(self.inputs, set_keep[name], seed=seed, name=name) # 1.2
-
-            self.all_layers = list(layer.all_layers)
-            self.all_params = list(layer.all_params)
-            self.all_drop = dict(layer.all_drop)
-            if is_fix is False:
-                self.all_drop.update( {set_keep[name]: keep} )
-            self.all_layers.extend( [self.outputs] )
-
-        # print(set_keep[name])
-        #   Tensor("Placeholder_2:0", dtype=float32)
-        # print(denoising1)
-        #   Tensor("Placeholder_2:0", dtype=float32)
-        # print(self.all_drop[denoising1])
-        #   0.8
-        #
-        # https://www.tensorflow.org/versions/r0.8/tutorials/mnist/tf/index.html
-        # The optional feed_dict argument allows the caller to override the
-        # value of tensors in the graph. Each key in feed_dict can be one of
-        # the following types:
-        # If the key is a Tensor, the value may be a Python scalar, string,
-        # list, or numpy ndarray that can be converted to the same dtype as that
-        # tensor. Additionally, if the key is a placeholder, the shape of the
-        # value will be checked for compatibility with the placeholder.
-        # If the key is a SparseTensor, the value should be a SparseTensorValue.
-
-class GaussianNoiseLayer(Layer):
-    """
-    The :class:`GaussianNoiseLayer` class is noise layer that adding noise with
-    normal distribution to the activation.
-
-    Parameters
-    ------------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    mean : float
-    stddev : float
-    is_train : boolean
-        If False, skip this layer, default is True.
-    seed : int or None
-        An integer or None to create random seed.
-    name : a string or None
-        An optional name to attach to this layer.
-    """
-    def __init__(
-        self,
-        layer = None,
-        mean = 0.0,
-        stddev = 1.0,
-        is_train = True,
-        seed = None,
-        name = 'gaussian_noise_layer',
-    ):
-        Layer.__init__(self, name=name)
-        if is_train is False:
-            print("  [TL] skip GaussianNoiseLayer")
-            self.outputs = layer.outputs
-            self.all_layers = list(layer.all_layers)
-            self.all_params = list(layer.all_params)
-            self.all_drop = dict(layer.all_drop)
-        else:
-            self.inputs = layer.outputs
-            print("  [TL] GaussianNoiseLayer %s: mean:%f stddev:%f" % (self.name, mean, stddev))
-            with tf.variable_scope(name) as vs:
-                # noise = np.random.normal(0.0 , sigma , tf.to_int64(self.inputs).get_shape())
-                noise = tf.random_normal(shape = self.inputs.get_shape(), mean=mean, stddev=stddev, seed=seed)
-                self.outputs = self.inputs + noise
-            self.all_layers = list(layer.all_layers)
-            self.all_params = list(layer.all_params)
-            self.all_drop = dict(layer.all_drop)
-
-class DropconnectDenseLayer(Layer):
-    """
-    The :class:`DropconnectDenseLayer` class is ``DenseLayer`` with DropConnect
-    behaviour which randomly remove connection between this layer to previous
-    layer by a given keeping probability.
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    keep : float
-        The keeping probability, the lower more values will be set to zero.
-    n_units : int
-        The number of units of the layer.
-    act : activation function
-        The function that is applied to the layer activations.
-    W_init : weights initializer
-        The initializer for initializing the weight matrix.
-    b_init : biases initializer
-        The initializer for initializing the bias vector.
-    W_init_args : dictionary
-        The arguments for the weights tf.get_variable().
-    b_init_args : dictionary
-        The arguments for the biases tf.get_variable().
-    name : a string or None
-        An optional name to attach to this layer.
-
-    Examples
-    --------
-    >>> network = tl.layers.InputLayer(x, name='input_layer')
-    >>> network = tl.layers.DropconnectDenseLayer(network, keep = 0.8,
-    ...         n_units=800, act = tf.nn.relu, name='dropconnect_relu1')
-    >>> network = tl.layers.DropconnectDenseLayer(network, keep = 0.5,
-    ...         n_units=800, act = tf.nn.relu, name='dropconnect_relu2')
-    >>> network = tl.layers.DropconnectDenseLayer(network, keep = 0.5,
-    ...         n_units=10, act = tl.activation.identity, name='output_layer')
-
-    References
-    ----------
-    - `Wan, L. (2013). Regularization of neural networks using dropconnect <http://machinelearning.wustl.edu/mlpapers/papers/icml2013_wan13>`_
-    """
-    def __init__(
-        self,
-        layer = None,
-        keep = 0.5,
-        n_units = 100,
-        act = tf.identity,
-        W_init = tf.truncated_normal_initializer(stddev=0.1),
-        b_init = tf.constant_initializer(value=0.0),
-        W_init_args = {},
-        b_init_args = {},
-        name ='dropconnect_layer',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-        if self.inputs.get_shape().ndims != 2:
-            raise Exception("The input dimension must be rank 2")
-        n_in = int(self.inputs.get_shape()[-1])
-        self.n_units = n_units
-        print("  [TL] DropconnectDenseLayer %s: %d %s" % (self.name, self.n_units, act.__name__))
-
-        with tf.variable_scope(name) as vs:
-            W = tf.get_variable(name='W', shape=(n_in, n_units), initializer=W_init, **W_init_args )
-            b = tf.get_variable(name='b', shape=(n_units), initializer=b_init, **b_init_args )
-            self.outputs = act(tf.matmul(self.inputs, W) + b)#, name=name)    # 1.2
-
-        set_keep[name] = tf.placeholder(tf.float32)
-        W_dropcon = tf.nn.dropout(W,  set_keep[name])
-        self.outputs = act(tf.matmul(self.inputs, W_dropcon) + b)
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_drop.update( {set_keep[name]: keep} )
-        self.all_layers.extend( [self.outputs] )
-        self.all_params.extend( [W, b] )
-
-## Convolutional layer (Pro)
-
-class Conv1dLayer(Layer):
-    """
-    The :class:`Conv1dLayer` class is a 1D CNN layer, see `tf.nn.convolution <https://www.tensorflow.org/api_docs/python/tf/nn/convolution>`_.
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer, [batch, in_width, in_channels].
-    act : activation function, None for identity.
-    shape : list of shape
-        shape of the filters, [filter_length, in_channels, out_channels].
-    stride : an int.
-        The number of entries by which the filter is moved right at each step.
-    dilation_rate : an int.
-        Specifies the filter upsampling/input downsampling rate.
-    padding : a string from: "SAME", "VALID".
-        The type of padding algorithm to use.
-    use_cudnn_on_gpu : An optional bool. Defaults to True.
-    data_format : As it is 1D conv, default is 'NWC'.
-    W_init : weights initializer
-        The initializer for initializing the weight matrix.
-    b_init : biases initializer or None
-        The initializer for initializing the bias vector. If None, skip biases.
-    W_init_args : dictionary
-        The arguments for the weights tf.get_variable().
-    b_init_args : dictionary
-        The arguments for the biases tf.get_variable().
-    name : a string or None
-        An optional name to attach to this layer.
-    """
-    def __init__(
-        self,
-        layer = None,
-        act = tf.identity,
-        shape = [5, 1, 5],
-        stride = 1,
-        dilation_rate = 1,
-        padding='SAME',
-        use_cudnn_on_gpu=None,
-        data_format='NWC',
-        W_init = tf.truncated_normal_initializer(stddev=0.02),
-        b_init = tf.constant_initializer(value=0.0),
-        W_init_args = {},
-        b_init_args = {},
-        name ='cnn_layer',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-        print("  [TL] Conv1dLayer %s: shape:%s stride:%s pad:%s act:%s" %
-                            (self.name, str(shape), str(stride), padding, act.__name__))
-        if act is None:
-            act = tf.identity
-        with tf.variable_scope(name) as vs:
-            W = tf.get_variable(name='W_conv1d', shape=shape, initializer=W_init, **W_init_args )
-            self.outputs = tf.nn.convolution(
-                self.inputs,
-                W,
-                strides=(stride,),
-                padding=padding,
-                dilation_rate=(dilation_rate,),
-                data_format=data_format
-            ) #1.2
-            if b_init:
-                b = tf.get_variable(name='b_conv1d', shape=(shape[-1]), initializer=b_init, **b_init_args )
-                self.outputs = self.outputs + b
-
-            self.outputs = act(self.outputs)
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-        if b_init:
-            self.all_params.extend( [W, b] )
-        else:
-            self.all_params.extend( [W] )
-
-class Conv2dLayer(Layer):
-    """
-    The :class:`Conv2dLayer` class is a 2D CNN layer, see `tf.nn.conv2d <https://www.tensorflow.org/versions/master/api_docs/python/nn.html#conv2d>`_.
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    act : activation function
-        The function that is applied to the layer activations.
-    shape : list of shape
-        shape of the filters, [filter_height, filter_width, in_channels, out_channels].
-    strides : a list of ints.
-        The stride of the sliding window for each dimension of input.\n
-        It Must be in the same order as the dimension specified with format.
-    padding : a string from: "SAME", "VALID".
-        The type of padding algorithm to use.
-    W_init : weights initializer
-        The initializer for initializing the weight matrix.
-    b_init : biases initializer or None
-        The initializer for initializing the bias vector. If None, skip biases.
-    W_init_args : dictionary
-        The arguments for the weights tf.get_variable().
-    b_init_args : dictionary
-        The arguments for the biases tf.get_variable().
-    use_cudnn_on_gpu : bool, default is None.
-    data_format : string "NHWC" or "NCHW", default is "NHWC"
-    name : a string or None
-        An optional name to attach to this layer.
-
-    Notes
-    ------
-    - shape = [h, w, the number of output channel of previous layer, the number of output channels]
-    - the number of output channel of a layer is its last dimension.
-
-    Examples
-    --------
-    >>> x = tf.placeholder(tf.float32, shape=[None, 28, 28, 1])
-    >>> network = tl.layers.InputLayer(x, name='input_layer')
-    >>> network = tl.layers.Conv2dLayer(network,
-    ...                   act = tf.nn.relu,
-    ...                   shape = [5, 5, 1, 32],  # 32 features for each 5x5 patch
-    ...                   strides=[1, 1, 1, 1],
-    ...                   padding='SAME',
-    ...                   W_init=tf.truncated_normal_initializer(stddev=5e-2),
-    ...                   W_init_args={},
-    ...                   b_init = tf.constant_initializer(value=0.0),
-    ...                   b_init_args = {},
-    ...                   name ='cnn_layer1')     # output: (?, 28, 28, 32)
-    >>> network = tl.layers.PoolLayer(network,
-    ...                   ksize=[1, 2, 2, 1],
-    ...                   strides=[1, 2, 2, 1],
-    ...                   padding='SAME',
-    ...                   pool = tf.nn.max_pool,
-    ...                   name ='pool_layer1',)   # output: (?, 14, 14, 32)
-
-    >>> Without TensorLayer, you can implement 2d convolution as follow.
-    >>> W = tf.Variable(W_init(shape=[5, 5, 1, 32], ), name='W_conv')
-    >>> b = tf.Variable(b_init(shape=[32], ), name='b_conv')
-    >>> outputs = tf.nn.relu( tf.nn.conv2d(inputs, W,
-    ...                       strides=[1, 1, 1, 1],
-    ...                       padding='SAME') + b )
-    """
-    def __init__(
-        self,
-        layer = None,
-        act = tf.identity,
-        shape = [5, 5, 1, 100],
-        strides=[1, 1, 1, 1],
-        padding='SAME',
-        W_init = tf.truncated_normal_initializer(stddev=0.02),
-        b_init = tf.constant_initializer(value=0.0),
-        W_init_args = {},
-        b_init_args = {},
-        use_cudnn_on_gpu = None,
-        data_format = None,
-        name ='cnn_layer',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-        print("  [TL] Conv2dLayer %s: shape:%s strides:%s pad:%s act:%s" %
-                            (self.name, str(shape), str(strides), padding, act.__name__))
-
-        with tf.variable_scope(name) as vs:
-            W = tf.get_variable(name='W_conv2d', shape=shape, initializer=W_init, **W_init_args )
-            if b_init:
-                b = tf.get_variable(name='b_conv2d', shape=(shape[-1]), initializer=b_init, **b_init_args )
-                self.outputs = act( tf.nn.conv2d(self.inputs, W, strides=strides, padding=padding, use_cudnn_on_gpu=use_cudnn_on_gpu, data_format=data_format) + b )
-            else:
-                self.outputs = act( tf.nn.conv2d(self.inputs, W, strides=strides, padding=padding, use_cudnn_on_gpu=use_cudnn_on_gpu, data_format=data_format))
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-        if b_init:
-            self.all_params.extend( [W, b] )
-        else:
-            self.all_params.extend( [W] )
-
-class DeConv2dLayer(Layer):
-    """
-    The :class:`DeConv2dLayer` class is deconvolutional 2D layer, see `tf.nn.conv2d_transpose <https://www.tensorflow.org/versions/master/api_docs/python/nn.html#conv2d_transpose>`_.
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    act : activation function
-        The function that is applied to the layer activations.
-    shape : list of shape
-        shape of the filters, [height, width, output_channels, in_channels], filter's in_channels dimension must match that of value.
-    output_shape : list of output shape
-        representing the output shape of the deconvolution op.
-    strides : a list of ints.
-        The stride of the sliding window for each dimension of the input tensor.
-    padding : a string from: "SAME", "VALID".
-        The type of padding algorithm to use.
-    W_init : weights initializer
-        The initializer for initializing the weight matrix.
-    b_init : biases initializer
-        The initializer for initializing the bias vector. If None, skip biases.
-    W_init_args : dictionary
-        The arguments for the weights initializer.
-    b_init_args : dictionary
-        The arguments for the biases initializer.
-    name : a string or None
-        An optional name to attach to this layer.
-
-    Notes
-    -----
-    - shape = [h, w, the number of output channels of this layer, the number of output channel of previous layer]
-    - output_shape = [batch_size, any, any, the number of output channels of this layer]
-    - the number of output channel of a layer is its last dimension.
-
-    Examples
-    ---------
-    - A part of the generator in DCGAN example
-    >>> batch_size = 64
-    >>> inputs = tf.placeholder(tf.float32, [batch_size, 100], name='z_noise')
-    >>> net_in = tl.layers.InputLayer(inputs, name='g/in')
-    >>> net_h0 = tl.layers.DenseLayer(net_in, n_units = 8192,
-    ...                            W_init = tf.random_normal_initializer(stddev=0.02),
-    ...                            act = tf.identity, name='g/h0/lin')
-    >>> print(net_h0.outputs._shape)
-    ... (64, 8192)
-    >>> net_h0 = tl.layers.ReshapeLayer(net_h0, shape = [-1, 4, 4, 512], name='g/h0/reshape')
-    >>> net_h0 = tl.layers.BatchNormLayer(net_h0, act=tf.nn.relu, is_train=is_train, name='g/h0/batch_norm')
-    >>> print(net_h0.outputs._shape)
-    ... (64, 4, 4, 512)
-    >>> net_h1 = tl.layers.DeConv2dLayer(net_h0,
-    ...                            shape = [5, 5, 256, 512],
-    ...                            output_shape = [batch_size, 8, 8, 256],
-    ...                            strides=[1, 2, 2, 1],
-    ...                            act=tf.identity, name='g/h1/decon2d')
-    >>> net_h1 = tl.layers.BatchNormLayer(net_h1, act=tf.nn.relu, is_train=is_train, name='g/h1/batch_norm')
-    >>> print(net_h1.outputs._shape)
-    ... (64, 8, 8, 256)
-
-    - U-Net
-    >>> ....
-    >>> conv10 = tl.layers.Conv2dLayer(conv9, act=tf.nn.relu,
-    ...        shape=[3,3,1024,1024], strides=[1,1,1,1], padding='SAME',
-    ...        W_init=w_init, b_init=b_init, name='conv10')
-    >>> print(conv10.outputs)
-    ... (batch_size, 32, 32, 1024)
-    >>> deconv1 = tl.layers.DeConv2dLayer(conv10, act=tf.nn.relu,
-    ...         shape=[3,3,512,1024], strides=[1,2,2,1], output_shape=[batch_size,64,64,512],
-    ...         padding='SAME', W_init=w_init, b_init=b_init, name='devcon1_1')
-    """
-    def __init__(
-        self,
-        layer = None,
-        act = tf.identity,
-        shape = [3, 3, 128, 256],
-        output_shape = [1, 256, 256, 128],
-        strides = [1, 2, 2, 1],
-        padding = 'SAME',
-        W_init = tf.truncated_normal_initializer(stddev=0.02),
-        b_init = tf.constant_initializer(value=0.0),
-        W_init_args = {},
-        b_init_args = {},
-        name ='decnn2d_layer',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-        print("  [TL] DeConv2dLayer %s: shape:%s out_shape:%s strides:%s pad:%s act:%s" %
-                            (self.name, str(shape), str(output_shape), str(strides), padding, act.__name__))
-        # print("  DeConv2dLayer: Untested")
-        with tf.variable_scope(name) as vs:
-            W = tf.get_variable(name='W_deconv2d', shape=shape, initializer=W_init, **W_init_args )
-            if b_init:
-                b = tf.get_variable(name='b_deconv2d', shape=(shape[-2]), initializer=b_init, **b_init_args )
-                self.outputs = act( tf.nn.conv2d_transpose(self.inputs, W, output_shape=output_shape, strides=strides, padding=padding) + b )
-            else:
-                self.outputs = act( tf.nn.conv2d_transpose(self.inputs, W, output_shape=output_shape, strides=strides, padding=padding))
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-        if b_init:
-            self.all_params.extend( [W, b] )
-        else:
-            self.all_params.extend( [W] )
-
-class Conv3dLayer(Layer):
-    """
-    The :class:`Conv3dLayer` class is a 3D CNN layer, see `tf.nn.conv3d <https://www.tensorflow.org/versions/master/api_docs/python/nn.html#conv3d>`_.
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    act : activation function
-        The function that is applied to the layer activations.
-    shape : list of shape
-        shape of the filters, [filter_depth, filter_height, filter_width, in_channels, out_channels].
-    strides : a list of ints. 1-D of length 4.
-        The stride of the sliding window for each dimension of input. Must be in the same order as the dimension specified with format.
-    padding : a string from: "SAME", "VALID".
-        The type of padding algorithm to use.
-    W_init : weights initializer
-        The initializer for initializing the weight matrix.
-    b_init : biases initializer
-        The initializer for initializing the bias vector.
-    W_init_args : dictionary
-        The arguments for the weights initializer.
-    b_init_args : dictionary
-        The arguments for the biases initializer.
-    name : a string or None
-        An optional name to attach to this layer.
-    """
-    def __init__(
-        self,
-        layer = None,
-        act = tf.identity,
-        shape = [2, 2, 2, 64, 128],
-        strides=[1, 2, 2, 2, 1],
-        padding='SAME',
-        W_init = tf.truncated_normal_initializer(stddev=0.02),
-        b_init = tf.constant_initializer(value=0.0),
-        W_init_args = {},
-        b_init_args = {},
-        name ='cnn3d_layer',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-        print("  [TL] Conv3dLayer %s: shape:%s strides:%s pad:%s act:%s" % (self.name, str(shape), str(strides), padding, act.__name__))
-
-        with tf.variable_scope(name) as vs:
-            # W = tf.Variable(W_init(shape=shape, **W_init_args), name='W_conv')
-            # b = tf.Variable(b_init(shape=[shape[-1]], **b_init_args), name='b_conv')
-            W = tf.get_variable(name='W_conv3d', shape=shape, initializer=W_init, **W_init_args )
-            b = tf.get_variable(name='b_conv3d', shape=(shape[-1]), initializer=b_init, **b_init_args )
-            self.outputs = act( tf.nn.conv3d(self.inputs, W, strides=strides, padding=padding, name=None) + b )
-
-        # self.outputs = act( tf.nn.conv3d(self.inputs, W, strides=strides, padding=padding, name=None) + b )
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-        self.all_params.extend( [W, b] )
-
-class DeConv3dLayer(Layer):
-    """The :class:`DeConv3dLayer` class is deconvolutional 3D layer, see `tf.nn.conv3d_transpose <https://www.tensorflow.org/versions/master/api_docs/python/nn.html#conv3d_transpose>`_.
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    act : activation function
-        The function that is applied to the layer activations.
-    shape : list of shape
-        shape of the filters, [depth, height, width, output_channels, in_channels], filter's in_channels dimension must match that of value.
-    output_shape : list of output shape
-        representing the output shape of the deconvolution op.
-    strides : a list of ints.
-        The stride of the sliding window for each dimension of the input tensor.
-    padding : a string from: "SAME", "VALID".
-        The type of padding algorithm to use.
-    W_init : weights initializer
-        The initializer for initializing the weight matrix.
-    b_init : biases initializer
-        The initializer for initializing the bias vector.
-    W_init_args : dictionary
-        The arguments for the weights initializer.
-    b_init_args : dictionary
-        The arguments for the biases initializer.
-    name : a string or None
-        An optional name to attach to this layer.
-    """
-    def __init__(
-        self,
-        layer = None,
-        act = tf.identity,
-        shape = [2, 2, 2, 128, 256],
-        output_shape = [1, 12, 32, 32, 128],
-        strides = [1, 2, 2, 2, 1],
-        padding = 'SAME',
-        W_init = tf.truncated_normal_initializer(stddev=0.02),
-        b_init = tf.constant_initializer(value=0.0),
-        W_init_args = {},
-        b_init_args = {},
-        name ='decnn3d_layer',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-        print("  [TL] DeConv3dLayer %s: shape:%s out_shape:%s strides:%s pad:%s act:%s" %
-                            (self.name, str(shape), str(output_shape), str(strides), padding, act.__name__))
-
-        with tf.variable_scope(name) as vs:
-            W = tf.get_variable(name='W_deconv3d', shape=shape, initializer=W_init, **W_init_args )
-            b = tf.get_variable(name='b_deconv3d', shape=(shape[-2]), initializer=b_init, **b_init_args )
-
-            self.outputs = act( tf.nn.conv3d_transpose(self.inputs, W, output_shape=output_shape, strides=strides, padding=padding) + b )
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-        self.all_params.extend( [W, b] )
-
-class UpSampling2dLayer(Layer):
-    """The :class:`UpSampling2dLayer` class is upSampling 2d layer, see `tf.image.resize_images <https://www.tensorflow.org/versions/master/api_docs/python/image/resizing#resize_images>`_.
-
-    Parameters
-    -----------
-    layer : a layer class with 4-D Tensor of shape [batch, height, width, channels] or 3-D Tensor of shape [height, width, channels].
-    size : a tuple of int or float.
-        (height, width) scale factor or new size of height and width.
-    is_scale : boolean, if True (default), size is scale factor, otherwise, size is number of pixels of height and width.
-    method : 0, 1, 2, 3. ResizeMethod. Defaults to ResizeMethod.BILINEAR.
-        - ResizeMethod.BILINEAR, Bilinear interpolation.
-        - ResizeMethod.NEAREST_NEIGHBOR, Nearest neighbor interpolation.
-        - ResizeMethod.BICUBIC, Bicubic interpolation.
-        - ResizeMethod.AREA, Area interpolation.
-    align_corners : bool. If true, exactly align all 4 corners of the input and output. Defaults to false.
-    name : a string or None
-        An optional name to attach to this layer.
-    """
-    def __init__(
-        self,
-        layer = None,
-        size = [],
-        is_scale = True,
-        method = 0,
-        align_corners = False,
-        name ='upsample2d_layer',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-        if len(self.inputs.get_shape()) == 3:
-            if is_scale:
-                size_h = size[0] * int(self.inputs.get_shape()[0])
-                size_w = size[1] * int(self.inputs.get_shape()[1])
-                size = [int(size_h), int(size_w)]
-        elif len(self.inputs.get_shape()) == 4:
-            if is_scale:
-                size_h = size[0] * int(self.inputs.get_shape()[1])
-                size_w = size[1] * int(self.inputs.get_shape()[2])
-                size = [int(size_h), int(size_w)]
-        else:
-            raise Exception("Donot support shape %s" % self.inputs.get_shape())
-        print("  [TL] UpSampling2dLayer %s: is_scale:%s size:%s method:%d align_corners:%s" %
-                                (name, is_scale, size, method, align_corners))
-        with tf.variable_scope(name) as vs:
-            try:
-                self.outputs = tf.image.resize_images(self.inputs, size=size, method=method, align_corners=align_corners)
-            except: # for TF 0.10
-                self.outputs = tf.image.resize_images(self.inputs, new_height=size[0], new_width=size[1], method=method, align_corners=align_corners)
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-
-class DownSampling2dLayer(Layer):
-    """The :class:`DownSampling2dLayer` class is downSampling 2d layer, see `tf.image.resize_images <https://www.tensorflow.org/versions/master/api_docs/python/image/resizing#resize_images>`_.
-
-    Parameters
-    -----------
-    layer : a layer class with 4-D Tensor of shape [batch, height, width, channels] or 3-D Tensor of shape [height, width, channels].
-    size : a tupe of int or float.
-        (height, width) scale factor or new size of height and width.
-    is_scale : boolean, if True (default), size is scale factor, otherwise, size is number of pixels of height and width.
-    method : 0, 1, 2, 3. ResizeMethod. Defaults to ResizeMethod.BILINEAR.
-        - ResizeMethod.BILINEAR, Bilinear interpolation.
-        - ResizeMethod.NEAREST_NEIGHBOR, Nearest neighbor interpolation.
-        - ResizeMethod.BICUBIC, Bicubic interpolation.
-        - ResizeMethod.AREA, Area interpolation.
-    align_corners : bool. If true, exactly align all 4 corners of the input and output. Defaults to false.
-    name : a string or None
-        An optional name to attach to this layer.
-    """
-    def __init__(
-        self,
-        layer = None,
-        size = [],
-        is_scale = True,
-        method = 0,
-        align_corners = False,
-        name ='downsample2d_layer',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-        if len(self.inputs.get_shape()) == 3:
-            if is_scale:
-                size_h = size[0] * int(self.inputs.get_shape()[0])
-                size_w = size[1] * int(self.inputs.get_shape()[1])
-                size = [int(size_h), int(size_w)]
-        elif len(self.inputs.get_shape()) == 4:
-            if is_scale:
-                size_h = size[0] * int(self.inputs.get_shape()[1])
-                size_w = size[1] * int(self.inputs.get_shape()[2])
-                size = [int(size_h), int(size_w)]
-        else:
-            raise Exception("Donot support shape %s" % self.inputs.get_shape())
-        print("  [TL] DownSampling2dLayer %s: is_scale:%s size:%s method:%d, align_corners:%s" %
-                                (name, is_scale, size, method, align_corners))
-        with tf.variable_scope(name) as vs:
-            try:
-                self.outputs = tf.image.resize_images(self.inputs, size=size, method=method, align_corners=align_corners)
-            except: # for TF 0.10
-                self.outputs = tf.image.resize_images(self.inputs, new_height=size[0], new_width=size[1], method=method, align_corners=align_corners)
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-
-
-def AtrousConv1dLayer(net, n_filter=32, filter_size=2, stride=1, dilation=1, act=None,
-        padding='SAME', use_cudnn_on_gpu=None,data_format='NWC',
-        W_init = tf.truncated_normal_initializer(stddev=0.02),
-        b_init = tf.constant_initializer(value=0.0),
-        W_init_args = {}, b_init_args = {},name ='conv1d',):
-    """Wrapper for :class:`AtrousConv1dLayer`, if you don't understand how to use :class:`Conv1dLayer`, this function may be easier.
-
-    Parameters
-    ----------
-    net : TensorLayer layer.
-    n_filter : number of filter.
-    filter_size : an int.
-    stride : an int.
-    dilation : an int, filter dilation size.
-    act : None or activation function.
-    others : see :class:`Conv1dLayer`.
-    """
-    if act is None:
-        act = tf.identity
-    net = Conv1dLayer(layer = net,
-            act = act,
-            shape = [filter_size, int(net.outputs.get_shape()[-1]), n_filter],
-            stride = stride,
-            padding = padding,
-            dilation_rate = dilation,
-            use_cudnn_on_gpu = use_cudnn_on_gpu,
-            data_format = data_format,
-            W_init = W_init,
-            b_init = b_init,
-            W_init_args = W_init_args,
-            b_init_args = b_init_args,
-            name = name,
-        )
-    return net
-
-
-class AtrousConv2dLayer(Layer):
-    """The :class:`AtrousConv2dLayer` class is Atrous convolution (a.k.a. convolution with holes or dilated convolution) 2D layer, see `tf.nn.atrous_conv2d <https://www.tensorflow.org/versions/master/api_docs/python/nn.html#atrous_conv2d>`_.
-
-    Parameters
-    -----------
-    layer : a layer class with 4-D Tensor of shape [batch, height, width, channels].
-    filters : A 4-D Tensor with the same type as value and shape [filter_height, filter_width, in_channels, out_channels]. filters' in_channels dimension must match that of value. Atrous convolution is equivalent to standard convolution with upsampled filters with effective height filter_height + (filter_height - 1) * (rate - 1) and effective width filter_width + (filter_width - 1) * (rate - 1), produced by inserting rate - 1 zeros along consecutive elements across the filters' spatial dimensions.
-    n_filter : number of filter.
-    filter_size : tuple (height, width) for filter size.
-    rate : A positive int32. The stride with which we sample input values across the height and width dimensions. Equivalently, the rate by which we upsample the filter values by inserting zeros across the height and width dimensions. In the literature, the same parameter is sometimes called input stride or dilation.
-    act : activation function, None for linear.
-    padding : A string, either 'VALID' or 'SAME'. The padding algorithm.
-    W_init : weights initializer. The initializer for initializing the weight matrix.
-    b_init : biases initializer or None. The initializer for initializing the bias vector. If None, skip biases.
-    W_init_args : dictionary. The arguments for the weights tf.get_variable().
-    b_init_args : dictionary. The arguments for the biases tf.get_variable().
-    name : a string or None, an optional name to attach to this layer.
-    """
-    def __init__(
-        self,
-        layer = None,
-        n_filter = 32,
-        filter_size = (3,3),
-        rate = 2,
-        act = None,
-        padding = 'SAME',
-        W_init = tf.truncated_normal_initializer(stddev=0.02),
-        b_init = tf.constant_initializer(value=0.0),
-        W_init_args = {},
-        b_init_args = {},
-        name = 'atrou2d'
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-        print("  [TL] AtrousConv2dLayer %s: n_filter:%d filter_size:%s rate:%d pad:%s act:%s" %
-                            (self.name, n_filter, filter_size, rate, padding, act.__name__))
-        if act is None:
-            act = tf.identity
-        with tf.variable_scope(name) as vs:
-            shape = [filter_size[0], filter_size[1], int(self.inputs.get_shape()[-1]), n_filter]
-            filters = tf.get_variable(name='filter', shape=shape, initializer=W_init, **W_init_args )
-            if b_init:
-                b = tf.get_variable(name='b', shape=(n_filter), initializer=b_init, **b_init_args )
-                self.outputs = act(tf.nn.atrous_conv2d(self.inputs, filters, rate, padding) + b)
-            else:
-                self.outputs = act(tf.nn.atrous_conv2d(self.inputs, filters, rate, padding))
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-        if b_init:
-            self.all_params.extend( [filters, b] )
-        else:
-            self.all_params.extend( [filters] )
-
-class SeparableConv2dLayer(Layer):# Untested
-    """The :class:`SeparableConv2dLayer` class is 2-D convolution with separable filters, see `tf.layers.separable_conv2d <https://www.tensorflow.org/api_docs/python/tf/layers/separable_conv2d>`_.
-
-    Parameters
-    -----------
-    layer : a layer class
-    filters : integer, the dimensionality of the output space (i.e. the number output of filters in the convolution).
-    kernel_size : a tuple or list of N positive integers specifying the spatial dimensions of of the filters. Can be a single integer to specify the same value for all spatial dimensions.
-    strides : a tuple or list of N positive integers specifying the strides of the convolution. Can be a single integer to specify the same value for all spatial dimensions. Specifying any stride value != 1 is incompatible with specifying any dilation_rate value != 1.
-    padding : one of "valid" or "same" (case-insensitive).
-    data_format : A string, one of channels_last (default) or channels_first. The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shapedata_format = 'NWHC' (batch, width, height, channels) while channels_first corresponds to inputs with shape (batch, channels, width, height).
-    dilation_rate : an integer or tuple/list of 2 integers, specifying the dilation rate to use for dilated convolution. Can be a single integer to specify the same value for all spatial dimensions. Currently, specifying any dilation_rate value != 1 is incompatible with specifying any stride value != 1.
-    depth_multiplier : The number of depthwise convolution output channels for each input channel. The total number of depthwise convolution output channels will be equal to num_filters_in * depth_multiplier.
-    act (activation) : Activation function. Set it to None to maintain a linear activation.
-    use_bias : Boolean, whether the layer uses a bias.
-    depthwise_initializer : An initializer for the depthwise convolution kernel.
-    pointwise_initializer : An initializer for the pointwise convolution kernel.
-    bias_initializer : An initializer for the bias vector. If None, no bias will be applied.
-    depthwise_regularizer : Optional regularizer for the depthwise convolution kernel.
-    pointwise_regularizer : Optional regularizer for the pointwise convolution kernel.
-    bias_regularizer : Optional regularizer for the bias vector.
-    activity_regularizer : Regularizer function for the output.
-    name : a string or None, an optional name to attach to this layer.
-    """
-    def __init__(
-        self,
-        layer = None,
-        filters = None,
-        kernel_size=5,
-        strides=(1, 1),
-        padding='valid',
-        data_format='channels_last',
-        dilation_rate=(1, 1),
-        depth_multiplier=1,
-        act=None,
-        use_bias=True,
-        depthwise_initializer=None,
-        pointwise_initializer=None,
-        bias_initializer=tf.zeros_initializer,
-        depthwise_regularizer=None,
-        pointwise_regularizer=None,
-        bias_regularizer=None,
-        activity_regularizer=None,
-        name = 'atrou2d'
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-        assert filters is not None
-        assert tf.__version__ > "0.12.1", "This layer only supports for TF 1.0+"
-        if act is None:
-            act = tf.identity
-
-        bias_initializer = bias_initializer()
-
-        print("  [TL] SeparableConv2dLayer %s: filters:%s kernel_size:%s strides:%s padding:%s dilation_rate:%s depth_multiplier:%s act:%s" %
-                            (self.name, str(filters), str(kernel_size), str(strides), padding, str(dilation_rate), str(depth_multiplier), act.__name__))
-
-        with tf.variable_scope(name) as vs:
-            self.outputs = tf.layers.separable_conv2d(self.inputs, filters, kernel_size,
-                 strides=strides, padding=padding, data_format=data_format,
-                 dilation_rate=dilation_rate, depth_multiplier=depth_multiplier, activation=act,
-                 use_bias=use_bias, depthwise_initializer=depthwise_initializer, pointwise_initializer=pointwise_initializer,
-                 bias_initializer=bias_initializer, depthwise_regularizer=depthwise_regularizer,
-                 pointwise_regularizer=pointwise_regularizer, bias_regularizer=bias_regularizer, activity_regularizer=activity_regularizer,)
-                 #trainable=True, name=None, reuse=None)
-
-            variables = tf.get_collection(TF_GRAPHKEYS_VARIABLES, scope=vs.name)
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-        self.all_params.extend( variables )
-
-## Initializers for Convuolutional Layers
-def deconv2d_bilinear_upsampling_initializer(shape):
-    """Returns initializer that can be passed to DeConv2dLayer to initalize the
-    weights to correspond to channel wise bilinear upsampling.
-    Used in some segmantic segmentation approches such as [FCN](https://arxiv.org/abs/1605.06211)
-
-    Parameters
-    ----------
-        shape : list of shape
-            shape of the filters, [height, width, output_channels, in_channels], must match that passed to DeConv2dLayer
-
-    Returns
-    ----------
-        tf.constant_initializer
-            with weights set to correspond to per channel bilinear upsampling when passed as W_int in DeConv2dLayer
-
-    Examples
-    --------
-    >>> rescale_factor = 2 #upsampling by a factor of 2, ie e.g 100->200
-    >>> filter_size = (2 * rescale_factor - rescale_factor % 2) #Corresponding bilinear filter size
-    >>> num_in_channels = 3
-    >>> num_out_channels = 3
-    >>> deconv_filter_shape = [filter_size, filter_size, num_out_channels, num_in_channels]
-    >>> x = tf.placeholder(tf.float32, [1, imsize, imsize, num_channels])
-    >>> network = tl.layers.InputLayer(x, name='input_layer')
-    >>> bilinear_init = deconv2d_bilinear_upsampling_initializer(shape=filter_shape)
-    >>> network = tl.layers.DeConv2dLayer(network,
-                            shape = filter_shape,
-                            output_shape = [1, imsize*rescale_factor, imsize*rescale_factor, num_out_channels],
-                            strides=[1, rescale_factor, rescale_factor, 1],
-                            W_init=bilinear_init,
-                            padding='SAME',
-                            act=tf.identity, name='g/h1/decon2d')
-    """
-    if shape[0] != shape[1]:
-        raise Exception('deconv2d_bilinear_upsampling_initializer only supports symmetrical filter sizes')
-    if shape[3] < shape [2]:
-        raise Exception('deconv2d_bilinear_upsampling_initializer behaviour is not defined for num_in_channels < num_out_channels ')
-
-    filter_size = shape[0]
-    num_out_channels = shape[2]
-    num_in_channels = shape[3]
-
-    #Create bilinear filter kernel as numpy array
-    bilinear_kernel = np.zeros([filter_size, filter_size], dtype=np.float32)
-    scale_factor = (filter_size + 1) // 2
-    if filter_size % 2 == 1:
-        center = scale_factor - 1
-    else:
-        center = scale_factor - 0.5
-    for x in range(filter_size):
-        for y in range(filter_size):
-            bilinear_kernel[x,y] = (1 - abs(x - center) / scale_factor) * \
-                                   (1 - abs(y - center) / scale_factor)
-    weights = np.zeros((filter_size, filter_size, num_out_channels, num_in_channels))
-    for i in range(num_out_channels):
-        weights[:, :, i, i] = bilinear_kernel
-
-    #assign numpy array to constant_initalizer and pass to get_variable
-    bilinear_weights_init = tf.constant_initializer(value=weights, dtype=tf.float32)
-    return bilinear_weights_init
-
-## Convolutional layer (Simplified)
-def Conv1d(net, n_filter=32, filter_size=5, stride=1, dilation_rate=1, act=None,
-        padding='SAME', use_cudnn_on_gpu=None, data_format="NWC",
-        W_init = tf.truncated_normal_initializer(stddev=0.02),
-        b_init = tf.constant_initializer(value=0.0),
-        W_init_args = {}, b_init_args = {}, name ='conv1d',):
-    """Wrapper for :class:`Conv1dLayer`, if you don't understand how to use :class:`Conv1dLayer`, this function may be easier.
-
-    Parameters
-    ----------
-    net : TensorLayer layer.
-    n_filter : number of filter.
-    filter_size : an int.
-    stride : an int.
-    dilation_rate : As it is 1D conv, the default is "NWC".
-    act : None or activation function.
-    others : see :class:`Conv1dLayer`.
-
-    Examples
-    ---------
-    >>> x = tf.placeholder(tf.float32, [batch_size, width])
-    >>> y_ = tf.placeholder(tf.int64, shape=[batch_size,])
-    >>> n = InputLayer(x, name='in')
-    >>> n = ReshapeLayer(n, [-1, width, 1], name='rs')
-    >>> n = Conv1d(n, 64, 3, 1, act=tf.nn.relu, name='c1')
-    >>> n = MaxPool1d(n, 2, 2, padding='valid', name='m1')
-    >>> n = Conv1d(n, 128, 3, 1, act=tf.nn.relu, name='c2')
-    >>> n = MaxPool1d(n, 2, 2, padding='valid', name='m2')
-    >>> n = Conv1d(n, 128, 3, 1, act=tf.nn.relu, name='c3')
-    >>> n = MaxPool1d(n, 2, 2, padding='valid', name='m3')
-    >>> n = FlattenLayer(n, name='f')
-    >>> n = DenseLayer(n, 500, tf.nn.relu, name='d1')
-    >>> n = DenseLayer(n, 100, tf.nn.relu, name='d2')
-    >>> n = DenseLayer(n, 2, tf.identity, name='o')
-    """
-    if act is None:
-        act = tf.identity
-    net = Conv1dLayer(layer = net,
-            act = act,
-            shape = [filter_size, int(net.outputs.get_shape()[-1]), n_filter],
-            stride = stride,
-            dilation_rate = dilation_rate,
-            padding = padding,
-            use_cudnn_on_gpu = use_cudnn_on_gpu,
-            data_format = data_format,
-            W_init = W_init,
-            b_init = b_init,
-            W_init_args = W_init_args,
-            b_init_args = b_init_args,
-            name = name,
-        )
-    return net
-
-def Conv2d(net, n_filter=32, filter_size=(3, 3), strides=(1, 1), act = None,
-        padding='SAME', W_init = tf.truncated_normal_initializer(stddev=0.02), b_init = tf.constant_initializer(value=0.0),
-        W_init_args = {}, b_init_args = {}, use_cudnn_on_gpu = None, data_format = None,name ='conv2d',):
-    """Wrapper for :class:`Conv2dLayer`, if you don't understand how to use :class:`Conv2dLayer`, this function may be easier.
-
-    Parameters
-    ----------
-    net : TensorLayer layer.
-    n_filter : number of filter.
-    filter_size : tuple (height, width) for filter size.
-    strides : tuple (height, width) for strides.
-    act : None or activation function.
-    others : see :class:`Conv2dLayer`.
-
-    Examples
-    --------
-    >>> w_init = tf.truncated_normal_initializer(stddev=0.01)
-    >>> b_init = tf.constant_initializer(value=0.0)
-    >>> inputs = InputLayer(x, name='inputs')
-    >>> conv1 = Conv2d(inputs, 64, (3, 3), act=tf.nn.relu, padding='SAME', W_init=w_init, b_init=b_init, name='conv1_1')
-    >>> conv1 = Conv2d(conv1, 64, (3, 3), act=tf.nn.relu, padding='SAME', W_init=w_init, b_init=b_init, name='conv1_2')
-    >>> pool1 = MaxPool2d(conv1, (2, 2), padding='SAME', name='pool1')
-    >>> conv2 = Conv2d(pool1, 128, (3, 3), act=tf.nn.relu, padding='SAME', W_init=w_init, b_init=b_init, name='conv2_1')
-    >>> conv2 = Conv2d(conv2, 128, (3, 3), act=tf.nn.relu, padding='SAME', W_init=w_init, b_init=b_init, name='conv2_2')
-    >>> pool2 = MaxPool2d(conv2, (2, 2), padding='SAME', name='pool2')
-    """
-    assert len(strides) == 2, "len(strides) should be 2, Conv2d and Conv2dLayer are different."
-    if act is None:
-        act = tf.identity
-
-    try:
-        pre_channel = int(net.outputs.get_shape()[-1])
-    except: # if pre_channel is ?, it happens when using Spatial Transformer Net
-        pre_channel = 1
-        print("[warnings] unknow input channels, set to 1")
-    net = Conv2dLayer(net,
-                       act = act,
-                       shape = [filter_size[0], filter_size[1], pre_channel, n_filter],  # 32 features for each 5x5 patch
-                       strides = [1, strides[0], strides[1], 1],
-                       padding = padding,
-                       W_init = W_init,
-                       W_init_args = W_init_args,
-                       b_init = b_init,
-                       b_init_args = b_init_args,
-                       use_cudnn_on_gpu = use_cudnn_on_gpu,
-                       data_format = data_format,
-                       name = name)
-    return net
-
-def DeConv2d(net, n_out_channel = 32, filter_size=(3, 3),
-        out_size = (30, 30), strides = (2, 2), padding = 'SAME', batch_size = None, act = None,
-        W_init = tf.truncated_normal_initializer(stddev=0.02), b_init = tf.constant_initializer(value=0.0),
-        W_init_args = {}, b_init_args = {}, name ='decnn2d'):
-    """Wrapper for :class:`DeConv2dLayer`, if you don't understand how to use :class:`DeConv2dLayer`, this function may be easier.
-
-    Parameters
-    ----------
-    net : TensorLayer layer.
-    n_out_channel : int, number of output channel.
-    filter_size : tuple of (height, width) for filter size.
-    out_size :  tuple of (height, width) of output.
-    batch_size : int or None, batch_size. If None, try to find the batch_size from the first dim of net.outputs (you should tell the batch_size when define the input placeholder).
-    strides : tuple of (height, width) for strides.
-    act : None or activation function.
-    others : see :class:`DeConv2dLayer`.
-    """
-    assert len(strides) == 2, "len(strides) should be 2, DeConv2d and DeConv2dLayer are different."
-    if act is None:
-        act = tf.identity
-    if batch_size is None:
-    #     batch_size = tf.shape(net.outputs)[0]
-        fixed_batch_size = net.outputs.get_shape().with_rank_at_least(1)[0]
-        if fixed_batch_size.value:
-            batch_size = fixed_batch_size.value
-        else:
-            from tensorflow.python.ops import array_ops
-            batch_size = array_ops.shape(net.outputs)[0]
-    net = DeConv2dLayer(layer = net,
-                    act = act,
-                    shape = [filter_size[0], filter_size[1], n_out_channel, int(net.outputs.get_shape()[-1])],
-                    output_shape = [batch_size, int(out_size[0]), int(out_size[1]), n_out_channel],
-                    strides = [1, strides[0], strides[1], 1],
-                    padding = padding,
-                    W_init = W_init,
-                    b_init = b_init,
-                    W_init_args = W_init_args,
-                    b_init_args = b_init_args,
-                    name = name)
-    return net
-
-def MaxPool1d(net, filter_size, strides, padding='valid', data_format='channels_last', name=None): #Untested
-    """Wrapper for `tf.layers.max_pooling1d <https://www.tensorflow.org/api_docs/python/tf/layers/max_pooling1d>`_ .
-
-    Parameters
-    ------------
-    net : TensorLayer layer, the tensor over which to pool. Must have rank 3.
-    filter_size (pool_size) : An integer or tuple/list of a single integer, representing the size of the pooling window.
-    strides : An integer or tuple/list of a single integer, specifying the strides of the pooling operation.
-    padding : A string. The padding method, either 'valid' or 'same'. Case-insensitive.
-    data_format : A string, one of channels_last (default) or channels_first. The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch, length, channels) while channels_first corresponds to inputs with shape (batch, channels, length).
-    name : A string, the name of the layer.
-
-    Returns
-    --------
-    - A :class:`Layer` which the output tensor, of rank 3.
-    """
-    print("  [TL] MaxPool1d %s: filter_size:%s strides:%s padding:%s" %
-                        (name, str(filter_size), str(strides), str(padding)))
-    outputs = tf.layers.max_pooling1d(net.outputs, filter_size, strides, padding=padding, data_format=data_format, name=name)
-
-    net_new = copy.copy(net)
-    net_new.outputs = outputs
-    net_new.all_layers.extend( [outputs] )
-    return net_new
-
-def MeanPool1d(net, filter_size, strides, padding='valid', data_format='channels_last', name=None): #Untested
-    """Wrapper for `tf.layers.average_pooling1d <https://www.tensorflow.org/api_docs/python/tf/layers/average_pooling1d>`_ .
-
-    Parameters
-    ------------
-    net : TensorLayer layer, the tensor over which to pool. Must have rank 3.
-    filter_size (pool_size) : An integer or tuple/list of a single integer, representing the size of the pooling window.
-    strides : An integer or tuple/list of a single integer, specifying the strides of the pooling operation.
-    padding : A string. The padding method, either 'valid' or 'same'. Case-insensitive.
-    data_format : A string, one of channels_last (default) or channels_first. The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch, length, channels) while channels_first corresponds to inputs with shape (batch, channels, length).
-    name : A string, the name of the layer.
-
-    Returns
-    --------
-    - A :class:`Layer` which the output tensor, of rank 3.
-    """
-    print("  [TL] MeanPool1d %s: filter_size:%s strides:%s padding:%s" %
-                        (name, str(filter_size), str(strides), str(padding)))
-    outputs = tf.layers.average_pooling1d(net.outputs, filter_size, strides, padding=padding, data_format=data_format, name=name)
-
-    net_new = copy.copy(net)
-    net_new.outputs = outputs
-    net_new.all_layers.extend( [outputs] )
-    return net_new
-
-def MaxPool2d(net, filter_size=(2, 2), strides=None, padding='SAME', name='maxpool'):
-    """Wrapper for :class:`PoolLayer`.
-
-    Parameters
-    -----------
-    net : TensorLayer layer.
-    filter_size : tuple of (height, width) for filter size.
-    strides : tuple of (height, width). Default is the same with filter_size.
-    others : see :class:`PoolLayer`.
-    """
-    if strides is None:
-        strides = filter_size
-    assert len(strides) == 2, "len(strides) should be 2, MaxPool2d and PoolLayer are different."
-    net = PoolLayer(net, ksize=[1, filter_size[0], filter_size[1], 1],
-            strides=[1, strides[0], strides[1], 1],
-            padding=padding,
-            pool = tf.nn.max_pool,
-            name = name)
-    return net
-
-def MeanPool2d(net, filter_size=(2, 2), strides=None, padding='SAME', name='meanpool'):
-    """Wrapper for :class:`PoolLayer`.
-
-    Parameters
-    -----------
-    net : TensorLayer layer.
-    filter_size : tuple of (height, width) for filter size.
-    strides : tuple of (height, width). Default is the same with filter_size.
-    others : see :class:`PoolLayer`.
-    """
-    if strides is None:
-        strides = filter_size
-    assert len(strides) == 2, "len(strides) should be 2, MeanPool2d and PoolLayer are different."
-    net = PoolLayer(net, ksize=[1, filter_size[0], filter_size[1], 1],
-            strides=[1, strides[0], strides[1], 1],
-            padding=padding,
-            pool = tf.nn.avg_pool,
-            name = name)
-    return net
-
-def MaxPool3d(net, filter_size, strides, padding='valid', data_format='channels_last', name=None): #Untested
-    """Wrapper for `tf.layers.max_pooling3d <https://www.tensorflow.org/api_docs/python/tf/layers/max_pooling3d>`_ .
-
-    Parameters
-    ------------
-    net : TensorLayer layer, the tensor over which to pool. Must have rank 5.
-    filter_size (pool_size) : An integer or tuple/list of 3 integers: (pool_depth, pool_height, pool_width) specifying the size of the pooling window. Can be a single integer to specify the same value for all spatial dimensions.
-    strides : An integer or tuple/list of 3 integers, specifying the strides of the pooling operation. Can be a single integer to specify the same value for all spatial dimensions.
-    padding : A string. The padding method, either 'valid' or 'same'. Case-insensitive.
-    data_format : A string. The ordering of the dimensions in the inputs. channels_last (default) and channels_first are supported. channels_last corresponds to inputs with shape (batch, depth, height, width, channels) while channels_first corresponds to inputs with shape (batch, channels, depth, height, width).
-    name : A string, the name of the layer.
-    """
-    print("  [TL] MaxPool3d %s: filter_size:%s strides:%s padding:%s" %
-                        (name, str(filter_size), str(strides), str(padding)))
-    outputs = tf.layers.max_pooling3d(net.outputs, filter_size, strides, padding=padding, data_format=data_format, name=name)
-
-    net_new = copy.copy(net)
-    net_new.outputs = outputs
-    net_new.all_layers.extend( [outputs] )
-    return net_new
-
-def MeanPool3d(net, filter_size, strides, padding='valid', data_format='channels_last', name=None): #Untested
-    """Wrapper for `tf.layers.average_pooling3d <https://www.tensorflow.org/api_docs/python/tf/layers/average_pooling3d>`_
-
-    Parameters
-    ------------
-    net : TensorLayer layer, the tensor over which to pool. Must have rank 5.
-    filter_size (pool_size) : An integer or tuple/list of 3 integers: (pool_depth, pool_height, pool_width) specifying the size of the pooling window. Can be a single integer to specify the same value for all spatial dimensions.
-    strides : An integer or tuple/list of 3 integers, specifying the strides of the pooling operation. Can be a single integer to specify the same value for all spatial dimensions.
-    padding : A string. The padding method, either 'valid' or 'same'. Case-insensitive.
-    data_format : A string. The ordering of the dimensions in the inputs. channels_last (default) and channels_first are supported. channels_last corresponds to inputs with shape (batch, depth, height, width, channels) while channels_first corresponds to inputs with shape (batch, channels, depth, height, width).
-    name : A string, the name of the layer.
-    """
-    print("  [TL] MeanPool3d %s: filter_size:%s strides:%s padding:%s" %
-                        (name, str(filter_size), str(strides), str(padding)))
-    outputs = tf.layers.average_pooling3d(net.outputs, filter_size, strides, padding=padding, data_format=data_format, name=name)
-
-    net_new = copy.copy(net)
-    net_new.outputs = outputs
-    net_new.all_layers.extend( [outputs] )
-    return net_new
-
-## Super resolution
-def SubpixelConv2d(net, scale=2, n_out_channel=None, act=tf.identity, name='subpixel_conv2d'):
-    """The :class:`SubpixelConv2d` class is a sub-pixel 2d convolutional ayer, usually be used
-    for Super-Resolution applications, `example code <https://github.com/zsdonghao/SRGAN/>`_.
-
-    Parameters
-    ------------
-    net : TensorLayer layer.
-    scale : int, upscaling ratio, a wrong setting will lead to Dimension size error.
-    n_out_channel : int or None, the number of output channels.
-        Note that, the number of input channels == (scale x scale) x The number of output channels.
-        If None, automatically set n_out_channel == the number of input channels / (scale x scale).
-    act : activation function.
-    name : string.
-        An optional name to attach to this layer.
-
-    Examples
-    ---------
-    >>> # examples here just want to tell you how to set the n_out_channel.
-    >>> x = np.random.rand(2, 16, 16, 4)
-    >>> X = tf.placeholder("float32", shape=(2, 16, 16, 4), name="X")
-    >>> net = InputLayer(X, name='input')
-    >>> net = SubpixelConv2d(net, scale=2, n_out_channel=1, name='subpixel_conv2d')
-    >>> y = sess.run(net.outputs, feed_dict={X: x})
-    >>> print(x.shape, y.shape)
-    ... (2, 16, 16, 4) (2, 32, 32, 1)
-    >>>
-    >>> x = np.random.rand(2, 16, 16, 4*10)
-    >>> X = tf.placeholder("float32", shape=(2, 16, 16, 4*10), name="X")
-    >>> net = InputLayer(X, name='input2')
-    >>> net = SubpixelConv2d(net, scale=2, n_out_channel=10, name='subpixel_conv2d2')
-    >>> y = sess.run(net.outputs, feed_dict={X: x})
-    >>> print(x.shape, y.shape)
-    ... (2, 16, 16, 40) (2, 32, 32, 10)
-    >>>
-    >>> x = np.random.rand(2, 16, 16, 25*10)
-    >>> X = tf.placeholder("float32", shape=(2, 16, 16, 25*10), name="X")
-    >>> net = InputLayer(X, name='input3')
-    >>> net = SubpixelConv2d(net, scale=5, n_out_channel=None, name='subpixel_conv2d3')
-    >>> y = sess.run(net.outputs, feed_dict={X: x})
-    >>> print(x.shape, y.shape)
-    ... (2, 16, 16, 250) (2, 80, 80, 10)
-
-    References
-    ------------
-    - `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network <https://arxiv.org/pdf/1609.05158.pdf>`_
-    """
-    # github/Tetrachrome/subpixel  https://github.com/Tetrachrome/subpixel/blob/master/subpixel.py
-
-    _err_log = "SubpixelConv2d: The number of input channels == (scale x scale) x The number of output channels"
-
-    scope_name = tf.get_variable_scope().name
-    if scope_name:
-        whole_name = scope_name + '/' + name
-    else:
-        whole_name = name
-
-    def _PS(X, r, n_out_channel):
-        if n_out_channel >= 1:
-            assert int(X.get_shape()[-1]) == (r ** 2) * n_out_channel, _err_log
-            bsize, a, b, c = X.get_shape().as_list()
-            bsize = tf.shape(X)[0] # Handling Dimension(None) type for undefined batch dim
-            Xs=tf.split(X,r,3) #b*h*w*r*r
-            Xr=tf.concat(Xs,2) #b*h*(r*w)*r
-            X=tf.reshape(Xr,(bsize,r*a,r*b,n_out_channel)) # b*(r*h)*(r*w)*c
-        else:
-            print(_err_log)
-        return X
-
-    inputs = net.outputs
-
-    if n_out_channel is None:
-        assert int(inputs.get_shape()[-1])/ (scale ** 2) % 1 == 0, _err_log
-        n_out_channel = int(int(inputs.get_shape()[-1])/ (scale ** 2))
-
-    print("  [TL] SubpixelConv2d  %s: scale: %d n_out_channel: %s act: %s" % (name, scale, n_out_channel, act.__name__))
-
-    net_new = Layer(inputs, name=whole_name)
-    # with tf.name_scope(name):
-    with tf.variable_scope(name) as vs:
-        net_new.outputs = act(_PS(inputs, r=scale, n_out_channel=n_out_channel))
-
-    net_new.all_layers = list(net.all_layers)
-    net_new.all_params = list(net.all_params)
-    net_new.all_drop = dict(net.all_drop)
-    net_new.all_layers.extend( [net_new.outputs] )
-    return net_new
-
-def SubpixelConv2d_old(net, scale=2, n_out_channel=None, act=tf.identity, name='subpixel_conv2d'):
-    """The :class:`SubpixelConv2d` class is a sub-pixel 2d convolutional ayer, usually be used
-    for Super-Resolution applications, `example code <https://github.com/zsdonghao/SRGAN/>`_.
-
-    Parameters
-    ------------
-    net : TensorLayer layer.
-    scale : int, upscaling ratio, a wrong setting will lead to Dimension size error.
-    n_out_channel : int or None, the number of output channels.
-        Note that, the number of input channels == (scale x scale) x The number of output channels.
-        If None, automatically set n_out_channel == the number of input channels / (scale x scale).
-    act : activation function.
-    name : string.
-        An optional name to attach to this layer.
-
-    Examples
-    ---------
-    >>> # examples here just want to tell you how to set the n_out_channel.
-    >>> x = np.random.rand(2, 16, 16, 4)
-    >>> X = tf.placeholder("float32", shape=(2, 16, 16, 4), name="X")
-    >>> net = InputLayer(X, name='input')
-    >>> net = SubpixelConv2d(net, scale=2, n_out_channel=1, name='subpixel_conv2d')
-    >>> y = sess.run(net.outputs, feed_dict={X: x})
-    >>> print(x.shape, y.shape)
-    ... (2, 16, 16, 4) (2, 32, 32, 1)
-    >>>
-    >>> x = np.random.rand(2, 16, 16, 4*10)
-    >>> X = tf.placeholder("float32", shape=(2, 16, 16, 4*10), name="X")
-    >>> net = InputLayer(X, name='input2')
-    >>> net = SubpixelConv2d(net, scale=2, n_out_channel=10, name='subpixel_conv2d2')
-    >>> y = sess.run(net.outputs, feed_dict={X: x})
-    >>> print(x.shape, y.shape)
-    ... (2, 16, 16, 40) (2, 32, 32, 10)
-    >>>
-    >>> x = np.random.rand(2, 16, 16, 25*10)
-    >>> X = tf.placeholder("float32", shape=(2, 16, 16, 25*10), name="X")
-    >>> net = InputLayer(X, name='input3')
-    >>> net = SubpixelConv2d(net, scale=5, n_out_channel=None, name='subpixel_conv2d3')
-    >>> y = sess.run(net.outputs, feed_dict={X: x})
-    >>> print(x.shape, y.shape)
-    ... (2, 16, 16, 250) (2, 80, 80, 10)
-
-    References
-    ------------
-    - `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network <https://arxiv.org/pdf/1609.05158.pdf>`_
-    """
-    # github/Tetrachrome/subpixel  https://github.com/Tetrachrome/subpixel/blob/master/subpixel.py
-
-    _err_log = "SubpixelConv2d: The number of input channels == (scale x scale) x The number of output channels"
-
-    scope_name = tf.get_variable_scope().name
-    if scope_name:
-        name = scope_name + '/' + name
-
-    def _phase_shift(I, r):
-        if tf.__version__ < '1.0':
-            raise Exception("Only support TF1.0+")
-        bsize, a, b, c = I.get_shape().as_list()
-        bsize = tf.shape(I)[0] # Handling Dimension(None) type for undefined batch dim
-        X = tf.reshape(I, (bsize, a, b, r, r))
-        X = tf.transpose(X, (0, 1, 2, 4, 3))  # bsize, a, b, 1, 1 # tf 0.12
-        # X = tf.split(1, a, X)  # a, [bsize, b, r, r] # tf 0.12
-        X = tf.split(X, a, 1)
-        # X = tf.concat(2, [tf.squeeze(x, axis=1) for x in X])  # bsize, b, a*r, r # tf 0.12
-        X = tf.concat([tf.squeeze(x, axis=1) for x in X], 2)
-        # X = tf.split(1, b, X)  # b, [bsize, a*r, r] # tf 0.12
-        X = tf.split(X, b, 1)
-        # X = tf.concat(2, [tf.squeeze(x, axis=1) for x in X])  # bsize, a*r, b*r # tf 0.12
-        X = tf.concat([tf.squeeze(x, axis=1) for x in X], 2)
-        return tf.reshape(X, (bsize, a*r, b*r, 1))
-
-    def _PS(X, r, n_out_channel):
-        if n_out_channel > 1:
-            assert int(X.get_shape()[-1]) == (r ** 2) * n_out_channel, _err_log
-            Xc = tf.split(X, n_out_channel, 3)
-            X = tf.concat([_phase_shift(x, r) for x in Xc], 3)
-        elif n_out_channel == 1:
-            assert int(X.get_shape()[-1]) == (r ** 2), _err_log
-            X = _phase_shift(X, r)
-        else:
-            print(_err_log)
-        return X
-
-    inputs = net.outputs
-
-    if n_out_channel is None:
-        assert int(inputs.get_shape()[-1])/ (scale ** 2) % 1 == 0, _err_log
-        n_out_channel = int(int(inputs.get_shape()[-1])/ (scale ** 2))
-
-    print("  [TL] SubpixelConv2d  %s: scale: %d n_out_channel: %s act: %s" % (name, scale, n_out_channel, act.__name__))
-
-    net_new = Layer(inputs, name=name)
-    # with tf.name_scope(name):
-    with tf.variable_scope(name) as vs:
-        net_new.outputs = act(_PS(inputs, r=scale, n_out_channel=n_out_channel))
-
-    net_new.all_layers = list(net.all_layers)
-    net_new.all_params = list(net.all_params)
-    net_new.all_drop = dict(net.all_drop)
-    net_new.all_layers.extend( [net_new.outputs] )
-    return net_new
-
-
-def SubpixelConv1d(net, scale=2, act=tf.identity, name='subpixel_conv1d'):
-    """One-dimensional subpixel upsampling layer.
-    Calls a tensorflow function that directly implements this functionality.
-    We assume input has dim (batch, width, r)
-
-    Parameters
-    ------------
-    net : TensorLayer layer.
-    scale : int, upscaling ratio, a wrong setting will lead to Dimension size error.
-    act : activation function.
-    name : string.
-        An optional name to attach to this layer.
-
-    Examples
-    ----------
-    >>> t_signal = tf.placeholder('float32', [10, 100, 4], name='x')
-    >>> n = InputLayer(t_signal, name='in')
-    >>> n = SubpixelConv1d(n, scale=2, name='s')
-    >>> print(n.outputs.shape)
-    ... (10, 200, 2)
-
-    References
-    -----------
-    - `Audio Super Resolution Implementation <https://github.com/kuleshov/audio-super-res/blob/master/src/models/layers/subpixel.py>`_.
-    """
-    def _PS(I, r):
-        X = tf.transpose(I, [2,1,0]) # (r, w, b)
-        X = tf.batch_to_space_nd(X, [r], [[0,0]]) # (1, r*w, b)
-        X = tf.transpose(X, [2,1,0])
-        return X
-
-    print("  [TL] SubpixelConv1d  %s: scale: %d act: %s" % (name, scale, act.__name__))
-
-    inputs = net.outputs
-    net_new = Layer(inputs, name=name)
-    with tf.name_scope(name):
-        net_new.outputs = act(_PS(inputs, r=scale))
-
-    net_new.all_layers = list(net.all_layers)
-    net_new.all_params = list(net.all_params)
-    net_new.all_drop = dict(net.all_drop)
-    net_new.all_layers.extend( [net_new.outputs] )
-    return net_new
-
-## Spatial Transformer Nets
-def transformer(U, theta, out_size, name='SpatialTransformer2dAffine', **kwargs):
-    """Spatial Transformer Layer for `2D Affine Transformation <https://en.wikipedia.org/wiki/Affine_transformation>`_
-    , see :class:`SpatialTransformer2dAffineLayer` class.
-
-    Parameters
-    ----------
-    U : float
-        The output of a convolutional net should have the
-        shape [num_batch, height, width, num_channels].
-    theta: float
-        The output of the localisation network should be [num_batch, 6], value range should be [0, 1] (via tanh).
-    out_size: tuple of two ints
-        The size of the output of the network (height, width)
-
-    References
-    ----------
-    - `Spatial Transformer Networks <https://arxiv.org/abs/1506.02025>`_
-    - `TensorFlow/Models <https://github.com/tensorflow/models/tree/master/transformer>`_
-
-    Notes
-    -----
-    - To initialize the network to the identity transform init.
-    >>> ``theta`` to
-    >>> identity = np.array([[1., 0., 0.],
-    ...                      [0., 1., 0.]])
-    >>> identity = identity.flatten()
-    >>> theta = tf.Variable(initial_value=identity)
-    """
-
-    def _repeat(x, n_repeats):
-        with tf.variable_scope('_repeat'):
-            rep = tf.transpose(
-                tf.expand_dims(tf.ones(shape=tf.stack([n_repeats, ])), 1), [1, 0])
-            rep = tf.cast(rep, 'int32')
-            x = tf.matmul(tf.reshape(x, (-1, 1)), rep)
-            return tf.reshape(x, [-1])
-
-    def _interpolate(im, x, y, out_size):
-        with tf.variable_scope('_interpolate'):
-            # constants
-            num_batch = tf.shape(im)[0]
-            height = tf.shape(im)[1]
-            width = tf.shape(im)[2]
-            channels = tf.shape(im)[3]
-
-            x = tf.cast(x, 'float32')
-            y = tf.cast(y, 'float32')
-            height_f = tf.cast(height, 'float32')
-            width_f = tf.cast(width, 'float32')
-            out_height = out_size[0]
-            out_width = out_size[1]
-            zero = tf.zeros([], dtype='int32')
-            max_y = tf.cast(tf.shape(im)[1] - 1, 'int32')
-            max_x = tf.cast(tf.shape(im)[2] - 1, 'int32')
-
-            # scale indices from [-1, 1] to [0, width/height]
-            x = (x + 1.0)*(width_f) / 2.0
-            y = (y + 1.0)*(height_f) / 2.0
-
-            # do sampling
-            x0 = tf.cast(tf.floor(x), 'int32')
-            x1 = x0 + 1
-            y0 = tf.cast(tf.floor(y), 'int32')
-            y1 = y0 + 1
-
-            x0 = tf.clip_by_value(x0, zero, max_x)
-            x1 = tf.clip_by_value(x1, zero, max_x)
-            y0 = tf.clip_by_value(y0, zero, max_y)
-            y1 = tf.clip_by_value(y1, zero, max_y)
-            dim2 = width
-            dim1 = width*height
-            base = _repeat(tf.range(num_batch)*dim1, out_height*out_width)
-            base_y0 = base + y0*dim2
-            base_y1 = base + y1*dim2
-            idx_a = base_y0 + x0
-            idx_b = base_y1 + x0
-            idx_c = base_y0 + x1
-            idx_d = base_y1 + x1
-
-            # use indices to lookup pixels in the flat image and restore
-            # channels dim
-            im_flat = tf.reshape(im, tf.stack([-1, channels]))
-            im_flat = tf.cast(im_flat, 'float32')
-            Ia = tf.gather(im_flat, idx_a)
-            Ib = tf.gather(im_flat, idx_b)
-            Ic = tf.gather(im_flat, idx_c)
-            Id = tf.gather(im_flat, idx_d)
-
-            # and finally calculate interpolated values
-            x0_f = tf.cast(x0, 'float32')
-            x1_f = tf.cast(x1, 'float32')
-            y0_f = tf.cast(y0, 'float32')
-            y1_f = tf.cast(y1, 'float32')
-            wa = tf.expand_dims(((x1_f-x) * (y1_f-y)), 1)
-            wb = tf.expand_dims(((x1_f-x) * (y-y0_f)), 1)
-            wc = tf.expand_dims(((x-x0_f) * (y1_f-y)), 1)
-            wd = tf.expand_dims(((x-x0_f) * (y-y0_f)), 1)
-            output = tf.add_n([wa*Ia, wb*Ib, wc*Ic, wd*Id])
-            return output
-
-    def _meshgrid(height, width):
-        with tf.variable_scope('_meshgrid'):
-            # This should be equivalent to:
-            #  x_t, y_t = np.meshgrid(np.linspace(-1, 1, width),
-            #                         np.linspace(-1, 1, height))
-            #  ones = np.ones(np.prod(x_t.shape))
-            #  grid = np.vstack([x_t.flatten(), y_t.flatten(), ones])
-            x_t = tf.matmul(tf.ones(shape=tf.stack([height, 1])),
-                            tf.transpose(tf.expand_dims(tf.linspace(-1.0, 1.0, width), 1), [1, 0]))
-            y_t = tf.matmul(tf.expand_dims(tf.linspace(-1.0, 1.0, height), 1),
-                            tf.ones(shape=tf.stack([1, width])))
-
-            x_t_flat = tf.reshape(x_t, (1, -1))
-            y_t_flat = tf.reshape(y_t, (1, -1))
-
-            ones = tf.ones_like(x_t_flat)
-            grid = tf.concat(axis=0, values=[x_t_flat, y_t_flat, ones])
-            return grid
-
-    def _transform(theta, input_dim, out_size):
-        with tf.variable_scope('_transform'):
-            num_batch = tf.shape(input_dim)[0]
-            height = tf.shape(input_dim)[1]
-            width = tf.shape(input_dim)[2]
-            num_channels = tf.shape(input_dim)[3]
-            theta = tf.reshape(theta, (-1, 2, 3))
-            theta = tf.cast(theta, 'float32')
-
-            # grid of (x_t, y_t, 1), eq (1) in ref [1]
-            height_f = tf.cast(height, 'float32')
-            width_f = tf.cast(width, 'float32')
-            out_height = out_size[0]
-            out_width = out_size[1]
-            grid = _meshgrid(out_height, out_width)
-            grid = tf.expand_dims(grid, 0)
-            grid = tf.reshape(grid, [-1])
-            grid = tf.tile(grid, tf.stack([num_batch]))
-            grid = tf.reshape(grid, tf.stack([num_batch, 3, -1]))
-
-            # Transform A x (x_t, y_t, 1)^T -> (x_s, y_s)
-            T_g = tf.matmul(theta, grid)
-            x_s = tf.slice(T_g, [0, 0, 0], [-1, 1, -1])
-            y_s = tf.slice(T_g, [0, 1, 0], [-1, 1, -1])
-            x_s_flat = tf.reshape(x_s, [-1])
-            y_s_flat = tf.reshape(y_s, [-1])
-
-            input_transformed = _interpolate(
-                input_dim, x_s_flat, y_s_flat,
-                out_size)
-
-            output = tf.reshape(
-                input_transformed, tf.stack([num_batch, out_height, out_width, num_channels]))
-            return output
-
-    with tf.variable_scope(name):
-        output = _transform(theta, U, out_size)
-        return output
-
-def batch_transformer(U, thetas, out_size, name='BatchSpatialTransformer2dAffine'):
-    """Batch Spatial Transformer function for `2D Affine Transformation <https://en.wikipedia.org/wiki/Affine_transformation>`_.
-
-    Parameters
-    ----------
-    U : float
-        tensor of inputs [batch, height, width, num_channels]
-    thetas : float
-        a set of transformations for each input [batch, num_transforms, 6]
-    out_size : int
-        the size of the output [out_height, out_width]
-    Returns: float
-        Tensor of size [batch * num_transforms, out_height, out_width, num_channels]
-    """
-    with tf.variable_scope(name):
-        num_batch, num_transforms = map(int, thetas.get_shape().as_list()[:2])
-        indices = [[i]*num_transforms for i in xrange(num_batch)]
-        input_repeated = tf.gather(U, tf.reshape(indices, [-1]))
-        return transformer(input_repeated, thetas, out_size)
-
-class SpatialTransformer2dAffineLayer(Layer):
-    """The :class:`SpatialTransformer2dAffineLayer` class is a
-    `Spatial Transformer Layer <https://arxiv.org/abs/1506.02025>`_ for
-    `2D Affine Transformation <https://en.wikipedia.org/wiki/Affine_transformation>`_.
-
-    Parameters
-    -----------
-    layer : a layer class with 4-D Tensor of shape [batch, height, width, channels]
-    theta_layer : a layer class for the localisation network.
-        In this layer, we will use a :class:`DenseLayer` to make the theta size to [batch, 6], value range to [0, 1] (via tanh).
-    out_size : tuple of two ints.
-        The size of the output of the network (height, width), the feature maps will be resized by this.
-
-    References
-    -----------
-    - `Spatial Transformer Networks <https://arxiv.org/abs/1506.02025>`_
-    - `TensorFlow/Models <https://github.com/tensorflow/models/tree/master/transformer>`_
-    """
-    def __init__(
-        self,
-        layer = None,
-        theta_layer = None,
-        out_size = [40, 40],
-        name ='sapatial_trans_2d_affine',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-        self.theta_layer = theta_layer
-        print("  [TL] SpatialTransformer2dAffineLayer %s: in_size:%s out_size:%s" %
-                                (name, self.inputs.get_shape().as_list(), out_size))
-
-        with tf.variable_scope(name) as vs:
-            ## 1. make the localisation network to [batch, 6] via Flatten and Dense.
-            if self.theta_layer.outputs.get_shape().ndims > 2:
-                 self.theta_layer.outputs = flatten_reshape(self.theta_layer.outputs, 'flatten')
-            ## 2. To initialize the network to the identity transform init.
-            # 2.1 W
-            n_in = int(self.theta_layer.outputs.get_shape()[-1])
-            shape = (n_in, 6)
-            W = tf.get_variable(name='W', initializer=tf.zeros(shape))
-            # 2.2 b
-            identity = tf.constant(np.array([[1., 0, 0], [0, 1., 0]]).astype('float32').flatten())
-            b = tf.get_variable(name='b', initializer=identity)
-            # 2.3 transformation matrix
-            self.theta = tf.nn.tanh(tf.matmul(self.theta_layer.outputs, W) + b)
-            ## 3. Spatial Transformer Sampling
-            # 3.1 transformation
-            self.outputs = transformer(self.inputs, self.theta, out_size=out_size)
-            # 3.2 automatically set batch_size and channels
-            # e.g. [?, 40, 40, ?] --> [64, 40, 40, 1] or [64, 20, 20, 4]/ Hao Dong
-            #
-            fixed_batch_size = self.inputs.get_shape().with_rank_at_least(1)[0]
-            if fixed_batch_size.value:
-                batch_size = fixed_batch_size.value
-            else:
-                from tensorflow.python.ops import array_ops
-                batch_size = array_ops.shape(self.inputs)[0]
-            size = self.inputs.get_shape().as_list()
-            n_channels = self.inputs.get_shape().as_list()[-1]
-            # print(self.outputs)
-            self.outputs = tf.reshape(self.outputs, shape=[batch_size, out_size[0], out_size[1], n_channels])
-            # print(self.outputs)
-            # exit()
-            ## 4. Get all parameters
-            variables = tf.get_collection(TF_GRAPHKEYS_VARIABLES, scope=vs.name)
-
-        ## fixed
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-
-        ## theta_layer
-        self.all_layers.extend(theta_layer.all_layers)
-        self.all_params.extend(theta_layer.all_params)
-        self.all_drop.update(theta_layer.all_drop)
-
-        ## this layer
-        self.all_layers.extend( [self.outputs] )
-        self.all_params.extend( variables )
-
-
-# ## Normalization layer
-class LocalResponseNormLayer(Layer):
-    """The :class:`LocalResponseNormLayer` class is for Local Response Normalization, see ``tf.nn.local_response_normalization`` or ``tf.nn.lrn`` for new TF version.
-    The 4-D input tensor is treated as a 3-D array of 1-D vectors (along the last dimension), and each vector is normalized independently.
-    Within a given vector, each component is divided by the weighted, squared sum of inputs within depth_radius.
-
-    Parameters
-    -----------
-    layer : a layer class. Must be one of the following types: float32, half. 4-D.
-    depth_radius : An optional int. Defaults to 5. 0-D. Half-width of the 1-D normalization window.
-    bias : An optional float. Defaults to 1. An offset (usually positive to avoid dividing by 0).
-    alpha : An optional float. Defaults to 1. A scale factor, usually positive.
-    beta : An optional float. Defaults to 0.5. An exponent.
-    name : A string or None, an optional name to attach to this layer.
-    """
-    def __init__(
-        self,
-        layer = None,
-        depth_radius = None,
-        bias = None,
-        alpha = None,
-        beta = None,
-        name ='lrn_layer',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-        print("  [TL] LocalResponseNormLayer %s: depth_radius: %d, bias: %f, alpha: %f, beta: %f" %
-                            (self.name, depth_radius, bias, alpha, beta))
-        with tf.variable_scope(name) as vs:
-            self.outputs = tf.nn.lrn(self.inputs, depth_radius=depth_radius, bias=bias, alpha=alpha, beta=beta)
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-
-class BatchNormLayer(Layer):
-    """
-    The :class:`BatchNormLayer` class is a normalization layer, see ``tf.nn.batch_normalization`` and ``tf.nn.moments``.
-
-    Batch normalization on fully-connected or convolutional maps.
-
-    Parameters
-    -----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    decay : float, default is 0.9.
-        A decay factor for ExponentialMovingAverage, use larger value for large dataset.
-    epsilon : float
-        A small float number to avoid dividing by 0.
-    act : activation function.
-    is_train : boolean
-        Whether train or inference.
-    beta_init : beta initializer
-        The initializer for initializing beta
-    gamma_init : gamma initializer
-        The initializer for initializing gamma
-    name : a string or None
-        An optional name to attach to this layer.
-
-    References
-    ----------
-    - `Source <https://github.com/ry/tensorflow-resnet/blob/master/resnet.py>`_
-    - `stackoverflow <http://stackoverflow.com/questions/38312668/how-does-one-do-inference-with-batch-normalization-with-tensor-flow>`_
-    """
-    def __init__(
-        self,
-        layer = None,
-        decay = 0.9,
-        epsilon = 0.00001,
-        act = tf.identity,
-        is_train = False,
-        beta_init = tf.zeros_initializer,
-        gamma_init = tf.random_normal_initializer(mean=1.0, stddev=0.002), # tf.ones_initializer,
-        name ='batchnorm_layer',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-        print("  [TL] BatchNormLayer %s: decay:%f epsilon:%f act:%s is_train:%s" %
-                            (self.name, decay, epsilon, act.__name__, is_train))
-        x_shape = self.inputs.get_shape()
-        params_shape = x_shape[-1:]
-
-        from tensorflow.python.training import moving_averages
-        from tensorflow.python.ops import control_flow_ops
-
-        with tf.variable_scope(name) as vs:
-            axis = list(range(len(x_shape) - 1))
-
-            ## 1. beta, gamma
-            if tf.__version__ > '0.12.1' and beta_init == tf.zeros_initializer:
-                beta_init = beta_init()
-            beta = tf.get_variable('beta', shape=params_shape,
-                               initializer=beta_init,
-                               trainable=is_train)#, restore=restore)
-
-            gamma = tf.get_variable('gamma', shape=params_shape,
-                                initializer=gamma_init, trainable=is_train,
-                                )#restore=restore)
-
-            ## 2.
-            if tf.__version__ > '0.12.1':
-                moving_mean_init = tf.zeros_initializer()
-            else:
-                moving_mean_init = tf.zeros_initializer
-            moving_mean = tf.get_variable('moving_mean',
-                                      params_shape,
-                                      initializer=moving_mean_init,
-                                      trainable=False,)#   restore=restore)
-            moving_variance = tf.get_variable('moving_variance',
-                                          params_shape,
-                                          initializer=tf.constant_initializer(1.),
-                                          trainable=False,)#   restore=restore)
-
-            ## 3.
-            # These ops will only be preformed when training.
-            mean, variance = tf.nn.moments(self.inputs, axis)
-            try:    # TF12
-                update_moving_mean = moving_averages.assign_moving_average(
-                                moving_mean, mean, decay, zero_debias=False)     # if zero_debias=True, has bias
-                update_moving_variance = moving_averages.assign_moving_average(
-                                moving_variance, variance, decay, zero_debias=False) # if zero_debias=True, has bias
-                # print("TF12 moving")
-            except Exception as e:  # TF11
-                update_moving_mean = moving_averages.assign_moving_average(
-                                moving_mean, mean, decay)
-                update_moving_variance = moving_averages.assign_moving_average(
-                                moving_variance, variance, decay)
-                # print("TF11 moving")
-
-            def mean_var_with_update():
-                with tf.control_dependencies([update_moving_mean, update_moving_variance]):
-                    return tf.identity(mean), tf.identity(variance)
-
-            if is_train:
-                mean, var = mean_var_with_update()
-                self.outputs = act( tf.nn.batch_normalization(self.inputs, mean, var, beta, gamma, epsilon) )
-            else:
-                self.outputs = act( tf.nn.batch_normalization(self.inputs, moving_mean, moving_variance, beta, gamma, epsilon) )
-
-            variables = [beta, gamma, moving_mean, moving_variance]
-
-            # print(len(variables))
-            # for idx, v in enumerate(variables):
-            #     print("  var {:3}: {:15}   {}".format(idx, str(v.get_shape()), v))
-            # exit()
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-        self.all_params.extend( variables )
-
-# class BatchNormLayer_TF(Layer):   # Work well TF contrib https://github.com/tensorflow/tensorflow/blob/b826b79718e3e93148c3545e7aa3f90891744cc0/tensorflow/contrib/layers/python/layers/layers.py#L100
-#     """
-#     The :class:`BatchNormLayer` class is a normalization layer, see ``tf.nn.batch_normalization`` and ``tf.nn.moments``.
-#
-#     Batch normalization on fully-connected or convolutional maps.
-#
-#     Parameters
-#     -----------
-#     layer : a :class:`Layer` instance
-#         The `Layer` class feeding into this layer.
-#     decay : float
-#         A decay factor for ExponentialMovingAverage.
-#     center: If True, subtract `beta`. If False, `beta` is ignored.
-#     scale: If True, multiply by `gamma`. If False, `gamma` is
-#         not used. When the next layer is linear (also e.g. `nn.relu`), this can be
-#         disabled since the scaling can be done by the next layer.
-#     epsilon : float
-#         A small float number to avoid dividing by 0.
-#     act : activation function.
-#     is_train : boolean
-#         Whether train or inference.
-#     beta_init : beta initializer
-#         The initializer for initializing beta
-#     gamma_init : gamma initializer
-#         The initializer for initializing gamma
-#     name : a string or None
-#         An optional name to attach to this layer.
-#
-#     References
-#     ----------
-#     - `Source <https://github.com/ry/tensorflow-resnet/blob/master/resnet.py>`_
-#     - `stackoverflow <http://stackoverflow.com/questions/38312668/how-does-one-do-inference-with-batch-normalization-with-tensor-flow>`_
-#     """
-#     def __init__(
-#         self,
-#         layer = None,
-#         decay = 0.95,#.999,
-#         center = True,
-#         scale = True,
-#         epsilon = 0.00001,
-#         act = tf.identity,
-#         is_train = False,
-#         beta_init = tf.zeros_initializer,
-#         # gamma_init = tf.ones_initializer,
-#         gamma_init = tf.random_normal_initializer(mean=1.0, stddev=0.002),
-#         name ='batchnorm_layer',
-#     ):
-#         Layer.__init__(self, name=name)
-#         self.inputs = layer.outputs
-#         print("  [TL] BatchNormLayer %s: decay: %f, epsilon: %f, act: %s, is_train: %s" %
-#                             (self.name, decay, epsilon, act.__name__, is_train))
-#         from tensorflow.contrib.layers.python.layers import utils
-#         from tensorflow.contrib.framework.python.ops import variables
-#         from tensorflow.python.ops import init_ops
-#         from tensorflow.python.ops import nn
-#         from tensorflow.python.training import moving_averages
-#         from tensorflow.python.framework import ops
-#         from tensorflow.python.ops import variable_scope
-#         variables_collections = None
-#         outputs_collections=None
-#         updates_collections=None#ops.GraphKeys.UPDATE_OPS
-#         # with variable_scope.variable_op_scope([inputs],
-#         #                                     scope, 'BatchNorm', reuse=reuse) as sc:
-#         # with variable_scope.variable_op_scope([self.inputs], None, name) as vs:
-#         with tf.variable_scope(name) as vs:
-#             inputs_shape = self.inputs.get_shape()
-#             dtype = self.inputs.dtype.base_dtype
-#             axis = list(range(len(inputs_shape) - 1)) # [0, 1, 2]
-#             params_shape = inputs_shape[-1:]
-#             # Allocate parameters for the beta and gamma of the normalization.
-#             beta, gamma = None, None
-#             if center:
-#               beta_collections = utils.get_variable_collections(variables_collections,
-#                                                                 'beta')
-#               beta = variables.model_variable('beta',
-#                                               shape=params_shape,
-#                                               dtype=dtype,
-#                                             #   initializer=init_ops.zeros_initializer,
-#                                               initializer=beta_init,
-#                                               collections=beta_collections,)
-#                                             #   trainable=trainable)
-#             if scale:
-#               gamma_collections = utils.get_variable_collections(variables_collections,
-#                                                                  'gamma')
-#               gamma = variables.model_variable('gamma',
-#                                                shape=params_shape,
-#                                                dtype=dtype,
-#                                             #    initializer=init_ops.ones_initializer,
-#                                                initializer=gamma_init,
-#                                                collections=gamma_collections,)
-#                                             #    trainable=trainable)
-#             # Create moving_mean and moving_variance variables and add them to the
-#             # appropiate collections.
-#             moving_mean_collections = utils.get_variable_collections(
-#                 variables_collections,
-#                 'moving_mean')
-#             moving_mean = variables.model_variable(
-#                 'moving_mean',
-#                 shape=params_shape,
-#                 dtype=dtype,
-#                 # initializer=init_ops.zeros_initializer,
-#                 initializer=tf.zeros_initializer,
-#                 trainable=False,
-#                 collections=moving_mean_collections)
-#             moving_variance_collections = utils.get_variable_collections(
-#                 variables_collections,
-#                 'moving_variance')
-#             moving_variance = variables.model_variable(
-#                 'moving_variance',
-#                 shape=params_shape,
-#                 dtype=dtype,
-#                 # initializer=init_ops.ones_initializer,
-#                 initializer=tf.constant_initializer(1.),
-#                 trainable=False,
-#                 collections=moving_variance_collections)
-#             if is_train:
-#               # Calculate the moments based on the individual batch.
-#               mean, variance = nn.moments(self.inputs, axis, shift=moving_mean)
-#               # Update the moving_mean and moving_variance moments.
-#             #   update_moving_mean = moving_averages.assign_moving_average(
-#             #       moving_mean, mean, decay)
-#             #   update_moving_variance = moving_averages.assign_moving_average(
-#             #       moving_variance, variance, decay)
-#             #   if updates_collections is None:
-#             #     # Make sure the updates are computed here.
-#             #       with ops.control_dependencies([update_moving_mean,
-#             #                                        update_moving_variance]):
-#             #          outputs = nn.batch_normalization(
-#             #               self.inputs, mean, variance, beta, gamma, epsilon)
-#
-#               update_moving_mean = tf.assign(moving_mean,
-#                                    moving_mean * decay + mean * (1 - decay))
-#               update_moving_variance = tf.assign(moving_variance,
-#                                   moving_variance * decay + variance * (1 - decay))
-#               with tf.control_dependencies([update_moving_mean, update_moving_variance]):
-#                   outputs = nn.batch_normalization(
-#                               self.inputs, mean, variance, beta, gamma, epsilon)
-#             #   else:
-#             #     # Collect the updates to be computed later.
-#             #     ops.add_to_collections(updates_collections, update_moving_mean)
-#             #     ops.add_to_collections(updates_collections, update_moving_variance)
-#             #     outputs = nn.batch_normalization(
-#             #         self.inputs, mean, variance, beta, gamma, epsilon)
-#             else:
-#             #   mean, variance = nn.moments(self.inputs, axis, shift=moving_mean)
-#               outputs = nn.batch_normalization(
-#                   self.inputs, moving_mean, moving_variance, beta, gamma, epsilon)
-#                 # self.inputs, mean, variance, beta, gamma, epsilon)
-#             outputs.set_shape(self.inputs.get_shape())
-#             # if activation_fn:
-#             self.outputs = act(outputs)
-#
-#             # variables = tf.get_collection(TF_GRAPHKEYS_VARIABLES, scope=vs.name)
-#             # return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
-#             variables = [beta, gamma, moving_mean, moving_variance]
-#
-#         mean, variance = nn.moments(self.inputs, axis, shift=moving_mean)
-#         self.check_mean = mean
-#         self.check_variance = variance
-#
-#         self.all_layers = list(layer.all_layers)
-#         self.all_params = list(layer.all_params)
-#         self.all_drop = dict(layer.all_drop)
-#         self.all_layers.extend( [self.outputs] )
-#         self.all_params.extend( variables )
-#
-# class BatchNormLayer5(Layer):   # Akara Work well
-#     """
-#     The :class:`BatchNormLayer` class is a normalization layer, see ``tf.nn.batch_normalization`` and ``tf.nn.moments``.
-#
-#     Batch normalization on fully-connected or convolutional maps.
-#
-#     Parameters
-#     -----------
-#     layer : a :class:`Layer` instance
-#         The `Layer` class feeding into this layer.
-#     decay : float
-#         A decay factor for ExponentialMovingAverage.
-#     epsilon : float
-#         A small float number to avoid dividing by 0.
-#     act : activation function.
-#     is_train : boolean
-#         Whether train or inference.
-#     beta_init : beta initializer
-#         The initializer for initializing beta
-#     gamma_init : gamma initializer
-#         The initializer for initializing gamma
-#     name : a string or None
-#         An optional name to attach to this layer.
-#
-#     References
-#     ----------
-#     - `Source <https://github.com/ry/tensorflow-resnet/blob/master/resnet.py>`_
-#     - `stackoverflow <http://stackoverflow.com/questions/38312668/how-does-one-do-inference-with-batch-normalization-with-tensor-flow>`_
-#     """
-#     def __init__(
-#         self,
-#         layer = None,
-#         decay = 0.9,
-#         epsilon = 0.00001,
-#         act = tf.identity,
-#         is_train = False,
-#         beta_init = tf.zeros_initializer,
-#         # gamma_init = tf.ones_initializer,
-#         gamma_init = tf.random_normal_initializer(mean=1.0, stddev=0.002),
-#         name ='batchnorm_layer',
-#     ):
-#         Layer.__init__(self, name=name)
-#         self.inputs = layer.outputs
-#         print("  [TL] BatchNormLayer %s: decay: %f, epsilon: %f, act: %s, is_train: %s" %
-#                             (self.name, decay, epsilon, act.__name__, is_train))
-#         x_shape = self.inputs.get_shape()
-#         params_shape = x_shape[-1:]
-#
-#         from tensorflow.python.training import moving_averages
-#         from tensorflow.python.ops import control_flow_ops
-#
-#         with tf.variable_scope(name) as vs:
-#             axis = list(range(len(x_shape) - 1))
-#
-#             ## 1. beta, gamma
-#             beta = tf.get_variable('beta', shape=params_shape,
-#                                initializer=beta_init,
-#                                trainable=is_train)#, restore=restore)
-#
-#             gamma = tf.get_variable('gamma', shape=params_shape,
-#                                 initializer=gamma_init, trainable=is_train,
-#                                 )#restore=restore)
-#
-#             ## 2. moving variables during training (not update by gradient!)
-#             moving_mean = tf.get_variable('moving_mean',
-#                                       params_shape,
-#                                       initializer=tf.zeros_initializer,
-#                                       trainable=False,)#   restore=restore)
-#             moving_variance = tf.get_variable('moving_variance',
-#                                           params_shape,
-#                                           initializer=tf.constant_initializer(1.),
-#                                           trainable=False,)#   restore=restore)
-#
-#             batch_mean, batch_var = tf.nn.moments(self.inputs, axis)
-#             ## 3.
-#             # These ops will only be preformed when training.
-#             def mean_var_with_update():
-#                 try:    # TF12
-#                     update_moving_mean = moving_averages.assign_moving_average(
-#                                     moving_mean, batch_mean, decay, zero_debias=False)     # if zero_debias=True, has bias
-#                     update_moving_variance = moving_averages.assign_moving_average(
-#                                     moving_variance, batch_var, decay, zero_debias=False) # if zero_debias=True, has bias
-#                     # print("TF12 moving")
-#                 except Exception as e:  # TF11
-#                     update_moving_mean = moving_averages.assign_moving_average(
-#                                     moving_mean, batch_mean, decay)
-#                     update_moving_variance = moving_averages.assign_moving_average(
-#                                     moving_variance, batch_var, decay)
-#                     # print("TF11 moving")
-#
-#             # def mean_var_with_update():
-#                 with tf.control_dependencies([update_moving_mean, update_moving_variance]):
-#                     # return tf.identity(update_moving_mean), tf.identity(update_moving_variance)
-#                     return tf.identity(batch_mean), tf.identity(batch_var)
-#
-#             # if not is_train:
-#             if is_train:
-#                 mean, var = mean_var_with_update()
-#             else:
-#                 mean, var = (moving_mean, moving_variance)
-#
-#             normed = tf.nn.batch_normalization(
-#               x=self.inputs,
-#               mean=mean,
-#               variance=var,
-#               offset=beta,
-#               scale=gamma,
-#               variance_epsilon=epsilon,
-#               name="tf_bn"
-#             )
-#             self.outputs = act( normed )
-#
-#             variables = [beta, gamma, moving_mean, moving_variance]
-#             # print(len(variables))
-#             # for idx, v in enumerate(variables):
-#             #     print("  var {:3}: {:15}   {}".format(idx, str(v.get_shape()), v))
-#             # exit()
-#
-#         self.all_layers = list(layer.all_layers)
-#         self.all_params = list(layer.all_params)
-#         self.all_drop = dict(layer.all_drop)
-#         self.all_layers.extend( [self.outputs] )
-#         self.all_params.extend( variables )
-#         # self.all_params.extend( [beta, gamma] )
-#
-# class BatchNormLayer4(Layer): # work TFlearn https://github.com/tflearn/tflearn/blob/master/tflearn/layers/normalization.py
-#     """
-#     The :class:`BatchNormLayer` class is a normalization layer, see ``tf.nn.batch_normalization`` and ``tf.nn.moments``.
-#
-#     Batch normalization on fully-connected or convolutional maps.
-#
-#     Parameters
-#     -----------
-#     layer : a :class:`Layer` instance
-#         The `Layer` class feeding into this layer.
-#     decay : float
-#         A decay factor for ExponentialMovingAverage.
-#     epsilon : float
-#         A small float number to avoid dividing by 0.
-#     act : activation function.
-#     is_train : boolean
-#         Whether train or inference.
-#     beta_init : beta initializer
-#         The initializer for initializing beta
-#     gamma_init : gamma initializer
-#         The initializer for initializing gamma
-#     name : a string or None
-#         An optional name to attach to this layer.
-#
-#     References
-#     ----------
-#     - `Source <https://github.com/ry/tensorflow-resnet/blob/master/resnet.py>`_
-#     - `stackoverflow <http://stackoverflow.com/questions/38312668/how-does-one-do-inference-with-batch-normalization-with-tensor-flow>`_
-#     """
-#     def __init__(
-#         self,
-#         layer = None,
-#         decay = 0.999,
-#         epsilon = 0.00001,
-#         act = tf.identity,
-#         is_train = None,
-#         beta_init = tf.zeros_initializer,
-#         # gamma_init = tf.ones_initializer,
-#         gamma_init = tf.random_normal_initializer(mean=1.0, stddev=0.002),
-#         name ='batchnorm_layer',
-#     ):
-#         Layer.__init__(self, name=name)
-#         self.inputs = layer.outputs
-#         print("  [TL] BatchNormLayer %s: decay: %f, epsilon: %f, act: %s, is_train: %s" %
-#                             (self.name, decay, epsilon, act.__name__, is_train))
-#         input_shape = self.inputs.get_shape()
-#         # params_shape = input_shape[-1:]
-#         input_ndim = len(input_shape)
-#         from tensorflow.python.training import moving_averages
-#         from tensorflow.python.ops import control_flow_ops
-#
-#         # gamma_init = tf.random_normal_initializer(mean=gamma, stddev=stddev)
-#
-#         # Variable Scope fix for older TF
-#         scope = name
-#         try:
-#             vscope = tf.variable_scope(scope, default_name=name, values=[self.inputs],)
-#                                     #    reuse=reuse)
-#         except Exception:
-#             vscope = tf.variable_op_scope([self.inputs], scope, name)#, reuse=reuse)
-#
-#         with vscope as scope:
-#             name = scope.name
-#         # with tf.variable_scope(name) as vs:
-#             beta = tf.get_variable('beta', shape=[input_shape[-1]],
-#                                 initializer=beta_init,)
-#                             #    initializer=tf.constant_initializer(beta),)
-#                             #    trainable=trainable, )#restore=restore)
-#             gamma = tf.get_variable('gamma', shape=[input_shape[-1]],
-#                                 initializer=gamma_init, )#trainable=trainable,)
-#                                 # restore=restore)
-#
-#             axis = list(range(input_ndim - 1))
-#             moving_mean = tf.get_variable('moving_mean',
-#                                       input_shape[-1:],
-#                                       initializer=tf.zeros_initializer,
-#                                       trainable=False,)
-#                                     #   restore=restore)
-#             moving_variance = tf.get_variable('moving_variance',
-#                                           input_shape[-1:],
-#                                           initializer=tf.constant_initializer(1.),
-#                                           trainable=False,)
-#                                         #   restore=restore)
-#
-#             # Define a function to update mean and variance
-#             def update_mean_var():
-#                 mean, variance = tf.nn.moments(self.inputs, axis)
-#
-#                 # Fix TF 0.12
-#                 try:
-#                     update_moving_mean = moving_averages.assign_moving_average(
-#                         moving_mean, mean, decay, zero_debias=False)            # if zero_debias=True, accuracy is high ..
-#                     update_moving_variance = moving_averages.assign_moving_average(
-#                         moving_variance, variance, decay, zero_debias=False)
-#                 except Exception as e:  # TF 11
-#                     update_moving_mean = moving_averages.assign_moving_average(
-#                         moving_mean, mean, decay)
-#                     update_moving_variance = moving_averages.assign_moving_average(
-#                         moving_variance, variance, decay)
-#
-#                 with tf.control_dependencies(
-#                         [update_moving_mean, update_moving_variance]):
-#                     return tf.identity(mean), tf.identity(variance)
-#
-#             # Retrieve variable managing training mode
-#             # is_training = tflearn.get_training_mode()
-#             if not is_train:    # test : mean=0, std=1
-#             # if is_train:      # train : mean=0, std=1
-#                 is_training = tf.cast(tf.ones([]), tf.bool)
-#             else:
-#                 is_training = tf.cast(tf.zeros([]), tf.bool)
-#             mean, var = tf.cond(
-#                 is_training, update_mean_var, lambda: (moving_mean, moving_variance))
-#                             #  ones                 zeros
-#             try:
-#                 inference = tf.nn.batch_normalization(
-#                     self.inputs, mean, var, beta, gamma, epsilon)
-#                 inference.set_shape(input_shape)
-#             # Fix for old Tensorflow
-#             except Exception as e:
-#                 inference = tf.nn.batch_norm_with_global_normalization(
-#                     self.inputs, mean, var, beta, gamma, epsilon,
-#                     scale_after_normalization=True,
-#                 )
-#                 inference.set_shape(input_shape)
-#
-#             variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)    # 2 params beta, gamma
-#                 # variables = [beta, gamma, moving_mean, moving_variance]
-#
-#             # print(len(variables))
-#             # for idx, v in enumerate(variables):
-#             #     print("  var {:3}: {:15}   {}".format(idx, str(v.get_shape()), v.name))
-#             # exit()
-#
-#         # Add attributes for easy access
-#         # inference.scope = scope
-#         inference.scope = name
-#         inference.beta = beta
-#         inference.gamma = gamma
-#
-#         self.outputs = act( inference )
-#
-#         self.all_layers = list(layer.all_layers)
-#         self.all_params = list(layer.all_params)
-#         self.all_drop = dict(layer.all_drop)
-#         self.all_layers.extend( [self.outputs] )
-#         self.all_params.extend( variables )
-
-# class BatchNormLayer2(Layer):   # don't work http://r2rt.com/implementing-batch-normalization-in-tensorflow.html
-#     """
-#     The :class:`BatchNormLayer` class is a normalization layer, see ``tf.nn.batch_normalization`` and ``tf.nn.moments``.
-#
-#     Batch normalization on fully-connected or convolutional maps.
-#
-#     Parameters
-#     -----------
-#     layer : a :class:`Layer` instance
-#         The `Layer` class feeding into this layer.
-#     decay : float
-#         A decay factor for ExponentialMovingAverage.
-#     epsilon : float
-#         A small float number to avoid dividing by 0.
-#     act : activation function.
-#     is_train : boolean
-#         Whether train or inference.
-#     beta_init : beta initializer
-#         The initializer for initializing beta
-#     gamma_init : gamma initializer
-#         The initializer for initializing gamma
-#     name : a string or None
-#         An optional name to attach to this layer.
-#
-#     References
-#     ----------
-#     - `Source <https://github.com/ry/tensorflow-resnet/blob/master/resnet.py>`_
-#     - `stackoverflow <http://stackoverflow.com/questions/38312668/how-does-one-do-inference-with-batch-normalization-with-tensor-flow>`_
-#     """
-#     def __init__(
-#         self,
-#         layer = None,
-#         decay = 0.999,
-#         epsilon = 0.00001,
-#         act = tf.identity,
-#         is_train = None,
-#         beta_init = tf.zeros_initializer,
-#         # gamma_init = tf.ones_initializer,
-#         gamma_init = tf.random_normal_initializer(mean=1.0, stddev=0.002),
-#         name ='batchnorm_layer',
-#     ):
-#         Layer.__init__(self, name=name)
-#         self.inputs = layer.outputs
-#         print("  [TL] BatchNormLayer %s: decay: %f, epsilon: %f, act: %s, is_train: %s" %
-#                             (self.name, decay, epsilon, act.__name__, is_train))
-#         x_shape = self.inputs.get_shape()
-#         params_shape = x_shape[-1:]
-#
-#         with tf.variable_scope(name) as vs:
-#             gamma = tf.get_variable("gamma", shape=params_shape,
-#                         initializer=gamma_init)
-#             beta = tf.get_variable("beta", shape=params_shape,
-#                         initializer=beta_init)
-#             pop_mean = tf.get_variable("pop_mean", shape=params_shape,
-#                         initializer=tf.zeros_initializer, trainable=False)
-#             pop_var = tf.get_variable("pop_var", shape=params_shape,
-#                         initializer=tf.constant_initializer(1.), trainable=False)
-#
-#             if is_train:
-#                 batch_mean, batch_var = tf.nn.moments(self.inputs, list(range(len(x_shape) - 1)))
-#                 train_mean = tf.assign(pop_mean,
-#                                        pop_mean * decay + batch_mean * (1 - decay))
-#                 train_var = tf.assign(pop_var,
-#                                       pop_var * decay + batch_var * (1 - decay))
-#                 with tf.control_dependencies([train_mean, train_var]):
-#                     self.outputs = act(tf.nn.batch_normalization(self.inputs,
-#                         batch_mean, batch_var, beta, gamma, epsilon))
-#             else:
-#                 self.outputs = act(tf.nn.batch_normalization(self.inputs,
-#                     pop_mean, pop_var, beta, gamma, epsilon))
-#                     # self.outputs = act( tf.nn.batch_normalization(self.inputs, mean, variance, beta, gamma, epsilon) )
-#             # variables = tf.get_collection(TF_GRAPHKEYS_VARIABLES, scope=vs.name)  # 8 params in TF12 if zero_debias=True
-#             variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)    # 2 params beta, gamma
-#                 # variables = [beta, gamma, moving_mean, moving_variance]
-#
-#             # print(len(variables))
-#             # for idx, v in enumerate(variables):
-#             #     print("  var {:3}: {:15}   {}".format(idx, str(v.get_shape()), v.name))
-#             # exit()
-#
-#         self.all_layers = list(layer.all_layers)
-#         self.all_params = list(layer.all_params)
-#         self.all_drop = dict(layer.all_drop)
-#         self.all_layers.extend( [self.outputs] )
-#         self.all_params.extend( variables )
-
-# class BatchNormLayer3(Layer):   # don't work http://r2rt.com/implementing-batch-normalization-in-tensorflow.html
-#     """
-#     The :class:`BatchNormLayer` class is a normalization layer, see ``tf.nn.batch_normalization`` and ``tf.nn.moments``.
-#
-#     Batch normalization on fully-connected or convolutional maps.
-#
-#     Parameters
-#     -----------
-#     layer : a :class:`Layer` instance
-#         The `Layer` class feeding into this layer.
-#     decay : float
-#         A decay factor for ExponentialMovingAverage.
-#     epsilon : float
-#         A small float number to avoid dividing by 0.
-#     act : activation function.
-#     is_train : boolean
-#         Whether train or inference.
-#     beta_init : beta initializer
-#         The initializer for initializing beta
-#     gamma_init : gamma initializer
-#         The initializer for initializing gamma
-#     name : a string or None
-#         An optional name to attach to this layer.
-#
-#     References
-#     ----------
-#     - `Source <https://github.com/ry/tensorflow-resnet/blob/master/resnet.py>`_
-#     - `stackoverflow <http://stackoverflow.com/questions/38312668/how-does-one-do-inference-with-batch-normalization-with-tensor-flow>`_
-#     """
-#     def __init__(
-#         self,
-#         layer = None,
-#         decay = 0.999,
-#         epsilon = 0.00001,
-#         act = tf.identity,
-#         is_train = None,
-#         beta_init = tf.zeros_initializer,
-#         # gamma_init = tf.ones_initializer,
-#         gamma_init = tf.random_normal_initializer(mean=1.0, stddev=0.002),
-#         name ='batchnorm_layer',
-#     ):
-#         """
-#         Batch normalization on convolutional maps.
-#         Ref.: http://stackoverflow.com/questions/33949786/how-could-i-use-batch-normalization-in-tensorflow
-#         Args:
-#             x:           Tensor, 4D BHWD input maps
-#             n_out:       integer, depth of input maps
-#             phase_train: boolean tf.Varialbe, true indicates training phase
-#             scope:       string, variable scope
-#         Return:
-#             normed:      batch-normalized maps
-#         """
-#         Layer.__init__(self, name=name)
-#         self.inputs = layer.outputs
-#         print("  [TL] BatchNormLayer %s: decay: %f, epsilon: %f, act: %s, is_train: %s" %
-#                             (self.name, decay, epsilon, act.__name__, is_train))
-#         x_shape = self.inputs.get_shape()
-#         params_shape = x_shape[-1:]
-#
-#         if is_train:
-#             phase_train = tf.cast(tf.ones([]), tf.bool)
-#         else:
-#             phase_train = tf.cast(tf.zeros([]), tf.bool)
-#
-#         with tf.variable_scope(name) as vs:
-#             gamma = tf.get_variable("gamma", shape=params_shape,
-#                         initializer=gamma_init)
-#             beta = tf.get_variable("beta", shape=params_shape,
-#                         initializer=beta_init)
-#             batch_mean, batch_var = tf.nn.moments(self.inputs, list(range(len(x_shape) - 1)),#[0,1,2],
-#                             name='moments')
-#             ema = tf.train.ExponentialMovingAverage(decay=decay)
-#
-#             def mean_var_with_update():
-#                 ema_apply_op = ema.apply([batch_mean, batch_var])
-#                 with tf.control_dependencies([ema_apply_op]):
-#                     return tf.identity(batch_mean), tf.identity(batch_var)
-#
-#             mean, var = tf.cond(phase_train,
-#                                 mean_var_with_update,
-#                                 lambda: (ema.average(batch_mean), ema.average(batch_var)))
-#             normed = tf.nn.batch_normalization(self.inputs, mean, var, beta, gamma, epsilon)
-#             self.outputs = act( normed )
-#             variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)    # 2 params beta, gamma
-#                 # variables = [beta, gamma, moving_mean, moving_variance]
-#
-#             # print(len(variables))
-#             # for idx, v in enumerate(variables):
-#             #     print("  var {:3}: {:15}   {}".format(idx, str(v.get_shape()), v.name))
-#             # exit()
-#
-#         self.all_layers = list(layer.all_layers)
-#         self.all_params = list(layer.all_params)
-#         self.all_drop = dict(layer.all_drop)
-#         self.all_layers.extend( [self.outputs] )
-#         self.all_params.extend( variables )
-
-# class BatchNormLayer_old(Layer):  # don't work
-#     """
-#     The :class:`BatchNormLayer` class is a normalization layer, see ``tf.nn.batch_normalization``.
-#
-#     Batch normalization on fully-connected or convolutional maps.
-#
-#     Parameters
-#     -----------
-#     layer : a :class:`Layer` instance
-#         The `Layer` class feeding into this layer.
-#     decay : float
-#         A decay factor for ExponentialMovingAverage.
-#     epsilon : float
-#         A small float number to avoid dividing by 0.
-#     is_train : boolean
-#         Whether train or inference.
-#     name : a string or None
-#         An optional name to attach to this layer.
-#
-#     References
-#     ----------
-#     - `tf.nn.batch_normalization <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.nn.batch_normalization.md>`_
-#     - `stackoverflow <http://stackoverflow.com/questions/33949786/how-could-i-use-batch-normalization-in-tensorflow>`_
-#     - `tensorflow.contrib <https://github.com/tensorflow/tensorflow/blob/b826b79718e3e93148c3545e7aa3f90891744cc0/tensorflow/contrib/layers/python/layers/layers.py#L100>`_
-#     """
-#     def __init__(
-#         self,
-#         layer = None,
-#         act = tf.identity,
-#         decay = 0.999,
-#         epsilon = 0.001,
-#         is_train = None,
-#         name ='batchnorm_layer',
-#     ):
-#         Layer.__init__(self, name=name)
-#         self.inputs = layer.outputs
-#         print("  [TL] BatchNormLayer %s: decay: %f, epsilon: %f, is_train: %s" %
-#                             (self.name, decay, epsilon, is_train))
-#         if is_train == None:
-#             raise Exception("is_train must be True or False")
-#
-#         # (name, input_var, decay, epsilon, is_train)
-#         inputs_shape = self.inputs.get_shape()
-#         axis = list(range(len(inputs_shape) - 1))
-#         params_shape = inputs_shape[-1:]
-#
-#         with tf.variable_scope(name) as vs:
-#             beta = tf.get_variable(name='beta', shape=params_shape,
-#                                  initializer=tf.constant_initializer(0.0))
-#             gamma = tf.get_variable(name='gamma', shape=params_shape,
-#                                   initializer=tf.constant_initializer(1.0))
-#             batch_mean, batch_var = tf.nn.moments(self.inputs,
-#                                                 axis,
-#                                                 name='moments')
-#             ema = tf.train.ExponentialMovingAverage(decay=decay)
-#
-#             def mean_var_with_update():
-#               ema_apply_op = ema.apply([batch_mean, batch_var])
-#               with tf.control_dependencies([ema_apply_op]):
-#                   return tf.identity(batch_mean), tf.identity(batch_var)
-#
-#             if is_train:
-#                 is_train = tf.cast(tf.ones(1), tf.bool)
-#             else:
-#                 is_train = tf.cast(tf.zeros(1), tf.bool)
-#
-#             is_train = tf.reshape(is_train, [])
-#
-#             # print(is_train)
-#             # exit()
-#
-#             mean, var = tf.cond(
-#               is_train,
-#               mean_var_with_update,
-#               lambda: (ema.average(batch_mean), ema.average(batch_var))
-#             )
-#             normed = tf.nn.batch_normalization(
-#               x=self.inputs,
-#               mean=mean,
-#               variance=var,
-#               offset=beta,
-#               scale=gamma,
-#               variance_epsilon=epsilon,
-#               name='tf_bn'
-#             )
-#         self.outputs = act( normed )
-#
-#         self.all_layers = list(layer.all_layers)
-#         self.all_params = list(layer.all_params)
-#         self.all_drop = dict(layer.all_drop)
-#         self.all_layers.extend( [self.outputs] )
-#         self.all_params.extend( [beta, gamma] )
-
-class InstanceNormLayer(Layer):
-    """The :class:`InstanceNormLayer` class is a for instance normalization.
-
-    Parameters
-    -----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    act : activation function.
-    epsilon : float
-        A small float number.
-    scale_init : beta initializer
-        The initializer for initializing beta
-    offset_init : gamma initializer
-        The initializer for initializing gamma
-    name : a string or None
-        An optional name to attach to this layer.
-    """
-    def __init__(
-    self,
-    layer = None,
-    act = tf.identity,
-    epsilon = 1e-5,
-    scale_init = tf.truncated_normal_initializer(mean=1.0, stddev=0.02),
-    offset_init = tf.constant_initializer(0.0),
-    name ='instan_norm',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-        print("  [TL] InstanceNormLayer %s: epsilon:%f act:%s" %
-                            (self.name, epsilon, act.__name__))
-
-        with tf.variable_scope(name) as vs:
-            mean, var = tf.nn.moments(self.inputs, [1, 2], keep_dims=True)
-            scale = tf.get_variable('scale',[self.inputs.get_shape()[-1]],
-                initializer=tf.truncated_normal_initializer(mean=1.0, stddev=0.02))
-            offset = tf.get_variable('offset',[self.inputs.get_shape()[-1]],initializer=tf.constant_initializer(0.0))
-            self.outputs = scale * tf.div(self.inputs-mean, tf.sqrt(var+epsilon)) + offset
-            self.outputs = act(self.outputs)
-            variables = tf.get_collection(TF_GRAPHKEYS_VARIABLES, scope=vs.name)
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-        self.all_params.extend( variables )
-
-
-## Pooling layer
-class PoolLayer(Layer):
-    """
-    The :class:`PoolLayer` class is a Pooling layer, you can choose
-    ``tf.nn.max_pool`` and ``tf.nn.avg_pool`` for 2D or
-    ``tf.nn.max_pool3d`` and ``tf.nn.avg_pool3d`` for 3D.
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    ksize : a list of ints that has length >= 4.
-        The size of the window for each dimension of the input tensor.
-    strides : a list of ints that has length >= 4.
-        The stride of the sliding window for each dimension of the input tensor.
-    padding : a string from: "SAME", "VALID".
-        The type of padding algorithm to use.
-    pool : a pooling function
-        - see `TensorFlow pooling APIs <https://www.tensorflow.org/versions/master/api_docs/python/nn.html#pooling>`_
-        - class ``tf.nn.max_pool``
-        - class ``tf.nn.avg_pool``
-        - class ``tf.nn.max_pool3d``
-        - class ``tf.nn.avg_pool3d``
-    name : a string or None
-        An optional name to attach to this layer.
-
-    Examples
-    --------
-    - see :class:`Conv2dLayer`.
-    """
-    def __init__(
-        self,
-        layer = None,
-        ksize=[1, 2, 2, 1],
-        strides=[1, 2, 2, 1],
-        padding='SAME',
-        pool = tf.nn.max_pool,
-        name ='pool_layer',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-        print("  [TL] PoolLayer   %s: ksize:%s strides:%s padding:%s pool:%s" %
-                            (self.name, str(ksize), str(strides), padding, pool.__name__))
-
-        self.outputs = pool(self.inputs, ksize=ksize, strides=strides, padding=padding, name=name)
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-
-## Padding layer
-class PadLayer(Layer):
-    """
-    The :class:`PadLayer` class is a Padding layer for any modes and dimensions.
-    Please see `tf.pad <https://www.tensorflow.org/api_docs/python/tf/pad>`_ for usage.
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    padding : a Tensor of type int32.
-    mode : one of "CONSTANT", "REFLECT", or "SYMMETRIC" (case-insensitive)
-    name : a string or None
-        An optional name to attach to this layer.
-    """
-    def __init__(
-        self,
-        layer = None,
-        paddings = None,
-        mode = 'CONSTANT',
-        name = 'pad_layer',
-    ):
-        Layer.__init__(self, name=name)
-        assert paddings is not None, "paddings should be a Tensor of type int32. see https://www.tensorflow.org/api_docs/python/tf/pad"
-        self.inputs = layer.outputs
-        print("  [TL] PadLayer   %s: paddings:%s mode:%s" %
-                            (self.name, list(paddings), mode))
-
-        self.outputs = tf.pad(self.inputs, paddings=paddings, mode=mode, name=name)
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-
-## Object Detection
-class ROIPoolingLayer(Layer):
-    """
-    The :class:`ROIPoolingLayer` class is Region of interest pooling layer.
-
-    Parameters
-    -----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer, the feature maps on which to perform the pooling operation
-    rois : list of regions of interest in the format (feature map index, upper left, bottom right)
-    pool_width : int, size of the pooling sections.
-    pool_width : int, size of the pooling sections.
-
-    Notes
-    -----------
-    - This implementation is from `Deepsense-AI <https://github.com/deepsense-ai/roi-pooling>`_ .
-    - Please install it by the instruction `HERE <https://github.com/zsdonghao/tensorlayer/blob/master/tensorlayer/third_party/roi_pooling/README.md>`_.
-    """
-    def __init__(
-        self,
-        #inputs = None,
-        layer = None,
-        rois = None,
-        pool_height = 2,
-        pool_width = 2,
-        name = 'roipooling_layer',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-        print ("  [TL] ROIPoolingLayer %s: (%d, %d)" % (self.name, pool_height, pool_width))
-        try:
-            from tensorlayer.third_party.roi_pooling.roi_pooling.roi_pooling_ops import roi_pooling
-        except Exception as e:
-            print(e)
-            print("\nHINT: \n1. https://github.com/deepsense-ai/roi-pooling  \n2. tensorlayer/third_party/roi_pooling\n")
-        self.outputs = roi_pooling(self.inputs, rois, pool_height, pool_width)
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-
-
-## TimeDistributedLayer
-class TimeDistributedLayer(Layer):
-    """
-    The :class:`TimeDistributedLayer` class that applies a function to every timestep of the input tensor.
-    For example, if using :class:`DenseLayer` as the ``layer_class``, inputs [batch_size , length, dim]
-    outputs [batch_size , length, new_dim].
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer, [batch_size , length, dim]
-    layer_class : a :class:`Layer` class
-    args : dictionary
-        The arguments for the ``layer_class``.
-    name : a string or None
-        An optional name to attach to this layer.
-
-    Examples
-    --------
-    >>> batch_size = 32
-    >>> timestep = 20
-    >>> input_dim = 100
-    >>> x = tf.placeholder(dtype=tf.float32, shape=[batch_size, timestep,  input_dim], name="encode_seqs")
-    >>> net = InputLayer(x, name='input')
-    >>> net = TimeDistributedLayer(net, layer_class=DenseLayer, args={'n_units':50, 'name':'dense'}, name='time_dense')
-    ... [TL] InputLayer  input: (32, 20, 100)
-    ... [TL] TimeDistributedLayer time_dense: layer_class:DenseLayer
-    >>> print(net.outputs._shape)
-    ... (32, 20, 50)
-    >>> net.print_params(False)
-    ... param   0: (100, 50)          time_dense/dense/W:0
-    ... param   1: (50,)              time_dense/dense/b:0
-    ... num of params: 5050
-    """
-    def __init__(
-        self,
-        layer = None,
-        layer_class = None,
-        args = {},
-        name ='time_distributed',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-        print("  [TL] TimeDistributedLayer %s: layer_class:%s args:%s" %
-                            (self.name, layer_class.__name__, args))
-
-        if not args: args = dict()
-        assert isinstance(args, dict), "'args' must be a dict."
-
-        if not isinstance(self.inputs, tf.Tensor):
-            self.inputs = tf.transpose(tf.stack(self.inputs), [1, 0, 2])
-
-        input_shape = self.inputs.get_shape()
-
-        timestep = input_shape[1]
-        x = tf.unstack(self.inputs, axis=1)
-
-        with ops.suppress_stdout():
-            for i in range(0, timestep):
-                with tf.variable_scope(name, reuse=(set_keep['name_reuse'] if i==0 else True)) as vs:
-                    set_name_reuse((set_keep['name_reuse'] if i==0 else True))
-                    net = layer_class(InputLayer(x[i], name=args['name']+str(i)), **args)
-                    # net = layer_class(InputLayer(x[i], name="input_"+args['name']), **args)
-                    x[i] = net.outputs
-                    variables = tf.get_collection(TF_GRAPHKEYS_VARIABLES, scope=vs.name)
-
-        self.outputs = tf.stack(x, axis=1, name=name)
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-        self.all_params.extend( variables )
-
-
-
-## Recurrent layer
-class RNNLayer(Layer):
-    """
-    The :class:`RNNLayer` class is a RNN layer, you can implement vanilla RNN,
-    LSTM and GRU with it.
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    cell_fn : a TensorFlow's core RNN cell as follow (Note TF1.0+ and TF1.0- are different).
-        - see `RNN Cells in TensorFlow <https://www.tensorflow.org/api_docs/python/>`_
-    cell_init_args : a dictionary
-        The arguments for the cell initializer.
-    n_hidden : an int
-        The number of hidden units in the layer.
-    initializer : initializer
-        The initializer for initializing the parameters.
-    n_steps : an int
-        The sequence length.
-    initial_state : None or RNN State
-        If None, initial_state is zero_state.
-    return_last : boolean
-        - If True, return the last output, "Sequence input and single output"
-        - If False, return all outputs, "Synced sequence input and output"
-        - In other word, if you want to apply one or more RNN(s) on this layer, set to False.
-    return_seq_2d : boolean
-        - When return_last = False
-        - If True, return 2D Tensor [n_example, n_hidden], for stacking DenseLayer after it.
-        - If False, return 3D Tensor [n_example/n_steps, n_steps, n_hidden], for stacking multiple RNN after it.
-    name : a string or None
-        An optional name to attach to this layer.
-
-    Variables
-    --------------
-    outputs : a tensor
-        The output of this RNN.
-        return_last = False, outputs = all cell_output, which is the hidden state.
-            cell_output.get_shape() = (?, n_hidden)
-
-    final_state : a tensor or StateTuple
-        When state_is_tuple = False,
-        it is the final hidden and cell states, states.get_shape() = [?, 2 * n_hidden].\n
-        When state_is_tuple = True, it stores two elements: (c, h), in that order.
-        You can get the final state after each iteration during training, then
-        feed it to the initial state of next iteration.
-
-    initial_state : a tensor or StateTuple
-        It is the initial state of this RNN layer, you can use it to initialize
-        your state at the begining of each epoch or iteration according to your
-        training procedure.
-
-    batch_size : int or tensor
-        Is int, if able to compute the batch_size, otherwise, tensor for ``?``.
-
-    Examples
-    --------
-    - For words
-    >>> input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
-    >>> net = tl.layers.EmbeddingInputlayer(
-    ...                 inputs = input_data,
-    ...                 vocabulary_size = vocab_size,
-    ...                 embedding_size = hidden_size,
-    ...                 E_init = tf.random_uniform_initializer(-init_scale, init_scale),
-    ...                 name ='embedding_layer')
-    >>> net = tl.layers.DropoutLayer(net, keep=keep_prob, is_fix=True, is_train=is_train, name='drop1')
-    >>> net = tl.layers.RNNLayer(net,
-    ...             cell_fn=tf.contrib.rnn.BasicLSTMCell,
-    ...             cell_init_args={'forget_bias': 0.0},# 'state_is_tuple': True},
-    ...             n_hidden=hidden_size,
-    ...             initializer=tf.random_uniform_initializer(-init_scale, init_scale),
-    ...             n_steps=num_steps,
-    ...             return_last=False,
-    ...             name='basic_lstm_layer1')
-    >>> lstm1 = net
-    >>> net = tl.layers.DropoutLayer(net, keep=keep_prob, is_fix=True, is_train=is_train, name='drop2')
-    >>> net = tl.layers.RNNLayer(net,
-    ...             cell_fn=tf.contrib.rnn.BasicLSTMCell,
-    ...             cell_init_args={'forget_bias': 0.0}, # 'state_is_tuple': True},
-    ...             n_hidden=hidden_size,
-    ...             initializer=tf.random_uniform_initializer(-init_scale, init_scale),
-    ...             n_steps=num_steps,
-    ...             return_last=False,
-    ...             return_seq_2d=True,
-    ...             name='basic_lstm_layer2')
-    >>> lstm2 = net
-    >>> net = tl.layers.DropoutLayer(net, keep=keep_prob, is_fix=True, is_train=is_train, name='drop3')
-    >>> net = tl.layers.DenseLayer(net,
-    ...             n_units=vocab_size,
-    ...             W_init=tf.random_uniform_initializer(-init_scale, init_scale),
-    ...             b_init=tf.random_uniform_initializer(-init_scale, init_scale),
-    ...             act = tl.activation.identity, name='output_layer')
-
-    - For CNN+LSTM
-    >>> x = tf.placeholder(tf.float32, shape=[batch_size, image_size, image_size, 1])
-    >>> net = tl.layers.InputLayer(x, name='input_layer')
-    >>> net = tl.layers.Conv2dLayer(net,
-    ...                         act = tf.nn.relu,
-    ...                         shape = [5, 5, 1, 32],  # 32 features for each 5x5 patch
-    ...                         strides=[1, 2, 2, 1],
-    ...                         padding='SAME',
-    ...                         name ='cnn_layer1')
-    >>> net = tl.layers.PoolLayer(net,
-    ...                         ksize=[1, 2, 2, 1],
-    ...                         strides=[1, 2, 2, 1],
-    ...                         padding='SAME',
-    ...                         pool = tf.nn.max_pool,
-    ...                         name ='pool_layer1')
-    >>> net = tl.layers.Conv2dLayer(net,
-    ...                         act = tf.nn.relu,
-    ...                         shape = [5, 5, 32, 10], # 10 features for each 5x5 patch
-    ...                         strides=[1, 2, 2, 1],
-    ...                         padding='SAME',
-    ...                         name ='cnn_layer2')
-    >>> net = tl.layers.PoolLayer(net,
-    ...                         ksize=[1, 2, 2, 1],
-    ...                         strides=[1, 2, 2, 1],
-    ...                         padding='SAME',
-    ...                         pool = tf.nn.max_pool,
-    ...                         name ='pool_layer2')
-    >>> net = tl.layers.FlattenLayer(net, name='flatten_layer')
-    >>> net = tl.layers.ReshapeLayer(net, shape=[-1, num_steps, int(net.outputs._shape[-1])])
-    >>> rnn1 = tl.layers.RNNLayer(net,
-    ...                         cell_fn=tf.nn.rnn_cell.LSTMCell,
-    ...                         cell_init_args={},
-    ...                         n_hidden=200,
-    ...                         initializer=tf.random_uniform_initializer(-0.1, 0.1),
-    ...                         n_steps=num_steps,
-    ...                         return_last=False,
-    ...                         return_seq_2d=True,
-    ...                         name='rnn_layer')
-    >>> net = tl.layers.DenseLayer(rnn1, n_units=3,
-    ...                         act = tl.activation.identity, name='output_layer')
-
-    Notes
-    -----
-    Input dimension should be rank 3 : [batch_size, n_steps, n_features], if no, please see :class:`ReshapeLayer`.
-
-    References
-    ----------
-    - `Neural Network RNN Cells in TensorFlow <https://www.tensorflow.org/api_docs/python/rnn_cell/>`_
-    - `tensorflow/python/ops/rnn.py <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/rnn.py>`_
-    - `tensorflow/python/ops/rnn_cell.py <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/rnn_cell.py>`_
-    - see TensorFlow tutorial ``ptb_word_lm.py``, TensorLayer tutorials ``tutorial_ptb_lstm*.py`` and ``tutorial_generate_text.py``
-    """
-    def __init__(
-        self,
-        layer = None,
-        cell_fn = None,#tf.nn.rnn_cell.BasicRNNCell,
-        cell_init_args = {},
-        n_hidden = 100,
-        initializer = tf.random_uniform_initializer(-0.1, 0.1),
-        n_steps = 5,
-        initial_state = None,
-        return_last = False,
-        # is_reshape = True,
-        return_seq_2d = False,
-        name = 'rnn_layer',
-    ):
-        Layer.__init__(self, name=name)
-        if cell_fn is None:
-            raise Exception("Please put in cell_fn")
-        if 'GRU' in cell_fn.__name__:
-            try:
-                cell_init_args.pop('state_is_tuple')
-            except:
-                pass
-
-        self.inputs = layer.outputs
-
-        print("  [TL] RNNLayer %s: n_hidden:%d n_steps:%d in_dim:%d in_shape:%s cell_fn:%s " % (self.name, n_hidden,
-            n_steps, self.inputs.get_shape().ndims, self.inputs.get_shape(), cell_fn.__name__))
-        # You can get the dimension by .get_shape() or ._shape, and check the
-        # dimension by .with_rank() as follow.
-        # self.inputs.get_shape().with_rank(2)
-        # self.inputs.get_shape().with_rank(3)
-
-        # Input dimension should be rank 3 [batch_size, n_steps(max), n_features]
-        try:
-            self.inputs.get_shape().with_rank(3)
-        except:
-            raise Exception("RNN : Input dimension should be rank 3 : [batch_size, n_steps, n_features]")
-
-
-        # is_reshape : boolean (deprecate)
-        #     Reshape the inputs to 3 dimension tensor.\n
-        #     If input is［batch_size, n_steps, n_features], we do not need to reshape it.\n
-        #     If input is [batch_size * n_steps, n_features], we need to reshape it.
-        # if is_reshape:
-        #     self.inputs = tf.reshape(self.inputs, shape=[-1, n_steps, int(self.inputs._shape[-1])])
-
-        fixed_batch_size = self.inputs.get_shape().with_rank_at_least(1)[0]
-
-        if fixed_batch_size.value:
-            batch_size = fixed_batch_size.value
-            print("       RNN batch_size (concurrent processes): %d" % batch_size)
-        else:
-            from tensorflow.python.ops import array_ops
-            batch_size = array_ops.shape(self.inputs)[0]
-            print("       non specified batch_size, uses a tensor instead.")
-        self.batch_size = batch_size
-
-        # Simplified version of tensorflow.models.rnn.rnn.py's rnn().
-        # This builds an unrolled LSTM for tutorial purposes only.
-        # In general, use the rnn() or state_saving_rnn() from rnn.py.
-        #
-        # The alternative version of the code below is:
-        #
-        # from tensorflow.models.rnn import rnn
-        # inputs = [tf.squeeze(input_, [1])
-        #           for input_ in tf.split(1, num_steps, inputs)]
-        # outputs, state = rnn.rnn(cell, inputs, initial_state=self._initial_state)
-        outputs = []
-        if 'reuse' in inspect.getargspec(cell_fn.__init__).args:
-            self.cell = cell = cell_fn(num_units=n_hidden, reuse=tf.get_variable_scope().reuse, **cell_init_args)
-        else:
-            self.cell = cell = cell_fn(num_units=n_hidden, **cell_init_args)
-        if initial_state is None:
-            self.initial_state = cell.zero_state(batch_size, dtype=tf.float32)  # 1.2.3
-        state = self.initial_state
-        # with tf.variable_scope("model", reuse=None, initializer=initializer):
-        with tf.variable_scope(name, initializer=initializer) as vs:
-            for time_step in range(n_steps):
-                if time_step > 0: tf.get_variable_scope().reuse_variables()
-                (cell_output, state) = cell(self.inputs[:, time_step, :], state)
-                outputs.append(cell_output)
-
-            # Retrieve just the RNN variables.
-            # rnn_variables = [v for v in tf.all_variables() if v.name.startswith(vs.name)]
-            rnn_variables = tf.get_collection(TF_GRAPHKEYS_VARIABLES, scope=vs.name)
-
-        print("     n_params : %d" % (len(rnn_variables)))
-
-        if return_last:
-            # 2D Tensor [batch_size, n_hidden]
-            self.outputs = outputs[-1]
-        else:
-            if return_seq_2d:
-                # PTB tutorial: stack dense layer after that, or compute the cost from the output
-                # 2D Tensor [n_example, n_hidden]
-                try: # TF1.0
-                    self.outputs = tf.reshape(tf.concat(outputs, 1), [-1, n_hidden])
-                except: # TF0.12
-                    self.outputs = tf.reshape(tf.concat(1, outputs), [-1, n_hidden])
-
-
-            else:
-                # <akara>: stack more RNN layer after that
-                # 3D Tensor [n_example/n_steps, n_steps, n_hidden]
-                try: # TF1.0
-                    self.outputs = tf.reshape(tf.concat(outputs, 1), [-1, n_steps, n_hidden])
-                except: # TF0.12
-                    self.outputs = tf.reshape(tf.concat(1, outputs), [-1, n_steps, n_hidden])
-
-        self.final_state = state
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        # print(type(self.outputs))
-        self.all_layers.extend( [self.outputs] )
-        self.all_params.extend( rnn_variables )
-
-class BiRNNLayer(Layer):
-    """
-    The :class:`BiRNNLayer` class is a Bidirectional RNN layer.
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    cell_fn : a TensorFlow's core RNN cell as follow (Note TF1.0+ and TF1.0- are different).
-        - see `RNN Cells in TensorFlow <https://www.tensorflow.org/api_docs/python/>`_
-    cell_init_args : a dictionary
-        The arguments for the cell initializer.
-    n_hidden : an int
-        The number of hidden units in the layer.
-    initializer : initializer
-        The initializer for initializing the parameters.
-    n_steps : an int
-        The sequence length.
-    fw_initial_state : None or forward RNN State
-        If None, initial_state is zero_state.
-    bw_initial_state : None or backward RNN State
-        If None, initial_state is zero_state.
-    dropout : `tuple` of `float`: (input_keep_prob, output_keep_prob).
-        The input and output keep probability.
-    n_layer : an int, default is 1.
-        The number of RNN layers.
-    return_last : boolean
-        - If True, return the last output, "Sequence input and single output"
-        - If False, return all outputs, "Synced sequence input and output"
-        - In other word, if you want to apply one or more RNN(s) on this layer, set to False.
-    return_seq_2d : boolean
-        - When return_last = False
-        - If True, return 2D Tensor [n_example, n_hidden], for stacking DenseLayer after it.
-        - If False, return 3D Tensor [n_example/n_steps, n_steps, n_hidden], for stacking multiple RNN after it.
-    name : a string or None
-        An optional name to attach to this layer.
-
-    Variables
-    --------------
-    outputs : a tensor
-        The output of this RNN.
-        return_last = False, outputs = all cell_output, which is the hidden state.
-            cell_output.get_shape() = (?, n_hidden)
-
-    fw(bw)_final_state : a tensor or StateTuple
-        When state_is_tuple = False,
-        it is the final hidden and cell states, states.get_shape() = [?, 2 * n_hidden].\n
-        When state_is_tuple = True, it stores two elements: (c, h), in that order.
-        You can get the final state after each iteration during training, then
-        feed it to the initial state of next iteration.
-
-    fw(bw)_initial_state : a tensor or StateTuple
-        It is the initial state of this RNN layer, you can use it to initialize
-        your state at the begining of each epoch or iteration according to your
-        training procedure.
-
-    batch_size : int or tensor
-        Is int, if able to compute the batch_size, otherwise, tensor for ``?``.
-
-    Notes
-    -----
-    - Input dimension should be rank 3 : [batch_size, n_steps, n_features], if no, please see :class:`ReshapeLayer`.
-    - For predicting, the sequence length has to be the same with the sequence length of training, while, for normal
-    RNN, we can use sequence length of 1 for predicting.
-
-    References
-    ----------
-    - `Source <https://github.com/akaraspt/deepsleep/blob/master/deepsleep/model.py>`_
-    """
-    def __init__(
-        self,
-        layer = None,
-        cell_fn = None, #tf.nn.rnn_cell.LSTMCell,
-        cell_init_args = {'use_peepholes':True, 'state_is_tuple':True},
-        n_hidden = 100,
-        initializer = tf.random_uniform_initializer(-0.1, 0.1),
-        n_steps = 5,
-        fw_initial_state = None,
-        bw_initial_state = None,
-        dropout = None,
-        n_layer = 1,
-        return_last = False,
-        return_seq_2d = False,
-        name = 'birnn_layer',
-    ):
-        Layer.__init__(self, name=name)
-        if cell_fn is None:
-            raise Exception("Please put in cell_fn")
-        if 'GRU' in cell_fn.__name__:
-            try:
-                cell_init_args.pop('state_is_tuple')
-            except:
-                pass
-
-        self.inputs = layer.outputs
-
-        print("  [TL] BiRNNLayer %s: n_hidden:%d n_steps:%d in_dim:%d in_shape:%s cell_fn:%s dropout:%s n_layer:%d " % (self.name, n_hidden,
-            n_steps, self.inputs.get_shape().ndims, self.inputs.get_shape(), cell_fn.__name__, dropout, n_layer))
-
-        fixed_batch_size = self.inputs.get_shape().with_rank_at_least(1)[0]
-
-        if fixed_batch_size.value:
-            self.batch_size = fixed_batch_size.value
-            print("       RNN batch_size (concurrent processes): %d" % self.batch_size)
-        else:
-            from tensorflow.python.ops import array_ops
-            self.batch_size = array_ops.shape(self.inputs)[0]
-            print("       non specified batch_size, uses a tensor instead.")
-
-        # Input dimension should be rank 3 [batch_size, n_steps(max), n_features]
-        try:
-            self.inputs.get_shape().with_rank(3)
-        except:
-            raise Exception("RNN : Input dimension should be rank 3 : [batch_size, n_steps, n_features]")
-
-        with tf.variable_scope(name, initializer=initializer) as vs:
-            rnn_creator = lambda: cell_fn(num_units=n_hidden, **cell_init_args)
-            # Apply dropout
-            if dropout:
-                if type(dropout) in [tuple, list]:
-                    in_keep_prob = dropout[0]
-                    out_keep_prob = dropout[1]
-                elif isinstance(dropout, float):
-                    in_keep_prob, out_keep_prob = dropout, dropout
-                else:
-                    raise Exception("Invalid dropout type (must be a 2-D tuple of "
-                                    "float)")
-                try: # TF 1.0
-                    DropoutWrapper_fn = tf.contrib.rnn.DropoutWrapper
-                except:
-                    DropoutWrapper_fn = tf.nn.rnn_cell.DropoutWrapper
-                cell_creator = lambda: DropoutWrapper_fn(rnn_creator(),
-                                                         input_keep_prob=in_keep_prob,
-                                                         output_keep_prob=1.0)  # out_keep_prob)
-            else:
-                cell_creator = rnn_creator
-            self.fw_cell = cell_creator()
-            self.bw_cell = cell_creator()
-
-            # Apply multiple layers
-            if n_layer > 1:
-                try: # TF1.0
-                    MultiRNNCell_fn = tf.contrib.rnn.MultiRNNCell
-                except:
-                    MultiRNNCell_fn = tf.nn.rnn_cell.MultiRNNCell
-
-                try:
-                    self.fw_cell = MultiRNNCell_fn([cell_creator() for _ in range(n_layer)], state_is_tuple=True)
-                    self.bw_cell = MultiRNNCell_fn([cell_creator() for _ in range(n_layer)], state_is_tuple=True)
-                except:
-                    self.fw_cell = MultiRNNCell_fn([cell_creator() for _ in range(n_layer)])
-                    self.bw_cell = MultiRNNCell_fn([cell_creator() for _ in range(n_layer)])
-
-            # Initial state of RNN
-            if fw_initial_state is None:
-                self.fw_initial_state = self.fw_cell.zero_state(self.batch_size, dtype=tf.float32)
-            else:
-                self.fw_initial_state = fw_initial_state
-            if bw_initial_state is None:
-                self.bw_initial_state = self.bw_cell.zero_state(self.batch_size, dtype=tf.float32)
-            else:
-                self.bw_initial_state = bw_initial_state
-            # exit()
-            # Feedforward to MultiRNNCell
-            try: ## TF1.0
-                list_rnn_inputs = tf.unstack(self.inputs, axis=1)
-            except: ## TF0.12
-                list_rnn_inputs = tf.unpack(self.inputs, axis=1)
-
-            try: # TF1.0
-                bidirectional_rnn_fn = tf.contrib.rnn.static_bidirectional_rnn
-            except:
-                bidirectional_rnn_fn = tf.nn.bidirectional_rnn
-            outputs, fw_state, bw_state = bidirectional_rnn_fn(               # outputs, fw_state, bw_state = tf.contrib.rnn.static_bidirectional_rnn(
-                cell_fw=self.fw_cell,
-                cell_bw=self.bw_cell,
-                inputs=list_rnn_inputs,
-                initial_state_fw=self.fw_initial_state,
-                initial_state_bw=self.bw_initial_state
-            )
-
-            if return_last:
-                self.outputs = outputs[-1]
-            else:
-                self.outputs = outputs
-                if return_seq_2d:
-                    # 2D Tensor [n_example, n_hidden]
-                    try: # TF1.0
-                        self.outputs = tf.reshape(tf.concat(outputs, 1), [-1, n_hidden*2])
-                    except: # TF0.12
-                        self.outputs = tf.reshape(tf.concat(1, outputs), [-1, n_hidden*2])
-                else:
-                    # <akara>: stack more RNN layer after that
-                    # 3D Tensor [n_example/n_steps, n_steps, n_hidden]
-
-                    try: # TF1.0
-                        self.outputs = tf.reshape(tf.concat(outputs,1), [-1, n_steps, n_hidden*2])
-                    except: # TF0.12
-                        self.outputs = tf.reshape(tf.concat(1, outputs), [-1, n_steps, n_hidden*2])
-            self.fw_final_state = fw_state
-            self.bw_final_state = bw_state
-
-            # Retrieve just the RNN variables.
-            rnn_variables = tf.get_collection(TF_GRAPHKEYS_VARIABLES, scope=vs.name)
-
-        print("     n_params : %d" % (len(rnn_variables)))
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-        self.all_params.extend( rnn_variables )
-
-# Advanced Ops for Dynamic RNN
-def advanced_indexing_op(input, index):
-    """Advanced Indexing for Sequences, returns the outputs by given sequence lengths.
-    When return the last output :class:`DynamicRNNLayer` uses it to get the last outputs with the sequence lengths.
-
-    Parameters
-    -----------
-    input : tensor for data
-        [batch_size, n_step(max), n_features]
-    index : tensor for indexing, i.e. sequence_length in Dynamic RNN.
-        [batch_size]
-
-    Examples
-    ---------
-    >>> batch_size, max_length, n_features = 3, 5, 2
-    >>> z = np.random.uniform(low=-1, high=1, size=[batch_size, max_length, n_features]).astype(np.float32)
-    >>> b_z = tf.constant(z)
-    >>> sl = tf.placeholder(dtype=tf.int32, shape=[batch_size])
-    >>> o = advanced_indexing_op(b_z, sl)
-    >>>
-    >>> sess = tf.InteractiveSession()
-    >>> tl.layers.initialize_global_variables(sess)
-    >>>
-    >>> order = np.asarray([1,1,2])
-    >>> print("real",z[0][order[0]-1], z[1][order[1]-1], z[2][order[2]-1])
-    >>> y = sess.run([o], feed_dict={sl:order})
-    >>> print("given",order)
-    >>> print("out", y)
-    ... real [-0.93021595  0.53820813] [-0.92548317 -0.77135968] [ 0.89952248  0.19149846]
-    ... given [1 1 2]
-    ... out [array([[-0.93021595,  0.53820813],
-    ...             [-0.92548317, -0.77135968],
-    ...             [ 0.89952248,  0.19149846]], dtype=float32)]
-
-    References
-    -----------
-    - Modified from TFlearn (the original code is used for fixed length rnn), `references <https://github.com/tflearn/tflearn/blob/master/tflearn/layers/recurrent.py>`_.
-    """
-    batch_size = tf.shape(input)[0]
-    # max_length = int(input.get_shape()[1])    # for fixed length rnn, length is given
-    max_length = tf.shape(input)[1]             # for dynamic_rnn, length is unknown
-    dim_size = int(input.get_shape()[2])
-    index = tf.range(0, batch_size) * max_length + (index - 1)
-    flat = tf.reshape(input, [-1, dim_size])
-    relevant = tf.gather(flat, index)
-    return relevant
-
-def retrieve_seq_length_op(data):
-    """An op to compute the length of a sequence from input shape of [batch_size, n_step(max), n_features],
-    it can be used when the features of padding (on right hand side) are all zeros.
-
-    Parameters
-    -----------
-    data : tensor
-        [batch_size, n_step(max), n_features] with zero padding on right hand side.
-
-    Examples
-    ---------
-    >>> data = [[[1],[2],[0],[0],[0]],
-    ...         [[1],[2],[3],[0],[0]],
-    ...         [[1],[2],[6],[1],[0]]]
-    >>> data = np.asarray(data)
-    >>> print(data.shape)
-    ... (3, 5, 1)
-    >>> data = tf.constant(data)
-    >>> sl = retrieve_seq_length_op(data)
-    >>> sess = tf.InteractiveSession()
-    >>> tl.layers.initialize_global_variables(sess)
-    >>> y = sl.eval()
-    ... [2 3 4]
-
-    - Multiple features
-    >>> data = [[[1,2],[2,2],[1,2],[1,2],[0,0]],
-    ...         [[2,3],[2,4],[3,2],[0,0],[0,0]],
-    ...         [[3,3],[2,2],[5,3],[1,2],[0,0]]]
-    >>> sl
-    ... [4 3 4]
-
-    References
-    ------------
-    - Borrow from `TFlearn <https://github.com/tflearn/tflearn/blob/master/tflearn/layers/recurrent.py>`_.
-    """
-    with tf.name_scope('GetLength'):
-        ## TF 1.0 change reduction_indices to axis
-        used = tf.sign(tf.reduce_max(tf.abs(data), 2))
-        length = tf.reduce_sum(used, 1)
-        ## TF < 1.0
-        # used = tf.sign(tf.reduce_max(tf.abs(data), reduction_indices=2))
-        # length = tf.reduce_sum(used, reduction_indices=1)
-        length = tf.cast(length, tf.int32)
-    return length
-
-def retrieve_seq_length_op2(data):
-    """An op to compute the length of a sequence, from input shape of [batch_size, n_step(max)],
-    it can be used when the features of padding (on right hand side) are all zeros.
-
-    Parameters
-    -----------
-    data : tensor
-        [batch_size, n_step(max)] with zero padding on right hand side.
-
-    Examples
-    --------
-    >>> data = [[1,2,0,0,0],
-    ...         [1,2,3,0,0],
-    ...         [1,2,6,1,0]]
-    >>> o = retrieve_seq_length_op2(data)
-    >>> sess = tf.InteractiveSession()
-    >>> tl.layers.initialize_global_variables(sess)
-    >>> print(o.eval())
-    ... [2 3 4]
-    """
-    return tf.reduce_sum(tf.cast(tf.greater(data, tf.zeros_like(data)), tf.int32), 1)
-
-
-def retrieve_seq_length_op3(data, pad_val=0): # HangSheng: return tensor for sequence length, if input is tf.string
-    data_shape_size = data.get_shape().ndims
-    if data_shape_size == 3:
-        return tf.reduce_sum(tf.cast(tf.reduce_any(tf.not_equal(data, pad_val), axis=2), dtype=tf.int32), 1)
-    elif data_shape_size == 2:
-        return tf.reduce_sum(tf.cast(tf.not_equal(data, pad_val), dtype=tf.int32), 1)
-    elif data_shape_size == 1:
-        raise ValueError("retrieve_seq_length_op3: data has wrong shape!")
-    else:
-        raise ValueError("retrieve_seq_length_op3: handling data_shape_size %s hasn't been implemented!" % (data_shape_size))
-
-
-def target_mask_op(data, pad_val=0):        # HangSheng: return tensor for mask,if input is tf.string
-    data_shape_size = data.get_shape().ndims
-    if data_shape_size == 3:
-        return tf.cast(tf.reduce_any(tf.not_equal(data, pad_val), axis=2), dtype=tf.int32)
-    elif data_shape_size == 2:
-        return tf.cast(tf.not_equal(data, pad_val), dtype=tf.int32)
-    elif data_shape_size == 1:
-        raise ValueError("target_mask_op: data has wrong shape!")
-    else:
-        raise ValueError("target_mask_op: handling data_shape_size %s hasn't been implemented!" % (data_shape_size))
-
-
-# Dynamic RNN
-class DynamicRNNLayer(Layer):
-    """
-    The :class:`DynamicRNNLayer` class is a Dynamic RNN layer, see ``tf.nn.dynamic_rnn``.
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    cell_fn : a TensorFlow's core RNN cell as follow (Note TF1.0+ and TF1.0- are different).
-        - see `RNN Cells in TensorFlow <https://www.tensorflow.org/api_docs/python/>`_
-    cell_init_args : a dictionary
-        The arguments for the cell initializer.
-    n_hidden : an int
-        The number of hidden units in the layer.
-    initializer : initializer
-        The initializer for initializing the parameters.
-    sequence_length : a tensor, array or None. The sequence length of each row of input data, see ``Advanced Ops for Dynamic RNN``.
-        - If None, it uses ``retrieve_seq_length_op`` to compute the sequence_length, i.e. when the features of padding (on right hand side) are all zeros.
-        - If using word embedding, you may need to compute the sequence_length from the ID array (the integer features before word embedding) by using ``retrieve_seq_length_op2`` or ``retrieve_seq_length_op``.
-        - You can also input an numpy array.
-        - More details about TensorFlow dynamic_rnn in `Wild-ML Blog <http://www.wildml.com/2016/08/rnns-in-tensorflow-a-practical-guide-and-undocumented-features/>`_.
-    initial_state : None or RNN State
-        If None, initial_state is zero_state.
-    dropout : `tuple` of `float`: (input_keep_prob, output_keep_prob).
-        The input and output keep probability.
-    n_layer : an int, default is 1.
-        The number of RNN layers.
-    return_last : boolean
-        - If True, return the last output, "Sequence input and single output"
-        - If False, return all outputs, "Synced sequence input and output"
-        - In other word, if you want to apply one or more RNN(s) on this layer, set to False.
-    return_seq_2d : boolean
-        - When return_last = False
-        - If True, return 2D Tensor [n_example, n_hidden], for stacking DenseLayer or computing cost after it.
-        - If False, return 3D Tensor [n_example/n_steps(max), n_steps(max), n_hidden], for stacking multiple RNN after it.
-    name : a string or None
-        An optional name to attach to this layer.
-
-    Variables
-    ------------
-    outputs : a tensor
-        The output of this RNN.
-        return_last = False, outputs = all cell_output, which is the hidden state.
-            cell_output.get_shape() = (?, n_hidden)
-
-    final_state : a tensor or StateTuple
-        When state_is_tuple = False,
-        it is the final hidden and cell states, states.get_shape() = [?, 2 * n_hidden].\n
-        When state_is_tuple = True, it stores two elements: (c, h), in that order.
-        You can get the final state after each iteration during training, then
-        feed it to the initial state of next iteration.
-
-    initial_state : a tensor or StateTuple
-        It is the initial state of this RNN layer, you can use it to initialize
-        your state at the begining of each epoch or iteration according to your
-        training procedure.
-
-    sequence_length : a tensor or array, shape = [batch_size]
-        The sequence lengths computed by Advanced Opt or the given sequence lengths.
-
-    Notes
-    -----
-    Input dimension should be rank 3 : [batch_size, n_steps(max), n_features], if no, please see :class:`ReshapeLayer`.
-
-    Examples
-    --------
-    >>> input_seqs = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="input_seqs")
-    >>> net = tl.layers.EmbeddingInputlayer(
-    ...             inputs = input_seqs,
-    ...             vocabulary_size = vocab_size,
-    ...             embedding_size = embedding_size,
-    ...             name = 'seq_embedding')
-    >>> net = tl.layers.DynamicRNNLayer(net,
-    ...             cell_fn = tf.contrib.rnn.BasicLSTMCell, # for TF0.2 tf.nn.rnn_cell.BasicLSTMCell,
-    ...             n_hidden = embedding_size,
-    ...             dropout = 0.7,
-    ...             sequence_length = tl.layers.retrieve_seq_length_op2(input_seqs),
-    ...             return_seq_2d = True,     # stack denselayer or compute cost after it
-    ...             name = 'dynamic_rnn')
-    ... net = tl.layers.DenseLayer(net, n_units=vocab_size,
-    ...             act=tf.identity, name="output")
-
-    References
-    ----------
-    - `Wild-ML Blog <http://www.wildml.com/2016/08/rnns-in-tensorflow-a-practical-guide-and-undocumented-features/>`_
-    - `dynamic_rnn.ipynb <https://github.com/dennybritz/tf-rnn/blob/master/dynamic_rnn.ipynb>`_
-    - `tf.nn.dynamic_rnn <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.nn.dynamic_rnn.md>`_
-    - `tflearn rnn <https://github.com/tflearn/tflearn/blob/master/tflearn/layers/recurrent.py>`_
-    - ``tutorial_dynamic_rnn.py``
-    """
-    def __init__(
-        self,
-        layer = None,
-        cell_fn = None,#tf.nn.rnn_cell.LSTMCell,
-        cell_init_args = {'state_is_tuple' : True},
-        n_hidden = 256,
-        initializer = tf.random_uniform_initializer(-0.1, 0.1),
-        sequence_length = None,
-        initial_state = None,
-        dropout = None,
-        n_layer = 1,
-        return_last = False,
-        return_seq_2d = False,
-        dynamic_rnn_init_args={},
-        name = 'dyrnn_layer',
-    ):
-        Layer.__init__(self, name=name)
-        if cell_fn is None:
-            raise Exception("Please put in cell_fn")
-        if 'GRU' in cell_fn.__name__:
-            try:
-                cell_init_args.pop('state_is_tuple')
-            except:
-                pass
-        self.inputs = layer.outputs
-
-        print("  [TL] DynamicRNNLayer %s: n_hidden:%d, in_dim:%d in_shape:%s cell_fn:%s dropout:%s n_layer:%d" % (self.name, n_hidden,
-             self.inputs.get_shape().ndims, self.inputs.get_shape(), cell_fn.__name__, dropout, n_layer))
-
-        # Input dimension should be rank 3 [batch_size, n_steps(max), n_features]
-        try:
-            self.inputs.get_shape().with_rank(3)
-        except:
-            raise Exception("RNN : Input dimension should be rank 3 : [batch_size, n_steps(max), n_features]")
-
-        # Get the batch_size
-        fixed_batch_size = self.inputs.get_shape().with_rank_at_least(1)[0]
-        if fixed_batch_size.value:
-            batch_size = fixed_batch_size.value
-            print("       batch_size (concurrent processes): %d" % batch_size)
-        else:
-            from tensorflow.python.ops import array_ops
-            batch_size = array_ops.shape(self.inputs)[0]
-            print("       non specified batch_size, uses a tensor instead.")
-        self.batch_size = batch_size
-
-        # Creats the cell function
-        # cell_instance_fn=lambda: cell_fn(num_units=n_hidden, **cell_init_args) # HanSheng
-        rnn_creator = lambda: cell_fn(num_units=n_hidden, **cell_init_args)
-
-        # Apply dropout
-        if dropout:
-            if type(dropout) in [tuple, list]:
-                in_keep_prob = dropout[0]
-                out_keep_prob = dropout[1]
-            elif isinstance(dropout, float):
-                in_keep_prob, out_keep_prob = dropout, dropout
-            else:
-                raise Exception("Invalid dropout type (must be a 2-D tuple of "
-                                "float)")
-            try: # TF1.0
-                DropoutWrapper_fn = tf.contrib.rnn.DropoutWrapper
-            except:
-                DropoutWrapper_fn = tf.nn.rnn_cell.DropoutWrapper
-
-            # cell_instance_fn1=cell_instance_fn        # HanSheng
-            # cell_instance_fn=DropoutWrapper_fn(
-            #                     cell_instance_fn1(),
-            #                     input_keep_prob=in_keep_prob,
-            #                     output_keep_prob=out_keep_prob)
-            cell_creator = lambda: DropoutWrapper_fn(rnn_creator(),
-                      input_keep_prob=in_keep_prob, output_keep_prob=1.0)#out_keep_prob)
-        else:
-            cell_creator = rnn_creator
-        self.cell = cell_creator()
-        # Apply multiple layers
-        if n_layer > 1:
-            try:
-                MultiRNNCell_fn = tf.contrib.rnn.MultiRNNCell
-            except:
-                MultiRNNCell_fn = tf.nn.rnn_cell.MultiRNNCell
-
-            # cell_instance_fn2=cell_instance_fn # HanSheng
-            try:
-                # cell_instance_fn=lambda: MultiRNNCell_fn([cell_instance_fn2() for _ in range(n_layer)], state_is_tuple=True) # HanSheng
-                self.cell = MultiRNNCell_fn([cell_creator() for _ in range(n_layer)], state_is_tuple=True)
-            except: # when GRU
-                # cell_instance_fn=lambda: MultiRNNCell_fn([cell_instance_fn2() for _ in range(n_layer)]) # HanSheng
-                self.cell = MultiRNNCell_fn([cell_creator() for _ in range(n_layer)])
-
-        if dropout:
-            self.cell = DropoutWrapper_fn(self.cell,
-                      input_keep_prob=1.0, output_keep_prob=out_keep_prob)
-
-        # self.cell=cell_instance_fn() # HanSheng
-
-        # Initialize initial_state
-        if initial_state is None:
-            self.initial_state = self.cell.zero_state(batch_size, dtype=tf.float32)
-        else:
-            self.initial_state = initial_state
-
-        # Computes sequence_length
-        if sequence_length is None:
-            try: ## TF1.0
-                sequence_length = retrieve_seq_length_op(
-                            self.inputs if isinstance(self.inputs, tf.Tensor) else tf.stack(self.inputs))
-            except: ## TF0.12
-                sequence_length = retrieve_seq_length_op(
-                            self.inputs if isinstance(self.inputs, tf.Tensor) else tf.pack(self.inputs))
-
-        # Main - Computes outputs and last_states
-        with tf.variable_scope(name, initializer=initializer) as vs:
-            outputs, last_states = tf.nn.dynamic_rnn(
-                cell=self.cell,
-                # inputs=X
-                inputs = self.inputs,
-                # dtype=tf.float64,
-                sequence_length=sequence_length,
-                initial_state = self.initial_state,
-                **dynamic_rnn_init_args
-                )
-            rnn_variables = tf.get_collection(TF_GRAPHKEYS_VARIABLES, scope=vs.name)
-
-            # print("     n_params : %d" % (len(rnn_variables)))
-            # Manage the outputs
-            if return_last:
-                # [batch_size, n_hidden]
-                # outputs = tf.transpose(tf.pack(outputs), [1, 0, 2]) # TF1.0 tf.pack --> tf.stack
-                self.outputs = advanced_indexing_op(outputs, sequence_length)
-            else:
-                # [batch_size, n_step(max), n_hidden]
-                # self.outputs = result[0]["outputs"]
-                # self.outputs = outputs    # it is 3d, but it is a list
-                if return_seq_2d:
-                    # PTB tutorial:
-                    # 2D Tensor [n_example, n_hidden]
-                    try: # TF1.0
-                        self.outputs = tf.reshape(tf.concat(outputs, 1), [-1, n_hidden])
-                    except: # TF0.12
-                        self.outputs = tf.reshape(tf.concat(1, outputs), [-1, n_hidden])
-                else:
-                    # <akara>:
-                    # 3D Tensor [batch_size, n_steps(max), n_hidden]
-                    max_length = tf.shape(outputs)[1]
-                    batch_size = tf.shape(outputs)[0]
-
-
-                    try: # TF1.0
-                        self.outputs = tf.reshape(tf.concat(outputs, 1), [batch_size, max_length, n_hidden])
-                    except: # TF0.12
-                        self.outputs = tf.reshape(tf.concat(1, outputs), [batch_size, max_length, n_hidden])
-                    # self.outputs = tf.reshape(tf.concat(1, outputs), [-1, max_length, n_hidden])
-
-        # Final state
-        self.final_state = last_states
-
-        self.sequence_length = sequence_length
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-
-        self.all_layers.extend( [self.outputs] )
-        self.all_params.extend( rnn_variables )
-
-# Bidirectional Dynamic RNN
-class BiDynamicRNNLayer(Layer):
-    """
-    The :class:`BiDynamicRNNLayer` class is a RNN layer, you can implement vanilla RNN,
-    LSTM and GRU with it.
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    cell_fn : a TensorFlow's core RNN cell as follow (Note TF1.0+ and TF1.0- are different).
-        - see `RNN Cells in TensorFlow <https://www.tensorflow.org/api_docs/python/>`_
-    cell_init_args : a dictionary
-        The arguments for the cell initializer.
-    n_hidden : an int
-        The number of hidden units in the layer.
-    initializer : initializer
-        The initializer for initializing the parameters.
-    sequence_length : a tensor, array or None
-        The sequence length of each row of input data, see ``Advanced Ops for Dynamic RNN``.
-            - If None, it uses ``retrieve_seq_length_op`` to compute the sequence_length, i.e. when the features of padding (on right hand side) are all zeros.
-            - If using word embedding, you may need to compute the sequence_length from the ID array (the integer features before word embedding) by using ``retrieve_seq_length_op2`` or ``retrieve_seq_length_op``.
-            - You can also input an numpy array.
-            - More details about TensorFlow dynamic_rnn in `Wild-ML Blog <http://www.wildml.com/2016/08/rnns-in-tensorflow-a-practical-guide-and-undocumented-features/>`_.
-    fw_initial_state : None or forward RNN State
-        If None, initial_state is zero_state.
-    bw_initial_state : None or backward RNN State
-        If None, initial_state is zero_state.
-    dropout : `tuple` of `float`: (input_keep_prob, output_keep_prob).
-        The input and output keep probability.
-    n_layer : an int, default is 1.
-        The number of RNN layers.
-    return_last : boolean
-        If True, return the last output, "Sequence input and single output"\n
-        If False, return all outputs, "Synced sequence input and output"\n
-        In other word, if you want to apply one or more RNN(s) on this layer, set to False.
-    return_seq_2d : boolean
-        - When return_last = False
-        - If True, return 2D Tensor [n_example, 2 * n_hidden], for stacking DenseLayer or computing cost after it.
-        - If False, return 3D Tensor [n_example/n_steps(max), n_steps(max), 2 * n_hidden], for stacking multiple RNN after it.
-    name : a string or None
-        An optional name to attach to this layer.
-
-    Variables
-    -----------------------
-    outputs : a tensor
-        The output of this RNN.
-        return_last = False, outputs = all cell_output, which is the hidden state.
-            cell_output.get_shape() = (?, 2 * n_hidden)
-
-    fw(bw)_final_state : a tensor or StateTuple
-        When state_is_tuple = False,
-        it is the final hidden and cell states, states.get_shape() = [?, 2 * n_hidden].\n
-        When state_is_tuple = True, it stores two elements: (c, h), in that order.
-        You can get the final state after each iteration during training, then
-        feed it to the initial state of next iteration.
-
-    fw(bw)_initial_state : a tensor or StateTuple
-        It is the initial state of this RNN layer, you can use it to initialize
-        your state at the begining of each epoch or iteration according to your
-        training procedure.
-
-    sequence_length : a tensor or array, shape = [batch_size]
-        The sequence lengths computed by Advanced Opt or the given sequence lengths.
-
-    Notes
-    -----
-    Input dimension should be rank 3 : [batch_size, n_steps(max), n_features], if no, please see :class:`ReshapeLayer`.
-
-
-    References
-    ----------
-    - `Wild-ML Blog <http://www.wildml.com/2016/08/rnns-in-tensorflow-a-practical-guide-and-undocumented-features/>`_
-    - `bidirectional_rnn.ipynb <https://github.com/dennybritz/tf-rnn/blob/master/bidirectional_rnn.ipynb>`_
-    """
-    def __init__(
-        self,
-        layer = None,
-        cell_fn = None,#tf.nn.rnn_cell.LSTMCell,
-        cell_init_args = {'state_is_tuple':True},
-        n_hidden = 256,
-        initializer = tf.random_uniform_initializer(-0.1, 0.1),
-        sequence_length = None,
-        fw_initial_state = None,
-        bw_initial_state = None,
-        dropout = None,
-        n_layer = 1,
-        return_last = False,
-        return_seq_2d = False,
-        dynamic_rnn_init_args={},
-        name = 'bi_dyrnn_layer',
-    ):
-        Layer.__init__(self, name=name)
-        if cell_fn is None:
-            raise Exception("Please put in cell_fn")
-        if 'GRU' in cell_fn.__name__:
-            try:
-                cell_init_args.pop('state_is_tuple')
-            except:
-                pass
-        self.inputs = layer.outputs
-
-        print("  [TL] BiDynamicRNNLayer %s: n_hidden:%d in_dim:%d in_shape:%s cell_fn:%s dropout:%s n_layer:%d" %
-              (self.name, n_hidden, self.inputs.get_shape().ndims, self.inputs.get_shape(), cell_fn.__name__, dropout, n_layer))
-
-        # Input dimension should be rank 3 [batch_size, n_steps(max), n_features]
-        try:
-            self.inputs.get_shape().with_rank(3)
-        except:
-            raise Exception("RNN : Input dimension should be rank 3 : [batch_size, n_steps(max), n_features]")
-
-        # Get the batch_size
-        fixed_batch_size = self.inputs.get_shape().with_rank_at_least(1)[0]
-        if fixed_batch_size.value:
-            batch_size = fixed_batch_size.value
-            print("       batch_size (concurrent processes): %d" % batch_size)
-        else:
-            from tensorflow.python.ops import array_ops
-            batch_size = array_ops.shape(self.inputs)[0]
-            print("       non specified batch_size, uses a tensor instead.")
-        self.batch_size = batch_size
-
-        with tf.variable_scope(name, initializer=initializer) as vs:
-            # Creats the cell function
-            # cell_instance_fn=lambda: cell_fn(num_units=n_hidden, **cell_init_args) # HanSheng
-            rnn_creator = lambda: cell_fn(num_units=n_hidden, **cell_init_args)
-
-            # Apply dropout
-            if dropout:
-                if type(dropout) in [tuple, list]:
-                    in_keep_prob = dropout[0]
-                    out_keep_prob = dropout[1]
-                elif isinstance(dropout, float):
-                    in_keep_prob, out_keep_prob = dropout, dropout
-                else:
-                    raise Exception("Invalid dropout type (must be a 2-D tuple of "
-                                    "float)")
-                try:
-                    DropoutWrapper_fn = tf.contrib.rnn.DropoutWrapper
-                except:
-                    DropoutWrapper_fn = tf.nn.rnn_cell.DropoutWrapper
-
-                    # cell_instance_fn1=cell_instance_fn            # HanSheng
-                    # cell_instance_fn=lambda: DropoutWrapper_fn(
-                    #                     cell_instance_fn1(),
-                    #                     input_keep_prob=in_keep_prob,
-                    #                     output_keep_prob=out_keep_prob)
-                cell_creator = lambda: DropoutWrapper_fn(rnn_creator(),
-                                                         input_keep_prob=in_keep_prob,
-                                                         output_keep_prob=1.0)  # out_keep_prob)
-            else:
-                cell_creator = rnn_creator
-            self.fw_cell = cell_creator()
-            self.bw_cell = cell_creator()
-            # Apply multiple layers
-            if n_layer > 1:
-                try:
-                    MultiRNNCell_fn = tf.contrib.rnn.MultiRNNCell
-                except:
-                    MultiRNNCell_fn = tf.nn.rnn_cell.MultiRNNCell
-
-                # cell_instance_fn2=cell_instance_fn            # HanSheng
-                # cell_instance_fn=lambda: MultiRNNCell_fn([cell_instance_fn2() for _ in range(n_layer)])
-                self.fw_cell = MultiRNNCell_fn([cell_creator() for _ in range(n_layer)])
-                self.bw_cell = MultiRNNCell_fn([cell_creator() for _ in range(n_layer)])
-
-            if dropout:
-                self.fw_cell = DropoutWrapper_fn(self.fw_cell,
-                          input_keep_prob=1.0, output_keep_prob=out_keep_prob)
-                self.bw_cell = DropoutWrapper_fn(self.bw_cell,
-                          input_keep_prob=1.0, output_keep_prob=out_keep_prob)
-
-            # self.fw_cell=cell_instance_fn()
-            # self.bw_cell=cell_instance_fn()
-            # Initial state of RNN
-            if fw_initial_state is None:
-                self.fw_initial_state = self.fw_cell.zero_state(self.batch_size, dtype=tf.float32)
-            else:
-                self.fw_initial_state = fw_initial_state
-            if bw_initial_state is None:
-                self.bw_initial_state = self.bw_cell.zero_state(self.batch_size, dtype=tf.float32)
-            else:
-                self.bw_initial_state = bw_initial_state
-            # Computes sequence_length
-            if sequence_length is None:
-                try: ## TF1.0
-                    sequence_length = retrieve_seq_length_op(
-                        self.inputs if isinstance(self.inputs, tf.Tensor) else tf.stack(self.inputs))
-                except: ## TF0.12
-                    sequence_length = retrieve_seq_length_op(
-                        self.inputs if isinstance(self.inputs, tf.Tensor) else tf.pack(self.inputs))
-
-            outputs, (states_fw, states_bw) = tf.nn.bidirectional_dynamic_rnn(
-                cell_fw=self.fw_cell,
-                cell_bw=self.bw_cell,
-                inputs=self.inputs,
-                sequence_length=sequence_length,
-                initial_state_fw=self.fw_initial_state,
-                initial_state_bw=self.bw_initial_state,
-                **dynamic_rnn_init_args
-            )
-            rnn_variables = tf.get_collection(TF_GRAPHKEYS_VARIABLES, scope=vs.name)
-
-            print("     n_params : %d" % (len(rnn_variables)))
-            # Manage the outputs
-            try: # TF1.0
-                outputs = tf.concat(outputs, 2)
-            except: # TF0.12
-                outputs = tf.concat(2, outputs)
-            if return_last:
-                # [batch_size, 2 * n_hidden]
-                self.outputs = advanced_indexing_op(outputs, sequence_length)
-            else:
-                # [batch_size, n_step(max), 2 * n_hidden]
-                if return_seq_2d:
-                    # PTB tutorial:
-                    # 2D Tensor [n_example, 2 * n_hidden]
-                    try: # TF1.0
-                        self.outputs = tf.reshape(tf.concat(outputs, 1), [-1, 2 * n_hidden])
-                    except: # TF0.12
-                        self.outputs = tf.reshape(tf.concat(1, outputs), [-1, 2 * n_hidden])
-                else:
-                    # <akara>:
-                    # 3D Tensor [batch_size, n_steps(max), 2 * n_hidden]
-                    max_length = tf.shape(outputs)[1]
-                    batch_size = tf.shape(outputs)[0]
-                    try: # TF1.0
-                        self.outputs = tf.reshape(tf.concat(outputs, 1), [batch_size, max_length, 2 * n_hidden])
-                    except: # TF0.12
-                        self.outputs = tf.reshape(tf.concat(1, outputs), [batch_size, max_length, 2 * n_hidden])
-                    # self.outputs = tf.reshape(tf.concat(1, outputs), [-1, max_length, 2 * n_hidden])
-
-        # Final state
-        self.fw_final_states = states_fw
-        self.bw_final_states = states_bw
-
-        self.sequence_length = sequence_length
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-
-        self.all_layers.extend( [self.outputs] )
-        self.all_params.extend( rnn_variables )
-
-# Seq2seq
-class Seq2Seq(Layer):
-    """
-    The :class:`Seq2Seq` class is a Simple :class:`DynamicRNNLayer` based Seq2seq layer without using `tl.contrib.seq2seq <https://www.tensorflow.org/api_guides/python/contrib.seq2seq>`_.
-    See `Model <https://camo.githubusercontent.com/242210d7d0151cae91107ee63bff364a860db5dd/687474703a2f2f6936342e74696e797069632e636f6d2f333031333674652e706e67>`_
-    and `Sequence to Sequence Learning with Neural Networks <https://arxiv.org/abs/1409.3215>`_.
-
-    - Please check the example `Twitter Chatbot <>`_.
-    - The Author recommends users to read the source code of :class:`DynamicRNNLayer` and :class:`Seq2Seq`.
-
-    Parameters
-    ----------
-    net_encode_in : a :class:`Layer` instance
-        Encode sequences, [batch_size, None, n_features].
-    net_decode_in : a :class:`Layer` instance
-        Decode sequences, [batch_size, None, n_features].
-    cell_fn : a TensorFlow's core RNN cell as follow (Note TF1.0+ and TF1.0- are different).
-        - see `RNN Cells in TensorFlow <https://www.tensorflow.org/api_docs/python/>`_
-    cell_init_args : a dictionary
-        The arguments for the cell initializer.
-    n_hidden : an int
-        The number of hidden units in the layer.
-    initializer : initializer
-        The initializer for initializing the parameters.
-    encode_sequence_length : tensor for encoder sequence length, see :class:`DynamicRNNLayer` .
-    decode_sequence_length : tensor for decoder sequence length, see :class:`DynamicRNNLayer` .
-    initial_state_encode : None or RNN state (from placeholder or other RNN).
-        If None, initial_state_encode is of zero state.
-    initial_state_decode : None or RNN state (from placeholder or other RNN).
-        If None, initial_state_decode is of the final state of the RNN encoder.
-    dropout : `tuple` of `float`: (input_keep_prob, output_keep_prob).
-        The input and output keep probability.
-    n_layer : an int, default is 1.
-        The number of RNN layers.
-    return_seq_2d : boolean
-        - When return_last = False
-        - If True, return 2D Tensor [n_example, n_hidden], for stacking DenseLayer or computing cost after it.
-        - If False, return 3D Tensor [n_example/n_steps(max), n_steps(max), n_hidden], for stacking multiple RNN after it.
-    name : a string or None
-        An optional name to attach to this layer.
-
-    Variables
-    ------------
-    outputs : a tensor
-        The output of RNN decoder.
-    initial_state_encode : a tensor or StateTuple
-        Initial state of RNN encoder.
-    initial_state_decode : a tensor or StateTuple
-        Initial state of RNN decoder.
-    final_state_encode : a tensor or StateTuple
-        Final state of RNN encoder.
-    final_state_decode : a tensor or StateTuple
-        Final state of RNN decoder.
-
-    Notes
-    --------
-    - How to feed data: `Sequence to Sequence Learning with Neural Networks <https://arxiv.org/pdf/1409.3215v3.pdf>`_
-    - input_seqs : ``['how', 'are', 'you', '<PAD_ID'>]``
-    - decode_seqs : ``['<START_ID>', 'I', 'am', 'fine', '<PAD_ID'>]``
-    - target_seqs : ``['I', 'am', 'fine', '<END_ID', '<PAD_ID'>]``
-    - target_mask : ``[1, 1, 1, 1, 0]``
-    - related functions : tl.prepro <pad_sequences, precess_sequences, sequences_add_start_id, sequences_get_mask>
-
-    Examples
-    ----------
-    >>> from tensorlayer.layers import *
-    >>> batch_size = 32
-    >>> encode_seqs = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="encode_seqs")
-    >>> decode_seqs = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="decode_seqs")
-    >>> target_seqs = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="target_seqs")
-    >>> target_mask = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="target_mask") # tl.prepro.sequences_get_mask()
-    >>> with tf.variable_scope("model"):
-    ...     # for chatbot, you can use the same embedding layer,
-    ...     # for translation, you may want to use 2 seperated embedding layers
-    >>>     with tf.variable_scope("embedding") as vs:
-    >>>         net_encode = EmbeddingInputlayer(
-    ...                 inputs = encode_seqs,
-    ...                 vocabulary_size = 10000,
-    ...                 embedding_size = 200,
-    ...                 name = 'seq_embedding')
-    >>>         vs.reuse_variables()
-    >>>         tl.layers.set_name_reuse(True)
-    >>>         net_decode = EmbeddingInputlayer(
-    ...                 inputs = decode_seqs,
-    ...                 vocabulary_size = 10000,
-    ...                 embedding_size = 200,
-    ...                 name = 'seq_embedding')
-    >>>     net = Seq2Seq(net_encode, net_decode,
-    ...             cell_fn = tf.contrib.rnn.BasicLSTMCell,
-    ...             n_hidden = 200,
-    ...             initializer = tf.random_uniform_initializer(-0.1, 0.1),
-    ...             encode_sequence_length = retrieve_seq_length_op2(encode_seqs),
-    ...             decode_sequence_length = retrieve_seq_length_op2(decode_seqs),
-    ...             initial_state_encode = None,
-    ...             dropout = None,
-    ...             n_layer = 1,
-    ...             return_seq_2d = True,
-    ...             name = 'seq2seq')
-    >>> net_out = DenseLayer(net, n_units=10000, act=tf.identity, name='output')
-    >>> e_loss = tl.cost.cross_entropy_seq_with_mask(logits=net_out.outputs, target_seqs=target_seqs, input_mask=target_mask, return_details=False, name='cost')
-    >>> y = tf.nn.softmax(net_out.outputs)
-    >>> net_out.print_params(False)
-
-
-    """
-    def __init__(
-        self,
-        net_encode_in = None,
-        net_decode_in = None,
-        cell_fn = None,#tf.nn.rnn_cell.LSTMCell,
-        cell_init_args = {'state_is_tuple':True},
-        n_hidden = 256,
-        initializer = tf.random_uniform_initializer(-0.1, 0.1),
-        encode_sequence_length = None,
-        decode_sequence_length = None,
-        initial_state_encode = None,
-        initial_state_decode = None,
-        dropout = None,
-        n_layer = 1,
-        # return_last = False,
-        return_seq_2d = False,
-        name = 'seq2seq',
-    ):
-        Layer.__init__(self, name=name)
-        if cell_fn is None:
-            raise Exception("Please put in cell_fn")
-        if 'GRU' in cell_fn.__name__:
-            try:
-                cell_init_args.pop('state_is_tuple')
-            except:
-                pass
-        # self.inputs = layer.outputs
-        print("  [**] Seq2Seq %s: n_hidden:%d cell_fn:%s dropout:%s n_layer:%d" %
-              (self.name, n_hidden, cell_fn.__name__, dropout, n_layer))
-
-        with tf.variable_scope(name) as vs:#, reuse=reuse):
-            # tl.layers.set_name_reuse(reuse)
-            # network = InputLayer(self.inputs, name=name+'/input')
-            network_encode = DynamicRNNLayer(net_encode_in,
-                     cell_fn = cell_fn,
-                     cell_init_args = cell_init_args,
-                     n_hidden = n_hidden,
-                     initial_state = initial_state_encode,
-                     dropout = dropout,
-                     n_layer = n_layer,
-                     sequence_length = encode_sequence_length,
-                     return_last = False,
-                     return_seq_2d = True,
-                     name = name+'_encode')
-            # vs.reuse_variables()
-            # tl.layers.set_name_reuse(True)
-            network_decode = DynamicRNNLayer(net_decode_in,
-                     cell_fn = cell_fn,
-                     cell_init_args = cell_init_args,
-                     n_hidden = n_hidden,
-                     initial_state = (network_encode.final_state if initial_state_decode is None else initial_state_decode),
-                     dropout = dropout,
-                     n_layer = n_layer,
-                     sequence_length = decode_sequence_length,
-                     return_last = False,
-                     return_seq_2d = return_seq_2d,
-                     name = name+'_decode')
-            self.outputs = network_decode.outputs
-
-            rnn_variables = tf.get_collection(TF_GRAPHKEYS_VARIABLES, scope=vs.name)
-
-        # Initial state
-        self.initial_state_encode = network_encode.initial_state
-        self.initial_state_decode = network_decode.initial_state
-
-        # Final state
-        self.final_state_encode = network_encode.final_state
-        self.final_state_decode = network_decode.final_state
-
-        # self.sequence_length = sequence_length
-        self.all_layers = list(network_decode.all_layers)
-        self.all_params = list(network_decode.all_params)
-        self.all_drop = dict(network_decode.all_drop)
-
-        self.all_layers.extend( [self.outputs] )
-        self.all_params.extend( rnn_variables )
-
-        self.all_layers = list_remove_repeat(self.all_layers)
-        self.all_params = list_remove_repeat(self.all_params)
-
-class PeekySeq2Seq(Layer):
-    """
-    Waiting for contribution.
-    The :class:`PeekySeq2Seq` class, see `Model <https://camo.githubusercontent.com/7f690d451036938a51e62feb77149c8bb4be6675/687474703a2f2f6936342e74696e797069632e636f6d2f333032617168692e706e67>`_
-    and `Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation <https://arxiv.org/abs/1406.1078>`_ .
-    """
-    def __init__(
-        self,
-        net_encode_in = None,
-        net_decode_in = None,
-        cell_fn = None,#tf.nn.rnn_cell.LSTMCell,
-        cell_init_args = {'state_is_tuple':True},
-        n_hidden = 256,
-        initializer = tf.random_uniform_initializer(-0.1, 0.1),
-        in_sequence_length = None,
-        out_sequence_length = None,
-        initial_state = None,
-        dropout = None,
-        n_layer = 1,
-        # return_last = False,
-        return_seq_2d = False,
-        name = 'peeky_seq2seq',
-    ):
-        Layer.__init__(self, name=name)
-        if cell_fn is None:
-            raise Exception("Please put in cell_fn")
-        # self.inputs = layer.outputs
-        print("  [TL] PeekySeq2seq %s: n_hidden:%d cell_fn:%s dropout:%s n_layer:%d" %
-              (self.name, n_hidden, cell_fn.__name__, dropout, n_layer))
-
-class AttentionSeq2Seq(Layer):
-    """
-    Waiting for contribution.
-    The :class:`AttentionSeq2Seq` class, see `Model <https://camo.githubusercontent.com/0e2e4e5fb2dd47846c2fe027737a5df5e711df1b/687474703a2f2f6936342e74696e797069632e636f6d2f6132727733642e706e67>`_
-    and `Neural Machine Translation by Jointly Learning to Align and Translate <https://arxiv.org/pdf/1409.0473v6.pdf>`_ .
-    """
-    def __init__(
-        self,
-        net_encode_in = None,
-        net_decode_in = None,
-        cell_fn = None,#tf.nn.rnn_cell.LSTMCell,
-        cell_init_args = {'state_is_tuple':True},
-        n_hidden = 256,
-        initializer = tf.random_uniform_initializer(-0.1, 0.1),
-        in_sequence_length = None,
-        out_sequence_length = None,
-        initial_state = None,
-        dropout = None,
-        n_layer = 1,
-        # return_last = False,
-        return_seq_2d = False,
-        name = 'attention_seq2seq',
-    ):
-        Layer.__init__(self, name=name)
-        if cell_fn is None:
-            raise Exception("Please put in cell_fn")
-        # self.inputs = layer.outputs
-        print("  [TL] PeekySeq2seq %s: n_hidden:%d cell_fn:%s dropout:%s n_layer:%d" %
-              (self.name, n_hidden, cell_fn.__name__, dropout, n_layer))
-
-## Shape layer
-class FlattenLayer(Layer):
-    """
-    The :class:`FlattenLayer` class is layer which reshape high-dimension
-    input to a vector. Then we can apply DenseLayer, RNNLayer, ConcatLayer and
-    etc on the top of it.
-
-    [batch_size, mask_row, mask_col, n_mask] ---> [batch_size, mask_row * mask_col * n_mask]
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    name : a string or None
-        An optional name to attach to this layer.
-
-    Examples
-    --------
-    >>> x = tf.placeholder(tf.float32, shape=[None, 28, 28, 1])
-    >>> net = tl.layers.InputLayer(x, name='input_layer')
-    >>> net = tl.layers.Conv2dLayer(net,
-    ...                    act = tf.nn.relu,
-    ...                    shape = [5, 5, 32, 64],
-    ...                    strides=[1, 1, 1, 1],
-    ...                    padding='SAME',
-    ...                    name ='cnn_layer')
-    >>> net = tl.layers.Pool2dLayer(net,
-    ...                    ksize=[1, 2, 2, 1],
-    ...                    strides=[1, 2, 2, 1],
-    ...                    padding='SAME',
-    ...                    pool = tf.nn.max_pool,
-    ...                    name ='pool_layer',)
-    >>> net = tl.layers.FlattenLayer(net, name='flatten_layer')
-    """
-    def __init__(
-        self,
-        layer = None,
-        name ='flatten_layer',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-        self.outputs = flatten_reshape(self.inputs, name=name)
-        self.n_units = int(self.outputs.get_shape()[-1])
-        print("  [TL] FlattenLayer %s: %d" % (self.name, self.n_units))
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-
-class ReshapeLayer(Layer):
-    """
-    The :class:`ReshapeLayer` class is layer which reshape the tensor.
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    shape : a list
-        The output shape.
-    name : a string or None
-        An optional name to attach to this layer.
-
-    Examples
-    --------
-    - The core of this layer is ``tf.reshape``.
-    - Use TensorFlow only :
-    >>> x = tf.placeholder(tf.float32, shape=[None, 3])
-    >>> y = tf.reshape(x, shape=[-1, 3, 3])
-    >>> sess = tf.InteractiveSession()
-    >>> print(sess.run(y, feed_dict={x:[[1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5],[6,6,6]]}))
-    ... [[[ 1.  1.  1.]
-    ... [ 2.  2.  2.]
-    ... [ 3.  3.  3.]]
-    ... [[ 4.  4.  4.]
-    ... [ 5.  5.  5.]
-    ... [ 6.  6.  6.]]]
-    """
-    def __init__(
-        self,
-        layer = None,
-        shape = [],
-        name ='reshape_layer',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-        self.outputs = tf.reshape(self.inputs, shape=shape, name=name)
-        print("  [TL] ReshapeLayer %s: %s" % (self.name, self.outputs.get_shape()))
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-
-class TransposeLayer(Layer):
-    """
-    The :class:`TransposeLayer` class transpose the dimension of a teneor, see `tf.transpose() <https://www.tensorflow.org/api_docs/python/tf/transpose>`_ .
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    perm: list, a permutation of the dimensions
-        Similar with numpy.transpose.
-    name : a string or None
-        An optional name to attach to this layer.
-    """
-    def __init__(
-        self,
-        layer = None,
-        perm = None,
-        name = 'transpose',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-        assert perm is not None
-
-        print("  [TL] TransposeLayer  %s: perm:%s" % (self.name, perm))
-        # with tf.variable_scope(name) as vs:
-        self.outputs = tf.transpose(self.inputs, perm=perm, name=name)
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-        # self.all_params.extend( variables )
-
-## Lambda
-class LambdaLayer(Layer):
-    """
-    The :class:`LambdaLayer` class is a layer which is able to use the provided function.
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    fn : a function
-        The function that applies to the outputs of previous layer.
-    fn_args : a dictionary
-        The arguments for the function (option).
-    name : a string or None
-        An optional name to attach to this layer.
-
-    Examples
-    ---------
-    >>> x = tf.placeholder(tf.float32, shape=[None, 1], name='x')
-    >>> net = tl.layers.InputLayer(x, name='input_layer')
-    >>> net = LambdaLayer(net, lambda x: 2*x, name='lambda_layer')
-    >>> y = net.outputs
-    >>> sess = tf.InteractiveSession()
-    >>> out = sess.run(y, feed_dict={x : [[1],[2]]})
-    ... [[2],[4]]
-    """
-    def __init__(
-        self,
-        layer = None,
-        fn = None,
-        fn_args = {},
-        name = 'lambda_layer',
-    ):
-        Layer.__init__(self, name=name)
-        assert layer is not None
-        assert fn is not None
-        self.inputs = layer.outputs
-        print("  [TL] LambdaLayer  %s" % self.name)
-        with tf.variable_scope(name) as vs:
-            self.outputs = fn(self.inputs, **fn_args)
-            variables = tf.get_collection(TF_GRAPHKEYS_VARIABLES, scope=vs.name)
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-        self.all_params.extend( variables )
-
-## Merge layer
-class ConcatLayer(Layer):
-    """
-    The :class:`ConcatLayer` class is layer which concat (merge) two or more tensor by given axis..
-
-    Parameters
-    ----------
-    layer : a list of :class:`Layer` instances
-        The `Layer` class feeding into this layer.
-    concat_dim : int
-        Dimension along which to concatenate.
-    name : a string or None
-        An optional name to attach to this layer.
-
-    Examples
-    --------
-    >>> sess = tf.InteractiveSession()
-    >>> x = tf.placeholder(tf.float32, shape=[None, 784])
-    >>> inputs = tl.layers.InputLayer(x, name='input_layer')
-    >>> net1 = tl.layers.DenseLayer(inputs, n_units=800, act = tf.nn.relu, name='relu1_1')
-    >>> net2 = tl.layers.DenseLayer(inputs, n_units=300, act = tf.nn.relu, name='relu2_1')
-    >>> net = tl.layers.ConcatLayer(layer = [net1, net2], name ='concat_layer')
-    ...     [TL] InputLayer input_layer (?, 784)
-    ...     [TL] DenseLayer relu1_1: 800, <function relu at 0x1108e41e0>
-    ...     [TL] DenseLayer relu2_1: 300, <function relu at 0x1108e41e0>
-    ...     [TL] ConcatLayer concat_layer, 1100
-    ...
-    >>> tl.layers.initialize_global_variables(sess)
-    >>> net.print_params()
-    ...     param 0: (784, 800) (mean: 0.000021, median: -0.000020 std: 0.035525)
-    ...     param 1: (800,) (mean: 0.000000, median: 0.000000 std: 0.000000)
-    ...     param 2: (784, 300) (mean: 0.000000, median: -0.000048 std: 0.042947)
-    ...     param 3: (300,) (mean: 0.000000, median: 0.000000 std: 0.000000)
-    ...     num of params: 863500
-    >>> net.print_layers()
-    ...     layer 0: Tensor("Relu:0", shape=(?, 800), dtype=float32)
-    ...     layer 1: Tensor("Relu_1:0", shape=(?, 300), dtype=float32)
-    ...
-    """
-    def __init__(
-        self,
-        layer = [],
-        concat_dim = 1,
-        name ='concat_layer',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = []
-        for l in layer:
-            self.inputs.append(l.outputs)
-        try: # TF1.0
-            self.outputs = tf.concat(self.inputs, concat_dim, name=name)
-        except: # TF0.12
-            self.outputs = tf.concat(concat_dim, self.inputs, name=name)
-
-        print("  [TL] ConcatLayer %s: axis: %d" % (self.name, concat_dim))
-
-        self.all_layers = list(layer[0].all_layers)
-        self.all_params = list(layer[0].all_params)
-        self.all_drop = dict(layer[0].all_drop)
-
-        for i in range(1, len(layer)):
-            self.all_layers.extend(list(layer[i].all_layers))
-            self.all_params.extend(list(layer[i].all_params))
-            self.all_drop.update(dict(layer[i].all_drop))
-
-        self.all_layers = list_remove_repeat(self.all_layers)
-        self.all_params = list_remove_repeat(self.all_params)
-        #self.all_drop = list_remove_repeat(self.all_drop) # it is a dict
-
-class ElementwiseLayer(Layer):
-    """
-    The :class:`ElementwiseLayer` class combines multiple :class:`Layer` which have the same output shapes by a given elemwise-wise operation.
-
-    Parameters
-    ----------
-    layer : a list of :class:`Layer` instances
-        The `Layer` class feeding into this layer.
-    combine_fn : a TensorFlow elemwise-merge function
-        e.g. AND is ``tf.minimum`` ;  OR is ``tf.maximum`` ; ADD is ``tf.add`` ; MUL is ``tf.multiply`` and so on.
-        See `TensorFlow Math API <https://www.tensorflow.org/versions/master/api_docs/python/math_ops.html#math>`_ .
-    name : a string or None
-        An optional name to attach to this layer.
-
-    Examples
-    --------
-    - AND Logic
-    >>> net_0 = tl.layers.DenseLayer(net_0, n_units=500,
-    ...                        act = tf.nn.relu, name='net_0')
-    >>> net_1 = tl.layers.DenseLayer(net_1, n_units=500,
-    ...                        act = tf.nn.relu, name='net_1')
-    >>> net_com = tl.layers.ElementwiseLayer(layer = [net_0, net_1],
-    ...                         combine_fn = tf.minimum,
-    ...                         name = 'combine_layer')
-    """
-    def __init__(
-        self,
-        layer = [],
-        combine_fn = tf.minimum,
-        name ='elementwise_layer',
-    ):
-        Layer.__init__(self, name=name)
-
-        print("  [TL] ElementwiseLayer %s: size:%s fn:%s" % (self.name, layer[0].outputs.get_shape(), combine_fn.__name__))
-
-        self.outputs = layer[0].outputs
-        # print(self.outputs._shape, type(self.outputs._shape))
-        for l in layer[1:]:
-            assert str(self.outputs.get_shape()) == str(l.outputs.get_shape()), "Hint: the input shapes should be the same. %s != %s" %  (self.outputs.get_shape() , str(l.outputs.get_shape()))
-            self.outputs = combine_fn(self.outputs, l.outputs, name=name)
-
-        self.all_layers = list(layer[0].all_layers)
-        self.all_params = list(layer[0].all_params)
-        self.all_drop = dict(layer[0].all_drop)
-
-        for i in range(1, len(layer)):
-            self.all_layers.extend(list(layer[i].all_layers))
-            self.all_params.extend(list(layer[i].all_params))
-            self.all_drop.update(dict(layer[i].all_drop))
-
-        self.all_layers = list_remove_repeat(self.all_layers)
-        self.all_params = list_remove_repeat(self.all_params)
-        # self.all_drop = list_remove_repeat(self.all_drop)
-
-## Extend
-class ExpandDimsLayer(Layer):
-    """
-    The :class:`ExpandDimsLayer` class inserts a dimension of 1 into a tensor's shape,
-    see `tf.expand_dims() <https://www.tensorflow.org/api_docs/python/array_ops/shapes_and_shaping#expand_dims>`_ .
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    axis : int, 0-D (scalar).
-        Specifies the dimension index at which to expand the shape of input.
-    name : a string or None
-        An optional name to attach to this layer.
-    """
-    def __init__(
-        self,
-        layer = None,
-        axis = None,
-        name = 'expand_dims',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-
-        print("  [TL] ExpandDimsLayer  %s: axis:%d" % (self.name, axis))
-        with tf.variable_scope(name) as vs:
-            try:    # TF12 TF1.0
-                self.outputs = tf.expand_dims(self.inputs, axis=axis)
-            except: # TF11
-                self.outputs = tf.expand_dims(self.inputs, dim=axis)
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-        # self.all_params.extend( variables )
-
-class TileLayer(Layer):
-    """
-    The :class:`TileLayer` class constructs a tensor by tiling a given tensor,
-    see `tf.tile() <https://www.tensorflow.org/api_docs/python/array_ops/slicing_and_joining#tile>`_ .
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    multiples: a list of int
-        Must be one of the following types: int32, int64. 1-D. Length must be the same as the number of dimensions in input
-    name : a string or None
-        An optional name to attach to this layer.
-    """
-    def __init__(
-        self,
-        layer = None,
-        multiples = None,
-        name = 'tile',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-
-        print("  [TL] TileLayer  %s: multiples:%s" % (self.name, multiples))
-        with tf.variable_scope(name) as vs:
-            self.outputs = tf.tile(self.inputs, multiples=multiples)
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-        # self.all_params.extend( variables )
-
-## Stack Unstack
-class StackLayer(Layer):
-    """
-    The :class:`StackLayer` class is layer for stacking a list of rank-R tensors into one rank-(R+1) tensor, see `tf.stack() <https://www.tensorflow.org/api_docs/python/tf/stack>`_.
-
-    Parameters
-    ----------
-    layer : a list of :class:`Layer` instances
-        The `Layer` class feeding into this layer.
-    axis : an int
-        Dimension along which to concatenate.
-    name : a string or None
-        An optional name to attach to this layer.
-    """
-    def __init__(
-        self,
-        layer = [],
-        axis = 0,
-        name ='stack',
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = []
-        for l in layer:
-            self.inputs.append(l.outputs)
-
-        self.outputs = tf.stack(self.inputs, axis=axis, name=name)
-
-        print("  [TL] StackLayer %s: axis: %d" % (self.name, axis))
-
-        self.all_layers = list(layer[0].all_layers)
-        self.all_params = list(layer[0].all_params)
-        self.all_drop = dict(layer[0].all_drop)
-
-        for i in range(1, len(layer)):
-            self.all_layers.extend(list(layer[i].all_layers))
-            self.all_params.extend(list(layer[i].all_params))
-            self.all_drop.update(dict(layer[i].all_drop))
-
-        self.all_layers = list_remove_repeat(self.all_layers)
-        self.all_params = list_remove_repeat(self.all_params)
-
-def UnStackLayer(
-        layer = None,
-        num = None,
-        axis = 0,
-        name ='unstack',):
-    """
-    The `UnStackLayer` is layer for unstacking the given dimension of a rank-R tensor into rank-(R-1) tensors., see `tf.unstack() <https://www.tensorflow.org/api_docs/python/tf/unstack>`_.
-
-    Parameters
-    ----------
-    layer : a list of :class:`Layer` instances
-        The `Layer` class feeding into this layer.
-    num : an int
-        The length of the dimension axis. Automatically inferred if None (the default).
-    axis : an int
-        Dimension along which to concatenate.
-    name : a string or None
-        An optional name to attach to this layer.
-
-    Returns
-    --------
-    The list of layer objects unstacked from the input.
-    """
-    inputs = layer.outputs
-    with tf.variable_scope(name) as vs:
-        outputs = tf.unstack(inputs, num=num, axis=axis)
-
-    print("  [TL] UnStackLayer %s: num: %s axis: %d, n_outputs: %d" % (name, num, axis, len(outputs)))
-
-    net_new = []
-    scope_name = tf.get_variable_scope().name
-    if scope_name:
-        whole_name = scope_name + '/' + name
-    else:
-        whole_name = name
-
-    for i in range(len(outputs)):
-        n = Layer(None, name=whole_name+str(i))
-        n.outputs = outputs[i]
-        n.all_layers = list(layer.all_layers)
-        n.all_params = list(layer.all_params)
-        n.all_drop = dict(layer.all_drop)
-        n.all_layers.extend( [inputs] )
-
-        net_new.append(n)
-
-    return net_new
-
-## TF-Slim layer
-class SlimNetsLayer(Layer):
-    """
-    The :class:`SlimNetsLayer` class can be used to merge all TF-Slim nets into
-    TensorLayer. Model can be found in `slim-model <https://github.com/tensorflow/models/tree/master/slim#pre-trained-models>`_ , more about slim
-    see `slim-git <https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/slim>`_ .
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    slim_layer : a slim network function
-        The network you want to stack onto, end with ``return net, end_points``.
-    slim_args : dictionary
-        The arguments for the slim model.
-    name : a string or None
-        An optional name to attach to this layer.
-
-    Examples
-    --------
-    - see Inception V3 example on `Github <https://github.com/zsdonghao/tensorlayer>`_
-
-    Notes
-    -----
-    The due to TF-Slim stores the layers as dictionary, the ``all_layers`` in this
-    network is not in order ! Fortunately, the ``all_params`` are in order.
-    """
-    def __init__(
-        self,
-        layer = None,
-        slim_layer = None,
-        slim_args = {},
-        name ='tfslim_layer',
-    ):
-        Layer.__init__(self, name=name)
-        assert slim_layer is not None
-        assert slim_args is not None
-        self.inputs = layer.outputs
-        print("  [TL] SlimNetsLayer %s: %s" % (self.name, slim_layer.__name__))
-
-        # with tf.variable_scope(name) as vs:
-        #     net, end_points = slim_layer(self.inputs, **slim_args)
-        #     slim_variables = tf.get_collection(TF_GRAPHKEYS_VARIABLES, scope=vs.name)
-
-        net, end_points = slim_layer(self.inputs, **slim_args)
-
-        slim_variables = tf.get_collection(TF_GRAPHKEYS_VARIABLES, scope=name)
-        if slim_variables == []:
-            print("No variables found under %s : the name of SlimNetsLayer should be matched with the begining of the ckpt file, see tutorial_inceptionV3_tfslim.py for more details" % name)
-
-
-        self.outputs = net
-
-        slim_layers = []
-        for v in end_points.values():
-            # tf.contrib.layers.summaries.summarize_activation(v)
-            slim_layers.append(v)
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-
-        self.all_layers.extend( slim_layers )
-        self.all_params.extend( slim_variables )
-
-## Keras layer
-class KerasLayer(Layer):
-    """
-    The :class:`KerasLayer` class can be used to merge all Keras layers into
-    TensorLayer. Example can be found here `tutorial_keras.py <https://github.com/zsdonghao/tensorlayer/blob/master/example/tutorial_keras.py>`_
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    keras_layer : a keras network function
-    keras_args : dictionary
-        The arguments for the keras model.
-    name : a string or None
-        An optional name to attach to this layer.
-    """
-    def __init__(
-        self,
-        layer = None,
-        keras_layer = None,
-        keras_args = {},
-        name ='keras_layer',
-    ):
-        Layer.__init__(self, name=name)
-        assert layer is not None
-        assert keras_layer is not None
-        self.inputs = layer.outputs
-        print("  [TL] KerasLayer %s: %s" % (self.name, keras_layer))
-        print("       This API will be removed, please use LambdaLayer instead.")
-        with tf.variable_scope(name) as vs:
-            self.outputs = keras_layer(self.inputs, **keras_args)
-            variables = tf.get_collection(TF_GRAPHKEYS_VARIABLES, scope=vs.name)
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-        self.all_params.extend( variables )
-
-## Estimator layer
-class EstimatorLayer(Layer):
-    """
-    The :class:`EstimatorLayer` class accepts ``model_fn`` that described the model.
-    It is similar with :class:`KerasLayer`, see `tutorial_keras.py <https://github.com/zsdonghao/tensorlayer/blob/master/example/tutorial_keras.py>`_
-
-    Parameters
-    ----------
-    layer : a :class:`Layer` instance
-        The `Layer` class feeding into this layer.
-    model_fn : a function that described the model.
-    args : dictionary
-        The arguments for the model_fn.
-    name : a string or None
-        An optional name to attach to this layer.
-    """
-    def __init__(
-        self,
-        layer = None,
-        model_fn = None,
-        args = {},
-        name ='estimator_layer',
-    ):
-        Layer.__init__(self, name=name)
-        assert layer is not None
-        assert model_fn is not None
-        self.inputs = layer.outputs
-        print("  [TL] EstimatorLayer %s: %s" % (self.name, model_fn))
-        print("       This API will be removed, please use LambdaLayer instead.")
-        with tf.variable_scope(name) as vs:
-            self.outputs = model_fn(self.inputs, **args)
-            variables = tf.get_collection(TF_GRAPHKEYS_VARIABLES, scope=vs.name)
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-        self.all_layers.extend( [self.outputs] )
-        self.all_params.extend( variables )
-
-## Special activation
-class PReluLayer(Layer):
-    """
-    The :class:`PReluLayer` class is Parametric Rectified Linear layer.
-
-    Parameters
-    ----------
-    x : A `Tensor` with type `float`, `double`, `int32`, `int64`, `uint8`,
-        `int16`, or `int8`.
-    channel_shared : `bool`. Single weight is shared by all channels
-    a_init : alpha initializer, default zero constant.
-        The initializer for initializing the alphas.
-    a_init_args : dictionary
-        The arguments for the weights initializer.
-    name : A name for this activation op (optional).
-
-    References
-    -----------
-    - `Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification <http://arxiv.org/pdf/1502.01852v1.pdf>`_
-    """
-    def __init__(
-        self,
-        layer = None,
-        channel_shared = False,
-        a_init = tf.constant_initializer(value=0.0),
-        a_init_args = {},
-        # restore = True,
-        name="prelu_layer"
-    ):
-        Layer.__init__(self, name=name)
-        self.inputs = layer.outputs
-        print("  [TL] PReluLayer %s: channel_shared:%s" % (self.name, channel_shared))
-        if channel_shared:
-            w_shape = (1,)
-        else:
-            w_shape = int(self.inputs.get_shape()[-1])
-
-        # with tf.name_scope(name) as scope:
-        with tf.variable_scope(name) as vs:
-            alphas = tf.get_variable(name='alphas', shape=w_shape, initializer=a_init, **a_init_args )
-            try:  ## TF 1.0
-                self.outputs = tf.nn.relu(self.inputs) + tf.multiply(alphas, (self.inputs - tf.abs(self.inputs))) * 0.5
-            except: ## TF 0.12
-                self.outputs = tf.nn.relu(self.inputs) + tf.mul(alphas, (self.inputs - tf.abs(self.inputs))) * 0.5
-
-
-        self.all_layers = list(layer.all_layers)
-        self.all_params = list(layer.all_params)
-        self.all_drop = dict(layer.all_drop)
-
-        self.all_layers.extend( [self.outputs] )
-        self.all_params.extend( [alphas] )
-
-## Flow control layer
-class MultiplexerLayer(Layer):
-    """
-    The :class:`MultiplexerLayer` selects one of several input and forwards the selected input into the output,
-    see `tutorial_mnist_multiplexer.py`.
-
-    Parameters
-    ----------
-    layer : a list of :class:`Layer` instances
-        The `Layer` class feeding into this layer.
-    name : a string or None
-        An optional name to attach to this layer.
-
-
-    Variables
-    -----------------------
-    sel : a placeholder
-        Input an int [0, inf], which input is the output
-
-    Examples
-    --------
-    >>> x = tf.placeholder(tf.float32, shape=[None, 784], name='x')
-    >>> y_ = tf.placeholder(tf.int64, shape=[None, ], name='y_')
-    >>> # define the network
-    >>> net_in = tl.layers.InputLayer(x, name='input_layer')
-    >>> net_in = tl.layers.DropoutLayer(net_in, keep=0.8, name='drop1')
-    >>> # net 0
-    >>> net_0 = tl.layers.DenseLayer(net_in, n_units=800,
-    ...                                act = tf.nn.relu, name='net0/relu1')
-    >>> net_0 = tl.layers.DropoutLayer(net_0, keep=0.5, name='net0/drop2')
-    >>> net_0 = tl.layers.DenseLayer(net_0, n_units=800,
-    ...                                act = tf.nn.relu, name='net0/relu2')
-    >>> # net 1
-    >>> net_1 = tl.layers.DenseLayer(net_in, n_units=800,
-    ...                                act = tf.nn.relu, name='net1/relu1')
-    >>> net_1 = tl.layers.DropoutLayer(net_1, keep=0.8, name='net1/drop2')
-    >>> net_1 = tl.layers.DenseLayer(net_1, n_units=800,
-    ...                                act = tf.nn.relu, name='net1/relu2')
-    >>> net_1 = tl.layers.DropoutLayer(net_1, keep=0.8, name='net1/drop3')
-    >>> net_1 = tl.layers.DenseLayer(net_1, n_units=800,
-    ...                                act = tf.nn.relu, name='net1/relu3')
-    >>> # multiplexer
-    >>> net_mux = tl.layers.MultiplexerLayer(layer = [net_0, net_1], name='mux_layer')
-    >>> network = tl.layers.ReshapeLayer(net_mux, shape=[-1, 800], name='reshape_layer') #
-    >>> network = tl.layers.DropoutLayer(network, keep=0.5, name='drop3')
-    >>> # output layer
-    >>> network = tl.layers.DenseLayer(network, n_units=10,
-    ...                                act = tf.identity, name='output_layer')
-
-    References
-    ------------
-    - See ``tf.pack() for TF0.12 or tf.stack() for TF1.0`` and ``tf.gather()`` at `TensorFlow - Slicing and Joining <https://www.tensorflow.org/versions/master/api_docs/python/array_ops.html#slicing-and-joining>`_
-    """
-    def __init__(self,
-               layer = [],
-               name='mux_layer'):
-        Layer.__init__(self, name=name)
-        self.n_inputs = len(layer)
-
-        self.inputs = []
-        for l in layer:
-            self.inputs.append(l.outputs)
-        try: ## TF1.0
-            all_inputs = tf.stack(self.inputs, name=name) # pack means concat a list of tensor in a new dim  # 1.2
-        except:
-            all_inputs = tf.pack(self.inputs, name=name) # pack means concat a list of tensor in a new dim  # 1.2
-
-        print("  [TL] MultiplexerLayer %s: n_inputs:%d" % (self.name, self.n_inputs))
-
-        self.sel = tf.placeholder(tf.int32)
-        self.outputs = tf.gather(all_inputs, self.sel, name=name) # [sel, :, : ...] # 1.2
-
-        # print(self.outputs, vars(self.outputs))
-        #         # tf.reshape(self.outputs, shape=)
-        # exit()
-        # the same with ConcatLayer
-        self.all_layers = list(layer[0].all_layers)
-        self.all_params = list(layer[0].all_params)
-        self.all_drop = dict(layer[0].all_drop)
-
-        for i in range(1, len(layer)):
-            self.all_layers.extend(list(layer[i].all_layers))
-            self.all_params.extend(list(layer[i].all_params))
-            self.all_drop.update(dict(layer[i].all_drop))
-
-        self.all_layers = list_remove_repeat(self.all_layers)
-        self.all_params = list_remove_repeat(self.all_params)
-        # self.all_drop = list_remove_repeat(self.all_drop)
-## We can Duplicate the network instead of DemultiplexerLayer
-# class DemultiplexerLayer(Layer):
-#     """
-#     The :class:`DemultiplexerLayer` takes a single input and select one of many output lines, which is connected to the input.
-#
-#     Parameters
-#     ----------
-#     layer : a list of :class:`Layer` instances
-#         The `Layer` class feeding into this layer.
-#     n_outputs : an int
-#         The number of output
-#     name : a string or None
-#         An optional name to attach to this layer.
-#
-#     Field (Class Variables)
-#     -----------------------
-#     sel : a placeholder
-#         Input int [0, inf], the
-#     outputs : a list of Tensor
-#         A list of outputs
-#
-#     Examples
-#     --------
-#     >>>
-#     """
-#     def __init__(self,
-#            layer = None,
-#            name='demux_layer'):
-#         Layer.__init__(self, name=name)
-#         self.outputs = []
-
-## Wrapper
-class EmbeddingAttentionSeq2seqWrapper(Layer):
-  """Sequence-to-sequence model with attention and for multiple buckets (Deprecated after TF0.12).
-
-    This example implements a multi-layer recurrent neural network as encoder,
-    and an attention-based decoder. This is the same as the model described in
-    this paper:
-    - `Grammar as a Foreign Language <http://arxiv.org/abs/1412.7449>`_
-    please look there for details,
-    or into the seq2seq library for complete model implementation.
-    This example also allows to use GRU cells in addition to LSTM cells, and
-    sampled softmax to handle large output vocabulary size. A single-layer
-    version of this model, but with bi-directional encoder, was presented in
-    - `Neural Machine Translation by Jointly Learning to Align and Translate <http://arxiv.org/abs/1409.0473>`_
-    The sampled softmax is described in Section 3 of the following paper.
-    - `On Using Very Large Target Vocabulary for Neural Machine Translation <http://arxiv.org/abs/1412.2007>`_
-
-    Parameters
-    ----------
-    source_vocab_size : size of the source vocabulary.
-    target_vocab_size : size of the target vocabulary.
-    buckets : a list of pairs (I, O), where I specifies maximum input length
-        that will be processed in that bucket, and O specifies maximum output
-        length. Training instances that have inputs longer than I or outputs
-        longer than O will be pushed to the next bucket and padded accordingly.
-        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
-    size : number of units in each layer of the model.
-    num_layers : number of layers in the model.
-    max_gradient_norm : gradients will be clipped to maximally this norm.
-    batch_size : the size of the batches used during training;
-        the model construction is independent of batch_size, so it can be
-        changed after initialization if this is convenient, e.g., for decoding.
-    learning_rate : learning rate to start with.
-    learning_rate_decay_factor : decay learning rate by this much when needed.
-    use_lstm : if true, we use LSTM cells instead of GRU cells.
-    num_samples : number of samples for sampled softmax.
-    forward_only : if set, we do not construct the backward pass in the model.
-    name : a string or None
-        An optional name to attach to this layer.
-  """
-  def __init__(self,
-               source_vocab_size,
-               target_vocab_size,
-               buckets,
-               size,
-               num_layers,
-               max_gradient_norm,
-               batch_size,
-               learning_rate,
-               learning_rate_decay_factor,
-               use_lstm=False,
-               num_samples=512,
-               forward_only=False,
-               name='wrapper'):
-    Layer.__init__(self)#, name=name)
-
-    self.source_vocab_size = source_vocab_size
-    self.target_vocab_size = target_vocab_size
-    self.buckets = buckets
-    self.batch_size = batch_size
-    self.learning_rate = tf.Variable(float(learning_rate), trainable=False, name='learning_rate')
-    self.learning_rate_decay_op = self.learning_rate.assign(
-        self.learning_rate * learning_rate_decay_factor)
-    self.global_step = tf.Variable(0, trainable=False, name='global_step')
-
-    if tf.__version__ >= "0.12":
-        raise Exception("Deprecated after TF0.12 : use other seq2seq layers instead.")
-
-    # =========== Fake output Layer for compute cost ======
-    # If we use sampled softmax, we need an output projection.
-    with tf.variable_scope(name) as vs:
-        output_projection = None
-        softmax_loss_function = None
-        # Sampled softmax only makes sense if we sample less than vocabulary size.
-        if num_samples > 0 and num_samples < self.target_vocab_size:
-          w = tf.get_variable("proj_w", [size, self.target_vocab_size])
-          w_t = tf.transpose(w)
-          b = tf.get_variable("proj_b", [self.target_vocab_size])
-          output_projection = (w, b)
-
-          def sampled_loss(inputs, labels):
-            labels = tf.reshape(labels, [-1, 1])
-            return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples,
-                    self.target_vocab_size)
-          softmax_loss_function = sampled_loss
-
-        # ============ Seq Encode Layer =============
-        # Create the internal multi-layer cell for our RNN.
-        try: # TF1.0
-          cell_creator = lambda: tf.contrib.rnn.GRUCell(size)
-        except:
-          cell_creator = lambda: tf.nn.rnn_cell.GRUCell(size)
-
-        if use_lstm:
-          try: # TF1.0
-            cell_creator = lambda: tf.contrib.rnn.BasicLSTMCell(size)
-          except:
-            cell_creator = lambda: tf.nn.rnn_cell.BasicLSTMCell(size)
-
-        cell = cell_creator()
-        if num_layers > 1:
-          try: # TF1.0
-            cell = tf.contrib.rnn.MultiRNNCell([single_cell] * num_layers)
-          except:
-            cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers)
-
-        # ============== Seq Decode Layer ============
-        # The seq2seq function: we use embedding for the input and attention.
-        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
-          return tf.nn.seq2seq.embedding_attention_seq2seq(
-              encoder_inputs, decoder_inputs, cell,
-              num_encoder_symbols=source_vocab_size,
-              num_decoder_symbols=target_vocab_size,
-              embedding_size=size,
-              output_projection=output_projection,
-              feed_previous=do_decode)
-
-        #=============================================================
-        # Feeds for inputs.
-        self.encoder_inputs = []
-        self.decoder_inputs = []
-        self.target_weights = []
-        for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
-          self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
-                                                    name="encoder{0}".format(i)))
-        for i in xrange(buckets[-1][1] + 1):
-          self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
-                                                    name="decoder{0}".format(i)))
-          self.target_weights.append(tf.placeholder(tf.float32, shape=[None],
-                                                    name="weight{0}".format(i)))
-
-        # Our targets are decoder inputs shifted by one.
-        targets = [self.decoder_inputs[i + 1]
-                   for i in xrange(len(self.decoder_inputs) - 1)]
-        self.targets = targets  # DH add for debug
-
-
-        # Training outputs and losses.
-        if forward_only:
-          self.outputs, self.losses = tf.nn.seq2seq.model_with_buckets(
-              self.encoder_inputs, self.decoder_inputs, targets,
-              self.target_weights, buckets, lambda x, y: seq2seq_f(x, y, True),
-              softmax_loss_function=softmax_loss_function)
-          # If we use output projection, we need to project outputs for decoding.
-          if output_projection is not None:
-            for b in xrange(len(buckets)):
-              self.outputs[b] = [
-                  tf.matmul(output, output_projection[0]) + output_projection[1]
-                  for output in self.outputs[b]
-              ]
-        else:
-          self.outputs, self.losses = tf.nn.seq2seq.model_with_buckets(
-              self.encoder_inputs, self.decoder_inputs, targets,
-              self.target_weights, buckets,
-              lambda x, y: seq2seq_f(x, y, False),
-              softmax_loss_function=softmax_loss_function)
-
-        # Gradients and SGD update operation for training the model.
-        params = tf.trainable_variables()
-        if not forward_only:
-          self.gradient_norms = []
-          self.updates = []
-          opt = tf.train.GradientDescentOptimizer(self.learning_rate)
-          for b in xrange(len(buckets)):
-            gradients = tf.gradients(self.losses[b], params)
-            clipped_gradients, norm = tf.clip_by_global_norm(gradients,
-                                                             max_gradient_norm)
-            self.gradient_norms.append(norm)
-            self.updates.append(opt.apply_gradients(
-                zip(clipped_gradients, params), global_step=self.global_step))
-
-        # if save into npz
-        self.all_params = tf.get_collection(TF_GRAPHKEYS_VARIABLES, scope=vs.name)
-
-    # if save into ckpt
-    self.saver = tf.train.Saver(tf.all_variables())
-
-  def step(self, session, encoder_inputs, decoder_inputs, target_weights,
-           bucket_id, forward_only):
-    """Run a step of the model feeding the given inputs.
-
-    Parameters
-    ----------
-    session : tensorflow session to use.
-    encoder_inputs : list of numpy int vectors to feed as encoder inputs.
-    decoder_inputs : list of numpy int vectors to feed as decoder inputs.
-    target_weights : list of numpy float vectors to feed as target weights.
-    bucket_id : which bucket of the model to use.
-    forward_only : whether to do the backward step or only forward.
-
-    Returns
-    --------
-    A triple consisting of gradient norm (or None if we did not do backward),
-    average perplexity, and the outputs.
-
-    Raises
-    --------
-    ValueError : if length of encoder_inputs, decoder_inputs, or
-        target_weights disagrees with bucket size for the specified bucket_id.
-    """
-    # Check if the sizes match.
-    encoder_size, decoder_size = self.buckets[bucket_id]
-    if len(encoder_inputs) != encoder_size:
-      raise ValueError("Encoder length must be equal to the one in bucket,"
-                       " %d != %d." % (len(encoder_inputs), encoder_size))
-    if len(decoder_inputs) != decoder_size:
-      raise ValueError("Decoder length must be equal to the one in bucket,"
-                       " %d != %d." % (len(decoder_inputs), decoder_size))
-    if len(target_weights) != decoder_size:
-      raise ValueError("Weights length must be equal to the one in bucket,"
-                       " %d != %d." % (len(target_weights), decoder_size))
-    # print('in model.step()')
-    # print('a',bucket_id, encoder_size, decoder_size)
-
-    # Input feed: encoder inputs, decoder inputs, target_weights, as provided.
-    input_feed = {}
-    for l in xrange(encoder_size):
-      input_feed[self.encoder_inputs[l].name] = encoder_inputs[l]
-    for l in xrange(decoder_size):
-      input_feed[self.decoder_inputs[l].name] = decoder_inputs[l]
-      input_feed[self.target_weights[l].name] = target_weights[l]
-    # print(self.encoder_inputs[l].name)
-    # print(self.decoder_inputs[l].name)
-    # print(self.target_weights[l].name)
-
-    # Since our targets are decoder inputs shifted by one, we need one more.
-    last_target = self.decoder_inputs[decoder_size].name
-    input_feed[last_target] = np.zeros([self.batch_size], dtype=np.int32)
-    # print('last_target', last_target)
-
-    # Output feed: depends on whether we do a backward step or not.
-    if not forward_only:
-      output_feed = [self.updates[bucket_id],  # Update Op that does SGD.
-                     self.gradient_norms[bucket_id],  # Gradient norm.
-                     self.losses[bucket_id]]  # Loss for this batch.
-    else:
-      output_feed = [self.losses[bucket_id]]  # Loss for this batch.
-      for l in xrange(decoder_size):  # Output logits.
-        output_feed.append(self.outputs[bucket_id][l])
-
-    outputs = session.run(output_feed, input_feed)
-    if not forward_only:
-      return outputs[1], outputs[2], None  # Gradient norm, loss, no outputs.
-    else:
-      return None, outputs[0], outputs[1:]  # No gradient norm, loss, outputs.
-
-  def get_batch(self, data, bucket_id, PAD_ID=0, GO_ID=1, EOS_ID=2, UNK_ID=3):
-    """ Get a random batch of data from the specified bucket, prepare for step.
-
-    To feed data in step(..) it must be a list of batch-major vectors, while
-    data here contains single length-major cases. So the main logic of this
-    function is to re-index data cases to be in the proper format for feeding.
-
-    Parameters
-    ----------
-    data : a tuple of size len(self.buckets) in which each element contains
-        lists of pairs of input and output data that we use to create a batch.
-    bucket_id : integer, which bucket to get the batch for.
-    PAD_ID : int
-        Index of Padding in vocabulary
-    GO_ID : int
-        Index of GO in vocabulary
-    EOS_ID : int
-        Index of End of sentence in vocabulary
-    UNK_ID : int
-        Index of Unknown word in vocabulary
-
-    Returns
-    -------
-    The triple (encoder_inputs, decoder_inputs, target_weights) for
-    the constructed batch that has the proper format to call step(...) later.
-    """
-    encoder_size, decoder_size = self.buckets[bucket_id]
-    encoder_inputs, decoder_inputs = [], []
-
-    # Get a random batch of encoder and decoder inputs from data,
-    # pad them if needed, reverse encoder inputs and add GO to decoder.
-    for _ in xrange(self.batch_size):
-      encoder_input, decoder_input = random.choice(data[bucket_id])
-
-      # Encoder inputs are padded and then reversed.
-      encoder_pad = [PAD_ID] * (encoder_size - len(encoder_input))
-      encoder_inputs.append(list(reversed(encoder_input + encoder_pad)))
-
-      # Decoder inputs get an extra "GO" symbol, and are padded then.
-      decoder_pad_size = decoder_size - len(decoder_input) - 1
-      decoder_inputs.append([GO_ID] + decoder_input +
-                            [PAD_ID] * decoder_pad_size)
-
-    # Now we create batch-major vectors from the data selected above.
-    batch_encoder_inputs, batch_decoder_inputs, batch_weights = [], [], []
-
-    # Batch encoder inputs are just re-indexed encoder_inputs.
-    for length_idx in xrange(encoder_size):
-      batch_encoder_inputs.append(
-          np.array([encoder_inputs[batch_idx][length_idx]
-                    for batch_idx in xrange(self.batch_size)], dtype=np.int32))
-
-    # Batch decoder inputs are re-indexed decoder_inputs, we create weights.
-    for length_idx in xrange(decoder_size):
-      batch_decoder_inputs.append(
-          np.array([decoder_inputs[batch_idx][length_idx]
-                    for batch_idx in xrange(self.batch_size)], dtype=np.int32))
-
-      # Create target_weights to be 0 for targets that are padding.
-      batch_weight = np.ones(self.batch_size, dtype=np.float32)
-      for batch_idx in xrange(self.batch_size):
-        # We set weight to 0 if the corresponding target is a PAD symbol.
-        # The corresponding target is decoder_input shifted by 1 forward.
-        if length_idx < decoder_size - 1:
-          target = decoder_inputs[batch_idx][length_idx + 1]
-        if length_idx == decoder_size - 1 or target == PAD_ID:
-          batch_weight[batch_idx] = 0.0
-      batch_weights.append(batch_weight)
-    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
-
-## Developing or Untested
-# class MaxoutLayer(Layer):
-#     """
-#     Waiting for contribution
-#
-#     Single DenseLayer with Max-out behaviour, work well with Dropout.
-#
-#     References
-#     -----------
-#     `Goodfellow (2013) Maxout Networks <http://arxiv.org/abs/1302.4389>`_
-#     """
-#     def __init__(
-#         self,
-#         layer = None,
-#         n_units = 100,
-#         name ='maxout_layer',
-#     ):
-#         Layer.__init__(self, name=name)
-#         self.inputs = layer.outputs
-#
-#         print("  [TL] MaxoutLayer %s: %d" % (self.name, self.n_units))
-#         print("    Waiting for contribution")
-#         with tf.variable_scope(name) as vs:
-#             pass
-#             # W = tf.Variable(init.xavier_init(n_inputs=n_in, n_outputs=n_units, uniform=True), name='W')
-#             # b = tf.Variable(tf.zeros([n_units]), name='b')
-#
-#         # self.outputs = act(tf.matmul(self.inputs, W) + b)
-#         # https://www.tensorflow.org/versions/r0.9/api_docs/python/array_ops.html#pack
-#         # http://stackoverflow.com/questions/34362193/how-to-explicitly-broadcast-a-tensor-to-match-anothers-shape-in-tensorflow
-#         # tf.concat tf.pack  tf.tile
-#
-#         self.all_layers = list(layer.all_layers)
-#         self.all_params = list(layer.all_params)
-#         self.all_drop = dict(layer.all_drop)
-#         self.all_layers.extend( [self.outputs] )
-#         self.all_params.extend( [W, b] )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-#
diff --git a/tensorlayer/nlp.py b/tensorlayer/nlp.py
deleted file mode 100644
index 56259d00..00000000
--- a/tensorlayer/nlp.py
+++ /dev/null
@@ -1,1025 +0,0 @@
-#! /usr/bin/python
-# -*- coding: utf8 -*-
-
-
-
-
-import tensorflow as tf
-import os, re
-from sys import platform as _platform
-import collections
-import random
-import numpy as np
-import warnings
-from six.moves import xrange
-from tensorflow.python.platform import gfile
-import re
-
-## Iteration functions
-def generate_skip_gram_batch(data, batch_size, num_skips, skip_window, data_index=0):
-    """Generate a training batch for the Skip-Gram model.
-
-    Parameters
-    ----------
-    data : a list
-        To present context.
-    batch_size : an int
-        Batch size to return.
-    num_skips : an int
-        How many times to reuse an input to generate a label.
-    skip_window : an int
-        How many words to consider left and right.
-    data_index : an int
-        Index of the context location.
-        without using yield, this code use data_index to instead.
-
-    Returns
-    --------
-    batch : a list
-        Inputs
-    labels : a list
-        Labels
-    data_index : an int
-        Index of the context location.
-
-    Examples
-    --------
-    - Setting num_skips=2, skip_window=1, use the right and left words.
-     In the same way, num_skips=4, skip_window=2 means use the nearby 4 words.
-    >>> data = [1,2,3,4,5,6,7,8,9,10,11]
-    >>> batch, labels, data_index = tl.nlp.generate_skip_gram_batch(data=data, batch_size=8, num_skips=2, skip_window=1, data_index=0)
-    >>> print(batch)
-    ... [2 2 3 3 4 4 5 5]
-    >>> print(labels)
-    ... [[3]
-    ... [1]
-    ... [4]
-    ... [2]
-    ... [5]
-    ... [3]
-    ... [4]
-    ... [6]]
-
-    References
-    -----------
-    - `TensorFlow word2vec tutorial <https://www.tensorflow.org/versions/r0.9/tutorials/word2vec/index.html#vector-representations-of-words>`_
-    """
-    # global data_index   # you can put data_index outside the function, then
-    #       modify the global data_index in the function without return it.
-    # note: without using yield, this code use data_index to instead.
-    assert batch_size % num_skips == 0
-    assert num_skips <= 2 * skip_window
-    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
-    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
-    span = 2 * skip_window + 1 # [ skip_window target skip_window ]
-    buffer = collections.deque(maxlen=span)
-    for _ in range(span):
-        buffer.append(data[data_index])
-        data_index = (data_index + 1) % len(data)
-    for i in range(batch_size // num_skips):
-        target = skip_window  # target label at the center of the buffer
-        targets_to_avoid = [ skip_window ]
-        for j in range(num_skips):
-            while target in targets_to_avoid:
-                target = random.randint(0, span - 1)
-            targets_to_avoid.append(target)
-            batch[i * num_skips + j] = buffer[skip_window]
-            labels[i * num_skips + j, 0] = buffer[target]
-        buffer.append(data[data_index])
-        data_index = (data_index + 1) % len(data)
-    return batch, labels, data_index
-
-
-## Sampling functions
-def sample(a=[], temperature=1.0):
-    """Sample an index from a probability array.
-
-    Parameters
-    ----------
-    a : a list
-        List of probabilities.
-    temperature : float or None
-        The higher the more uniform.\n
-        When a = [0.1, 0.2, 0.7],\n
-            temperature = 0.7, the distribution will be sharpen [ 0.05048273  0.13588945  0.81362782]\n
-            temperature = 1.0, the distribution will be the same [0.1    0.2    0.7]\n
-            temperature = 1.5, the distribution will be filtered [ 0.16008435  0.25411807  0.58579758]\n
-        If None, it will be ``np.argmax(a)``
-
-    Notes
-    ------
-    - No matter what is the temperature and input list, the sum of all probabilities will be one.
-    Even if input list = [1, 100, 200], the sum of all probabilities will still be one.
-    - For large vocabulary_size, choice a higher temperature to avoid error.
-    """
-    b = np.copy(a)
-    try:
-        if temperature == 1:
-            return np.argmax(np.random.multinomial(1, a, 1))
-        if temperature is None:
-            return np.argmax(a)
-        else:
-            a = np.log(a) / temperature
-            a = np.exp(a) / np.sum(np.exp(a))
-            return np.argmax(np.random.multinomial(1, a, 1))
-    except:
-        # np.set_printoptions(threshold=np.nan)
-        # print(a)
-        # print(np.sum(a))
-        # print(np.max(a))
-        # print(np.min(a))
-        # exit()
-        message = "For large vocabulary_size, choice a higher temperature\
-         to avoid log error. Hint : use ``sample_top``. "
-        warnings.warn(message, Warning)
-        # print(a)
-        # print(b)
-        return np.argmax(np.random.multinomial(1, b, 1))
-
-def sample_top(a=[], top_k=10):
-    """Sample from ``top_k`` probabilities.
-
-    Parameters
-    ----------
-    a : a list
-        List of probabilities.
-    top_k : int
-        Number of candidates to be considered.
-    """
-    idx = np.argpartition(a, -top_k)[-top_k:]
-    probs = a[idx]
-    # print("new", probs)
-    probs = probs / np.sum(probs)
-    choice = np.random.choice(idx, p=probs)
-    return choice
-    ## old implementation
-    # a = np.array(a)
-    # idx = np.argsort(a)[::-1]
-    # idx = idx[:top_k]
-    # # a = a[idx]
-    # probs = a[idx]
-    # print("prev", probs)
-    # # probs = probs / np.sum(probs)
-    # # choice = np.random.choice(idx, p=probs)
-    # # return choice
-
-
-## Vector representations of words (Advanced)  UNDOCUMENT
-class SimpleVocabulary(object):
-  """Simple vocabulary wrapper, see create_vocab().
-
-  Parameters
-  ------------
-  vocab : A dictionary of word to word_id.
-  unk_id : Id of the special 'unknown' word.
-  """
-
-  def __init__(self, vocab, unk_id):
-    """Initializes the vocabulary."""
-
-
-    self._vocab = vocab
-    self._unk_id = unk_id
-
-  def word_to_id(self, word):
-    """Returns the integer id of a word string."""
-    if word in self._vocab:
-      return self._vocab[word]
-    else:
-      return self._unk_id
-
-class Vocabulary(object):
-  """Create Vocabulary class from a given vocabulary and its id-word, word-id convert,
-  see create_vocab() and ``tutorial_tfrecord3.py``.
-
-  Parameters
-  -----------
-  vocab_file : File containing the vocabulary, where the words are the first
-        whitespace-separated token on each line (other tokens are ignored) and
-        the word ids are the corresponding line numbers.
-  start_word : Special word denoting sentence start.
-  end_word : Special word denoting sentence end.
-  unk_word : Special word denoting unknown words.
-
-  Properties
-  ------------
-  vocab : a dictionary from word to id.
-  reverse_vocab : a list from id to word.
-  start_id : int of start id
-  end_id : int of end id
-  unk_id : int of unk id
-  pad_id : int of padding id
-
-  Vocab_files
-  -------------
-  >>> Look as follow, includes `start_word` , `end_word` but no `unk_word` .
-  >>> a 969108
-  >>> <S> 586368
-  >>> </S> 586368
-  >>> . 440479
-  >>> on 213612
-  >>> of 202290
-  >>> the 196219
-  >>> in 182598
-  >>> with 152984
-  >>> and 139109
-  >>> is 97322
-  """
-
-  def __init__(self,
-               vocab_file,
-               start_word="<S>",
-               end_word="</S>",
-               unk_word="<UNK>",
-               pad_word="<PAD>"):
-    if not tf.gfile.Exists(vocab_file):
-      tf.logging.fatal("Vocab file %s not found.", vocab_file)
-    tf.logging.info("Initializing vocabulary from file: %s", vocab_file)
-
-    with tf.gfile.GFile(vocab_file, mode="r") as f:
-      reverse_vocab = list(f.readlines())
-    reverse_vocab = [line.split()[0] for line in reverse_vocab]
-    # assert start_word in reverse_vocab
-    # assert end_word in reverse_vocab
-    if start_word not in reverse_vocab: # haodong
-      reverse_vocab.append(start_word)
-    if end_word not in reverse_vocab:
-      reverse_vocab.append(end_word)
-    if unk_word not in reverse_vocab:
-      reverse_vocab.append(unk_word)
-    vocab = dict([(x, y) for (y, x) in enumerate(reverse_vocab)])
-
-    print("  [TL] Vocabulary from %s : %s %s %s" % (vocab_file, start_word, end_word, unk_word))
-    print("    vocabulary with %d words (includes start_word, end_word, unk_word)" % len(vocab))
-    # tf.logging.info("     vocabulary with %d words" % len(vocab))
-
-    self.vocab = vocab  # vocab[word] = id
-    self.reverse_vocab = reverse_vocab  # reverse_vocab[id] = word
-
-    # Save special word ids.
-    self.start_id = vocab[start_word]
-    self.end_id = vocab[end_word]
-    self.unk_id = vocab[unk_word]
-    self.pad_id = vocab[pad_word]
-    print("      start_id: %d" % self.start_id)
-    print("      end_id: %d" % self.end_id)
-    print("      unk_id: %d" % self.unk_id)
-    print("      pad_id: %d" % self.pad_id)
-
-  def word_to_id(self, word):
-    """Returns the integer word id of a word string."""
-    if word in self.vocab:
-      return self.vocab[word]
-    else:
-      return self.unk_id
-
-  def id_to_word(self, word_id):
-    """Returns the word string of an integer word id."""
-    if word_id >= len(self.reverse_vocab):
-      return self.reverse_vocab[self.unk_id]
-    else:
-      return self.reverse_vocab[word_id]
-
-def process_sentence(sentence, start_word="<S>", end_word="</S>"):
-    """Converts a sentence string into a list of string words, add start_word and end_word,
-    see ``create_vocab()`` and ``tutorial_tfrecord3.py``.
-
-    Parameter
-    ---------
-    sentence : a sentence in string.
-    start_word : a string or None, if None, non start word will be appended.
-    end_word : a string or None, if None, non end word will be appended.
-
-    Returns
-    ---------
-    A list of strings; the processed caption.
-
-    Examples
-    -----------
-    >>> c = "how are you?"
-    >>> c = tl.nlp.process_sentence(c)
-    >>> print(c)
-    ... ['<S>', 'how', 'are', 'you', '?', '</S>']
-
-    Notes
-    -------
-    - You have to install the following package.
-    - `Installing NLTK <http://www.nltk.org/install.html>`_
-    - `Installing NLTK data <http://www.nltk.org/data.html>`_
-    """
-    try:
-        import nltk
-    except:
-        raise Exception("Hint : NLTK is required.")
-    if start_word is not None:
-        process_sentence = [start_word]
-    else:
-        process_sentence = []
-    process_sentence.extend(nltk.tokenize.word_tokenize(sentence.lower()))
-    if end_word is not None:
-        process_sentence.append(end_word)
-    return process_sentence
-
-def create_vocab(sentences, word_counts_output_file, min_word_count=1):
-    """Creates the vocabulary of word to word_id, see create_vocab() and ``tutorial_tfrecord3.py``.
-
-    The vocabulary is saved to disk in a text file of word counts. The id of each
-    word in the file is its corresponding 0-based line number.
-
-    Parameters
-    ------------
-    sentences : a list of lists of strings.
-    word_counts_output_file : A string
-        The file name.
-    min_word_count : a int
-        Minimum number of occurrences for a word.
-
-    Returns
-    --------
-    - tl.nlp.SimpleVocabulary object.
-
-    Mores
-    -----
-    - ``tl.nlp.build_vocab()``
-
-    Examples
-    --------
-    >>> captions = ["one two , three", "four five five"]
-    >>> processed_capts = []
-    >>> for c in captions:
-    >>>     c = tl.nlp.process_sentence(c, start_word="<S>", end_word="</S>")
-    >>>     processed_capts.append(c)
-    >>> print(processed_capts)
-    ...[['<S>', 'one', 'two', ',', 'three', '</S>'], ['<S>', 'four', 'five', 'five', '</S>']]
-
-    >>> tl.nlp.create_vocab(processed_capts, word_counts_output_file='vocab.txt', min_word_count=1)
-    ...   [TL] Creating vocabulary.
-    ...   Total words: 8
-    ...   Words in vocabulary: 8
-    ...   Wrote vocabulary file: vocab.txt
-    >>> vocab = tl.nlp.Vocabulary('vocab.txt', start_word="<S>", end_word="</S>", unk_word="<UNK>")
-    ... INFO:tensorflow:Initializing vocabulary from file: vocab.txt
-    ... [TL] Vocabulary from vocab.txt : <S> </S> <UNK>
-    ... vocabulary with 10 words (includes start_word, end_word, unk_word)
-    ...     start_id: 2
-    ...     end_id: 3
-    ...     unk_id: 9
-    ...     pad_id: 0
-    """
-    from collections import Counter
-    print("  [TL] Creating vocabulary.")
-    counter = Counter()
-    for c in sentences:
-        counter.update(c)
-        # print('c',c)
-    print("    Total words: %d" % len(counter))
-
-    # Filter uncommon words and sort by descending count.
-    word_counts = [x for x in counter.items() if x[1] >= min_word_count]
-    word_counts.sort(key=lambda x: x[1], reverse=True)
-    word_counts = [("<PAD>", 0)] + word_counts # 1st id should be reserved for padding
-    # print(word_counts)
-    print("    Words in vocabulary: %d" % len(word_counts))
-
-    # Write out the word counts file.
-    with tf.gfile.FastGFile(word_counts_output_file, "w") as f:
-        f.write("\n".join(["%s %d" % (w, c) for w, c in word_counts]))
-    print("    Wrote vocabulary file: %s" % word_counts_output_file)
-
-    # Create the vocabulary dictionary.
-    reverse_vocab = [x[0] for x in word_counts]
-    unk_id = len(reverse_vocab)
-    vocab_dict = dict([(x, y) for (y, x) in enumerate(reverse_vocab)])
-    vocab = SimpleVocabulary(vocab_dict, unk_id)
-
-    return vocab
-
-
-## Vector representations of words
-def simple_read_words(filename="nietzsche.txt"):
-    """Read context from file without any preprocessing.
-
-    Parameters
-    ----------
-    filename : a string
-        A file path (like .txt file)
-
-    Returns
-    --------
-    The context in a string
-    """
-    with open(filename, "r") as f:
-        words = f.read()
-        return words
-
-def read_words(filename="nietzsche.txt", replace = ['\n', '<eos>']):
-    """ File to list format context. Note that, this script can not handle punctuations.
-    For customized read_words method, see ``tutorial_generate_text.py``.
-
-    Parameters
-    -----------
-    filename : a string
-        A file path (like .txt file)
-    replace : a list
-        [original string, target string], to disable replace use ['', '']
-
-    Returns
-    --------
-    The context in a list, split by space by default, and use ``<eos>`` to represent ``\\n``,
-    e.g. ``[... 'how', 'useful', 'it', "'s" ... ]``.
-
-    Code References
-    ---------------
-    - `tensorflow.models.rnn.ptb.reader <https://github.com/tensorflow/tensorflow/tree/master/tensorflow/models/rnn/ptb>`_
-    """
-    with tf.gfile.GFile(filename, "r") as f:
-        try:    # python 3.4 or older
-            context_list = f.read().replace(*replace).split()
-        except: # python 3.5
-            f.seek(0)
-            replace = [x.encode('utf-8') for x in replace]
-            context_list = f.read().replace(*replace).split()
-        return context_list
-
-def read_analogies_file(eval_file='questions-words.txt', word2id={}):
-    """Reads through an analogy question file, return its id format.
-
-    Parameters
-    ----------
-    eval_data : a string
-        The file name.
-    word2id : a dictionary
-        Mapping words to unique IDs.
-
-    Returns
-    --------
-    analogy_questions : a [n, 4] numpy array containing the analogy question's
-             word ids.
-             questions_skipped: questions skipped due to unknown words.
-
-    Examples
-    ---------
-    >>> eval_file should be in this format :
-    >>> : capital-common-countries
-    >>> Athens Greece Baghdad Iraq
-    >>> Athens Greece Bangkok Thailand
-    >>> Athens Greece Beijing China
-    >>> Athens Greece Berlin Germany
-    >>> Athens Greece Bern Switzerland
-    >>> Athens Greece Cairo Egypt
-    >>> Athens Greece Canberra Australia
-    >>> Athens Greece Hanoi Vietnam
-    >>> Athens Greece Havana Cuba
-    ...
-
-    >>> words = tl.files.load_matt_mahoney_text8_dataset()
-    >>> data, count, dictionary, reverse_dictionary = \
-                tl.nlp.build_words_dataset(words, vocabulary_size, True)
-    >>> analogy_questions = tl.nlp.read_analogies_file( \
-                eval_file='questions-words.txt', word2id=dictionary)
-    >>> print(analogy_questions)
-    ... [[ 3068  1248  7161  1581]
-    ... [ 3068  1248 28683  5642]
-    ... [ 3068  1248  3878   486]
-    ... ...,
-    ... [ 1216  4309 19982 25506]
-    ... [ 1216  4309  3194  8650]
-    ... [ 1216  4309   140   312]]
-    """
-    questions = []
-    questions_skipped = 0
-    with open(eval_file, "rb") as analogy_f:
-      for line in analogy_f:
-          if line.startswith(b":"):  # Skip comments.
-                continue
-          words = line.strip().lower().split(b" ")  # lowercase
-          ids = [word2id.get(w.strip()) for w in words]
-          if None in ids or len(ids) != 4:
-              questions_skipped += 1
-          else:
-              questions.append(np.array(ids))
-    print("Eval analogy file: ", eval_file)
-    print("Questions: ", len(questions))
-    print("Skipped: ", questions_skipped)
-    analogy_questions = np.array(questions, dtype=np.int32)
-    return analogy_questions
-
-def build_vocab(data):
-    """Build vocabulary.
-    Given the context in list format.
-    Return the vocabulary, which is a dictionary for word to id.
-    e.g. {'campbell': 2587, 'atlantic': 2247, 'aoun': 6746 .... }
-
-    Parameters
-    ----------
-    data : a list of string
-        the context in list format
-
-    Returns
-    --------
-    word_to_id : a dictionary
-        mapping words to unique IDs. e.g. {'campbell': 2587, 'atlantic': 2247, 'aoun': 6746 .... }
-
-    Code References
-    ---------------
-    - `tensorflow.models.rnn.ptb.reader <https://github.com/tensorflow/tensorflow/tree/master/tensorflow/models/rnn/ptb>`_
-
-    Examples
-    --------
-    >>> data_path = os.getcwd() + '/simple-examples/data'
-    >>> train_path = os.path.join(data_path, "ptb.train.txt")
-    >>> word_to_id = build_vocab(read_txt_words(train_path))
-    """
-    # data = _read_words(filename)
-    counter = collections.Counter(data)
-    # print('counter', counter)   # dictionary for the occurrence number of each word, e.g. 'banknote': 1, 'photography': 1, 'kia': 1
-    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
-    # print('count_pairs',count_pairs)  # convert dictionary to list of tuple, e.g. ('ssangyong', 1), ('swapo', 1), ('wachter', 1)
-    words, _ = list(zip(*count_pairs))
-    word_to_id = dict(zip(words, range(len(words))))
-    # print(words)    # list of words
-    # print(word_to_id) # dictionary for word to id, e.g. 'campbell': 2587, 'atlantic': 2247, 'aoun': 6746
-    return word_to_id
-
-def build_reverse_dictionary(word_to_id):
-    """Given a dictionary for converting word to integer id.
-    Returns a reverse dictionary for converting a id to word.
-
-    Parameters
-    ----------
-    word_to_id : dictionary
-        mapping words to unique ids
-
-    Returns
-    --------
-    reverse_dictionary : a dictionary
-        mapping ids to words
-    """
-    reverse_dictionary = dict(zip(word_to_id.values(), word_to_id.keys()))
-    return reverse_dictionary
-
-def build_words_dataset(words=[], vocabulary_size=50000, printable=True, unk_key = 'UNK'):
-    """Build the words dictionary and replace rare words with 'UNK' token.
-    The most common word has the smallest integer id.
-
-    Parameters
-    ----------
-    words : a list of string or byte
-        The context in list format. You may need to do preprocessing on the words,
-        such as lower case, remove marks etc.
-    vocabulary_size : an int
-        The maximum vocabulary size, limiting the vocabulary size.
-        Then the script replaces rare words with 'UNK' token.
-    printable : boolean
-        Whether to print the read vocabulary size of the given words.
-    unk_key : a string
-        Unknown words = unk_key
-
-    Returns
-    --------
-    data : a list of integer
-        The context in a list of ids
-    count : a list of tuple and list
-        count[0] is a list : the number of rare words\n
-        count[1:] are tuples : the number of occurrence of each word\n
-        e.g. [['UNK', 418391], (b'the', 1061396), (b'of', 593677), (b'and', 416629), (b'one', 411764)]
-    dictionary : a dictionary
-        word_to_id, mapping words to unique IDs.
-    reverse_dictionary : a dictionary
-        id_to_word, mapping id to unique word.
-
-    Examples
-    --------
-    >>> words = tl.files.load_matt_mahoney_text8_dataset()
-    >>> vocabulary_size = 50000
-    >>> data, count, dictionary, reverse_dictionary = tl.nlp.build_words_dataset(words, vocabulary_size)
-
-    Code References
-    -----------------
-    - `tensorflow/examples/tutorials/word2vec/word2vec_basic.py <https://github.com/tensorflow/tensorflow/blob/r0.7/tensorflow/examples/tutorials/word2vec/word2vec_basic.py>`_
-    """
-    import collections
-    count = [[unk_key, -1]]
-    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
-    dictionary = dict()
-    for word, _ in count:
-        dictionary[word] = len(dictionary)
-    data = list()
-    unk_count = 0
-    for word in words:
-        if word in dictionary:
-            index = dictionary[word]
-        else:
-            index = 0  # dictionary['UNK']
-            unk_count += 1
-        data.append(index)
-    count[0][1] = unk_count
-    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
-    if printable:
-        print('Real vocabulary size    %d' % len(collections.Counter(words).keys()))
-        print('Limited vocabulary size {}'.format(vocabulary_size))
-    assert len(collections.Counter(words).keys()) >= vocabulary_size , \
-            "the limited vocabulary_size must be less than or equal to the read vocabulary_size"
-    return data, count, dictionary, reverse_dictionary
-
-def words_to_word_ids(data=[], word_to_id={}, unk_key = 'UNK'):
-    """Given a context (words) in list format and the vocabulary,
-    Returns a list of IDs to represent the context.
-
-    Parameters
-    ----------
-    data : a list of string or byte
-        the context in list format
-    word_to_id : a dictionary
-        mapping words to unique IDs.
-    unk_key : a string
-        Unknown words = unk_key
-
-    Returns
-    --------
-    A list of IDs to represent the context.
-
-    Examples
-    --------
-    >>> words = tl.files.load_matt_mahoney_text8_dataset()
-    >>> vocabulary_size = 50000
-    >>> data, count, dictionary, reverse_dictionary = \
-    ...         tl.nlp.build_words_dataset(words, vocabulary_size, True)
-    >>> context = [b'hello', b'how', b'are', b'you']
-    >>> ids = tl.nlp.words_to_word_ids(words, dictionary)
-    >>> context = tl.nlp.word_ids_to_words(ids, reverse_dictionary)
-    >>> print(ids)
-    ... [6434, 311, 26, 207]
-    >>> print(context)
-    ... [b'hello', b'how', b'are', b'you']
-
-    Code References
-    ---------------
-    - `tensorflow.models.rnn.ptb.reader <https://github.com/tensorflow/tensorflow/tree/master/tensorflow/models/rnn/ptb>`_
-    """
-    # if isinstance(data[0], six.string_types):
-    #     print(type(data[0]))
-    #     # exit()
-    #     print(data[0])
-    #     print(word_to_id)
-    #     return [word_to_id[str(word)] for word in data]
-    # else:
-
-    word_ids = []
-    for word in data:
-        if word_to_id.get(word) is not None:
-            word_ids.append(word_to_id[word])
-        else:
-            word_ids.append(word_to_id[unk_key])
-    return word_ids
-    # return [word_to_id[word] for word in data]    # this one
-
-    # if isinstance(data[0], str):
-    #     # print('is a string object')
-    #     return [word_to_id[word] for word in data]
-    # else:#if isinstance(s, bytes):
-    #     # print('is a unicode object')
-    #     # print(data[0])
-    #     return [word_to_id[str(word)] f
-
-def word_ids_to_words(data, id_to_word):
-    """Given a context (ids) in list format and the vocabulary,
-    Returns a list of words to represent the context.
-
-    Parameters
-    ----------
-    data : a list of integer
-        the context in list format
-    id_to_word : a dictionary
-        mapping id to unique word.
-
-    Returns
-    --------
-    A list of string or byte to represent the context.
-
-    Examples
-    ---------
-    >>> see words_to_word_ids
-    """
-    return [id_to_word[i] for i in data]
-
-def save_vocab(count=[], name='vocab.txt'):
-    """Save the vocabulary to a file so the model can be reloaded.
-
-    Parameters
-    ----------
-    count : a list of tuple and list
-        count[0] is a list : the number of rare words\n
-        count[1:] are tuples : the number of occurrence of each word\n
-        e.g. [['UNK', 418391], (b'the', 1061396), (b'of', 593677), (b'and', 416629), (b'one', 411764)]
-
-    Examples
-    ---------
-    >>> words = tl.files.load_matt_mahoney_text8_dataset()
-    >>> vocabulary_size = 50000
-    >>> data, count, dictionary, reverse_dictionary = \
-    ...     tl.nlp.build_words_dataset(words, vocabulary_size, True)
-    >>> tl.nlp.save_vocab(count, name='vocab_text8.txt')
-    >>> vocab_text8.txt
-    ... UNK 418391
-    ... the 1061396
-    ... of 593677
-    ... and 416629
-    ... one 411764
-    ... in 372201
-    ... a 325873
-    ... to 316376
-    """
-    pwd = os.getcwd()
-    vocabulary_size = len(count)
-    with open(os.path.join(pwd, name), "w") as f:
-        for i in xrange(vocabulary_size):
-            f.write("%s %d\n" % (tf.compat.as_text(count[i][0]), count[i][1]))
-    print("%d vocab saved to %s in %s" % (vocabulary_size, name, pwd))
-
-## Functions for translation
-def basic_tokenizer(sentence, _WORD_SPLIT=re.compile(b"([.,!?\"':;)(])")):
-  """Very basic tokenizer: split the sentence into a list of tokens.
-
-  Parameters
-  -----------
-  sentence : tensorflow.python.platform.gfile.GFile Object
-  _WORD_SPLIT : regular expression for word spliting.
-
-
-  Examples
-  --------
-  >>> see create_vocabulary
-  >>> from tensorflow.python.platform import gfile
-  >>> train_path = "wmt/giga-fren.release2"
-  >>> with gfile.GFile(train_path + ".en", mode="rb") as f:
-  >>>    for line in f:
-  >>>       tokens = tl.nlp.basic_tokenizer(line)
-  >>>       print(tokens)
-  >>>       exit()
-  ... [b'Changing', b'Lives', b'|', b'Changing', b'Society', b'|', b'How',
-  ...   b'It', b'Works', b'|', b'Technology', b'Drives', b'Change', b'Home',
-  ...   b'|', b'Concepts', b'|', b'Teachers', b'|', b'Search', b'|', b'Overview',
-  ...   b'|', b'Credits', b'|', b'HHCC', b'Web', b'|', b'Reference', b'|',
-  ...   b'Feedback', b'Virtual', b'Museum', b'of', b'Canada', b'Home', b'Page']
-
-  References
-  ----------
-  - Code from ``/tensorflow/models/rnn/translation/data_utils.py``
-  """
-  words = []
-  sentence = tf.compat.as_bytes(sentence)
-  for space_separated_fragment in sentence.strip().split():
-    words.extend(re.split(_WORD_SPLIT, space_separated_fragment))
-  return [w for w in words if w]
-
-def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
-                      tokenizer=None, normalize_digits=True,
-                      _DIGIT_RE=re.compile(br"\d"),
-                      _START_VOCAB=[b"_PAD", b"_GO", b"_EOS", b"_UNK"]):
-  """Create vocabulary file (if it does not exist yet) from data file.
-
-  Data file is assumed to contain one sentence per line. Each sentence is
-  tokenized and digits are normalized (if normalize_digits is set).
-  Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
-  We write it to vocabulary_path in a one-token-per-line format, so that later
-  token in the first line gets id=0, second line gets id=1, and so on.
-
-  Parameters
-  -----------
-  vocabulary_path : path where the vocabulary will be created.
-  data_path : data file that will be used to create vocabulary.
-  max_vocabulary_size : limit on the size of the created vocabulary.
-  tokenizer : a function to use to tokenize each data sentence.
-        if None, basic_tokenizer will be used.
-  normalize_digits : Boolean
-        if true, all digits are replaced by 0s.
-
-  References
-  ----------
-  - Code from ``/tensorflow/models/rnn/translation/data_utils.py``
-  """
-  if not gfile.Exists(vocabulary_path):
-    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
-    vocab = {}
-    with gfile.GFile(data_path, mode="rb") as f:
-      counter = 0
-      for line in f:
-        counter += 1
-        if counter % 100000 == 0:
-          print("  processing line %d" % counter)
-        tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
-        for w in tokens:
-          word = re.sub(_DIGIT_RE, b"0", w) if normalize_digits else w
-          if word in vocab:
-            vocab[word] += 1
-          else:
-            vocab[word] = 1
-      vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
-      if len(vocab_list) > max_vocabulary_size:
-        vocab_list = vocab_list[:max_vocabulary_size]
-      with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
-        for w in vocab_list:
-          vocab_file.write(w + b"\n")
-  else:
-    print("Vocabulary %s from data %s exists" % (vocabulary_path, data_path))
-
-def initialize_vocabulary(vocabulary_path):
-  """Initialize vocabulary from file, return the word_to_id (dictionary)
-  and id_to_word (list).
-
-  We assume the vocabulary is stored one-item-per-line, so a file:\n
-    dog\n
-    cat\n
-  will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
-  also return the reversed-vocabulary ["dog", "cat"].
-
-  Parameters
-  -----------
-  vocabulary_path : path to the file containing the vocabulary.
-
-  Returns
-  --------
-  vocab : a dictionary
-        Word to id. A dictionary mapping string to integers.
-  rev_vocab : a list
-        Id to word. The reversed vocabulary (a list, which reverses the vocabulary mapping).
-
-  Examples
-  ---------
-  >>> Assume 'test' contains
-  ... dog
-  ... cat
-  ... bird
-  >>> vocab, rev_vocab = tl.nlp.initialize_vocabulary("test")
-  >>> print(vocab)
-  >>> {b'cat': 1, b'dog': 0, b'bird': 2}
-  >>> print(rev_vocab)
-  >>> [b'dog', b'cat', b'bird']
-
-  Raises
-  -------
-  ValueError : if the provided vocabulary_path does not exist.
-  """
-  if gfile.Exists(vocabulary_path):
-    rev_vocab = []
-    with gfile.GFile(vocabulary_path, mode="rb") as f:
-      rev_vocab.extend(f.readlines())
-    rev_vocab = [tf.compat.as_bytes(line.strip()) for line in rev_vocab]
-    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
-    return vocab, rev_vocab
-  else:
-    raise ValueError("Vocabulary file %s not found.", vocabulary_path)
-
-def sentence_to_token_ids(sentence, vocabulary,
-                          tokenizer=None, normalize_digits=True,
-                          UNK_ID=3, _DIGIT_RE=re.compile(br"\d")):
-  """Convert a string to list of integers representing token-ids.
-
-  For example, a sentence "I have a dog" may become tokenized into
-  ["I", "have", "a", "dog"] and with vocabulary {"I": 1, "have": 2,
-  "a": 4, "dog": 7"} this function will return [1, 2, 4, 7].
-
-  Parameters
-  -----------
-  sentence :  tensorflow.python.platform.gfile.GFile Object
-        The sentence in bytes format to convert to token-ids.\n
-        see basic_tokenizer(), data_to_token_ids()
-  vocabulary : a dictionary mapping tokens to integers.
-  tokenizer : a function to use to tokenize each sentence;
-        If None, basic_tokenizer will be used.
-  normalize_digits : Boolean
-        If true, all digits are replaced by 0s.
-
-  Returns
-  --------
-  A list of integers, the token-ids for the sentence.
-  """
-
-  if tokenizer:
-    words = tokenizer(sentence)
-  else:
-    words = basic_tokenizer(sentence)
-  if not normalize_digits:
-    return [vocabulary.get(w, UNK_ID) for w in words]
-  # Normalize digits by 0 before looking words up in the vocabulary.
-  return [vocabulary.get(re.sub(_DIGIT_RE, b"0", w), UNK_ID) for w in words]
-
-def data_to_token_ids(data_path, target_path, vocabulary_path,
-                      tokenizer=None, normalize_digits=True,
-                      UNK_ID=3, _DIGIT_RE=re.compile(br"\d")):
-  """Tokenize data file and turn into token-ids using given vocabulary file.
-
-  This function loads data line-by-line from data_path, calls the above
-  sentence_to_token_ids, and saves the result to target_path. See comment
-  for sentence_to_token_ids on the details of token-ids format.
-
-  Parameters
-  -----------
-  data_path : path to the data file in one-sentence-per-line format.
-  target_path : path where the file with token-ids will be created.
-  vocabulary_path : path to the vocabulary file.
-  tokenizer : a function to use to tokenize each sentence;
-      if None, basic_tokenizer will be used.
-  normalize_digits : Boolean; if true, all digits are replaced by 0s.
-
-  References
-  ----------
-  - Code from ``/tensorflow/models/rnn/translation/data_utils.py``
-  """
-  if not gfile.Exists(target_path):
-    print("Tokenizing data in %s" % data_path)
-    vocab, _ = initialize_vocabulary(vocabulary_path)
-    with gfile.GFile(data_path, mode="rb") as data_file:
-      with gfile.GFile(target_path, mode="w") as tokens_file:
-        counter = 0
-        for line in data_file:
-          counter += 1
-          if counter % 100000 == 0:
-            print("  tokenizing line %d" % counter)
-          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
-                                            normalize_digits, UNK_ID=UNK_ID,
-                                            _DIGIT_RE=_DIGIT_RE)
-          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
-  else:
-    print("Target path %s exists" % target_path)
-
-
-## Metric
-import subprocess
-import tempfile
-from six.moves import urllib
-
-def moses_multi_bleu(hypotheses, references, lowercase=False): # tl.nlp
-  """Calculate the bleu score for hypotheses and references
-  using the MOSES ulti-bleu.perl script.
-
-  Parameters
-  ------------
-  hypotheses : A numpy array of strings where each string is a single example.
-  references : A numpy array of strings where each string is a single example.
-  lowercase : If true, pass the "-lc" flag to the multi-bleu script
-
-  Examples
-  ---------
-  >>> hypotheses = ["a bird is flying on the sky"]
-  >>> references = ["two birds are flying on the sky", "a bird is on the top of the tree", "an airplane is on the sky",]
-  >>> score = tl.nlp.moses_multi_bleu(hypotheses, references)
-
-  Returns
-  --------
-  The BLEU score as a float32 value.
-
-  References
-  ----------
-  - `Google/seq2seq/metric/bleu <https://github.com/google/seq2seq>`_
-  """
-
-  if np.size(hypotheses) == 0:
-    return np.float32(0.0)
-
-  # Get MOSES multi-bleu script
-  try:
-    multi_bleu_path, _ = urllib.request.urlretrieve(
-        "https://raw.githubusercontent.com/moses-smt/mosesdecoder/"
-        "master/scripts/generic/multi-bleu.perl")
-    os.chmod(multi_bleu_path, 0o755)
-  except: #pylint: disable=W0702
-    tf.logging.info("Unable to fetch multi-bleu.perl script, using local.")
-    metrics_dir = os.path.dirname(os.path.realpath(__file__))
-    bin_dir = os.path.abspath(os.path.join(metrics_dir, "..", "..", "bin"))
-    multi_bleu_path = os.path.join(bin_dir, "tools/multi-bleu.perl")
-
-  # Dump hypotheses and references to tempfiles
-  hypothesis_file = tempfile.NamedTemporaryFile()
-  hypothesis_file.write("\n".join(hypotheses).encode("utf-8"))
-  hypothesis_file.write(b"\n")
-  hypothesis_file.flush()
-  reference_file = tempfile.NamedTemporaryFile()
-  reference_file.write("\n".join(references).encode("utf-8"))
-  reference_file.write(b"\n")
-  reference_file.flush()
-
-  # Calculate BLEU using multi-bleu script
-  with open(hypothesis_file.name, "r") as read_pred:
-    bleu_cmd = [multi_bleu_path]
-    if lowercase:
-      bleu_cmd += ["-lc"]
-    bleu_cmd += [reference_file.name]
-    try:
-      bleu_out = subprocess.check_output(
-          bleu_cmd, stdin=read_pred, stderr=subprocess.STDOUT)
-      bleu_out = bleu_out.decode("utf-8")
-      bleu_score = re.search(r"BLEU = (.+?),", bleu_out).group(1)
-      bleu_score = float(bleu_score)
-    except subprocess.CalledProcessError as error:
-      if error.output is not None:
-        tf.logging.warning("multi-bleu.perl script returned non-zero exit code")
-        tf.logging.warning(error.output)
-      bleu_score = np.float32(0.0)
-
-  # Close temp files
-  hypothesis_file.close()
-  reference_file.close()
-
-  return np.float32(bleu_score)
diff --git a/tensorlayer/ops.py b/tensorlayer/ops.py
deleted file mode 100644
index 41bb9911..00000000
--- a/tensorlayer/ops.py
+++ /dev/null
@@ -1,255 +0,0 @@
-#! /usr/bin/python
-# -*- coding: utf8 -*-
-
-
-
-
-import tensorflow as tf
-import tensorlayer as tl
-import os
-import subprocess
-import sys
-from sys import platform as _platform
-from sys import exit as _exit
-
-
-def exit_tf(sess=None, port=6006):
-    """Close TensorFlow session, TensorBoard and Nvidia-process if available.
-
-    Parameters
-    ----------
-    sess : a session instance of TensorFlow
-        TensorFlow session
-    tb_port : an integer
-        TensorBoard port you want to close, 6006 as default.
-    """
-    text = "[TL] Close tensorboard and nvidia-process if available"
-    text2 = "[TL] Close tensorboard and nvidia-process not yet supported by this function (tl.ops.exit_tf) on "
-    if sess != None:
-        sess.close()
-    # import time
-    # time.sleep(2)
-    if _platform == "linux" or _platform == "linux2":
-        print('linux: %s' % text)
-        os.system('nvidia-smi')
-        os.system('fuser '+ port +'/tcp -k')  # kill tensorboard 6006
-        os.system("nvidia-smi | grep python |awk '{print $3}'|xargs kill") # kill all nvidia-smi python process
-        _exit()
-    elif _platform == "darwin":
-        print('OS X: %s' % text)
-        subprocess.Popen("lsof -i tcp:"+ str(port) +"  | grep -v PID | awk '{print $2}' | xargs kill", shell=True) # kill tensorboard
-    elif _platform == "win32":
-        print(text2 + "Windows")
-        # TODO
-    else:
-        print(text2 + _platform)
-
-def open_tb(logdir='/tmp/tensorflow', port=6006):
-    """Open Tensorboard.
-
-    Parameters
-    ----------
-    logdir : a string
-        Directory where your tensorboard logs are saved
-    port : an integer
-        TensorBoard port you want to open, 6006 is tensorboard default
-    """
-    text = "[TL] Open tensorboard, go to localhost:" + str(port) + " to access"
-    text2 = " not yet supported by this function (tl.ops.open_tb)"
-
-    if not tl.files.exists_or_mkdir(logdir, verbose=False):
-        print("[TL] Log reportory was created at %s" % logdir)
-
-    if _platform == "linux" or _platform == "linux2":
-        print('linux %s' % text2)
-        # TODO
-    elif _platform == "darwin":
-        print('OS X: %s' % text)
-        subprocess.Popen(sys.prefix + " | python -m tensorflow.tensorboard --logdir=" + logdir + " --port=" + str(port), shell=True) # open tensorboard in localhost:6006/ or whatever port you chose
-    elif _platform == "win32":
-        print('Windows%s' % text2)
-        # TODO
-    else:
-        print(_platform + text2)
-
-def clear_all(printable=True):
-    """Clears all the placeholder variables of keep prob,
-    including keeping probabilities of all dropout, denoising, dropconnect etc.
-
-    Parameters
-    ----------
-    printable : boolean
-        If True, print all deleted variables.
-    """
-    print('clear all .....................................')
-    gl = globals().copy()
-    for var in gl:
-        if var[0] == '_': continue
-        if 'func' in str(globals()[var]): continue
-        if 'module' in str(globals()[var]): continue
-        if 'class' in str(globals()[var]): continue
-
-        if printable:
-            print(" clear_all ------- %s" % str(globals()[var]))
-
-        del globals()[var]
-
-# def clear_all2(vars, printable=True):
-#     """
-#     The :function:`clear_all()` Clears all the placeholder variables of keep prob,
-#     including keeping probabilities of all dropout, denoising, dropconnect
-#     Parameters
-#     ----------
-#     printable : if True, print all deleted variables.
-#     """
-#     print('clear all .....................................')
-#     for var in vars:
-#         if var[0] == '_': continue
-#         if 'func' in str(var): continue
-#         if 'module' in str(var): continue
-#         if 'class' in str(var): continue
-#
-#         if printable:
-#             print(" clear_all ------- %s" % str(var))
-#
-#         del var
-
-def set_gpu_fraction(sess=None, gpu_fraction=0.3):
-    """Set the GPU memory fraction for the application.
-
-    Parameters
-    ----------
-    sess : a session instance of TensorFlow
-        TensorFlow session
-    gpu_fraction : a float
-        Fraction of GPU memory, (0 ~ 1]
-
-    References
-    ----------
-    - `TensorFlow using GPU <https://www.tensorflow.org/versions/r0.9/how_tos/using_gpu/index.html>`_
-    """
-    print("[TL]: GPU MEM Fraction %f" % gpu_fraction)
-    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction)
-    sess = tf.Session(config = tf.ConfigProto(gpu_options = gpu_options))
-    return sess
-
-
-
-
-
-def disable_print():
-    """Disable console output, ``suppress_stdout`` is recommended.
-
-    Examples
-    ---------
-    >>> print("You can see me")
-    >>> tl.ops.disable_print()
-    >>> print(" You can't see me")
-    >>> tl.ops.enable_print()
-    >>> print("You can see me")
-    """
-    # sys.stdout = os.devnull   # this one kill the process
-    sys.stdout = None
-    sys.stderr = os.devnull
-
-def enable_print():
-    """Enable console output, ``suppress_stdout`` is recommended.
-
-    Examples
-    --------
-    - see tl.ops.disable_print()
-    """
-    sys.stdout = sys.__stdout__
-    sys.stderr = sys.__stderr__
-
-
-# class temporary_disable_print:
-#     """Temporarily disable console output.
-#
-#     Examples
-#     ---------
-#     >>> print("You can see me")
-#     >>> with tl.ops.temporary_disable_print() as t:
-#     >>>     print("You can't see me")
-#     >>> print("You can see me")
-#     """
-#     def __init__(self):
-#         pass
-#     def __enter__(self):
-#         sys.stdout = None
-#         sys.stderr = os.devnull
-#     def __exit__(self, type, value, traceback):
-#         sys.stdout = sys.__stdout__
-#         sys.stderr = sys.__stderr__
-#         return isinstance(value, TypeError)
-
-
-from contextlib import contextmanager
-@contextmanager
-def suppress_stdout():
-    """Temporarily disable console output.
-
-    Examples
-    ---------
-    >>> print("You can see me")
-    >>> with tl.ops.suppress_stdout():
-    >>>     print("You can't see me")
-    >>> print("You can see me")
-
-    References
-    -----------
-    - `stackoverflow <http://stackoverflow.com/questions/2125702/how-to-suppress-console-output-in-python>`_
-    """
-    with open(os.devnull, "w") as devnull:
-        old_stdout = sys.stdout
-        sys.stdout = devnull
-        try:
-            yield
-        finally:
-            sys.stdout = old_stdout
-
-
-
-def get_site_packages_directory():
-    """Print and return the site-packages directory.
-
-    Examples
-    ---------
-    >>> loc = tl.ops.get_site_packages_directory()
-    """
-    import site
-    try:
-        loc = site.getsitepackages()
-        print("[TL] tl.ops : site-packages in ", loc)
-        return loc
-    except:
-        print("[TL] tl.ops : Cannot find package dir from virtual environment")
-        return False
-
-
-
-def empty_trash():
-    """Empty trash folder.
-
-    """
-    text = "[TL] Empty the trash"
-    if _platform == "linux" or _platform == "linux2":
-        print('linux: %s' % text)
-        os.system("rm -rf ~/.local/share/Trash/*")
-    elif _platform == "darwin":
-        print('OS X: %s' % text)
-        os.system("sudo rm -rf ~/.Trash/*")
-    elif _platform == "win32":
-        print('Windows: %s' % text)
-        try:
-            os.system("rd /s c:\$Recycle.Bin")  # Windows 7 or Server 2008
-        except:
-            pass
-        try:
-            os.system("rd /s c:\recycler")  #  Windows XP, Vista, or Server 2003
-        except:
-            pass
-    else:
-        print(_platform)
-
-#
diff --git a/tensorlayer/prepro.py b/tensorlayer/prepro.py
deleted file mode 100644
index e9980870..00000000
--- a/tensorlayer/prepro.py
+++ /dev/null
@@ -1,1732 +0,0 @@
-#! /usr/bin/python
-# -*- coding: utf8 -*-
-
-
-import tensorflow as tf
-import tensorlayer as tl
-import numpy as np
-
-import time
-import numbers
-import random
-import os
-import re
-import sys
-
-import threading
-# import Queue  # <-- donot work for py3
-is_py2 = sys.version[0] == '2'
-if is_py2:
-    import Queue as queue
-else:
-    import queue as queue
-
-from six.moves import range
-import scipy
-from scipy import linalg
-import scipy.ndimage as ndi
-
-from skimage import transform
-from skimage import exposure
-import skimage
-
-from multiprocessing import Pool
-
-# linalg https://docs.scipy.org/doc/scipy/reference/linalg.html
-# ndimage https://docs.scipy.org/doc/scipy/reference/ndimage.html
-
-## Threading
-def threading_data(data=None, fn=None, thread_count=None, **kwargs):
-    """Return a batch of result by given data.
-    Usually be used for data augmentation.
-
-    Parameters
-    -----------
-    data : numpy array, file names and etc, see Examples below.
-    thread_count : the number of threads to use
-    fn : the function for data processing.
-    more args : the args for fn, see Examples below.
-
-    Examples
-    --------
-    - Single array
-    >>> X --> [batch_size, row, col, 1] greyscale
-    >>> results = threading_data(X, zoom, zoom_range=[0.5, 1], is_random=True)
-    ... results --> [batch_size, row, col, channel]
-    >>> tl.visualize.images2d(images=np.asarray(results), second=0.01, saveable=True, name='after', dtype=None)
-    >>> tl.visualize.images2d(images=np.asarray(X), second=0.01, saveable=True, name='before', dtype=None)
-
-    - List of array (e.g. functions with ``multi``)
-    >>> X, Y --> [batch_size, row, col, 1]  greyscale
-    >>> data = threading_data([_ for _ in zip(X, Y)], zoom_multi, zoom_range=[0.5, 1], is_random=True)
-    ... data --> [batch_size, 2, row, col, 1]
-    >>> X_, Y_ = data.transpose((1,0,2,3,4))
-    ... X_, Y_ --> [batch_size, row, col, 1]
-    >>> tl.visualize.images2d(images=np.asarray(X_), second=0.01, saveable=True, name='after', dtype=None)
-    >>> tl.visualize.images2d(images=np.asarray(Y_), second=0.01, saveable=True, name='before', dtype=None)
-
-    - Single array split across ``thread_count`` threads (e.g. functions with ``multi``)
-    >>> X, Y --> [batch_size, row, col, 1]  greyscale
-    >>> data = threading_data(X, zoom_multi, 8, zoom_range=[0.5, 1], is_random=True)
-    ... data --> [batch_size, 2, row, col, 1]
-    >>> X_, Y_ = data.transpose((1,0,2,3,4))
-    ... X_, Y_ --> [batch_size, row, col, 1]
-    >>> tl.visualize.images2d(images=np.asarray(X_), second=0.01, saveable=True, name='after', dtype=None)
-    >>> tl.visualize.images2d(images=np.asarray(Y_), second=0.01, saveable=True, name='before', dtype=None)
-
-    - Customized function for image segmentation
-    >>> def distort_img(data):
-    ...     x, y = data
-    ...     x, y = flip_axis_multi([x, y], axis=0, is_random=True)
-    ...     x, y = flip_axis_multi([x, y], axis=1, is_random=True)
-    ...     x, y = crop_multi([x, y], 100, 100, is_random=True)
-    ...     return x, y
-    >>> X, Y --> [batch_size, row, col, channel]
-    >>> data = threading_data([_ for _ in zip(X, Y)], distort_img)
-    >>> X_, Y_ = data.transpose((1,0,2,3,4))
-
-    References
-    ----------
-    - `python queue <https://pymotw.com/2/Queue/index.html#module-Queue>`_
-    - `run with limited queue <http://effbot.org/librarybook/queue.htm>`_
-    """
-    ## plot function info
-    # for name, value in kwargs.items():
-    #     print('{0} = {1}'.format(name, value))
-    # exit()
-    # define function for threading
-    def apply_fn(results, i, data, kwargs):
-        results[i] = fn(data, **kwargs)
-
-    ## start multi-threaded reading.
-    if thread_count is None: # by Milo
-        results = [None] * len(data) ## preallocate result list
-        threads = []
-        for i in range(len(data)):
-            t = threading.Thread(
-                            name='threading_and_return',
-                            target=apply_fn,
-                            args=(results, i, data[i], kwargs)
-                            )
-            t.start()
-            threads.append(t)
-    else: # by geometrikal
-        divs = np.linspace(0, len(data), thread_count + 1)
-        divs = np.round(divs).astype(int)
-        results = [None] * thread_count
-        threads = []
-        for i in range(thread_count):
-            t = threading.Thread(
-                name='threading_and_return',
-                target=apply_fn,
-                args=(results, i, data[divs[i]:divs[i + 1]], kwargs)
-            )
-            t.start()
-            threads.append(t)
-
-    ## <Milo> wait for all threads to complete
-    for t in threads:
-        t.join()
-
-    if thread_count is None:
-        return np.asarray(results)
-    else:
-        return np.concatenate(results)
-
-
-## Image
-def rotation(x, rg=20, is_random=False, row_index=0, col_index=1, channel_index=2,
-                    fill_mode='nearest', cval=0., order=1):
-    """Rotate an image randomly or non-randomly.
-
-    Parameters
-    -----------
-    x : numpy array
-        An image with dimension of [row, col, channel] (default).
-    rg : int or float
-        Degree to rotate, usually 0 ~ 180.
-    is_random : boolean, default False
-        If True, randomly rotate.
-    row_index, col_index, channel_index : int
-        Index of row, col and channel, default (0, 1, 2), for theano (1, 2, 0).
-    fill_mode : string
-        Method to fill missing pixel, default ‘nearest’, more options ‘constant’, ‘reflect’ or ‘wrap’
-
-        - `scipy ndimage affine_transform <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.ndimage.interpolation.affine_transform.html>`_
-    cval : scalar, optional
-        Value used for points outside the boundaries of the input if mode='constant'. Default is 0.0
-    order : int, optional
-        The order of interpolation. The order has to be in the range 0-5. See ``apply_transform``.
-
-        - `scipy ndimage affine_transform <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.ndimage.interpolation.affine_transform.html>`_
-
-    Examples
-    ---------
-    >>> x --> [row, col, 1] greyscale
-    >>> x = rotation(x, rg=40, is_random=False)
-    >>> tl.visualize.frame(x[:,:,0], second=0.01, saveable=True, name='temp',cmap='gray')
-    """
-    if is_random:
-        theta = np.pi / 180 * np.random.uniform(-rg, rg)
-    else:
-        theta = np.pi /180 * rg
-    rotation_matrix = np.array([[np.cos(theta), -np.sin(theta), 0],
-                                [np.sin(theta), np.cos(theta), 0],
-                                [0, 0, 1]])
-
-    h, w = x.shape[row_index], x.shape[col_index]
-    transform_matrix = transform_matrix_offset_center(rotation_matrix, h, w)
-    x = apply_transform(x, transform_matrix, channel_index, fill_mode, cval, order)
-    return x
-
-def rotation_multi(x, rg=20, is_random=False, row_index=0, col_index=1, channel_index=2,
-                    fill_mode='nearest', cval=0., order=1):
-    """Rotate multiple images with the same arguments, randomly or non-randomly.
-    Usually be used for image segmentation which x=[X, Y], X and Y should be matched.
-
-    Parameters
-    -----------
-    x : list of numpy array
-        List of images with dimension of [n_images, row, col, channel] (default).
-    others : see ``rotation``.
-
-    Examples
-    --------
-    >>> x, y --> [row, col, 1]  greyscale
-    >>> x, y = rotation_multi([x, y], rg=90, is_random=False)
-    >>> tl.visualize.frame(x[:,:,0], second=0.01, saveable=True, name='x',cmap='gray')
-    >>> tl.visualize.frame(y[:,:,0], second=0.01, saveable=True, name='y',cmap='gray')
-    """
-    if is_random:
-        theta = np.pi / 180 * np.random.uniform(-rg, rg)
-    else:
-        theta = np.pi /180 * rg
-    rotation_matrix = np.array([[np.cos(theta), -np.sin(theta), 0],
-                                [np.sin(theta), np.cos(theta), 0],
-                                [0, 0, 1]])
-
-    h, w = x[0].shape[row_index], x[0].shape[col_index]
-    transform_matrix = transform_matrix_offset_center(rotation_matrix, h, w)
-    results = []
-    for data in x:
-        results.append( apply_transform(data, transform_matrix, channel_index, fill_mode, cval, order))
-    return np.asarray(results)
-
-# crop
-def crop(x, wrg, hrg, is_random=False, row_index=0, col_index=1, channel_index=2):
-    """Randomly or centrally crop an image.
-
-    Parameters
-    ----------
-    x : numpy array
-        An image with dimension of [row, col, channel] (default).
-    wrg : float
-        Size of weight.
-    hrg : float
-        Size of height.
-    is_random : boolean, default False
-        If True, randomly crop, else central crop.
-    row_index, col_index, channel_index : int
-        Index of row, col and channel, default (0, 1, 2), for theano (1, 2, 0).
-    """
-    h, w = x.shape[row_index], x.shape[col_index]
-    assert (h > hrg) and (w > wrg), "The size of cropping should smaller than the original image"
-    if is_random:
-        h_offset = int(np.random.uniform(0, h-hrg) -1)
-        w_offset = int(np.random.uniform(0, w-wrg) -1)
-        # print(h_offset, w_offset, x[h_offset: hrg+h_offset ,w_offset: wrg+w_offset].shape)
-        return x[h_offset: hrg+h_offset ,w_offset: wrg+w_offset]
-    else:   # central crop
-        h_offset = int(np.floor((h - hrg)/2.))
-        w_offset = int(np.floor((w - wrg)/2.))
-        h_end = h_offset + hrg
-        w_end = w_offset + wrg
-        return x[h_offset: h_end, w_offset: w_end]
-        # old implementation
-        # h_offset = (h - hrg)/2
-        # w_offset = (w - wrg)/2
-        # # print(x[h_offset: h-h_offset ,w_offset: w-w_offset].shape)
-        # return x[h_offset: h-h_offset ,w_offset: w-w_offset]
-        # central crop
-
-
-def crop_multi(x, wrg, hrg, is_random=False, row_index=0, col_index=1, channel_index=2):
-    """Randomly or centrally crop multiple images.
-
-    Parameters
-    ----------
-    x : list of numpy array
-        List of images with dimension of [n_images, row, col, channel] (default).
-    others : see ``crop``.
-    """
-    h, w = x[0].shape[row_index], x[0].shape[col_index]
-    assert (h > hrg) and (w > wrg), "The size of cropping should smaller than the original image"
-    if is_random:
-        h_offset = int(np.random.uniform(0, h-hrg) -1)
-        w_offset = int(np.random.uniform(0, w-wrg) -1)
-        results = []
-        for data in x:
-            results.append( data[h_offset: hrg+h_offset ,w_offset: wrg+w_offset])
-        return np.asarray(results)
-    else:
-        # central crop
-        h_offset = (h - hrg)/2
-        w_offset = (w - wrg)/2
-        results = []
-        for data in x:
-            results.append( data[h_offset: h-h_offset ,w_offset: w-w_offset] )
-        return np.asarray(results)
-
-# flip
-def flip_axis(x, axis, is_random=False):
-    """Flip the axis of an image, such as flip left and right, up and down, randomly or non-randomly,
-
-    Parameters
-    ----------
-    x : numpy array
-        An image with dimension of [row, col, channel] (default).
-    axis : int
-        - 0, flip up and down
-        - 1, flip left and right
-        - 2, flip channel
-    is_random : boolean, default False
-        If True, randomly flip.
-    """
-    if is_random:
-        factor = np.random.uniform(-1, 1)
-        if factor > 0:
-            x = np.asarray(x).swapaxes(axis, 0)
-            x = x[::-1, ...]
-            x = x.swapaxes(0, axis)
-            return x
-        else:
-            return x
-    else:
-        x = np.asarray(x).swapaxes(axis, 0)
-        x = x[::-1, ...]
-        x = x.swapaxes(0, axis)
-        return x
-
-def flip_axis_multi(x, axis, is_random=False):
-    """Flip the axises of multiple images together, such as flip left and right, up and down, randomly or non-randomly,
-
-    Parameters
-    -----------
-    x : list of numpy array
-        List of images with dimension of [n_images, row, col, channel] (default).
-    others : see ``flip_axis``.
-    """
-    if is_random:
-        factor = np.random.uniform(-1, 1)
-        if factor > 0:
-            # x = np.asarray(x).swapaxes(axis, 0)
-            # x = x[::-1, ...]
-            # x = x.swapaxes(0, axis)
-            # return x
-            results = []
-            for data in x:
-                data = np.asarray(data).swapaxes(axis, 0)
-                data = data[::-1, ...]
-                data = data.swapaxes(0, axis)
-                results.append( data )
-            return np.asarray(results)
-        else:
-            return np.asarray(x)
-    else:
-        # x = np.asarray(x).swapaxes(axis, 0)
-        # x = x[::-1, ...]
-        # x = x.swapaxes(0, axis)
-        # return x
-        results = []
-        for data in x:
-            data = np.asarray(data).swapaxes(axis, 0)
-            data = data[::-1, ...]
-            data = data.swapaxes(0, axis)
-            results.append( data )
-        return np.asarray(results)
-
-# shift
-def shift(x, wrg=0.1, hrg=0.1, is_random=False, row_index=0, col_index=1, channel_index=2,
-                 fill_mode='nearest', cval=0., order=1):
-    """Shift an image randomly or non-randomly.
-
-    Parameters
-    -----------
-    x : numpy array
-        An image with dimension of [row, col, channel] (default).
-    wrg : float
-        Percentage of shift in axis x, usually -0.25 ~ 0.25.
-    hrg : float
-        Percentage of shift in axis y, usually -0.25 ~ 0.25.
-    is_random : boolean, default False
-        If True, randomly shift.
-    row_index, col_index, channel_index : int
-        Index of row, col and channel, default (0, 1, 2), for theano (1, 2, 0).
-    fill_mode : string
-        Method to fill missing pixel, default ‘nearest’, more options ‘constant’, ‘reflect’ or ‘wrap’.
-
-        - `scipy ndimage affine_transform <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.ndimage.interpolation.affine_transform.html>`_
-    cval : scalar, optional
-        Value used for points outside the boundaries of the input if mode='constant'. Default is 0.0.
-    order : int, optional
-        The order of interpolation. The order has to be in the range 0-5. See ``apply_transform``.
-
-        - `scipy ndimage affine_transform <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.ndimage.interpolation.affine_transform.html>`_
-    """
-    h, w = x.shape[row_index], x.shape[col_index]
-    if is_random:
-        tx = np.random.uniform(-hrg, hrg) * h
-        ty = np.random.uniform(-wrg, wrg) * w
-    else:
-        tx, ty = hrg * h, wrg * w
-    translation_matrix = np.array([[1, 0, tx],
-                                   [0, 1, ty],
-                                   [0, 0, 1]])
-
-    transform_matrix = translation_matrix  # no need to do offset
-    x = apply_transform(x, transform_matrix, channel_index, fill_mode, cval, order)
-    return x
-
-def shift_multi(x, wrg=0.1, hrg=0.1, is_random=False, row_index=0, col_index=1, channel_index=2,
-                 fill_mode='nearest', cval=0., order=1):
-    """Shift images with the same arguments, randomly or non-randomly.
-    Usually be used for image segmentation which x=[X, Y], X and Y should be matched.
-
-    Parameters
-    -----------
-    x : list of numpy array
-        List of images with dimension of [n_images, row, col, channel] (default).
-    others : see ``shift``.
-    """
-    h, w = x[0].shape[row_index], x[0].shape[col_index]
-    if is_random:
-        tx = np.random.uniform(-hrg, hrg) * h
-        ty = np.random.uniform(-wrg, wrg) * w
-    else:
-        tx, ty = hrg * h, wrg * w
-    translation_matrix = np.array([[1, 0, tx],
-                                   [0, 1, ty],
-                                   [0, 0, 1]])
-
-    transform_matrix = translation_matrix  # no need to do offset
-    results = []
-    for data in x:
-        results.append( apply_transform(data, transform_matrix, channel_index, fill_mode, cval, order))
-    return np.asarray(results)
-
-# shear
-def shear(x, intensity=0.1, is_random=False, row_index=0, col_index=1, channel_index=2,
-                 fill_mode='nearest', cval=0., order=1):
-    """Shear an image randomly or non-randomly.
-
-    Parameters
-    -----------
-    x : numpy array
-        An image with dimension of [row, col, channel] (default).
-    intensity : float
-        Percentage of shear, usually -0.5 ~ 0.5 (is_random==True), 0 ~ 0.5 (is_random==False),
-        you can have a quick try by shear(X, 1).
-    is_random : boolean, default False
-        If True, randomly shear.
-    row_index, col_index, channel_index : int
-        Index of row, col and channel, default (0, 1, 2), for theano (1, 2, 0).
-    fill_mode : string
-        Method to fill missing pixel, default ‘nearest’, more options ‘constant’, ‘reflect’ or ‘wrap’.
-
-        - `scipy ndimage affine_transform <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.ndimage.interpolation.affine_transform.html>`_
-    cval : scalar, optional
-        Value used for points outside the boundaries of the input if mode='constant'. Default is 0.0.
-    order : int, optional
-        The order of interpolation. The order has to be in the range 0-5. See ``apply_transform``.
-
-        - `scipy ndimage affine_transform <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.ndimage.interpolation.affine_transform.html>`_
-    """
-    if is_random:
-        shear = np.random.uniform(-intensity, intensity)
-    else:
-        shear = intensity
-    shear_matrix = np.array([[1, -np.sin(shear), 0],
-                             [0, np.cos(shear), 0],
-                             [0, 0, 1]])
-
-    h, w = x.shape[row_index], x.shape[col_index]
-    transform_matrix = transform_matrix_offset_center(shear_matrix, h, w)
-    x = apply_transform(x, transform_matrix, channel_index, fill_mode, cval, order)
-    return x
-
-def shear_multi(x, intensity=0.1, is_random=False, row_index=0, col_index=1, channel_index=2,
-                 fill_mode='nearest', cval=0., order=1):
-    """Shear images with the same arguments, randomly or non-randomly.
-    Usually be used for image segmentation which x=[X, Y], X and Y should be matched.
-
-    Parameters
-    -----------
-    x : list of numpy array
-        List of images with dimension of [n_images, row, col, channel] (default).
-    others : see ``shear``.
-    """
-    if is_random:
-        shear = np.random.uniform(-intensity, intensity)
-    else:
-        shear = intensity
-    shear_matrix = np.array([[1, -np.sin(shear), 0],
-                             [0, np.cos(shear), 0],
-                             [0, 0, 1]])
-
-    h, w = x[0].shape[row_index], x[0].shape[col_index]
-    transform_matrix = transform_matrix_offset_center(shear_matrix, h, w)
-    results = []
-    for data in x:
-        results.append( apply_transform(data, transform_matrix, channel_index, fill_mode, cval, order))
-    return np.asarray(results)
-
-# swirl
-def swirl(x, center=None, strength=1, radius=100, rotation=0, output_shape=None, order=1, mode='constant', cval=0, clip=True, preserve_range=False, is_random=False):
-    """Swirl an image randomly or non-randomly, see `scikit-image swirl API <http://scikit-image.org/docs/dev/api/skimage.transform.html#skimage.transform.swirl>`_
-    and `example <http://scikit-image.org/docs/dev/auto_examples/plot_swirl.html>`_.
-
-    Parameters
-    -----------
-    x : numpy array
-        An image with dimension of [row, col, channel] (default).
-    center : (row, column) tuple or (2,) ndarray, optional
-        Center coordinate of transformation.
-    strength : float, optional
-        The amount of swirling applied.
-    radius : float, optional
-        The extent of the swirl in pixels. The effect dies out rapidly beyond radius.
-    rotation : float, (degree) optional
-        Additional rotation applied to the image, usually [0, 360], relates to center.
-    output_shape : tuple (rows, cols), optional
-        Shape of the output image generated. By default the shape of the input image is preserved.
-    order : int, optional
-        The order of the spline interpolation, default is 1. The order has to be in the range 0-5. See skimage.transform.warp for detail.
-    mode : {‘constant’, ‘edge’, ‘symmetric’, ‘reflect’, ‘wrap’}, optional
-        Points outside the boundaries of the input are filled according to the given mode, with ‘constant’ used as the default. Modes match the behaviour of numpy.pad.
-    cval : float, optional
-        Used in conjunction with mode ‘constant’, the value outside the image boundaries.
-    clip : bool, optional
-        Whether to clip the output to the range of values of the input image. This is enabled by default, since higher order interpolation may produce values outside the given input range.
-    preserve_range : bool, optional
-        Whether to keep the original range of values. Otherwise, the input image is converted according to the conventions of img_as_float.
-    is_random : boolean, default False
-        If True, random swirl.
-            - random center = [(0 ~ x.shape[0]), (0 ~ x.shape[1])]
-            - random strength = [0, strength]
-            - random radius = [1e-10, radius]
-            - random rotation = [-rotation, rotation]
-
-    Examples
-    ---------
-    >>> x --> [row, col, 1] greyscale
-    >>> x = swirl(x, strength=4, radius=100)
-    """
-    assert radius != 0, Exception("Invalid radius value")
-    rotation = np.pi / 180 * rotation
-    if is_random:
-        center_h = int(np.random.uniform(0, x.shape[0]))
-        center_w = int(np.random.uniform(0, x.shape[1]))
-        center = (center_h, center_w)
-        strength = np.random.uniform(0, strength)
-        radius = np.random.uniform(1e-10, radius)
-        rotation = np.random.uniform(-rotation, rotation)
-
-    max_v = np.max(x)
-    if max_v > 1:   # Note: the input of this fn should be [-1, 1], rescale is required.
-        x = x / max_v
-    swirled = skimage.transform.swirl(x, center=center, strength=strength, radius=radius, rotation=rotation,
-        output_shape=output_shape, order=order, mode=mode, cval=cval, clip=clip, preserve_range=preserve_range)
-    if max_v > 1:
-        swirled = swirled * max_v
-    return swirled
-
-def swirl_multi(x, center=None, strength=1, radius=100, rotation=0, output_shape=None, order=1, mode='constant', cval=0, clip=True, preserve_range=False, is_random=False):
-    """Swirl multiple images with the same arguments, randomly or non-randomly.
-    Usually be used for image segmentation which x=[X, Y], X and Y should be matched.
-
-    Parameters
-    -----------
-    x : list of numpy array
-        List of images with dimension of [n_images, row, col, channel] (default).
-    others : see ``swirl``.
-    """
-    assert radius != 0, Exception("Invalid radius value")
-    rotation = np.pi / 180 * rotation
-    if is_random:
-        center_h = int(np.random.uniform(0, x[0].shape[0]))
-        center_w = int(np.random.uniform(0, x[0].shape[1]))
-        center = (center_h, center_w)
-        strength = np.random.uniform(0, strength)
-        radius = np.random.uniform(1e-10, radius)
-        rotation = np.random.uniform(-rotation, rotation)
-
-    results = []
-    for data in x:
-        max_v = np.max(data)
-        if max_v > 1:   # Note: the input of this fn should be [-1, 1], rescale is required.
-            data = data / max_v
-        swirled = skimage.transform.swirl(data, center=center, strength=strength, radius=radius, rotation=rotation,
-            output_shape=output_shape, order=order, mode=mode, cval=cval, clip=clip, preserve_range=preserve_range)
-        if max_v > 1:
-            swirled = swirled * max_v
-        results.append( swirled )
-    return np.asarray(results)
-
-# elastic_transform
-
-from scipy.ndimage.interpolation import map_coordinates
-from scipy.ndimage.filters import gaussian_filter
-def elastic_transform(x, alpha, sigma, mode="constant", cval=0, is_random=False):
-    """Elastic deformation of images as described in `[Simard2003] <http://deeplearning.cs.cmu.edu/pdfs/Simard.pdf>`_ .
-
-    Parameters
-    -----------
-    x : numpy array, a greyscale image.
-    alpha : scalar factor.
-    sigma : scalar or sequence of scalars, the smaller the sigma, the more transformation.
-        Standard deviation for Gaussian kernel. The standard deviations of the Gaussian filter are given for each axis as a sequence, or as a single number, in which case it is equal for all axes.
-    mode : default constant, see `scipy.ndimage.filters.gaussian_filter <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.ndimage.filters.gaussian_filter.html>`_.
-    cval : float, optional. Used in conjunction with mode ‘constant’, the value outside the image boundaries.
-    is_random : boolean, default False
-
-    Examples
-    ---------
-    >>> x = elastic_transform(x, alpha = x.shape[1] * 3, sigma = x.shape[1] * 0.07)
-
-    References
-    ------------
-    - `Github <https://gist.github.com/chsasank/4d8f68caf01f041a6453e67fb30f8f5a>`_.
-    - `Kaggle <https://www.kaggle.com/pscion/ultrasound-nerve-segmentation/elastic-transform-for-data-augmentation-0878921a>`_
-    """
-    if is_random is False:
-        random_state = np.random.RandomState(None)
-    else:
-        random_state = np.random.RandomState(int(time.time()))
-    #
-    is_3d = False
-    if len(x.shape) == 3 and x.shape[-1] == 1:
-        x = x[:,:,0]
-        is_3d = True
-    elif len(x.shape) == 3 and x.shape[-1] != 1:
-        raise Exception("Only support greyscale image")
-    assert len(x.shape)==2
-
-    shape = x.shape
-
-    dx = gaussian_filter((random_state.rand(*shape) * 2 - 1), sigma, mode=mode, cval=cval) * alpha
-    dy = gaussian_filter((random_state.rand(*shape) * 2 - 1), sigma, mode=mode, cval=cval) * alpha
-
-    x_, y_ = np.meshgrid(np.arange(shape[0]), np.arange(shape[1]), indexing='ij')
-    indices = np.reshape(x_ + dx, (-1, 1)), np.reshape(y_ + dy, (-1, 1))
-    if is_3d:
-        return map_coordinates(x, indices, order=1).reshape((shape[0], shape[1], 1))
-    else:
-        return map_coordinates(x, indices, order=1).reshape(shape)
-
-def elastic_transform_multi(x, alpha, sigma, mode="constant", cval=0, is_random=False):
-    """Elastic deformation of images as described in `[Simard2003] <http://deeplearning.cs.cmu.edu/pdfs/Simard.pdf>`_.
-
-    Parameters
-    -----------
-    x : list of numpy array
-    others : see ``elastic_transform``.
-    """
-    if is_random is False:
-        random_state = np.random.RandomState(None)
-    else:
-        random_state = np.random.RandomState(int(time.time()))
-
-    shape = x[0].shape
-    if len(shape) == 3:
-        shape = (shape[0], shape[1])
-    new_shape = random_state.rand(*shape)
-
-    results = []
-    for data in x:
-        is_3d = False
-        if len(data.shape) == 3 and data.shape[-1] == 1:
-            data = data[:,:,0]
-            is_3d = True
-        elif len(data.shape) == 3 and data.shape[-1] != 1:
-            raise Exception("Only support greyscale image")
-        assert len(data.shape)==2
-
-        dx = gaussian_filter((new_shape * 2 - 1), sigma, mode=mode, cval=cval) * alpha
-        dy = gaussian_filter((new_shape * 2 - 1), sigma, mode=mode, cval=cval) * alpha
-
-        x_, y_ = np.meshgrid(np.arange(shape[0]), np.arange(shape[1]), indexing='ij')
-        indices = np.reshape(x_ + dx, (-1, 1)), np.reshape(y_ + dy, (-1, 1))
-        # print(data.shape)
-        if is_3d:
-            results.append( map_coordinates(data, indices, order=1).reshape((shape[0], shape[1], 1)))
-        else:
-            results.append( map_coordinates(data, indices, order=1).reshape(shape) )
-    return np.asarray(results)
-
-# zoom
-def zoom(x, zoom_range=(0.9, 1.1), is_random=False, row_index=0, col_index=1, channel_index=2,
-                fill_mode='nearest', cval=0., order=1):
-    """Zoom in and out of a single image, randomly or non-randomly.
-
-    Parameters
-    -----------
-    x : numpy array
-        An image with dimension of [row, col, channel] (default).
-    zoom_range : list or tuple
-        - If is_random=False, (h, w) are the fixed zoom factor for row and column axies, factor small than one is zoom in.
-        - If is_random=True, (min zoom out, max zoom out) for x and y with different random zoom in/out factor.
-        e.g (0.5, 1) zoom in 1~2 times.
-    is_random : boolean, default False
-        If True, randomly zoom.
-    row_index, col_index, channel_index : int
-        Index of row, col and channel, default (0, 1, 2), for theano (1, 2, 0).
-    fill_mode : string
-        Method to fill missing pixel, default ‘nearest’, more options ‘constant’, ‘reflect’ or ‘wrap’.
-
-        - `scipy ndimage affine_transform <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.ndimage.interpolation.affine_transform.html>`_
-    cval : scalar, optional
-        Value used for points outside the boundaries of the input if mode='constant'. Default is 0.0.
-    order : int, optional
-        The order of interpolation. The order has to be in the range 0-5. See ``apply_transform``.
-
-        - `scipy ndimage affine_transform <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.ndimage.interpolation.affine_transform.html>`_
-    """
-    if len(zoom_range) != 2:
-        raise Exception('zoom_range should be a tuple or list of two floats. '
-                        'Received arg: ', zoom_range)
-    if is_random:
-        if zoom_range[0] == 1 and zoom_range[1] == 1:
-            zx, zy = 1, 1
-            print(" random_zoom : not zoom in/out")
-        else:
-            zx, zy = np.random.uniform(zoom_range[0], zoom_range[1], 2)
-    else:
-        zx, zy = zoom_range
-    # print(zx, zy)
-    zoom_matrix = np.array([[zx, 0, 0],
-                            [0, zy, 0],
-                            [0, 0, 1]])
-
-    h, w = x.shape[row_index], x.shape[col_index]
-    transform_matrix = transform_matrix_offset_center(zoom_matrix, h, w)
-    x = apply_transform(x, transform_matrix, channel_index, fill_mode, cval, order)
-    return x
-
-def zoom_multi(x, zoom_range=(0.9, 1.1), is_random=False,
-        row_index=0, col_index=1, channel_index=2, fill_mode='nearest', cval=0., order=1):
-    """Zoom in and out of images with the same arguments, randomly or non-randomly.
-    Usually be used for image segmentation which x=[X, Y], X and Y should be matched.
-
-    Parameters
-    -----------
-    x : list of numpy array
-        List of images with dimension of [n_images, row, col, channel] (default).
-    others : see ``zoom``.
-    """
-    if len(zoom_range) != 2:
-        raise Exception('zoom_range should be a tuple or list of two floats. '
-                        'Received arg: ', zoom_range)
-
-    if is_random:
-        if zoom_range[0] == 1 and zoom_range[1] == 1:
-            zx, zy = 1, 1
-            print(" random_zoom : not zoom in/out")
-        else:
-            zx, zy = np.random.uniform(zoom_range[0], zoom_range[1], 2)
-    else:
-        zx, zy = zoom_range
-
-    zoom_matrix = np.array([[zx, 0, 0],
-                            [0, zy, 0],
-                            [0, 0, 1]])
-
-    h, w = x[0].shape[row_index], x[0].shape[col_index]
-    transform_matrix = transform_matrix_offset_center(zoom_matrix, h, w)
-    # x = apply_transform(x, transform_matrix, channel_index, fill_mode, cval)
-    # return x
-    results = []
-    for data in x:
-        results.append( apply_transform(data, transform_matrix, channel_index, fill_mode, cval, order))
-    return np.asarray(results)
-
-# image = tf.image.random_brightness(image, max_delta=32. / 255.)
-# image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
-# image = tf.image.random_hue(image, max_delta=0.032)
-# image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
-
-# brightness
-def brightness(x, gamma=1, gain=1, is_random=False):
-    """Change the brightness of a single image, randomly or non-randomly.
-
-    Parameters
-    -----------
-    x : numpy array
-        An image with dimension of [row, col, channel] (default).
-    gamma : float, small than 1 means brighter.
-        Non negative real number. Default value is 1, smaller means brighter.
-
-        - If is_random is True, gamma in a range of (1-gamma, 1+gamma).
-    gain : float
-        The constant multiplier. Default value is 1.
-    is_random : boolean, default False
-        - If True, randomly change brightness.
-
-    References
-    -----------
-    - `skimage.exposure.adjust_gamma <http://scikit-image.org/docs/dev/api/skimage.exposure.html>`_
-    - `chinese blog <http://www.cnblogs.com/denny402/p/5124402.html>`_
-    """
-    if is_random:
-        gamma = np.random.uniform(1-gamma, 1+gamma)
-    x = exposure.adjust_gamma(x, gamma, gain)
-    return x
-
-def brightness_multi(x, gamma=1, gain=1, is_random=False):
-    """Change the brightness of multiply images, randomly or non-randomly.
-    Usually be used for image segmentation which x=[X, Y], X and Y should be matched.
-
-    Parameters
-    -----------
-    x : list of numpy array
-        List of images with dimension of [n_images, row, col, channel] (default).
-    others : see ``brightness``.
-    """
-    if is_random:
-        gamma = np.random.uniform(1-gamma, 1+gamma)
-
-    results = []
-    for data in x:
-        results.append( exposure.adjust_gamma(data, gamma, gain) )
-    return np.asarray(results)
-
-
-# contrast
-def constant(x, cutoff=0.5, gain=10, inv=False, is_random=False):
-    # TODO
-    x = exposure.adjust_sigmoid(x, cutoff=cutoff, gain=gain, inv=inv)
-    return x
-
-def constant_multi():
-    #TODO
-    pass
-
-# resize
-def imresize(x, size=[100, 100], interp='‘bicubic', mode=None):
-    """Resize an image by given output size and method. Warning, this function
-    will rescale the value to [0, 255].
-
-    Parameters
-    -----------
-    x : numpy array
-        An image with dimension of [row, col, channel] (default).
-    size : int, float or tuple (h, w)
-        - int, Percentage of current size.
-        - float, Fraction of current size.
-        - tuple, Size of the output image.
-    interp : str, optional
-        Interpolation to use for re-sizing (‘nearest’, ‘lanczos’, ‘bilinear’, ‘bicubic’ or ‘cubic’).
-    mode : str, optional
-        The PIL image mode (‘P’, ‘L’, etc.) to convert arr before resizing.
-
-    Returns
-    --------
-    imresize : ndarray
-    The resized array of image.
-
-    References
-    ------------
-    - `scipy.misc.imresize <https://docs.scipy.org/doc/scipy/reference/generated/scipy.misc.imresize.html>`_
-    """
-    if x.shape[-1] == 1:
-        # greyscale
-        x = scipy.misc.imresize(x[:,:,0], size, interp=interp, mode=mode)
-        return x[:, :, np.newaxis]
-    elif x.shape[-1] == 3:
-        # rgb, bgr ..
-        return scipy.misc.imresize(x, size, interp=interp, mode=mode)
-    else:
-        raise Exception("Unsupported channel %d" % x.shape[-1])
-
-# normailization
-def samplewise_norm(x, rescale=None, samplewise_center=False, samplewise_std_normalization=False,
-            channel_index=2, epsilon=1e-7):
-    """Normalize an image by rescale, samplewise centering and samplewise centering in order.
-
-    Parameters
-    -----------
-    x : numpy array
-        An image with dimension of [row, col, channel] (default).
-    rescale : rescaling factor.
-            If None or 0, no rescaling is applied, otherwise we multiply the data by the value provided (before applying any other transformation)
-    samplewise_center : set each sample mean to 0.
-    samplewise_std_normalization : divide each input by its std.
-    epsilon : small position value for dividing standard deviation.
-
-    Examples
-    --------
-    >>> x = samplewise_norm(x, samplewise_center=True, samplewise_std_normalization=True)
-    >>> print(x.shape, np.mean(x), np.std(x))
-    ... (160, 176, 1), 0.0, 1.0
-
-    Notes
-    ------
-    When samplewise_center and samplewise_std_normalization are True.
-
-    - For greyscale image, every pixels are subtracted and divided by the mean and std of whole image.
-    - For RGB image, every pixels are subtracted and divided by the mean and std of this pixel i.e. the mean and std of a pixel is 0 and 1.
-    """
-    if rescale:
-        x *= rescale
-
-    if x.shape[channel_index] == 1:
-        # greyscale
-        if samplewise_center:
-            x = x - np.mean(x)
-        if samplewise_std_normalization:
-            x = x / np.std(x)
-        return x
-    elif x.shape[channel_index] == 3:
-        # rgb
-        if samplewise_center:
-            x = x - np.mean(x, axis=channel_index, keepdims=True)
-        if samplewise_std_normalization:
-            x = x / (np.std(x, axis=channel_index, keepdims=True) + epsilon)
-        return x
-    else:
-        raise Exception("Unsupported channels %d" % x.shape[channel_index])
-
-def featurewise_norm(x, mean=None, std=None, epsilon=1e-7):
-    """Normalize every pixels by the same given mean and std, which are usually
-    compute from all examples.
-
-    Parameters
-    -----------
-    x : numpy array
-        An image with dimension of [row, col, channel] (default).
-    mean : value for subtraction.
-    std : value for division.
-    epsilon : small position value for dividing standard deviation.
-    """
-    if mean:
-        x = x - mean
-    if std:
-        x = x / (std + epsilon)
-    return x
-
-# whitening
-def get_zca_whitening_principal_components_img(X):
-    """Return the ZCA whitening principal components matrix.
-
-    Parameters
-    -----------
-    x : numpy array
-        Batch of image with dimension of [n_example, row, col, channel] (default).
-    """
-    flatX = np.reshape(X, (X.shape[0], X.shape[1] * X.shape[2] * X.shape[3]))
-    print("zca : computing sigma ..")
-    sigma = np.dot(flatX.T, flatX) / flatX.shape[0]
-    print("zca : computing U, S and V ..")
-    U, S, V = linalg.svd(sigma)
-    print("zca : computing principal components ..")
-    principal_components = np.dot(np.dot(U, np.diag(1. / np.sqrt(S + 10e-7))), U.T)
-    return principal_components
-
-def zca_whitening(x, principal_components):
-    """Apply ZCA whitening on an image by given principal components matrix.
-
-    Parameters
-    -----------
-    x : numpy array
-        An image with dimension of [row, col, channel] (default).
-    principal_components : matrix from ``get_zca_whitening_principal_components_img``.
-    """
-    flatx = np.reshape(x, (x.size))
-    # print(principal_components.shape, x.shape)  # ((28160, 28160), (160, 176, 1))
-    # flatx = np.reshape(x, (x.shape))
-    # flatx = np.reshape(x, (x.shape[0], ))
-    # print(flatx.shape)  # (160, 176, 1)
-    whitex = np.dot(flatx, principal_components)
-    x = np.reshape(whitex, (x.shape[0], x.shape[1], x.shape[2]))
-    return x
-
-# developing
-# def barrel_transform(x, intensity):
-#     # https://github.com/fchollet/keras/blob/master/keras/preprocessing/image.py
-#     # TODO
-#     pass
-#
-# def barrel_transform_multi(x, intensity):
-#     # https://github.com/fchollet/keras/blob/master/keras/preprocessing/image.py
-#     # TODO
-#     pass
-
-# channel shift
-def channel_shift(x, intensity, is_random=False, channel_index=2):
-    """Shift the channels of an image, randomly or non-randomly, see `numpy.rollaxis <https://docs.scipy.org/doc/numpy/reference/generated/numpy.rollaxis.html>`_.
-
-    Parameters
-    -----------
-    x : numpy array
-        An image with dimension of [row, col, channel] (default).
-    intensity : float
-        Intensity of shifting.
-    is_random : boolean, default False
-        If True, randomly shift.
-    channel_index : int
-        Index of channel, default 2.
-    """
-    if is_random:
-        factor = np.random.uniform(-intensity, intensity)
-    else:
-        factor = intensity
-    x = np.rollaxis(x, channel_index, 0)
-    min_x, max_x = np.min(x), np.max(x)
-    channel_images = [np.clip(x_channel + factor, min_x, max_x)
-                      for x_channel in x]
-    x = np.stack(channel_images, axis=0)
-    x = np.rollaxis(x, 0, channel_index+1)
-    return x
-    # x = np.rollaxis(x, channel_index, 0)
-    # min_x, max_x = np.min(x), np.max(x)
-    # channel_images = [np.clip(x_channel + np.random.uniform(-intensity, intensity), min_x, max_x)
-    #                   for x_channel in x]
-    # x = np.stack(channel_images, axis=0)
-    # x = np.rollaxis(x, 0, channel_index+1)
-    # return x
-
-def channel_shift_multi(x, intensity, channel_index=2):
-    """Shift the channels of images with the same arguments, randomly or non-randomly, see `numpy.rollaxis <https://docs.scipy.org/doc/numpy/reference/generated/numpy.rollaxis.html>`_ .
-    Usually be used for image segmentation which x=[X, Y], X and Y should be matched.
-
-    Parameters
-    -----------
-    x : list of numpy array
-        List of images with dimension of [n_images, row, col, channel] (default).
-    others : see ``channel_shift``.
-    """
-    if is_random:
-        factor = np.random.uniform(-intensity, intensity)
-    else:
-        factor = intensity
-
-    results = []
-    for data in x:
-        data = np.rollaxis(data, channel_index, 0)
-        min_x, max_x = np.min(data), np.max(data)
-        channel_images = [np.clip(x_channel + factor, min_x, max_x)
-                          for x_channel in x]
-        data = np.stack(channel_images, axis=0)
-        data = np.rollaxis(x, 0, channel_index+1)
-        results.append( data )
-    return np.asarray(results)
-
-# noise
-def drop(x, keep=0.5):
-    """Randomly set some pixels to zero by a given keeping probability.
-
-    Parameters
-    -----------
-    x : numpy array
-        An image with dimension of [row, col, channel] or [row, col].
-    keep : float (0, 1)
-        The keeping probability, the lower more values will be set to zero.
-    """
-    if len(x.shape) == 3:
-        if x.shape[-1]==3: # color
-            img_size = x.shape
-            mask = np.random.binomial(n=1, p=keep, size=x.shape[:-1])
-            for i in range(3):
-                x[:,:,i] = np.multiply(x[:,:,i] , mask)
-        elif x.shape[-1]==1: # greyscale image
-            img_size = x.shape
-            x = np.multiply(x , np.random.binomial(n=1, p=keep, size=img_size))
-        else:
-            raise Exception("Unsupported shape {}".format(x.shape))
-    elif len(x.shape) == 2 or 1: # greyscale matrix (image) or vector
-        img_size = x.shape
-        x = np.multiply(x , np.random.binomial(n=1, p=keep, size=img_size))
-    else:
-        raise Exception("Unsupported shape {}".format(x.shape))
-    return x
-
-# x = np.asarray([[1,2,3,4,5,6,7,8,9,10],[1,2,3,4,5,6,7,8,9,10]])
-# x = np.asarray([x,x,x,x,x,x])
-# x.shape = 10, 4, 3
-# # print(x)
-# # exit()
-# print(x.shape)
-# # exit()
-# print(drop(x, keep=1.))
-# exit()
-
-# manual transform
-def transform_matrix_offset_center(matrix, x, y):
-    """Return transform matrix offset center.
-
-    Parameters
-    ----------
-    matrix : numpy array
-        Transform matrix
-    x, y : int
-        Size of image.
-
-    Examples
-    --------
-    - See ``rotation``, ``shear``, ``zoom``.
-    """
-    o_x = float(x) / 2 + 0.5
-    o_y = float(y) / 2 + 0.5
-    offset_matrix = np.array([[1, 0, o_x], [0, 1, o_y], [0, 0, 1]])
-    reset_matrix = np.array([[1, 0, -o_x], [0, 1, -o_y], [0, 0, 1]])
-    transform_matrix = np.dot(np.dot(offset_matrix, matrix), reset_matrix)
-    return transform_matrix
-
-
-def apply_transform(x, transform_matrix, channel_index=2, fill_mode='nearest', cval=0., order=1):
-    """Return transformed images by given transform_matrix from ``transform_matrix_offset_center``.
-
-    Parameters
-    ----------
-    x : numpy array
-        Batch of images with dimension of 3, [batch_size, row, col, channel].
-    transform_matrix : numpy array
-        Transform matrix (offset center), can be generated by ``transform_matrix_offset_center``
-    channel_index : int
-        Index of channel, default 2.
-    fill_mode : string
-        Method to fill missing pixel, default ‘nearest’, more options ‘constant’, ‘reflect’ or ‘wrap’
-
-        - `scipy ndimage affine_transform <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.ndimage.interpolation.affine_transform.html>`_
-    cval : scalar, optional
-        Value used for points outside the boundaries of the input if mode='constant'. Default is 0.0
-    order : int, optional
-        The order of interpolation. The order has to be in the range 0-5:
-
-        - 0 Nearest-neighbor
-        - 1 Bi-linear (default)
-        - 2 Bi-quadratic
-        - 3 Bi-cubic
-        - 4 Bi-quartic
-        - 5 Bi-quintic
-
-        - `scipy ndimage affine_transform <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.ndimage.interpolation.affine_transform.html>`_
-
-    Examples
-    --------
-    - See ``rotation``, ``shift``, ``shear``, ``zoom``.
-    """
-    x = np.rollaxis(x, channel_index, 0)
-    final_affine_matrix = transform_matrix[:2, :2]
-    final_offset = transform_matrix[:2, 2]
-    channel_images = [ndi.interpolation.affine_transform(x_channel, final_affine_matrix,
-                      final_offset, order=order, mode=fill_mode, cval=cval) for x_channel in x]
-    x = np.stack(channel_images, axis=0)
-    x = np.rollaxis(x, 0, channel_index+1)
-    return x
-
-
-def projective_transform_by_points(x, src, dst, map_args={}, output_shape=None, order=1, mode='constant', cval=0.0, clip=True, preserve_range=False):
-    """Projective transform by given coordinates, usually 4 coordinates. see `scikit-image <http://scikit-image.org/docs/dev/auto_examples/applications/plot_geometric.html>`_.
-
-    Parameters
-    -----------
-    x : numpy array
-        An image with dimension of [row, col, channel] (default).
-    src : list or numpy
-        The original coordinates, usually 4 coordinates of (x, y).
-    dst : list or numpy
-        The coordinates after transformation, the number of coordinates is the same with src.
-    map_args : dict, optional
-        Keyword arguments passed to inverse_map.
-    output_shape : tuple (rows, cols), optional
-        Shape of the output image generated. By default the shape of the input image is preserved. Note that, even for multi-band images, only rows and columns need to be specified.
-    order : int, optional
-        The order of interpolation. The order has to be in the range 0-5:
-
-        - 0 Nearest-neighbor
-        - 1 Bi-linear (default)
-        - 2 Bi-quadratic
-        - 3 Bi-cubic
-        - 4 Bi-quartic
-        - 5 Bi-quintic
-    mode : {‘constant’, ‘edge’, ‘symmetric’, ‘reflect’, ‘wrap’}, optional
-        Points outside the boundaries of the input are filled according to the given mode. Modes match the behaviour of numpy.pad.
-    cval : float, optional
-        Used in conjunction with mode ‘constant’, the value outside the image boundaries.
-    clip : bool, optional
-        Whether to clip the output to the range of values of the input image. This is enabled by default, since higher order interpolation may produce values outside the given input range.
-    preserve_range : bool, optional
-        Whether to keep the original range of values. Otherwise, the input image is converted according to the conventions of img_as_float.
-
-    Examples
-    --------
-    >>> Assume X is an image from CIFAR 10, i.e. shape == (32, 32, 3)
-    >>> src = [[0,0],[0,32],[32,0],[32,32]]
-    >>> dst = [[10,10],[0,32],[32,0],[32,32]]
-    >>> x = projective_transform_by_points(X, src, dst)
-
-    References
-    -----------
-    - `scikit-image : geometric transformations <http://scikit-image.org/docs/dev/auto_examples/applications/plot_geometric.html>`_
-    - `scikit-image : examples <http://scikit-image.org/docs/dev/auto_examples/index.html>`_
-    """
-    if type(src) is list:   # convert to numpy
-        src = np.array(src)
-    if type(dst) is list:
-        dst = np.array(dst)
-    if np.max(x)>1:         # convert to [0, 1]
-        x = x/255
-
-    m = transform.ProjectiveTransform()
-    m.estimate(dst, src)
-    warped = transform.warp(x, m,  map_args=map_args, output_shape=output_shape, order=order, mode=mode, cval=cval, clip=clip, preserve_range=preserve_range)
-    return warped
-
-# Numpy and PIL
-def array_to_img(x, dim_ordering=(0,1,2), scale=True):
-    """Converts a numpy array to PIL image object (uint8 format).
-
-    Parameters
-    ----------
-    x : numpy array
-        A image with dimension of 3 and channels of 1 or 3.
-    dim_ordering : list or tuple of 3 int
-        Index of row, col and channel, default (0, 1, 2), for theano (1, 2, 0).
-    scale : boolean, default is True
-        If True, converts image to [0, 255] from any range of value like [-1, 2].
-
-    References
-    -----------
-    - `PIL Image.fromarray <http://pillow.readthedocs.io/en/3.1.x/reference/Image.html?highlight=fromarray>`_
-    """
-    from PIL import Image
-    # if dim_ordering == 'default':
-    #     dim_ordering = K.image_dim_ordering()
-    # if dim_ordering == 'th':  # theano
-    #     x = x.transpose(1, 2, 0)
-    x = x.transpose(dim_ordering)
-    if scale:
-        x += max(-np.min(x), 0)
-        x_max = np.max(x)
-        if x_max != 0:
-            # print(x_max)
-            # x /= x_max
-            x = x / x_max
-        x *= 255
-    if x.shape[2] == 3:
-        # RGB
-        return Image.fromarray(x.astype('uint8'), 'RGB')
-    elif x.shape[2] == 1:
-        # grayscale
-        return Image.fromarray(x[:, :, 0].astype('uint8'), 'L')
-    else:
-        raise Exception('Unsupported channel number: ', x.shape[2])
-
-
-
-
-def find_contours(x, level=0.8, fully_connected='low', positive_orientation='low'):
-    """ Find iso-valued contours in a 2D array for a given level value, returns list of (n, 2)-ndarrays
-    see `skimage.measure.find_contours <http://scikit-image.org/docs/dev/api/skimage.measure.html#skimage.measure.find_contours>`_ .
-
-    Parameters
-    ------------
-    x : 2D ndarray of double. Input data in which to find contours.
-    level : float. Value along which to find contours in the array.
-    fully_connected : str, {‘low’, ‘high’}.  Indicates whether array elements below the given level value are to be considered fully-connected (and hence elements above the value will only be face connected), or vice-versa. (See notes below for details.)
-    positive_orientation : either ‘low’ or ‘high’. Indicates whether the output contours will produce positively-oriented polygons around islands of low- or high-valued elements. If ‘low’ then contours will wind counter-clockwise around elements below the iso-value. Alternately, this means that low-valued elements are always on the left of the contour.
-    """
-    return skimage.measure.find_contours(x, level, fully_connected='low', positive_orientation='low')
-
-def pt2map(list_points=[], size=(100, 100), val=1):
-    """ Inputs a list of points, return a 2D image.
-
-    Parameters
-    --------------
-    list_points : list of [x, y].
-    size : tuple of (w, h) for output size.
-    val : float or int for the contour value.
-    """
-    i_m = np.zeros(size)
-    if list_points == []:
-        return i_m
-    for xx in list_points:
-        for x in xx:
-            # print(x)
-            i_m[int(np.round(x[0]))][int(np.round(x[1]))] = val
-    return i_m
-
-def binary_dilation(x, radius=3):
-    """ Return fast binary morphological dilation of an image.
-    see `skimage.morphology.binary_dilation <http://scikit-image.org/docs/dev/api/skimage.morphology.html#skimage.morphology.binary_dilation>`_.
-
-    Parameters
-    -----------
-    x : 2D array image.
-    radius : int for the radius of mask.
-    """
-    from skimage.morphology import disk, binary_dilation
-    mask = disk(radius)
-    x = binary_dilation(x, selem=mask)
-    return x
-
-def dilation(x, radius=3):
-    """ Return greyscale morphological dilation of an image,
-    see `skimage.morphology.dilation <http://scikit-image.org/docs/dev/api/skimage.morphology.html#skimage.morphology.dilation>`_.
-
-    Parameters
-    -----------
-    x : 2D array image.
-    radius : int for the radius of mask.
-    """
-    from skimage.morphology import disk, dilation
-    mask = disk(radius)
-    x = dilation(x, selem=mask)
-    return x
-
-
-def binary_erosion(x, radius=3):
-    """ Return binary morphological erosion of an image,
-    see `skimage.morphology.binary_erosion <http://scikit-image.org/docs/dev/api/skimage.morphology.html#skimage.morphology.binary_erosion>`_.
-
-    Parameters
-    -----------
-    x : 2D array image.
-    radius : int for the radius of mask.
-    """
-    from skimage.morphology import disk, dilation, binary_erosion
-    mask = disk(radius)
-    x = binary_erosion(x, selem=mask)
-    return x
-
-def erosion(x, radius=3):
-    """ Return greyscale morphological erosion of an image,
-    see `skimage.morphology.erosion <http://scikit-image.org/docs/dev/api/skimage.morphology.html#skimage.morphology.erosion>`_.
-
-    Parameters
-    -----------
-    x : 2D array image.
-    radius : int for the radius of mask.
-    """
-    from skimage.morphology import disk, dilation, erosion
-    mask = disk(radius)
-    x = erosion(x, selem=mask)
-    return x
-
-
-## Sequence
-def pad_sequences(sequences, maxlen=None, dtype='int32', padding='post', truncating='pre', value=0.):
-    """Pads each sequence to the same length:
-    the length of the longest sequence.
-    If maxlen is provided, any sequence longer
-    than maxlen is truncated to maxlen.
-    Truncation happens off either the beginning (default) or
-    the end of the sequence.
-    Supports post-padding and pre-padding (default).
-
-    Parameters
-    ----------
-    sequences : list of lists where each element is a sequence
-    maxlen : int, maximum length
-    dtype : type to cast the resulting sequence.
-    padding : 'pre' or 'post', pad either before or after each sequence.
-    truncating : 'pre' or 'post', remove values from sequences larger than
-        maxlen either in the beginning or in the end of the sequence
-    value : float, value to pad the sequences to the desired value.
-
-    Returns
-    ----------
-    x : numpy array with dimensions (number_of_sequences, maxlen)
-
-    Examples
-    ----------
-    >>> sequences = [[1,1,1,1,1],[2,2,2],[3,3]]
-    >>> sequences = pad_sequences(sequences, maxlen=None, dtype='int32',
-    ...                  padding='post', truncating='pre', value=0.)
-    ... [[1 1 1 1 1]
-    ...  [2 2 2 0 0]
-    ...  [3 3 0 0 0]]
-    """
-    lengths = [len(s) for s in sequences]
-
-    nb_samples = len(sequences)
-    if maxlen is None:
-        maxlen = np.max(lengths)
-
-    # take the sample shape from the first non empty sequence
-    # checking for consistency in the main loop below.
-    sample_shape = tuple()
-    for s in sequences:
-        if len(s) > 0:
-            sample_shape = np.asarray(s).shape[1:]
-            break
-
-    x = (np.ones((nb_samples, maxlen) + sample_shape) * value).astype(dtype)
-    for idx, s in enumerate(sequences):
-        if len(s) == 0:
-            continue  # empty list was found
-        if truncating == 'pre':
-            trunc = s[-maxlen:]
-        elif truncating == 'post':
-            trunc = s[:maxlen]
-        else:
-            raise ValueError('Truncating type "%s" not understood' % truncating)
-
-        # check `trunc` has expected shape
-        trunc = np.asarray(trunc, dtype=dtype)
-        if trunc.shape[1:] != sample_shape:
-            raise ValueError('Shape of sample %s of sequence at position %s is different from expected shape %s' %
-                             (trunc.shape[1:], idx, sample_shape))
-
-        if padding == 'post':
-            x[idx, :len(trunc)] = trunc
-        elif padding == 'pre':
-            x[idx, -len(trunc):] = trunc
-        else:
-            raise ValueError('Padding type "%s" not understood' % padding)
-    return x.tolist()
-
-def remove_pad_sequences(sequences, pad_id=0):
-    """Remove padding.
-
-    Parameters
-    -----------
-    sequences : list of list.
-    pad_id : int.
-
-    Examples
-    ----------
-    >>> sequences = [[2,3,4,0,0], [5,1,2,3,4,0,0,0], [4,5,0,2,4,0,0,0]]
-    >>> print(remove_pad_sequences(sequences, pad_id=0))
-    ... [[2, 3, 4], [5, 1, 2, 3, 4], [4, 5, 0, 2, 4]]
-    """
-    import copy
-    sequences_out = copy.deepcopy(sequences)
-    for i in range(len(sequences)):
-        # for j in range(len(sequences[i])):
-        #     if sequences[i][j] == pad_id:
-        #         sequences_out[i] = sequences_out[i][:j]
-        #         break
-        for j in range(1, len(sequences[i])):
-            if sequences[i][-j] != pad_id:
-                sequences_out[i] = sequences_out[i][0:-j+1]
-                break
-    return sequences_out
-
-def process_sequences(sequences, end_id=0, pad_val=0, is_shorten=True, remain_end_id=False):
-    """Set all tokens(ids) after END token to the padding value, and then shorten (option) it to the maximum sequence length in this batch.
-
-    Parameters
-    -----------
-    sequences : numpy array or list of list with token IDs.
-        e.g. [[4,3,5,3,2,2,2,2], [5,3,9,4,9,2,2,3]]
-    end_id : int, the special token for END.
-    pad_val : int, replace the end_id and the ids after end_id to this value.
-    is_shorten : boolean, default True.
-        Shorten the sequences.
-    remain_end_id : boolean, default False.
-        Keep an end_id in the end.
-
-    Examples
-    ---------
-    >>> sentences_ids = [[4, 3, 5, 3, 2, 2, 2, 2],  <-- end_id is 2
-    ...                  [5, 3, 9, 4, 9, 2, 2, 3]]  <-- end_id is 2
-    >>> sentences_ids = precess_sequences(sentences_ids, end_id=vocab.end_id, pad_val=0, is_shorten=True)
-    ... [[4, 3, 5, 3, 0], [5, 3, 9, 4, 9]]
-    """
-    max_length = 0
-    for i_s, seq in enumerate(sequences):
-        is_end = False
-        for i_w, n in enumerate(seq):
-            if n == end_id and is_end == False: # 1st time to see end_id
-                is_end = True
-                if max_length < i_w:
-                    max_length = i_w
-                if remain_end_id is False:
-                    seq[i_w] = pad_val      # set end_id to pad_val
-            elif is_end == True:
-                seq[i_w] = pad_val
-
-    if remain_end_id is True:
-        max_length += 1
-    if is_shorten:
-        for i, seq in enumerate(sequences):
-            sequences[i] = seq[:max_length]
-    return sequences
-
-def sequences_add_start_id(sequences, start_id=0, remove_last=False):
-    """Add special start token(id) in the beginning of each sequence.
-
-    Examples
-    ---------
-    >>> sentences_ids = [[4,3,5,3,2,2,2,2], [5,3,9,4,9,2,2,3]]
-    >>> sentences_ids = sequences_add_start_id(sentences_ids, start_id=2)
-    ... [[2, 4, 3, 5, 3, 2, 2, 2, 2], [2, 5, 3, 9, 4, 9, 2, 2, 3]]
-    >>> sentences_ids = sequences_add_start_id(sentences_ids, start_id=2, remove_last=True)
-    ... [[2, 4, 3, 5, 3, 2, 2, 2], [2, 5, 3, 9, 4, 9, 2, 2]]
-
-    - For Seq2seq
-    >>> input = [a, b, c]
-    >>> target = [x, y, z]
-    >>> decode_seq = [start_id, a, b] <-- sequences_add_start_id(input, start_id, True)
-    """
-    sequences_out = [[] for _ in range(len(sequences))]#[[]] * len(sequences)
-    for i in range(len(sequences)):
-        if remove_last:
-            sequences_out[i] = [start_id] + sequences[i][:-1]
-        else:
-            sequences_out[i] = [start_id] + sequences[i]
-    return sequences_out
-
-def sequences_add_end_id(sequences, end_id=888):
-    """Add special end token(id) in the end of each sequence.
-
-    Parameters
-    -----------
-    sequences : list of list.
-    end_id : int.
-
-    Examples
-    ---------
-    >>> sequences = [[1,2,3],[4,5,6,7]]
-    >>> print(sequences_add_end_id(sequences, end_id=999))
-    ... [[1, 2, 3, 999], [4, 5, 6, 999]]
-    """
-    sequences_out = [[] for _ in range(len(sequences))]#[[]] * len(sequences)
-    for i in range(len(sequences)):
-        sequences_out[i] = sequences[i] + [end_id]
-    return sequences_out
-
-
-def sequences_add_end_id_after_pad(sequences, end_id=888, pad_id=0):
-    """Add special end token(id) in the end of each sequence.
-
-    Parameters
-    -----------
-    sequences : list of list.
-    end_id : int.
-    pad_id : int.
-
-    Examples
-    ---------
-    >>> sequences = [[1,2,0,0], [1,2,3,0], [1,2,3,4]]
-    >>> print(sequences_add_end_id_after_pad(sequences, end_id=99, pad_id=0))
-    ... [[1, 2, 99, 0], [1, 2, 3, 99], [1, 2, 3, 4]]
-    """
-    # sequences_out = [[] for _ in range(len(sequences))]#[[]] * len(sequences)
-    import copy
-    sequences_out = copy.deepcopy(sequences)
-    # # add a pad to all
-    # for i in range(len(sequences)):
-    #     for j in range(len(sequences[i])):
-    #         sequences_out[i].append(pad_id)
-    # # pad -- > end
-    # max_len = 0
-    for i in range(len(sequences)):
-        for j in range(len(sequences[i])):
-            if sequences[i][j] == pad_id:
-                sequences_out[i][j] = end_id
-                # if j > max_len:
-                #     max_len = j
-                break
-    # # remove pad if too long
-    # for i in range(len(sequences)):
-    #     for j in range(len(sequences[i])):
-    #         sequences_out[i] = sequences_out[i][:max_len+1]
-    return sequences_out
-
-def sequences_get_mask(sequences, pad_val=0):
-    """Return mask for sequences.
-
-    Examples
-    ---------
-    >>> sentences_ids = [[4, 0, 5, 3, 0, 0],
-    ...                  [5, 3, 9, 4, 9, 0]]
-    >>> mask = sequences_get_mask(sentences_ids, pad_val=0)
-    ... [[1 1 1 1 0 0]
-    ...  [1 1 1 1 1 0]]
-    """
-    mask = np.ones_like(sequences)
-    for i, seq in enumerate(sequences):
-        for i_w in reversed(range(len(seq))):
-            if seq[i_w] == pad_val:
-                mask[i, i_w] = 0
-            else:
-                break   # <-- exit the for loop, prepcess next sequence
-    return mask
-
-
-## Text
-# see tensorlayer.nlp
-
-
-## Tensor Opt
-def distorted_images(images=None, height=24, width=24):
-    """Distort images for generating more training data.
-
-    Features
-    ---------
-    They are cropped to height * width pixels randomly.
-
-    They are approximately whitened to make the model insensitive to dynamic range.
-
-    Randomly flip the image from left to right.
-
-    Randomly distort the image brightness.
-
-    Randomly distort the image contrast.
-
-    Whiten (Normalize) the images.
-
-    Parameters
-    ----------
-    images : 4D Tensor
-        The tensor or placeholder of images
-    height : int
-        The height for random crop.
-    width : int
-        The width for random crop.
-
-    Returns
-    -------
-    result : tuple of Tensor
-        (Tensor for distorted images, Tensor for while loop index)
-
-    Examples
-    --------
-    >>> X_train, y_train, X_test, y_test = tl.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False)
-    >>> sess = tf.InteractiveSession()
-    >>> batch_size = 128
-    >>> x = tf.placeholder(tf.float32, shape=[batch_size, 32, 32, 3])
-    >>> distorted_images_op = tl.preprocess.distorted_images(images=x, height=24, width=24)
-    >>> sess.run(tf.initialize_all_variables())
-    >>> feed_dict={x: X_train[0:batch_size,:,:,:]}
-    >>> distorted_images, idx = sess.run(distorted_images_op, feed_dict=feed_dict)
-    >>> tl.visualize.images2d(X_train[0:9,:,:,:], second=2, saveable=False, name='cifar10', dtype=np.uint8, fig_idx=20212)
-    >>> tl.visualize.images2d(distorted_images[1:10,:,:,:], second=10, saveable=False, name='distorted_images', dtype=None, fig_idx=23012)
-
-    Notes
-    ------
-    - The first image in 'distorted_images' should be removed.
-
-    References
-    -----------
-    - `tensorflow.models.image.cifar10.cifar10_input <https://github.com/tensorflow/tensorflow/blob/r0.9/tensorflow/models/image/cifar10/cifar10_input.py>`_
-    """
-    print("This function is deprecated, please use tf.map_fn instead, e.g:\n   \
-            t_image = tf.map_fn(lambda img: tf.image.random_brightness(img, max_delta=32. / 255.), t_image)\n \
-            t_image = tf.map_fn(lambda img: tf.image.random_contrast(img, lower=0.5, upper=1.5), t_image)\n \
-            t_image = tf.map_fn(lambda img: tf.image.random_saturation(img, lower=0.5, upper=1.5), t_image)\n \
-            t_image = tf.map_fn(lambda img: tf.image.random_hue(img, max_delta=0.032), t_image)")
-    exit()
-    # print(" [Warning] distorted_images will be deprecated due to speed, see TFRecord tutorial for more info...")
-    try:
-        batch_size = int(images._shape[0])
-    except:
-        raise Exception('unknow batch_size of images')
-    distorted_x = tf.Variable(tf.constant(0.1, shape=[1, height, width, 3]))
-    i = tf.Variable(tf.constant(0))
-
-    c = lambda distorted_x, i: tf.less(i, batch_size)
-
-    def body(distorted_x, i):
-        # 1. Randomly crop a [height, width] section of the image.
-        image = tf.random_crop(tf.gather(images, i), [height, width, 3])
-        # 2. Randomly flip the image horizontally.
-        image = tf.image.random_flip_left_right(image)
-        # 3. Randomly change brightness.
-        image = tf.image.random_brightness(image, max_delta=63)
-        # 4. Randomly change contrast.
-        image = tf.image.random_contrast(image, lower=0.2, upper=1.8)
-        # 5. Subtract off the mean and divide by the variance of the pixels.
-        image = tf.image.per_image_whitening(image)
-        # 6. Append the image to a batch.
-        image = tf.expand_dims(image, 0)
-        return tf.concat(0, [distorted_x, image]), tf.add(i, 1)
-
-    result = tf.while_loop(cond=c, body=body, loop_vars=(distorted_x, i), parallel_iterations=16)
-    return result
-
-
-def crop_central_whiten_images(images=None, height=24, width=24):
-    """Crop the central of image, and normailize it for test data.
-
-    They are cropped to central of height * width pixels.
-
-    Whiten (Normalize) the images.
-
-    Parameters
-    ----------
-    images : 4D Tensor
-        The tensor or placeholder of images
-    height : int
-        The height for central crop.
-    width : int
-        The width for central crop.
-
-    Returns
-    -------
-    result : tuple Tensor
-        (Tensor for distorted images, Tensor for while loop index)
-
-    Examples
-    --------
-    >>> X_train, y_train, X_test, y_test = tl.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False)
-    >>> sess = tf.InteractiveSession()
-    >>> batch_size = 128
-    >>> x = tf.placeholder(tf.float32, shape=[batch_size, 32, 32, 3])
-    >>> central_images_op = tl.preprocess.crop_central_whiten_images(images=x, height=24, width=24)
-    >>> sess.run(tf.initialize_all_variables())
-    >>> feed_dict={x: X_train[0:batch_size,:,:,:]}
-    >>> central_images, idx = sess.run(central_images_op, feed_dict=feed_dict)
-    >>> tl.visualize.images2d(X_train[0:9,:,:,:], second=2, saveable=False, name='cifar10', dtype=np.uint8, fig_idx=20212)
-    >>> tl.visualize.images2d(central_images[1:10,:,:,:], second=10, saveable=False, name='central_images', dtype=None, fig_idx=23012)
-
-    Notes
-    ------
-    The first image in 'central_images' should be removed.
-
-    Code References
-    ----------------
-    - ``tensorflow.models.image.cifar10.cifar10_input``
-    """
-    print("This function is deprecated, please use tf.map_fn instead, e.g:\n   \
-            t_image = tf.map_fn(lambda img: tf.image.random_brightness(img, max_delta=32. / 255.), t_image)\n \
-            t_image = tf.map_fn(lambda img: tf.image.random_contrast(img, lower=0.5, upper=1.5), t_image)\n \
-            t_image = tf.map_fn(lambda img: tf.image.random_saturation(img, lower=0.5, upper=1.5), t_image)\n \
-            t_image = tf.map_fn(lambda img: tf.image.random_hue(img, max_delta=0.032), t_image)")
-    exit()
-    # print(" [Warning] crop_central_whiten_images will be deprecated due to speed, see TFRecord tutorial for more info...")
-    try:
-        batch_size = int(images._shape[0])
-    except:
-        raise Exception('unknow batch_size of images')
-    central_x = tf.Variable(tf.constant(0.1, shape=[1, height, width, 3]))
-    i = tf.Variable(tf.constant(0))
-
-    c = lambda central_x, i: tf.less(i, batch_size)
-
-    def body(central_x, i):
-        # 1. Crop the central [height, width] of the image.
-        image = tf.image.resize_image_with_crop_or_pad(tf.gather(images, i), height, width)
-        # 2. Subtract off the mean and divide by the variance of the pixels.
-        image = tf.image.per_image_whitening(image)
-        # 5. Append the image to a batch.
-        image = tf.expand_dims(image, 0)
-        return tf.concat(0, [central_x, image]), tf.add(i, 1)
-
-    result = tf.while_loop(cond=c, body=body, loop_vars=(central_x, i), parallel_iterations=16)
-    return result
-
-
-
-
-
-
-
-
-
-
-
-
-#
diff --git a/tensorlayer/rein.py b/tensorlayer/rein.py
deleted file mode 100644
index f6d5449b..00000000
--- a/tensorlayer/rein.py
+++ /dev/null
@@ -1,134 +0,0 @@
-#! /usr/bin/python
-# -*- coding: utf8 -*-
-
-
-
-import tensorflow as tf
-import numpy as np
-from six.moves import xrange
-
-def discount_episode_rewards(rewards=[], gamma=0.99, mode=0):
-    """ Take 1D float array of rewards and compute discounted rewards for an
-    episode. When encount a non-zero value, consider as the end a of an episode.
-
-    Parameters
-    ----------
-    rewards : numpy list
-        a list of rewards
-    gamma : float
-        discounted factor
-    mode : int
-        if mode == 0, reset the discount process when encount a non-zero reward (Ping-pong game).
-        if mode == 1, would not reset the discount process.
-
-    Examples
-    ----------
-    >>> rewards = np.asarray([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1])
-    >>> gamma = 0.9
-    >>> discount_rewards = tl.rein.discount_episode_rewards(rewards, gamma)
-    >>> print(discount_rewards)
-    ... [ 0.72899997  0.81        0.89999998  1.          0.72899997  0.81
-    ... 0.89999998  1.          0.72899997  0.81        0.89999998  1.        ]
-    >>> discount_rewards = tl.rein.discount_episode_rewards(rewards, gamma, mode=1)
-    >>> print(discount_rewards)
-    ... [ 1.52110755  1.69011939  1.87791049  2.08656716  1.20729685  1.34144104
-    ... 1.49048996  1.65610003  0.72899997  0.81        0.89999998  1.        ]
-    """
-    discounted_r = np.zeros_like(rewards, dtype=np.float32)
-    running_add = 0
-    for t in reversed(xrange(0, rewards.size)):
-        if mode == 0:
-            if rewards[t] != 0: running_add = 0
-
-        running_add = running_add * gamma + rewards[t]
-        discounted_r[t] = running_add
-    return discounted_r
-
-
-def cross_entropy_reward_loss(logits, actions, rewards, name=None):
-    """ Calculate the loss for Policy Gradient Network.
-
-    Parameters
-    ----------
-    logits : tensor
-        The network outputs without softmax. This function implements softmax
-        inside.
-    actions : tensor/ placeholder
-        The agent actions.
-    rewards : tensor/ placeholder
-        The rewards.
-
-    Examples
-    ----------
-    >>> states_batch_pl = tf.placeholder(tf.float32, shape=[None, D])
-    >>> network = InputLayer(states_batch_pl, name='input')
-    >>> network = DenseLayer(network, n_units=H, act=tf.nn.relu, name='relu1')
-    >>> network = DenseLayer(network, n_units=3, name='out')
-    >>> probs = network.outputs
-    >>> sampling_prob = tf.nn.softmax(probs)
-    >>> actions_batch_pl = tf.placeholder(tf.int32, shape=[None])
-    >>> discount_rewards_batch_pl = tf.placeholder(tf.float32, shape=[None])
-    >>> loss = tl.rein.cross_entropy_reward_loss(probs, actions_batch_pl, discount_rewards_batch_pl)
-    >>> train_op = tf.train.RMSPropOptimizer(learning_rate, decay_rate).minimize(loss)
-    """
-
-    try: # TF 1.0+
-        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=actions, logits=logits, name=name)
-    except:
-        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, targets=actions)
-        # cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, actions)
-
-    try: ## TF1.0+
-        loss = tf.reduce_sum(tf.multiply(cross_entropy, rewards))
-    except: ## TF0.12
-        loss = tf.reduce_sum(tf.mul(cross_entropy, rewards))   # element-wise mul
-    return loss
-
-def log_weight(probs, weights, name='log_weight'):
-    """Log weight.
-
-    Parameters
-    -----------
-    probs : tensor
-        If it is a network output, usually we should scale it to [0, 1] via softmax.
-    weights : tensor
-    """
-    with tf.variable_scope(name):
-        exp_v = tf.reduce_mean(tf.log(probs) * weights)
-        return exp_v
-
-
-
-def choice_action_by_probs(probs=[0.5, 0.5], action_list=None):
-    """Choice and return an an action by given the action probability distribution.
-
-    Parameters
-    ------------
-    probs : a list of float.
-        The probability distribution of all actions.
-    action_list : None or a list of action in integer, string or others.
-        If None, returns an integer range between 0 and len(probs)-1.
-
-    Examples
-    ----------
-    >>> for _ in range(5):
-    >>>     a = choice_action_by_probs([0.2, 0.4, 0.4])
-    >>>     print(a)
-    ... 0
-    ... 1
-    ... 1
-    ... 2
-    ... 1
-    >>> for _ in range(3):
-    >>>     a = choice_action_by_probs([0.5, 0.5], ['a', 'b'])
-    >>>     print(a)
-    ... a
-    ... b
-    ... b
-    """
-    if action_list is None:
-        n_action = len(probs)
-        action_list = np.arange(n_action)
-    else:
-        assert len(action_list) == len(probs), "Number of actions should equal to number of probabilities."
-    return np.random.choice(action_list, p=probs)
diff --git a/tensorlayer/third_party/__init__.py b/tensorlayer/third_party/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tensorlayer/third_party/roi_pooling/.gitignore b/tensorlayer/third_party/roi_pooling/.gitignore
deleted file mode 100644
index 08030a8f..00000000
--- a/tensorlayer/third_party/roi_pooling/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-.ipynb_checkpoints/
-build/
-
diff --git a/tensorlayer/third_party/roi_pooling/README.md b/tensorlayer/third_party/roi_pooling/README.md
deleted file mode 100644
index d597cea9..00000000
--- a/tensorlayer/third_party/roi_pooling/README.md
+++ /dev/null
@@ -1,56 +0,0 @@
-# Hint from TensorLayer
-- This implementation is from `https://github.com/deepsense-ai/roi-pooling`, date: 31 Aug 2017.
-- To install this, you have to clone TensorLayer from Github instead of pip install.
-- Remember to modify the `CUDA_LIB` in Makefile before running `python setup.py install` in this folder.
-- Make sure `roi_pooling_example.py` and `test_roi_layer.py` is runable.
-
-
-----
-
- 
-## RoI pooling in TensorFlow
-
-This repo contains the implementation of **Region of Interest pooling** as a custom TensorFlow operation. The CUDA code responsible for the computations was largely taken from the original [Caffe implementation by Ross Girshick](https://github.com/rbgirshick/fast-rcnn).
-
-For more information about RoI pooling you can check out [Region of interest pooling explained](https://deepsense.io/region-of-interest-pooling-explained/) at our [deepsense.io](https://deepsense.io/) blog.
-
-![Region of Interest Pooling animation](roi_pooling_animation.gif)
-
-
-## Requirements
-
-To compile and use `roi_pooling` layer you need to have:
-
-* [CUDA](https://developer.nvidia.com/cuda-toolkit) (tested with 8.0)
-* [https://www.tensorflow.org/](TensorFlow) (tested with 0.12.0 and 1.0.0)
-
-Only official TensorFlow releases are currently supported. If you're using a custom built TensorFlow compiled with a different GCC version (e.g. 5.X) you may need to modify the makefile to [enable the new ABI version](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html).
-
-
-## Install
-
-Since it uses compilation
-
-```bash
-$ git clone git@github.com:deepsense-io/roi-pooling.git
-$ cd roi-pooling
-$ python setup.py install
-```
-
-Right now we provide only GPU implementation (no CPU at this time).
-
-
-## Usage
-
-After successful installation you can use the operation like this:
-
-```python
-from roi_pooling.roi_pooling_ops import roi_pooling
-
-# here obtain feature map and regions of interest
-rpooling = roi_pooling(feature_map, rois, 7, 7)
-# continue the model
-```
-
-Working example in Jupyter Notebook: [examples/roi_pooling_minimal_example.ipynb](https://github.com/deepsense-io/roi-pooling/blob/master/examples/roi_pooling_minimal_example.ipynb)
-
diff --git a/tensorlayer/third_party/roi_pooling/examples/__init__.py b/tensorlayer/third_party/roi_pooling/examples/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tensorlayer/third_party/roi_pooling/examples/roi_pooling_minimal_example.ipynb b/tensorlayer/third_party/roi_pooling/examples/roi_pooling_minimal_example.ipynb
deleted file mode 100644
index c1edc353..00000000
--- a/tensorlayer/third_party/roi_pooling/examples/roi_pooling_minimal_example.ipynb
+++ /dev/null
@@ -1,148 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "* blog post: [Region of interest pooling explained - deepsense.io](https://deepsense.io/region-of-interest-pooling-explained/)\n",
-    "* repository: [deepsense-io/roi-pooling](https://github.com/deepsense-io/roi-pooling)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "from __future__ import print_function\n",
-    "\n",
-    "import tensorflow as tf\n",
-    "import numpy as np\n",
-    "\n",
-    "from roi_pooling.roi_pooling_ops import roi_pooling"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "# 4x4 feature map with only 1 channel\n",
-    "input_value = [[\n",
-    "    [[1], [2], [4], [4]],\n",
-    "    [[3], [4], [1], [2]],\n",
-    "    [[6], [2], [1], [7]],\n",
-    "    [[1], [3], [2], [8]]\n",
-    "]]\n",
-    "input_value = np.asarray(input_value, dtype='float32')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "# regions of interest as lists of:\n",
-    "# feature map index, upper left, bottom right coordinates\n",
-    "rois_value = [\n",
-    "    [0, 0, 0, 1, 3],\n",
-    "    [0, 2, 2, 3, 3],\n",
-    "    [0, 1, 0, 3, 2]\n",
-    "]\n",
-    "rois_value = np.asarray(rois_value, dtype='int32')\n",
-    "\n",
-    "# in this case we have 3 RoI pooling operations:\n",
-    "# * channel 0, rectangular region (0, 0) to (1, 3)\n",
-    "#              xx..\n",
-    "#              xx..\n",
-    "#              xx..\n",
-    "#              xx..\n",
-    "#\n",
-    "# * channel 0, rectangular region (2, 2) to (3, 3)\n",
-    "#              ....\n",
-    "#              ....\n",
-    "#              ..xx\n",
-    "#              ..xx\n",
-    "# * channel 0, rectangular region (1, 0) to (3, 2)\n",
-    "#              ....\n",
-    "#              xxx.\n",
-    "#              xxx.\n",
-    "#              xxx."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[[[[ 3.  4.]\n",
-      "   [ 6.  3.]]]\n",
-      "\n",
-      "\n",
-      " [[[ 1.  7.]\n",
-      "   [ 2.  8.]]]\n",
-      "\n",
-      "\n",
-      " [[[ 4.  4.]\n",
-      "   [ 4.  7.]]]]\n"
-     ]
-    }
-   ],
-   "source": [
-    "input_featuremap = tf.placeholder(tf.float32)\n",
-    "rois = tf.placeholder(tf.int32)\n",
-    "input_const = tf.constant(input_value, tf.float32)\n",
-    "rois_const = tf.constant(rois_value, tf.int32)\n",
-    "y = roi_pooling(input_const, rois_const, pool_height=2, pool_width=2)\n",
-    "\n",
-    "with tf.Session('') as sess:\n",
-    "    y_output = sess.run(y, feed_dict={input_featuremap: input_value, rois: rois_value})\n",
-    "    print(y_output)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 2",
-   "language": "python",
-   "name": "python2"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 2
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.12"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/tensorlayer/third_party/roi_pooling/roi_pooling/Makefile b/tensorlayer/third_party/roi_pooling/roi_pooling/Makefile
deleted file mode 100644
index db9de786..00000000
--- a/tensorlayer/third_party/roi_pooling/roi_pooling/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
-TF_INC = $(shell python -c 'import tensorflow as tf; print(tf.sysconfig.get_include())')
-CUDA_LIB = /usr/local/cuda-8.0/lib64
-
-all: clean build test
-
-build: roi_pooling.so
-
-roi_pooling.cu.o: roi_pooling.cu.cc
-	nvcc -std=c++11 -c -o $@ $? -I $(TF_INC) -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC -D _GLIBCXX_USE_CXX11_ABI=0
-
-roi_pooling.so: roi_pooling.cc roi_pooling.cu.o
-	g++ -std=c++11 -shared -o $@ $? -I $(TF_INC) -fPIC -lcudart -L$(CUDA_LIB) -D _GLIBCXX_USE_CXX11_ABI=0
-
-test: build
-	python roi_pooling_test.py
-
-clean:
-	rm -f *.o *.so *.pyc *.npy
diff --git a/tensorlayer/third_party/roi_pooling/roi_pooling/__init__.py b/tensorlayer/third_party/roi_pooling/roi_pooling/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling.cc b/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling.cc
deleted file mode 100644
index d1f123dc..00000000
--- a/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling.cc
+++ /dev/null
@@ -1,162 +0,0 @@
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include <cstdio>
-#include <iostream>
-#include <typeinfo>
-
-using namespace tensorflow;
-using namespace std;
-
-REGISTER_OP("RoiPooling")
-.Input("input: float32")
-.Input("rois: int32")
-.Attr("pool_height: int")
-.Attr("pool_width: int")
-.Output("output: float32")
-.Output("argmax_output: int32");
-
-
-#define Dtype float
-
-void RoiPoolingKernelLauncher(const float* input, const int* rois, int n_rois, int channels, int height, int width,
-                              int pooled_height, int pooled_width, Dtype* output, int* argmax_output);
-
-// IMPORTANT(maciek): need info about storage of the data in memory, assumed something but need the docs confirming it
-
-class RoiPoolingOp : public OpKernel {
-    private:
-        int pool_height_, pool_width_;
-    public:
-        explicit RoiPoolingOp(OpKernelConstruction* context) : OpKernel(context) {
-                 OP_REQUIRES_OK(context,
-                   context->GetAttr("pool_height", &pool_height_));
-
-                 OP_REQUIRES_OK(context,
-                   context->GetAttr("pool_width", &pool_width_));
-        }
-
-
-        void Compute(OpKernelContext* context) override {
-            // Grab the input tensor
-            const Tensor& input_tensor = context->input(0);
-            const Tensor& rois_tensor = context->input(1);
-
-            auto input = input_tensor.flat<float>();
-            auto rois = rois_tensor.flat<int32>();
-
-            // Create an output tensor
-            Tensor* output_tensor = NULL;
-            Tensor* argmax_output_tensor = NULL;
-
-            auto input_shape = input_tensor.shape();
-            auto rois_shape = rois_tensor.shape();
-
-            int n_rois = rois_shape.dim_size(0);
-            int height = input_shape.dim_size(1);
-            int width = input_shape.dim_size(2);
-            int channels = input_shape.dim_size(3);
-
-            TensorShape output_shape = TensorShape({static_cast<int64>(n_rois),
-                                        static_cast<int64>(channels),
-                                        static_cast<int64>(pool_height_),
-                                        static_cast<int64>(pool_width_)});
-
-            OP_REQUIRES_OK(context, context->allocate_output(0, output_shape,
-                        &output_tensor));
-
-            OP_REQUIRES_OK(context, context->allocate_output(1, output_shape,
-                        &argmax_output_tensor));
-
-            auto output = output_tensor->template flat<float>();
-            auto argmax_output = argmax_output_tensor->template flat<int32>();
-
-            RoiPoolingKernelLauncher(input.data(), rois.data(),
-                n_rois, channels,
-                height, width,
-                pool_height_, pool_width_,
-                output.data(), argmax_output.data());
-        }
-};
-
-REGISTER_KERNEL_BUILDER(Name("RoiPooling").Device(DEVICE_GPU), RoiPoolingOp);
-
-///////////// RoiPoolingGrad
-
-
-REGISTER_OP("RoiPoolingGrad")
-.Input("orig_input: float32")
-.Input("orig_rois: int32")
-.Input("orig_output: float32")
-.Input("orig_argmax_output: int32")
-.Input("orig_output_grad: float32")
-.Attr("pool_height: int")
-.Attr("pool_width: int")
-.Output("output: float32")
-.Doc(R"doc(
- region of interest pooling grad
-)doc");
-
-#define Dtype float
-void RoiPoolingGradKernelLauncher(const Dtype* orig_input, const int* orig_rois,
-                                 int mb_size,
-                                 int n_rois, int channels, int height, int width,
-                                 int pooled_height, int pooled_width,
-                                 const Dtype* orig_output, const int* orig_argmax_output,
-                                 const Dtype* orig_output_grad,
-                                 Dtype* output);
-
-// IMPORTANT(maciek): need info about storage of the data in memory, assumed something but need the docs confirming it
-
-class RoiPoolingGradOp : public OpKernel {
-    private:
-        int pool_height_, pool_width_;
-    public:
-        explicit RoiPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
-                 OP_REQUIRES_OK(context,
-                   context->GetAttr("pool_height", &pool_height_));
-
-                 OP_REQUIRES_OK(context,
-                   context->GetAttr("pool_width", &pool_width_));
-        }
-
-
-        void Compute(OpKernelContext* context) override {
-            // Grab the input tensor
-            const Tensor& orig_input_tensor = context->input(0);
-            const Tensor& orig_rois_tensor = context->input(1);
-            const Tensor& orig_output_tensor = context->input(2);
-            const Tensor& orig_argmax_output_tensor = context->input(3);
-            const Tensor& orig_output_grad_tensor = context->input(4);
-
-            auto orig_input = orig_input_tensor.flat<float>();
-            auto orig_rois = orig_rois_tensor.flat<int32>();
-            auto orig_output = orig_output_tensor.flat<float>();
-            auto orig_argmax_output = orig_argmax_output_tensor.flat<int32>();
-            auto orig_output_grad = orig_output_grad_tensor.flat<float>();
-
-            // Create an output tensor
-            Tensor* output_tensor = NULL;
-            auto orig_input_shape = orig_input_tensor.shape();
-            auto orig_rois_shape = orig_rois_tensor.shape();
-            auto grads_shape = orig_input_shape;
-
-            int mb_size = orig_input_shape.dim_size(0);
-            int n_rois = orig_rois_shape.dim_size(0);
-            int height = orig_input_shape.dim_size(1);
-            int width = orig_input_shape.dim_size(2);
-            int channels = orig_input_shape.dim_size(3);
-
-            OP_REQUIRES_OK(context, context->allocate_output(0, grads_shape,
-                        &output_tensor));
-
-            auto output = output_tensor->template flat<float>();
-
-            // Call the cuda kernel launcher
-            RoiPoolingGradKernelLauncher(orig_input.data(), orig_rois.data(),
-                mb_size, n_rois, channels, height, width, pool_height_, pool_width_,
-                orig_output.data(), orig_argmax_output.data(), orig_output_grad.data(), output.data());
-        }
-};
-
-
-REGISTER_KERNEL_BUILDER(Name("RoiPoolingGrad").Device(DEVICE_GPU), RoiPoolingGradOp);
diff --git a/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling.cu.cc b/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling.cu.cc
deleted file mode 100644
index bbacb552..00000000
--- a/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling.cu.cc
+++ /dev/null
@@ -1,214 +0,0 @@
-#if GOOGLE_CUDA
-
-#include <iostream>
-#include <stdio.h>
-#define EIGEN_USE_GPU
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-
-// CUDA: index helpers
-#define idx4_4(index, d1, d2, d3, d4) (index % d4)
-#define idx4_3(index, d1, d2, d3, d4) ((index / d4) % d3)
-#define idx4_2(index, d1, d2, d3, d4) ((index / d4 / d3) % d2)
-#define idx4_1(index, d1, d2, d3, d4) ((index / d4 / d3 / d2) %d1)
-
-// CUDA: various checks for different function calls.
-#define CUDA_CHECK(condition) \
-  /* Code block avoids redefinition of cudaError_t error */ \
-  do { \
-    cudaError_t error = condition; \
-    if (error != cudaSuccess) { \
-      return 1; \
-    } \
-  } while (0)
-
-// CUDA: grid stride looping
-#define CUDA_KERNEL_LOOP(i, n) \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
-       i < (n); \
-       i += blockDim.x * gridDim.x)
-
-// CUDA: use 512 threads per block
-const int CAFFE_CUDA_NUM_THREADS = 512;
-
-// CUDA: number of blocks for threads.
-inline int CAFFE_GET_BLOCKS(const int N) {
-  // TODO rewrite this part to be consistent with tf conventions
-  int optimal_number_of_blocks = (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
-  int max_number_of_blocks = 65000;
-  return std::min(optimal_number_of_blocks, max_number_of_blocks);
-}
-
-
-#define Dtype float
-
-__global__ void RoiPoolingKernel(const Dtype* input, const int* rois,
-                                 int n_rois, int channels, int height, int width,
-                                 int pooled_height, int pooled_width,
-                                 Dtype* output, int* argmax_output) {
-    int output_size = n_rois * channels * pooled_height * pooled_width;
-
-    CUDA_KERNEL_LOOP(index, output_size) {
-    // (n, c, ph, pw) is an element in the pooled output
-    int pw = idx4_4(index, n_rois, channels, pooled_height, pooled_width);
-    int ph = idx4_3(index, n_rois, channels, pooled_height, pooled_width);
-    int c = idx4_2(index, n_rois, channels, pooled_height, pooled_width);
-    int n = idx4_1(index, n_rois, channels, pooled_height, pooled_width);
-
-    auto bottom_rois_act = rois + n * 5;
-
-    int roi_batch_ind = bottom_rois_act[0];
-    int roi_start_w = bottom_rois_act[1];
-    int roi_start_h = bottom_rois_act[2];
-    int roi_end_w = bottom_rois_act[3];
-    int roi_end_h = bottom_rois_act[4];
-
-    // Force malformed ROIs to be 1x1
-    // NOTE(maciek): roi_start, roi_end seems to be inclusive
-    int roi_width = max(roi_end_w - roi_start_w + 1, 1);
-    int roi_height = max(roi_end_h - roi_start_h + 1, 1);
-
-    // divide the ROIs into smaller regions for max pooling
-    Dtype bin_size_h = static_cast<Dtype>(roi_height) / static_cast<Dtype>(pooled_height);
-    Dtype bin_size_w = static_cast<Dtype>(roi_width) / static_cast<Dtype>(pooled_width);
-
-    // compute the precise coordinates of each pooling subregion of the ROIs
-    int hstart = static_cast<int>(floor(static_cast<Dtype>(ph) * bin_size_h));
-    int wstart = static_cast<int>(floor(static_cast<Dtype>(pw) * bin_size_w));
-    int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1) * bin_size_h));
-    int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1) * bin_size_w));
-
-    // Add roi offsets and clip to input boundaries
-    hstart = min(max(hstart + roi_start_h, 0), height);
-    hend = min(max(hend + roi_start_h, 0), height);
-    wstart = min(max(wstart + roi_start_w, 0), width);
-    wend = min(max(wend + roi_start_w, 0), width);
-
-    //printf("%d %d %d %d %d %d %d %d\n", n, c, pw, ph, hstart, hend, wstart, wend);
-
-    bool is_empty = (hend <= hstart) || (wend <= wstart);
-
-    // Define an empty pooling region to be zero
-
-    Dtype maxval = is_empty ? 0 : -999999999.0;
-    //Dtype maxval = is_empty ? 0 : -FLT_MAX;
-    // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
-
-    int maxidx = -1;
-    auto input_act = input + (roi_batch_ind * height * width * channels);
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        int bottom_index = (h * width + w) * channels + c;
-
-        // bottom index is relative to 2d image only
-        if (input_act[bottom_index] > maxval) {
-          maxval = input_act[bottom_index];
-          maxidx = bottom_index;
-        }
-      }
-    }
-    output[index] = maxval;
-    argmax_output[index] = maxidx;
-  }
-}
-
-
-void RoiPoolingKernelLauncher(const float* input, const int* rois, int n_rois, int channels, int height, int width,
-                              int pooled_height, int pooled_width, Dtype* output, int* argmax_output) {
-    int out_size = n_rois * channels * pooled_height * pooled_width;
-
-    RoiPoolingKernel<<<CAFFE_GET_BLOCKS(out_size), CAFFE_CUDA_NUM_THREADS>>>(input, rois, n_rois, channels, height, width,
-        pooled_height, pooled_width, output, argmax_output);
-}
-
-
-/////////////// Grad
-__global__ void RoiPoolingGradKernel(const Dtype* orig_input, const int* orig_rois,
-                                 int mb_size,
-                                 int n_rois, int channels, int height, int width,
-                                 int pooled_height, int pooled_width,
-                                 const Dtype* orig_output, const int* orig_argmax_output,
-                                 const Dtype* orig_output_grad,
-                                 Dtype* output) {
-
-    int orig_input_size = mb_size * height * width * channels;
-
-    CUDA_KERNEL_LOOP(index, orig_input_size) {
-    // (n, h, w, c) coords in bottom data
-    int c = idx4_4(index, mb_size, height, width, channels);
-    int w = idx4_3(index, mb_size, height, width, channels);
-    int h = idx4_2(index, mb_size, height, width, channels);
-    int n = idx4_1(index, mb_size, height, width, channels);
-
-    Dtype gradient = 0;
-    // Accumulate gradient over all ROIs that pooled this element
-    for (int roi_n = 0; roi_n < n_rois; ++roi_n) {
-      const int* offset_bottom_rois = orig_rois + roi_n * 5;
-      int roi_batch_ind = offset_bottom_rois[0];
-      // Skip if ROI's batch index doesn't match n
-      if (n != roi_batch_ind) {
-        continue;
-      }
-
-      int roi_start_w = offset_bottom_rois[1];
-      int roi_start_h = offset_bottom_rois[2];
-      int roi_end_w = offset_bottom_rois[3];
-      int roi_end_h = offset_bottom_rois[4];
-
-      // Skip if ROI doesn't include (h, w)
-      const bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
-                           h >= roi_start_h && h <= roi_end_h);
-      if (!in_roi) {
-        continue;
-      }
-
-      int offset = (roi_n * channels + c) * pooled_height * pooled_width;
-      const Dtype* offset_top_diff = orig_output_grad + offset;
-      const int* offset_argmax_data = orig_argmax_output + offset;
-
-      // Compute feasible set of pooled units that could have pooled
-      // this bottom unit
-
-      // Force malformed ROIs to be 1x1
-      int roi_width = max(roi_end_w - roi_start_w + 1, 1);
-      int roi_height = max(roi_end_h - roi_start_h + 1, 1);
-
-      Dtype bin_size_h = static_cast<Dtype>(roi_height) / static_cast<Dtype>(pooled_height);
-      Dtype bin_size_w = static_cast<Dtype>(roi_width) / static_cast<Dtype>(pooled_width);
-
-      int phstart = floor(static_cast<Dtype>(h - roi_start_h) / bin_size_h);
-      int phend = ceil(static_cast<Dtype>(h - roi_start_h + 1) / bin_size_h);
-      int pwstart = floor(static_cast<Dtype>(w - roi_start_w) / bin_size_w);
-      int pwend = ceil(static_cast<Dtype>(w - roi_start_w + 1) / bin_size_w);
-
-      phstart = min(max(phstart, 0), pooled_height);
-      phend = min(max(phend, 0), pooled_height);
-      pwstart = min(max(pwstart, 0), pooled_width);
-      pwend = min(max(pwend, 0), pooled_width);
-
-      for (int ph = phstart; ph < phend; ++ph) {
-        for (int pw = pwstart; pw < pwend; ++pw) {
-          if (offset_argmax_data[ph * pooled_width + pw] == (h * width + w)) {
-            gradient += offset_top_diff[ph * pooled_width + pw];
-          }
-        }
-      }
-    }
-    output[index] = gradient;
-  }
-
-}
-
-void RoiPoolingGradKernelLauncher(const Dtype* orig_input, const int* orig_rois,
-                                 int mb_size,
-                                 int n_rois, int channels, int height, int width,
-                                 int pooled_height, int pooled_width,
-                                 const Dtype* orig_output, const int* orig_argmax_output,
-                                 const Dtype* orig_output_grad,
-                                 Dtype* output) {
-    int out_size = mb_size * height * width * channels;
-    RoiPoolingGradKernel<<<CAFFE_GET_BLOCKS(out_size), CAFFE_CUDA_NUM_THREADS>>>(orig_input, orig_rois,
-        mb_size, n_rois, channels, height, width, pooled_height, pooled_width,
-        orig_output, orig_argmax_output, orig_output_grad, output);
-}
-
-#endif
diff --git a/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling_ops.py b/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling_ops.py
deleted file mode 100644
index 5c46dc37..00000000
--- a/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling_ops.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import tensorflow as tf
-from tensorflow.python.framework import ops
-import os
-
-module_path = os.path.realpath(__file__)
-module_dir = os.path.dirname(module_path)
-lib_path = os.path.join(module_dir, 'roi_pooling.so')
-roi_pooling_module = tf.load_op_library(lib_path)
-
-def roi_pooling(input, rois, pool_height, pool_width):
-    """
-      returns a tensorflow operation for computing the Region of Interest Pooling
-    
-      @arg input: feature maps on which to perform the pooling operation
-      @arg rois: list of regions of interest in the format (feature map index, upper left, bottom right)
-      @arg pool_width: size of the pooling sections
-    """
-    # TODO(maciek): ops scope
-    out = roi_pooling_module.roi_pooling(input, rois, pool_height=pool_height, pool_width=pool_width)
-    output, argmax_output = out[0], out[1]
-    return output
-
-
-@ops.RegisterGradient("RoiPooling")
-def _RoiPoolingGrad(op, *grads):
-    orig_inputs = op.inputs[0]
-    orig_rois = op.inputs[1]
-    orig_output = op.outputs[0]
-    orig_argmax_output = op.outputs[1]
-
-    orig_output_grad = grads[0]
-    output_grad = roi_pooling_module.roi_pooling_grad(orig_inputs, orig_rois, orig_output,
-                                                      orig_argmax_output, orig_output_grad,
-                                                      pool_height=op.get_attr('pool_height'),
-                                                      pool_width=op.get_attr('pool_width'))
-    return [output_grad, None]
-
-
-@ops.RegisterShape("RoiPooling")
-def _RoiPoolingShape(op):
-    input = op.inputs[0]
-    rois = op.inputs[1]
-
-    n_rois = rois.get_shape()[0]
-    n_channels = input.get_shape()[3]
-    pool_height = op.get_attr('pool_height')
-    pool_width = op.get_attr('pool_width')
-
-    #TODO: check the width/hegiht order
-    return [tf.TensorShape([n_rois, n_channels, pool_width, pool_height]),
-            tf.TensorShape(None)]
diff --git a/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling_test.py b/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling_test.py
deleted file mode 100644
index c5f1b361..00000000
--- a/tensorlayer/third_party/roi_pooling/roi_pooling/roi_pooling_test.py
+++ /dev/null
@@ -1,110 +0,0 @@
-import tensorflow as tf
-import numpy as np
-from roi_pooling_ops import roi_pooling
-
-
-class RoiPoolingTest(tf.test.TestCase):
-    # TODO(maciek): add python, implementation and test outputs
-    # TODO(maciek): test pool_height != pool_width, height != width
-
-    def test_roi_pooling_grad(self):
-        # TODO(maciek): corner cases
-        input_value = [[
-            [[1], [2], [4], [4]],
-            [[3], [4], [1], [2]],
-            [[6], [2], [1], [7.0]],
-            [[1], [3], [2], [8]]
-        ]]
-        input_value = np.asarray(input_value, dtype='float32')
-
-        rois_value = [
-            [0, 0, 0, 1, 1],
-            [0, 1, 1, 2, 2],
-            [0, 2, 2, 3, 3],
-            [0, 0, 0, 2, 2],
-            [0, 0, 0, 3, 3]
-        ]
-        rois_value = np.asarray(rois_value, dtype='int32')
-
-        with tf.Session(''):
-            # NOTE(maciek): looks like we have to use consts here, based on tensorflow/python/ops/nn_test.py
-            input_const = tf.constant(input_value, tf.float32)
-            rois_const = tf.constant(rois_value, tf.int32)
-            y = roi_pooling(input_const, rois_const, pool_height=2, pool_width=2)
-            mean = tf.reduce_mean(y)
-
-            numerical_grad_error_1 = tf.test.compute_gradient_error(
-                [input_const], [input_value.shape], y, [5, 2, 2, 1])
-
-            numerical_grad_error_2 = tf.test.compute_gradient_error(
-                [input_const], [input_value.shape], mean, [])
-
-            self.assertLess(numerical_grad_error_1, 1e-4)
-            self.assertLess(numerical_grad_error_2, 1e-4)
-
-    def test_shape_inference_1(self):
-        pooled_w, pooled_h = 2, 2
-        input_w, input_h = 200, 200
-        n_channels = 3
-        n_batches = None
-        input = tf.placeholder(tf.float32, shape=[n_batches, input_w, input_h, n_channels])
-
-        n_rois = None
-        single_roi_dimension = 5
-        rois = tf.placeholder(tf.int32, shape=[n_rois, single_roi_dimension])
-
-        y = roi_pooling(input, rois, pool_height=pooled_w, pool_width=pooled_h)
-
-        self.assertEqual(y.get_shape().ndims, 4)
-        self.assertIs(y.get_shape()[0].value, n_rois)
-        self.assertIs(y.get_shape()[1].value, n_channels)
-        self.assertIs(y.get_shape()[2].value, pooled_h)
-        self.assertIs(y.get_shape()[3].value, pooled_w)
-
-    def test_shape_inference_2(self):
-        pooled_w, pooled_h = 3, 4
-        input_w, input_h = 200, 300
-        n_channels = 3
-        n_batches = None
-        input = tf.placeholder(tf.float32, shape=[n_batches, input_w, input_h, n_channels])
-
-        n_rois = None
-        single_roi_dimension = 5
-        rois = tf.placeholder(tf.int32, shape=[n_rois, single_roi_dimension])
-
-        y = roi_pooling(input, rois, pool_height=pooled_w, pool_width=pooled_h)
-
-        self.assertEqual(y.get_shape().ndims, 4)
-        self.assertIs(y.get_shape()[0].value, n_rois)
-        self.assertIs(y.get_shape()[1].value, n_channels)
-        self.assertIs(y.get_shape()[2].value, pooled_h)
-        self.assertIs(y.get_shape()[3].value, pooled_w)
-
-    def test_very_big_output(self):
-        """
-        This test checks whether the layer can handle a corner case
-        where the number of output pixels is very large, possibly larger
-        than the number of available GPU threads
-        """
-
-        pooled_w, pooled_h = 7,7
-        input_w, input_h = 72, 240
-        n_channels = 512
-        n_batches = 2
-        x_input = np.ones(shape=(n_batches, input_w, input_h, n_channels))
-        n_rois = 5000
-        rois_input = np.ones(shape=(n_rois, 5))
-
-        input = tf.placeholder(tf.float32, shape=[n_batches, input_w, input_h, n_channels])
-        single_roi_dimension = 5
-        rois = tf.placeholder(tf.int32, shape=[n_rois, single_roi_dimension])
-
-        y = roi_pooling(input, rois, pool_height=pooled_w, pool_width=pooled_h)
-
-        with tf.Session('') as sess:
-            y_output = sess.run(y, feed_dict={input: x_input, rois: rois_input})
-
-        self.assertTrue(np.all(y_output == 1))
-
-if __name__ == '__main__':
-    tf.test.main()
diff --git a/tensorlayer/third_party/roi_pooling/roi_pooling_animation.gif b/tensorlayer/third_party/roi_pooling/roi_pooling_animation.gif
deleted file mode 100644
index 9d35d21a..00000000
Binary files a/tensorlayer/third_party/roi_pooling/roi_pooling_animation.gif and /dev/null differ
diff --git a/tensorlayer/third_party/roi_pooling/roi_pooling_example.py b/tensorlayer/third_party/roi_pooling/roi_pooling_example.py
deleted file mode 100644
index 7d9b7b63..00000000
--- a/tensorlayer/third_party/roi_pooling/roi_pooling_example.py
+++ /dev/null
@@ -1,63 +0,0 @@
-from __future__ import print_function
-
-import tensorflow as tf
-import numpy as np
-
-from roi_pooling.roi_pooling_ops import roi_pooling
-
-# input feature map going into the RoI pooling 
-input_value = [[
-    [[1], [2], [4], [4]],
-    [[3], [4], [1], [2]],
-    [[6], [2], [1], [7.0]],
-    [[1], [3], [2], [8]]
-]]
-input_value = np.asarray(input_value, dtype='float32')
-
-# Regions of interest as lists of:
-# feature map index, upper left, bottom right coordinates
-rois_value = [
-    [0, 0, 0, 1, 1],
-    [0, 1, 1, 2, 2],
-    [0, 2, 2, 3, 3],
-    [0, 0, 0, 2, 2],
-    [0, 0, 0, 3, 3]
-]
-rois_value = np.asarray(rois_value, dtype='int32')
-
-# the pool_height and width are parameters of the ROI layer
-pool_height, pool_width = (2, 2)
-n_rois = len(rois_value)
-y_shape = [n_rois, 1, pool_height, pool_width]
-
-print('Input: ', input_value, ', shape: ', input_value.shape)
-print('ROIs: ', rois_value, ', shape: ', rois_value.shape)
-
-# precise semantics is now only defined by the kernel, need tests
-input = tf.placeholder(tf.float32)
-rois = tf.placeholder(tf.int32)
-
-y = roi_pooling(input, rois, pool_height=2, pool_width=2)
-mean = tf.reduce_mean(y)
-
-grads = tf.gradients(mean, input)
-print(type(grads))
-print(len(grads))
-print(grads)
-print(input_value.shape)
-
-with tf.Session('') as sess:
-    input_const = tf.constant(input_value, tf.float32)
-    rois_const = tf.constant(rois_value, tf.int32)
-    y = roi_pooling(input_const, rois_const, pool_height=2, pool_width=2)
-    mean = tf.reduce_mean(y)
-
-    numerical_grad_error_1 = tf.test.compute_gradient_error([input_const], [input_value.shape], y, y_shape)
-    numerical_grad_error_2 = tf.test.compute_gradient_error([input_const], [input_value.shape], mean, [])
-    print(numerical_grad_error_1, numerical_grad_error_2)
-
-with tf.Session('') as sess:
-    y_output = sess.run(y, feed_dict={input: input_value, rois: rois_value})
-    print('y: ', y_output)
-    grads_output = sess.run(grads, feed_dict={input: input_value, rois: rois_value})
-    print('grads: ', grads_output)
diff --git a/tensorlayer/third_party/roi_pooling/setup.py b/tensorlayer/third_party/roi_pooling/setup.py
deleted file mode 100644
index de392a9d..00000000
--- a/tensorlayer/third_party/roi_pooling/setup.py
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env python
-
-from __future__ import print_function
-from distutils.core import setup
-from distutils.command.install import install as DistutilsInstall
-import sys
-import subprocess
-
-try:
-    import tensorflow
-except ImportError:
-    print("Please install tensorflow 0.12.0 or later")
-    sys.exit()
-    
-
-class MyInstall(DistutilsInstall):
-    def run(self):
-        subprocess.call(['make', '-C', 'roi_pooling', 'build'])
-        DistutilsInstall.run(self)
-
-setup(name='roi-pooling',
-            version='1.0',
-            description='ROI pooling as a custom TensorFlow operation',
-            author='deepsense.io',
-            packages=['roi_pooling'],
-            package_data={'roi_pooling': ['roi_pooling.so']},
-            cmdclass={'install': MyInstall}
-)
-
-    
-
-
diff --git a/tensorlayer/third_party/roi_pooling/test_roi_layer.py b/tensorlayer/third_party/roi_pooling/test_roi_layer.py
deleted file mode 100644
index 5ca6a12a..00000000
--- a/tensorlayer/third_party/roi_pooling/test_roi_layer.py
+++ /dev/null
@@ -1,66 +0,0 @@
-from tensorlayer.layers import *
-
-from tensorlayer.third_party.roi_pooling.roi_pooling.roi_pooling_ops import roi_pooling
-# from roi_pooling.roi_pooling_ops import roi_pooling
-
-
-
-# input feature map going into the RoI pooling
-input_value = [[
-    [[1], [2], [4], [4]],
-    [[3], [4], [1], [2]],
-    [[6], [2], [1], [7.0]],
-    [[1], [3], [2], [8]]
-]]
-input_value = np.asarray(input_value, dtype='float32')
-
-# Regions of interest as lists of:
-# feature map index, upper left, bottom right coordinates
-rois_value = [
-    [0, 0, 0, 1, 1],
-    [0, 1, 1, 2, 2],
-    [0, 2, 2, 3, 3],
-    [0, 0, 0, 2, 2],
-    [0, 0, 0, 3, 3]
-]
-rois_value = np.asarray(rois_value, dtype='int32')
-
-# the pool_height and width are parameters of the ROI layer
-pool_height, pool_width = (2, 2)
-n_rois = len(rois_value)
-y_shape = [n_rois, 1, pool_height, pool_width]
-
-print('Input: ', input_value, ', shape: ', input_value.shape)
-print('ROIs: ', rois_value, ', shape: ', rois_value.shape)
-
-# precise semantics is now only defined by the kernel, need tests
-input = tf.placeholder(tf.float32)
-rois = tf.placeholder(tf.int32)
-
-# y = roi_pooling(input, rois, pool_height=2, pool_width=2)
-n = InputLayer(input, name='in')
-n = ROIPoolingLayer(n, rois=rois, pool_height=2, pool_width=2, name='roi')
-y = n.outputs
-mean = tf.reduce_mean(y)
-
-grads = tf.gradients(mean, input)
-print(type(grads))
-print(len(grads))
-print(grads)
-print(input_value.shape)
-
-with tf.Session('') as sess:
-    input_const = tf.constant(input_value, tf.float32)
-    rois_const = tf.constant(rois_value, tf.int32)
-    y = roi_pooling(input_const, rois_const, pool_height=2, pool_width=2)
-    mean = tf.reduce_mean(y)
-
-    numerical_grad_error_1 = tf.test.compute_gradient_error([input_const], [input_value.shape], y, y_shape)
-    numerical_grad_error_2 = tf.test.compute_gradient_error([input_const], [input_value.shape], mean, [])
-    print(numerical_grad_error_1, numerical_grad_error_2)
-
-with tf.Session('') as sess:
-    y_output = sess.run(y, feed_dict={input: input_value, rois: rois_value})
-    print('y: ', y_output)
-    grads_output = sess.run(grads, feed_dict={input: input_value, rois: rois_value})
-    print('grads: ', grads_output)
diff --git a/tensorlayer/utils.py b/tensorlayer/utils.py
deleted file mode 100644
index 0dbdbdc1..00000000
--- a/tensorlayer/utils.py
+++ /dev/null
@@ -1,540 +0,0 @@
-#! /usr/bin/python
-# -*- coding: utf8 -*-
-import tensorflow as tf
-import tensorlayer as tl
-from . import iterate
-import numpy as np
-import time
-import math
-import random
-
-
-def fit(sess, network, train_op, cost, X_train, y_train, x, y_, acc=None, batch_size=100,
-        n_epoch=100, print_freq=5, X_val=None, y_val=None, eval_train=True,
-        tensorboard=False, tensorboard_epoch_freq=5, tensorboard_weight_histograms=True, tensorboard_graph_vis=True):
-    """Traing a given non time-series network by the given cost function, training data, batch_size, n_epoch etc.
-
-    Parameters
-    ----------
-    sess : TensorFlow session
-        sess = tf.InteractiveSession()
-    network : a TensorLayer layer
-        the network will be trained
-    train_op : a TensorFlow optimizer
-        like tf.train.AdamOptimizer
-    X_train : numpy array
-        the input of training data
-    y_train : numpy array
-        the target of training data
-    x : placeholder
-        for inputs
-    y_ : placeholder
-        for targets
-    acc : the TensorFlow expression of accuracy (or other metric) or None
-        if None, would not display the metric
-    batch_size : int
-        batch size for training and evaluating
-    n_epoch : int
-        the number of training epochs
-    print_freq : int
-        display the training information every ``print_freq`` epochs
-    X_val : numpy array or None
-        the input of validation data
-    y_val : numpy array or None
-        the target of validation data
-    eval_train : boolean
-        if X_val and y_val are not None, it refects whether to evaluate the training data
-    tensorboard : boolean
-        if True summary data will be stored to the log/ direcory for visualization with tensorboard.
-        See also detailed tensorboard_X settings for specific configurations of features. (default False)
-        Also runs tl.layers.initialize_global_variables(sess) internally in fit() to setup the summary nodes, see Note:
-    tensorboard_epoch_freq : int
-        how many epochs between storing tensorboard checkpoint for visualization to log/ directory (default 5)
-    tensorboard_weight_histograms : boolean
-        if True updates tensorboard data in the logs/ directory for visulaization
-        of the weight histograms every tensorboard_epoch_freq epoch (default True)
-    tensorboard_graph_vis : boolean
-        if True stores the graph in the tensorboard summaries saved to log/ (default True)
-
-    Examples
-    --------
-    >>> see tutorial_mnist_simple.py
-    >>> tl.utils.fit(sess, network, train_op, cost, X_train, y_train, x, y_,
-    ...            acc=acc, batch_size=500, n_epoch=200, print_freq=5,
-    ...            X_val=X_val, y_val=y_val, eval_train=False)
-    >>> tl.utils.fit(sess, network, train_op, cost, X_train, y_train, x, y_,
-    ...            acc=acc, batch_size=500, n_epoch=200, print_freq=5,
-    ...            X_val=X_val, y_val=y_val, eval_train=False,
-    ...            tensorboard=True, tensorboard_weight_histograms=True, tensorboard_graph_vis=True)
-
-    Note
-    --------
-        If tensorboard=True, the global_variables_initializer will be run inside the fit function
-        in order to initalize the automatically generated summary nodes used for tensorboard visualization,
-        thus tf.global_variables_initializer().run() before the fit() call will be undefined.
-    """
-    assert X_train.shape[0] >= batch_size, "Number of training examples should be bigger than the batch size"
-
-    if(tensorboard):
-        print("Setting up tensorboard ...")
-        #Set up tensorboard summaries and saver
-        tl.files.exists_or_mkdir('logs/')
-
-        #Only write summaries for more recent TensorFlow versions
-        if hasattr(tf, 'summary') and hasattr(tf.summary, 'FileWriter'):
-            if tensorboard_graph_vis:
-                train_writer = tf.summary.FileWriter('logs/train',sess.graph)
-                val_writer = tf.summary.FileWriter('logs/validation',sess.graph)
-            else:
-                train_writer = tf.summary.FileWriter('logs/train')
-                val_writer = tf.summary.FileWriter('logs/validation')
-
-        #Set up summary nodes
-        if(tensorboard_weight_histograms):
-            for param in network.all_params:
-                if hasattr(tf, 'summary') and hasattr(tf.summary, 'histogram'):
-                    print('Param name ', param.name)
-                    tf.summary.histogram(param.name, param)
-
-        if hasattr(tf, 'summary') and hasattr(tf.summary, 'histogram'):
-            tf.summary.scalar('cost', cost)
-
-        merged = tf.summary.merge_all()
-
-        #Initalize all variables and summaries
-        tl.layers.initialize_global_variables(sess)
-        print("Finished! use $tensorboard --logdir=logs/ to start server")
-
-    print("Start training the network ...")
-    start_time_begin = time.time()
-    tensorboard_train_index, tensorboard_val_index = 0, 0
-    for epoch in range(n_epoch):
-        start_time = time.time()
-        loss_ep = 0; n_step = 0
-        for X_train_a, y_train_a in iterate.minibatches(X_train, y_train,
-                                                    batch_size, shuffle=True):
-            feed_dict = {x: X_train_a, y_: y_train_a}
-            feed_dict.update( network.all_drop )    # enable noise layers
-            loss, _ = sess.run([cost, train_op], feed_dict=feed_dict)
-            loss_ep += loss
-            n_step += 1
-        loss_ep = loss_ep/ n_step
-
-        if tensorboard and hasattr(tf, 'summary'):
-            if epoch+1 == 1 or (epoch+1) % tensorboard_epoch_freq == 0:
-                for X_train_a, y_train_a in iterate.minibatches(
-                                        X_train, y_train, batch_size, shuffle=True):
-                    dp_dict = dict_to_one( network.all_drop )    # disable noise layers
-                    feed_dict = {x: X_train_a, y_: y_train_a}
-                    feed_dict.update(dp_dict)
-                    result = sess.run(merged, feed_dict=feed_dict)
-                    train_writer.add_summary(result, tensorboard_train_index)
-                    tensorboard_train_index += 1
-                if (X_val is not None) and (y_val is not None):                      
-                        for X_val_a, y_val_a in iterate.minibatches(
-                                        X_val, y_val, batch_size, shuffle=True):
-                                dp_dict = dict_to_one( network.all_drop )    # disable noise layers
-                                feed_dict = {x: X_val_a, y_: y_val_a}
-                                feed_dict.update(dp_dict)
-                                result = sess.run(merged, feed_dict=feed_dict)
-                                val_writer.add_summary(result, tensorboard_val_index)
-                                tensorboard_val_index += 1
-
-        if epoch + 1 == 1 or (epoch + 1) % print_freq == 0:
-            if (X_val is not None) and (y_val is not None):
-                print("Epoch %d of %d took %fs" % (epoch + 1, n_epoch, time.time() - start_time))
-                if eval_train is True:
-                    train_loss, train_acc, n_batch = 0, 0, 0
-                    for X_train_a, y_train_a in iterate.minibatches(
-                                            X_train, y_train, batch_size, shuffle=True):
-                        dp_dict = dict_to_one( network.all_drop )    # disable noise layers
-                        feed_dict = {x: X_train_a, y_: y_train_a}
-                        feed_dict.update(dp_dict)
-                        if acc is not None:
-                            err, ac = sess.run([cost, acc], feed_dict=feed_dict)
-                            train_acc += ac
-                        else:
-                            err = sess.run(cost, feed_dict=feed_dict)
-                        train_loss += err;  n_batch += 1
-                    print("   train loss: %f" % (train_loss/ n_batch))
-                    if acc is not None:
-                        print("   train acc: %f" % (train_acc/ n_batch))
-                val_loss, val_acc, n_batch = 0, 0, 0
-                for X_val_a, y_val_a in iterate.minibatches(
-                                            X_val, y_val, batch_size, shuffle=True):
-                    dp_dict = dict_to_one( network.all_drop )    # disable noise layers
-                    feed_dict = {x: X_val_a, y_: y_val_a}
-                    feed_dict.update(dp_dict)
-                    if acc is not None:
-                        err, ac = sess.run([cost, acc], feed_dict=feed_dict)
-                        val_acc += ac
-                    else:
-                        err = sess.run(cost, feed_dict=feed_dict)
-                    val_loss += err; n_batch += 1
-                print("   val loss: %f" % (val_loss/ n_batch))
-                if acc is not None:
-                    print("   val acc: %f" % (val_acc/ n_batch))
-            else:
-                print("Epoch %d of %d took %fs, loss %f" % (epoch + 1, n_epoch, time.time() - start_time, loss_ep))
-    print("Total training time: %fs" % (time.time() - start_time_begin))
-
-
-def test(sess, network, acc, X_test, y_test, x, y_, batch_size, cost=None):
-    """
-    Test a given non time-series network by the given test data and metric.
-
-    Parameters
-    ----------
-    sess : TensorFlow session
-        sess = tf.InteractiveSession()
-    network : a TensorLayer layer
-        the network will be trained
-    acc : the TensorFlow expression of accuracy (or other metric) or None
-        if None, would not display the metric
-    X_test : numpy array
-        the input of test data
-    y_test : numpy array
-        the target of test data
-    x : placeholder
-        for inputs
-    y_ : placeholder
-        for targets
-    batch_size : int or None
-        batch size for testing, when dataset is large, we should use minibatche for testing.
-        when dataset is small, we can set it to None.
-    cost : the TensorFlow expression of cost or None
-        if None, would not display the cost
-
-    Examples
-    --------
-    >>> see tutorial_mnist_simple.py
-    >>> tl.utils.test(sess, network, acc, X_test, y_test, x, y_, batch_size=None, cost=cost)
-    """
-    print('Start testing the network ...')
-    if batch_size is None:
-        dp_dict = dict_to_one( network.all_drop )
-        feed_dict = {x: X_test, y_: y_test}
-        feed_dict.update(dp_dict)
-        if cost is not None:
-            print("   test loss: %f" % sess.run(cost, feed_dict=feed_dict))
-        print("   test acc: %f" % sess.run(acc, feed_dict=feed_dict))
-            # print("   test acc: %f" % np.mean(y_test == sess.run(y_op,
-            #                                           feed_dict=feed_dict)))
-    else:
-        test_loss, test_acc, n_batch = 0, 0, 0
-        for X_test_a, y_test_a in iterate.minibatches(
-                                    X_test, y_test, batch_size, shuffle=True):
-            dp_dict = dict_to_one( network.all_drop )    # disable noise layers
-            feed_dict = {x: X_test_a, y_: y_test_a}
-            feed_dict.update(dp_dict)
-            if cost is not None:
-                err, ac = sess.run([cost, acc], feed_dict=feed_dict)
-                test_loss += err
-            else:
-                ac = sess.run(acc, feed_dict=feed_dict)
-            test_acc += ac; n_batch += 1
-        if cost is not None:
-            print("   test loss: %f" % (test_loss/ n_batch))
-        print("   test acc: %f" % (test_acc/ n_batch))
-
-
-def predict(sess, network, X, x, y_op, batch_size=None):
-    """
-    Return the predict results of given non time-series network.
-
-    Parameters
-    ----------
-    sess : TensorFlow session
-        sess = tf.InteractiveSession()
-    network : a TensorLayer layer
-        the network will be trained
-    X : numpy array
-        the input
-    x : placeholder
-        for inputs
-    y_op : placeholder
-        the argmax expression of softmax outputs
-    batch_size : int or None
-        batch size for prediction, when dataset is large, we should use minibatche for prediction.
-        when dataset is small, we can set it to None.
-
-    Examples
-    --------
-    >>> see tutorial_mnist_simple.py
-    >>> y = network.outputs
-    >>> y_op = tf.argmax(tf.nn.softmax(y), 1)
-    >>> print(tl.utils.predict(sess, network, X_test, x, y_op))
-    """
-    if batch_size is None:
-        dp_dict = dict_to_one( network.all_drop )    # disable noise layers
-        feed_dict = {x: X,}
-        feed_dict.update(dp_dict)
-        return sess.run(y_op, feed_dict=feed_dict)
-    else:
-        result = None
-        for X_a, _ in iterate.minibatches(
-                X, X, batch_size, shuffle=False):
-            dp_dict = dict_to_one( network.all_drop )
-            feed_dict = {x: X_a, }
-            feed_dict.update(dp_dict)
-            result_a = sess.run(y_op, feed_dict=feed_dict)
-            if result is None:
-                result = result_a
-            else:
-                result = np.hstack((result, result_a))
-        return result
-
-
-## Evaluation
-def evaluation(y_test=None, y_predict=None, n_classes=None):
-    """
-    Input the predicted results, targets results and
-    the number of class, return the confusion matrix, F1-score of each class,
-    accuracy and macro F1-score.
-
-    Parameters
-    ----------
-    y_test : numpy.array or list
-        target results
-    y_predict : numpy.array or list
-        predicted results
-    n_classes : int
-        number of classes
-
-    Examples
-    --------
-    >>> c_mat, f1, acc, f1_macro = evaluation(y_test, y_predict, n_classes)
-    """
-    from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
-    c_mat = confusion_matrix(y_test, y_predict, labels = [x for x in range(n_classes)])
-    f1    = f1_score(y_test, y_predict, average = None, labels = [x for x in range(n_classes)])
-    f1_macro = f1_score(y_test, y_predict, average='macro')
-    acc   = accuracy_score(y_test, y_predict)
-    print('confusion matrix: \n',c_mat)
-    print('f1-score:',f1)
-    print('f1-score(macro):',f1_macro)   # same output with > f1_score(y_true, y_pred, average='macro')
-    print('accuracy-score:', acc)
-    return c_mat, f1, acc, f1_macro
-
-def dict_to_one(dp_dict={}):
-    """
-    Input a dictionary, return a dictionary that all items are set to one,
-    use for disable dropout, dropconnect layer and so on.
-
-    Parameters
-    ----------
-    dp_dict : dictionary
-        keeping probabilities
-
-    Examples
-    --------
-    >>> dp_dict = dict_to_one( network.all_drop )
-    >>> dp_dict = dict_to_one( network.all_drop )
-    >>> feed_dict.update(dp_dict)
-    """
-    return {x: 1 for x in dp_dict}
-
-def flatten_list(list_of_list=[[],[]]):
-    """
-    Input a list of list, return a list that all items are in a list.
-
-    Parameters
-    ----------
-    list_of_list : a list of list
-
-    Examples
-    --------
-    >>> tl.utils.flatten_list([[1, 2, 3],[4, 5],[6]])
-    ... [1, 2, 3, 4, 5, 6]
-    """
-    return sum(list_of_list, [])
-
-
-def class_balancing_oversample(X_train=None, y_train=None, printable=True):
-    """Input the features and labels, return the features and labels after oversampling.
-
-    Parameters
-    ----------
-    X_train : numpy.array
-        Features, each row is an example
-    y_train : numpy.array
-        Labels
-
-    Examples
-    --------
-    - One X
-    >>> X_train, y_train = class_balancing_oversample(X_train, y_train, printable=True)
-
-    - Two X
-    >>> X, y = tl.utils.class_balancing_oversample(X_train=np.hstack((X1, X2)), y_train=y, printable=False)
-    >>> X1 = X[:, 0:5]
-    >>> X2 = X[:, 5:]
-    """
-    # ======== Classes balancing
-    if printable:
-        print("Classes balancing for training examples...")
-    from collections import Counter
-    c = Counter(y_train)
-    if printable:
-        print('the occurrence number of each stage: %s' % c.most_common())
-        print('the least stage is Label %s have %s instances' % c.most_common()[-1])
-        print('the most stage is  Label %s have %s instances' % c.most_common(1)[0])
-    most_num = c.most_common(1)[0][1]
-    if printable:
-        print('most num is %d, all classes tend to be this num' % most_num)
-
-    locations = {}
-    number = {}
-
-    for lab, num in c.most_common():    # find the index from y_train
-        number[lab] = num
-        locations[lab] = np.where(np.array(y_train)==lab)[0]
-    if printable:
-        print('convert list(np.array) to dict format')
-    X = {}  # convert list to dict
-    for lab, num in number.items():
-        X[lab] = X_train[locations[lab]]
-
-    # oversampling
-    if printable:
-        print('start oversampling')
-    for key in X:
-        temp = X[key]
-        while True:
-            if len(X[key]) >= most_num:
-                break
-            X[key] = np.vstack((X[key], temp))
-    if printable:
-        print('first features of label 0 >', len(X[0][0]))
-        print('the occurrence num of each stage after oversampling')
-    for key in X:
-        print(key, len(X[key]))
-    if printable:
-        print('make each stage have same num of instances')
-    for key in X:
-        X[key] = X[key][0:most_num,:]
-        print(key, len(X[key]))
-
-    # convert dict to list
-    if printable:
-        print('convert from dict to list format')
-    y_train = []
-    X_train = np.empty(shape=(0,len(X[0][0])))
-    for key in X:
-        X_train = np.vstack( (X_train, X[key] ) )
-        y_train.extend([key for i in range(len(X[key]))])
-    # print(len(X_train), len(y_train))
-    c = Counter(y_train)
-    if printable:
-        print('the occurrence number of each stage after oversampling: %s' % c.most_common())
-    # ================ End of Classes balancing
-    return X_train, y_train
-
-## Random
-def get_random_int(min=0, max=10, number=5, seed=None):
-    """Return a list of random integer by the given range and quantity.
-
-    Examples
-    ---------
-    >>> r = get_random_int(min=0, max=10, number=5)
-    ... [10, 2, 3, 3, 7]
-    """
-    rnd = random.Random()
-    if seed:
-        rnd = random.Random(seed)
-    # return [random.randint(min,max) for p in range(0, number)]
-    return [rnd.randint(min,max) for p in range(0, number)]
-
-#
-# def class_balancing_sequence_4D(X_train, y_train, sequence_length, model='downsampling' ,printable=True):
-#     ''' 输入、输出都是sequence format
-#         oversampling or downsampling
-#     '''
-#     n_features = X_train.shape[2]
-#     # ======== Classes balancing for sequence
-#     if printable:
-#         print("Classes balancing for 4D sequence training examples...")
-#     from collections import Counter
-#     c = Counter(y_train)    # Counter({2: 454, 4: 267, 3: 124, 1: 57, 0: 48})
-#     if printable:
-#         print('the occurrence number of each stage: %s' % c.most_common())
-#         print('the least Label %s have %s instances' % c.most_common()[-1])
-#         print('the most  Label %s have %s instances' % c.most_common(1)[0])
-#     # print(c.most_common()) # [(2, 454), (4, 267), (3, 124), (1, 57), (0, 48)]
-#     most_num = c.most_common(1)[0][1]
-#     less_num = c.most_common()[-1][1]
-#
-#     locations = {}
-#     number = {}
-#     for lab, num in c.most_common():
-#         number[lab] = num
-#         locations[lab] = np.where(np.array(y_train)==lab)[0]
-#     # print(locations)
-#     # print(number)
-#     if printable:
-#         print('  convert list to dict')
-#     X = {}  # convert list to dict
-#     ### a sequence
-#     for lab, _ in number.items():
-#         X[lab] = np.empty(shape=(0,1,n_features,1)) # 4D
-#     for lab, _ in number.items():
-#         #X[lab] = X_train[locations[lab]
-#         for l in locations[lab]:
-#             X[lab] = np.vstack((X[lab], X_train[l*sequence_length : (l+1)*(sequence_length)]))
-#         # X[lab] = X_train[locations[lab]*sequence_length : locations[lab]*(sequence_length+1)]    # a sequence
-#     # print(X)
-#
-#     if model=='oversampling':
-#         if printable:
-#             print('  oversampling -- most num is %d, all classes tend to be this num\nshuffle applied' % most_num)
-#         for key in X:
-#             temp = X[key]
-#             while True:
-#                 if len(X[key]) >= most_num * sequence_length:   # sequence
-#                     break
-#                 X[key] = np.vstack((X[key], temp))
-#             # print(key, len(X[key]))
-#         if printable:
-#             print('  make each stage have same num of instances')
-#         for key in X:
-#             X[key] = X[key][0:most_num*sequence_length,:]   # sequence
-#             if printable:
-#                 print(key, len(X[key]))
-#     elif model=='downsampling':
-#         import random
-#         if printable:
-#             print('  downsampling -- less num is %d, all classes tend to be this num by randomly choice without replacement\nshuffle applied' % less_num)
-#         for key in X:
-#             # print(key, len(X[key]))#, len(X[key])/sequence_length)
-#             s_idx = [ i for i in range(int(len(X[key])/sequence_length))]
-#             s_idx = np.asarray(s_idx)*sequence_length   # start index of sequnce in X[key]
-#             # print('s_idx',s_idx)
-#             r_idx = np.random.choice(s_idx, less_num, replace=False)    # random choice less_num of s_idx
-#             # print('r_idx',r_idx)
-#             temp = X[key]
-#             X[key] = np.empty(shape=(0,1,n_features,1)) # 4D
-#             for idx in r_idx:
-#                 X[key] = np.vstack((X[key], temp[idx:idx+sequence_length]))
-#             # print(key, X[key])
-#             # np.random.choice(l, len(l), replace=False)
-#     else:
-#         raise Exception('  model should be oversampling or downsampling')
-#
-#     # convert dict to list
-#     if printable:
-#         print('  convert dict to list')
-#     y_train = []
-#     # X_train = np.empty(shape=(0,len(X[0][0])))
-#     # X_train = np.empty(shape=(0,len(X[1][0])))    # 2D
-#     X_train = np.empty(shape=(0,1,n_features,1))    # 4D
-#     l_key = list(X.keys())  # shuffle
-#     random.shuffle(l_key)   # shuffle
-#     # for key in X:     # no shuffle
-#     for key in l_key:   # shuffle
-#         X_train = np.vstack( (X_train, X[key] ) )
-#         # print(len(X[key]))
-#         y_train.extend([key for i in range(int(len(X[key])/sequence_length))])
-#     # print(X_train,y_train, type(X_train), type(y_train))
-#     # ================ End of Classes balancing for sequence
-#     # print(X_train.shape, len(y_train))
-#     return X_train, np.asarray(y_train)
diff --git a/tensorlayer/visualize.py b/tensorlayer/visualize.py
deleted file mode 100644
index d49ea10a..00000000
--- a/tensorlayer/visualize.py
+++ /dev/null
@@ -1,390 +0,0 @@
-#! /usr/bin/python
-# -*- coding: utf8 -*-
-
-
-import matplotlib
-
-## use this, if you got the following error:
-#  _tkinter.TclError: no display name and no $DISPLAY environment variable
-
-# matplotlib.use('Agg')
-
-import matplotlib.pyplot as plt
-import numpy as np
-import os
-from . import prepro
-
-
-## Save images
-import scipy.misc
-
-def read_image(image, path=''):
-    """ Read one image.
-
-    Parameters
-    -----------
-    images : string, file name.
-    path : string, path.
-    """
-    return scipy.misc.imread(os.path.join(path, image))
-
-def read_images(img_list, path='', n_threads=10, printable=True):
-    """ Returns all images in list by given path and name of each image file.
-
-    Parameters
-    -------------
-    img_list : list of string, the image file names.
-    path : string, image folder path.
-    n_threads : int, number of thread to read image.
-    printable : bool, print infomation when reading images, default is True.
-    """
-    imgs = []
-    for idx in range(0, len(img_list), n_threads):
-        b_imgs_list = img_list[idx : idx + n_threads]
-        b_imgs = prepro.threading_data(b_imgs_list, fn=read_image, path=path)
-        # print(b_imgs.shape)
-        imgs.extend(b_imgs)
-        if printable:
-            print('read %d from %s' % (len(imgs), path))
-    return imgs
-
-def save_image(image, image_path=''):
-    """Save one image.
-
-    Parameters
-    -----------
-    images : numpy array [w, h, c]
-    image_path : string.
-    """
-    try: # RGB
-        scipy.misc.imsave(image_path, image)
-    except: # Greyscale
-        scipy.misc.imsave(image_path, image[:,:,0])
-
-
-def save_images(images, size, image_path=''):
-    """Save mutiple images into one single image.
-
-    Parameters
-    -----------
-    images : numpy array [batch, w, h, c]
-    size : list of two int, row and column number.
-        number of images should be equal or less than size[0] * size[1]
-    image_path : string.
-
-    Examples
-    ---------
-    >>> images = np.random.rand(64, 100, 100, 3)
-    >>> tl.visualize.save_images(images, [8, 8], 'temp.png')
-    """
-    def merge(images, size):
-        h, w = images.shape[1], images.shape[2]
-        img = np.zeros((h * size[0], w * size[1], 3))
-        for idx, image in enumerate(images):
-            i = idx % size[1]
-            j = idx // size[1]
-            img[j*h:j*h+h, i*w:i*w+w, :] = image
-        return img
-
-    def imsave(images, size, path):
-        return scipy.misc.imsave(path, merge(images, size))
-
-    assert len(images) <= size[0] * size[1], "number of images should be equal or less than size[0] * size[1] {}".format(len(images))
-    return imsave(images, size, image_path)
-
-def W(W=None, second=10, saveable=True, shape=[28,28], name='mnist', fig_idx=2396512):
-    """Visualize every columns of the weight matrix to a group of Greyscale img.
-
-    Parameters
-    ----------
-    W : numpy.array
-        The weight matrix
-    second : int
-        The display second(s) for the image(s), if saveable is False.
-    saveable : boolean
-        Save or plot the figure.
-    shape : a list with 2 int
-        The shape of feature image, MNIST is [28, 80].
-    name : a string
-        A name to save the image, if saveable is True.
-    fig_idx : int
-        matplotlib figure index.
-
-    Examples
-    --------
-    >>> tl.visualize.W(network.all_params[0].eval(), second=10, saveable=True, name='weight_of_1st_layer', fig_idx=2012)
-    """
-    if saveable is False:
-        plt.ion()
-    fig = plt.figure(fig_idx)      # show all feature images
-    size = W.shape[0]
-    n_units = W.shape[1]
-
-    num_r = int(np.sqrt(n_units))  # 每行显示的个数   若25个hidden unit -> 每行显示5个
-    num_c = int(np.ceil(n_units/num_r))
-    count = int(1)
-    for row in range(1, num_r+1):
-        for col in range(1, num_c+1):
-            if count > n_units:
-                break
-            a = fig.add_subplot(num_r, num_c, count)
-            # ------------------------------------------------------------
-            # plt.imshow(np.reshape(W[:,count-1],(28,28)), cmap='gray')
-            # ------------------------------------------------------------
-            feature = W[:,count-1] / np.sqrt( (W[:,count-1]**2).sum())
-            # feature[feature<0.0001] = 0   # value threshold
-            # if count == 1 or count == 2:
-            #     print(np.mean(feature))
-            # if np.std(feature) < 0.03:      # condition threshold
-            #     feature = np.zeros_like(feature)
-            # if np.mean(feature) < -0.015:      # condition threshold
-            #     feature = np.zeros_like(feature)
-            plt.imshow(np.reshape(feature ,(shape[0],shape[1])),
-                    cmap='gray', interpolation="nearest")#, vmin=np.min(feature), vmax=np.max(feature))
-            # plt.title(name)
-            # ------------------------------------------------------------
-            # plt.imshow(np.reshape(W[:,count-1] ,(np.sqrt(size),np.sqrt(size))), cmap='gray', interpolation="nearest")
-            plt.gca().xaxis.set_major_locator(plt.NullLocator())    # distable tick
-            plt.gca().yaxis.set_major_locator(plt.NullLocator())
-            count = count + 1
-    if saveable:
-        plt.savefig(name+'.pdf',format='pdf')
-    else:
-        plt.draw()
-        plt.pause(second)
-
-def frame(I=None, second=5, saveable=True, name='frame', cmap=None, fig_idx=12836):
-    """Display a frame(image). Make sure OpenAI Gym render() is disable before using it.
-
-    Parameters
-    ----------
-    I : numpy.array
-        The image
-    second : int
-        The display second(s) for the image(s), if saveable is False.
-    saveable : boolean
-        Save or plot the figure.
-    name : a string
-        A name to save the image, if saveable is True.
-    cmap : None or string
-        'gray' for greyscale, None for default, etc.
-    fig_idx : int
-        matplotlib figure index.
-
-    Examples
-    --------
-    >>> env = gym.make("Pong-v0")
-    >>> observation = env.reset()
-    >>> tl.visualize.frame(observation)
-    """
-    if saveable is False:
-        plt.ion()
-    fig = plt.figure(fig_idx)      # show all feature images
-
-    if len(I.shape) and I.shape[-1]==1:     # (10,10,1) --> (10,10)
-        I = I[:,:,0]
-
-    plt.imshow(I, cmap)
-    plt.title(name)
-    # plt.gca().xaxis.set_major_locator(plt.NullLocator())    # distable tick
-    # plt.gca().yaxis.set_major_locator(plt.NullLocator())
-
-    if saveable:
-        plt.savefig(name+'.pdf',format='pdf')
-    else:
-        plt.draw()
-        plt.pause(second)
-
-def CNN2d(CNN=None, second=10, saveable=True, name='cnn', fig_idx=3119362):
-    """Display a group of RGB or Greyscale CNN masks.
-
-    Parameters
-    ----------
-    CNN : numpy.array
-        The image. e.g: 64 5x5 RGB images can be (5, 5, 3, 64).
-    second : int
-        The display second(s) for the image(s), if saveable is False.
-    saveable : boolean
-        Save or plot the figure.
-    name : a string
-        A name to save the image, if saveable is True.
-    fig_idx : int
-        matplotlib figure index.
-
-    Examples
-    --------
-    >>> tl.visualize.CNN2d(network.all_params[0].eval(), second=10, saveable=True, name='cnn1_mnist', fig_idx=2012)
-    """
-    # print(CNN.shape)    # (5, 5, 3, 64)
-    # exit()
-    n_mask = CNN.shape[3]
-    n_row = CNN.shape[0]
-    n_col = CNN.shape[1]
-    n_color = CNN.shape[2]
-    row = int(np.sqrt(n_mask))
-    col = int(np.ceil(n_mask/row))
-    plt.ion()   # active mode
-    fig = plt.figure(fig_idx)
-    count = 1
-    for ir in range(1, row+1):
-        for ic in range(1, col+1):
-            if count > n_mask:
-                break
-            a = fig.add_subplot(col, row, count)
-            # print(CNN[:,:,:,count-1].shape, n_row, n_col)   # (5, 1, 32) 5 5
-            # exit()
-            # plt.imshow(
-            #         np.reshape(CNN[count-1,:,:,:], (n_row, n_col)),
-            #         cmap='gray', interpolation="nearest")     # theano
-            if n_color == 1:
-                plt.imshow(
-                        np.reshape(CNN[:,:,:,count-1], (n_row, n_col)),
-                        cmap='gray', interpolation="nearest")
-            elif n_color == 3:
-                plt.imshow(
-                        np.reshape(CNN[:,:,:,count-1], (n_row, n_col, n_color)),
-                        cmap='gray', interpolation="nearest")
-            else:
-                raise Exception("Unknown n_color")
-            plt.gca().xaxis.set_major_locator(plt.NullLocator())    # distable tick
-            plt.gca().yaxis.set_major_locator(plt.NullLocator())
-            count = count + 1
-    if saveable:
-        plt.savefig(name+'.pdf',format='pdf')
-    else:
-        plt.draw()
-        plt.pause(second)
-
-
-def images2d(images=None, second=10, saveable=True, name='images', dtype=None,
-                                                            fig_idx=3119362):
-    """Display a group of RGB or Greyscale images.
-
-    Parameters
-    ----------
-    images : numpy.array
-        The images.
-    second : int
-        The display second(s) for the image(s), if saveable is False.
-    saveable : boolean
-        Save or plot the figure.
-    name : a string
-        A name to save the image, if saveable is True.
-    dtype : None or numpy data type
-        The data type for displaying the images.
-    fig_idx : int
-        matplotlib figure index.
-
-    Examples
-    --------
-    >>> X_train, y_train, X_test, y_test = tl.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False)
-    >>> tl.visualize.images2d(X_train[0:100,:,:,:], second=10, saveable=False, name='cifar10', dtype=np.uint8, fig_idx=20212)
-    """
-    # print(images.shape)    # (50000, 32, 32, 3)
-    # exit()
-    if dtype:
-        images = np.asarray(images, dtype=dtype)
-    n_mask = images.shape[0]
-    n_row = images.shape[1]
-    n_col = images.shape[2]
-    n_color = images.shape[3]
-    row = int(np.sqrt(n_mask))
-    col = int(np.ceil(n_mask/row))
-    plt.ion()   # active mode
-    fig = plt.figure(fig_idx)
-    count = 1
-    for ir in range(1, row+1):
-        for ic in range(1, col+1):
-            if count > n_mask:
-                break
-            a = fig.add_subplot(col, row, count)
-            # print(images[:,:,:,count-1].shape, n_row, n_col)   # (5, 1, 32) 5 5
-            # plt.imshow(
-            #         np.reshape(images[count-1,:,:,:], (n_row, n_col)),
-            #         cmap='gray', interpolation="nearest")     # theano
-            if n_color == 1:
-                plt.imshow(
-                        np.reshape(images[count-1,:,:], (n_row, n_col)),
-                        cmap='gray', interpolation="nearest")
-                # plt.title(name)
-            elif n_color == 3:
-                plt.imshow(images[count-1,:,:],
-                        cmap='gray', interpolation="nearest")
-                # plt.title(name)
-            else:
-                raise Exception("Unknown n_color")
-            plt.gca().xaxis.set_major_locator(plt.NullLocator())    # distable tick
-            plt.gca().yaxis.set_major_locator(plt.NullLocator())
-            count = count + 1
-    if saveable:
-        plt.savefig(name+'.pdf',format='pdf')
-    else:
-        plt.draw()
-        plt.pause(second)
-
-def tsne_embedding(embeddings, reverse_dictionary, plot_only=500,
-                        second=5, saveable=False, name='tsne', fig_idx=9862):
-    """Visualize the embeddings by using t-SNE.
-
-    Parameters
-    ----------
-    embeddings : a matrix
-        The images.
-    reverse_dictionary : a dictionary
-        id_to_word, mapping id to unique word.
-    plot_only : int
-        The number of examples to plot, choice the most common words.
-    second : int
-        The display second(s) for the image(s), if saveable is False.
-    saveable : boolean
-        Save or plot the figure.
-    name : a string
-        A name to save the image, if saveable is True.
-    fig_idx : int
-        matplotlib figure index.
-
-    Examples
-    --------
-    >>> see 'tutorial_word2vec_basic.py'
-    >>> final_embeddings = normalized_embeddings.eval()
-    >>> tl.visualize.tsne_embedding(final_embeddings, labels, reverse_dictionary,
-    ...                   plot_only=500, second=5, saveable=False, name='tsne')
-    """
-    def plot_with_labels(low_dim_embs, labels, figsize=(18, 18), second=5,
-                                    saveable=True, name='tsne', fig_idx=9862):
-        assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
-        if saveable is False:
-            plt.ion()
-            plt.figure(fig_idx)
-        plt.figure(figsize=figsize)  #in inches
-        for i, label in enumerate(labels):
-            x, y = low_dim_embs[i,:]
-            plt.scatter(x, y)
-            plt.annotate(label,
-                     xy=(x, y),
-                     xytext=(5, 2),
-                     textcoords='offset points',
-                     ha='right',
-                     va='bottom')
-        if saveable:
-            plt.savefig(name+'.pdf',format='pdf')
-        else:
-            plt.draw()
-            plt.pause(second)
-
-    try:
-        from sklearn.manifold import TSNE
-        import matplotlib.pyplot as plt
-        from six.moves import xrange
-
-        tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
-        # plot_only = 500
-        low_dim_embs = tsne.fit_transform(embeddings[:plot_only,:])
-        labels = [reverse_dictionary[i] for i in xrange(plot_only)]
-        plot_with_labels(low_dim_embs, labels, second=second, saveable=saveable, \
-                                                    name=name, fig_idx=fig_idx)
-    except ImportError:
-        print("Please install sklearn and matplotlib to visualize embeddings.")
-
-
-#