diff --git a/README.md b/README.md
index da28cd03..ddfe9a68 100644
--- a/README.md
+++ b/README.md
@@ -10,8 +10,7 @@
 
 DeepCTR is a **Easy-to-use**,**Modular** and **Extendible** package of deep-learning based CTR models ,including serval DNN-based CTR models and lots of core components layer of the models which can be used to build your own custom model.The goal is to make it possible for everyone to use complex deep learning-based models with `model.fit()`and`model.predict()`. 
 
-Through  `pip install deepctr`  get the package and [**Get Started!**](https://deepctr-doc.readthedocs.io/en/latest/
-)
+Run `pip install deepctr` to get the package, then [**Get Started!**](https://deepctr-doc.readthedocs.io/en/latest/Quick-Start.html)
 
 
 ## Models List
diff --git a/deepctr/layers.py b/deepctr/layers.py
index d11a5da1..a0650115 100644
--- a/deepctr/layers.py
+++ b/deepctr/layers.py
@@ -18,7 +18,6 @@ class FM(Layer):
         
       References
         - [Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)
-
     """
     def __init__(self, **kwargs):
 
@@ -111,7 +110,7 @@ def build(self, input_shape):
         self.projection_p = self.add_weight(shape=(embedding_size, 1), initializer=glorot_normal(seed=self.seed), name="projection_p")
         super(AFMLayer, self).build(input_shape)  # Be sure to call this somewhere!
 
-    def call(self, inputs,**kwargs ):
+    def call(self, inputs,**kwargs):
 
         if K.ndim(inputs[0]) != 3:
             raise ValueError("Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs)))
@@ -258,13 +257,10 @@ class MLP(Layer):
     """The Multi Layer Percetron
         
       Input shape
-        - nD tensor with shape: ``(batch_size, ..., input_dim)``.
-        The most common situation would be a 2D input with shape ``(batch_size, input_dim)``.
+        - nD tensor with shape: ``(batch_size, ..., input_dim)``. The most common situation would be a 2D input with shape ``(batch_size, input_dim)``.
 
       Output shape
-        - nD tensor with shape: ``(batch_size, ..., hidden_size[-1])``.
-        For instance, for a 2D input with shape `(batch_size, input_dim)`,
-        the output would have shape ``(batch_size, hidden_size[-1])``.
+        - nD tensor with shape: ``(batch_size, ..., hidden_size[-1])``. For instance, for a 2D input with shape ``(batch_size, input_dim)``, the output would have shape ``(batch_size, hidden_size[-1])``.
 
       Arguments
         - **hidden_size**:list of positive integer, the layer number and units in each layer.
@@ -278,8 +274,8 @@ class MLP(Layer):
         - **use_bn**: bool. Whether use BatchNormalization before activation or not.
 
         - **seed**: A Python integer to use as random seed.
-
     """
+
     def __init__(self,  hidden_size, activation,l2_reg, keep_prob, use_bn,seed,**kwargs):
         self.hidden_size = hidden_size
         self.activation =activation
@@ -339,6 +335,7 @@ class BiInteractionPooling(Layer):
       References
         - [Neural Factorization Machines for Sparse Predictive Analytics](http://arxiv.org/abs/1708.05027)
     """
+
     def __init__(self, **kwargs):
 
         super(BiInteractionPooling, self).__init__(**kwargs)
@@ -505,8 +502,7 @@ class InnerProductLayer(Layer):
         - A list of N 3D tensor with shape: ``(batch_size,1,embedding_size)``.
 
       Output shape
-        - 2D tensor with shape: ``(batch_size, N*(N-1)/2 )`` if use reduce_sum. or
-        3D tensor with shape: ``(batch_size, N*(N-1)/2, embedding_size )`` if not use reduce_sum.
+        - 2D tensor with shape: ``(batch_size, N*(N-1)/2 )`` if ``reduce_sum`` is used, or 3D tensor with shape: ``(batch_size, N*(N-1)/2, embedding_size )`` if ``reduce_sum`` is not used.
 
       Arguments
         - **reduce_sum**: bool. Whether return inner product or element-wise product
@@ -514,6 +510,7 @@ class InnerProductLayer(Layer):
       References
             - [Product-based Neural Networks for User Response Prediction](https://arxiv.org/pdf/1611.00144.pdf)
     """
+
     def __init__(self,reduce_sum=True,**kwargs):
         self.reduce_sum = reduce_sum
         super(InnerProductLayer, self).__init__(**kwargs)
@@ -605,6 +602,7 @@ class LocalActivationUnit(Layer):
       References
         - [Deep Interest Network for Click-Through Rate Prediction](https://arxiv.org/pdf/1706.06978.pdf)
     """
+
     def __init__(self,hidden_size, activation,l2_reg, keep_prob, use_bn,seed,**kwargs):
         self.hidden_size = hidden_size
         self.activation = activation
diff --git a/deepctr/sequence.py b/deepctr/sequence.py
index df8169f4..f3807eaf 100644
--- a/deepctr/sequence.py
+++ b/deepctr/sequence.py
@@ -13,7 +13,7 @@ class SequencePoolingLayer(Layer):
 
         - seq_value is a 3D tensor with shape: ``(batch_size, T, embedding_size``
 
-        - seq_len is a 2D tensor with shape : ``(batch_size, 1)``，indicate valid length of each sequence.
+        - seq_len is a 2D tensor with shape : ``(batch_size, 1)``, indicating the valid length of each sequence.
 
       Output shape
         - 3D tensor with shape: `(batch_size, 1, embedding_size)`.
@@ -40,11 +40,11 @@ def call(self, seq_value_len_list, **kwargs):
         uiseq_embed_list, user_behavior_length = seq_value_len_list
         embedding_size = uiseq_embed_list.shape[-1]
         mask = tf.sequence_mask(user_behavior_length,
-                                self.seq_len_max, dtype=tf.float32)  # [B, T,1]
-        # tf.transpose(mask, [0, 2, 1])
+                                self.seq_len_max, dtype=tf.float32)
+
         mask = K.permute_dimensions(mask, [0, 2, 1])
-        mask = tf.tile(mask, [1, 1, embedding_size])  # [B, T, H]
-        uiseq_embed_list *= mask  # [B, T, H]
+        mask = tf.tile(mask, [1, 1, embedding_size])
+        uiseq_embed_list *= mask
         hist = uiseq_embed_list
         if self.mode == "max":
             return K.max(hist, 1, keepdims=True)
@@ -127,7 +127,6 @@ def call(self, inputs, **kwargs):
         outputs = K.permute_dimensions(attention_score, (0, 2, 1))
         key_masks = tf.sequence_mask(keys_length, hist_len)
 
-
         if self.weight_normalization:
             paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
         else:
@@ -135,7 +134,6 @@ def call(self, inputs, **kwargs):
 
         outputs = tf.where(key_masks, outputs, paddings)
 
-
         if self.weight_normalization:
             outputs = K.softmax(outputs)
 
@@ -148,7 +146,7 @@ def compute_output_shape(self, input_shape):
 
     def get_config(self,):
 
-        config = {'hidden_size': self.hidden_size,'activation':self.activation,'weight_normalization':self.weight_normalization}
+        config = {'hidden_size': self.hidden_size, 'activation': self.activation,
+                  'weight_normalization': self.weight_normalization}
         base_config = super(AttentionSequencePoolingLayer, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))
-
diff --git a/demo/run_classification_criteo.py b/demo/run_classification_criteo.py
index 982d43e3..12beec39 100644
--- a/demo/run_classification_criteo.py
+++ b/demo/run_classification_criteo.py
@@ -2,7 +2,6 @@
 from sklearn.preprocessing import LabelEncoder, MinMaxScaler
 from deepctr.models import DeepFM
 
-
 if __name__ == "__main__":
     data = pd.read_csv('./criteo_sample.txt')
 
diff --git a/demo/run_regression_movielens.py b/demo/run_regression_movielens.py
index 39d83061..347749c2 100644
--- a/demo/run_regression_movielens.py
+++ b/demo/run_regression_movielens.py
@@ -2,7 +2,6 @@
 from sklearn.preprocessing import LabelEncoder
 from deepctr.models import DeepFM
 
-
 if __name__ == "__main__":
 
     data = pd.read_csv("./movielens_sample.txt")
diff --git a/docs/source/Demo.rst b/docs/source/Demo.rst
index b26938b0..68ea95f0 100644
--- a/docs/source/Demo.rst
+++ b/docs/source/Demo.rst
@@ -23,7 +23,7 @@ This example shows how to use *DeepFM* to solve a simple binary classification t
 
     import pandas as pd
     from sklearn.preprocessing import LabelEncoder,MinMaxScaler
-    from deepctr import DeepFM
+    from deepctr.models import DeepFM
 
     
     data = pd.read_csv('./criteo_sample.txt')
@@ -80,7 +80,7 @@ This example shows how to use *DeepFM* to solve a simple binary regression task.
 
     import pandas as pd
     from sklearn.preprocessing import LabelEncoder,MinMaxScaler
-    from deepctr import DeepFM
+    from deepctr.models import DeepFM
 
     
     data = pd.read_csv("./movielens_sample.txt")
diff --git a/docs/source/Models-API.rst b/docs/source/Models-API.rst
index fec2b5fc..567be8d0 100644
--- a/docs/source/Models-API.rst
+++ b/docs/source/Models-API.rst
@@ -5,7 +5,7 @@ DeepCTR Models API
 
    FNN<deepctr.models.fnn>
    PNN<deepctr.models.pnn>
-   Wide&Deep<deepctr.models.wdl>
+   WDL<deepctr.models.wdl>
    DeepFM<deepctr.models.deepfm>
    MLR<deepctr.models.mlr>
    NFM<deepctr.models.nfm>
diff --git a/docs/source/Quick-Start.rst b/docs/source/Quick-Start.rst
index 6092fe0d..e2a6bf83 100644
--- a/docs/source/Quick-Start.rst
+++ b/docs/source/Quick-Start.rst
@@ -20,7 +20,7 @@ Step 1: Import model
     import pandas as pd
     from sklearn.preprocessing import LabelEncoder,MinMaxScaler
 
-    from deepctr import DeepFM
+    from deepctr.models import DeepFM
 
     data = pd.read_csv('./criteo_sample.txt')
 
diff --git a/docs/source/index.rst b/docs/source/index.rst
index d4ed2dff..0956b740 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -9,7 +9,7 @@ Welcome to DeepCTR's documentation!
 DeepCTR is a **Easy-to-use** , **Modular** and **Extendible** package of deep-learning based based CTR models ,including serval DNN-based CTR models and lots of core components layer of the models which can be used to build your own custom model.
 The goal is to make it possible for everyone to use complex deep learning-based models with ``model.fit()`` and ``model.predict()`` .
 
-Through ``pip install deepctr`` get the package and `Get Started! <../html/Quick-Start.html>`_
+Run ``pip install deepctr`` to get the package, then `Get Started! <./Quick-Start.html>`_
 
 You can find source code at https://github.com/shenweichen/DeepCTR