diff --git a/README.md b/README.md
index a3eb3a1..3811d13 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,24 @@
 # RecommendationCTR
 Recommendation model for CTR prediction
 
-## Implementation based on KGAT
+## Implementation based on following project
 
-* https://github.com/LunaBlack/KGAT-pytorch
+* https://github.com/LunaBlack/KGAT-pytorch # model KGAT Pyorch implementation
 
-* https://github.com/xiangwang1223/knowledge_graph_attention_network
+* https://github.com/xiangwang1223/knowledge_graph_attention_network # model KGAT Tensorflow implementation
+
+* https://github.com/gusye1234/LightGCN-PyTorch # model LightGCN Pyorch implementation
+
+* https://github.com/shenweichen/GraphEmbedding # a lot of graph embedding model (DeepWalk, Struc2Vec, etc.) implementation
 
 ## Environment Requirement
 The code has been tested running under Python 3.6.8. The required packages are as follows:
-* torch == 1.3.1
-* dgl-cu90 == 0.4.1
-* numpy == 1.15.4
-* pandas == 0.23.1
-* scipy == 1.1.0
-* sklearn == 0.20.0
-
+* torch
+* dgl-cu90
+* numpy
+* pandas
+* scipy
+* sklearn
+* networkx
+* fastdtw
+* joblib
diff --git a/lgcn_model.py b/light_gcn/lgcn_model.py
similarity index 100%
rename from lgcn_model.py
rename to light_gcn/lgcn_model.py
diff --git a/script_kgat.py b/script_kgat.py
deleted file mode 100644
index 7403a36..0000000
--- a/script_kgat.py
+++ /dev/null
@@ -1,296 +0,0 @@
-import os
-os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
-
-import random
-import logging
-import argparse
-from time import time
-
-import torch
-import numpy as np
-import pandas as pd
-import torch.nn as nn
-import torch.optim as optim
-import torch.distributed as dist
-
-from model.KGAT import KGAT
-from utility.parser_kgat import *
-from utility.log_helper import *
-from utility.metrics import *
-from utility.helper import *
-from utility.loader_kgat import DataLoaderKGAT
-
-
-def evaluate(model, train_graph, train_user_dict, test_user_dict, user_ids_batches, item_ids, K):
-    model.eval()
-
-    with torch.no_grad():
-        att = model.compute_attention(train_graph)
-    train_graph.edata['att'] = att
-
-    n_users = len(test_user_dict.keys())
-    item_ids_batch = item_ids.cpu().numpy()
-
-    cf_scores = []
-    precision = []
-    recall = []
-    ndcg = []
-
-    with torch.no_grad():
-        for user_ids_batch in user_ids_batches:
-            cf_scores_batch = model('predict', train_graph, user_ids_batch, item_ids)       # (n_batch_users, n_eval_items)
-
-            cf_scores_batch = cf_scores_batch.cpu()
-            user_ids_batch = user_ids_batch.cpu().numpy()
-            precision_batch, recall_batch, ndcg_batch = calc_metrics_at_k(cf_scores_batch, train_user_dict, test_user_dict, user_ids_batch, item_ids_batch, K)
-
-            cf_scores.append(cf_scores_batch.numpy())
-            precision.append(precision_batch)
-            recall.append(recall_batch)
-            ndcg.append(ndcg_batch)
-
-    cf_scores = np.concatenate(cf_scores, axis=0)
-    precision_k = sum(np.concatenate(precision)) / n_users
-    recall_k = sum(np.concatenate(recall)) / n_users
-    ndcg_k = sum(np.concatenate(ndcg)) / n_users
-    return cf_scores, precision_k, recall_k, ndcg_k
-
-
-def train(args):
-    # seed
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-
-    log_save_id = create_log_id(args.save_dir)
-    logging_config(folder=args.save_dir, name='log{:d}'.format(log_save_id), no_console=False)
-    logging.info(args)
-
-    # GPU / CPU
-    use_cuda = torch.cuda.is_available()
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    n_gpu = torch.cuda.device_count()
-    if n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-    # load data
-    data = DataLoaderKGAT(args, logging)
-
-    if args.use_pretrain == 1:
-        user_pre_embed = torch.tensor(data.user_pre_embed)
-        item_pre_embed = torch.tensor(data.item_pre_embed)
-    else:
-        user_pre_embed, item_pre_embed = None, None
-
-    user_ids = list(data.test_user_dict.keys()) # for test
-    user_ids_batches = [user_ids[i: i + args.test_batch_size] for i in range(0, len(user_ids), args.test_batch_size)] # for test
-    user_ids_batches = [torch.LongTensor(d) for d in user_ids_batches] # for test
-    if use_cuda:
-        user_ids_batches = [d.to(device) for d in user_ids_batches] # for test
-
-    item_ids = torch.arange(data.n_items, dtype=torch.long)
-    if use_cuda:
-        item_ids = item_ids.to(device)
-
-    # construct model & optimizer
-    model = KGAT(args, data.n_users, data.n_entities, data.n_relations, user_pre_embed, item_pre_embed)
-    if args.use_pretrain == 2:
-        model = load_model(model, args.pretrain_model_path)
-
-    model.to(device)
-    # if n_gpu > 1:
-    #     model = nn.parallel.DistributedDataParallel(model)
-    logging.info(model)
-
-    optimizer = optim.Adam(model.parameters(), lr=args.lr)
-
-    # move graph data to GPU
-    train_graph = data.train_graph
-    train_nodes = torch.LongTensor(train_graph.ndata['id'])
-    train_edges = torch.LongTensor(train_graph.edata['type'])
-    if use_cuda:
-        train_nodes = train_nodes.to(device)
-        train_edges = train_edges.to(device)
-    train_graph.ndata['id'] = train_nodes
-    train_graph.edata['type'] = train_edges
-
-    test_graph = data.test_graph
-    test_nodes = torch.LongTensor(test_graph.ndata['id'])
-    test_edges = torch.LongTensor(test_graph.edata['type'])
-    if use_cuda:
-        test_nodes = test_nodes.to(device)
-        test_edges = test_edges.to(device)
-    test_graph.ndata['id'] = test_nodes
-    test_graph.edata['type'] = test_edges
-
-    # initialize metrics
-    best_epoch = -1
-    epoch_list = []
-    precision_list = []
-    recall_list = []
-    ndcg_list = []
-
-    # train model
-    for epoch in range(1, args.n_epoch + 1):
-        time0 = time()
-        model.train()
-
-        # update attention scores
-        with torch.no_grad():
-            att = model('calc_att', train_graph)
-        train_graph.edata['att'] = att
-        logging.info('Update attention scores: Epoch {:04d} | Total Time {:.1f}s'.format(epoch, time() - time0))
-
-        # train cf
-        time1 = time()
-        cf_total_loss = 0
-        n_cf_batch = data.n_cf_train // data.cf_batch_size + 1
-
-        for iter in range(1, n_cf_batch + 1):
-            time2 = time()
-            cf_batch_user, cf_batch_pos_item, cf_batch_neg_item = data.generate_cf_batch(data.train_user_dict)
-            if use_cuda:
-                cf_batch_user = cf_batch_user.to(device)
-                cf_batch_pos_item = cf_batch_pos_item.to(device)
-                cf_batch_neg_item = cf_batch_neg_item.to(device)
-            cf_batch_loss = model('calc_cf_loss', train_graph, cf_batch_user, cf_batch_pos_item, cf_batch_neg_item)
-
-            print('cf_batch_loss.backward() start')
-            time_start = time()
-            cf_batch_loss.backward()
-            print('cf_batch_loss.backward time:' + str(time() - time_start))
-            optimizer.step()
-            optimizer.zero_grad()
-            cf_total_loss += cf_batch_loss.item()
-
-            if (iter % args.cf_print_every) == 0:
-                logging.info('CF Training: Epoch {:04d} Iter {:04d} / {:04d} | Time {:.1f}s | Iter Loss {:.4f} | Iter Mean Loss {:.4f}'.format(epoch, iter, n_cf_batch, time() - time2, cf_batch_loss.item(), cf_total_loss / iter))
-        logging.info('CF Training: Epoch {:04d} Total Iter {:04d} | Total Time {:.1f}s | Iter Mean Loss {:.4f}'.format(epoch, n_cf_batch, time() - time1, cf_total_loss / n_cf_batch))
-
-        # train kg
-        time1 = time()
-        kg_total_loss = 0
-        n_kg_batch = data.n_kg_train // data.kg_batch_size + 1
-
-        for iter in range(1, n_kg_batch + 1):
-            time2 = time()
-            kg_batch_head, kg_batch_relation, kg_batch_pos_tail, kg_batch_neg_tail = data.generate_kg_batch(data.train_kg_dict)
-            if use_cuda:
-                kg_batch_head = kg_batch_head.to(device)
-                kg_batch_relation = kg_batch_relation.to(device)
-                kg_batch_pos_tail = kg_batch_pos_tail.to(device)
-                kg_batch_neg_tail = kg_batch_neg_tail.to(device)
-            kg_batch_loss = model('calc_kg_loss', kg_batch_head, kg_batch_relation, kg_batch_pos_tail, kg_batch_neg_tail)
-
-            kg_batch_loss.backward()
-            optimizer.step()
-            optimizer.zero_grad()
-            kg_total_loss += kg_batch_loss.item()
-
-            if (iter % args.kg_print_every) == 0:
-                logging.info('KG Training: Epoch {:04d} Iter {:04d} / {:04d} | Time {:.1f}s | Iter Loss {:.4f} | Iter Mean Loss {:.4f}'.format(epoch, iter, n_kg_batch, time() - time2, kg_batch_loss.item(), kg_total_loss / iter))
-        logging.info('KG Training: Epoch {:04d} Total Iter {:04d} | Total Time {:.1f}s | Iter Mean Loss {:.4f}'.format(epoch, n_kg_batch, time() - time1, kg_total_loss / n_kg_batch))
-
-        logging.info('CF + KG Training: Epoch {:04d} | Total Time {:.1f}s'.format(epoch, time() - time0))
-
-        # evaluate cf
-        if (epoch % args.evaluate_every) == 0:
-            time1 = time()
-            _, precision, recall, ndcg = evaluate(model, train_graph, data.train_user_dict, data.test_user_dict, user_ids_batches, item_ids, args.K)
-            logging.info('CF Evaluation: Epoch {:04d} | Total Time {:.1f}s | Precision {:.4f} Recall {:.4f} NDCG {:.4f}'.format(epoch, time() - time1, precision, recall, ndcg))
-
-            epoch_list.append(epoch)
-            precision_list.append(precision)
-            recall_list.append(recall)
-            ndcg_list.append(ndcg)
-            best_recall, should_stop = early_stopping(recall_list, args.stopping_steps)
-
-            if should_stop:
-                break
-
-            if recall_list.index(best_recall) == len(recall_list) - 1:
-                save_model(model, args.save_dir, epoch, best_epoch)
-                logging.info('Save model on epoch {:04d}!'.format(epoch))
-                best_epoch = epoch
-
-    # save model
-    save_model(model, args.save_dir, epoch)
-
-    # save metrics
-    _, precision, recall, ndcg = evaluate(model, train_graph, data.train_user_dict, data.test_user_dict, user_ids_batches, item_ids, args.K)
-    logging.info('Final CF Evaluation: Precision {:.4f} Recall {:.4f} NDCG {:.4f}'.format(precision, recall, ndcg))
-
-    epoch_list.append(epoch)
-    precision_list.append(precision)
-    recall_list.append(recall)
-    ndcg_list.append(ndcg)
-
-    metrics = pd.DataFrame([epoch_list, precision_list, recall_list, ndcg_list]).transpose()
-    metrics.columns = ['epoch_idx', 'precision@{}'.format(args.K), 'recall@{}'.format(args.K), 'ndcg@{}'.format(args.K)]
-    metrics.to_csv(args.save_dir + '/metrics.tsv', sep='\t', index=False)
-
-
-def predict(args):
-    # GPU / CPU
-    use_cuda = torch.cuda.is_available()
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    n_gpu = torch.cuda.device_count()
-    if n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-    # load data
-    data = DataLoaderKGAT(args, logging)
-
-    user_ids = list(data.test_user_dict.keys())
-    user_ids_batches = [user_ids[i: i + args.test_batch_size] for i in range(0, len(user_ids), args.test_batch_size)]
-    user_ids_batches = [torch.LongTensor(d) for d in user_ids_batches]
-    if use_cuda:
-        user_ids_batches = [d.to(device) for d in user_ids_batches]
-
-    item_ids = torch.arange(data.n_items, dtype=torch.long)
-    if use_cuda:
-        item_ids = item_ids.to(device)
-
-    # load model
-    model = KGAT(args, data.n_users, data.n_entities, data.n_relations)
-    model = load_model(model, args.pretrain_model_path)
-    model.to(device)
-    # if n_gpu > 1:
-    #     model = nn.parallel.DistributedDataParallel(model)
-
-    # move graph data to GPU
-    train_graph = data.train_graph
-    train_nodes = torch.LongTensor(train_graph.ndata['id'])
-    train_edges = torch.LongTensor(train_graph.edata['type'])
-    if use_cuda:
-        train_nodes = train_nodes.to(device)
-        train_edges = train_edges.to(device)
-    train_graph.ndata['id'] = train_nodes
-    train_graph.edata['type'] = train_edges
-
-    test_graph = data.test_graph
-    test_nodes = torch.LongTensor(test_graph.ndata['id'])
-    test_edges = torch.LongTensor(test_graph.edata['type'])
-    if use_cuda:
-        test_nodes = test_nodes.to(device)
-        test_edges = test_edges.to(device)
-    test_graph.ndata['id'] = test_nodes
-    test_graph.edata['type'] = test_edges
-
-    # predict
-    cf_scores, precision, recall, ndcg = evaluate(model, train_graph, data.train_user_dict, data.test_user_dict, user_ids_batches, item_ids, args.K)
-    np.save(args.save_dir + 'cf_scores.npy', cf_scores)
-    print('CF Evaluation: Precision {:.4f} Recall {:.4f} NDCG {:.4f}'.format(precision, recall, ndcg))
-
-
-
-if __name__ == '__main__':
-    args = parse_kgat_args()
-    train(args)
-    # predict(args)
-
-
-
-
-
-
diff --git a/script_lgcn.py b/script_lgcn.py
index 25958ea..501a244 100644
--- a/script_lgcn.py
+++ b/script_lgcn.py
@@ -7,7 +7,7 @@
 import numpy as np
 
 from cf_dataset import DataOnlyCF
-from lgcn_model import LightGCN
+from light_gcn.lgcn_model import LightGCN
 from metrics import precision_and_recall, ndcg, auc
 
 EPOCH = 200
diff --git a/test_dgl.py b/test_dgl.py
deleted file mode 100644
index 115fdca..0000000
--- a/test_dgl.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import time
-
-import dgl
-import dgl.function as fn
-import torch
-import networkx as nx
-import matplotlib.pyplot as plt
-
-def pagerank_message_func(edges):
-    return {'pv' : edges.src['pv'] / edges.src['deg']}
-
-def pagerank_reduce_func(nodes):
-    msgs = torch.sum(nodes.mailbox['pv'], dim=1)
-    pv = (1 - DAMP) / N + DAMP * msgs
-    return {'pv' : pv}
-
-def pagerank_naive(g):
-    # Phase #1: send out messages along all edges.
-    for u, v in zip(*g.edges()):
-        g.send((u, v))
-    # Phase #2: receive messages to compute new PageRank values.
-    for v in g.nodes():
-        g.recv(v)
-
-def pagerank_batch(g):
-    g.send(g.edges())
-    g.recv(g.nodes())
-
-def pagerank_builtin(g):
-    g.ndata['pv'] = g.ndata['pv'] / g.ndata['deg']
-    g.update_all(message_func=fn.copy_src(src='pv', out='m'),
-                 reduce_func=fn.sum(msg='m',out='m_sum'))
-    g.ndata['pv'] = (1 - DAMP) / N + DAMP * g.ndata['m_sum']
-
-if __name__ == "__main__":
-    N = 10000  # number of nodes
-    DAMP = 0.85  # damping factor
-    K = 10  # number of iterations
-    g = nx.nx.erdos_renyi_graph(N, 0.1)
-    g = dgl.DGLGraph(g)
-    # nx.draw(g.to_networkx(), node_size=50, node_color=[[.5, .5, .5,]])
-    # plt.show()
-
-    g.ndata['pv'] = torch.ones(N) / N
-    g.ndata['deg'] = g.out_degrees(g.nodes()).float()
-    g.register_message_func(pagerank_message_func)
-    g.register_reduce_func(pagerank_reduce_func)
-
-    ts = time.time()
-    for k in range(K):
-        # pagerank_builtin(g)
-        pagerank_naive(g)
-    print('time spend: ' + str(time.time() - ts))
-
-    # print(g.ndata['pv'])