
dev-din base version
wangze committed Mar 18, 2020
1 parent afd1854 commit 80eccf9
Showing 4 changed files with 252 additions and 36 deletions.
54 changes: 18 additions & 36 deletions deepctr_torch/inputs.py
@@ -133,42 +133,6 @@ def combined_dnn_input(sparse_embedding_list, dense_value_list):
    else:
        raise NotImplementedError

#
#
# def varlen_embedding_lookup(embedding_dict, sequence_input_dict, varlen_sparse_feature_columns):
#     varlen_embedding_vec_dict = {}
#     for fc in varlen_sparse_feature_columns:
#         feature_name = fc.name
#         embedding_name = fc.embedding_name
#         if fc.use_hash:
#             # lookup_idx = Hash(fc.vocabulary_size, mask_zero=True)(sequence_input_dict[feature_name])
#             # TODO: add hash function
#             lookup_idx = sequence_input_dict[feature_name]
#         else:
#             lookup_idx = sequence_input_dict[feature_name]
#         varlen_embedding_vec_dict[feature_name] = embedding_dict[embedding_name](lookup_idx)
#     return varlen_embedding_vec_dict
#
#
# def get_varlen_pooling_list(embedding_dict, features, varlen_sparse_feature_columns, to_list=False):
#     pooling_vec_list = defaultdict(list)
#     for fc in varlen_sparse_feature_columns:
#         feature_name = fc.name
#         combiner = fc.combiner
#         feature_length_name = fc.length_name
#         if feature_length_name is not None:
#             seq_input = embedding_dict[feature_name]
#             vec = SequencePoolingLayer(combiner)([seq_input, features[feature_length_name]])
#         else:
#             seq_input = embedding_dict[feature_name]
#             vec = SequencePoolingLayer(combiner)(seq_input)
#         pooling_vec_list[fc.group_name].append(vec)
#
#     if to_list:
#         return chain.from_iterable(pooling_vec_list.values())
#
#     return pooling_vec_list


def get_varlen_pooling_list(embedding_dict, features, feature_index, varlen_sparse_feature_columns, device):
    varlen_sparse_embedding_list = []
@@ -241,6 +205,7 @@ def input_from_feature_columns(self, X, feature_columns, embedding_dict, support
        return sparse_embedding_list + varlen_sparse_embedding_list, dense_value_list



def embedding_lookup(X, sparse_embedding_dict, sparse_input_dict, sparse_feature_columns, return_feat_list=(),
                     mask_feat_list=(), to_list=False):
    """
@@ -271,6 +236,23 @@ def embedding_lookup(X, sparse_embedding_dict, sparse_input_dict, sparse_feature
    return group_embedding_dict
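The grouping logic of embedding_lookup is collapsed in this view. For orientation, a minimal sketch of the call pattern used by din.py below; the names are taken from that file, and since the collapsed body is not shown here, treat the keyword semantics as an assumption following that implementation:

    # Fetch the candidate-item ("query") embeddings for the attention unit.
    # X is the packed input tensor; feature_index maps each feature name to
    # its (start, end) column span in X. Passing history_feature_list as
    # return_feat_list restricts the lookup to the behavior-related fields,
    # and to_list=True flattens the grouped result into a plain list.
    query_emb_list = embedding_lookup(X, embedding_dict, feature_index,
                                      sparse_feature_columns,
                                      return_feat_list=history_feature_list,
                                      mask_feat_list=history_feature_list,
                                      to_list=True)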


def varlen_embedding_lookup(X, embedding_dict, sequence_input_dict, varlen_sparse_feature_columns):
    varlen_embedding_vec_dict = {}
    for fc in varlen_sparse_feature_columns:
        feature_name = fc.name
        embedding_name = fc.embedding_name
        if fc.use_hash:
            # lookup_idx = Hash(fc.vocabulary_size, mask_zero=True)(sequence_input_dict[feature_name])
            # TODO: add hash function
            lookup_idx = sequence_input_dict[feature_name]
        else:
            lookup_idx = sequence_input_dict[feature_name]
        # sequence_input_dict maps the feature name to its (start, end) column
        # span in X, so this slices out the raw ids before the embedding lookup.
        varlen_embedding_vec_dict[feature_name] = embedding_dict[embedding_name](
            X[:, lookup_idx[0]:lookup_idx[1]].long())

    return varlen_embedding_vec_dict
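To make the slicing convention concrete, here is a minimal self-contained sketch with toy values. It assumes, as in DIN.forward below, that the dict passed as sequence_input_dict is feature_index, mapping each feature name to its (start, end) column span in the packed input X:

    import torch
    import torch.nn as nn

    # Toy layout: X packs a 1-column 'item' id and a 4-column padded
    # 'hist_item' sequence side by side.
    feature_index = {'item': (0, 1), 'hist_item': (1, 5)}
    X = torch.tensor([[2., 1., 2., 3., 0.]])     # one sample; 0 pads the history
    embedding = nn.Embedding(4, 8)               # vocabulary 3 + 1 mask value, dim 8

    start, end = feature_index['hist_item']
    seq_emb = embedding(X[:, start:end].long())  # -> shape [1, 4, 8] = [B, T, E]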


def get_dense_input(X, features, feature_columns):
    dense_feature_columns = list(filter(lambda x: isinstance(
        x, DenseFeat), feature_columns)) if feature_columns else []
Expand Down
141 changes: 141 additions & 0 deletions deepctr_torch/models/din.py
@@ -0,0 +1,141 @@
# -*- coding:utf-8 -*-
"""
Author:
Yuef Zhang
Reference:
[1] Zhou G, Zhu X, Song C, et al. Deep interest network for click-through rate prediction[C]//Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. ACM, 2018: 1059-1068. (https://arxiv.org/pdf/1706.06978.pdf)
"""
import torch
import torch.nn as nn
import torch.nn.functional as F

from .basemodel import BaseModel
from ..inputs import get_varlen_pooling_list, embedding_lookup, varlen_embedding_lookup, SparseFeat, \
    DenseFeat, VarLenSparseFeat, combined_dnn_input
from ..layers import FM, DNN
from ..layers.sequence import AttentionSequencePoolingLayer



class DIN(BaseModel):
"""Instantiates the Deep Interest Network architecture.
:param dnn_feature_columns: An iterable containing all the features used by deep part of the model.
:param history_feature_list: list,to indicate sequence sparse field
:param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in deep net
:param dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of deep net
:param dnn_activation: Activation function to use in deep net
:param att_hidden_size: list,list of positive integer , the layer number and units in each layer of attention net
:param att_activation: Activation function to use in attention net
:param att_weight_normalization: bool.Whether normalize the attention score of local activation unit.
:param l2_reg_dnn: float. L2 regularizer strength applied to DNN
:param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector
:param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.
:param init_std: float,to use as the initialize std of embedding vector
:param seed: integer ,to use as random seed.
:param task: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss
:return: A PyTorch model instance.
"""

    def __init__(self,
                 dnn_feature_columns,
                 history_feature_list,
                 dnn_use_bn=False,
                 embedding_size=8,
                 dnn_hidden_units=(256, 128),
                 dnn_activation='relu',
                 att_hidden_size=(80, 40),
                 att_activation='Dice',
                 l2_reg_dnn=0.0,
                 init_std=0.0001,
                 dnn_dropout=0,
                 task='binary', device='cpu'):

        super(DIN, self).__init__([], dnn_feature_columns,
                                  dnn_hidden_units=dnn_hidden_units, l2_reg_linear=0,
                                  l2_reg_dnn=l2_reg_dnn, init_std=init_std,
                                  dnn_dropout=dnn_dropout, dnn_activation=dnn_activation,
                                  task=task, device=device)

        self.sparse_feature_columns = list(
            filter(lambda x: isinstance(x, SparseFeat), dnn_feature_columns)) if dnn_feature_columns else []
        self.varlen_sparse_feature_columns = list(
            filter(lambda x: isinstance(x, VarLenSparseFeat), dnn_feature_columns)) if dnn_feature_columns else []

        self.history_feature_list = history_feature_list

        self.history_feature_columns = []
        self.sparse_varlen_feature_columns = []
        self.history_fc_names = list(map(lambda x: "hist_" + x, history_feature_list))

        for fc in self.varlen_sparse_feature_columns:
            feature_name = fc.name
            if feature_name in self.history_fc_names:
                self.history_feature_columns.append(fc)
            else:
                self.sparse_varlen_feature_columns.append(fc)

        self.atten = AttentionSequencePoolingLayer(att_hidden_units=att_hidden_size,
                                                   embedding_dim=embedding_size,
                                                   activation=att_activation)

        self.dnn = DNN(inputs_dim=self.compute_input_dim(dnn_feature_columns, embedding_size),
                       hidden_units=dnn_hidden_units,
                       activation=dnn_activation,
                       dropout_rate=dnn_dropout,
                       l2_reg=l2_reg_dnn)
        self.dnn_linear = nn.Linear(dnn_hidden_units[-1], 1, bias=False).to(device)
        self.to(device)

    def forward(self, X):
        sparse_embedding_list, dense_value_list = self.input_from_feature_columns(X, self.dnn_feature_columns,
                                                                                  self.embedding_dict)

        # sequence pooling part
        query_emb_list = embedding_lookup(X, self.embedding_dict, self.feature_index, self.sparse_feature_columns,
                                          self.history_feature_list, self.history_feature_list, to_list=True)
        keys_emb_list = embedding_lookup(X, self.embedding_dict, self.feature_index, self.history_feature_columns,
                                         self.history_fc_names, self.history_fc_names, to_list=True)
        dnn_input_emb_list = embedding_lookup(X, self.embedding_dict, self.feature_index, self.sparse_feature_columns,
                                              mask_feat_list=self.history_feature_list, to_list=True)

        sequence_embed_dict = varlen_embedding_lookup(X, self.embedding_dict, self.feature_index,
                                                      self.sparse_varlen_feature_columns)

        sequence_embed_list = get_varlen_pooling_list(sequence_embed_dict, X, self.feature_index,
                                                      self.sparse_varlen_feature_columns, self.device)

        dnn_input_emb_list += sequence_embed_list

        # concatenate
        query_emb = torch.cat(query_emb_list, dim=-1)    # [B, 1, E]
        keys_emb = torch.cat(keys_emb_list, dim=-1)      # [B, T, E]
        # NOTE: placeholder lengths; the base version does not yet wire in the
        # real history lengths. Created on self.device so GPU runs don't break.
        keys_length = torch.ones((query_emb.size(0), 1)).to(self.device)  # [B, 1]
        deep_input_emb = torch.cat(dnn_input_emb_list, dim=-1)

        hist = self.atten(query_emb, keys_emb, keys_length)

        # deep part
        deep_input_emb = torch.cat((deep_input_emb, hist), dim=-1)
        deep_input_emb = deep_input_emb.view(deep_input_emb.size(0), -1)

        dnn_input = combined_dnn_input([deep_input_emb], dense_value_list)
        dnn_output = self.dnn(dnn_input)
        dnn_logit = self.dnn_linear(dnn_output)

        y_pred = self.out(dnn_logit)

        return y_pred


if __name__ == '__main__':
    pass
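For intuition about the tensor shapes flowing through forward, a self-contained sketch of the attention pooling step. A plain dot-product scorer stands in for the learned local activation unit inside AttentionSequencePoolingLayer, so this only illustrates the shape contract, not the actual scoring:

    import torch

    B, T, E = 3, 4, 16                    # batch, history length, concatenated embedding dim
    query_emb = torch.randn(B, 1, E)      # candidate item,   [B, 1, E]
    keys_emb = torch.randn(B, T, E)       # behavior history, [B, T, E]

    scores = torch.matmul(query_emb, keys_emb.transpose(1, 2))  # [B, 1, T]
    weights = torch.softmax(scores, dim=-1)                     # [B, 1, T]
    hist = torch.matmul(weights, keys_emb)                      # [B, 1, E], pooled history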

47 changes: 47 additions & 0 deletions examples/run_din.py
@@ -0,0 +1,47 @@
import sys

sys.path.insert(0, '..')

import numpy as np
import torch
from deepctr_torch.inputs import (DenseFeat, SparseFeat, VarLenSparseFeat,
                                  get_feature_names)
from deepctr_torch.models.din import DIN


def get_xy_fd():
    feature_columns = [SparseFeat('user', 3, embedding_dim=8), SparseFeat('gender', 2, embedding_dim=8),
                       SparseFeat('item', 3 + 1, embedding_dim=8), SparseFeat('item_gender', 2 + 1, embedding_dim=8),
                       DenseFeat('score', 1)]
    feature_columns += [VarLenSparseFeat(SparseFeat('hist_item', 3 + 1, embedding_dim=8), 4),
                        VarLenSparseFeat(SparseFeat('hist_item_gender', 2 + 1, embedding_dim=8), 4)]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is the mask value
    igender = np.array([1, 2, 1])  # 0 is the mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
                    'hist_item': hist_iid, 'hist_item_gender': hist_igender, 'score': score}
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1])

    return x, y, feature_columns, behavior_feature_list


if __name__ == "__main__":
x, y, feature_columns, behavior_feature_list = get_xy_fd()
model = DIN(feature_columns, behavior_feature_list)
model.compile('adagrad', 'binary_crossentropy',
metrics=['binary_crossentropy'])
history = model.fit(x, y, batch_size=3, epochs=10, validation_split=0.0, verbose=2)
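After fitting, predictions use the same dict-of-NumPy-arrays input. A short usage sketch, assuming BaseModel exposes predict() as in the released deepctr-torch package:

    pred_ans = model.predict(x, batch_size=3)  # np.ndarray of shape (3, 1) with predicted CTRs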
46 changes: 46 additions & 0 deletions tests/models/DIN_test.py
@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
import pytest
import numpy as np

from deepctr_torch.models.din import DIN
from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat, DenseFeat, get_feature_names
from ..utils import get_test_data, SAMPLE_SIZE, check_model, get_device


def get_xy_fd():
    feature_columns = [SparseFeat('user', 3), SparseFeat('gender', 2), SparseFeat('item', 3 + 1),
                       SparseFeat('item_gender', 2 + 1), DenseFeat('score', 1)]
    feature_columns += [VarLenSparseFeat(SparseFeat('hist_item', 3 + 1, embedding_dim=8), 4),
                        VarLenSparseFeat(SparseFeat('hist_item_gender', 2 + 1, embedding_dim=8), 4)]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is the mask value
    igender = np.array([1, 2, 1])  # 0 is the mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
                    'hist_item': hist_iid, 'hist_item_gender': hist_igender, 'score': score}

    feature_names = get_feature_names(feature_columns)
    x = {name: feature_dict[name] for name in feature_names}
    y = np.array([1, 0, 1])

    return x, y, feature_columns, behavior_feature_list


def test_DIN():
    model_name = "DIN"

    x, y, feature_columns, behavior_feature_list = get_xy_fd()
    model = DIN(feature_columns, behavior_feature_list, dnn_dropout=0.5)

    check_model(model, model_name, x, y)  # only 3 training samples, so the validation ratio stays at 0


if __name__ == "__main__":
    pass
