forked from dselivanov/text2vec
-
Notifications
You must be signed in to change notification settings - Fork 0
/
distance.R
272 lines (249 loc) · 10.1 KB
/
distance.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
# // Copyright (C) 2015 - 2016 Dmitriy Selivanov
# // This file is part of text2vec
# //
# // text2vec is free software: you can redistribute it and/or modify it
# // under the terms of the GNU General Public License as published by
# // the Free Software Foundation, either version 2 of the License, or
# // (at your option) any later version.
# //
# // text2vec is distributed in the hope that it will be useful, but
# // WITHOUT ANY WARRANTY; without even the implied warranty of
# // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# // GNU General Public License for more details.
# //
# // You should have received a copy of the GNU General Public License
# // along with text2vec. If not, see <http://www.gnu.org/licenses/>.
# Jaccard similarity between rows of 0/1 valued sparse matrices.
# x, y   - sparse matrices; when y is NULL the rows of x are compared with each other.
# format - class the result is coerced to when it does not already inherit from it.
jaccard_sim = function(x, y = NULL, format = "dgCMatrix") {
  sparse_only_msg = "at the moment jaccard distance defined only for sparse matrices"
  if (!inherits(x, "sparseMatrix"))
    stop(sparse_only_msg)
  # for 0/1 data the row sum is the size of the set encoded by the row
  size_x = rowSums(x)
  if (!is.null(y)) {
    if (!inherits(y, "sparseMatrix"))
      stop(sparse_only_msg)
    # sizes of pairwise intersections between rows of x and rows of y
    common = tcrossprod(x, y)
    size_y = rowSums(y)
  } else {
    # sizes of pairwise intersections between rows of x
    common = tcrossprod(x)
    size_y = size_x
  }
  # triplet form exposes a (row, column) index for every stored value
  common = as(common, "TsparseMatrix")
  # slots i and j are zero-based, R vectors are one-based
  row_pos = common@i + 1L
  col_pos = common@j + 1L
  # |A union B| = |A| + |B| - |A intersect B|
  union_size = size_x[row_pos] + size_y[col_pos] - common@x
  common@x = common@x / union_size
  if (inherits(common, format)) common else as(common, format)
}
#' @name distances
#' @title Pairwise Distance Matrix Computation
#' @description \code{dist2} calculates pairwise distances/similarities between the
#' rows of two data matrices. \bold{Note} that some methods work only on sparse matrices and
#' others work only on dense matrices.
#' @param x first matrix.
#' @param y second matrix. For \code{dist2} \code{y = NULL} set by default.
#' This means that we will assume \code{y = x} and calculate distances/similarities
#' between all rows of the \code{x}.
#' @param method usually \code{character} or an object inheriting from \code{distance_model}.
#' The distances/similarity measure to be used. One of \code{c("cosine", "euclidean", "jaccard")} or \link{RWMD}.
#' \code{RWMD} works only on bag-of-words matrices.
#' For \code{"cosine"} and \code{"jaccard"} the distance is computed as \code{1 - similarity}.
#' \bold{In case of \code{"cosine"} distance max distance will be 1 - (-1) = 2}
#' @param norm \code{character = c("l2", "l1", "none")} - how to scale input matrices.
#' If they already scaled - use \code{"none"}. For \code{"cosine"} and \code{"jaccard"} the value
#' is forwarded to \code{sim2}, which accepts only \code{"l2"} and \code{"none"}.
#' @details Computes the distance matrix using the specified method.
#' Similar to \link{dist} function, but works with two matrices.
#' \code{"euclidean"} distance is available only for dense matrices of class \code{matrix}.
#' @return \code{dist2} returns \code{matrix} of distances/similarities between each row of
#' matrix \code{x} and each row of matrix \code{y}.
#' @export
dist2 = function(x, y = NULL, method = c("cosine", "euclidean", "jaccard"),
                 norm = c("l2", "l1", "none")) {
  stopifnot(inherits(x, "matrix") || inherits(x, "Matrix"))
  stopifnot(inherits(method, "distance_model") || inherits(method, "character"))
  FLAG_TWO_MATRICES_INPUT = !is.null(y)
  if (FLAG_TWO_MATRICES_INPUT) {
    stopifnot(inherits(y, "matrix") || inherits(y, "Matrix"))
    stopifnot(ncol(x) == ncol(y))
    stopifnot(colnames(x) == colnames(y))
  }
  norm = match.arg(norm)
  RESULT = NULL
  if (inherits(method, "character")) {
    method = match.arg(method)
    if (method %in% c("cosine", "jaccard")) {
      # 1 - similarity turns every implicit zero into a stored 1
      if (inherits(x, "sparseMatrix") || inherits(y, "sparseMatrix"))
        logger$warn("Sparsity will be lost - worth to calculate similarity instead of distance.")
      RESULT = 1 - sim2(x = x, y = y, method = method, norm = norm)
    }
    if (method == "euclidean") {
      if (!FLAG_TWO_MATRICES_INPUT)
        y = x
      if (!inherits(x, "matrix") || !inherits(y, "matrix")) {
        msg = "At the moment eucludian distance could be calculated only for dense matrices of class 'matrix'"
        logger$error(msg)
        stop(msg)
      }
      # transpose, because euclidean_dist() function calculates dist between columns
      x = t(normalize(x, norm))
      if (FLAG_TWO_MATRICES_INPUT) {
        y = t(normalize(y, norm))
        RESULT = euclidean_dist(x, y)
      } else {
        RESULT = euclidean_dist(x, x)
      }
    }
  }
  if (inherits(method, "distance_model")) {
    if (!FLAG_TWO_MATRICES_INPUT)
      y = x
    if (inherits(method, "RWMD")) {
      if (norm != "none") {
        msg = paste(norm, "norm provided. RWMD can be computed only on bag-of-words matrices - raw word-counts")
        logger$warn(msg)
      }
      RESULT = method$dist2(x, y)
    }
  }
  if (is.null(RESULT)) {
    # Report the class rather than the object itself: pasting an
    # environment-based model fails with a coercion error that would hide
    # this message.
    method_label = if (is.character(method)) method else paste(class(method), collapse = "/")
    stop(paste("not implemented for class", method_label))
  }
  RESULT
}
#' @rdname distances
#' @title "Parallel" Distance Matrix Computation
#' @description \code{pdist2} calculates "parallel" distances between the rows of two data matrices.
#' @details \code{pdist2} takes two matrices with the same dimensions and returns a single vector
#' giving the "parallel" distances of the vectors: element \code{i} is the distance between
#' row \code{i} of \code{x} and row \code{i} of \code{y}.
#' For \code{"euclidean"} the distance is computed on the matrices as given;
#' \code{norm} is not applied in that case.
#' @return \code{pdist2} returns \code{vector} of "parallel" distances between rows
#' of \code{x} and \code{y}.
#' @export
pdist2 = function(x, y, method = c("cosine", "euclidean", "jaccard"),
                  norm = c("l2", "l1", "none")) {
  stopifnot(inherits(x, "matrix") || inherits(x, "Matrix"))
  stopifnot(inherits(y, "matrix") || inherits(y, "Matrix"))
  stopifnot(inherits(method, "distance_model") || inherits(method, "character"))
  stopifnot(ncol(x) == ncol(y))
  stopifnot(nrow(x) == nrow(y))
  stopifnot(colnames(x) == colnames(y))
  norm = match.arg(norm)
  RESULT = NULL
  if (inherits(method, "character")) {
    method = match.arg(method)
    if (method %in% c("cosine", "jaccard")) {
      RESULT = 1 - psim2(x = x, y = y, method = method, norm = norm)
    }
    if (method == "euclidean") {
      if (!inherits(x, "matrix") || !inherits(y, "matrix"))
        stop("At the moment eucludian distance can be calculated only for\ndense matrices of class 'matrix'")
      # NOTE(review): dist2() normalizes inputs before the euclidean
      # computation, this branch does not - confirm whether that is intended.
      RESULT = sqrt(rowSums((x - y) ^ 2))
    }
  }
  if (inherits(method, "distance_model")) {
    if (inherits(method, "RWMD")) {
      if (norm != "none") {
        msg = paste(norm, "norm provided. RWMD can be computed only on bag-of-words matrices - raw word-counts")
        logger$warn(msg)
      }
      RESULT = method$pdist2(x, y)
    }
  }
  if (is.null(RESULT)) {
    # Report the class rather than the object itself: pasting an
    # environment-based model fails with a coercion error that would hide
    # this message.
    method_label = if (is.character(method)) method else paste(class(method), collapse = "/")
    stop(paste("not implemented for class", method_label))
  }
  RESULT
}
#' @name similarities
#' @title Pairwise Similarity Matrix Computation
#' @description \code{sim2} calculates pairwise similarities between the
#' rows of two data matrices. \bold{Note} that some methods work only on sparse matrices and
#' others work only on dense matrices.
#' @param x first matrix.
#' @param y second matrix. For \code{sim2} \code{y = NULL} set by default.
#' This means that we will assume \code{y = x} and calculate similarities between all rows of the \code{x}.
#' @param method \code{character}, the similarity measure to be used. One of \code{c("cosine", "jaccard")}.
#' @param norm \code{character = c("l2", "none")} - how to scale input matrices.
#' If they are already scaled - use \code{"none"}. For \code{"jaccard"} the matrices are
#' converted to binary (0, 1) regardless of \code{norm}.
#' @details Computes the similarity matrix using given method.
#' \code{"jaccard"} requires both \code{x} and \code{y} (when given) to be sparse matrices.
#' @return \code{sim2} returns \code{matrix} of similarities between each row of
#' matrix \code{x} and each row of matrix \code{y}.
#' @export
sim2 = function(x, y = NULL, method = c("cosine", "jaccard"),
                norm = c("l2", "none")) {
  norm = match.arg(norm)
  method = match.arg(method)
  # check first matrix
  stopifnot(inherits(x, "matrix") || inherits(x, "Matrix"))
  FLAG_TWO_MATRICES_INPUT = !is.null(y)
  # check second matrix
  if (FLAG_TWO_MATRICES_INPUT) {
    stopifnot(inherits(y, "matrix") || inherits(y, "Matrix"))
    stopifnot(ncol(x) == ncol(y))
    stopifnot(colnames(x) == colnames(y))
  }
  RESULT = NULL
  if (method == "cosine") {
    x = normalize(x, norm)
    if (FLAG_TWO_MATRICES_INPUT) {
      y = normalize(y, norm)
      RESULT = tcrossprod(x, y)
    } else {
      RESULT = tcrossprod(x)
    }
  }
  if (method == "jaccard") {
    # Validate both inputs before touching the `x` slot below, so a dense
    # matrix fails with this message instead of an S4 slot-access error.
    if (!inherits(x, "sparseMatrix") ||
        (FLAG_TWO_MATRICES_INPUT && !inherits(y, "sparseMatrix")))
      stop("at the moment jaccard distance defined only for sparse matrices")
    if (norm != "none") {
      msg = paste(norm, "norm provided. Howewer matrix will be converted to binary (0,1) automatically.")
      msg = paste(msg, "'jaccard' can be computed only on sets which should be encoded as sparse matrices of 0, 1.")
      logger$warn(msg)
    }
    # binarize stored values so that rows encode sets
    x@x = sign(x@x)
    if (FLAG_TWO_MATRICES_INPUT) {
      y@x = sign(y@x)
    }
    RESULT = jaccard_sim(x, y)
  }
  RESULT
}
#' @rdname similarities
#' @title "Parallel" Similarity Matrix Computation
#' @description \code{psim2} calculates "parallel" similarities between the rows of two data matrices.
#' @details \code{psim2} takes two matrices with the same dimensions and returns a single vector
#' giving the "parallel" similarities of the vectors: element \code{i} is the similarity between
#' row \code{i} of \code{x} and row \code{i} of \code{y}.
#' \code{"jaccard"} requires both \code{x} and \code{y} to be sparse matrices.
#' @return \code{psim2} returns \code{vector} of "parallel" similarities between rows of \code{x} and \code{y}.
#' @export
psim2 = function(x, y, method = c("cosine", "jaccard"), norm = c("l2", "none")) {
  method = match.arg(method)
  norm = match.arg(norm)
  stopifnot(inherits(x, "matrix") || inherits(x, "Matrix"))
  stopifnot(inherits(y, "matrix") || inherits(y, "Matrix"))
  stopifnot(ncol(x) == ncol(y))
  stopifnot(nrow(x) == nrow(y))
  stopifnot(colnames(x) == colnames(y))
  RESULT = NULL
  if (method == "cosine") {
    y = normalize(y, norm)
    x = normalize(x, norm)
    # row-wise dot product of the scaled matrices
    RESULT = rowSums(x * y)
  }
  if (method == "jaccard") {
    # Validate both inputs before touching the `x` slot below, so a dense
    # matrix fails with this message instead of an S4 slot-access error.
    if (!inherits(x, "sparseMatrix") || !inherits(y, "sparseMatrix"))
      stop("at the moment jaccard distance defined only for sparse matrices")
    if (norm != "none") {
      msg = paste(norm, "norm provided. Howewer matrix will be converted to binary (0,1) automatically.")
      msg = paste(msg, "'jaccard' can be computed only on sets which should be encoded as sparse matrices of 0, 1.")
      logger$warn(msg)
    }
    # binarize stored values so that rows encode sets
    x@x = sign(x@x)
    y@x = sign(y@x)
    # |A intersect B| per row, then divide by |A| + |B| - |A intersect B|
    intrs = rowSums(x * y)
    RESULT = intrs / (rowSums(x) + rowSums(y) - intrs)
  }
  RESULT
}