Skip to content

Commit

Permalink
Merge tag '0.6'
Browse files Browse the repository at this point in the history
CRAN 0.6 release
  • Loading branch information
dselivanov committed Feb 18, 2020
2 parents f673a93 + b05f127 commit 13d0b03
Show file tree
Hide file tree
Showing 78 changed files with 1,002 additions and 2,623 deletions.
38 changes: 17 additions & 21 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
Package: text2vec
Type: Package
Version: 0.5.1.5
Date: 2018-05-28
Version: 0.6
Title: Modern Text Mining Framework for R
License: GPL (>= 2) | file LICENSE
Description: Fast and memory-friendly tools for text vectorization, topic
Expand All @@ -18,37 +17,34 @@ Authors@R: c(
comment = "Author of the WarpLDA C++ code"))
Maintainer: Dmitriy Selivanov <selivanov.dmitriy@gmail.com>
Encoding: UTF-8
SystemRequirements: GNU make, C++11
SystemRequirements: C++11
Depends:
R (>= 3.2.0),
R (>= 3.6.0),
methods
Imports:
Matrix (>= 1.1),
Rcpp (>= 0.11),
RcppParallel (>= 4.3.14),
digest (>= 0.6.8),
foreach(>= 1.4.3),
Rcpp (>= 1.0.3),
R6 (>= 2.3.0),
data.table(>= 1.9.6),
irlba (>= 2.2.1),
R6 (>= 2.1.2),
futile.logger (>= 1.4.3),
rsparse (>= 0.3.3.4),
stringi (>= 1.1.5),
mlapi (>= 0.1.0)
LinkingTo: Rcpp,
RcppParallel,
digest,
sparsepp (>= 0.2.0)
mlapi (>= 0.1.0),
lgr (>= 0.2),
digest (>= 0.6.8)
LinkingTo:
Rcpp,
digest (>= 0.6.8)
Suggests:
parallel,
doParallel,
glmnet,
magrittr,
udpipe (>= 0.6),
glmnet,
testthat,
covr,
knitr,
rmarkdown
rmarkdown,
proxy
URL: http://text2vec.org
BugReports: https://github.com/dselivanov/text2vec/issues
VignetteBuilder: knitr
LazyData: true
RoxygenNote: 6.0.1
RoxygenNote: 6.1.1
15 changes: 3 additions & 12 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,16 @@

S3method(create_dtm,itoken)
S3method(create_dtm,itoken_parallel)
S3method(create_dtm,list)
S3method(create_tcm,itoken)
S3method(create_tcm,itoken_parallel)
S3method(create_vocabulary,character)
S3method(create_vocabulary,itoken)
S3method(create_vocabulary,itoken_parallel)
S3method(create_vocabulary,list)
S3method(itoken,character)
S3method(itoken,iterator)
S3method(itoken,list)
S3method(itoken_parallel,character)
S3method(itoken_parallel,ifiles_parallel)
S3method(itoken_parallel,iterator)
S3method(itoken_parallel,list)
S3method(print,text2vec_vocabulary)
export(BNS)
Expand All @@ -38,13 +36,13 @@ export(create_vocabulary)
export(dist2)
export(fit)
export(fit_transform)
export(glove)
export(hash_vectorizer)
export(idir)
export(ifiles)
export(ifiles_parallel)
export(itoken)
export(itoken_parallel)
export(jsPCA_robust)
export(normalize)
export(pdist2)
export(perplexity)
Expand All @@ -65,15 +63,8 @@ import(digest)
import(methods)
import(mlapi)
importFrom(R6,R6Class)
importFrom(RcppParallel,RcppParallelLibs)
importFrom(foreach,"%do%")
importFrom(foreach,"%dopar%")
importFrom(foreach,foreach)
importFrom(futile.logger,flog.debug)
importFrom(futile.logger,flog.error)
importFrom(futile.logger,flog.info)
importFrom(futile.logger,flog.warn)
importFrom(methods,as)
importFrom(rsparse,GloVe)
importFrom(utils,setTxtProgressBar)
importFrom(utils,txtProgressBar)
useDynLib("text2vec", .registration=TRUE)
6 changes: 5 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
# text2vec 0.5.1.*
# text2vec 0.6
1. 2019-12-17
* **breaking change** - removed construction of a vocabulary in parallel on Windows
* use `rsparse` package for SVD and GloVe factorizations
* updated RWMD implementation (hopefully bug-free)
1. 2018-09-10
* **breaking change** - changed IDF formula - see #280 for details.
1. 2018-05-28
Expand Down
24 changes: 0 additions & 24 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,30 +1,6 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

cpp_glove_create <- function(params) {
.Call(`_text2vec_cpp_glove_create`, params)
}

cpp_glove_get_word_vectors <- function(ptr) {
.Call(`_text2vec_cpp_glove_get_word_vectors`, ptr)
}

cpp_glove_set_cost_zero <- function(ptr) {
invisible(.Call(`_text2vec_cpp_glove_set_cost_zero`, ptr))
}

cpp_glove_partial_fit <- function(ptr, x_irow, x_icol, x_val, iter_order) {
.Call(`_text2vec_cpp_glove_partial_fit`, ptr, x_irow, x_icol, x_val, iter_order)
}

cpp_glove_get_sparsity_level <- function(ptr) {
.Call(`_text2vec_cpp_glove_get_sparsity_level`, ptr)
}

cpp_glove_dump_model <- function(ptr) {
.Call(`_text2vec_cpp_glove_dump_model`, ptr)
}

hasher <- function(x, hash_size) {
.Call(`_text2vec_hasher`, x, hash_size)
}
Expand Down
10 changes: 5 additions & 5 deletions R/analogies.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ prepare_analogy_questions = function(questions_file_path, vocab_terms) {# nocov

questions_number = sum(sapply(q, nrow))

flog.info("%d full questions found out of %d total",
logger$info("%d full questions found out of %d total",
questions_number,
length(lines) - length(section_name_ind))

Expand All @@ -56,8 +56,8 @@ prepare_analogy_questions = function(questions_file_path, vocab_terms) {# nocov
#' @param m_word_vectors word vectors \code{numeric matrix}. Each row should
#' represent a word.
#' @description This function checks how well the GloVe word embeddings do on
#' the analogy task. For full examples see \link{glove}.
#' @seealso \link{prepare_analogy_questions}, \link{glove}
#' the analogy task. For full examples see \link{GloVe}.
#' @seealso \link{prepare_analogy_questions}, \link{GloVe}
#' @export
check_analogy_accuracy = function(questions_list, m_word_vectors) {

Expand Down Expand Up @@ -90,7 +90,7 @@ check_analogy_accuracy = function(questions_list, m_word_vectors) {
act = q_mat[, 4]
correct_number = sum(preds == act)

flog.info("%s: correct %d out of %d, accuracy = %.4f",
logger$info("%s: correct %d out of %d, accuracy = %.4f",
category,
correct_number,
q_number,
Expand All @@ -104,6 +104,6 @@ check_analogy_accuracy = function(questions_list, m_word_vectors) {
)
}
res = rbindlist(res)
flog.info("OVERALL ACCURACY = %.4f", sum(res[['predicted']] == res[['actual']]) / nrow(res) )
logger$info("OVERALL ACCURACY = %.4f", sum(res[['predicted']] == res[['actual']]) / nrow(res) )
res
}# nocov end
5 changes: 3 additions & 2 deletions R/coherence.R
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,8 @@
#' tcm = crossprod(sign(dtm))
#'
#' # check coherence
#' futile.logger::flog.threshold(futile.logger::DEBUG)
#' logger = lgr::get_logger('text2vec')
#' logger$set_threshold('debug')
#' res = coherence(tw, tcm, n_doc_tcm = N)
#' res
#'
Expand Down Expand Up @@ -234,7 +235,7 @@ res = matrix(NA_real_, nrow = n_topics, ncol = n_metrics,
topic_i_term_indices = topic_i_term_indices[!is.na(topic_i_term_indices)]
for(j in seq_len(n_metrics)) {
m = metrics[j]
futile.logger::flog.debug("calculating coherence metric '%s' for topic %d", m, i)
logger$debug("calculating coherence metric '%s' for topic %d", m, i)
res[i, j] = calc_coherence(m, topic_i_term_indices, tcm, smooth, n_doc_tcm = n_doc_tcm)
}
}
Expand Down
12 changes: 6 additions & 6 deletions R/distance.R
Original file line number Diff line number Diff line change
Expand Up @@ -81,15 +81,15 @@ dist2 = function(x, y = NULL, method = c("cosine", "euclidean", "jaccard"),
method = match.arg(method)
if(method %in% c("cosine", "jaccard")) {
if( inherits(x, "sparseMatrix") || inherits(y, "sparseMatrix"))
flog.warn("Sparsity will be lost - worth to calculate similarity instead of distance.")
logger$warn("Sparsity will be lost - worth to calculate similarity instead of distance.")
RESULT = 1 - sim2(x = x, y = y, method = method, norm = norm)
}
if (method == "euclidean") {
if (!FLAG_TWO_MATRICES_INPUT)
y = x
if (!inherits(x, "matrix") || !inherits(y, "matrix")) {
msg = "At the moment eucludian distance could be calculated only for dense matrices of class 'matrix'"
flog.error(msg)
logger$error(msg)
stop(msg)

}
Expand All @@ -109,7 +109,7 @@ dist2 = function(x, y = NULL, method = c("cosine", "euclidean", "jaccard"),
if (inherits(method, "RWMD")) {
if (norm != "none") {
msg = paste(norm, "norm provided. RWMD can be computed only on bag-of-words matrices - raw word-counts")
flog.warn(msg)
logger$warn(msg)
}
RESULT = method$dist2(x, y)
}
Expand Down Expand Up @@ -154,7 +154,7 @@ pdist2 = function(x, y, method = c("cosine", "euclidean", "jaccard"),
if (inherits(method, "RWMD")) {
if (norm != "none") {
msg = paste(norm, "norm provided. RWMD can be computed only on bag-of-words matrices - raw word-counts")
flog.warn(msg)
logger$warn(msg)
}
RESULT = method$pdist2(x, y)
}
Expand Down Expand Up @@ -217,7 +217,7 @@ sim2 = function(x, y = NULL, method = c("cosine", "jaccard"),
if (norm != "none") {
msg = paste(norm, "norm provided. Howewer matrix will be converted to binary (0,1) automatically.")
msg = paste(msg, "'jaccard' can be computed only on sets which should be encoded as sparse matrices of 0, 1.")
flog.warn(msg)
logger$warn(msg)
}
x@x = sign(x@x)
if (FLAG_TWO_MATRICES_INPUT) {
Expand Down Expand Up @@ -259,7 +259,7 @@ psim2 = function(x, y, method = c("cosine", "jaccard"), norm = c("l2", "none"))
if (norm != "none") {
msg = paste(norm, "norm provided. Howewer matrix will be converted to binary (0,1) automatically.")
msg = paste(msg, "'jaccard' can be computed only on sets which should be encoded as sparse matrices of 0, 1.")
flog.warn(msg)
logger$warn(msg)
}

x@x = sign(x@x)
Expand Down
Loading

0 comments on commit 13d0b03

Please sign in to comment.