Skip to content

Commit

Permalink
Add functions: widely_hclust and widely_kmeans
Browse files Browse the repository at this point in the history
  • Loading branch information
dgrtwo committed Aug 15, 2020
1 parent ee925e4 commit 13b3b3c
Show file tree
Hide file tree
Showing 7 changed files with 218 additions and 3 deletions.
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ Imports:
tidytext,
purrr,
Matrix,
broom
broom,
tibble
Suggests:
ggraph,
igraph,
Expand All @@ -40,4 +41,4 @@ Suggests:
ggplot2,
maps,
irlba
RoxygenNote: 7.1.0
RoxygenNote: 7.1.1
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,11 @@ export(squarely)
export(squarely_)
export(widely)
export(widely_)
export(widely_hclust)
export(widely_kmeans)
export(widely_svd)
export(widely_svd_)
import(Matrix)
import(dplyr)
importFrom(broom,tidy)
importFrom(rlang,":=")
3 changes: 2 additions & 1 deletion R/globals.R
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
globalVariables(c("item1", "item2", "value", "..data", "data"))
globalVariables(c("item1", "item2", "value", "..data", "data",
"item", "cluster"))
58 changes: 58 additions & 0 deletions R/widely_hclust.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#' Cluster pairs of items into groups using hierarchical clustering
#'
#' Reshape a table that represents pairwise distances into hierarchical clusters,
#' returning a table with \code{item} and \code{cluster} columns.
#'
#' @param tbl Table
#' @param item1 First item
#' @param item2 Second item
#' @param distance Distance column
#' @param k The desired number of groups
#' @param h Height at which to cut the hierarchically clustered tree
#'
#' @examples
#'
#' library(gapminder)
#' library(dplyr)
#'
#' # Construct Euclidean distances between countries based on life
#' # expectancy over time
#' country_distances <- gapminder %>%
#' pairwise_dist(country, year, lifeExp)
#'
#' country_distances
#'
#' # Turn this into 5 hierarchical clusters
#' clusters <- country_distances %>%
#' widely_hclust(item1, item2, distance, k = 8)
#'
#' # Examine a few such clusters
#' clusters %>% filter(cluster == 1)
#' clusters %>% filter(cluster == 2)
#'
#' @seealso \link{cutree}
#'
#' @export
widely_hclust <- function(tbl, item1, item2, distance, k = NULL, h = NULL) {
col1_str <- as.character(substitute(item1))
col2_str <- as.character(substitute(item2))
dist_str <- as.character(substitute(distance))

unique_items <- unique(c(as.character(tbl[[col1_str]]), as.character(tbl[[col2_str]])))

form <- stats::as.formula(paste(col1_str, "~", col2_str))

max_distance <- max(tbl[[dist_str]])

tibble(item1 = match(tbl[[col1_str]], unique_items),
item2 = match(tbl[[col2_str]], unique_items),
distance = tbl[[dist_str]]) %>%
reshape2::acast(item1 ~ item2, value.var = "distance", fill = max_distance) %>%
stats::as.dist() %>%
stats::hclust() %>%
stats::cutree(k = k, h = h) %>%
tibble::enframe("item", "cluster") %>%
dplyr::mutate(item = unique_items[as.integer(item)],
cluster = factor(cluster)) %>%
dplyr::arrange(cluster)
}
54 changes: 54 additions & 0 deletions R/widely_kmeans.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#' Cluster items based on k-means across features
#'
#' Given a tidy table of features describing each item, perform k-means
#' clustering using \code{\link{kmeans}} and retidy the data into
#' one-row-per-cluster.
#'
#' @param tbl Table
#' @param item Item to cluster (as a bare column name)
#' @param feature Feature column (dimension in clustering)
#' @param value Value column
#' @param k Number of clusters
#' @param fill What to fill in for missing values
#' @param ... Other arguments passed on to \code{\link{kmeans}}
#'
#' @seealso \code{\link{widely_hclust}}
#'
#' @importFrom rlang :=
#'
#' @examples
#'
#' library(gapminder)
#' library(dplyr)
#'
#' clusters <- gapminder %>%
#' widely_kmeans(country, year, lifeExp, k = 5)
#'
#' clusters
#'
#' clusters %>%
#' count(cluster)
#'
#' # Examine a few clusters
#' clusters %>% filter(cluster == 1)
#' clusters %>% filter(cluster == 2)
#'
#' @export
widely_kmeans <- function(tbl, item, feature, value, k, fill = 0, ...) {
item_str <- as.character(substitute(item))
feature_str <- as.character(substitute(feature))
value_str <- as.character(substitute(value))

form <- stats::as.formula(paste(item_str, "~", feature_str))

m <- tbl %>%
reshape2::acast(form, value.var = value_str, fill = fill)

clustered <- stats::kmeans(m, k, ...)

# Add the clusters to the original table
i <- match(rownames(m), as.character(tbl[[item_str]]))
tibble::tibble(!!sym(item_str) := tbl[[item_str]][i],
cluster = factor(clustered$cluster)) %>%
dplyr::arrange(cluster)
}
49 changes: 49 additions & 0 deletions man/widely_hclust.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

49 changes: 49 additions & 0 deletions man/widely_kmeans.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 13b3b3c

Please sign in to comment.