Add functions: widely_hclust and widely_kmeans

juliasilge · Aug 15, 2020 · 13b3b3c · 13b3b3c
1 parent ee925e4
commit 13b3b3c
Show file tree

Hide file tree

Showing 7 changed files with 218 additions and 3 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -24,7 +24,8 @@ Imports:
     tidytext,
     purrr,
     Matrix,
-    broom
+    broom,
+    tibble
 Suggests:
     ggraph,
     igraph,
@@ -40,4 +41,4 @@ Suggests:
     ggplot2,
     maps,
     irlba
-RoxygenNote: 7.1.0
+RoxygenNote: 7.1.1
diff --git a/NAMESPACE b/NAMESPACE
@@ -16,8 +16,11 @@ export(squarely)
 export(squarely_)
 export(widely)
 export(widely_)
+export(widely_hclust)
+export(widely_kmeans)
 export(widely_svd)
 export(widely_svd_)
 import(Matrix)
 import(dplyr)
 importFrom(broom,tidy)
+importFrom(rlang,":=")
diff --git a/R/globals.R b/R/globals.R
@@ -1 +1,2 @@
-globalVariables(c("item1", "item2", "value", "..data", "data"))
+globalVariables(c("item1", "item2", "value", "..data", "data",
+                  "item", "cluster"))
diff --git a/R/widely_hclust.R b/R/widely_hclust.R
@@ -0,0 +1,58 @@
+#' Cluster pairs of items into groups using hierarchical clustering
+#'
+#' Reshape a table that represents pairwise distances into hierarchical clusters,
+#' returning a table with \code{item} and \code{cluster} columns.
+#'
+#' @param tbl Table
+#' @param item1 First item
+#' @param item2 Second item
+#' @param distance Distance column
+#' @param k The desired number of groups
+#' @param h Height at which to cut the hierarchically clustered tree
+#'
+#' @examples
+#'
+#' library(gapminder)
+#' library(dplyr)
+#'
+#' # Construct Euclidean distances between countries based on life
+#' # expectancy over time
+#' country_distances <- gapminder %>%
+#'   pairwise_dist(country, year, lifeExp)
+#'
+#' country_distances
+#'
+#' # Turn this into 5 hierarchical clusters
+#' clusters <- country_distances %>%
+#'   widely_hclust(item1, item2, distance, k = 8)
+#'
+#' # Examine a few such clusters
+#' clusters %>% filter(cluster == 1)
+#' clusters %>% filter(cluster == 2)
+#'
+#' @seealso \link{cutree}
+#'
+#' @export
+widely_hclust <- function(tbl, item1, item2, distance, k = NULL, h = NULL) {
+  col1_str <- as.character(substitute(item1))
+  col2_str <- as.character(substitute(item2))
+  dist_str <- as.character(substitute(distance))
+
+  unique_items <- unique(c(as.character(tbl[[col1_str]]), as.character(tbl[[col2_str]])))
+
+  form <- stats::as.formula(paste(col1_str, "~", col2_str))
+
+  max_distance <- max(tbl[[dist_str]])
+
+  tibble(item1 = match(tbl[[col1_str]], unique_items),
+         item2 = match(tbl[[col2_str]], unique_items),
+         distance = tbl[[dist_str]]) %>%
+    reshape2::acast(item1 ~ item2, value.var = "distance", fill = max_distance) %>%
+    stats::as.dist() %>%
+    stats::hclust() %>%
+    stats::cutree(k = k, h = h) %>%
+    tibble::enframe("item", "cluster") %>%
+    dplyr::mutate(item = unique_items[as.integer(item)],
+                  cluster = factor(cluster)) %>%
+    dplyr::arrange(cluster)
+}
diff --git a/R/widely_kmeans.R b/R/widely_kmeans.R
@@ -0,0 +1,54 @@
+#' Cluster items based on k-means across features
+#'
+#' Given a tidy table of features describing each item, perform k-means
+#' clustering using \code{\link{kmeans}} and retidy the data into
+#' one-row-per-cluster.
+#'
+#' @param tbl Table
+#' @param item Item to cluster (as a bare column name)
+#' @param feature Feature column (dimension in clustering)
+#' @param value Value column
+#' @param k Number of clusters
+#' @param fill What to fill in for missing values
+#' @param ... Other arguments passed on to \code{\link{kmeans}}
+#'
+#' @seealso \code{\link{widely_hclust}}
+#'
+#' @importFrom rlang :=
+#'
+#' @examples
+#'
+#' library(gapminder)
+#' library(dplyr)
+#'
+#' clusters <- gapminder %>%
+#'   widely_kmeans(country, year, lifeExp, k = 5)
+#'
+#' clusters
+#'
+#' clusters %>%
+#'   count(cluster)
+#'
+#' # Examine a few clusters
+#' clusters %>% filter(cluster == 1)
+#' clusters %>% filter(cluster == 2)
+#'
+#' @export
+widely_kmeans <- function(tbl, item, feature, value, k, fill = 0, ...) {
+  item_str <- as.character(substitute(item))
+  feature_str <- as.character(substitute(feature))
+  value_str <- as.character(substitute(value))
+
+  form <- stats::as.formula(paste(item_str, "~", feature_str))
+
+  m <- tbl %>%
+    reshape2::acast(form, value.var = value_str, fill = fill)
+
+  clustered <- stats::kmeans(m, k, ...)
+
+  # Add the clusters to the original table
+  i <- match(rownames(m), as.character(tbl[[item_str]]))
+  tibble::tibble(!!sym(item_str) := tbl[[item_str]][i],
+                 cluster = factor(clustered$cluster)) %>%
+    dplyr::arrange(cluster)
+}
diff --git a/man/widely_hclust.Rd b/man/widely_hclust.Rd
diff --git a/man/widely_kmeans.Rd b/man/widely_kmeans.Rd