s3alfisc · s3alfisc · Feb 26, 2023 · Feb 26, 2023 · Feb 26, 2023 · Jun 8, 2023
diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml
@@ -25,7 +25,7 @@ jobs:
 
       - uses: r-lib/actions/setup-r-dependencies@v1
         with:
-          extra-packages: pkgdown
+          extra-packages: any::pkgdown, local::.
           needs: website
 
       - name: Deploy package

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,31 +1,32 @@
 Package: summclust
-Title: Module to Compute Influence and Leverage Statistics for Regression Models with Clustered Errors
-Version: 0.7.0
+Title: Module to Compute Influence and Leverage Statistics for Regression Models
+       with Clustered Errors
+Version: 0.8.0
 Authors@R: 
     c(
     person(given = "Alexander",
            family = "Fischer",
            role = c("aut", "cre"),
            email = "[email protected]")
     )
-Description: Module to compute cluster specific information for regression models
+Description: Module to compute cluster robust covariance matrices (CRV3 and 
+        CRV3J) and cluster specific information for regression models
         with clustered errors, including leverage and influence statistics. 
         Models of type 'lm' and 'fixest'(from the 'stats' and 'fixest' packages)
         are supported. 'summclust' implements similar features as the 
         user-written 'summclust.ado' Stata module (MacKinnon, Nielsen & Webb, 2022; 
         <arXiv:2205.03288v1>).
 License: MIT + file LICENSE
 Encoding: UTF-8
-Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.1
 Imports: 
     utils, 
     dreamerr, 
-    MASS, 
     collapse, 
     generics, 
     cli, 
-    rlang
+    Matrix, 
+    Rcpp (>= 1.0.9), 
+    RcppEigen
 Suggests: 
     ggplot2,
     latex2exp,
@@ -38,7 +39,10 @@ Suggests:
     knitr,
     rmarkdown,
     covr
+LinkingTo: Rcpp, RcppEigen
+Remotes: kylebutts/fixest/tree/sparse-matrix
 Config/testthat/edition: 3
 URL: https://s3alfisc.github.io/summclust/
 BugReports: https://github.com/s3alfisc/summclust/issues
 VignetteBuilder: knitr
+RoxygenNote: 7.2.3
diff --git a/NAMESPACE b/NAMESPACE
@@ -10,7 +10,12 @@ S3method(vcov_CR3J,lm)
 export(summclust)
 export(tidy)
 export(vcov_CR3J)
-importFrom(MASS,ginv)
+importFrom(Matrix,crossprod)
+importFrom(Matrix,diag)
+importFrom(Matrix,solve)
+importFrom(Matrix,sparse.model.matrix)
+importFrom(Matrix,t)
+importFrom(Matrix,tcrossprod)
 importFrom(cli,cli_abort)
 importFrom(collapse,GRP)
 importFrom(collapse,add_vars)
@@ -36,4 +41,5 @@ importFrom(stats,rnorm)
 importFrom(stats,terms)
 importFrom(stats,update)
 importFrom(stats,weights)
+importFrom(utils,compareVersion)
 importFrom(utils,stack)
diff --git a/NEWS.md b/NEWS.md
@@ -1,4 +1,11 @@
-# New features since latest release
+# summclust 0.7
+
+* Adds option to sparsify `vcov_CR3J.fixest`. This reduces the computational cost of 
+  computing CRV3 covariance matrices when the regression's design matrix 
+  gets very sparse. It can't be used within `summclust` because the `beta_jack` do not 
+  match. However, it's useful when users just want to access `vcov_CR3J.fixest`.
+
+# summclust 0.6
 
 # summclust 0.7
 

diff --git a/R/MPinv.R b/R/MPinv.R
@@ -0,0 +1,30 @@
+# Function taken from `VCA::MPinv`
+#' Moore-Penrose Generalized Inverse of a Matrix 
+#' 
+#' This function is originally implemented in package 'MASS' as function 
+#' \code{ginv}.  
+#' It was adapted to be able to deal with matrices from the 'Matrix' package, 
+#' e.g. sparse matrices. 
+#' 
+#' @param X (object) two-dimensional, for which a Moore-Penrose inverse 
+#' has to be computed 
+#' @param tol (numeric) tolerance value to be used in comparisons 
+#' 
+#' @return (object) A Moore-Penrose inverse of X. 
+#' 
+#' @author Authors of the 'MASS' package.  
+MPinv <- function (X, tol = sqrt(.Machine$double.eps))  {
+  if (length(dim(X)) > 2L) 
+    stop("'X' must be two-dimensional")
+
+  Xsvd <- svd(X)
+  Positive <- Xsvd$d > max(tol * Xsvd$d[1L], 0)
+  if (all(Positive)) {
+    Xsvd$v %*% (1/Xsvd$d * t(Xsvd$u)) 
+  } else if (!any(Positive)) {
+    array(0, dim(X)[2L:1L]) 	
+  } else {
+    Xsvd$v[, Positive, drop = FALSE] %*% 
+    ((1/Xsvd$d[Positive]) * t(Xsvd$u[, Positive, drop = FALSE]))
+  }
+}  
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -0,0 +1,7 @@
+# Generated by using Rcpp::compileAttributes() -> do not edit by hand
+# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+rcpp_hello_world <- function() {
+    .Call(`_summclust_rcpp_hello_world`)
+}
+
diff --git a/R/cluster_jackknife.R b/R/cluster_jackknife.R
@@ -34,16 +34,20 @@ cluster_jackknife <- function(
   # calculate X_g'X_g
   tXgXg <- lapply(
     unique_clusters,
-    function(x) crossprod(X[cluster_df == x, , drop = FALSE])
+    function(x) Matrix::crossprod(X[c(cluster_df == x), , drop = FALSE])
   )
   names(tXgXg) <- unique_clusters
   tXX <- Reduce("+", tXgXg)
   # all.equal(tXX, crossprod(X))
 
+  if(inherits(X, "Matrix")){
+    y <- Matrix::Matrix(y)
+  }
+
   tXgyg <- lapply(
     unique_clusters,
     function(x) {
-      t(X[cluster_df == x, , drop = FALSE]) %*% y[cluster_df == x, drop = FALSE]
+      Matrix::t(X[c(cluster_df == x), , drop = FALSE]) %*% y[c(cluster_df == x), drop = FALSE]
     }
   )
   names(tXgyg) <- unique_clusters
@@ -53,11 +57,15 @@ cluster_jackknife <- function(
   beta_hat <- solve(tXX) %*% tXy
   # initiate jackknife
 
+  tXX <- as.matrix(tXX)
+
   beta_jack <-
     lapply(
       unique_clusters,
       function(x) {
-        MASS::ginv(tXX - tXgXg[[x]]) %*% (tXy - (t(X[cluster_df == x, , drop = FALSE]) %*% y[cluster_df == x, drop = FALSE]))
+        summclust:::eigen_pinv(as.matrix(tXX - as.matrix(tXgXg[[x]]))) %*%
+        (tXy - (Matrix::t(X[c(cluster_df == x), , drop = FALSE]) %*%
+        y[c(cluster_df == x), drop = FALSE]))
       }
     )
 
@@ -70,7 +78,7 @@ cluster_jackknife <- function(
   V3 <- lapply(
     seq_along(unique_clusters),
     function(x) {
-      tcrossprod(beta_jack[[x]] - beta_center)
+      Matrix::tcrossprod(beta_jack[[x]] - beta_center)
     }
   )
 
@@ -95,3 +103,145 @@ cluster_jackknife <- function(
   res
 
 }
+
+
+cluster_jackknife_sparse <- function(
+    y,
+    X,
+    cluster_vec,
+    type){
+
+  #' Conducts the Cluster Jackknife as in MNW (2022) for
+  #' CRV3 / CRV3J variance matrix estimation
+  #' @param y A vector containing the dependent variable
+  #' @param X A regression design matrix of type "dgCMatrix" from the {Matrix}
+  #' package
+  #' @param cluster_vec A vector containing the cluster ids
+  #' @param type Either "CRV3" or "CRV3J", where each implements the
+  #' variance-covariance estimator from MNW (2022) of the same name
+  #'
+  #' @return An list, containing
+  #' \item{vcov}{The CRV3 variance-covariance matrix}
+  #' \item{beta_jack}{The leave-one-cluster-out jackknive regression
+  #'  coefficients}
+  #' \item{unique_clusters}{A vector containing all unique clusters contained in
+  #' `cluster_df`}
+  #' \item{tXgXg}{A list containing crossproducts of Xg, where X g is the
+  #' submatrix of the design matrix X which belong to observations in cluster g}
+  #' \item{tXX}{The crossproduct of the design matrix X}
+  #' \item{tXy}{t(X) %*% y}
+  #' \item{G}{The number of unique clusters}
+  #' \item{small_sample_correction}{The employed small sample correction}
+  #' @importFrom Matrix crossprod tcrossprod t
+  #'
+  #' @noRd
+
+  unique_clusters <- as.character(unique(cluster_vec))
+  g_idx <- split(seq_len(nrow(X)), cluster_vec)
+  G <- length(g_idx)
+  small_sample_correction <- (G - 1) / G
+
+  if(inherits(X, "Matrix")){
+    y <- Matrix::Matrix(y)
+  }
+
+  # calculate X_g'X_g
+  # Do this because subsetting is slow
+  n <- nrow(X)
+  K <- ncol(X)
+  X_triplet <- methods::as(X, "TsparseMatrix")
+  i <- X_triplet@i + 1
+  j <- X_triplet@j + 1
+
+  res <- lapply(
+    g_idx,
+    function(idx) {
+      keep <- which(i %in% idx)
+
+      Xg <- Matrix::sparseMatrix(
+        i = i[keep],
+        j = j[keep],
+        x = X_triplet@x[keep],
+        dims = c(n, K)
+      )
+      yg <- Matrix::sparseVector(
+        x = y[idx], i = idx, length = n
+      )
+
+      return(list(
+        tXgXg = Matrix::crossprod(Xg),
+        tXgyg = Matrix::crossprod(Xg, yg)
+      ))
+    }
+  )
+
+  # Extract from `res`
+  tXgXg <- lapply(res, function(x) x$tXgXg)
+  names(tXgXg) <- unique_clusters
+  tXgyg <- lapply(res, function(x) x$tXgyg)
+  names(tXgyg) <- unique_clusters
+
+  tXX <- Reduce("+", tXgXg)
+  # all(tXX == crossprod(X))
+  tXy <- Reduce("+", tXgyg)
+  # all(tXy == crossprod(X, y))
+
+  beta_hat <- Matrix::solve(tXX, tXy)
+
+  # initiate jackknife
+  beta_jack <-
+    lapply(
+      seq_len(G),
+      function(g) {
+        tXX_diff <- tXX - tXgXg[[g]]
+        tXy_diff <- tXy - tXgyg[[g]]
+
+        beta <- tryCatch(
+          {
+            Matrix::solve(tXX_diff, tXy_diff)
+          },
+          error = function(e) {
+            # message(paste0("Error when dropping cluster ", g, ". Using Psuedo-Inverse instead."))
+            summclust:::eigen_pinv(as.matrix(tXX_diff %*% tXy_diff))
+          }
+        )
+
+        return(beta)
+      }
+    )
+
+  if (type == "CRV3J") {
+    beta_bar <- beta_center <- Reduce("+", beta_jack) / G
+  } else if (type == "CRV3") {
+    beta_center <- beta_hat
+  }
+
+  V3 <- lapply(
+    seq_along(unique_clusters),
+    function(x) {
+      Matrix::tcrossprod(beta_jack[[x]] - beta_center)
+    }
+  )
+
+  vcov <- Reduce("+", V3) * small_sample_correction
+  beta_jack <- Reduce("cbind", beta_jack)
+
+  rownames(beta_jack) <- colnames(X)
+  colnames(beta_jack) <- unique_clusters
+  colnames(vcov) <- rownames(vcov) <- colnames(X)
+
+  res <- list(
+    vcov = vcov,
+    beta_jack = beta_jack,
+    unique_clusters = unique_clusters,
+    tXgXg = tXgXg,
+    tXgyg = tXgyg,
+    tXX = tXX,
+    tXy = tXy,
+    G = G,
+    small_sample_correction = small_sample_correction
+  )
+
+  res
+
+}