Skip to content

CRAN release v0.6 #16

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 24 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/pkgdown.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jobs:

- uses: r-lib/actions/setup-r-dependencies@v1
with:
extra-packages: pkgdown
extra-packages: any::pkgdown, local::.
needs: website

- name: Deploy package
Expand Down
18 changes: 11 additions & 7 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,31 +1,32 @@
Package: summclust
Title: Module to Compute Influence and Leverage Statistics for Regression Models with Clustered Errors
Version: 0.7.0
Title: Module to Compute Influence and Leverage Statistics for Regression Models
with Clustered Errors
Version: 0.8.0
Authors@R:
c(
person(given = "Alexander",
family = "Fischer",
role = c("aut", "cre"),
email = "[email protected]")
)
Description: Module to compute cluster specific information for regression models
Description: Module to compute cluster robust covariance matrices (CRV3 and
CRV3J) and cluster specific information for regression models
with clustered errors, including leverage and influence statistics.
Models of type 'lm' and 'fixest'(from the 'stats' and 'fixest' packages)
are supported. 'summclust' implements similar features as the
user-written 'summclust.ado' Stata module (MacKinnon, Nielsen & Webb, 2022;
<arXiv:2205.03288v1>).
License: MIT + file LICENSE
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.1
Imports:
utils,
dreamerr,
MASS,
collapse,
generics,
cli,
rlang
Matrix,
Rcpp (>= 1.0.9),
RcppEigen
Suggests:
ggplot2,
latex2exp,
Expand All @@ -38,7 +39,10 @@ Suggests:
knitr,
rmarkdown,
covr
LinkingTo: Rcpp, RcppEigen
Remotes: kylebutts/fixest/tree/sparse-matrix
Config/testthat/edition: 3
URL: https://s3alfisc.github.io/summclust/
BugReports: https://github.com/s3alfisc/summclust/issues
VignetteBuilder: knitr
RoxygenNote: 7.2.3
8 changes: 7 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,12 @@ S3method(vcov_CR3J,lm)
export(summclust)
export(tidy)
export(vcov_CR3J)
importFrom(MASS,ginv)
importFrom(Matrix,crossprod)
importFrom(Matrix,diag)
importFrom(Matrix,solve)
importFrom(Matrix,sparse.model.matrix)
importFrom(Matrix,t)
importFrom(Matrix,tcrossprod)
importFrom(cli,cli_abort)
importFrom(collapse,GRP)
importFrom(collapse,add_vars)
Expand All @@ -36,4 +41,5 @@ importFrom(stats,rnorm)
importFrom(stats,terms)
importFrom(stats,update)
importFrom(stats,weights)
importFrom(utils,compareVersion)
importFrom(utils,stack)
9 changes: 8 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
# New features since latest release
# summclust 0.7

* Adds option to sparsify `vcov_CR3J.fixest`. This reduces the computational cost of
computing CRV3 covariance matrices when the regression's design matrix
gets very sparse. It can't be used within `summclust` because the `beta_jack` do not
match. However, it's useful when users just want to access `vcov_CR3J.fixest`.

# summclust 0.6

# summclust 0.7

Expand Down
30 changes: 30 additions & 0 deletions R/MPinv.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Function taken from `VCA::MPinv`
#' Moore-Penrose Generalized Inverse of a Matrix
#'
#' This function is originally implemented in package 'MASS' as function
#' \code{ginv}.
#' It was adapted to be able to deal with matrices from the 'Matrix' package,
#' e.g. sparse matrices.
#'
#' @param X (object) two-dimensional, for which a Moore-Penrose inverse
#' has to be computed
#' @param tol (numeric) tolerance value to be used in comparisons
#'
#' @return (object) A Moore-Penrose inverse of X.
#'
#' @author Authors of the 'MASS' package.
MPinv <- function (X, tol = sqrt(.Machine$double.eps)) {
if (length(dim(X)) > 2L)
stop("'X' must be two-dimensional")

Xsvd <- svd(X)
Positive <- Xsvd$d > max(tol * Xsvd$d[1L], 0)
if (all(Positive)) {
Xsvd$v %*% (1/Xsvd$d * t(Xsvd$u))
} else if (!any(Positive)) {
array(0, dim(X)[2L:1L])
} else {
Xsvd$v[, Positive, drop = FALSE] %*%
((1/Xsvd$d[Positive]) * t(Xsvd$u[, Positive, drop = FALSE]))
}
}
7 changes: 7 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

rcpp_hello_world <- function() {
.Call(`_summclust_rcpp_hello_world`)
}

158 changes: 154 additions & 4 deletions R/cluster_jackknife.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,20 @@ cluster_jackknife <- function(
# calculate X_g'X_g
tXgXg <- lapply(
unique_clusters,
function(x) crossprod(X[cluster_df == x, , drop = FALSE])
function(x) Matrix::crossprod(X[c(cluster_df == x), , drop = FALSE])
)
names(tXgXg) <- unique_clusters
tXX <- Reduce("+", tXgXg)
# all.equal(tXX, crossprod(X))

if(inherits(X, "Matrix")){
y <- Matrix::Matrix(y)
}

tXgyg <- lapply(
unique_clusters,
function(x) {
t(X[cluster_df == x, , drop = FALSE]) %*% y[cluster_df == x, drop = FALSE]
Matrix::t(X[c(cluster_df == x), , drop = FALSE]) %*% y[c(cluster_df == x), drop = FALSE]
}
)
names(tXgyg) <- unique_clusters
Expand All @@ -53,11 +57,15 @@ cluster_jackknife <- function(
beta_hat <- solve(tXX) %*% tXy
# initiate jackknife

tXX <- as.matrix(tXX)

beta_jack <-
lapply(
unique_clusters,
function(x) {
MASS::ginv(tXX - tXgXg[[x]]) %*% (tXy - (t(X[cluster_df == x, , drop = FALSE]) %*% y[cluster_df == x, drop = FALSE]))
summclust:::eigen_pinv(as.matrix(tXX - as.matrix(tXgXg[[x]]))) %*%
(tXy - (Matrix::t(X[c(cluster_df == x), , drop = FALSE]) %*%
y[c(cluster_df == x), drop = FALSE]))
}
)

Expand All @@ -70,7 +78,7 @@ cluster_jackknife <- function(
V3 <- lapply(
seq_along(unique_clusters),
function(x) {
tcrossprod(beta_jack[[x]] - beta_center)
Matrix::tcrossprod(beta_jack[[x]] - beta_center)
}
)

Expand All @@ -95,3 +103,145 @@ cluster_jackknife <- function(
res

}


cluster_jackknife_sparse <- function(
y,
X,
cluster_vec,
type){

#' Conducts the Cluster Jackknife as in MNW (2022) for
#' CRV3 / CRV3J variance matrix estimation
#' @param y A vector containing the dependent variable
#' @param X A regression design matrix of type "dgCMatrix" from the {Matrix}
#' package
#' @param cluster_vec A vector containing the cluster ids
#' @param type Either "CRV3" or "CRV3J", where each implements the
#' variance-covariance estimator from MNW (2022) of the same name
#'
#' @return An list, containing
#' \item{vcov}{The CRV3 variance-covariance matrix}
#' \item{beta_jack}{The leave-one-cluster-out jackknive regression
#' coefficients}
#' \item{unique_clusters}{A vector containing all unique clusters contained in
#' `cluster_df`}
#' \item{tXgXg}{A list containing crossproducts of Xg, where X g is the
#' submatrix of the design matrix X which belong to observations in cluster g}
#' \item{tXX}{The crossproduct of the design matrix X}
#' \item{tXy}{t(X) %*% y}
#' \item{G}{The number of unique clusters}
#' \item{small_sample_correction}{The employed small sample correction}
#' @importFrom Matrix crossprod tcrossprod t
#'
#' @noRd

unique_clusters <- as.character(unique(cluster_vec))
g_idx <- split(seq_len(nrow(X)), cluster_vec)
G <- length(g_idx)
small_sample_correction <- (G - 1) / G

if(inherits(X, "Matrix")){
y <- Matrix::Matrix(y)
}

# calculate X_g'X_g
# Do this because subsetting is slow
n <- nrow(X)
K <- ncol(X)
X_triplet <- methods::as(X, "TsparseMatrix")
i <- X_triplet@i + 1
j <- X_triplet@j + 1

res <- lapply(
g_idx,
function(idx) {
keep <- which(i %in% idx)

Xg <- Matrix::sparseMatrix(
i = i[keep],
j = j[keep],
x = X_triplet@x[keep],
dims = c(n, K)
)
yg <- Matrix::sparseVector(
x = y[idx], i = idx, length = n
)

return(list(
tXgXg = Matrix::crossprod(Xg),
tXgyg = Matrix::crossprod(Xg, yg)
))
}
)

# Extract from `res`
tXgXg <- lapply(res, function(x) x$tXgXg)
names(tXgXg) <- unique_clusters
tXgyg <- lapply(res, function(x) x$tXgyg)
names(tXgyg) <- unique_clusters

tXX <- Reduce("+", tXgXg)
# all(tXX == crossprod(X))
tXy <- Reduce("+", tXgyg)
# all(tXy == crossprod(X, y))

beta_hat <- Matrix::solve(tXX, tXy)

# initiate jackknife
beta_jack <-
lapply(
seq_len(G),
function(g) {
tXX_diff <- tXX - tXgXg[[g]]
tXy_diff <- tXy - tXgyg[[g]]

beta <- tryCatch(
{
Matrix::solve(tXX_diff, tXy_diff)
},
error = function(e) {
# message(paste0("Error when dropping cluster ", g, ". Using Psuedo-Inverse instead."))
summclust:::eigen_pinv(as.matrix(tXX_diff %*% tXy_diff))
}
)

return(beta)
}
)

if (type == "CRV3J") {
beta_bar <- beta_center <- Reduce("+", beta_jack) / G
} else if (type == "CRV3") {
beta_center <- beta_hat
}

V3 <- lapply(
seq_along(unique_clusters),
function(x) {
Matrix::tcrossprod(beta_jack[[x]] - beta_center)
}
)

vcov <- Reduce("+", V3) * small_sample_correction
beta_jack <- Reduce("cbind", beta_jack)

rownames(beta_jack) <- colnames(X)
colnames(beta_jack) <- unique_clusters
colnames(vcov) <- rownames(vcov) <- colnames(X)

res <- list(
vcov = vcov,
beta_jack = beta_jack,
unique_clusters = unique_clusters,
tXgXg = tXgXg,
tXgyg = tXgyg,
tXX = tXX,
tXy = tXy,
G = G,
small_sample_correction = small_sample_correction
)

res

}
Loading