Data generation

# Toy data set: 7 observations of an outcome plus eight candidate
# predictors x1..x8. By construction x6 = x4 - x8 (checked further down).
data <- rbind(
  c(0.0, 0, 2, 0, 0, 1, 0, 1, 0),
  c(2.1, 1, 0, 2, 3, 2, 0, 0, 3),
  c(2.7, 0, 0, 0, 2, 2, 1, 1, 1),
  c(5.9, 3, 0, 1, 0, 0, 0, 2, 0),
  c(7.3, 3, 4, 0, 1, 1, 1, 0, 0),
  c(0.0, 0, 2, 0, 0, 3, 0, 0, 0),
  c(2.0, 1, 0, 2, 1, 0, 0, 0, 1)
)
colnames(data) <- c("outcome", paste0("x", 1:8))

Fitting a simple model

# Fit ordinary least squares of the outcome on x1 alone; `data` is a
# matrix, so it is converted to a data frame for the formula interface.
simple_model <- lm(outcome ~ x1, data = as.data.frame(data))
summary(simple_model)
## 
## Call:
## lm(formula = outcome ~ x1, data = as.data.frame(data))
## 
## Residuals:
##       1       2       3       4       5       6       7 
## -0.6632 -0.4829  2.0368 -0.5224  0.8776 -0.6632 -0.5829 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   0.6632     0.5914   1.121  0.31306   
## x1            1.9197     0.3499   5.487  0.00274 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.153 on 5 degrees of freedom
## Multiple R-squared:  0.8576, Adjusted R-squared:  0.8291 
## F-statistic: 30.11 on 1 and 5 DF,  p-value: 0.002743

Selecting variables with the lasso

# Lasso at a fixed penalty (lambda = 1): the outcome is the first column,
# the candidate predictors are the remaining eight.
library(glmnet)
lasso_fit <- glmnet(x = data[, 2:9], y = data[, 1], lambda = 1)
coef(lasso_fit)
## 9 x 1 sparse Matrix of class "dgCMatrix"
##                    s0
## (Intercept) 1.5311844
## x1          1.1012711
## x2          .        
## x3          .        
## x4          .        
## x5          .        
## x6          0.2357701
## x7          .        
## x8          .

Fitting a more complex model

# Refit by OLS using only the two variables the lasso selected (x1, x6).
complex_model <- lm(outcome ~ x1 + x6, data = as.data.frame(data))
summary(complex_model)
## 
## Call:
## lm(formula = outcome ~ x1 + x6, data = as.data.frame(data))
## 
## Residuals:
##          1          2          3          4          5          6          7 
## -2.190e-01  1.000e-01  3.714e-01  3.381e-01 -3.714e-01 -2.190e-01  1.747e-18 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.2190     0.1914   1.145  0.31621    
## x1            1.7810     0.1087  16.385 8.12e-05 ***
## x6            2.1095     0.2996   7.040  0.00215 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3522 on 4 degrees of freedom
## Multiple R-squared:  0.9894, Adjusted R-squared:  0.984 
## F-statistic: 186.1 on 2 and 4 DF,  p-value: 0.0001131

Checking linear dependencies

max(abs(data[, 7] - data[, 5] + data[, 9]))  # x_6 = x_4 - x_8
## [1] 0

Verifying the coefficients of determination

See the literature section for the definitions of the coefficient of determination and its adjusted variant that are implemented below.

# Print the R-squared and adjusted R-squared of a linear estimator on `data`.
#
# Args:
#   estimator: numeric coefficient vector, intercept first, followed by the
#              slopes for data[, 2], data[, 3], ... in order. Entries that
#              are exactly zero count as excluded variables.
#   data:      numeric matrix whose first column is the outcome and whose
#              remaining columns are the predictors.
#
# The adjustment uses the number of NONZERO slopes, so a sparse (lasso-style)
# estimator is not penalized for coefficients set exactly to zero.
# Prints both quantities and invisibly returns the adjusted R-squared.
Rsquared <- function(estimator, data) {
  stopifnot(length(estimator) >= 2, length(estimator) <= ncol(data))
  n <- nrow(data)
  p <- length(estimator) - 1     # number of slope entries supplied
  s <- sum(estimator[-1] != 0)   # number of active (nonzero) slopes
  X <- cbind(rep(1, n), data[, 2:(p + 1)])  # design: intercept and x'es
  rss <- sum((data[, 1] - X %*% estimator)^2)
  tss <- var(data[, 1]) * (n - 1)  # total sum of squares about the mean
  r2 <- 1 - rss / tss
  r2_adj <- 1 - (1 - r2) * (n - 1) / (n - s - 1)
  print(r2)
  print(r2_adj)
  invisible(r2_adj)
}
# OLS estimator: intercept and x1 slope from the simple fit.
estimatorls <- coef(simple_model)
# Lasso-selected estimator embedded in the full coefficient vector
# (intercept plus 8 slopes); only the intercept, x1 and x6 slots are
# nonzero. Do not forget the intercept position.
estimatorlasso <- numeric(ncol(data))
estimatorlasso[c(1, 2, 7)] <- coef(complex_model)
Rsquared(estimatorls, data)
## [1] 0.8575922
## [1] 0.8291106
Rsquared(estimatorlasso, data)
## [1] 0.9893652
## [1] 0.9840478

Making a latex table

# Setting the knitr chunk option comment = NA strips the "##" prefixes
# from echoed output, which makes copy-pasting the LaTeX below easier.
library(stargazer)
stargazer(data[, c("outcome", "x1")])

% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
% Date and time: Sun, Oct 12, 2025 - 14:47:56
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}} cc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
outcome & x1 \\ 
\hline \\[-1.8ex] 
$0$ & $0$ \\ 
$2.100$ & $1$ \\ 
$2.700$ & $0$ \\ 
$5.900$ & $3$ \\ 
$7.300$ & $3$ \\ 
$0$ & $0$ \\ 
$2$ & $1$ \\ 
\hline \\[-1.8ex] 
\end{tabular} 
\end{table} 
stargazer(data)

% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
% Date and time: Sun, Oct 12, 2025 - 14:47:56
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}} ccccccccc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
outcome & x1 & x2 & x3 & x4 & x5 & x6 & x7 & x8 \\ 
\hline \\[-1.8ex] 
$0$ & $0$ & $2$ & $0$ & $0$ & $1$ & $0$ & $1$ & $0$ \\ 
$2.100$ & $1$ & $0$ & $2$ & $3$ & $2$ & $0$ & $0$ & $3$ \\ 
$2.700$ & $0$ & $0$ & $0$ & $2$ & $2$ & $1$ & $1$ & $1$ \\ 
$5.900$ & $3$ & $0$ & $1$ & $0$ & $0$ & $0$ & $2$ & $0$ \\ 
$7.300$ & $3$ & $4$ & $0$ & $1$ & $1$ & $1$ & $0$ & $0$ \\ 
$0$ & $0$ & $2$ & $0$ & $0$ & $3$ & $0$ & $0$ & $0$ \\ 
$2$ & $1$ & $0$ & $2$ & $1$ & $0$ & $0$ & $0$ & $1$ \\ 
\hline \\[-1.8ex] 
\end{tabular} 
\end{table}