In this markdown (source here), we work through an example where we have a lot of variables, but only a few are significant. It demonstrates how different regularized regression techniques, namely ridge and lasso, can be used to tackle this problem with fewer data points than unknowns.
Regression and Large Dimensional Data
let’s make an example that uses a lot of variables but only a few that are important
100 variables, only 5 matter
require(magrittr)
## Loading required package: magrittr
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.1.1
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(ggplot2)
## Loading required package: ggplot2
bigDF <- data.frame()
N = 200
for (i in 1:100)
bigDF[1:N,i] <- runif(N)
Response2 = bigDF[,1] * 20 + bigDF[,2]^1.2 * 10 + bigDF[,3]^0.9*5+ bigDF[,4]*2.5+ bigDF[,5] + rowSums(0.1 * bigDF[,6:100]^0.5) + 5 + 0.01*runif(N)
sensDF <- data.frame(Method = 0, Var = 0, Value=0)
#ggpairs(bigDF[,c(101,1:10)], lower = list(continuous = "smooth"))
bigDF$Response <- Response2
summary(bigMod<-lm(Response ~ ., data=bigDF[1:110,]))
##
## Call:
## lm(formula = Response ~ ., data = bigDF[1:110, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.10282 -0.02516 -0.00289 0.02785 0.12180
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.17586 0.91008 7.88 2.5e-05 ***
## V1 20.21817 0.15144 133.50 3.8e-16 ***
## V2 10.33462 0.16088 64.24 2.7e-13 ***
## V3 4.50689 0.19033 23.68 2.0e-09 ***
## V4 2.46991 0.22513 10.97 1.6e-06 ***
## V5 0.95284 0.18391 5.18 0.00058 ***
## V6 0.07847 0.13143 0.60 0.56521
## V7 0.07530 0.16891 0.45 0.66628
## V8 -0.01637 0.21002 -0.08 0.93957
## V9 0.15221 0.17894 0.85 0.41703
## V10 0.00614 0.17618 0.03 0.97295
## V11 -0.11732 0.17104 -0.69 0.51003
## V12 0.07411 0.14762 0.50 0.62769
## V13 0.09823 0.18757 0.52 0.61313
## V14 0.21131 0.18372 1.15 0.27972
## V15 0.25595 0.16321 1.57 0.15127
## V16 0.05269 0.19339 0.27 0.79143
## V17 0.26275 0.12482 2.11 0.06459 .
## V18 -0.00310 0.18244 -0.02 0.98682
## V19 -0.09132 0.16415 -0.56 0.59155
## V20 0.26141 0.13003 2.01 0.07529 .
## V21 0.18142 0.13478 1.35 0.21120
## V22 -0.08955 0.22837 -0.39 0.70408
## V23 0.18933 0.17776 1.07 0.31458
## V24 0.02334 0.14609 0.16 0.87662
## V25 0.03237 0.20402 0.16 0.87745
## V26 0.37202 0.15890 2.34 0.04393 *
## V27 0.04643 0.14475 0.32 0.75574
## V28 0.22728 0.20590 1.10 0.29831
## V29 -0.04259 0.14353 -0.30 0.77339
## V30 0.19828 0.13548 1.46 0.17736
## V31 0.06783 0.14167 0.48 0.64349
## V32 0.16410 0.18856 0.87 0.40675
## V33 0.15603 0.12830 1.22 0.25487
## V34 0.05735 0.18754 0.31 0.76672
## V35 -0.15118 0.19597 -0.77 0.46023
## V36 0.13828 0.13086 1.06 0.31816
## V37 -0.23306 0.24050 -0.97 0.35782
## V38 0.03192 0.24820 0.13 0.90051
## V39 0.12000 0.12828 0.94 0.37396
## V40 -0.20686 0.14216 -1.46 0.17961
## V41 0.29576 0.20891 1.42 0.19052
## V42 -0.12536 0.17204 -0.73 0.48474
## V43 0.13949 0.15951 0.87 0.40456
## V44 0.21930 0.16438 1.33 0.21494
## V45 -0.14456 0.19073 -0.76 0.46789
## V46 -0.04426 0.14683 -0.30 0.76991
## V47 0.06443 0.21532 0.30 0.77155
## V48 0.19630 0.17268 1.14 0.28498
## V49 0.10600 0.16181 0.66 0.52882
## V50 0.04752 0.20073 0.24 0.81815
## V51 0.16232 0.16330 0.99 0.34617
## V52 -0.09940 0.15051 -0.66 0.52553
## V53 0.08780 0.21258 0.41 0.68925
## V54 -0.02115 0.17915 -0.12 0.90864
## V55 0.21637 0.14031 1.54 0.15744
## V56 0.03453 0.24854 0.14 0.89257
## V57 0.14471 0.12102 1.20 0.26234
## V58 0.12174 0.15198 0.80 0.44374
## V59 0.13562 0.17494 0.78 0.45809
## V60 0.10557 0.24791 0.43 0.68023
## V61 0.23608 0.21254 1.11 0.29548
## V62 -0.25061 0.23907 -1.05 0.32184
## V63 0.35590 0.18027 1.97 0.07979 .
## V64 0.13526 0.18983 0.71 0.49417
## V65 -0.02453 0.19714 -0.12 0.90371
## V66 0.01886 0.15298 0.12 0.90460
## V67 0.22642 0.20839 1.09 0.30548
## V68 0.06471 0.14710 0.44 0.67040
## V69 0.21364 0.15696 1.36 0.20657
## V70 -0.17908 0.22835 -0.78 0.45305
## V71 0.37911 0.16150 2.35 0.04348 *
## V72 0.02179 0.16819 0.13 0.89975
## V73 0.03364 0.17913 0.19 0.85519
## V74 0.02635 0.21718 0.12 0.90610
## V75 -0.00948 0.12486 -0.08 0.94112
## V76 0.04064 0.22361 0.18 0.85981
## V77 0.18110 0.15658 1.16 0.27720
## V78 0.08613 0.16584 0.52 0.61602
## V79 0.45618 0.17917 2.55 0.03140 *
## V80 -0.13203 0.30263 -0.44 0.67292
## V81 0.24713 0.18539 1.33 0.21528
## V82 0.17330 0.21346 0.81 0.43782
## V83 0.14445 0.18871 0.77 0.46358
## V84 0.26649 0.15970 1.67 0.12953
## V85 0.18485 0.17516 1.06 0.31878
## V86 -0.07645 0.17911 -0.43 0.67953
## V87 -0.13902 0.14796 -0.94 0.37196
## V88 -0.00801 0.19715 -0.04 0.96847
## V89 -0.01516 0.16484 -0.09 0.92872
## V90 0.03189 0.17708 0.18 0.86106
## V91 0.03323 0.17047 0.19 0.84978
## V92 0.07643 0.21798 0.35 0.73394
## V93 0.17277 0.24206 0.71 0.49346
## V94 -0.24668 0.28737 -0.86 0.41295
## V95 0.03635 0.26080 0.14 0.89221
## V96 -0.00406 0.20875 -0.02 0.98489
## V97 -0.00833 0.15435 -0.05 0.95814
## V98 0.18658 0.18604 1.00 0.34210
## V99 0.23637 0.19272 1.23 0.25112
## V100 0.24643 0.16505 1.49 0.16963
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.16 on 9 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 0.999
## F-statistic: 1.93e+03 on 100 and 9 DF, p-value: 3.19e-14
plotDF <- bigDF
plotDF[1:110,'Type'] <- 'Train'
plotDF[111:200,'Type'] <- 'Test'
plotDF$Predict <- predict(bigMod,plotDF)
plotDF$Error <- plotDF$Response-plotDF$Predict
ggplot(plotDF,aes(x=Response,y=Predict,color=Type,size=abs(Error))) + geom_point() + scale_size("Absolute Error") + geom_smooth(method="lm",se=F,size=1)
sqrt(var(data.frame(plotDF %>% filter(Type=="Test") %>% select(Error))))/110
## Error
## Error 0.004209
ggplot(plotDF,aes(x=Error)) + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
sensitivities <- coef(bigMod)
require(glmnet)
## Loading required package: glmnet
## Loading required package: Matrix
## Loaded glmnet 1.9-8
sensDF[1:length(sensitivities),'Method'] <- "Least-Squares"
sensDF[1:length(sensitivities),'Var'] <- names(sensitivities)
sensDF[1:length(sensitivities),'Value'] <- (sensitivities)
rowStart <- length(sensitivities)+1
Lasso regression
crossValid <- cv.glmnet(as.matrix(bigDF[1:40,1:100]),as.matrix(bigDF$Response[1:40]),alpha = 1)
plot(crossValid)
lambda <- crossValid$lambda.min
sensitivities <- coef(crossValid)
plotDF <- bigDF
plotDF[1:40,'Type'] <- 'Train'
plotDF[41:200,'Type'] <- 'Test'
plotDF[,"Predict" ]<- data.frame(Predict=predict(crossValid,as.matrix(plotDF[,1:100]),lambda=lambda))
plotDF$Error <- plotDF$Response-plotDF$Predict
ggplot(plotDF,aes(x=Response,y=Predict,color=Type,size=abs(Error))) + geom_point() + scale_size("Absolute Error") + geom_smooth(method="lm",se=F,size=1)
sqrt(var(data.frame(plotDF %>% filter(Type=="Test") %>% select(Error))))/40
## Error
## Error 0.01154
ggplot(plotDF,aes(x=Error)) + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
sensDF[rowStart:(rowStart + length(sensitivities)-1),'Method'] <- "Lasso"
sensDF[rowStart:(rowStart + length(sensitivities)-1),'Var'] <- t(t(rownames(sensitivities)))
sensDF[rowStart:(rowStart + length(sensitivities)-1),'Value'] <- as.numeric(sensitivities)
rowStart <- rowStart + length(sensitivities)
Ridge regression
crossValid <- cv.glmnet(as.matrix(bigDF[1:110,1:100]),as.matrix(bigDF$Response[1:110]),alpha = 0,lambda=exp(seq(-5,5,by=0.1)))
plot(crossValid)
lambda <- crossValid$lambda.min
sensitivities <- coef(crossValid)
plotDF <- bigDF
plotDF[1:110,'Type'] <- 'Train'
plotDF[111:200,'Type'] <- 'Test'
plotDF[,"Predict" ]<- data.frame(Predict=predict(crossValid,as.matrix(plotDF[,1:100]),lambda=lambda))
plotDF$Error <- plotDF$Response-plotDF$Predict
ggplot(plotDF,aes(x=Response,y=Predict,color=Type,size=abs(Error))) + geom_point() + scale_size("Absolute Error") + geom_smooth(method="lm",se=F,size=1)
sqrt(var(data.frame(plotDF %>% filter(Type=="Test") %>% select(Error))))/110
## Error
## Error 0.006032
ggplot(plotDF,aes(x=Error)) + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
sensDF[rowStart:(rowStart + length(sensitivities)-1),'Method'] <- "Ridge"
sensDF[rowStart:(rowStart + length(sensitivities)-1),'Var'] <- t(t(rownames(sensitivities)))
sensDF[rowStart:(rowStart + length(sensitivities)-1),'Value'] <- as.numeric(sensitivities)
rowStart <- rowStart + length(sensitivities)
#good values
sensDF[rowStart:(rowStart + length(sensitivities)-1),'Method'] <- "Exact"
sensDF[rowStart:(rowStart + length(sensitivities)-1),'Var'] <- t(t(rownames(sensitivities)))
sensDF[rowStart:(rowStart + length(sensitivities)-1),'Value'] <- c(5,20,12,5*.9,2.5,1,rep(0.1,95))
sensDF$Exact = rep(c(5,20,12,5*.9,2.5,1,rep(0.1,95)),4)
Compare Methods
ggplot(sensDF,aes(x=reorder(Var,-Value),y=Value,color=Method,group=Method)) + geom_point() + geom_line() + theme(panel.grid.major = element_blank(),axis.text.x = element_text(angle = 90, hjust = 1, size=8))+ scale_x_discrete("Variable") + scale_y_continuous("Coefficient")
ggplot(sensDF[sensDF$Exact<1,],aes(x=reorder(Var,-Value),y=Value,color=Method,group=Method)) + geom_point() + geom_line() + theme(panel.grid.major = element_blank(),axis.text.x = element_text(angle = 90, hjust = 1, size=8))+ scale_x_discrete("Variable") + scale_y_continuous("Coefficient")
ggplot(sensDF,aes(x=reorder(Var,-Value),y=abs(Value-Exact),color=Method,group=Method)) + geom_point() + geom_line() + theme(panel.grid.major = element_blank(),axis.text.x = element_text(angle = 90, hjust = 1, size=8)) + scale_x_discrete("Variable") + scale_y_continuous("Abs. Error in Coefficient")
// add bootstrap table styles to pandoc tables $(document).ready(function () { $('tr.header').parent('thead').parent('table').addClass('table table-condensed'); });