In this markdown (source here), we work through an example where we have a lot of variables, but only a few are significant. It demonstrates how different regularized regression techniques, namely ridge and lasso, can be used to tackle this problem with fewer data points than unknowns.

Regression and Large Dimensional Data

let’s make an example that uses a lot of variables but only a few that are important

100 variables, only 5 matter

require(magrittr)

## Loading required package: magrittr

require(dplyr)

## Loading required package: dplyr

## Warning: package 'dplyr' was built under R version 3.1.1

## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

require(ggplot2)

## Loading required package: ggplot2

bigDF <- data.frame()
N = 200
for (i in 1:100)
  bigDF[1:N,i] <- runif(N)

Response2 = bigDF[,1] * 20 + bigDF[,2]^1.2 * 10 + bigDF[,3]^0.9*5+ bigDF[,4]*2.5+ bigDF[,5] + rowSums(0.1 * bigDF[,6:100]^0.5) + 5 + 0.01*runif(N)



sensDF <- data.frame(Method = 0, Var = 0, Value=0)

#ggpairs(bigDF[,c(101,1:10)], lower = list(continuous = "smooth")) 

bigDF$Response <- Response2

summary(bigMod<-lm(Response ~ ., data=bigDF[1:110,]))

## 
## Call:
## lm(formula = Response ~ ., data = bigDF[1:110, ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.10282 -0.02516 -0.00289  0.02785  0.12180 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.17586    0.91008    7.88  2.5e-05 ***
## V1          20.21817    0.15144  133.50  3.8e-16 ***
## V2          10.33462    0.16088   64.24  2.7e-13 ***
## V3           4.50689    0.19033   23.68  2.0e-09 ***
## V4           2.46991    0.22513   10.97  1.6e-06 ***
## V5           0.95284    0.18391    5.18  0.00058 ***
## V6           0.07847    0.13143    0.60  0.56521    
## V7           0.07530    0.16891    0.45  0.66628    
## V8          -0.01637    0.21002   -0.08  0.93957    
## V9           0.15221    0.17894    0.85  0.41703    
## V10          0.00614    0.17618    0.03  0.97295    
## V11         -0.11732    0.17104   -0.69  0.51003    
## V12          0.07411    0.14762    0.50  0.62769    
## V13          0.09823    0.18757    0.52  0.61313    
## V14          0.21131    0.18372    1.15  0.27972    
## V15          0.25595    0.16321    1.57  0.15127    
## V16          0.05269    0.19339    0.27  0.79143    
## V17          0.26275    0.12482    2.11  0.06459 .  
## V18         -0.00310    0.18244   -0.02  0.98682    
## V19         -0.09132    0.16415   -0.56  0.59155    
## V20          0.26141    0.13003    2.01  0.07529 .  
## V21          0.18142    0.13478    1.35  0.21120    
## V22         -0.08955    0.22837   -0.39  0.70408    
## V23          0.18933    0.17776    1.07  0.31458    
## V24          0.02334    0.14609    0.16  0.87662    
## V25          0.03237    0.20402    0.16  0.87745    
## V26          0.37202    0.15890    2.34  0.04393 *  
## V27          0.04643    0.14475    0.32  0.75574    
## V28          0.22728    0.20590    1.10  0.29831    
## V29         -0.04259    0.14353   -0.30  0.77339    
## V30          0.19828    0.13548    1.46  0.17736    
## V31          0.06783    0.14167    0.48  0.64349    
## V32          0.16410    0.18856    0.87  0.40675    
## V33          0.15603    0.12830    1.22  0.25487    
## V34          0.05735    0.18754    0.31  0.76672    
## V35         -0.15118    0.19597   -0.77  0.46023    
## V36          0.13828    0.13086    1.06  0.31816    
## V37         -0.23306    0.24050   -0.97  0.35782    
## V38          0.03192    0.24820    0.13  0.90051    
## V39          0.12000    0.12828    0.94  0.37396    
## V40         -0.20686    0.14216   -1.46  0.17961    
## V41          0.29576    0.20891    1.42  0.19052    
## V42         -0.12536    0.17204   -0.73  0.48474    
## V43          0.13949    0.15951    0.87  0.40456    
## V44          0.21930    0.16438    1.33  0.21494    
## V45         -0.14456    0.19073   -0.76  0.46789    
## V46         -0.04426    0.14683   -0.30  0.76991    
## V47          0.06443    0.21532    0.30  0.77155    
## V48          0.19630    0.17268    1.14  0.28498    
## V49          0.10600    0.16181    0.66  0.52882    
## V50          0.04752    0.20073    0.24  0.81815    
## V51          0.16232    0.16330    0.99  0.34617    
## V52         -0.09940    0.15051   -0.66  0.52553    
## V53          0.08780    0.21258    0.41  0.68925    
## V54         -0.02115    0.17915   -0.12  0.90864    
## V55          0.21637    0.14031    1.54  0.15744    
## V56          0.03453    0.24854    0.14  0.89257    
## V57          0.14471    0.12102    1.20  0.26234    
## V58          0.12174    0.15198    0.80  0.44374    
## V59          0.13562    0.17494    0.78  0.45809    
## V60          0.10557    0.24791    0.43  0.68023    
## V61          0.23608    0.21254    1.11  0.29548    
## V62         -0.25061    0.23907   -1.05  0.32184    
## V63          0.35590    0.18027    1.97  0.07979 .  
## V64          0.13526    0.18983    0.71  0.49417    
## V65         -0.02453    0.19714   -0.12  0.90371    
## V66          0.01886    0.15298    0.12  0.90460    
## V67          0.22642    0.20839    1.09  0.30548    
## V68          0.06471    0.14710    0.44  0.67040    
## V69          0.21364    0.15696    1.36  0.20657    
## V70         -0.17908    0.22835   -0.78  0.45305    
## V71          0.37911    0.16150    2.35  0.04348 *  
## V72          0.02179    0.16819    0.13  0.89975    
## V73          0.03364    0.17913    0.19  0.85519    
## V74          0.02635    0.21718    0.12  0.90610    
## V75         -0.00948    0.12486   -0.08  0.94112    
## V76          0.04064    0.22361    0.18  0.85981    
## V77          0.18110    0.15658    1.16  0.27720    
## V78          0.08613    0.16584    0.52  0.61602    
## V79          0.45618    0.17917    2.55  0.03140 *  
## V80         -0.13203    0.30263   -0.44  0.67292    
## V81          0.24713    0.18539    1.33  0.21528    
## V82          0.17330    0.21346    0.81  0.43782    
## V83          0.14445    0.18871    0.77  0.46358    
## V84          0.26649    0.15970    1.67  0.12953    
## V85          0.18485    0.17516    1.06  0.31878    
## V86         -0.07645    0.17911   -0.43  0.67953    
## V87         -0.13902    0.14796   -0.94  0.37196    
## V88         -0.00801    0.19715   -0.04  0.96847    
## V89         -0.01516    0.16484   -0.09  0.92872    
## V90          0.03189    0.17708    0.18  0.86106    
## V91          0.03323    0.17047    0.19  0.84978    
## V92          0.07643    0.21798    0.35  0.73394    
## V93          0.17277    0.24206    0.71  0.49346    
## V94         -0.24668    0.28737   -0.86  0.41295    
## V95          0.03635    0.26080    0.14  0.89221    
## V96         -0.00406    0.20875   -0.02  0.98489    
## V97         -0.00833    0.15435   -0.05  0.95814    
## V98          0.18658    0.18604    1.00  0.34210    
## V99          0.23637    0.19272    1.23  0.25112    
## V100         0.24643    0.16505    1.49  0.16963    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.16 on 9 degrees of freedom
## Multiple R-squared:     1,   Adjusted R-squared:  0.999 
## F-statistic: 1.93e+03 on 100 and 9 DF,  p-value: 3.19e-14

plotDF <- bigDF
plotDF[1:110,'Type'] <- 'Train'
plotDF[111:200,'Type'] <- 'Test'
plotDF$Predict <- predict(bigMod,plotDF)
plotDF$Error <- plotDF$Response-plotDF$Predict
ggplot(plotDF,aes(x=Response,y=Predict,color=Type,size=abs(Error))) + geom_point() + scale_size("Absolute Error") + geom_smooth(method="lm",se=F,size=1)

plot of chunk unnamed-chunk-1

sqrt(var(data.frame(plotDF %>% filter(Type=="Test") %>% select(Error))))/110

##          Error
## Error 0.004209

ggplot(plotDF,aes(x=Error)) + geom_histogram()

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-1

sensitivities <- coef(bigMod)
require(glmnet)

## Loading required package: glmnet
## Loading required package: Matrix
## Loaded glmnet 1.9-8

sensDF[1:length(sensitivities),'Method'] <- "Least-Squares"
sensDF[1:length(sensitivities),'Var'] <- names(sensitivities)
sensDF[1:length(sensitivities),'Value'] <- (sensitivities)
rowStart <- length(sensitivities)+1

Lasso regression

crossValid <- cv.glmnet(as.matrix(bigDF[1:40,1:100]),as.matrix(bigDF$Response[1:40]),alpha = 1)
plot(crossValid)

plot of chunk unnamed-chunk-2

lambda <- crossValid$lambda.min
sensitivities <- coef(crossValid)
plotDF <- bigDF
plotDF[1:40,'Type'] <- 'Train'
plotDF[41:200,'Type'] <- 'Test'
plotDF[,"Predict" ]<- data.frame(Predict=predict(crossValid,as.matrix(plotDF[,1:100]),lambda=lambda))
plotDF$Error <- plotDF$Response-plotDF$Predict
ggplot(plotDF,aes(x=Response,y=Predict,color=Type,size=abs(Error))) + geom_point() + scale_size("Absolute Error") + geom_smooth(method="lm",se=F,size=1)

plot of chunk unnamed-chunk-2

sqrt(var(data.frame(plotDF %>% filter(Type=="Test") %>% select(Error))))/40

##         Error
## Error 0.01154

ggplot(plotDF,aes(x=Error)) + geom_histogram()

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-2

sensDF[rowStart:(rowStart + length(sensitivities)-1),'Method'] <- "Lasso"
sensDF[rowStart:(rowStart + length(sensitivities)-1),'Var'] <- t(t(rownames(sensitivities)))
sensDF[rowStart:(rowStart + length(sensitivities)-1),'Value'] <- as.numeric(sensitivities)
rowStart <- rowStart + length(sensitivities)

Ridge regression

crossValid <- cv.glmnet(as.matrix(bigDF[1:110,1:100]),as.matrix(bigDF$Response[1:110]),alpha = 0,lambda=exp(seq(-5,5,by=0.1)))
plot(crossValid)

plot of chunk unnamed-chunk-3

lambda <- crossValid$lambda.min
sensitivities <- coef(crossValid)
plotDF <- bigDF
plotDF[1:110,'Type'] <- 'Train'
plotDF[111:200,'Type'] <- 'Test'
plotDF[,"Predict" ]<- data.frame(Predict=predict(crossValid,as.matrix(plotDF[,1:100]),lambda=lambda))
plotDF$Error <- plotDF$Response-plotDF$Predict
ggplot(plotDF,aes(x=Response,y=Predict,color=Type,size=abs(Error))) + geom_point() + scale_size("Absolute Error") + geom_smooth(method="lm",se=F,size=1)

plot of chunk unnamed-chunk-3

sqrt(var(data.frame(plotDF %>% filter(Type=="Test") %>% select(Error))))/110

##          Error
## Error 0.006032

ggplot(plotDF,aes(x=Error)) + geom_histogram()

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-3

sensDF[rowStart:(rowStart + length(sensitivities)-1),'Method'] <- "Ridge"
sensDF[rowStart:(rowStart + length(sensitivities)-1),'Var'] <- t(t(rownames(sensitivities)))
sensDF[rowStart:(rowStart + length(sensitivities)-1),'Value'] <- as.numeric(sensitivities)
rowStart <- rowStart + length(sensitivities)

#good values
sensDF[rowStart:(rowStart + length(sensitivities)-1),'Method'] <- "Exact"
sensDF[rowStart:(rowStart + length(sensitivities)-1),'Var'] <- t(t(rownames(sensitivities)))
sensDF[rowStart:(rowStart + length(sensitivities)-1),'Value'] <- c(5,20,12,5*.9,2.5,1,rep(0.1,95))

sensDF$Exact = rep(c(5,20,12,5*.9,2.5,1,rep(0.1,95)),4)

Compare Methods

ggplot(sensDF,aes(x=reorder(Var,-Value),y=Value,color=Method,group=Method)) + geom_point() + geom_line() + theme(panel.grid.major = element_blank(),axis.text.x = element_text(angle = 90, hjust = 1, size=8))+ scale_x_discrete("Variable") + scale_y_continuous("Coefficient")

ggplot(sensDF[sensDF$Exact<1,],aes(x=reorder(Var,-Value),y=Value,color=Method,group=Method)) + geom_point() + geom_line() + theme(panel.grid.major = element_blank(),axis.text.x = element_text(angle = 90, hjust = 1, size=8))+ scale_x_discrete("Variable") + scale_y_continuous("Coefficient")

plot of chunk unnamed-chunk-4

ggplot(sensDF,aes(x=reorder(Var,-Value),y=abs(Value-Exact),color=Method,group=Method)) + geom_point() + geom_line() + theme(panel.grid.major = element_blank(),axis.text.x = element_text(angle = 90, hjust = 1, size=8)) + scale_x_discrete("Variable") + scale_y_continuous("Abs. Error in Coefficient")

plot of chunk unnamed-chunk-4

DrRyanMc.com

The Computational Engineering, Statistical Forecasting, Data Science, and Oxford Commas of Ryan McClarren

Large-scale sensitivity analysis using regression

Regression and Large Dimensional Data

Lasso regression

Ridge regression

Compare Methods

Leave a Reply Cancel reply