Overview

The goal of this project is to select and validate a predictive model for the bikeshare system “CapitalShare” in Washington DC.

Datasets

Datasets used in this project are from Capital Bikeshare User Data (https://www.capitalbikeshare.com/system-data), from Jan 2016 to Dec 2017 in a two-year time space, mainly focusing on variables include date, time, station, member type, duration, etc. To facilitate the analysis about user trends, we also obtained weather data in the Washington, D.C. area from Weather Underground (https://www.wunderground.com/).

Load Packages and Datasets


Load Packages

library(tidyverse)
library(psych)
library(forcats)
library(lubridate)
library(GGally)
library(corrplot)
library(leaps)
library(glmnet)
library(MASS)
library(modelr)
library(gam)
library(plotmo)

Dataset Preparation

Load datasets

quarters_raw <- c("2016Q1","2016Q2","2016Q3","2016Q4","2017Q1","2017Q2","2017Q3","2017Q4")
filenames = paste(c('2016_q1','2016_q2','2016_q3','2016_q4','2017_q1','2017_q2','2017_q3','2017_q4'), "trip_history_data.csv", sep = "_")
dfnames = paste0('bike_original',quarters_raw)
#a for loop is used to read and name 8 csv's, and assign R friendly column names
for (i in 1:8){
  assign(dfnames[i],read_csv(file=filenames[i],col_names=c("Duration","Start_date","End_date","Start_station_number","Start_station","End_station_number","End_station","Bike_number","Member_type"),skip=1)) 
}
add_season <- function(df,x){
  df %>% 
    add_column(season=x)
}

df_lst <- list(bike_original2016Q1,bike_original2016Q2,bike_original2016Q3,bike_original2016Q4,bike_original2017Q1,bike_original2017Q2,bike_original2017Q3,bike_original2017Q4)
seasons <- c("winter","spring","summer","fall","winter","spring","summer","fall")
quarters <- c("2016Q1","2016Q2","2016Q3","2016Q4","2017Q1","2017Q2","2017Q3","2017Q4")

for (i in 1:8) {
  assign(paste0('bike_original',quarters[i]),add_season(df_lst[[i]],seasons[i]))
}


combine 8 into 1 and add year, month, day, weekday and specific date

bike <- rbind(bike_original2016Q1,bike_original2016Q2,bike_original2016Q3,bike_original2016Q4,bike_original2017Q1,bike_original2017Q2,bike_original2017Q3,bike_original2017Q4)  

bike <-bike %>% 
  mutate(
    year= year(ymd_hms(Start_date)),
    month= month(ymd_hms(Start_date)),
    day= day(ymd_hms(Start_date)),
    weekday=wday(ymd_hms(Start_date)),
    date = make_date(year,month,day)) %>% 
  mutate(Member_type=as_factor(Member_type))
bike<-bike %>% 
  mutate(year= as.factor(year),
    month= as.factor(month),
    weekday=as.factor(weekday),
    season=as.factor(season))

create bike by day to get counts per day

bike_by_day <-
  bike %>% 
  group_by(date,Member_type) %>% 
  mutate(count=n(),
         total_duration_in_seconds=sum(as.numeric(Duration)*0.001)) %>% 
  dplyr::select(date,count,year,month,weekday,season,total_duration_in_seconds,Member_type) %>% 
  distinct(date,count,year,month,weekday,season,total_duration_in_seconds,Member_type) 
bike_by_day<-bike_by_day %>% 
  as.tibble()

read in weather

weather2017 <- read_csv(file="2017WeatherData.csv") %>% 
  add_column(year="2017") %>% 
  mutate(date = make_date(year,month,day)) %>% 
  dplyr::select(-year)
weather2016 <-read_csv(file="2016WeatherData.csv") %>% 
    add_column(year="2016") %>% 
  mutate(date = make_date(year,month,day)) %>% 
    dplyr::select(-year)
#create weather tibble
weather<-as_tibble(rbind(weather2016,weather2017)) 
#transform percipitation
weather <- weather %>% 
  within(precip[precip == 'T'] <- '0.005') %>% 
  transform(precip=as.double(precip))
weather <- weather %>% 
  within(high_wind[high_wind == '-'] <- '0') %>% 
  transform(high_wind=as.double(high_wind))

bike_weather<-
  bike_by_day%>%
  left_join(weather, by = c("date" = "date")) %>% 
  dplyr::select(-day,-events,-month.y,-total_duration_in_seconds)

Data Preprocess

numeric_bike <- as.data.frame(bike_weather[,unlist(lapply(bike_weather,is.numeric))])
par(mfrow=c(2,3))
for (i in 1:20){
  hist(numeric_bike[,i],main = names(numeric_bike)[i] )
}

#skewed variables: high humidity, avg_visibility, low_visibility, precip
#do log transformation first
bike_weather <- bike_weather %>% 
  mutate(log_high_humidity=log(high_humidity+0.00000001),
         log_avg_visi=log(avg_visibility+0.00000001),
         log_low_visi=log(low_visibility+0.00000001),
         log_precip=log(precip+0.00000001)) %>% 
  dplyr::select(-high_humidity,-avg_visibility,-low_visibility,-precip)

bike_weather<- bike_weather %>% 
 mutate(season=as_factor(season),
         high_visibility=as.double(high_visibility))
summary(bike_weather[,-1]) # not include date
##      count         year        month.x    weekday    season   
##  Min.   :    3   2016:724   3      :124   1:208   fall  :368  
##  1st Qu.: 1918   2017:730   5      :124   2:206   spring:364  
##  Median : 4245              7      :124   3:206   summer:368  
##  Mean   : 4878              8      :124   4:208   winter:354  
##  3rd Qu.: 7555              10     :124   5:208               
##  Max.   :11851              12     :124   6:210               
##                             (Other):710   7:208               
##  Member_type    high_temp         avg_temp        low_temp    
##  Member:727   Min.   : 23.00   Min.   :20.00   Min.   :13.00  
##  Casual:727   1st Qu.: 56.00   1st Qu.:47.25   1st Qu.:39.00  
##               Median : 71.00   Median :62.00   Median :53.00  
##               Mean   : 69.17   Mean   :61.02   Mean   :52.37  
##               3rd Qu.: 84.00   3rd Qu.:76.00   3rd Qu.:68.00  
##               Max.   :101.00   Max.   :91.00   Max.   :81.00  
##                                                               
##  high_dewpoint   avg_dewpoint    low_dewpoint    avg_humidity  
##  Min.   : 1.0   Min.   :-1.00   Min.   :-9.00   Min.   :27.00  
##  1st Qu.:40.0   1st Qu.:34.00   1st Qu.:26.25   1st Qu.:55.00  
##  Median :56.0   Median :50.00   Median :43.00   Median :65.00  
##  Mean   :53.2   Mean   :47.34   Mean   :41.40   Mean   :64.78  
##  3rd Qu.:68.0   3rd Qu.:64.00   3rd Qu.:59.00   3rd Qu.:74.00  
##  Max.   :80.0   Max.   :77.00   Max.   :75.00   Max.   :97.00  
##                                                                
##   low_humidity   high_sealevelpress avg_sealevelpress low_sealevelpress
##  Min.   :13.00   Min.   :29.61      Min.   :29.42     Min.   :29.18    
##  1st Qu.:34.00   1st Qu.:30.02      1st Qu.:29.93     1st Qu.:29.84    
##  Median :42.00   Median :30.13      Median :30.05     Median :29.96    
##  Mean   :45.29   Mean   :30.15      Mean   :30.05     Mean   :29.95    
##  3rd Qu.:54.00   3rd Qu.:30.27      3rd Qu.:30.18     3rd Qu.:30.09    
##  Max.   :93.00   Max.   :30.80      Max.   :30.72     Max.   :30.64    
##                                                                        
##  high_visibility    low_wind        avg_wind        high_wind    
##  Min.   :10      Min.   : 0.00   Min.   : 0.000   Min.   : 0.00  
##  1st Qu.:10      1st Qu.:15.00   1st Qu.: 6.000   1st Qu.:18.00  
##  Median :10      Median :18.00   Median : 8.000   Median :23.00  
##  Mean   :10      Mean   :19.51   Mean   : 8.779   Mean   :24.65  
##  3rd Qu.:10      3rd Qu.:23.00   3rd Qu.:11.000   3rd Qu.:29.75  
##  Max.   :10      Max.   :52.00   Max.   :22.000   Max.   :65.00  
##                                                                  
##  log_high_humidity  log_avg_visi    log_low_visi       log_precip     
##  Min.   :3.689     Min.   :1.099   Min.   :-18.421   Min.   :-18.421  
##  1st Qu.:4.321     1st Qu.:2.303   1st Qu.:  1.609   1st Qu.:-18.421  
##  Median :4.466     Median :2.303   Median :  2.303   Median :-18.421  
##  Mean   :4.409     Mean   :2.236   Mean   :  1.188   Mean   :-11.582  
##  3rd Qu.:4.533     3rd Qu.:2.303   3rd Qu.:  2.303   3rd Qu.: -3.507  
##  Max.   :4.605     Max.   :2.303   Max.   :  2.303   Max.   :  1.197  
## 


#Cross-Validation Data

set.seed(1)
n <- nrow(bike_weather)
tr <- sample(n,0.8*n)
te <- (-tr)
tr_bike_weather <- bike_weather[tr,]
te_bike_weather <- bike_weather[te,]

Methods

Feature Selection

1. Subset Selection Method

Multicolinearity

tr_bike_weather<- tr_bike_weather %>% 
 mutate(season=as_factor(season),
         high_visibility=as.double(high_visibility))
summary(tr_bike_weather[,-1]) # not include date
##      count         year        month.x    weekday    season   
##  Min.   :    3   2016:586   4      :105   1:178   fall  :292  
##  1st Qu.: 1970   2017:577   8      :104   2:161   spring:294  
##  Median : 4328              12     :102   3:161   summer:299  
##  Mean   : 4928              7      :101   4:168   winter:278  
##  3rd Qu.: 7629              10     : 98   5:164               
##  Max.   :11832              3      : 96   6:162               
##                             (Other):557   7:169               
##  Member_type    high_temp         avg_temp        low_temp    
##  Member:592   Min.   : 23.00   Min.   :20.00   Min.   :13.00  
##  Casual:571   1st Qu.: 56.00   1st Qu.:48.00   1st Qu.:39.00  
##               Median : 71.00   Median :63.00   Median :53.00  
##               Mean   : 69.21   Mean   :61.08   Mean   :52.46  
##               3rd Qu.: 84.00   3rd Qu.:76.00   3rd Qu.:68.00  
##               Max.   :101.00   Max.   :91.00   Max.   :81.00  
##                                                               
##  high_dewpoint    avg_dewpoint    low_dewpoint    avg_humidity  
##  Min.   : 1.00   Min.   :-1.00   Min.   :-9.00   Min.   :27.00  
##  1st Qu.:40.00   1st Qu.:34.00   1st Qu.:27.00   1st Qu.:55.00  
##  Median :56.00   Median :50.00   Median :43.00   Median :65.00  
##  Mean   :53.26   Mean   :47.41   Mean   :41.53   Mean   :64.84  
##  3rd Qu.:68.00   3rd Qu.:64.00   3rd Qu.:59.00   3rd Qu.:74.00  
##  Max.   :80.00   Max.   :77.00   Max.   :75.00   Max.   :97.00  
##                                                                 
##   low_humidity   high_sealevelpress avg_sealevelpress low_sealevelpress
##  Min.   :13.00   Min.   :29.61      Min.   :29.42     Min.   :29.18    
##  1st Qu.:34.00   1st Qu.:30.02      1st Qu.:29.93     1st Qu.:29.84    
##  Median :43.00   Median :30.13      Median :30.05     Median :29.96    
##  Mean   :45.49   Mean   :30.15      Mean   :30.05     Mean   :29.96    
##  3rd Qu.:54.00   3rd Qu.:30.27      3rd Qu.:30.18     3rd Qu.:30.09    
##  Max.   :93.00   Max.   :30.80      Max.   :30.72     Max.   :30.64    
##                                                                        
##  high_visibility    low_wind        avg_wind        high_wind    
##  Min.   :10      Min.   : 0.00   Min.   : 0.000   Min.   : 0.00  
##  1st Qu.:10      1st Qu.:15.00   1st Qu.: 6.000   1st Qu.:18.00  
##  Median :10      Median :18.00   Median : 8.000   Median :23.00  
##  Mean   :10      Mean   :19.48   Mean   : 8.776   Mean   :24.61  
##  3rd Qu.:10      3rd Qu.:23.00   3rd Qu.:11.000   3rd Qu.:29.50  
##  Max.   :10      Max.   :52.00   Max.   :22.000   Max.   :65.00  
##                                                                  
##  log_high_humidity  log_avg_visi    log_low_visi       log_precip     
##  Min.   :3.689     Min.   :1.386   Min.   :-18.421   Min.   :-18.421  
##  1st Qu.:4.317     1st Qu.:2.303   1st Qu.:  1.609   1st Qu.:-18.421  
##  Median :4.466     Median :2.303   Median :  2.303   Median :-18.421  
##  Mean   :4.409     Mean   :2.236   Mean   :  1.220   Mean   :-11.570  
##  3rd Qu.:4.533     3rd Qu.:2.303   3rd Qu.:  2.303   3rd Qu.: -3.507  
##  Max.   :4.605     Max.   :2.303   Max.   :  2.303   Max.   :  1.197  
## 
# check for multicolinearity
#remove date & high visibility(no variation)
cor(bike_weather[,unlist(lapply(tr_bike_weather,is.numeric))])
## Warning in cor(bike_weather[, unlist(lapply(tr_bike_weather,
## is.numeric))]): the standard deviation is zero
##                           count   high_temp    avg_temp     low_temp
## count               1.000000000  0.42064907  0.41043555  0.387375930
## high_temp           0.420649065  1.00000000  0.98659441  0.943053082
## avg_temp            0.410435552  0.98659441  1.00000000  0.984466901
## low_temp            0.387375930  0.94305308  0.98446690  1.000000000
## high_dewpoint       0.349402281  0.91515684  0.94058751  0.940579532
## avg_dewpoint        0.345029735  0.90777540  0.94365174  0.954452042
## low_dewpoint        0.340884111  0.88118808  0.92491960  0.944638183
## avg_humidity       -0.001914621  0.22686286  0.29514417  0.360113091
## low_humidity       -0.059268684  0.08533511  0.17691507  0.269874594
## high_sealevelpress -0.110671676 -0.41233090 -0.43323019 -0.441539587
## avg_sealevelpress  -0.034749387 -0.28025717 -0.28935864 -0.289239163
## low_sealevelpress   0.027618690 -0.14720256 -0.14724584 -0.141596283
## high_visibility              NA          NA          NA           NA
## low_wind           -0.115871466 -0.09782625 -0.12421583 -0.148289849
## avg_wind           -0.125820548 -0.19870087 -0.20561687 -0.207020597
## high_wind          -0.108648709 -0.09149265 -0.11988412 -0.147134365
## log_high_humidity   0.081289022  0.36706901  0.39110658  0.406471313
## log_avg_visi        0.208323834  0.20944113  0.16386387  0.110168848
## log_low_visi        0.113742350  0.07742769  0.06781064  0.056066141
## log_precip         -0.170071814 -0.08067887 -0.03945752  0.005897213
##                    high_dewpoint avg_dewpoint  low_dewpoint avg_humidity
## count               0.3494022807   0.34502974  0.3408841113 -0.001914621
## high_temp           0.9151568447   0.90777540  0.8811880801  0.226862864
## avg_temp            0.9405875093   0.94365174  0.9249195966  0.295144172
## low_temp            0.9405795324   0.95445204  0.9446381827  0.360113091
## high_dewpoint       1.0000000000   0.97836809  0.9380622675  0.542223264
## avg_dewpoint        0.9783680936   1.00000000  0.9805740570  0.577339608
## low_dewpoint        0.9380622675   0.98057406  1.0000000000  0.573073525
## avg_humidity        0.5422232637   0.57733961  0.5730735249  1.000000000
## low_humidity        0.3924351682   0.45174509  0.4865148315  0.904087272
## high_sealevelpress -0.4617400167  -0.46384851 -0.4332848435 -0.312487920
## avg_sealevelpress  -0.3567399420  -0.33320233 -0.2780197182 -0.288575865
## low_sealevelpress  -0.2404270871  -0.19433157 -0.1326563760 -0.239584840
## high_visibility               NA           NA            NA           NA
## low_wind           -0.1133001421  -0.18110686 -0.2351953667 -0.195171036
## avg_wind           -0.1916503099  -0.25514087 -0.2953611382 -0.206553690
## high_wind          -0.1186927167  -0.18618143 -0.2405048835 -0.213130741
## log_high_humidity   0.6155524363   0.61329909  0.5701900610  0.869670930
## log_avg_visi       -0.0008144368  -0.01908361  0.0008298162 -0.490183345
## log_low_visi       -0.0380965343  -0.03459520 -0.0064276510 -0.273675727
## log_precip          0.1575211771   0.13669014  0.0932913717  0.513859184
##                    low_humidity high_sealevelpress avg_sealevelpress
## count               -0.05926868        -0.11067168       -0.03474939
## high_temp            0.08533511        -0.41233090       -0.28025717
## avg_temp             0.17691507        -0.43323019       -0.28935864
## low_temp             0.26987459        -0.44153959       -0.28923916
## high_dewpoint        0.39243517        -0.46174002       -0.35673994
## avg_dewpoint         0.45174509        -0.46384851       -0.33320233
## low_dewpoint         0.48651483        -0.43328484       -0.27801972
## avg_humidity         0.90408727        -0.31248792       -0.28857586
## low_humidity         1.00000000        -0.23909188       -0.20992396
## high_sealevelpress  -0.23909188         1.00000000        0.94372340
## avg_sealevelpress   -0.20992396         0.94372340        1.00000000
## low_sealevelpress   -0.17123405         0.82702244        0.95511545
## high_visibility              NA                 NA                NA
## low_wind            -0.19715345        -0.20464863       -0.36033332
## avg_wind            -0.15287332        -0.10963188       -0.26997526
## high_wind           -0.21513699        -0.22856984       -0.37853707
## log_high_humidity    0.59478972        -0.33039684       -0.30921056
## log_avg_visi        -0.47856745         0.10923291        0.18010102
## log_low_visi        -0.24143610         0.09475535        0.15991948
## log_precip           0.48386652        -0.28181585       -0.36336305
##                    low_sealevelpress high_visibility    low_wind
## count                     0.02761869              NA -0.11587147
## high_temp                -0.14720256              NA -0.09782625
## avg_temp                 -0.14724584              NA -0.12421583
## low_temp                 -0.14159628              NA -0.14828985
## high_dewpoint            -0.24042709              NA -0.11330014
## avg_dewpoint             -0.19433157              NA -0.18110686
## low_dewpoint             -0.13265638              NA -0.23519537
## avg_humidity             -0.23958484              NA -0.19517104
## low_humidity             -0.17123405              NA -0.19715345
## high_sealevelpress        0.82702244              NA -0.20464863
## avg_sealevelpress         0.95511545              NA -0.36033332
## low_sealevelpress         1.00000000              NA -0.45240395
## high_visibility                   NA               1          NA
## low_wind                 -0.45240395              NA  1.00000000
## avg_wind                 -0.36762349              NA  0.76390238
## high_wind                -0.46669349              NA  0.95378846
## log_high_humidity        -0.25644712              NA -0.15638647
## log_avg_visi              0.20283081              NA -0.01523901
## log_low_visi              0.19087107              NA -0.11268420
## log_precip               -0.39387949              NA  0.25315596
##                       avg_wind   high_wind log_high_humidity  log_avg_visi
## count              -0.12582055 -0.10864871        0.08128902  0.2083238336
## high_temp          -0.19870087 -0.09149265        0.36706901  0.2094411328
## avg_temp           -0.20561687 -0.11988412        0.39110658  0.1638638731
## low_temp           -0.20702060 -0.14713436        0.40647131  0.1101688483
## high_dewpoint      -0.19165031 -0.11869272        0.61555244 -0.0008144368
## avg_dewpoint       -0.25514087 -0.18618143        0.61329909 -0.0190836119
## low_dewpoint       -0.29536114 -0.24050488        0.57019006  0.0008298162
## avg_humidity       -0.20655369 -0.21313074        0.86967093 -0.4901833447
## low_humidity       -0.15287332 -0.21513699        0.59478972 -0.4785674522
## high_sealevelpress -0.10963188 -0.22856984       -0.33039684  0.1092329090
## avg_sealevelpress  -0.26997526 -0.37853707       -0.30921056  0.1801010184
## low_sealevelpress  -0.36762349 -0.46669349       -0.25644712  0.2028308056
## high_visibility             NA          NA                NA            NA
## low_wind            0.76390238  0.95378846       -0.15638647 -0.0152390053
## avg_wind            1.00000000  0.74587302       -0.23224114 -0.0245489212
## high_wind           0.74587302  1.00000000       -0.17192167 -0.0152047422
## log_high_humidity  -0.23224114 -0.17192167        1.00000000 -0.3239267236
## log_avg_visi       -0.02454892 -0.01520474       -0.32392672  1.0000000000
## log_low_visi       -0.01265291 -0.13002408       -0.20887048  0.5817612420
## log_precip          0.15241722  0.22518332        0.41900816 -0.4530559098
##                    log_low_visi   log_precip
## count               0.113742350 -0.170071814
## high_temp           0.077427687 -0.080678869
## avg_temp            0.067810643 -0.039457516
## low_temp            0.056066141  0.005897213
## high_dewpoint      -0.038096534  0.157521177
## avg_dewpoint       -0.034595204  0.136690141
## low_dewpoint       -0.006427651  0.093291372
## avg_humidity       -0.273675727  0.513859184
## low_humidity       -0.241436102  0.483866517
## high_sealevelpress  0.094755348 -0.281815855
## avg_sealevelpress   0.159919476 -0.363363052
## low_sealevelpress   0.190871070 -0.393879486
## high_visibility              NA           NA
## low_wind           -0.112684201  0.253155964
## avg_wind           -0.012652909  0.152417221
## high_wind          -0.130024075  0.225183322
## log_high_humidity  -0.208870478  0.419008160
## log_avg_visi        0.581761242 -0.453055910
## log_low_visi        1.000000000 -0.306933506
## log_precip         -0.306933506  1.000000000
#try to drop highs and lows to avoid multicolinearity
dropped_bike_weather <- tr_bike_weather %>% 
  dplyr::select(-date,-high_temp,-low_temp,-high_dewpoint,-low_dewpoint,-low_humidity,-high_sealevelpress,-low_sealevelpress,-high_visibility,-high_wind,-low_wind,-log_high_humidity,-log_avg_visi,-season)
corr1 <- cor(dropped_bike_weather[,unlist(lapply(dropped_bike_weather,is.numeric))])
corrplot::corrplot(corr1,method = "number",type="upper")

dropped_bike_weather<-dropped_bike_weather %>% 
  dplyr::select(-avg_dewpoint)

stepwise selection

# remove date
dropped_bike_weather <- dropped_bike_weather[,-1]

Automatic methods and Cross-validation

# Forward Stepwise Selection
set.seed(1)
# using the sample() function to split the set of observations into two halves
n <- nrow(dropped_bike_weather)
train <- sample(n,n/2)
# select training data
train_data <- dropped_bike_weather[train,]
# select test data
test_data <- dropped_bike_weather[-train,]

# Full model
FullModel <- lm(count~.,data=train_data)
# Forward Stepwise Selection
fwd_selected <- step(FullModel, direction="forward", test="F")
## Start:  AIC=8583.69
## count ~ year + month.x + weekday + Member_type + avg_temp + avg_humidity + 
##     avg_sealevelpress + avg_wind + log_low_visi + log_precip
fwd_model <- lm(count ~ year + month.x + weekday + Member_type + avg_temp + 
    avg_humidity + avg_sealevelpress + avg_wind + log_low_visi + 
    log_precip, data = train_data)
# get the estimated test MSE for the linear regression fit
rmse(fwd_model,data=test_data) ^2 # 2556469
## [1] 2722660
# Backward Stepwise Selection
bck_selected <- step(FullModel, direction="backward", test="F")
## Start:  AIC=8583.69
## count ~ year + month.x + weekday + Member_type + avg_temp + avg_humidity + 
##     avg_sealevelpress + avg_wind + log_low_visi + log_precip
## 
##                     Df  Sum of Sq        RSS    AIC   F value    Pr(>F)
## - weekday            6   22255256 1407631755 8580.9    1.4860  0.180633
## - log_low_visi       1     210781 1385587279 8581.8    0.0844  0.771475
## - avg_sealevelpress  1    3683278 1389059777 8583.2    1.4756  0.224986
## <none>                            1385376499 8583.7                    
## - log_precip         1   14750117 1400126615 8587.8    5.9091  0.015379
## - avg_humidity       1   16330415 1401706914 8588.5    6.5422  0.010799
## - year               1   19114864 1404491363 8589.6    7.6577  0.005842
## - month.x           11   69557947 1454934446 8590.1    2.5333  0.003986
## - avg_wind           1   21118706 1406495204 8590.5    8.4604  0.003775
## - avg_temp           1  211250197 1596626695 8664.1   84.6296 < 2.2e-16
## - Member_type        1 3584990965 4970367464 9323.9 1436.1944 < 2.2e-16
##                        
## - weekday              
## - log_low_visi         
## - avg_sealevelpress    
## <none>                 
## - log_precip        *  
## - avg_humidity      *  
## - year              ** 
## - month.x           ** 
## - avg_wind          ** 
## - avg_temp          ***
## - Member_type       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=8580.95
## count ~ year + month.x + Member_type + avg_temp + avg_humidity + 
##     avg_sealevelpress + avg_wind + log_low_visi + log_precip
## 
##                     Df  Sum of Sq        RSS    AIC   F value    Pr(>F)
## - log_low_visi       1     418286 1408050041 8579.1    0.1667  0.683213
## - avg_sealevelpress  1    2598700 1410230454 8580.0    1.0357  0.309265
## <none>                            1407631755 8580.9                    
## - avg_humidity       1   13627154 1421258909 8584.5    5.4310  0.020136
## - log_precip         1   17960378 1425592133 8586.3    7.1580  0.007681
## - year               1   18879888 1426511643 8586.7    7.5244  0.006281
## - avg_wind           1   20396140 1428027895 8587.3    8.1287  0.004517
## - month.x           11   73219811 1480851565 8588.4    2.6528  0.002549
## - avg_temp           1  223035656 1630667410 8664.4   88.8890 < 2.2e-16
## - Member_type        1 3624367150 5031998905 9319.1 1444.4616 < 2.2e-16
##                        
## - log_low_visi         
## - avg_sealevelpress    
## <none>                 
## - avg_humidity      *  
## - log_precip        ** 
## - year              ** 
## - avg_wind          ** 
## - month.x           ** 
## - avg_temp          ***
## - Member_type       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=8579.12
## count ~ year + month.x + Member_type + avg_temp + avg_humidity + 
##     avg_sealevelpress + avg_wind + log_precip
## 
##                     Df  Sum of Sq        RSS    AIC   F value    Pr(>F)
## - avg_sealevelpress  1    2698419 1410748459 8578.2    1.0770  0.299809
## <none>                            1408050041 8579.1                    
## - avg_humidity       1   15208133 1423258174 8583.4    6.0701  0.014047
## - log_precip         1   18988321 1427038361 8584.9    7.5789  0.006097
## - year               1   19714732 1427764773 8585.2    7.8688  0.005204
## - avg_wind           1   20437351 1428487392 8585.5    8.1572  0.004448
## - month.x           11   75242505 1483292545 8587.4    2.7302  0.001905
## - avg_temp           1  224228370 1632278411 8663.0   89.4971 < 2.2e-16
## - Member_type        1 3626046688 5034096729 9317.3 1447.2769 < 2.2e-16
##                        
## - avg_sealevelpress    
## <none>                 
## - avg_humidity      *  
## - log_precip        ** 
## - year              ** 
## - avg_wind          ** 
## - month.x           ** 
## - avg_temp          ***
## - Member_type       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=8578.23
## count ~ year + month.x + Member_type + avg_temp + avg_humidity + 
##     avg_wind + log_precip
## 
##                Df  Sum of Sq        RSS    AIC   F value    Pr(>F)    
## <none>                       1410748459 8578.2                        
## - avg_humidity  1   16678124 1427426583 8583.1    6.6559 0.0101352 *  
## - year          1   19600731 1430349190 8584.2    7.8222 0.0053371 ** 
## - log_precip    1   23464795 1434213254 8585.8    9.3643 0.0023179 ** 
## - month.x      11   80557424 1491305884 8588.5    2.9226 0.0009128 ***
## - avg_wind      1   30777004 1441525463 8588.8   12.2825 0.0004936 ***
## - avg_temp      1  240484175 1651232634 8667.7   95.9722 < 2.2e-16 ***
## - Member_type   1 3623449240 5034197699 9315.3 1446.0423 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
bck_model <- lm(count ~ year + month.x + Member_type + avg_temp + avg_humidity + avg_wind+log_precip, data = train_data)
# get the estimated test MSE for the linear regression fit
rmse(bck_model,data=test_data) ^2 #2577751
## [1] 2773779
# Exhaustive Stepwise Selection
both_selected <- step(FullModel, direction="both", test="F")
## Start:  AIC=8583.69
## count ~ year + month.x + weekday + Member_type + avg_temp + avg_humidity + 
##     avg_sealevelpress + avg_wind + log_low_visi + log_precip
## 
##                     Df  Sum of Sq        RSS    AIC   F value    Pr(>F)
## - weekday            6   22255256 1407631755 8580.9    1.4860  0.180633
## - log_low_visi       1     210781 1385587279 8581.8    0.0844  0.771475
## - avg_sealevelpress  1    3683278 1389059777 8583.2    1.4756  0.224986
## <none>                            1385376499 8583.7                    
## - log_precip         1   14750117 1400126615 8587.8    5.9091  0.015379
## - avg_humidity       1   16330415 1401706914 8588.5    6.5422  0.010799
## - year               1   19114864 1404491363 8589.6    7.6577  0.005842
## - month.x           11   69557947 1454934446 8590.1    2.5333  0.003986
## - avg_wind           1   21118706 1406495204 8590.5    8.4604  0.003775
## - avg_temp           1  211250197 1596626695 8664.1   84.6296 < 2.2e-16
## - Member_type        1 3584990965 4970367464 9323.9 1436.1944 < 2.2e-16
##                        
## - weekday              
## - log_low_visi         
## - avg_sealevelpress    
## <none>                 
## - log_precip        *  
## - avg_humidity      *  
## - year              ** 
## - month.x           ** 
## - avg_wind          ** 
## - avg_temp          ***
## - Member_type       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=8580.95
## count ~ year + month.x + Member_type + avg_temp + avg_humidity + 
##     avg_sealevelpress + avg_wind + log_low_visi + log_precip
## 
##                     Df  Sum of Sq        RSS    AIC   F value    Pr(>F)
## - log_low_visi       1     418286 1408050041 8579.1    0.1667  0.683213
## - avg_sealevelpress  1    2598700 1410230454 8580.0    1.0357  0.309265
## <none>                            1407631755 8580.9                    
## + weekday            6   22255256 1385376499 8583.7    1.4860  0.180633
## - avg_humidity       1   13627154 1421258909 8584.5    5.4310  0.020136
## - log_precip         1   17960378 1425592133 8586.3    7.1580  0.007681
## - year               1   18879888 1426511643 8586.7    7.5244  0.006281
## - avg_wind           1   20396140 1428027895 8587.3    8.1287  0.004517
## - month.x           11   73219811 1480851565 8588.4    2.6528  0.002549
## - avg_temp           1  223035656 1630667410 8664.4   88.8890 < 2.2e-16
## - Member_type        1 3624367150 5031998905 9319.1 1444.4616 < 2.2e-16
##                        
## - log_low_visi         
## - avg_sealevelpress    
## <none>                 
## + weekday              
## - avg_humidity      *  
## - log_precip        ** 
## - year              ** 
## - avg_wind          ** 
## - month.x           ** 
## - avg_temp          ***
## - Member_type       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=8579.12
## count ~ year + month.x + Member_type + avg_temp + avg_humidity + 
##     avg_sealevelpress + avg_wind + log_precip
## 
##                     Df  Sum of Sq        RSS    AIC   F value    Pr(>F)
## - avg_sealevelpress  1    2698419 1410748459 8578.2    1.0770  0.299809
## <none>                            1408050041 8579.1                    
## + log_low_visi       1     418286 1407631755 8580.9    0.1667  0.683213
## + weekday            6   22462762 1385587279 8581.8    1.5023  0.175073
## - avg_humidity       1   15208133 1423258174 8583.4    6.0701  0.014047
## - log_precip         1   18988321 1427038361 8584.9    7.5789  0.006097
## - year               1   19714732 1427764773 8585.2    7.8688  0.005204
## - avg_wind           1   20437351 1428487392 8585.5    8.1572  0.004448
## - month.x           11   75242505 1483292545 8587.4    2.7302  0.001905
## - avg_temp           1  224228370 1632278411 8663.0   89.4971 < 2.2e-16
## - Member_type        1 3626046688 5034096729 9317.3 1447.2769 < 2.2e-16
##                        
## - avg_sealevelpress    
## <none>                 
## + log_low_visi         
## + weekday              
## - avg_humidity      *  
## - log_precip        ** 
## - year              ** 
## - avg_wind          ** 
## - month.x           ** 
## - avg_temp          ***
## - Member_type       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=8578.23
## count ~ year + month.x + Member_type + avg_temp + avg_humidity + 
##     avg_wind + log_precip
## 
##                     Df  Sum of Sq        RSS    AIC   F value    Pr(>F)
## <none>                            1410748459 8578.2                    
## + avg_sealevelpress  1    2698419 1408050041 8579.1    1.0770 0.2998094
## + log_low_visi       1     518005 1410230454 8580.0    0.2064 0.6497526
## + weekday            6   21368504 1389379955 8581.4    1.4278 0.2016690
## - avg_humidity       1   16678124 1427426583 8583.1    6.6559 0.0101352
## - year               1   19600731 1430349190 8584.2    7.8222 0.0053371
## - log_precip         1   23464795 1434213254 8585.8    9.3643 0.0023179
## - month.x           11   80557424 1491305884 8588.5    2.9226 0.0009128
## - avg_wind           1   30777004 1441525463 8588.8   12.2825 0.0004936
## - avg_temp           1  240484175 1651232634 8667.7   95.9722 < 2.2e-16
## - Member_type        1 3623449240 5034197699 9315.3 1446.0423 < 2.2e-16
##                        
## <none>                 
## + avg_sealevelpress    
## + log_low_visi         
## + weekday              
## - avg_humidity      *  
## - year              ** 
## - log_precip        ** 
## - month.x           ***
## - avg_wind          ***
## - avg_temp          ***
## - Member_type       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
both_model <- lm(count ~ year + month.x + Member_type + avg_temp + avg_humidity + avg_wind + log_precip, data = train_data)
# get the estimated test MSE for the linear regression fit
rmse(both_model,data=test_data) ^2 #  2577751
## [1] 2773779

All-possible regression approach

# Forward stepwise selection
regfit_fwd <- dropped_bike_weather[,-5] %>% 
  regsubsets(count~., data =., nvmax=10, method = "forward")
regfit_fwd_summary <- regfit_fwd %>% 
  summary()

fwd_fit_dat<-tibble(
  num_pred=1:10,
  RSS=regfit_fwd_summary$rss,
  R2=regfit_fwd_summary$rsq,
  Adj_R2=regfit_fwd_summary$adjr2,
  Cp=regfit_fwd_summary$cp) %>% 
#reshape the data for plotting 
  gather(key="statistic",value="value",-num_pred)

fwd_fit_dat %>% 
  filter(statistic!="R2") %>% 
  ggplot(aes(x=num_pred,y=value))+
  geom_point()+
  geom_line()+
  facet_wrap(~statistic,ncol=3,scales="free")

# Backward stepwise selection
regfit_bck <- dropped_bike_weather[,-5] %>% 
  regsubsets(count~., data =., nvmax=10, method = "backward")
regfit_bck_summary <- regfit_bck %>% 
  summary()

bck_fit_dat<-tibble(
  num_pred=1:10,
  RSS=regfit_bck_summary$rss,
  R2=regfit_bck_summary$rsq,
  Adj_R2=regfit_bck_summary$adjr2,
  Cp=regfit_bck_summary$cp) %>% 
#reshape the data for plotting 
  gather(key="statistic",value="value",-num_pred)

bck_fit_dat %>% 
  filter(statistic!="R2") %>% 
  ggplot(aes(x=num_pred,y=value))+
  geom_point()+
  geom_line()+
  facet_wrap(~statistic,ncol=3,scales="free")

# Exhaustive stepwise selection
regfit_ex <- dropped_bike_weather[,-5] %>% 
  regsubsets(count~., data =., nvmax=10, method = "exhaustive")
regfit_ex_summary <- regfit_ex %>% 
  summary()

ex_fit_dat<-tibble(
  num_pred=1:10,
  RSS=regfit_ex_summary$rss,
  R2=regfit_ex_summary$rsq,
  Adj_R2=regfit_ex_summary$adjr2,
  Cp=regfit_ex_summary$cp) %>% 
#reshape the data for plotting 
  gather(key="statistic",value="value",-num_pred)

ex_fit_dat %>% 
  filter(statistic!="R2") %>% 
  ggplot(aes(x=num_pred,y=value))+
  geom_point()+
  geom_line()+
  facet_wrap(~statistic,ncol=3,scales="free")


Selected Model from stepwise feature selection

step_fit <- lm(count~.,data=dropped_bike_weather)
step_fit %>% 
  summary()
## 
## Call:
## lm(formula = count ~ ., data = dropped_bike_weather)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5102.4 -1123.9   -73.8  1104.2  6142.4 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.131e+04  9.128e+03  -1.240 0.215388    
## year2017           6.245e+02  9.420e+01   6.629 5.20e-11 ***
## month.x2           2.970e+02  2.403e+02   1.236 0.216785    
## month.x3           6.833e+02  2.559e+02   2.670 0.007686 ** 
## month.x4           9.146e+02  2.759e+02   3.315 0.000947 ***
## month.x5           5.772e+02  2.873e+02   2.009 0.044797 *  
## month.x6           5.345e+02  3.396e+02   1.574 0.115770    
## month.x7          -5.502e+00  3.612e+02  -0.015 0.987851    
## month.x8          -2.634e+02  3.537e+02  -0.745 0.456658    
## month.x9           3.871e+02  3.303e+02   1.172 0.241570    
## month.x10          1.134e+03  2.886e+02   3.930 9.00e-05 ***
## month.x11          6.337e+02  2.548e+02   2.487 0.013032 *  
## month.x12          5.528e-01  2.336e+02   0.002 0.998113    
## weekday2           4.215e+02  1.744e+02   2.417 0.015824 *  
## weekday3           6.945e+02  1.743e+02   3.984 7.20e-05 ***
## weekday4           7.256e+02  1.728e+02   4.199 2.89e-05 ***
## weekday5           5.755e+02  1.738e+02   3.311 0.000960 ***
## weekday6           5.979e+02  1.741e+02   3.434 0.000617 ***
## weekday7           4.239e+02  1.721e+02   2.463 0.013917 *  
## Member_typeCasual -4.992e+03  9.375e+01 -53.253  < 2e-16 ***
## avg_temp           8.850e+01  6.742e+00  13.127  < 2e-16 ***
## avg_humidity      -2.196e+01  4.886e+00  -4.495 7.68e-06 ***
## avg_sealevelpress  4.477e+02  2.976e+02   1.504 0.132812    
## avg_wind          -4.864e+01  1.686e+01  -2.885 0.003988 ** 
## log_low_visi       1.523e+01  1.384e+01   1.100 0.271513    
## log_precip        -4.020e+01  8.315e+00  -4.834 1.52e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1595 on 1137 degrees of freedom
## Multiple R-squared:  0.7815, Adjusted R-squared:  0.7767 
## F-statistic: 162.7 on 25 and 1137 DF,  p-value: < 2.2e-16
par(mfrow=c(2,2))
step_fit %>% 
  plot()

Beyond linearity

set.seed(1)
# use cross validation to choose dfs
df<- rep(0,6)
val<- list(dropped_bike_weather$avg_temp,dropped_bike_weather$avg_humidity,dropped_bike_weather$avg_sealevelpress,dropped_bike_weather$avg_wind,dropped_bike_weather$log_low_visi,dropped_bike_weather$log_precip)
for (i in 1:6){
fit <- smooth.spline(val[[i]],dropped_bike_weather$count,cv=TRUE)
df[i]<-fit$df
}
df
## [1] 5.596995 4.632338 4.345706 4.075545 2.843863 3.577682

Selected gam Model

gam_fit_s<-dropped_bike_weather %>%
  gam(count ~ year + month.x + weekday + Member_type + s(avg_temp, 5.596995) + s(avg_humidity, 4.632338) + s(avg_sealevelpress, 4.345706) + s(avg_wind, 4.075545) + s(log_low_visi, 2.843863) + s(log_precip, 3.577682), data =.)

par(mfrow=c(2,3))
plot.Gam(gam_fit_s,se=T,col="red")

summary(gam_fit_s)
## 
## Call: gam(formula = count ~ year + month.x + weekday + Member_type + 
##     s(avg_temp, 5.596995) + s(avg_humidity, 4.632338) + s(avg_sealevelpress, 
##     4.345706) + s(avg_wind, 4.075545) + s(log_low_visi, 2.843863) + 
##     s(log_precip, 3.577682), data = .)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -4620.97 -1174.62    20.22  1068.94  5753.37 
## 
## (Dispersion Parameter for gaussian family taken to be 2307531)
## 
##     Null Deviance: 13247841901 on 1162 degrees of freedom
## Residual Deviance: 2579653955 on 1117.928 degrees of freedom
## AIC: 20386.54 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##                                    Df     Sum Sq    Mean Sq   F value
## year                              1.0   48864563   48864563   21.1761
## month.x                          11.0 2173311640  197573785   85.6213
## weekday                           6.0   84227164   14037861    6.0835
## Member_type                       1.0 7184285774 7184285774 3113.4081
## s(avg_temp, 5.596995)             1.0  259391539  259391539  112.4109
## s(avg_humidity, 4.632338)         1.0  289228350  289228350  125.3410
## s(avg_sealevelpress, 4.345706)    1.0   63728353   63728353   27.6175
## s(avg_wind, 4.075545)             1.0   30970088   30970088   13.4213
## s(log_low_visi, 2.843863)         1.0     127465     127465    0.0552
## s(log_precip, 3.577682)           1.0   99069806   99069806   42.9333
## Residuals                      1117.9 2579653955    2307531          
##                                   Pr(>F)    
## year                           4.669e-06 ***
## month.x                        < 2.2e-16 ***
## weekday                        2.779e-06 ***
## Member_type                    < 2.2e-16 ***
## s(avg_temp, 5.596995)          < 2.2e-16 ***
## s(avg_humidity, 4.632338)      < 2.2e-16 ***
## s(avg_sealevelpress, 4.345706) 1.770e-07 ***
## s(avg_wind, 4.075545)          0.0002604 ***
## s(log_low_visi, 2.843863)      0.8142292    
## s(log_precip, 3.577682)        8.626e-11 ***
## Residuals                                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                                Npar Df  Npar F     Pr(F)    
## (Intercept)                                                 
## year                                                        
## month.x                                                     
## weekday                                                     
## Member_type                                                 
## s(avg_temp, 5.596995)              4.6 16.7123 6.883e-15 ***
## s(avg_humidity, 4.632338)          3.6  3.2545   0.01434 *  
## s(avg_sealevelpress, 4.345706)     3.3  1.4433   0.22491    
## s(avg_wind, 4.075545)              3.1  0.5802   0.63229    
## s(log_low_visi, 2.843863)          1.8  1.0020   0.36222    
## s(log_precip, 3.577682)            2.6 15.5666 7.303e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

not all smoothing

gam_fit_sl<-dropped_bike_weather %>%
  gam(count ~ year + month.x + weekday + Member_type + s(avg_temp, 33.350675) + s(avg_humidity, 4.355668) + avg_sealevelpress + avg_wind + log_low_visi + s(log_precip, 3.973716), data =.)
par(mfrow=c(2,3))
plot.Gam(gam_fit_sl,se=T,col="orange")

summary(gam_fit_sl)
## 
## Call: gam(formula = count ~ year + month.x + weekday + Member_type + 
##     s(avg_temp, 33.350675) + s(avg_humidity, 4.355668) + avg_sealevelpress + 
##     avg_wind + log_low_visi + s(log_precip, 3.973716), data = .)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -4157.41 -1213.79    33.24  1114.55  5637.96 
## 
## (Dispersion Parameter for gaussian family taken to be 2318114)
## 
##     Null Deviance: 13247841901 on 1162 degrees of freedom
## Residual Deviance: 2546031362 on 1098.32 degrees of freedom
## AIC: 20410.5 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##                               Df     Sum Sq    Mean Sq   F value    Pr(>F)
## year                         1.0   45827531   45827531   19.7693 9.624e-06
## month.x                     11.0 2181762885  198342080   85.5618 < 2.2e-16
## weekday                      6.0   79852251   13308708    5.7412 6.789e-06
## Member_type                  1.0 7190398702 7190398702 3101.8308 < 2.2e-16
## s(avg_temp, 33.350675)       1.0  265809142  265809142  114.6661 < 2.2e-16
## s(avg_humidity, 4.355668)    1.0  271125174  271125174  116.9594 < 2.2e-16
## avg_sealevelpress            1.0   66501517   66501517   28.6878 1.035e-07
## avg_wind                     1.0   31566399   31566399   13.6173 0.0002351
## log_low_visi                 1.0      85870      85870    0.0370 0.8474129
## s(log_precip, 3.973716)      1.0   77957551   77957551   33.6297 8.709e-09
## Residuals                 1098.3 2546031362    2318114                    
##                              
## year                      ***
## month.x                   ***
## weekday                   ***
## Member_type               ***
## s(avg_temp, 33.350675)    ***
## s(avg_humidity, 4.355668) ***
## avg_sealevelpress         ***
## avg_wind                  ***
## log_low_visi                 
## s(log_precip, 3.973716)   ***
## Residuals                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                           Npar Df  Npar F     Pr(F)    
## (Intercept)                                            
## year                                                   
## month.x                                                
## weekday                                                
## Member_type                                            
## s(avg_temp, 33.350675)       32.4  3.1431 1.292e-08 ***
## s(avg_humidity, 4.355668)     3.4  3.2684   0.01661 *  
## avg_sealevelpress                                      
## avg_wind                                               
## log_low_visi                                           
## s(log_precip, 3.973716)       3.0 11.1659 3.543e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

2. Shrinkage Method

# L1 lasso
set.seed (1)
mod_matrix <- model.matrix(count~.,tr_bike_weather[,-c(1,6)])[,-1]
train <- sample(1:nrow(mod_matrix), nrow(mod_matrix)/2)
test =(-train)
response <- tr_bike_weather$count
response.test <- response[test]
#alpha=1 for lasso
lambda_grid<-10^seq(10,-2,length=100)
lasso_mod <-glmnet(mod_matrix[train,],response[train],alpha=1,lambda=lambda_grid)
par(mfrow=c(1,1))
lasso_mod %>% 
  plot()

#perform cross-validation and compute the associated test error.
cv_out_l<-cv.glmnet(mod_matrix[train,],response[train],alpha=1)
plot(cv_out_l)

best_lambda=cv_out_l$lambda.min 
best_lambda
## [1] 38.44728
lasso_pred <- predict(lasso_mod,s=best_lambda,newx=mod_matrix[test,])
#MSE
lasso_MSE <- mean((lasso_pred-response.test)^2)
lasso_MSE #2452066
## [1] 2840073
lasso_coef <- cv_out_l %>% 
  predict(.,type ="coefficients",s=best_lambda)
lasso_coef
## 39 x 1 sparse Matrix of class "dgCMatrix"
##                                1
## (Intercept)        -1.118614e+04
## year2017            2.432025e+02
## month.x2            .           
## month.x3            1.905635e+02
## month.x4            3.962642e+02
## month.x5            .           
## month.x6            2.749640e+02
## month.x7            .           
## month.x8           -2.788771e+02
## month.x9            2.357465e+02
## month.x10           4.334665e+02
## month.x11           .           
## month.x12           .           
## weekday2            5.170076e-01
## weekday3            3.826092e+01
## weekday4            2.259621e+02
## weekday5            1.871988e+02
## weekday6            .           
## weekday7            .           
## Member_typeCasual  -4.964561e+03
## high_temp           7.731876e+01
## avg_temp            1.133584e-02
## low_temp            .           
## high_dewpoint       .           
## avg_dewpoint        .           
## low_dewpoint        .           
## avg_humidity        .           
## low_humidity       -4.449898e+00
## high_sealevelpress  .           
## avg_sealevelpress   3.807296e+02
## low_sealevelpress   .           
## high_visibility     .           
## low_wind            .           
## avg_wind           -2.800352e+01
## high_wind          -9.738275e+00
## log_high_humidity   .           
## log_avg_visi        8.675102e+02
## log_low_visi        .           
## log_precip         -2.056450e+01

Selected Model from lasso regression

lasso_fit <- glmnet(mod_matrix,response,alpha=1,lambda = best_lambda)
lasso_fit %>% 
  summary()
##           Length Class     Mode   
## a0         1     -none-    numeric
## beta      38     dgCMatrix S4     
## df         1     -none-    numeric
## dim        2     -none-    numeric
## lambda     1     -none-    numeric
## dev.ratio  1     -none-    numeric
## nulldev    1     -none-    numeric
## npasses    1     -none-    numeric
## jerr       1     -none-    numeric
## offset     1     -none-    logical
## call       5     -none-    call   
## nobs       1     -none-    numeric

Cross-validation result

set.seed(1)
#MSE comparison
full_model <- lm(count~., data = tr_bike_weather)
MSE_full <- rmse(full_model, data=te_bike_weather) ^2
MSE_selected <- rmse(step_fit, data=te_bike_weather) ^2 
#GAM mse
gam_pred <- predict(gam_fit_s, te_bike_weather)
gam_MSE <- mean((te_bike_weather$count-gam_pred)^2)

#GAM modified MSE
gam_pred_sl <- predict(gam_fit_sl, te_bike_weather)
gam_sl_MSE <- mean((te_bike_weather$count-gam_pred_sl)^2)

#lasso
x=model.matrix(count~.,data=te_bike_weather[,-c(1,6)])
x=x[,-1]
lasso_pred <- predict(lasso_fit, s=best_lambda, newx=x)
lasso_MSE <- mean((lasso_pred-te_bike_weather$count)^2) 

# combine
cbind(model <- c("Full Linear Model (p=23)","Subset Selected Model (p=10)","General Additive Model - all smoothing splines","General Additive Model - partial splines", "Lasso Model (λ= 38.44728)"),MSE <- c(MSE_full,MSE_selected,gam_MSE,gam_sl_MSE,lasso_MSE))
##      [,1]                                             [,2]              
## [1,] "Full Linear Model (p=23)"                       "2678817.95905591"
## [2,] "Subset Selected Model (p=10)"                   "2761878.70131835"
## [3,] "General Additive Model - all smoothing splines" "2433135.59287282"
## [4,] "General Additive Model - partial splines"       "2503048.79642685"
## [5,] "Lasso Model (λ= 38.44728)"                      "2640365.70326811"