The goal of this project is to select and validate a predictive model for the bikeshare system “CapitalShare” in Washington DC.
Datasets used in this project are from Capital Bikeshare User Data (https://www.capitalbikeshare.com/system-data), from Jan 2016 to Dec 2017 in a two-year time space, mainly focusing on variables include date, time, station, member type, duration, etc. To facilitate the analysis about user trends, we also obtained weather data in the Washington, D.C. area from Weather Underground (https://www.wunderground.com/).
Load Packages
library(tidyverse)
library(psych)
library(forcats)
library(lubridate)
library(GGally)
library(corrplot)
library(leaps)
library(glmnet)
library(MASS)
library(modelr)
library(gam)
library(plotmo)
Load datasets
quarters_raw <- c("2016Q1","2016Q2","2016Q3","2016Q4","2017Q1","2017Q2","2017Q3","2017Q4")
filenames = paste(c('2016_q1','2016_q2','2016_q3','2016_q4','2017_q1','2017_q2','2017_q3','2017_q4'), "trip_history_data.csv", sep = "_")
dfnames = paste0('bike_original',quarters_raw)
#a for loop is used to read and name 8 csv's, and assign R friendly column names
for (i in 1:8){
assign(dfnames[i],read_csv(file=filenames[i],col_names=c("Duration","Start_date","End_date","Start_station_number","Start_station","End_station_number","End_station","Bike_number","Member_type"),skip=1))
}
add_season <- function(df,x){
df %>%
add_column(season=x)
}
df_lst <- list(bike_original2016Q1,bike_original2016Q2,bike_original2016Q3,bike_original2016Q4,bike_original2017Q1,bike_original2017Q2,bike_original2017Q3,bike_original2017Q4)
seasons <- c("winter","spring","summer","fall","winter","spring","summer","fall")
quarters <- c("2016Q1","2016Q2","2016Q3","2016Q4","2017Q1","2017Q2","2017Q3","2017Q4")
for (i in 1:8) {
assign(paste0('bike_original',quarters[i]),add_season(df_lst[[i]],seasons[i]))
}
combine 8 into 1 and add year, month, day, weekday and specific date
bike <- rbind(bike_original2016Q1,bike_original2016Q2,bike_original2016Q3,bike_original2016Q4,bike_original2017Q1,bike_original2017Q2,bike_original2017Q3,bike_original2017Q4)
bike <-bike %>%
mutate(
year= year(ymd_hms(Start_date)),
month= month(ymd_hms(Start_date)),
day= day(ymd_hms(Start_date)),
weekday=wday(ymd_hms(Start_date)),
date = make_date(year,month,day)) %>%
mutate(Member_type=as_factor(Member_type))
bike<-bike %>%
mutate(year= as.factor(year),
month= as.factor(month),
weekday=as.factor(weekday),
season=as.factor(season))
create bike by day to get counts per day
bike_by_day <-
bike %>%
group_by(date,Member_type) %>%
mutate(count=n(),
total_duration_in_seconds=sum(as.numeric(Duration)*0.001)) %>%
dplyr::select(date,count,year,month,weekday,season,total_duration_in_seconds,Member_type) %>%
distinct(date,count,year,month,weekday,season,total_duration_in_seconds,Member_type)
bike_by_day<-bike_by_day %>%
as.tibble()
read in weather
weather2017 <- read_csv(file="2017WeatherData.csv") %>%
add_column(year="2017") %>%
mutate(date = make_date(year,month,day)) %>%
dplyr::select(-year)
weather2016 <-read_csv(file="2016WeatherData.csv") %>%
add_column(year="2016") %>%
mutate(date = make_date(year,month,day)) %>%
dplyr::select(-year)
#create weather tibble
weather<-as_tibble(rbind(weather2016,weather2017))
#transform percipitation
weather <- weather %>%
within(precip[precip == 'T'] <- '0.005') %>%
transform(precip=as.double(precip))
weather <- weather %>%
within(high_wind[high_wind == '-'] <- '0') %>%
transform(high_wind=as.double(high_wind))
bike_weather<-
bike_by_day%>%
left_join(weather, by = c("date" = "date")) %>%
dplyr::select(-day,-events,-month.y,-total_duration_in_seconds)
numeric_bike <- as.data.frame(bike_weather[,unlist(lapply(bike_weather,is.numeric))])
par(mfrow=c(2,3))
for (i in 1:20){
hist(numeric_bike[,i],main = names(numeric_bike)[i] )
}
#skewed variables: high humidity, avg_visibility, low_visibility, precip
#do log transformation first
bike_weather <- bike_weather %>%
mutate(log_high_humidity=log(high_humidity+0.00000001),
log_avg_visi=log(avg_visibility+0.00000001),
log_low_visi=log(low_visibility+0.00000001),
log_precip=log(precip+0.00000001)) %>%
dplyr::select(-high_humidity,-avg_visibility,-low_visibility,-precip)
bike_weather<- bike_weather %>%
mutate(season=as_factor(season),
high_visibility=as.double(high_visibility))
summary(bike_weather[,-1]) # not include date
## count year month.x weekday season
## Min. : 3 2016:724 3 :124 1:208 fall :368
## 1st Qu.: 1918 2017:730 5 :124 2:206 spring:364
## Median : 4245 7 :124 3:206 summer:368
## Mean : 4878 8 :124 4:208 winter:354
## 3rd Qu.: 7555 10 :124 5:208
## Max. :11851 12 :124 6:210
## (Other):710 7:208
## Member_type high_temp avg_temp low_temp
## Member:727 Min. : 23.00 Min. :20.00 Min. :13.00
## Casual:727 1st Qu.: 56.00 1st Qu.:47.25 1st Qu.:39.00
## Median : 71.00 Median :62.00 Median :53.00
## Mean : 69.17 Mean :61.02 Mean :52.37
## 3rd Qu.: 84.00 3rd Qu.:76.00 3rd Qu.:68.00
## Max. :101.00 Max. :91.00 Max. :81.00
##
## high_dewpoint avg_dewpoint low_dewpoint avg_humidity
## Min. : 1.0 Min. :-1.00 Min. :-9.00 Min. :27.00
## 1st Qu.:40.0 1st Qu.:34.00 1st Qu.:26.25 1st Qu.:55.00
## Median :56.0 Median :50.00 Median :43.00 Median :65.00
## Mean :53.2 Mean :47.34 Mean :41.40 Mean :64.78
## 3rd Qu.:68.0 3rd Qu.:64.00 3rd Qu.:59.00 3rd Qu.:74.00
## Max. :80.0 Max. :77.00 Max. :75.00 Max. :97.00
##
## low_humidity high_sealevelpress avg_sealevelpress low_sealevelpress
## Min. :13.00 Min. :29.61 Min. :29.42 Min. :29.18
## 1st Qu.:34.00 1st Qu.:30.02 1st Qu.:29.93 1st Qu.:29.84
## Median :42.00 Median :30.13 Median :30.05 Median :29.96
## Mean :45.29 Mean :30.15 Mean :30.05 Mean :29.95
## 3rd Qu.:54.00 3rd Qu.:30.27 3rd Qu.:30.18 3rd Qu.:30.09
## Max. :93.00 Max. :30.80 Max. :30.72 Max. :30.64
##
## high_visibility low_wind avg_wind high_wind
## Min. :10 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.:10 1st Qu.:15.00 1st Qu.: 6.000 1st Qu.:18.00
## Median :10 Median :18.00 Median : 8.000 Median :23.00
## Mean :10 Mean :19.51 Mean : 8.779 Mean :24.65
## 3rd Qu.:10 3rd Qu.:23.00 3rd Qu.:11.000 3rd Qu.:29.75
## Max. :10 Max. :52.00 Max. :22.000 Max. :65.00
##
## log_high_humidity log_avg_visi log_low_visi log_precip
## Min. :3.689 Min. :1.099 Min. :-18.421 Min. :-18.421
## 1st Qu.:4.321 1st Qu.:2.303 1st Qu.: 1.609 1st Qu.:-18.421
## Median :4.466 Median :2.303 Median : 2.303 Median :-18.421
## Mean :4.409 Mean :2.236 Mean : 1.188 Mean :-11.582
## 3rd Qu.:4.533 3rd Qu.:2.303 3rd Qu.: 2.303 3rd Qu.: -3.507
## Max. :4.605 Max. :2.303 Max. : 2.303 Max. : 1.197
##
#Cross-Validation Data
set.seed(1)
n <- nrow(bike_weather)
tr <- sample(n,0.8*n)
te <- (-tr)
tr_bike_weather <- bike_weather[tr,]
te_bike_weather <- bike_weather[te,]
tr_bike_weather<- tr_bike_weather %>%
mutate(season=as_factor(season),
high_visibility=as.double(high_visibility))
summary(tr_bike_weather[,-1]) # not include date
## count year month.x weekday season
## Min. : 3 2016:586 4 :105 1:178 fall :292
## 1st Qu.: 1970 2017:577 8 :104 2:161 spring:294
## Median : 4328 12 :102 3:161 summer:299
## Mean : 4928 7 :101 4:168 winter:278
## 3rd Qu.: 7629 10 : 98 5:164
## Max. :11832 3 : 96 6:162
## (Other):557 7:169
## Member_type high_temp avg_temp low_temp
## Member:592 Min. : 23.00 Min. :20.00 Min. :13.00
## Casual:571 1st Qu.: 56.00 1st Qu.:48.00 1st Qu.:39.00
## Median : 71.00 Median :63.00 Median :53.00
## Mean : 69.21 Mean :61.08 Mean :52.46
## 3rd Qu.: 84.00 3rd Qu.:76.00 3rd Qu.:68.00
## Max. :101.00 Max. :91.00 Max. :81.00
##
## high_dewpoint avg_dewpoint low_dewpoint avg_humidity
## Min. : 1.00 Min. :-1.00 Min. :-9.00 Min. :27.00
## 1st Qu.:40.00 1st Qu.:34.00 1st Qu.:27.00 1st Qu.:55.00
## Median :56.00 Median :50.00 Median :43.00 Median :65.00
## Mean :53.26 Mean :47.41 Mean :41.53 Mean :64.84
## 3rd Qu.:68.00 3rd Qu.:64.00 3rd Qu.:59.00 3rd Qu.:74.00
## Max. :80.00 Max. :77.00 Max. :75.00 Max. :97.00
##
## low_humidity high_sealevelpress avg_sealevelpress low_sealevelpress
## Min. :13.00 Min. :29.61 Min. :29.42 Min. :29.18
## 1st Qu.:34.00 1st Qu.:30.02 1st Qu.:29.93 1st Qu.:29.84
## Median :43.00 Median :30.13 Median :30.05 Median :29.96
## Mean :45.49 Mean :30.15 Mean :30.05 Mean :29.96
## 3rd Qu.:54.00 3rd Qu.:30.27 3rd Qu.:30.18 3rd Qu.:30.09
## Max. :93.00 Max. :30.80 Max. :30.72 Max. :30.64
##
## high_visibility low_wind avg_wind high_wind
## Min. :10 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.:10 1st Qu.:15.00 1st Qu.: 6.000 1st Qu.:18.00
## Median :10 Median :18.00 Median : 8.000 Median :23.00
## Mean :10 Mean :19.48 Mean : 8.776 Mean :24.61
## 3rd Qu.:10 3rd Qu.:23.00 3rd Qu.:11.000 3rd Qu.:29.50
## Max. :10 Max. :52.00 Max. :22.000 Max. :65.00
##
## log_high_humidity log_avg_visi log_low_visi log_precip
## Min. :3.689 Min. :1.386 Min. :-18.421 Min. :-18.421
## 1st Qu.:4.317 1st Qu.:2.303 1st Qu.: 1.609 1st Qu.:-18.421
## Median :4.466 Median :2.303 Median : 2.303 Median :-18.421
## Mean :4.409 Mean :2.236 Mean : 1.220 Mean :-11.570
## 3rd Qu.:4.533 3rd Qu.:2.303 3rd Qu.: 2.303 3rd Qu.: -3.507
## Max. :4.605 Max. :2.303 Max. : 2.303 Max. : 1.197
##
# check for multicolinearity
#remove date & high visibility(no variation)
cor(bike_weather[,unlist(lapply(tr_bike_weather,is.numeric))])
## Warning in cor(bike_weather[, unlist(lapply(tr_bike_weather,
## is.numeric))]): the standard deviation is zero
## count high_temp avg_temp low_temp
## count 1.000000000 0.42064907 0.41043555 0.387375930
## high_temp 0.420649065 1.00000000 0.98659441 0.943053082
## avg_temp 0.410435552 0.98659441 1.00000000 0.984466901
## low_temp 0.387375930 0.94305308 0.98446690 1.000000000
## high_dewpoint 0.349402281 0.91515684 0.94058751 0.940579532
## avg_dewpoint 0.345029735 0.90777540 0.94365174 0.954452042
## low_dewpoint 0.340884111 0.88118808 0.92491960 0.944638183
## avg_humidity -0.001914621 0.22686286 0.29514417 0.360113091
## low_humidity -0.059268684 0.08533511 0.17691507 0.269874594
## high_sealevelpress -0.110671676 -0.41233090 -0.43323019 -0.441539587
## avg_sealevelpress -0.034749387 -0.28025717 -0.28935864 -0.289239163
## low_sealevelpress 0.027618690 -0.14720256 -0.14724584 -0.141596283
## high_visibility NA NA NA NA
## low_wind -0.115871466 -0.09782625 -0.12421583 -0.148289849
## avg_wind -0.125820548 -0.19870087 -0.20561687 -0.207020597
## high_wind -0.108648709 -0.09149265 -0.11988412 -0.147134365
## log_high_humidity 0.081289022 0.36706901 0.39110658 0.406471313
## log_avg_visi 0.208323834 0.20944113 0.16386387 0.110168848
## log_low_visi 0.113742350 0.07742769 0.06781064 0.056066141
## log_precip -0.170071814 -0.08067887 -0.03945752 0.005897213
## high_dewpoint avg_dewpoint low_dewpoint avg_humidity
## count 0.3494022807 0.34502974 0.3408841113 -0.001914621
## high_temp 0.9151568447 0.90777540 0.8811880801 0.226862864
## avg_temp 0.9405875093 0.94365174 0.9249195966 0.295144172
## low_temp 0.9405795324 0.95445204 0.9446381827 0.360113091
## high_dewpoint 1.0000000000 0.97836809 0.9380622675 0.542223264
## avg_dewpoint 0.9783680936 1.00000000 0.9805740570 0.577339608
## low_dewpoint 0.9380622675 0.98057406 1.0000000000 0.573073525
## avg_humidity 0.5422232637 0.57733961 0.5730735249 1.000000000
## low_humidity 0.3924351682 0.45174509 0.4865148315 0.904087272
## high_sealevelpress -0.4617400167 -0.46384851 -0.4332848435 -0.312487920
## avg_sealevelpress -0.3567399420 -0.33320233 -0.2780197182 -0.288575865
## low_sealevelpress -0.2404270871 -0.19433157 -0.1326563760 -0.239584840
## high_visibility NA NA NA NA
## low_wind -0.1133001421 -0.18110686 -0.2351953667 -0.195171036
## avg_wind -0.1916503099 -0.25514087 -0.2953611382 -0.206553690
## high_wind -0.1186927167 -0.18618143 -0.2405048835 -0.213130741
## log_high_humidity 0.6155524363 0.61329909 0.5701900610 0.869670930
## log_avg_visi -0.0008144368 -0.01908361 0.0008298162 -0.490183345
## log_low_visi -0.0380965343 -0.03459520 -0.0064276510 -0.273675727
## log_precip 0.1575211771 0.13669014 0.0932913717 0.513859184
## low_humidity high_sealevelpress avg_sealevelpress
## count -0.05926868 -0.11067168 -0.03474939
## high_temp 0.08533511 -0.41233090 -0.28025717
## avg_temp 0.17691507 -0.43323019 -0.28935864
## low_temp 0.26987459 -0.44153959 -0.28923916
## high_dewpoint 0.39243517 -0.46174002 -0.35673994
## avg_dewpoint 0.45174509 -0.46384851 -0.33320233
## low_dewpoint 0.48651483 -0.43328484 -0.27801972
## avg_humidity 0.90408727 -0.31248792 -0.28857586
## low_humidity 1.00000000 -0.23909188 -0.20992396
## high_sealevelpress -0.23909188 1.00000000 0.94372340
## avg_sealevelpress -0.20992396 0.94372340 1.00000000
## low_sealevelpress -0.17123405 0.82702244 0.95511545
## high_visibility NA NA NA
## low_wind -0.19715345 -0.20464863 -0.36033332
## avg_wind -0.15287332 -0.10963188 -0.26997526
## high_wind -0.21513699 -0.22856984 -0.37853707
## log_high_humidity 0.59478972 -0.33039684 -0.30921056
## log_avg_visi -0.47856745 0.10923291 0.18010102
## log_low_visi -0.24143610 0.09475535 0.15991948
## log_precip 0.48386652 -0.28181585 -0.36336305
## low_sealevelpress high_visibility low_wind
## count 0.02761869 NA -0.11587147
## high_temp -0.14720256 NA -0.09782625
## avg_temp -0.14724584 NA -0.12421583
## low_temp -0.14159628 NA -0.14828985
## high_dewpoint -0.24042709 NA -0.11330014
## avg_dewpoint -0.19433157 NA -0.18110686
## low_dewpoint -0.13265638 NA -0.23519537
## avg_humidity -0.23958484 NA -0.19517104
## low_humidity -0.17123405 NA -0.19715345
## high_sealevelpress 0.82702244 NA -0.20464863
## avg_sealevelpress 0.95511545 NA -0.36033332
## low_sealevelpress 1.00000000 NA -0.45240395
## high_visibility NA 1 NA
## low_wind -0.45240395 NA 1.00000000
## avg_wind -0.36762349 NA 0.76390238
## high_wind -0.46669349 NA 0.95378846
## log_high_humidity -0.25644712 NA -0.15638647
## log_avg_visi 0.20283081 NA -0.01523901
## log_low_visi 0.19087107 NA -0.11268420
## log_precip -0.39387949 NA 0.25315596
## avg_wind high_wind log_high_humidity log_avg_visi
## count -0.12582055 -0.10864871 0.08128902 0.2083238336
## high_temp -0.19870087 -0.09149265 0.36706901 0.2094411328
## avg_temp -0.20561687 -0.11988412 0.39110658 0.1638638731
## low_temp -0.20702060 -0.14713436 0.40647131 0.1101688483
## high_dewpoint -0.19165031 -0.11869272 0.61555244 -0.0008144368
## avg_dewpoint -0.25514087 -0.18618143 0.61329909 -0.0190836119
## low_dewpoint -0.29536114 -0.24050488 0.57019006 0.0008298162
## avg_humidity -0.20655369 -0.21313074 0.86967093 -0.4901833447
## low_humidity -0.15287332 -0.21513699 0.59478972 -0.4785674522
## high_sealevelpress -0.10963188 -0.22856984 -0.33039684 0.1092329090
## avg_sealevelpress -0.26997526 -0.37853707 -0.30921056 0.1801010184
## low_sealevelpress -0.36762349 -0.46669349 -0.25644712 0.2028308056
## high_visibility NA NA NA NA
## low_wind 0.76390238 0.95378846 -0.15638647 -0.0152390053
## avg_wind 1.00000000 0.74587302 -0.23224114 -0.0245489212
## high_wind 0.74587302 1.00000000 -0.17192167 -0.0152047422
## log_high_humidity -0.23224114 -0.17192167 1.00000000 -0.3239267236
## log_avg_visi -0.02454892 -0.01520474 -0.32392672 1.0000000000
## log_low_visi -0.01265291 -0.13002408 -0.20887048 0.5817612420
## log_precip 0.15241722 0.22518332 0.41900816 -0.4530559098
## log_low_visi log_precip
## count 0.113742350 -0.170071814
## high_temp 0.077427687 -0.080678869
## avg_temp 0.067810643 -0.039457516
## low_temp 0.056066141 0.005897213
## high_dewpoint -0.038096534 0.157521177
## avg_dewpoint -0.034595204 0.136690141
## low_dewpoint -0.006427651 0.093291372
## avg_humidity -0.273675727 0.513859184
## low_humidity -0.241436102 0.483866517
## high_sealevelpress 0.094755348 -0.281815855
## avg_sealevelpress 0.159919476 -0.363363052
## low_sealevelpress 0.190871070 -0.393879486
## high_visibility NA NA
## low_wind -0.112684201 0.253155964
## avg_wind -0.012652909 0.152417221
## high_wind -0.130024075 0.225183322
## log_high_humidity -0.208870478 0.419008160
## log_avg_visi 0.581761242 -0.453055910
## log_low_visi 1.000000000 -0.306933506
## log_precip -0.306933506 1.000000000
#try to drop highs and lows to avoid multicolinearity
dropped_bike_weather <- tr_bike_weather %>%
dplyr::select(-date,-high_temp,-low_temp,-high_dewpoint,-low_dewpoint,-low_humidity,-high_sealevelpress,-low_sealevelpress,-high_visibility,-high_wind,-low_wind,-log_high_humidity,-log_avg_visi,-season)
corr1 <- cor(dropped_bike_weather[,unlist(lapply(dropped_bike_weather,is.numeric))])
corrplot::corrplot(corr1,method = "number",type="upper")
dropped_bike_weather<-dropped_bike_weather %>%
dplyr::select(-avg_dewpoint)
# remove date
dropped_bike_weather <- dropped_bike_weather[,-1]
Automatic methods and Cross-validation
# Forward Stepwise Selection
set.seed(1)
# using the sample() function to split the set of observations into two halves
n <- nrow(dropped_bike_weather)
train <- sample(n,n/2)
# select training data
train_data <- dropped_bike_weather[train,]
# select test data
test_data <- dropped_bike_weather[-train,]
# Full model
FullModel <- lm(count~.,data=train_data)
# Forward Stepwise Selection
fwd_selected <- step(FullModel, direction="forward", test="F")
## Start: AIC=8583.69
## count ~ year + month.x + weekday + Member_type + avg_temp + avg_humidity +
## avg_sealevelpress + avg_wind + log_low_visi + log_precip
fwd_model <- lm(count ~ year + month.x + weekday + Member_type + avg_temp +
avg_humidity + avg_sealevelpress + avg_wind + log_low_visi +
log_precip, data = train_data)
# get the estimated test MSE for the linear regression fit
rmse(fwd_model,data=test_data) ^2 # 2556469
## [1] 2722660
# Backward Stepwise Selection
bck_selected <- step(FullModel, direction="backward", test="F")
## Start: AIC=8583.69
## count ~ year + month.x + weekday + Member_type + avg_temp + avg_humidity +
## avg_sealevelpress + avg_wind + log_low_visi + log_precip
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## - weekday 6 22255256 1407631755 8580.9 1.4860 0.180633
## - log_low_visi 1 210781 1385587279 8581.8 0.0844 0.771475
## - avg_sealevelpress 1 3683278 1389059777 8583.2 1.4756 0.224986
## <none> 1385376499 8583.7
## - log_precip 1 14750117 1400126615 8587.8 5.9091 0.015379
## - avg_humidity 1 16330415 1401706914 8588.5 6.5422 0.010799
## - year 1 19114864 1404491363 8589.6 7.6577 0.005842
## - month.x 11 69557947 1454934446 8590.1 2.5333 0.003986
## - avg_wind 1 21118706 1406495204 8590.5 8.4604 0.003775
## - avg_temp 1 211250197 1596626695 8664.1 84.6296 < 2.2e-16
## - Member_type 1 3584990965 4970367464 9323.9 1436.1944 < 2.2e-16
##
## - weekday
## - log_low_visi
## - avg_sealevelpress
## <none>
## - log_precip *
## - avg_humidity *
## - year **
## - month.x **
## - avg_wind **
## - avg_temp ***
## - Member_type ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=8580.95
## count ~ year + month.x + Member_type + avg_temp + avg_humidity +
## avg_sealevelpress + avg_wind + log_low_visi + log_precip
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## - log_low_visi 1 418286 1408050041 8579.1 0.1667 0.683213
## - avg_sealevelpress 1 2598700 1410230454 8580.0 1.0357 0.309265
## <none> 1407631755 8580.9
## - avg_humidity 1 13627154 1421258909 8584.5 5.4310 0.020136
## - log_precip 1 17960378 1425592133 8586.3 7.1580 0.007681
## - year 1 18879888 1426511643 8586.7 7.5244 0.006281
## - avg_wind 1 20396140 1428027895 8587.3 8.1287 0.004517
## - month.x 11 73219811 1480851565 8588.4 2.6528 0.002549
## - avg_temp 1 223035656 1630667410 8664.4 88.8890 < 2.2e-16
## - Member_type 1 3624367150 5031998905 9319.1 1444.4616 < 2.2e-16
##
## - log_low_visi
## - avg_sealevelpress
## <none>
## - avg_humidity *
## - log_precip **
## - year **
## - avg_wind **
## - month.x **
## - avg_temp ***
## - Member_type ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=8579.12
## count ~ year + month.x + Member_type + avg_temp + avg_humidity +
## avg_sealevelpress + avg_wind + log_precip
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## - avg_sealevelpress 1 2698419 1410748459 8578.2 1.0770 0.299809
## <none> 1408050041 8579.1
## - avg_humidity 1 15208133 1423258174 8583.4 6.0701 0.014047
## - log_precip 1 18988321 1427038361 8584.9 7.5789 0.006097
## - year 1 19714732 1427764773 8585.2 7.8688 0.005204
## - avg_wind 1 20437351 1428487392 8585.5 8.1572 0.004448
## - month.x 11 75242505 1483292545 8587.4 2.7302 0.001905
## - avg_temp 1 224228370 1632278411 8663.0 89.4971 < 2.2e-16
## - Member_type 1 3626046688 5034096729 9317.3 1447.2769 < 2.2e-16
##
## - avg_sealevelpress
## <none>
## - avg_humidity *
## - log_precip **
## - year **
## - avg_wind **
## - month.x **
## - avg_temp ***
## - Member_type ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=8578.23
## count ~ year + month.x + Member_type + avg_temp + avg_humidity +
## avg_wind + log_precip
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 1410748459 8578.2
## - avg_humidity 1 16678124 1427426583 8583.1 6.6559 0.0101352 *
## - year 1 19600731 1430349190 8584.2 7.8222 0.0053371 **
## - log_precip 1 23464795 1434213254 8585.8 9.3643 0.0023179 **
## - month.x 11 80557424 1491305884 8588.5 2.9226 0.0009128 ***
## - avg_wind 1 30777004 1441525463 8588.8 12.2825 0.0004936 ***
## - avg_temp 1 240484175 1651232634 8667.7 95.9722 < 2.2e-16 ***
## - Member_type 1 3623449240 5034197699 9315.3 1446.0423 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
bck_model <- lm(count ~ year + month.x + Member_type + avg_temp + avg_humidity + avg_wind+log_precip, data = train_data)
# get the estimated test MSE for the linear regression fit
rmse(bck_model,data=test_data) ^2 #2577751
## [1] 2773779
# Exhaustive Stepwise Selection
both_selected <- step(FullModel, direction="both", test="F")
## Start: AIC=8583.69
## count ~ year + month.x + weekday + Member_type + avg_temp + avg_humidity +
## avg_sealevelpress + avg_wind + log_low_visi + log_precip
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## - weekday 6 22255256 1407631755 8580.9 1.4860 0.180633
## - log_low_visi 1 210781 1385587279 8581.8 0.0844 0.771475
## - avg_sealevelpress 1 3683278 1389059777 8583.2 1.4756 0.224986
## <none> 1385376499 8583.7
## - log_precip 1 14750117 1400126615 8587.8 5.9091 0.015379
## - avg_humidity 1 16330415 1401706914 8588.5 6.5422 0.010799
## - year 1 19114864 1404491363 8589.6 7.6577 0.005842
## - month.x 11 69557947 1454934446 8590.1 2.5333 0.003986
## - avg_wind 1 21118706 1406495204 8590.5 8.4604 0.003775
## - avg_temp 1 211250197 1596626695 8664.1 84.6296 < 2.2e-16
## - Member_type 1 3584990965 4970367464 9323.9 1436.1944 < 2.2e-16
##
## - weekday
## - log_low_visi
## - avg_sealevelpress
## <none>
## - log_precip *
## - avg_humidity *
## - year **
## - month.x **
## - avg_wind **
## - avg_temp ***
## - Member_type ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=8580.95
## count ~ year + month.x + Member_type + avg_temp + avg_humidity +
## avg_sealevelpress + avg_wind + log_low_visi + log_precip
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## - log_low_visi 1 418286 1408050041 8579.1 0.1667 0.683213
## - avg_sealevelpress 1 2598700 1410230454 8580.0 1.0357 0.309265
## <none> 1407631755 8580.9
## + weekday 6 22255256 1385376499 8583.7 1.4860 0.180633
## - avg_humidity 1 13627154 1421258909 8584.5 5.4310 0.020136
## - log_precip 1 17960378 1425592133 8586.3 7.1580 0.007681
## - year 1 18879888 1426511643 8586.7 7.5244 0.006281
## - avg_wind 1 20396140 1428027895 8587.3 8.1287 0.004517
## - month.x 11 73219811 1480851565 8588.4 2.6528 0.002549
## - avg_temp 1 223035656 1630667410 8664.4 88.8890 < 2.2e-16
## - Member_type 1 3624367150 5031998905 9319.1 1444.4616 < 2.2e-16
##
## - log_low_visi
## - avg_sealevelpress
## <none>
## + weekday
## - avg_humidity *
## - log_precip **
## - year **
## - avg_wind **
## - month.x **
## - avg_temp ***
## - Member_type ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=8579.12
## count ~ year + month.x + Member_type + avg_temp + avg_humidity +
## avg_sealevelpress + avg_wind + log_precip
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## - avg_sealevelpress 1 2698419 1410748459 8578.2 1.0770 0.299809
## <none> 1408050041 8579.1
## + log_low_visi 1 418286 1407631755 8580.9 0.1667 0.683213
## + weekday 6 22462762 1385587279 8581.8 1.5023 0.175073
## - avg_humidity 1 15208133 1423258174 8583.4 6.0701 0.014047
## - log_precip 1 18988321 1427038361 8584.9 7.5789 0.006097
## - year 1 19714732 1427764773 8585.2 7.8688 0.005204
## - avg_wind 1 20437351 1428487392 8585.5 8.1572 0.004448
## - month.x 11 75242505 1483292545 8587.4 2.7302 0.001905
## - avg_temp 1 224228370 1632278411 8663.0 89.4971 < 2.2e-16
## - Member_type 1 3626046688 5034096729 9317.3 1447.2769 < 2.2e-16
##
## - avg_sealevelpress
## <none>
## + log_low_visi
## + weekday
## - avg_humidity *
## - log_precip **
## - year **
## - avg_wind **
## - month.x **
## - avg_temp ***
## - Member_type ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=8578.23
## count ~ year + month.x + Member_type + avg_temp + avg_humidity +
## avg_wind + log_precip
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 1410748459 8578.2
## + avg_sealevelpress 1 2698419 1408050041 8579.1 1.0770 0.2998094
## + log_low_visi 1 518005 1410230454 8580.0 0.2064 0.6497526
## + weekday 6 21368504 1389379955 8581.4 1.4278 0.2016690
## - avg_humidity 1 16678124 1427426583 8583.1 6.6559 0.0101352
## - year 1 19600731 1430349190 8584.2 7.8222 0.0053371
## - log_precip 1 23464795 1434213254 8585.8 9.3643 0.0023179
## - month.x 11 80557424 1491305884 8588.5 2.9226 0.0009128
## - avg_wind 1 30777004 1441525463 8588.8 12.2825 0.0004936
## - avg_temp 1 240484175 1651232634 8667.7 95.9722 < 2.2e-16
## - Member_type 1 3623449240 5034197699 9315.3 1446.0423 < 2.2e-16
##
## <none>
## + avg_sealevelpress
## + log_low_visi
## + weekday
## - avg_humidity *
## - year **
## - log_precip **
## - month.x ***
## - avg_wind ***
## - avg_temp ***
## - Member_type ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
both_model <- lm(count ~ year + month.x + Member_type + avg_temp + avg_humidity + avg_wind + log_precip, data = train_data)
# get the estimated test MSE for the linear regression fit
rmse(both_model,data=test_data) ^2 # 2577751
## [1] 2773779
All-possible regression approach
# Forward stepwise selection
regfit_fwd <- dropped_bike_weather[,-5] %>%
regsubsets(count~., data =., nvmax=10, method = "forward")
regfit_fwd_summary <- regfit_fwd %>%
summary()
fwd_fit_dat<-tibble(
num_pred=1:10,
RSS=regfit_fwd_summary$rss,
R2=regfit_fwd_summary$rsq,
Adj_R2=regfit_fwd_summary$adjr2,
Cp=regfit_fwd_summary$cp) %>%
#reshape the data for plotting
gather(key="statistic",value="value",-num_pred)
fwd_fit_dat %>%
filter(statistic!="R2") %>%
ggplot(aes(x=num_pred,y=value))+
geom_point()+
geom_line()+
facet_wrap(~statistic,ncol=3,scales="free")
# Backward stepwise selection
regfit_bck <- dropped_bike_weather[,-5] %>%
regsubsets(count~., data =., nvmax=10, method = "backward")
regfit_bck_summary <- regfit_bck %>%
summary()
bck_fit_dat<-tibble(
num_pred=1:10,
RSS=regfit_bck_summary$rss,
R2=regfit_bck_summary$rsq,
Adj_R2=regfit_bck_summary$adjr2,
Cp=regfit_bck_summary$cp) %>%
#reshape the data for plotting
gather(key="statistic",value="value",-num_pred)
bck_fit_dat %>%
filter(statistic!="R2") %>%
ggplot(aes(x=num_pred,y=value))+
geom_point()+
geom_line()+
facet_wrap(~statistic,ncol=3,scales="free")
# Exhaustive stepwise selection
regfit_ex <- dropped_bike_weather[,-5] %>%
regsubsets(count~., data =., nvmax=10, method = "exhaustive")
regfit_ex_summary <- regfit_ex %>%
summary()
ex_fit_dat<-tibble(
num_pred=1:10,
RSS=regfit_ex_summary$rss,
R2=regfit_ex_summary$rsq,
Adj_R2=regfit_ex_summary$adjr2,
Cp=regfit_ex_summary$cp) %>%
#reshape the data for plotting
gather(key="statistic",value="value",-num_pred)
ex_fit_dat %>%
filter(statistic!="R2") %>%
ggplot(aes(x=num_pred,y=value))+
geom_point()+
geom_line()+
facet_wrap(~statistic,ncol=3,scales="free")
Selected Model from stepwise feature selection
step_fit <- lm(count~.,data=dropped_bike_weather)
step_fit %>%
summary()
##
## Call:
## lm(formula = count ~ ., data = dropped_bike_weather)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5102.4 -1123.9 -73.8 1104.2 6142.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.131e+04 9.128e+03 -1.240 0.215388
## year2017 6.245e+02 9.420e+01 6.629 5.20e-11 ***
## month.x2 2.970e+02 2.403e+02 1.236 0.216785
## month.x3 6.833e+02 2.559e+02 2.670 0.007686 **
## month.x4 9.146e+02 2.759e+02 3.315 0.000947 ***
## month.x5 5.772e+02 2.873e+02 2.009 0.044797 *
## month.x6 5.345e+02 3.396e+02 1.574 0.115770
## month.x7 -5.502e+00 3.612e+02 -0.015 0.987851
## month.x8 -2.634e+02 3.537e+02 -0.745 0.456658
## month.x9 3.871e+02 3.303e+02 1.172 0.241570
## month.x10 1.134e+03 2.886e+02 3.930 9.00e-05 ***
## month.x11 6.337e+02 2.548e+02 2.487 0.013032 *
## month.x12 5.528e-01 2.336e+02 0.002 0.998113
## weekday2 4.215e+02 1.744e+02 2.417 0.015824 *
## weekday3 6.945e+02 1.743e+02 3.984 7.20e-05 ***
## weekday4 7.256e+02 1.728e+02 4.199 2.89e-05 ***
## weekday5 5.755e+02 1.738e+02 3.311 0.000960 ***
## weekday6 5.979e+02 1.741e+02 3.434 0.000617 ***
## weekday7 4.239e+02 1.721e+02 2.463 0.013917 *
## Member_typeCasual -4.992e+03 9.375e+01 -53.253 < 2e-16 ***
## avg_temp 8.850e+01 6.742e+00 13.127 < 2e-16 ***
## avg_humidity -2.196e+01 4.886e+00 -4.495 7.68e-06 ***
## avg_sealevelpress 4.477e+02 2.976e+02 1.504 0.132812
## avg_wind -4.864e+01 1.686e+01 -2.885 0.003988 **
## log_low_visi 1.523e+01 1.384e+01 1.100 0.271513
## log_precip -4.020e+01 8.315e+00 -4.834 1.52e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1595 on 1137 degrees of freedom
## Multiple R-squared: 0.7815, Adjusted R-squared: 0.7767
## F-statistic: 162.7 on 25 and 1137 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
step_fit %>%
plot()
Beyond linearity
set.seed(1)
# use cross validation to choose dfs
df<- rep(0,6)
val<- list(dropped_bike_weather$avg_temp,dropped_bike_weather$avg_humidity,dropped_bike_weather$avg_sealevelpress,dropped_bike_weather$avg_wind,dropped_bike_weather$log_low_visi,dropped_bike_weather$log_precip)
for (i in 1:6){
fit <- smooth.spline(val[[i]],dropped_bike_weather$count,cv=TRUE)
df[i]<-fit$df
}
df
## [1] 5.596995 4.632338 4.345706 4.075545 2.843863 3.577682
Selected gam Model
gam_fit_s<-dropped_bike_weather %>%
gam(count ~ year + month.x + weekday + Member_type + s(avg_temp, 5.596995) + s(avg_humidity, 4.632338) + s(avg_sealevelpress, 4.345706) + s(avg_wind, 4.075545) + s(log_low_visi, 2.843863) + s(log_precip, 3.577682), data =.)
par(mfrow=c(2,3))
plot.Gam(gam_fit_s,se=T,col="red")
summary(gam_fit_s)
##
## Call: gam(formula = count ~ year + month.x + weekday + Member_type +
## s(avg_temp, 5.596995) + s(avg_humidity, 4.632338) + s(avg_sealevelpress,
## 4.345706) + s(avg_wind, 4.075545) + s(log_low_visi, 2.843863) +
## s(log_precip, 3.577682), data = .)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4620.97 -1174.62 20.22 1068.94 5753.37
##
## (Dispersion Parameter for gaussian family taken to be 2307531)
##
## Null Deviance: 13247841901 on 1162 degrees of freedom
## Residual Deviance: 2579653955 on 1117.928 degrees of freedom
## AIC: 20386.54
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value
## year 1.0 48864563 48864563 21.1761
## month.x 11.0 2173311640 197573785 85.6213
## weekday 6.0 84227164 14037861 6.0835
## Member_type 1.0 7184285774 7184285774 3113.4081
## s(avg_temp, 5.596995) 1.0 259391539 259391539 112.4109
## s(avg_humidity, 4.632338) 1.0 289228350 289228350 125.3410
## s(avg_sealevelpress, 4.345706) 1.0 63728353 63728353 27.6175
## s(avg_wind, 4.075545) 1.0 30970088 30970088 13.4213
## s(log_low_visi, 2.843863) 1.0 127465 127465 0.0552
## s(log_precip, 3.577682) 1.0 99069806 99069806 42.9333
## Residuals 1117.9 2579653955 2307531
## Pr(>F)
## year 4.669e-06 ***
## month.x < 2.2e-16 ***
## weekday 2.779e-06 ***
## Member_type < 2.2e-16 ***
## s(avg_temp, 5.596995) < 2.2e-16 ***
## s(avg_humidity, 4.632338) < 2.2e-16 ***
## s(avg_sealevelpress, 4.345706) 1.770e-07 ***
## s(avg_wind, 4.075545) 0.0002604 ***
## s(log_low_visi, 2.843863) 0.8142292
## s(log_precip, 3.577682) 8.626e-11 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## year
## month.x
## weekday
## Member_type
## s(avg_temp, 5.596995) 4.6 16.7123 6.883e-15 ***
## s(avg_humidity, 4.632338) 3.6 3.2545 0.01434 *
## s(avg_sealevelpress, 4.345706) 3.3 1.4433 0.22491
## s(avg_wind, 4.075545) 3.1 0.5802 0.63229
## s(log_low_visi, 2.843863) 1.8 1.0020 0.36222
## s(log_precip, 3.577682) 2.6 15.5666 7.303e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
gam_fit_sl<-dropped_bike_weather %>%
gam(count ~ year + month.x + weekday + Member_type + s(avg_temp, 33.350675) + s(avg_humidity, 4.355668) + avg_sealevelpress + avg_wind + log_low_visi + s(log_precip, 3.973716), data =.)
par(mfrow=c(2,3))
plot.Gam(gam_fit_sl,se=T,col="orange")
summary(gam_fit_sl)
##
## Call: gam(formula = count ~ year + month.x + weekday + Member_type +
## s(avg_temp, 33.350675) + s(avg_humidity, 4.355668) + avg_sealevelpress +
## avg_wind + log_low_visi + s(log_precip, 3.973716), data = .)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4157.41 -1213.79 33.24 1114.55 5637.96
##
## (Dispersion Parameter for gaussian family taken to be 2318114)
##
## Null Deviance: 13247841901 on 1162 degrees of freedom
## Residual Deviance: 2546031362 on 1098.32 degrees of freedom
## AIC: 20410.5
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## year 1.0 45827531 45827531 19.7693 9.624e-06
## month.x 11.0 2181762885 198342080 85.5618 < 2.2e-16
## weekday 6.0 79852251 13308708 5.7412 6.789e-06
## Member_type 1.0 7190398702 7190398702 3101.8308 < 2.2e-16
## s(avg_temp, 33.350675) 1.0 265809142 265809142 114.6661 < 2.2e-16
## s(avg_humidity, 4.355668) 1.0 271125174 271125174 116.9594 < 2.2e-16
## avg_sealevelpress 1.0 66501517 66501517 28.6878 1.035e-07
## avg_wind 1.0 31566399 31566399 13.6173 0.0002351
## log_low_visi 1.0 85870 85870 0.0370 0.8474129
## s(log_precip, 3.973716) 1.0 77957551 77957551 33.6297 8.709e-09
## Residuals 1098.3 2546031362 2318114
##
## year ***
## month.x ***
## weekday ***
## Member_type ***
## s(avg_temp, 33.350675) ***
## s(avg_humidity, 4.355668) ***
## avg_sealevelpress ***
## avg_wind ***
## log_low_visi
## s(log_precip, 3.973716) ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## year
## month.x
## weekday
## Member_type
## s(avg_temp, 33.350675) 32.4 3.1431 1.292e-08 ***
## s(avg_humidity, 4.355668) 3.4 3.2684 0.01661 *
## avg_sealevelpress
## avg_wind
## log_low_visi
## s(log_precip, 3.973716) 3.0 11.1659 3.543e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# L1 lasso
set.seed (1)
mod_matrix <- model.matrix(count~.,tr_bike_weather[,-c(1,6)])[,-1]
train <- sample(1:nrow(mod_matrix), nrow(mod_matrix)/2)
test =(-train)
response <- tr_bike_weather$count
response.test <- response[test]
#alpha=1 for lasso
lambda_grid<-10^seq(10,-2,length=100)
lasso_mod <-glmnet(mod_matrix[train,],response[train],alpha=1,lambda=lambda_grid)
par(mfrow=c(1,1))
lasso_mod %>%
plot()
#perform cross-validation and compute the associated test error.
cv_out_l<-cv.glmnet(mod_matrix[train,],response[train],alpha=1)
plot(cv_out_l)
best_lambda=cv_out_l$lambda.min
best_lambda
## [1] 38.44728
lasso_pred <- predict(lasso_mod,s=best_lambda,newx=mod_matrix[test,])
#MSE
lasso_MSE <- mean((lasso_pred-response.test)^2)
lasso_MSE #2452066
## [1] 2840073
lasso_coef <- cv_out_l %>%
predict(.,type ="coefficients",s=best_lambda)
lasso_coef
## 39 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -1.118614e+04
## year2017 2.432025e+02
## month.x2 .
## month.x3 1.905635e+02
## month.x4 3.962642e+02
## month.x5 .
## month.x6 2.749640e+02
## month.x7 .
## month.x8 -2.788771e+02
## month.x9 2.357465e+02
## month.x10 4.334665e+02
## month.x11 .
## month.x12 .
## weekday2 5.170076e-01
## weekday3 3.826092e+01
## weekday4 2.259621e+02
## weekday5 1.871988e+02
## weekday6 .
## weekday7 .
## Member_typeCasual -4.964561e+03
## high_temp 7.731876e+01
## avg_temp 1.133584e-02
## low_temp .
## high_dewpoint .
## avg_dewpoint .
## low_dewpoint .
## avg_humidity .
## low_humidity -4.449898e+00
## high_sealevelpress .
## avg_sealevelpress 3.807296e+02
## low_sealevelpress .
## high_visibility .
## low_wind .
## avg_wind -2.800352e+01
## high_wind -9.738275e+00
## log_high_humidity .
## log_avg_visi 8.675102e+02
## log_low_visi .
## log_precip -2.056450e+01
Selected Model from lasso regression
lasso_fit <- glmnet(mod_matrix,response,alpha=1,lambda = best_lambda)
lasso_fit %>%
summary()
## Length Class Mode
## a0 1 -none- numeric
## beta 38 dgCMatrix S4
## df 1 -none- numeric
## dim 2 -none- numeric
## lambda 1 -none- numeric
## dev.ratio 1 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 5 -none- call
## nobs 1 -none- numeric
set.seed(1)
#MSE comparison
full_model <- lm(count~., data = tr_bike_weather)
MSE_full <- rmse(full_model, data=te_bike_weather) ^2
MSE_selected <- rmse(step_fit, data=te_bike_weather) ^2
#GAM mse
gam_pred <- predict(gam_fit_s, te_bike_weather)
gam_MSE <- mean((te_bike_weather$count-gam_pred)^2)
#GAM modified MSE
gam_pred_sl <- predict(gam_fit_sl, te_bike_weather)
gam_sl_MSE <- mean((te_bike_weather$count-gam_pred_sl)^2)
#lasso
x=model.matrix(count~.,data=te_bike_weather[,-c(1,6)])
x=x[,-1]
lasso_pred <- predict(lasso_fit, s=best_lambda, newx=x)
lasso_MSE <- mean((lasso_pred-te_bike_weather$count)^2)
# combine
cbind(model <- c("Full Linear Model (p=23)","Subset Selected Model (p=10)","General Additive Model - all smoothing splines","General Additive Model - partial splines", "Lasso Model (λ= 38.44728)"),MSE <- c(MSE_full,MSE_selected,gam_MSE,gam_sl_MSE,lasso_MSE))
## [,1] [,2]
## [1,] "Full Linear Model (p=23)" "2678817.95905591"
## [2,] "Subset Selected Model (p=10)" "2761878.70131835"
## [3,] "General Additive Model - all smoothing splines" "2433135.59287282"
## [4,] "General Additive Model - partial splines" "2503048.79642685"
## [5,] "Lasso Model (λ= 38.44728)" "2640365.70326811"