Visualization and Discovery section

library(pastecs)
#Let us read a file to use this package
US_accident <- read.table("US_Accidents_Dec20_updated.csv", sep=",", header = T, fill = T) 
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## EOF within quoted string
summary(US_accident)
##       ID              Severity          Start_Time          End_Time        
##  Length:779138      Length:779138      Length:779138      Length:779138     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##   Start_Lat          Start_Lng           End_Lat            End_Lng         
##  Length:779138      Length:779138      Length:779138      Length:779138     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##  Distance.mi.       Description           Number             Street         
##  Length:779138      Length:779138      Length:779138      Length:779138     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##      Side               City              County             State          
##  Length:779138      Length:779138      Length:779138      Length:779138     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##    Zipcode            Country            Timezone         Airport_Code      
##  Length:779138      Length:779138      Length:779138      Length:779138     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##  Weather_Timestamp  Temperature.F.     Wind_Chill.F.      Humidity...       
##  Length:779138      Length:779138      Length:779138      Length:779138     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##  Pressure.in.       Visibility.mi.     Wind_Direction     Wind_Speed.mph.   
##  Length:779138      Length:779138      Length:779138      Length:779138     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##  Precipitation.in.  Weather_Condition    Amenity              Bump          
##  Length:779138      Length:779138      Length:779138      Length:779138     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##    Crossing           Give_Way           Junction           No_Exit         
##  Length:779138      Length:779138      Length:779138      Length:779138     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##    Railway           Roundabout          Station              Stop          
##  Length:779138      Length:779138      Length:779138      Length:779138     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##  Traffic_Calming    Traffic_Signal     Turning_Loop       Sunrise_Sunset    
##  Length:779138      Length:779138      Length:779138      Length:779138     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##  Civil_Twilight     Nautical_Twilight  Astronomical_Twilight
##  Length:779138      Length:779138      Length:779138        
##  Class :character   Class :character   Class :character     
##  Mode  :character   Mode  :character   Mode  :character
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:pastecs':
## 
##     first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v stringr 1.4.0
## v tidyr   1.2.0     v forcats 0.5.1
## v readr   2.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::extract() masks pastecs::extract()
## x dplyr::filter()  masks stats::filter()
## x dplyr::first()   masks pastecs::first()
## x dplyr::lag()     masks stats::lag()
## x dplyr::last()    masks pastecs::last()
library(readr)
df <- read_csv("US_Accidents_Dec20_updated.csv", col_types = cols(.default = col_character())) %>% 
type_convert()
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_character(),
##   Severity = col_double(),
##   Start_Time = col_datetime(format = ""),
##   End_Time = col_datetime(format = ""),
##   Start_Lat = col_double(),
##   Start_Lng = col_double(),
##   End_Lat = col_double(),
##   End_Lng = col_double(),
##   `Distance(mi)` = col_double(),
##   Number = col_double(),
##   Weather_Timestamp = col_datetime(format = ""),
##   `Temperature(F)` = col_double(),
##   `Wind_Chill(F)` = col_double(),
##   `Humidity(%)` = col_double(),
##   `Pressure(in)` = col_double(),
##   `Visibility(mi)` = col_double(),
##   `Wind_Speed(mph)` = col_double(),
##   `Precipitation(in)` = col_double(),
##   Amenity = col_logical(),
##   Bump = col_logical(),
##   Crossing = col_logical()
##   # ... with 10 more columns
## )
## i Use `spec()` for the full column specifications.
#Description, Street, Side, City, County, State, Zipcode, country, Timezone, Airport_Code

head(df)
## # A tibble: 6 x 47
##   ID        Severity Start_Time          End_Time            Start_Lat Start_Lng
##   <chr>        <dbl> <dttm>              <dttm>                  <dbl>     <dbl>
## 1 A-2716600        3 2016-02-08 00:37:08 2016-02-08 06:37:08      40.1     -83.1
## 2 A-2716601        2 2016-02-08 05:56:20 2016-02-08 11:56:20      39.9     -84.1
## 3 A-2716602        2 2016-02-08 06:15:39 2016-02-08 12:15:39      39.1     -84.5
## 4 A-2716603        2 2016-02-08 06:15:39 2016-02-08 12:15:39      39.1     -84.5
## 5 A-2716604        2 2016-02-08 06:51:45 2016-02-08 12:51:45      41.1     -81.5
## 6 A-2716605        3 2016-02-08 07:53:43 2016-02-08 13:53:43      39.2     -84.5
## # ... with 41 more variables: End_Lat <dbl>, End_Lng <dbl>,
## #   `Distance(mi)` <dbl>, Description <chr>, Number <dbl>, Street <chr>,
## #   Side <chr>, City <chr>, County <chr>, State <chr>, Zipcode <chr>,
## #   Country <chr>, Timezone <chr>, Airport_Code <chr>,
## #   Weather_Timestamp <dttm>, `Temperature(F)` <dbl>, `Wind_Chill(F)` <dbl>,
## #   `Humidity(%)` <dbl>, `Pressure(in)` <dbl>, `Visibility(mi)` <dbl>,
## #   Wind_Direction <chr>, `Wind_Speed(mph)` <dbl>, ...
summary(df)
##       ID               Severity       Start_Time                 
##  Length:1516064     Min.   :1.000   Min.   :2016-02-08 00:37:08  
##  Class :character   1st Qu.:2.000   1st Qu.:2018-07-17 14:41:25  
##  Mode  :character   Median :2.000   Median :2020-01-24 11:16:33  
##                     Mean   :2.239   Mean   :2019-07-15 07:01:48  
##                     3rd Qu.:2.000   3rd Qu.:2020-10-22 13:01:30  
##                     Max.   :4.000   Max.   :2020-12-31 23:28:56  
##                                                                  
##     End_Time                     Start_Lat       Start_Lng      
##  Min.   :2016-02-08 06:37:08   Min.   :24.57   Min.   :-124.50  
##  1st Qu.:2018-07-17 17:13:14   1st Qu.:33.85   1st Qu.:-118.21  
##  Median :2020-01-24 13:38:15   Median :37.35   Median : -94.38  
##  Mean   :2019-07-15 11:42:20   Mean   :36.90   Mean   : -98.60  
##  3rd Qu.:2020-10-22 17:50:19   3rd Qu.:40.73   3rd Qu.: -80.87  
##  Max.   :2021-01-01 00:00:00   Max.   :49.00   Max.   : -67.11  
##                                                                 
##     End_Lat         End_Lng         Distance(mi)      Description       
##  Min.   :24.57   Min.   :-124.50   Min.   :  0.0000   Length:1516064    
##  1st Qu.:33.85   1st Qu.:-118.21   1st Qu.:  0.0000   Class :character  
##  Median :37.35   Median : -94.38   Median :  0.1780   Mode  :character  
##  Mean   :36.90   Mean   : -98.60   Mean   :  0.5873                     
##  3rd Qu.:40.73   3rd Qu.: -80.87   3rd Qu.:  0.5940                     
##  Max.   :49.08   Max.   : -67.11   Max.   :155.1860                     
##                                                                         
##      Number           Street              Side               City          
##  Min.   :      0   Length:1516064     Length:1516064     Length:1516064    
##  1st Qu.:   1212   Class :character   Class :character   Class :character  
##  Median :   4000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :   8908                                                           
##  3rd Qu.:  10100                                                           
##  Max.   :9999997                                                           
##  NA's   :1046095                                                           
##     County             State             Zipcode            Country         
##  Length:1516064     Length:1516064     Length:1516064     Length:1516064    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    Timezone         Airport_Code       Weather_Timestamp            
##  Length:1516064     Length:1516064     Min.   :2016-02-08 00:53:00  
##  Class :character   Class :character   1st Qu.:2018-07-10 10:55:30  
##  Mode  :character   Mode  :character   Median :2020-01-22 05:53:00  
##                                        Mean   :2019-07-12 00:02:11  
##                                        3rd Qu.:2020-10-21 04:54:00  
##                                        Max.   :2020-12-31 23:35:00  
##                                        NA's   :30264                
##  Temperature(F)   Wind_Chill(F)     Humidity(%)      Pressure(in)  
##  Min.   :-89.00   Min.   :-89.0    Min.   :  1.00   Min.   : 0.00  
##  1st Qu.: 47.00   1st Qu.: 40.8    1st Qu.: 48.00   1st Qu.:29.44  
##  Median : 61.00   Median : 57.0    Median : 68.00   Median :29.88  
##  Mean   : 59.58   Mean   : 55.1    Mean   : 64.66   Mean   :29.55  
##  3rd Qu.: 73.00   3rd Qu.: 71.0    3rd Qu.: 84.00   3rd Qu.:30.04  
##  Max.   :170.60   Max.   :113.0    Max.   :100.00   Max.   :58.04  
##  NA's   :43033    NA's   :449316   NA's   :45509    NA's   :36274  
##  Visibility(mi)   Wind_Direction     Wind_Speed(mph)  Precipitation(in)
##  Min.   :  0.00   Length:1516064     Min.   :  0.00   Min.   : 0       
##  1st Qu.: 10.00   Class :character   1st Qu.:  4.60   1st Qu.: 0       
##  Median : 10.00   Mode  :character   Median :  7.00   Median : 0       
##  Mean   :  9.13                      Mean   :  7.63   Mean   : 0       
##  3rd Qu.: 10.00                      3rd Qu.: 10.40   3rd Qu.: 0       
##  Max.   :140.00                      Max.   :984.00   Max.   :24       
##  NA's   :44211                       NA's   :128862   NA's   :510549   
##  Weather_Condition   Amenity           Bump          Crossing      
##  Length:1516064     Mode :logical   Mode :logical   Mode :logical  
##  Class :character   FALSE:1503661   FALSE:1515803   FALSE:1429681  
##  Mode  :character   TRUE :12403     TRUE :261       TRUE :86383    
##                                                                    
##                                                                    
##                                                                    
##                                                                    
##   Give_Way        Junction        No_Exit         Railway       
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:1512809   FALSE:1311566   FALSE:1514335   FALSE:1503480  
##  TRUE :3255      TRUE :204498    TRUE :1729      TRUE :12584    
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  Roundabout       Station           Stop         Traffic_Calming
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:1516013   FALSE:1487917   FALSE:1498368   FALSE:1515575  
##  TRUE :51        TRUE :28147     TRUE :17696     TRUE :489      
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  Traffic_Signal  Turning_Loop    Sunrise_Sunset     Civil_Twilight    
##  Mode :logical   Mode :logical   Length:1516064     Length:1516064    
##  FALSE:1346095   FALSE:1516064   Class :character   Class :character  
##  TRUE :169969                    Mode  :character   Mode  :character  
##                                                                       
##                                                                       
##                                                                       
##                                                                       
##  Nautical_Twilight  Astronomical_Twilight
##  Length:1516064     Length:1516064       
##  Class :character   Class :character     
##  Mode  :character   Mode  :character     
##                                          
##                                          
##                                          
## 
str(df)
## spec_tbl_df [1,516,064 x 47] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ID                   : chr [1:1516064] "A-2716600" "A-2716601" "A-2716602" "A-2716603" ...
##  $ Severity             : num [1:1516064] 3 2 2 2 2 3 2 2 2 2 ...
##  $ Start_Time           : POSIXct[1:1516064], format: "2016-02-08 00:37:08" "2016-02-08 05:56:20" ...
##  $ End_Time             : POSIXct[1:1516064], format: "2016-02-08 06:37:08" "2016-02-08 11:56:20" ...
##  $ Start_Lat            : num [1:1516064] 40.1 39.9 39.1 39.1 41.1 ...
##  $ Start_Lng            : num [1:1516064] -83.1 -84.1 -84.5 -84.5 -81.5 ...
##  $ End_Lat              : num [1:1516064] 40.1 39.9 39.1 39.1 41.1 ...
##  $ End_Lng              : num [1:1516064] -83 -84 -84.5 -84.5 -81.5 ...
##  $ Distance(mi)         : num [1:1516064] 3.23 0.747 0.055 0.219 0.123 ...
##  $ Description          : chr [1:1516064] "Between Sawmill Rd/Exit 20 and OH-315/Olentangy Riv Rd/Exit 22 - Accident." "At OH-4/OH-235/Exit 41 - Accident." "At I-71/US-50/Exit 1 - Accident." "At I-71/US-50/Exit 1 - Accident." ...
##  $ Number               : num [1:1516064] NA NA NA NA NA ...
##  $ Street               : chr [1:1516064] "Outerbelt E" "I-70 E" "I-75 S" "US-50 E" ...
##  $ Side                 : chr [1:1516064] "R" "R" "R" "R" ...
##  $ City                 : chr [1:1516064] "Dublin" "Dayton" "Cincinnati" "Cincinnati" ...
##  $ County               : chr [1:1516064] "Franklin" "Montgomery" "Hamilton" "Hamilton" ...
##  $ State                : chr [1:1516064] "OH" "OH" "OH" "OH" ...
##  $ Zipcode              : chr [1:1516064] "43017" "45424" "45203" "45202" ...
##  $ Country              : chr [1:1516064] "US" "US" "US" "US" ...
##  $ Timezone             : chr [1:1516064] "US/Eastern" "US/Eastern" "US/Eastern" "US/Eastern" ...
##  $ Airport_Code         : chr [1:1516064] "KOSU" "KFFO" "KLUK" "KLUK" ...
##  $ Weather_Timestamp    : POSIXct[1:1516064], format: "2016-02-08 00:53:00" "2016-02-08 05:58:00" ...
##  $ Temperature(F)       : num [1:1516064] 42.1 36.9 36 36 39 37 35.6 35.6 33.8 33.1 ...
##  $ Wind_Chill(F)        : num [1:1516064] 36.1 NA NA NA NA 29.8 29.2 29.2 NA 30 ...
##  $ Humidity(%)          : num [1:1516064] 58 91 97 97 55 93 100 100 100 92 ...
##  $ Pressure(in)         : num [1:1516064] 29.8 29.7 29.7 29.7 29.6 ...
##  $ Visibility(mi)       : num [1:1516064] 10 10 10 10 10 10 10 10 3 0.5 ...
##  $ Wind_Direction       : chr [1:1516064] "SW" "Calm" "Calm" "Calm" ...
##  $ Wind_Speed(mph)      : num [1:1516064] 10.4 NA NA NA NA 10.4 8.1 8.1 2.3 3.5 ...
##  $ Precipitation(in)    : num [1:1516064] 0 0.02 0.02 0.02 NA 0.01 NA NA NA 0.08 ...
##  $ Weather_Condition    : chr [1:1516064] "Light Rain" "Light Rain" "Overcast" "Overcast" ...
##  $ Amenity              : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Bump                 : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Crossing             : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Give_Way             : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Junction             : logi [1:1516064] FALSE FALSE TRUE TRUE FALSE FALSE ...
##  $ No_Exit              : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Railway              : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Roundabout           : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Station              : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Stop                 : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Traffic_Calming      : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Traffic_Signal       : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Turning_Loop         : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Sunrise_Sunset       : chr [1:1516064] "Night" "Night" "Night" "Night" ...
##  $ Civil_Twilight       : chr [1:1516064] "Night" "Night" "Night" "Night" ...
##  $ Nautical_Twilight    : chr [1:1516064] "Night" "Night" "Night" "Night" ...
##  $ Astronomical_Twilight: chr [1:1516064] "Night" "Night" "Day" "Day" ...
##  - attr(*, "problems")=<externalptr>
#1,516,064 × 47
(colMeans(is.na(df)))*100
##                    ID              Severity            Start_Time 
##           0.000000000           0.000000000           0.000000000 
##              End_Time             Start_Lat             Start_Lng 
##           0.000000000           0.000000000           0.000000000 
##               End_Lat               End_Lng          Distance(mi) 
##           0.000000000           0.000000000           0.000000000 
##           Description                Number                Street 
##           0.000000000          69.000715009           0.000000000 
##                  Side                  City                County 
##           0.000000000           0.005474703           0.000000000 
##                 State               Zipcode               Country 
##           0.000000000           0.061672858           0.000000000 
##              Timezone          Airport_Code     Weather_Timestamp 
##           0.151840556           0.280199253           1.996221795 
##        Temperature(F)         Wind_Chill(F)           Humidity(%) 
##           2.838468561          29.637007409           3.001786204 
##          Pressure(in)        Visibility(mi)        Wind_Direction 
##           2.392643055           2.916169766           2.760965236 
##       Wind_Speed(mph)     Precipitation(in)     Weather_Condition 
##           8.499773097          33.675952994           2.902713870 
##               Amenity                  Bump              Crossing 
##           0.000000000           0.000000000           0.000000000 
##              Give_Way              Junction               No_Exit 
##           0.000000000           0.000000000           0.000000000 
##               Railway            Roundabout               Station 
##           0.000000000           0.000000000           0.000000000 
##                  Stop       Traffic_Calming        Traffic_Signal 
##           0.000000000           0.000000000           0.000000000 
##          Turning_Loop        Sunrise_Sunset        Civil_Twilight 
##           0.000000000           0.005474703           0.005474703 
##     Nautical_Twilight Astronomical_Twilight 
##           0.005474703           0.005474703
#Number, 69.000715009 
#Wind_Chill(F) 29.637007409
#Precipitation(in) 33.675952994 
my_data <- df[ , c("Severity", "Start_Time", "End_Time", "Temperature(F)", "Humidity(%)", "Visibility(mi)", "Wind_Speed(mph)", "Start_Lat", "Start_Lng", "Zipcode", "Pressure(in)", "Junction", "Sunrise_Sunset", "Distance(mi)", "Wind_Chill(F)", "Wind_Direction", "Precipitation(in)", "State")]   
library(pastecs)
stat.desc(my_data)
##                  Severity   Start_Time     End_Time Temperature(F)  Humidity(%)
## nbr.val      1.516064e+06 1.516064e+06 1.516064e+06   1.473031e+06 1.470555e+06
## nbr.null     0.000000e+00 0.000000e+00 0.000000e+00   5.650000e+02 0.000000e+00
## nbr.na       0.000000e+00 0.000000e+00 0.000000e+00   4.303300e+04 4.550900e+04
## min          1.000000e+00 1.454892e+09 1.454913e+09  -8.900000e+01 1.000000e+00
## max          4.000000e+00 1.609457e+09 1.609459e+09   1.706000e+02 1.000000e+02
## range        3.000000e+00 1.545655e+08 1.545458e+08   2.596000e+02 9.900000e+01
## sum          3.393906e+06 2.369872e+15 2.369898e+15   8.776996e+07 9.508550e+07
## median       2.000000e+00 1.579865e+09 1.579873e+09   6.100000e+01 6.800000e+01
## mean         2.238630e+00 1.563174e+09 1.563191e+09   5.958460e+01 6.465960e+01
## SE.mean      4.939132e-04 3.640382e+04 3.639897e+04   1.505594e-02 1.918079e-02
## CI.mean.0.95 9.680528e-04 7.135024e+04 7.134074e+04   2.950912e-02 3.759369e-02
## var          3.698441e-01 2.009146e+15 2.008611e+15   3.339085e+02 5.410213e+02
## std.dev      6.081481e-01 4.482350e+07 4.481753e+07   1.827316e+01 2.325986e+01
## coef.var     2.716609e-01 2.867467e-02 2.867054e-02   3.066760e-01 3.597279e-01
##              Visibility(mi) Wind_Speed(mph)    Start_Lat     Start_Lng Zipcode
## nbr.val        1.471853e+06    1.387202e+06 1.516064e+06  1.516064e+06      NA
## nbr.null       1.410000e+03    2.028770e+05 0.000000e+00  0.000000e+00      NA
## nbr.na         4.421100e+04    1.288620e+05 0.000000e+00  0.000000e+00      NA
## min            0.000000e+00    0.000000e+00 2.457022e+01 -1.244976e+02      NA
## max            1.400000e+02    9.840000e+02 4.900058e+01 -6.711317e+01      NA
## range          1.400000e+02    9.840000e+02 2.443036e+01  5.738440e+01      NA
## sum            1.344060e+07    1.058548e+07 5.594361e+07 -1.494827e+08      NA
## median         1.000000e+01    7.000000e+00 3.735113e+01 -9.438100e+01      NA
## mean           9.131755e+00    7.630812e+00 3.690056e+01 -9.859919e+01      NA
## SE.mean        2.381399e-03    4.786370e-03 4.195333e-03  1.502172e-02      NA
## CI.mean.0.95   4.667460e-03    9.381121e-03 8.222709e-03  2.944205e-02      NA
## var            8.346970e+00    3.177988e+01 2.668397e+01  3.421028e+02      NA
## std.dev        2.889112e+00    5.637364e+00 5.165653e+00  1.849602e+01      NA
## coef.var       3.163808e-01    7.387633e-01 1.399885e-01 -1.875880e-01      NA
##              Pressure(in) Junction Sunrise_Sunset Distance(mi) Wind_Chill(F)
## nbr.val      1.479790e+06       NA             NA 1.516064e+06  1.066748e+06
## nbr.null     1.000000e+00       NA             NA 4.006130e+05  5.030000e+02
## nbr.na       3.627400e+04       NA             NA 0.000000e+00  4.493160e+05
## min          0.000000e+00       NA             NA 0.000000e+00 -8.900000e+01
## max          5.804000e+01       NA             NA 1.551860e+02  1.130000e+02
## range        5.804000e+01       NA             NA 1.551860e+02  2.020000e+02
## sum          4.373513e+07       NA             NA 8.903264e+05  5.878823e+07
## median       2.988000e+01       NA             NA 1.780000e-01  5.700000e+01
## mean         2.955495e+01       NA             NA 5.872617e-01  5.510976e+01
## SE.mean      8.358277e-04       NA             NA 1.325979e-03  2.045568e-02
## CI.mean.0.95 1.638194e-03       NA             NA 2.598874e-03  4.009245e-02
## var          1.033793e+00       NA             NA 2.665576e+00  4.463647e+02
## std.dev      1.016756e+00       NA             NA 1.632659e+00  2.112735e+01
## coef.var     3.440222e-02       NA             NA 2.780122e+00  3.833685e-01
##              Wind_Direction Precipitation(in) State
## nbr.val                  NA      1.005515e+06    NA
## nbr.null                 NA      9.034290e+05    NA
## nbr.na                   NA      5.105490e+05    NA
## min                      NA      0.000000e+00    NA
## max                      NA      2.400000e+01    NA
## range                    NA      2.400000e+01    NA
## sum                      NA      8.524610e+03    NA
## median                   NA      0.000000e+00    NA
## mean                     NA      8.477855e-03    NA
## SE.mean                  NA      1.289617e-04    NA
## CI.mean.0.95             NA      2.527606e-04    NA
## var                      NA      1.672284e-02    NA
## std.dev                  NA      1.293168e-01    NA
## coef.var                 NA      1.525348e+01    NA
library(tidyr)
library(ggplot2)
library(KernSmooth)
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
library(purrr)
library(dplyr)
options(scipen=999)
hist(my_data$'Severity', 
     breaks=seq(0,5,1),
     col="brown1",
     main="Histogram of Severity",
     xlab="Severity",
     ylab="Frequency", 
     xlim=c(0,4)
     )

hist(my_data$'Start_Time', 
     breaks = 40,
     col="goldenrod1",
     main="Histogram of Start_Time",
     xlab="Start_Time",
     ylab="Frequency", 
     )
## Warning in breaks[-1L] + breaks[-nB]: NAs produced by integer overflow

hist(my_data$'End_Time', 
     breaks = 40,
     col="burlywood1",
     main="Histogram of End_Time",
     xlab="End_Time",
     ylab="Frequency", 
     )
## Warning in breaks[-1L] + breaks[-nB]: NAs produced by integer overflow

hist(my_data$'Temperature(F)', 
     breaks=40, 
     col="cornflowerblue",
     main="Histogram of Temperature", 
     xlab="Temperature (Deg. F)", 
     ylab="Frequency", 
     font.main = 3 
     )

hist(my_data$'Humidity(%)', 
     breaks=40,
     col="lightblue",
     main="Histogram of Humidity",
     xlab="Humidity(%)",
     ylab="Frequency", 
     )

hist(my_data$'Visibility(mi)', 
     breaks=40,
     col="azure4",
     main="Histogram of Visibility",
     xlim=c(0,150),
     xlab="Visibility(mi)",
     ylab="Frequency", 
     )

hist(my_data$'Wind_Speed(mph)', 
     breaks=40,
     col="coral",
     main="Histogram of Wind_Speed",
     xlab="Wind_Speed(mph)",
     ylab="Frequency", 
     )

hist(my_data$'Pressure(in)', 
     breaks=40,
     col="aquamarine3",
     main="Histogram of Pressure",
     xlab="Pressure(in)",
     ylab="Frequency", 
     )

hist(my_data$'Distance(mi)', 
     breaks=40,
     col="darkorchid1",
     main="Histogram of Distance",
     xlab="Distance(mi)",
     ylab="Frequency", 
     )

hist(my_data$"Wind_Chill(F)", 
     breaks=40,
     col="darkseagreen1",
     main="Histogram of Wind_Chill",
     xlab="Wind_Chill(F)",
     ylab="Frequency", 
     )

hist(my_data$"Precipitation(in)", 
     breaks=40,
     col="slategray1",
     main="Histogram of Precipitation",
     xlab="Precipitation(in)",
     ylab="Frequency", 
     )

S1 <- my_data %>% 
  group_by(Wind_Direction) %>%
  summarise(n = n())
S1
## # A tibble: 25 x 2
##    Wind_Direction      n
##    <chr>           <int>
##  1 Calm            79192
##  2 CALM           202870
##  3 E               52435
##  4 East            24064
##  5 ENE             51257
##  6 ESE             51295
##  7 N               53718
##  8 NE              48355
##  9 NNE             46509
## 10 NNW             68014
## # ... with 15 more rows
options(scipen=999)
ggplot(S1, aes(x=S1$Wind_Direction, y=S1$n)) + geom_bar(stat="identity", position = position_dodge(width=2)) + 
  labs(x="Wind_Direction", y="Frequency")
## Warning: Use of `S1$Wind_Direction` is discouraged. Use `Wind_Direction`
## instead.
## Warning: Use of `S1$n` is discouraged. Use `n` instead.

S2 <- my_data %>% 
  group_by(State) %>%
  summarise(n = n())
S2
## # A tibble: 49 x 2
##    State      n
##    <chr>  <int>
##  1 AL      9375
##  2 AR      4373
##  3 AZ     30185
##  4 CA    448833
##  5 CO     19809
##  6 CT     15194
##  7 DC      3788
##  8 DE      2331
##  9 FL    153007
## 10 GA     31111
## # ... with 39 more rows
options(scipen=999)
ggplot(S2, aes(x=S2$State, y=S2$n)) + geom_bar(stat="identity", position = position_dodge(width=2)) + 
  labs(x="State", y="Frequency")
## Warning: Use of `S2$State` is discouraged. Use `State` instead.
## Warning: Use of `S2$n` is discouraged. Use `n` instead.

library(rgdal)
## Loading required package: sp
## Please note that rgdal will be retired by the end of 2023,
## plan transition to sf/stars/terra functions using GDAL and PROJ
## at your earliest convenience.
## 
## rgdal: version: 1.5-28, (SVN revision 1158)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.2.1, released 2020/12/29
## Path to GDAL shared files: C:/Users/onecouple/Documents/R/win-library/4.1/rgdal/gdal
## GDAL binary built with GEOS: TRUE 
## Loaded PROJ runtime: Rel. 7.2.1, January 1st, 2021, [PJ_VERSION: 721]
## Path to PROJ shared files: C:/Users/onecouple/Documents/R/win-library/4.1/rgdal/proj
## PROJ CDN enabled: FALSE
## Linking to sp version:1.4-6
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading sp or rgdal.
## Overwritten PROJ_LIB was C:/Users/onecouple/Documents/R/win-library/4.1/rgdal/proj
library(shiny)
library(purrr)
library(usmap)
library(ggplot2)
library(dplyr)
my_data %>% select(Start_Time, End_Time) %>% head(5)
## # A tibble: 5 x 2
##   Start_Time          End_Time           
##   <dttm>              <dttm>             
## 1 2016-02-08 00:37:08 2016-02-08 06:37:08
## 2 2016-02-08 05:56:20 2016-02-08 11:56:20
## 3 2016-02-08 06:15:39 2016-02-08 12:15:39
## 4 2016-02-08 06:15:39 2016-02-08 12:15:39
## 5 2016-02-08 06:51:45 2016-02-08 12:51:45
library(stringr)
library(tidyr)
library(dplyr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
accidents_time <- my_data %>%
  mutate(Duration = as.numeric(End_Time - Start_Time)) %>%
  filter(!(Duration < 0)) %>%
  separate(Start_Time, into = c("Date", "Time"), sep = " ") %>%
  mutate("Year" = str_sub(Date, 1, 4), "Month" = str_sub(Date, 6, 7), 
         "Day" = str_sub(Date, 9, 10), "Wday" = as.character(wday(Date)), 
         "Hour" = str_sub(Time, 1, 2)) %>%
  select(-c("Date", "Time", "End_Time")) %>%
  select(Severity, Year, Month, Day, Hour, Wday, Duration)
head(accidents_time)
## # A tibble: 6 x 7
##   Severity Year  Month Day   Hour  Wday  Duration
##      <dbl> <chr> <chr> <chr> <chr> <chr>    <dbl>
## 1        3 2016  02    08    00    2          360
## 2        2 2016  02    08    05    2          360
## 3        2 2016  02    08    06    2          360
## 4        2 2016  02    08    06    2          360
## 5        2 2016  02    08    06    2          360
## 6        3 2016  02    08    07    2          360
library(ggplot2)
accidents_happenHour <- accidents_time %>% count(Hour)
accidents_happenHour
## # A tibble: 24 x 2
##    Hour      n
##    <chr> <int>
##  1 00    46125
##  2 01    43434
##  3 02    38801
##  4 03    32175
##  5 04    29478
##  6 05    40105
##  7 06    57859
##  8 07    73938
##  9 08    75802
## 10 09    59744
## # ... with 14 more rows
p <- ggplot(accidents_happenHour, aes(Hour, n))
p + geom_point(aes(color = n)) + labs(x = "Hour of a day", y = "Number of accidents")

accidents_severity <- accidents_time %>%
    group_by(Hour) %>%
    summarise(mean(Severity))
accidents_severity
## # A tibble: 24 x 2
##    Hour  `mean(Severity)`
##    <chr>            <dbl>
##  1 00                2.26
##  2 01                2.18
##  3 02                2.21
##  4 03                2.27
##  5 04                2.31
##  6 05                2.32
##  7 06                2.27
##  8 07                2.25
##  9 08                2.24
## 10 09                2.27
## # ... with 14 more rows
accident_summary <- merge(accidents_happenHour, accidents_severity)
accident_summary <- accident_summary %>% rename(Average_Severity = "mean(Severity)")
accident_summary
##    Hour      n Average_Severity
## 1    00  46125         2.256629
## 2    01  43434         2.179168
## 3    02  38801         2.206593
## 4    03  32175         2.272789
## 5    04  29478         2.306805
## 6    05  40105         2.320035
## 7    06  57859         2.272870
## 8    07  73938         2.248032
## 9    08  75802         2.242131
## 10   09  59744         2.272078
## 11   10  50442         2.285813
## 12   11  51884         2.266074
## 13   12  72283         2.222085
## 14   13  83700         2.198076
## 15   14  90162         2.204543
## 16   15 100074         2.223345
## 17   16 105559         2.227446
## 18   17 108011         2.225986
## 19   18  91413         2.235459
## 20   19  65190         2.243688
## 21   20  53515         2.248024
## 22   21  48988         2.231342
## 23   22  49156         2.232973
## 24   23  48226         2.232613
options(scipen=999)
ggplot(data = accident_summary) + 
  geom_col(mapping=aes(x=Hour, y=n, fill=Average_Severity)) +
    scale_fill_distiller(palette="Reds", trans= "reverse") +
    labs(
        title = "Amount and Severity of Car Accidents by hour",
        x = "Houro of a day",
        y = "Number of accidents",
        caption = "A Countrywide Traffic Accident Dataset, 2016-2020.",
        fill = "Average Severity") +
    scale_y_continuous(expand = expansion(mult = c(0, .1)))

library(stringr)
library(tidyr)
library(dplyr)
library(lubridate)
library(ggplot2)
accidents_day <- accidents_time %>% count(Wday)
accidents_day
## # A tibble: 7 x 2
##   Wday       n
##   <chr>  <int>
## 1 1     123775
## 2 2     235831
## 3 3     250900
## 4 4     255775
## 5 5     258036
## 6 6     254127
## 7 7     137620
options(scipen=999)
p <- ggplot(accidents_day, aes(Wday, n))
p + geom_point(aes(color = n))+ 
  labs(x = "Day of a week", y = "Amount of accidents")

accidents_severity_day <- accidents_time %>%
    group_by(Wday) %>%
    summarise(mean(Severity))
accidents_severity_day
## # A tibble: 7 x 2
##   Wday  `mean(Severity)`
##   <chr>            <dbl>
## 1 1                 2.27
## 2 2                 2.24
## 3 3                 2.23
## 4 4                 2.23
## 5 5                 2.23
## 6 6                 2.24
## 7 7                 2.26
accident_summary2 <- merge(accidents_day, accidents_severity_day)
accident_summary2 <- accident_summary2 %>% rename(Severity_mean = "mean(Severity)")
accident_summary2
##   Wday      n Severity_mean
## 1    1 123775      2.271436
## 2    2 235831      2.238637
## 3    3 250900      2.234189
## 4    4 255775      2.227208
## 5    5 258036      2.228050
## 6    6 254127      2.236960
## 7    7 137620      2.261357
options(scipen=999)
ggplot(data = accident_summary2) + 
  geom_col(mapping=aes(x=Wday, y=n, fill=Severity_mean)) +
    scale_fill_distiller(palette="Blues", trans= "reverse") +
    labs(
        title = "Car Accidents each Day of the Week",
        x = "Day of the week",
        y = "Number of accidents",
        caption = "A Countrywide Traffic Accident Dataset, 2016-2020.",
        fill = "Severity_mean") +
    scale_y_continuous(expand = expansion(mult = c(0, .1)))

library(stringr)
library(tidyr)
library(dplyr)
library(lubridate)
library(ggplot2)
accidents_year <- accidents_time %>% count(Year)
accidents_year
## # A tibble: 5 x 2
##   Year       n
##   <chr>  <int>
## 1 2016  129325
## 2 2017  170099
## 3 2018  166936
## 4 2019  261772
## 5 2020  787932
options(scipen=999)
p <- ggplot(accidents_year, aes(Year, n))
p + geom_point(aes(color = n)) + labs(x = "Year", y = "Amount of accidents")

accidents_severity_year <- accidents_time %>%
    group_by(Year) %>%
    summarise(mean(Severity))
accidents_severity_year
## # A tibble: 5 x 2
##   Year  `mean(Severity)`
##   <chr>            <dbl>
## 1 2016              2.39
## 2 2017              2.46
## 3 2018              2.49
## 4 2019              2.30
## 5 2020              2.09
options(scipen=999)
accident_summary3 <- merge(accidents_year, accidents_severity_year)
accident_summary3 <- accident_summary3 %>% rename(Severity_mean_year = "mean(Severity)")
accident_summary3
##   Year      n Severity_mean_year
## 1 2016 129325           2.393559
## 2 2017 170099           2.463213
## 3 2018 166936           2.485617
## 4 2019 261772           2.303665
## 5 2020 787932           2.090783
ggplot(data = accident_summary3) + 
  geom_col(mapping=aes(x=Year, y=n, fill=Severity_mean_year)) +
    scale_fill_distiller(palette="Purples", trans= "reverse") +
    labs(
        title = "Car Accidents from 2016 to 2020",
        x = "Year",
        y = "Amount of accidents",
        caption = "A Countrywide Traffic Accident Dataset, 2016-2020.",
        fill = "Severity_mean_year") +
    scale_y_continuous(expand = expansion(mult = c(0, .1)))

library(stringr)
library(tidyr)
library(dplyr)
library(lubridate)
library(ggplot2)
accidents_month <- accidents_time %>% count(Month)
accidents_month
## # A tibble: 12 x 2
##    Month      n
##    <chr>  <int>
##  1 01     88540
##  2 02     82419
##  3 03     96802
##  4 04    107007
##  5 05    108195
##  6 06    113048
##  7 07     53650
##  8 08     62903
##  9 09    122906
## 10 10    181074
## 11 11    222031
## 12 12    277489
options(scipen=999)
p <- ggplot(accidents_month, aes(Month, n))
p + geom_point(aes(color = n)) + labs(x = "Month", y = "Amount of accidents")

accidents_severity_month <- accidents_time %>%
    group_by(Month) %>%
    summarise(mean(Severity))
accidents_severity_month
## # A tibble: 12 x 2
##    Month `mean(Severity)`
##    <chr>            <dbl>
##  1 01                2.33
##  2 02                2.31
##  3 03                2.30
##  4 04                2.28
##  5 05                2.32
##  6 06                2.34
##  7 07                2.47
##  8 08                2.40
##  9 09                2.22
## 10 10                2.17
## 11 11                2.14
## 12 12                2.12
options(scipen=999)
accident_summary4 <- merge(accidents_month, accidents_severity_month)
accident_summary4 <- accident_summary4 %>% rename(Severity_mean_month = "mean(Severity)")
accident_summary4
##    Month      n Severity_mean_month
## 1     01  88540            2.333860
## 2     02  82419            2.309067
## 3     03  96802            2.303041
## 4     04 107007            2.283514
## 5     05 108195            2.321013
## 6     06 113048            2.343447
## 7     07  53650            2.467866
## 8     08  62903            2.402350
## 9     09 122906            2.216320
## 10    10 181074            2.171383
## 11    11 222031            2.139354
## 12    12 277489            2.124484
ggplot(data = accident_summary4) + 
  geom_col(mapping=aes(x=Month, y=n, fill=Severity_mean_month)) +
    scale_fill_distiller(palette="Oranges", trans= "reverse") +
    labs(
        title = "Car Accidents by Month",
        x = "Month",
        y = "Amount of accidents",
        caption = "A Countrywide Traffic Accident Dataset, 2016-2020.",
        fill = "Severity_mean_month") +
    scale_y_continuous(expand = expansion(mult = c(0, .1)))

library(ggplot2)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(wordcloud2)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
accidents_weather <- df %>% count(Weather_Condition)
accidents_weather
## # A tibble: 117 x 2
##    Weather_Condition         n
##    <chr>                 <int>
##  1 Blowing Dust             78
##  2 Blowing Dust / Windy     79
##  3 Blowing Snow            144
##  4 Blowing Snow / Windy     63
##  5 Clear                180223
##  6 Cloudy               161291
##  7 Cloudy / Windy         3300
##  8 Drifting Snow             1
##  9 Drizzle                1023
## 10 Drizzle / Windy           3
## # ... with 107 more rows
text <- df$Weather_Condition
docs <- Corpus(VectorSource(text))
dtm <- TermDocumentMatrix(docs) 
matrix <- as.matrix(dtm) 
words <- sort(rowSums(matrix),decreasing=TRUE) 
df_weather <- data.frame(word = names(words),freq=words)
set.seed(1234)
wordcloud(words = df_weather$word, freq = df_weather$freq, min.freq = 1,           
          max.words=200, random.order=FALSE, rot.per=0.35,colors=brewer.pal(8, "Set2"))

library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
library(usmap)
library(ggplot2)
library(patchwork)

US <- map_data('state')
head(US)
##        long      lat group order  region subregion
## 1 -87.46201 30.38968     1     1 alabama      <NA>
## 2 -87.48493 30.37249     1     2 alabama      <NA>
## 3 -87.52503 30.37249     1     3 alabama      <NA>
## 4 -87.53076 30.33239     1     4 alabama      <NA>
## 5 -87.57087 30.32665     1     5 alabama      <NA>
## 6 -87.58806 30.32665     1     6 alabama      <NA>
ggplot()+
  geom_map(data = US, map = US, mapping = aes( map_id = region, x = long, y = lat, group = group), color = "darkgray", fill = "white", size = 0.5) +
  geom_point(data = df, mapping=aes(x = Start_Lng, y = Start_Lat, color = Severity), size = 0.005)+
  labs(title = "US Traffic Accidents",
       subtitle = "Source: A Countrywide Traffic Accident Dataset, 2016-2020.") +
  theme(legend.position = "right")
## Warning: Ignoring unknown aesthetics: x, y

Poisson Regression Model

data = read.csv("US_Accidents_Dec20_filled.csv")



library(dplyr)

head(data)
##   X        ID Severity       Date     Time            End_Time Start_Lat
## 1 1 A-2716600        3 2016-02-08 00:37:08 2016-02-08 06:37:08  40.10891
## 2 2 A-2716601        2 2016-02-08 05:56:20 2016-02-08 11:56:20  39.86542
## 3 3 A-2716602        2 2016-02-08 06:15:39 2016-02-08 12:15:39  39.10266
## 4 4 A-2716603        2 2016-02-08 06:15:39 2016-02-08 12:15:39  39.10148
## 5 5 A-2716604        2 2016-02-08 06:51:45 2016-02-08 12:51:45  41.06213
## 6 6 A-2716605        3 2016-02-08 07:53:43 2016-02-08 13:53:43  39.17239
##   Start_Lng  End_Lat   End_Lng Distance.mi.
## 1 -83.09286 40.11206 -83.03187        3.230
## 2 -84.06280 39.86501 -84.04873        0.747
## 3 -84.52468 39.10209 -84.52396        0.055
## 4 -84.52341 39.09841 -84.52241        0.219
## 5 -81.53784 41.06217 -81.53547        0.123
## 6 -84.49279 39.17048 -84.50180        0.500
##                                                                  Description
## 1 Between Sawmill Rd/Exit 20 and OH-315/Olentangy Riv Rd/Exit 22 - Accident.
## 2                                         At OH-4/OH-235/Exit 41 - Accident.
## 3                                           At I-71/US-50/Exit 1 - Accident.
## 4                                           At I-71/US-50/Exit 1 - Accident.
## 5                                            At Dart Ave/Exit 21 - Accident.
## 6                                         At Mitchell Ave/Exit 6 - Accident.
##   Number      Street Side       City     County State Zipcode Country
## 1   4000 Outerbelt E    R     Dublin   Franklin    OH   43017      US
## 2   4000      I-70 E    R     Dayton Montgomery    OH   45424      US
## 3   4000      I-75 S    R Cincinnati   Hamilton    OH   45203      US
## 4   4000     US-50 E    R Cincinnati   Hamilton    OH   45202      US
## 5   4000      I-77 N    R      Akron     Summit    OH   44311      US
## 6   4000      I-75 S    R Cincinnati   Hamilton    OH   45217      US
##     Timezone Airport_Code   Weather_Timestamp Temperature.F. Wind_Chill.F.
## 1 US/Eastern         KOSU 2016-02-08 00:53:00           42.1          36.1
## 2 US/Eastern         KFFO 2016-02-08 05:58:00           36.9          57.0
## 3 US/Eastern         KLUK 2016-02-08 05:53:00           36.0          57.0
## 4 US/Eastern         KLUK 2016-02-08 05:53:00           36.0          57.0
## 5 US/Eastern         KAKR 2016-02-08 06:54:00           39.0          57.0
## 6 US/Eastern         KLUK 2016-02-08 07:53:00           37.0          29.8
##   Humidity... Pressure.in. Visibility.mi. Wind_Direction Wind_Speed.mph.
## 1          58        29.76             10             SW            10.4
## 2          91        29.68             10           Calm             7.0
## 3          97        29.70             10           Calm             7.0
## 4          97        29.70             10           Calm             7.0
## 5          55        29.65             10           Calm             7.0
## 6          93        29.69             10            WSW            10.4
##   Precipitation.in. Weather_Condition Amenity  Bump Crossing Give_Way Junction
## 1              0.00        Light Rain   False False    False    False    False
## 2              0.02        Light Rain   False False    False    False    False
## 3              0.02          Overcast   False False    False    False     True
## 4              0.02          Overcast   False False    False    False     True
## 5              0.00          Overcast   False False    False    False    False
## 6              0.01        Light Rain   False False    False    False    False
##   No_Exit Railway Roundabout Station  Stop Traffic_Calming Traffic_Signal
## 1   False   False      False   False False           False          False
## 2   False   False      False   False False           False          False
## 3   False   False      False   False False           False          False
## 4   False   False      False   False False           False          False
## 5   False   False      False   False False           False          False
## 6   False   False      False   False False           False          False
##   Turning_Loop Sunrise_Sunset Civil_Twilight Nautical_Twilight
## 1        False          Night          Night             Night
## 2        False          Night          Night             Night
## 3        False          Night          Night             Night
## 4        False          Night          Night             Night
## 5        False          Night          Night               Day
## 6        False            Day            Day               Day
##   Astronomical_Twilight Year Month Day Wday Hour X.Zipcode. X.Month.
## 1                 Night 2016     2   8    2    0    Zipcode    Month
## 2                 Night 2016     2   8    2    5    Zipcode    Month
## 3                   Day 2016     2   8    2    6    Zipcode    Month
## 4                   Day 2016     2   8    2    6    Zipcode    Month
## 5                   Day 2016     2   8    2    6    Zipcode    Month
## 6                   Day 2016     2   8    2    7    Zipcode    Month
library(dplyr)
test_group <- data %>%
  group_by(State, Wday) %>%
  summarize(
    avg_temp = mean(Temperature.F.),
    avg_hum = mean(Humidity...),
    avg_wind = mean(Wind_Chill.F.),
    avg_pressure = mean(Pressure.in.),
    avg_precipitation = mean(Precipitation.in.),
    avg_distance = mean(Distance.mi.),
    avg_sev = mean(Severity),
    count = n()
  )
## `summarise()` has grouped output by 'State'. You can override using the
## `.groups` argument.
test_group
## # A tibble: 343 x 10
## # Groups:   State [49]
##    State  Wday avg_temp avg_hum avg_wind avg_pressure avg_precipitation
##    <chr> <int>    <dbl>   <dbl>    <dbl>        <dbl>             <dbl>
##  1 AL        1     63.3    75.5     62.0         29.7           0.0937 
##  2 AL        2     62.2    69.5     58.8         29.8           0.0127 
##  3 AL        3     60.9    70.4     58.1         29.8           0.00739
##  4 AL        4     64.2    70.5     61.3         29.7           0.00658
##  5 AL        5     64.8    68.5     61.9         29.7           0.00670
##  6 AL        6     64.4    68.9     62.4         29.8           0.00345
##  7 AL        7     61.2    77.5     60.1         29.7           0.00874
##  8 AR        1     54.1    78.1     51.6         29.7           0.0123 
##  9 AR        2     55.2    74.1     52.6         29.8           0.00702
## 10 AR        3     56.2    73.0     54.3         29.8           0.00628
## # ... with 333 more rows, and 3 more variables: avg_distance <dbl>,
## #   avg_sev <dbl>, count <int>
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:patchwork':
## 
##     area
## The following object is masked from 'package:dplyr':
## 
##     select
#test_group$weekday <- factor(nyc_bikes$weekday, levels=c('Sunday', 
#'Monday','Tuesday','Wednesday','Thursday','Friday','Saturday'))
boxplot(count~Wday, xlab="Weekday", ylab="Count", data=test_group)

boxplot(count~State, xlab="State", ylab="Count", data=test_group)

car_quant=data.frame(test_group[c('avg_temp','avg_hum','avg_wind', 'avg_pressure', 'avg_precipitation', 'avg_distance','avg_sev','count')])
plot(car_quant)

mdl1 <- glm(count~., family="poisson", data=test_group)
summary(mdl1)
## 
## Call:
## glm(formula = count ~ ., family = "poisson", data = test_group)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -29.0608   -3.9658    0.3221    4.2960   27.0323  
## 
## Coefficients:
##                     Estimate Std. Error  z value             Pr(>|z|)    
## (Intercept)       -2.5839416  0.4762330   -5.426    0.000000057697740 ***
## StateAR           -0.0830385  0.0193875   -4.283    0.000018431646829 ***
## StateAZ           -0.0800318  0.0328770   -2.434               0.0149 *  
## StateCA            3.4319996  0.0139496  246.029 < 0.0000000000000002 ***
## StateCO            3.3244549  0.0560287   59.335 < 0.0000000000000002 ***
## StateCT            1.0324728  0.0173631   59.464 < 0.0000000000000002 ***
## StateDC           -0.3784986  0.0219929  -17.210 < 0.0000000000000002 ***
## StateDE           -0.1969352  0.0271506   -7.253    0.000000000000406 ***
## StateFL            1.9486452  0.0153415  127.018 < 0.0000000000000002 ***
## StateGA            1.9036762  0.0160019  118.966 < 0.0000000000000002 ***
## StateIA            0.3380981  0.0292587   11.555 < 0.0000000000000002 ***
## StateID            0.2342897  0.0401347    5.838    0.000000005296167 ***
## StateIL            2.1807843  0.0197116  110.634 < 0.0000000000000002 ***
## StateIN            1.3894836  0.0190913   72.781 < 0.0000000000000002 ***
## StateKS           -0.3681348  0.0197789  -18.612 < 0.0000000000000002 ***
## StateKY           -0.4391627  0.0180746  -24.297 < 0.0000000000000002 ***
## StateLA            0.4963006  0.0146105   33.969 < 0.0000000000000002 ***
## StateMA           -0.2218845  0.0201805  -10.995 < 0.0000000000000002 ***
## StateMD            2.0926811  0.0152297  137.408 < 0.0000000000000002 ***
## StateME           -1.7128501  0.0270981  -63.209 < 0.0000000000000002 ***
## StateMI            1.7712513  0.0195308   90.690 < 0.0000000000000002 ***
## StateMN            2.1616819  0.0297680   72.618 < 0.0000000000000002 ***
## StateMO            0.6624950  0.0170661   38.819 < 0.0000000000000002 ***
## StateMS           -1.3841562  0.0224795  -61.574 < 0.0000000000000002 ***
## StateMT            0.8622225  0.0582238   14.809 < 0.0000000000000002 ***
## StateNC            1.4189604  0.0129044  109.959 < 0.0000000000000002 ***
## StateND           -2.5306777  0.0601727  -42.057 < 0.0000000000000002 ***
## StateNE           -0.8843769  0.0291082  -30.382 < 0.0000000000000002 ***
## StateNH           -0.9670823  0.0246694  -39.202 < 0.0000000000000002 ***
## StateNJ            1.4426377  0.0157634   91.518 < 0.0000000000000002 ***
## StateNM           -0.6696015  0.0532741  -12.569 < 0.0000000000000002 ***
## StateNV           -2.2676811  0.0324673  -69.845 < 0.0000000000000002 ***
## StateNY            2.1404388  0.0159356  134.318 < 0.0000000000000002 ***
## StateOH            1.5597999  0.0166549   93.654 < 0.0000000000000002 ***
## StateOK           -0.5658235  0.0223401  -25.328 < 0.0000000000000002 ***
## StateOR            2.6326793  0.0155303  169.519 < 0.0000000000000002 ***
## StatePA            2.2070042  0.0160181  137.781 < 0.0000000000000002 ***
## StateRI           -1.0022400  0.0222888  -44.966 < 0.0000000000000002 ***
## StateSC            1.1483425  0.0136717   83.994 < 0.0000000000000002 ***
## StateSD           -2.9000588  0.0729952  -39.729 < 0.0000000000000002 ***
## StateTN            0.6895593  0.0135933   50.728 < 0.0000000000000002 ***
## StateTX            1.6801386  0.0142363  118.018 < 0.0000000000000002 ***
## StateUT            2.3307617  0.0453014   51.450 < 0.0000000000000002 ***
## StateVA            2.0308740  0.0127450  159.346 < 0.0000000000000002 ***
## StateVT           -2.3489522  0.0563256  -41.703 < 0.0000000000000002 ***
## StateWA            1.8271552  0.0160334  113.959 < 0.0000000000000002 ***
## StateWI            1.1065691  0.0234483   47.192 < 0.0000000000000002 ***
## StateWV           -0.3914624  0.0229002  -17.094 < 0.0000000000000002 ***
## StateWY           -0.4784087  0.1135020   -4.215    0.000024980022946 ***
## Wday               0.0084951  0.0004884   17.395 < 0.0000000000000002 ***
## avg_temp           0.1562132  0.0014154  110.368 < 0.0000000000000002 ***
## avg_hum           -0.0165067  0.0005923  -27.867 < 0.0000000000000002 ***
## avg_wind          -0.1273878  0.0013624  -93.506 < 0.0000000000000002 ***
## avg_pressure       0.4523062  0.0159298   28.394 < 0.0000000000000002 ***
## avg_precipitation  1.8936107  0.2572244    7.362    0.000000000000182 ***
## avg_distance       0.0009121  0.0085105    0.107               0.9147    
## avg_sev           -2.1216546  0.0183920 -115.358 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 3138110  on 342  degrees of freedom
## Residual deviance:   16148  on 286  degrees of freedom
## AIC: 19365
## 
## Number of Fisher Scoring iterations: 4
coef(mdl1)
##       (Intercept)           StateAR           StateAZ           StateCA 
##      -2.583941649      -0.083038483      -0.080031787       3.431999631 
##           StateCO           StateCT           StateDC           StateDE 
##       3.324454904       1.032472750      -0.378498623      -0.196935217 
##           StateFL           StateGA           StateIA           StateID 
##       1.948645177       1.903676157       0.338098100       0.234289706 
##           StateIL           StateIN           StateKS           StateKY 
##       2.180784348       1.389483569      -0.368134807      -0.439162742 
##           StateLA           StateMA           StateMD           StateME 
##       0.496300625      -0.221884541       2.092681121      -1.712850084 
##           StateMI           StateMN           StateMO           StateMS 
##       1.771251310       2.161681912       0.662495009      -1.384156238 
##           StateMT           StateNC           StateND           StateNE 
##       0.862222461       1.418960403      -2.530677724      -0.884376874 
##           StateNH           StateNJ           StateNM           StateNV 
##      -0.967082347       1.442637677      -0.669601501      -2.267681067 
##           StateNY           StateOH           StateOK           StateOR 
##       2.140438802       1.559799890      -0.565823458       2.632679344 
##           StatePA           StateRI           StateSC           StateSD 
##       2.207004231      -1.002240035       1.148342464      -2.900058849 
##           StateTN           StateTX           StateUT           StateVA 
##       0.689559269       1.680138572       2.330761724       2.030874013 
##           StateVT           StateWA           StateWI           StateWV 
##      -2.348952174       1.827155169       1.106569137      -0.391462418 
##           StateWY              Wday          avg_temp           avg_hum 
##      -0.478408688       0.008495078       0.156213152      -0.016506669 
##          avg_wind      avg_pressure avg_precipitation      avg_distance 
##      -0.127387833       0.452306157       1.893610727       0.000912105 
##           avg_sev 
##      -2.121654647

RF Model Section

library(stringr)
library(tidyr)
library(dplyr)
library(lubridate)
library(maps)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: sandwich
## 
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
## 
##     boundary
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
accidents <-read.csv("US_Accidents_Dec20_filled.csv")
head(accidents)
##   X        ID Severity       Date     Time            End_Time Start_Lat
## 1 1 A-2716600        3 2016-02-08 00:37:08 2016-02-08 06:37:08  40.10891
## 2 2 A-2716601        2 2016-02-08 05:56:20 2016-02-08 11:56:20  39.86542
## 3 3 A-2716602        2 2016-02-08 06:15:39 2016-02-08 12:15:39  39.10266
## 4 4 A-2716603        2 2016-02-08 06:15:39 2016-02-08 12:15:39  39.10148
## 5 5 A-2716604        2 2016-02-08 06:51:45 2016-02-08 12:51:45  41.06213
## 6 6 A-2716605        3 2016-02-08 07:53:43 2016-02-08 13:53:43  39.17239
##   Start_Lng  End_Lat   End_Lng Distance.mi.
## 1 -83.09286 40.11206 -83.03187        3.230
## 2 -84.06280 39.86501 -84.04873        0.747
## 3 -84.52468 39.10209 -84.52396        0.055
## 4 -84.52341 39.09841 -84.52241        0.219
## 5 -81.53784 41.06217 -81.53547        0.123
## 6 -84.49279 39.17048 -84.50180        0.500
##                                                                  Description
## 1 Between Sawmill Rd/Exit 20 and OH-315/Olentangy Riv Rd/Exit 22 - Accident.
## 2                                         At OH-4/OH-235/Exit 41 - Accident.
## 3                                           At I-71/US-50/Exit 1 - Accident.
## 4                                           At I-71/US-50/Exit 1 - Accident.
## 5                                            At Dart Ave/Exit 21 - Accident.
## 6                                         At Mitchell Ave/Exit 6 - Accident.
##   Number      Street Side       City     County State Zipcode Country
## 1   4000 Outerbelt E    R     Dublin   Franklin    OH   43017      US
## 2   4000      I-70 E    R     Dayton Montgomery    OH   45424      US
## 3   4000      I-75 S    R Cincinnati   Hamilton    OH   45203      US
## 4   4000     US-50 E    R Cincinnati   Hamilton    OH   45202      US
## 5   4000      I-77 N    R      Akron     Summit    OH   44311      US
## 6   4000      I-75 S    R Cincinnati   Hamilton    OH   45217      US
##     Timezone Airport_Code   Weather_Timestamp Temperature.F. Wind_Chill.F.
## 1 US/Eastern         KOSU 2016-02-08 00:53:00           42.1          36.1
## 2 US/Eastern         KFFO 2016-02-08 05:58:00           36.9          57.0
## 3 US/Eastern         KLUK 2016-02-08 05:53:00           36.0          57.0
## 4 US/Eastern         KLUK 2016-02-08 05:53:00           36.0          57.0
## 5 US/Eastern         KAKR 2016-02-08 06:54:00           39.0          57.0
## 6 US/Eastern         KLUK 2016-02-08 07:53:00           37.0          29.8
##   Humidity... Pressure.in. Visibility.mi. Wind_Direction Wind_Speed.mph.
## 1          58        29.76             10             SW            10.4
## 2          91        29.68             10           Calm             7.0
## 3          97        29.70             10           Calm             7.0
## 4          97        29.70             10           Calm             7.0
## 5          55        29.65             10           Calm             7.0
## 6          93        29.69             10            WSW            10.4
##   Precipitation.in. Weather_Condition Amenity  Bump Crossing Give_Way Junction
## 1              0.00        Light Rain   False False    False    False    False
## 2              0.02        Light Rain   False False    False    False    False
## 3              0.02          Overcast   False False    False    False     True
## 4              0.02          Overcast   False False    False    False     True
## 5              0.00          Overcast   False False    False    False    False
## 6              0.01        Light Rain   False False    False    False    False
##   No_Exit Railway Roundabout Station  Stop Traffic_Calming Traffic_Signal
## 1   False   False      False   False False           False          False
## 2   False   False      False   False False           False          False
## 3   False   False      False   False False           False          False
## 4   False   False      False   False False           False          False
## 5   False   False      False   False False           False          False
## 6   False   False      False   False False           False          False
##   Turning_Loop Sunrise_Sunset Civil_Twilight Nautical_Twilight
## 1        False          Night          Night             Night
## 2        False          Night          Night             Night
## 3        False          Night          Night             Night
## 4        False          Night          Night             Night
## 5        False          Night          Night               Day
## 6        False            Day            Day               Day
##   Astronomical_Twilight Year Month Day Wday Hour X.Zipcode. X.Month.
## 1                 Night 2016     2   8    2    0    Zipcode    Month
## 2                 Night 2016     2   8    2    5    Zipcode    Month
## 3                   Day 2016     2   8    2    6    Zipcode    Month
## 4                   Day 2016     2   8    2    6    Zipcode    Month
## 5                   Day 2016     2   8    2    6    Zipcode    Month
## 6                   Day 2016     2   8    2    7    Zipcode    Month
str(accidents)
## 'data.frame':    1516064 obs. of  56 variables:
##  $ X                    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ ID                   : chr  "A-2716600" "A-2716601" "A-2716602" "A-2716603" ...
##  $ Severity             : int  3 2 2 2 2 3 2 2 2 2 ...
##  $ Date                 : chr  "2016-02-08" "2016-02-08" "2016-02-08" "2016-02-08" ...
##  $ Time                 : chr  "00:37:08" "05:56:20" "06:15:39" "06:15:39" ...
##  $ End_Time             : chr  "2016-02-08 06:37:08" "2016-02-08 11:56:20" "2016-02-08 12:15:39" "2016-02-08 12:15:39" ...
##  $ Start_Lat            : num  40.1 39.9 39.1 39.1 41.1 ...
##  $ Start_Lng            : num  -83.1 -84.1 -84.5 -84.5 -81.5 ...
##  $ End_Lat              : num  40.1 39.9 39.1 39.1 41.1 ...
##  $ End_Lng              : num  -83 -84 -84.5 -84.5 -81.5 ...
##  $ Distance.mi.         : num  3.23 0.747 0.055 0.219 0.123 ...
##  $ Description          : chr  "Between Sawmill Rd/Exit 20 and OH-315/Olentangy Riv Rd/Exit 22 - Accident." "At OH-4/OH-235/Exit 41 - Accident." "At I-71/US-50/Exit 1 - Accident." "At I-71/US-50/Exit 1 - Accident." ...
##  $ Number               : int  4000 4000 4000 4000 4000 4000 4000 1887 4000 4000 ...
##  $ Street               : chr  "Outerbelt E" "I-70 E" "I-75 S" "US-50 E" ...
##  $ Side                 : chr  "R" "R" "R" "R" ...
##  $ City                 : chr  "Dublin" "Dayton" "Cincinnati" "Cincinnati" ...
##  $ County               : chr  "Franklin" "Montgomery" "Hamilton" "Hamilton" ...
##  $ State                : chr  "OH" "OH" "OH" "OH" ...
##  $ Zipcode              : chr  "43017" "45424" "45203" "45202" ...
##  $ Country              : chr  "US" "US" "US" "US" ...
##  $ Timezone             : chr  "US/Eastern" "US/Eastern" "US/Eastern" "US/Eastern" ...
##  $ Airport_Code         : chr  "KOSU" "KFFO" "KLUK" "KLUK" ...
##  $ Weather_Timestamp    : chr  "2016-02-08 00:53:00" "2016-02-08 05:58:00" "2016-02-08 05:53:00" "2016-02-08 05:53:00" ...
##  $ Temperature.F.       : num  42.1 36.9 36 36 39 37 35.6 35.6 33.8 33.1 ...
##  $ Wind_Chill.F.        : num  36.1 57 57 57 57 29.8 29.2 29.2 57 30 ...
##  $ Humidity...          : int  58 91 97 97 55 93 100 100 100 92 ...
##  $ Pressure.in.         : num  29.8 29.7 29.7 29.7 29.6 ...
##  $ Visibility.mi.       : num  10 10 10 10 10 10 10 10 3 0.5 ...
##  $ Wind_Direction       : chr  "SW" "Calm" "Calm" "Calm" ...
##  $ Wind_Speed.mph.      : num  10.4 7 7 7 7 10.4 8.1 8.1 2.3 3.5 ...
##  $ Precipitation.in.    : num  0 0.02 0.02 0.02 0 0.01 0 0 0 0.08 ...
##  $ Weather_Condition    : chr  "Light Rain" "Light Rain" "Overcast" "Overcast" ...
##  $ Amenity              : chr  "False" "False" "False" "False" ...
##  $ Bump                 : chr  "False" "False" "False" "False" ...
##  $ Crossing             : chr  "False" "False" "False" "False" ...
##  $ Give_Way             : chr  "False" "False" "False" "False" ...
##  $ Junction             : chr  "False" "False" "True" "True" ...
##  $ No_Exit              : chr  "False" "False" "False" "False" ...
##  $ Railway              : chr  "False" "False" "False" "False" ...
##  $ Roundabout           : chr  "False" "False" "False" "False" ...
##  $ Station              : chr  "False" "False" "False" "False" ...
##  $ Stop                 : chr  "False" "False" "False" "False" ...
##  $ Traffic_Calming      : chr  "False" "False" "False" "False" ...
##  $ Traffic_Signal       : chr  "False" "False" "False" "False" ...
##  $ Turning_Loop         : chr  "False" "False" "False" "False" ...
##  $ Sunrise_Sunset       : chr  "Night" "Night" "Night" "Night" ...
##  $ Civil_Twilight       : chr  "Night" "Night" "Night" "Night" ...
##  $ Nautical_Twilight    : chr  "Night" "Night" "Night" "Night" ...
##  $ Astronomical_Twilight: chr  "Night" "Night" "Day" "Day" ...
##  $ Year                 : int  2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
##  $ Month                : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Day                  : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ Wday                 : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Hour                 : int  0 5 6 6 6 7 8 8 8 11 ...
##  $ X.Zipcode.           : chr  "Zipcode" "Zipcode" "Zipcode" "Zipcode" ...
##  $ X.Month.             : chr  "Month" "Month" "Month" "Month" ...
dim(accidents)
## [1] 1516064      56
accidents$Severity = as.factor(accidents$Severity)
accidents = accidents[-c(12,64,65)]

accidents2 <- sample(1:nrow(accidents), 50000, replace = FALSE)
accidents2 <- accidents[accidents2, ]
trainingIndices <- createDataPartition(accidents2$Severity, p = 0.7, list = FALSE)
training <- accidents2[trainingIndices, ]
testing <- accidents2[-trainingIndices, ]

head(accidents)
##   X        ID Severity       Date     Time            End_Time Start_Lat
## 1 1 A-2716600        3 2016-02-08 00:37:08 2016-02-08 06:37:08  40.10891
## 2 2 A-2716601        2 2016-02-08 05:56:20 2016-02-08 11:56:20  39.86542
## 3 3 A-2716602        2 2016-02-08 06:15:39 2016-02-08 12:15:39  39.10266
## 4 4 A-2716603        2 2016-02-08 06:15:39 2016-02-08 12:15:39  39.10148
## 5 5 A-2716604        2 2016-02-08 06:51:45 2016-02-08 12:51:45  41.06213
## 6 6 A-2716605        3 2016-02-08 07:53:43 2016-02-08 13:53:43  39.17239
##   Start_Lng  End_Lat   End_Lng Distance.mi. Number      Street Side       City
## 1 -83.09286 40.11206 -83.03187        3.230   4000 Outerbelt E    R     Dublin
## 2 -84.06280 39.86501 -84.04873        0.747   4000      I-70 E    R     Dayton
## 3 -84.52468 39.10209 -84.52396        0.055   4000      I-75 S    R Cincinnati
## 4 -84.52341 39.09841 -84.52241        0.219   4000     US-50 E    R Cincinnati
## 5 -81.53784 41.06217 -81.53547        0.123   4000      I-77 N    R      Akron
## 6 -84.49279 39.17048 -84.50180        0.500   4000      I-75 S    R Cincinnati
##       County State Zipcode Country   Timezone Airport_Code   Weather_Timestamp
## 1   Franklin    OH   43017      US US/Eastern         KOSU 2016-02-08 00:53:00
## 2 Montgomery    OH   45424      US US/Eastern         KFFO 2016-02-08 05:58:00
## 3   Hamilton    OH   45203      US US/Eastern         KLUK 2016-02-08 05:53:00
## 4   Hamilton    OH   45202      US US/Eastern         KLUK 2016-02-08 05:53:00
## 5     Summit    OH   44311      US US/Eastern         KAKR 2016-02-08 06:54:00
## 6   Hamilton    OH   45217      US US/Eastern         KLUK 2016-02-08 07:53:00
##   Temperature.F. Wind_Chill.F. Humidity... Pressure.in. Visibility.mi.
## 1           42.1          36.1          58        29.76             10
## 2           36.9          57.0          91        29.68             10
## 3           36.0          57.0          97        29.70             10
## 4           36.0          57.0          97        29.70             10
## 5           39.0          57.0          55        29.65             10
## 6           37.0          29.8          93        29.69             10
##   Wind_Direction Wind_Speed.mph. Precipitation.in. Weather_Condition Amenity
## 1             SW            10.4              0.00        Light Rain   False
## 2           Calm             7.0              0.02        Light Rain   False
## 3           Calm             7.0              0.02          Overcast   False
## 4           Calm             7.0              0.02          Overcast   False
## 5           Calm             7.0              0.00          Overcast   False
## 6            WSW            10.4              0.01        Light Rain   False
##    Bump Crossing Give_Way Junction No_Exit Railway Roundabout Station  Stop
## 1 False    False    False    False   False   False      False   False False
## 2 False    False    False    False   False   False      False   False False
## 3 False    False    False     True   False   False      False   False False
## 4 False    False    False     True   False   False      False   False False
## 5 False    False    False    False   False   False      False   False False
## 6 False    False    False    False   False   False      False   False False
##   Traffic_Calming Traffic_Signal Turning_Loop Sunrise_Sunset Civil_Twilight
## 1           False          False        False          Night          Night
## 2           False          False        False          Night          Night
## 3           False          False        False          Night          Night
## 4           False          False        False          Night          Night
## 5           False          False        False          Night          Night
## 6           False          False        False            Day            Day
##   Nautical_Twilight Astronomical_Twilight Year Month Day Wday Hour X.Zipcode.
## 1             Night                 Night 2016     2   8    2    0    Zipcode
## 2             Night                 Night 2016     2   8    2    5    Zipcode
## 3             Night                   Day 2016     2   8    2    6    Zipcode
## 4             Night                   Day 2016     2   8    2    6    Zipcode
## 5               Day                   Day 2016     2   8    2    6    Zipcode
## 6               Day                   Day 2016     2   8    2    7    Zipcode
##   X.Month.
## 1    Month
## 2    Month
## 3    Month
## 4    Month
## 5    Month
## 6    Month
set.seed(1991)
rf <-randomForest(Severity~.,data=training, ntree=500)
print(rf)
## 
## Call:
##  randomForest(formula = Severity ~ ., data = training, ntree = 500) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 7
## 
##         OOB estimate of  error rate: 15.71%
## Confusion matrix:
##     1     2    3   4 class.error
## 1 238   395   31   1  0.64210526
## 2  71 27286  386 228  0.02448965
## 3  15  2400 1109 161  0.69905020
## 4   3  1602  206 870  0.67549422
floor(sqrt(ncol(training) - 1))
## [1] 7
mtry <- tuneRF(training[-1],training$Severity, ntreeTry=500,
stepFactor=1.5,improve=0.01, trace=TRUE, plot=TRUE)
## mtry = 7  OOB error = 0.05% 
## Searching left ...
## mtry = 5     OOB error = 0.23% 
## -3.263158 0.01 
## Searching right ...
## mtry = 10    OOB error = 0.01% 
## 0.7894737 0.01 
## mtry = 15    OOB error = 0% 
## 0.75 0.01 
## mtry = 22    OOB error = 0% 
## 0 0.01

best.m <- mtry[mtry[, 2] == min(mtry[, 2]), 1]
print(mtry)
##        mtry     OOBError
## 5.OOB     5 0.0023141535
## 7.OOB     7 0.0005428261
## 10.OOB   10 0.0001142792
## 15.OOB   15 0.0000285698
## 22.OOB   22 0.0000285698
print(best.m)
## 15.OOB 22.OOB 
##     15     22
set.seed(1991)
rf <-randomForest(Severity~.,data=training, mtry=best.m, importance=TRUE,ntree=500)
print(rf)
## 
## Call:
##  randomForest(formula = Severity ~ ., data = training, mtry = best.m,      importance = TRUE, ntree = 500) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 15
## 
##         OOB estimate of  error rate: 15.54%
## Confusion matrix:
##     1     2    3    4 class.error
## 1 257   369   37    2  0.61353383
## 2  91 27122  456  302  0.03035287
## 3  21  2310 1174  180  0.68141113
## 4   7  1447  217 1010  0.62327490
#Evaluate variable importance
importance(rf)
##                                1         2          3            4
## X                      4.8219907 23.094654  0.8728140  -2.50926652
## ID                     4.5437521 24.966109 -0.8004541  -5.53167148
## Date                  23.1101450 37.705234 15.0145965  12.97410652
## Time                   4.4714434 58.291412 17.1124820   9.03764621
## End_Time              24.3598337 39.338220 14.2190994  12.22482145
## Start_Lat             21.8552786 38.617527 25.3779456  -0.91088774
## Start_Lng             17.9490321 36.418698 25.1012961  13.16475281
## End_Lat               22.2398524 38.400894 23.7764888   1.15063439
## End_Lng               20.0345261 36.894517 25.0333443  12.68217770
## Distance.mi.          43.0977440 83.453903 38.7485496 100.84258239
## Number                 2.9429016 29.456216 10.1122566   8.66523763
## Street                11.0463188 72.010604 37.3524610  36.47891251
## Side                  -1.8534833 20.474933 12.1687470  -4.37567012
## City                  10.8067484 23.802297 13.3972536   0.11264732
## County                12.9720080 18.205068 18.7466795   1.19241609
## State                 27.7484252 32.357843 28.5260808   8.19377614
## Zipcode               21.4116773 46.066835 27.0640303  15.24205124
## Country                0.0000000  0.000000  0.0000000   0.00000000
## Timezone              10.3936746 12.068947 13.4326463   7.06278768
## Airport_Code          15.6460774 28.809041 13.2758968   3.91855966
## Weather_Timestamp     22.0463762 37.468828  6.0861617   9.88567466
## Temperature.F.        -0.2243507 34.662531  1.4381435   0.71433798
## Wind_Chill.F.          1.5934365 30.953809 -0.5815765   2.67372870
## Humidity...            7.7314547 35.103981 -2.2835100   6.64300098
## Pressure.in.          13.0991655 50.297480 10.5338585  -3.59281250
## Visibility.mi.         2.3668208 12.613300  0.3434382   3.72268283
## Wind_Direction        -1.3495888 22.278323  2.4919653   3.14711078
## Wind_Speed.mph.       -1.3169168 21.656553  0.3324858   3.40634282
## Precipitation.in.      1.1319857  3.066885 -0.7584083  -1.58458875
## Weather_Condition      3.0069967 11.619510  3.4555140   0.88091589
## Amenity                1.3369352 -1.108658  0.5347562  -0.84805607
## Bump                   0.0000000  1.342974 -1.0010015   0.00000000
## Crossing              22.6849259 15.693086 -2.1852207   5.72203478
## Give_Way               0.0000000  1.995125  1.3687466   3.93988359
## Junction               6.9603393 11.909184  4.3518513  13.90415992
## No_Exit                0.0000000 -2.355656  2.6953638  -1.00100150
## Railway                1.4047352  1.987739 -7.3998584  -0.73173207
## Roundabout             0.0000000  0.000000  0.0000000   0.00000000
## Station                1.3515362  5.754554 -2.6072520   1.86639913
## Stop                  -0.3721685  4.152632 -0.2092429  -0.50548604
## Traffic_Calming        0.0000000  2.464307 -1.0010015   1.00100150
## Traffic_Signal        21.3611712 25.849222  7.0203346   1.36006407
## Turning_Loop           0.0000000  0.000000  0.0000000   0.00000000
## Sunrise_Sunset         4.5647691 19.746621 -0.3620615   3.44938717
## Civil_Twilight         6.3698016 21.264256 -2.0421620   4.74309569
## Nautical_Twilight      9.2288646 23.401505 -0.7773985   6.48003569
## Astronomical_Twilight  8.7853564 23.292451 -1.3710409   2.78211117
## Year                  11.4839159 11.526351  2.3819272   4.67070751
## Month                 10.5121548 11.338570  7.9936243   0.06417341
## Day                    0.8785365 13.276897  1.0756971   2.48918164
## Wday                   3.0721225 69.125418 -2.6674794   5.01903709
## Hour                   2.0695031 38.931364 13.0768389   6.41194441
## X.Zipcode.             0.0000000  0.000000  0.0000000   0.00000000
## X.Month.               0.0000000  0.000000  0.0000000   0.00000000
##                       MeanDecreaseAccuracy MeanDecreaseGini
## X                               22.7738559    348.070809435
## ID                              24.4348813    340.781207824
## Date                            37.2663329    616.129987807
## Time                            62.8617757    470.574289907
## End_Time                        38.3324955    652.565528474
## Start_Lat                       41.8237692    463.535957066
## Start_Lng                       37.5616967    638.094287925
## End_Lat                         41.5653210    461.588597717
## End_Lng                         38.1615925    613.749410721
## Distance.mi.                    94.3693642    798.113664875
## Number                          28.4707286    184.816421753
## Street                          78.3445982    542.075785812
## Side                            21.4767963     45.346733279
## City                            29.1861937    301.879080602
## County                          21.4102410    264.447085959
## State                           35.1244244    303.575301236
## Zipcode                         48.5106810    711.794170897
## Country                          0.0000000      0.000000000
## Timezone                        12.7823505    110.641953881
## Airport_Code                    33.7442168    278.748656513
## Weather_Timestamp               35.4894543    495.988484170
## Temperature.F.                  34.8900961    295.387156087
## Wind_Chill.F.                   31.3031219    177.391466766
## Humidity...                     33.6024974    314.994510519
## Pressure.in.                    50.0883461    370.524956679
## Visibility.mi.                  12.6957726     87.482718107
## Wind_Direction                  20.9627149    227.656772556
## Wind_Speed.mph.                 21.1497776    236.995711294
## Precipitation.in.                2.3263157     51.353763073
## Weather_Condition               12.7463850    166.440204304
## Amenity                         -0.8278739      6.069879578
## Bump                             0.8202088      0.635493103
## Crossing                        26.7349471     47.203790880
## Give_Way                         3.4674894      5.384013985
## Junction                        17.8862264     44.869257281
## No_Exit                         -1.8333017      1.381646405
## Railway                         -1.1283750      7.678839150
## Roundabout                       0.0000000      0.003666667
## Station                          5.0384145      8.421735855
## Stop                             3.5539506      7.698152863
## Traffic_Calming                  1.9038874      0.907204434
## Traffic_Signal                  32.3704927     58.867172956
## Turning_Loop                     0.0000000      0.000000000
## Sunrise_Sunset                  17.9205827     34.703993458
## Civil_Twilight                  19.7803978     42.625844420
## Nautical_Twilight               23.0657171     66.683851571
## Astronomical_Twilight           23.3549169     63.552287911
## Year                            11.6438091     91.193110846
## Month                           11.3376632    208.754838504
## Day                             13.3949423    276.529311167
## Wday                            65.0454712    248.635222001
## Hour                            42.0426659    249.311308188
## X.Zipcode.                       0.0000000      0.000000000
## X.Month.                         0.0000000      0.000000000
varImpPlot(rf)