Visualization and Discovery section
library(pastecs)
US_accident <- read.table("US_Accidents_Dec20_updated.csv", sep=",", header = T, fill = T)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## EOF within quoted string
summary(US_accident)
## ID Severity Start_Time End_Time
## Length:779138 Length:779138 Length:779138 Length:779138
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Start_Lat Start_Lng End_Lat End_Lng
## Length:779138 Length:779138 Length:779138 Length:779138
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Distance.mi. Description Number Street
## Length:779138 Length:779138 Length:779138 Length:779138
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Side City County State
## Length:779138 Length:779138 Length:779138 Length:779138
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Zipcode Country Timezone Airport_Code
## Length:779138 Length:779138 Length:779138 Length:779138
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Weather_Timestamp Temperature.F. Wind_Chill.F. Humidity...
## Length:779138 Length:779138 Length:779138 Length:779138
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Pressure.in. Visibility.mi. Wind_Direction Wind_Speed.mph.
## Length:779138 Length:779138 Length:779138 Length:779138
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Precipitation.in. Weather_Condition Amenity Bump
## Length:779138 Length:779138 Length:779138 Length:779138
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Crossing Give_Way Junction No_Exit
## Length:779138 Length:779138 Length:779138 Length:779138
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Railway Roundabout Station Stop
## Length:779138 Length:779138 Length:779138 Length:779138
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Traffic_Calming Traffic_Signal Turning_Loop Sunrise_Sunset
## Length:779138 Length:779138 Length:779138 Length:779138
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Civil_Twilight Nautical_Twilight Astronomical_Twilight
## Length:779138 Length:779138 Length:779138
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:pastecs':
##
## first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v stringr 1.4.0
## v tidyr 1.2.0 v forcats 0.5.1
## v readr 2.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::extract() masks pastecs::extract()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks pastecs::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks pastecs::last()
library(readr)
df <- read_csv("US_Accidents_Dec20_updated.csv", col_types = cols(.default = col_character())) %>%
type_convert()
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_character(),
## Severity = col_double(),
## Start_Time = col_datetime(format = ""),
## End_Time = col_datetime(format = ""),
## Start_Lat = col_double(),
## Start_Lng = col_double(),
## End_Lat = col_double(),
## End_Lng = col_double(),
## `Distance(mi)` = col_double(),
## Number = col_double(),
## Weather_Timestamp = col_datetime(format = ""),
## `Temperature(F)` = col_double(),
## `Wind_Chill(F)` = col_double(),
## `Humidity(%)` = col_double(),
## `Pressure(in)` = col_double(),
## `Visibility(mi)` = col_double(),
## `Wind_Speed(mph)` = col_double(),
## `Precipitation(in)` = col_double(),
## Amenity = col_logical(),
## Bump = col_logical(),
## Crossing = col_logical()
## # ... with 10 more columns
## )
## i Use `spec()` for the full column specifications.
head(df)
## # A tibble: 6 x 47
## ID Severity Start_Time End_Time Start_Lat Start_Lng
## <chr> <dbl> <dttm> <dttm> <dbl> <dbl>
## 1 A-2716600 3 2016-02-08 00:37:08 2016-02-08 06:37:08 40.1 -83.1
## 2 A-2716601 2 2016-02-08 05:56:20 2016-02-08 11:56:20 39.9 -84.1
## 3 A-2716602 2 2016-02-08 06:15:39 2016-02-08 12:15:39 39.1 -84.5
## 4 A-2716603 2 2016-02-08 06:15:39 2016-02-08 12:15:39 39.1 -84.5
## 5 A-2716604 2 2016-02-08 06:51:45 2016-02-08 12:51:45 41.1 -81.5
## 6 A-2716605 3 2016-02-08 07:53:43 2016-02-08 13:53:43 39.2 -84.5
## # ... with 41 more variables: End_Lat <dbl>, End_Lng <dbl>,
## # `Distance(mi)` <dbl>, Description <chr>, Number <dbl>, Street <chr>,
## # Side <chr>, City <chr>, County <chr>, State <chr>, Zipcode <chr>,
## # Country <chr>, Timezone <chr>, Airport_Code <chr>,
## # Weather_Timestamp <dttm>, `Temperature(F)` <dbl>, `Wind_Chill(F)` <dbl>,
## # `Humidity(%)` <dbl>, `Pressure(in)` <dbl>, `Visibility(mi)` <dbl>,
## # Wind_Direction <chr>, `Wind_Speed(mph)` <dbl>, ...
summary(df)
## ID Severity Start_Time
## Length:1516064 Min. :1.000 Min. :2016-02-08 00:37:08
## Class :character 1st Qu.:2.000 1st Qu.:2018-07-17 14:41:25
## Mode :character Median :2.000 Median :2020-01-24 11:16:33
## Mean :2.239 Mean :2019-07-15 07:01:48
## 3rd Qu.:2.000 3rd Qu.:2020-10-22 13:01:30
## Max. :4.000 Max. :2020-12-31 23:28:56
##
## End_Time Start_Lat Start_Lng
## Min. :2016-02-08 06:37:08 Min. :24.57 Min. :-124.50
## 1st Qu.:2018-07-17 17:13:14 1st Qu.:33.85 1st Qu.:-118.21
## Median :2020-01-24 13:38:15 Median :37.35 Median : -94.38
## Mean :2019-07-15 11:42:20 Mean :36.90 Mean : -98.60
## 3rd Qu.:2020-10-22 17:50:19 3rd Qu.:40.73 3rd Qu.: -80.87
## Max. :2021-01-01 00:00:00 Max. :49.00 Max. : -67.11
##
## End_Lat End_Lng Distance(mi) Description
## Min. :24.57 Min. :-124.50 Min. : 0.0000 Length:1516064
## 1st Qu.:33.85 1st Qu.:-118.21 1st Qu.: 0.0000 Class :character
## Median :37.35 Median : -94.38 Median : 0.1780 Mode :character
## Mean :36.90 Mean : -98.60 Mean : 0.5873
## 3rd Qu.:40.73 3rd Qu.: -80.87 3rd Qu.: 0.5940
## Max. :49.08 Max. : -67.11 Max. :155.1860
##
## Number Street Side City
## Min. : 0 Length:1516064 Length:1516064 Length:1516064
## 1st Qu.: 1212 Class :character Class :character Class :character
## Median : 4000 Mode :character Mode :character Mode :character
## Mean : 8908
## 3rd Qu.: 10100
## Max. :9999997
## NA's :1046095
## County State Zipcode Country
## Length:1516064 Length:1516064 Length:1516064 Length:1516064
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Timezone Airport_Code Weather_Timestamp
## Length:1516064 Length:1516064 Min. :2016-02-08 00:53:00
## Class :character Class :character 1st Qu.:2018-07-10 10:55:30
## Mode :character Mode :character Median :2020-01-22 05:53:00
## Mean :2019-07-12 00:02:11
## 3rd Qu.:2020-10-21 04:54:00
## Max. :2020-12-31 23:35:00
## NA's :30264
## Temperature(F) Wind_Chill(F) Humidity(%) Pressure(in)
## Min. :-89.00 Min. :-89.0 Min. : 1.00 Min. : 0.00
## 1st Qu.: 47.00 1st Qu.: 40.8 1st Qu.: 48.00 1st Qu.:29.44
## Median : 61.00 Median : 57.0 Median : 68.00 Median :29.88
## Mean : 59.58 Mean : 55.1 Mean : 64.66 Mean :29.55
## 3rd Qu.: 73.00 3rd Qu.: 71.0 3rd Qu.: 84.00 3rd Qu.:30.04
## Max. :170.60 Max. :113.0 Max. :100.00 Max. :58.04
## NA's :43033 NA's :449316 NA's :45509 NA's :36274
## Visibility(mi) Wind_Direction Wind_Speed(mph) Precipitation(in)
## Min. : 0.00 Length:1516064 Min. : 0.00 Min. : 0
## 1st Qu.: 10.00 Class :character 1st Qu.: 4.60 1st Qu.: 0
## Median : 10.00 Mode :character Median : 7.00 Median : 0
## Mean : 9.13 Mean : 7.63 Mean : 0
## 3rd Qu.: 10.00 3rd Qu.: 10.40 3rd Qu.: 0
## Max. :140.00 Max. :984.00 Max. :24
## NA's :44211 NA's :128862 NA's :510549
## Weather_Condition Amenity Bump Crossing
## Length:1516064 Mode :logical Mode :logical Mode :logical
## Class :character FALSE:1503661 FALSE:1515803 FALSE:1429681
## Mode :character TRUE :12403 TRUE :261 TRUE :86383
##
##
##
##
## Give_Way Junction No_Exit Railway
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:1512809 FALSE:1311566 FALSE:1514335 FALSE:1503480
## TRUE :3255 TRUE :204498 TRUE :1729 TRUE :12584
##
##
##
##
## Roundabout Station Stop Traffic_Calming
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:1516013 FALSE:1487917 FALSE:1498368 FALSE:1515575
## TRUE :51 TRUE :28147 TRUE :17696 TRUE :489
##
##
##
##
## Traffic_Signal Turning_Loop Sunrise_Sunset Civil_Twilight
## Mode :logical Mode :logical Length:1516064 Length:1516064
## FALSE:1346095 FALSE:1516064 Class :character Class :character
## TRUE :169969 Mode :character Mode :character
##
##
##
##
## Nautical_Twilight Astronomical_Twilight
## Length:1516064 Length:1516064
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
str(df)
## spec_tbl_df [1,516,064 x 47] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ID : chr [1:1516064] "A-2716600" "A-2716601" "A-2716602" "A-2716603" ...
## $ Severity : num [1:1516064] 3 2 2 2 2 3 2 2 2 2 ...
## $ Start_Time : POSIXct[1:1516064], format: "2016-02-08 00:37:08" "2016-02-08 05:56:20" ...
## $ End_Time : POSIXct[1:1516064], format: "2016-02-08 06:37:08" "2016-02-08 11:56:20" ...
## $ Start_Lat : num [1:1516064] 40.1 39.9 39.1 39.1 41.1 ...
## $ Start_Lng : num [1:1516064] -83.1 -84.1 -84.5 -84.5 -81.5 ...
## $ End_Lat : num [1:1516064] 40.1 39.9 39.1 39.1 41.1 ...
## $ End_Lng : num [1:1516064] -83 -84 -84.5 -84.5 -81.5 ...
## $ Distance(mi) : num [1:1516064] 3.23 0.747 0.055 0.219 0.123 ...
## $ Description : chr [1:1516064] "Between Sawmill Rd/Exit 20 and OH-315/Olentangy Riv Rd/Exit 22 - Accident." "At OH-4/OH-235/Exit 41 - Accident." "At I-71/US-50/Exit 1 - Accident." "At I-71/US-50/Exit 1 - Accident." ...
## $ Number : num [1:1516064] NA NA NA NA NA ...
## $ Street : chr [1:1516064] "Outerbelt E" "I-70 E" "I-75 S" "US-50 E" ...
## $ Side : chr [1:1516064] "R" "R" "R" "R" ...
## $ City : chr [1:1516064] "Dublin" "Dayton" "Cincinnati" "Cincinnati" ...
## $ County : chr [1:1516064] "Franklin" "Montgomery" "Hamilton" "Hamilton" ...
## $ State : chr [1:1516064] "OH" "OH" "OH" "OH" ...
## $ Zipcode : chr [1:1516064] "43017" "45424" "45203" "45202" ...
## $ Country : chr [1:1516064] "US" "US" "US" "US" ...
## $ Timezone : chr [1:1516064] "US/Eastern" "US/Eastern" "US/Eastern" "US/Eastern" ...
## $ Airport_Code : chr [1:1516064] "KOSU" "KFFO" "KLUK" "KLUK" ...
## $ Weather_Timestamp : POSIXct[1:1516064], format: "2016-02-08 00:53:00" "2016-02-08 05:58:00" ...
## $ Temperature(F) : num [1:1516064] 42.1 36.9 36 36 39 37 35.6 35.6 33.8 33.1 ...
## $ Wind_Chill(F) : num [1:1516064] 36.1 NA NA NA NA 29.8 29.2 29.2 NA 30 ...
## $ Humidity(%) : num [1:1516064] 58 91 97 97 55 93 100 100 100 92 ...
## $ Pressure(in) : num [1:1516064] 29.8 29.7 29.7 29.7 29.6 ...
## $ Visibility(mi) : num [1:1516064] 10 10 10 10 10 10 10 10 3 0.5 ...
## $ Wind_Direction : chr [1:1516064] "SW" "Calm" "Calm" "Calm" ...
## $ Wind_Speed(mph) : num [1:1516064] 10.4 NA NA NA NA 10.4 8.1 8.1 2.3 3.5 ...
## $ Precipitation(in) : num [1:1516064] 0 0.02 0.02 0.02 NA 0.01 NA NA NA 0.08 ...
## $ Weather_Condition : chr [1:1516064] "Light Rain" "Light Rain" "Overcast" "Overcast" ...
## $ Amenity : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Bump : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Crossing : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Give_Way : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Junction : logi [1:1516064] FALSE FALSE TRUE TRUE FALSE FALSE ...
## $ No_Exit : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Railway : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Roundabout : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Station : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Stop : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Traffic_Calming : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Traffic_Signal : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Turning_Loop : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Sunrise_Sunset : chr [1:1516064] "Night" "Night" "Night" "Night" ...
## $ Civil_Twilight : chr [1:1516064] "Night" "Night" "Night" "Night" ...
## $ Nautical_Twilight : chr [1:1516064] "Night" "Night" "Night" "Night" ...
## $ Astronomical_Twilight: chr [1:1516064] "Night" "Night" "Day" "Day" ...
## - attr(*, "problems")=<externalptr>
(colMeans(is.na(df)))*100
## ID Severity Start_Time
## 0.000000000 0.000000000 0.000000000
## End_Time Start_Lat Start_Lng
## 0.000000000 0.000000000 0.000000000
## End_Lat End_Lng Distance(mi)
## 0.000000000 0.000000000 0.000000000
## Description Number Street
## 0.000000000 69.000715009 0.000000000
## Side City County
## 0.000000000 0.005474703 0.000000000
## State Zipcode Country
## 0.000000000 0.061672858 0.000000000
## Timezone Airport_Code Weather_Timestamp
## 0.151840556 0.280199253 1.996221795
## Temperature(F) Wind_Chill(F) Humidity(%)
## 2.838468561 29.637007409 3.001786204
## Pressure(in) Visibility(mi) Wind_Direction
## 2.392643055 2.916169766 2.760965236
## Wind_Speed(mph) Precipitation(in) Weather_Condition
## 8.499773097 33.675952994 2.902713870
## Amenity Bump Crossing
## 0.000000000 0.000000000 0.000000000
## Give_Way Junction No_Exit
## 0.000000000 0.000000000 0.000000000
## Railway Roundabout Station
## 0.000000000 0.000000000 0.000000000
## Stop Traffic_Calming Traffic_Signal
## 0.000000000 0.000000000 0.000000000
## Turning_Loop Sunrise_Sunset Civil_Twilight
## 0.000000000 0.005474703 0.005474703
## Nautical_Twilight Astronomical_Twilight
## 0.005474703 0.005474703
my_data <- df[ , c("Severity", "Start_Time", "End_Time", "Temperature(F)", "Humidity(%)", "Visibility(mi)", "Wind_Speed(mph)", "Start_Lat", "Start_Lng", "Zipcode", "Pressure(in)", "Junction", "Sunrise_Sunset", "Distance(mi)", "Wind_Chill(F)", "Wind_Direction", "Precipitation(in)", "State")]
library(pastecs)
stat.desc(my_data)
## Severity Start_Time End_Time Temperature(F) Humidity(%)
## nbr.val 1.516064e+06 1.516064e+06 1.516064e+06 1.473031e+06 1.470555e+06
## nbr.null 0.000000e+00 0.000000e+00 0.000000e+00 5.650000e+02 0.000000e+00
## nbr.na 0.000000e+00 0.000000e+00 0.000000e+00 4.303300e+04 4.550900e+04
## min 1.000000e+00 1.454892e+09 1.454913e+09 -8.900000e+01 1.000000e+00
## max 4.000000e+00 1.609457e+09 1.609459e+09 1.706000e+02 1.000000e+02
## range 3.000000e+00 1.545655e+08 1.545458e+08 2.596000e+02 9.900000e+01
## sum 3.393906e+06 2.369872e+15 2.369898e+15 8.776996e+07 9.508550e+07
## median 2.000000e+00 1.579865e+09 1.579873e+09 6.100000e+01 6.800000e+01
## mean 2.238630e+00 1.563174e+09 1.563191e+09 5.958460e+01 6.465960e+01
## SE.mean 4.939132e-04 3.640382e+04 3.639897e+04 1.505594e-02 1.918079e-02
## CI.mean.0.95 9.680528e-04 7.135024e+04 7.134074e+04 2.950912e-02 3.759369e-02
## var 3.698441e-01 2.009146e+15 2.008611e+15 3.339085e+02 5.410213e+02
## std.dev 6.081481e-01 4.482350e+07 4.481753e+07 1.827316e+01 2.325986e+01
## coef.var 2.716609e-01 2.867467e-02 2.867054e-02 3.066760e-01 3.597279e-01
## Visibility(mi) Wind_Speed(mph) Start_Lat Start_Lng Zipcode
## nbr.val 1.471853e+06 1.387202e+06 1.516064e+06 1.516064e+06 NA
## nbr.null 1.410000e+03 2.028770e+05 0.000000e+00 0.000000e+00 NA
## nbr.na 4.421100e+04 1.288620e+05 0.000000e+00 0.000000e+00 NA
## min 0.000000e+00 0.000000e+00 2.457022e+01 -1.244976e+02 NA
## max 1.400000e+02 9.840000e+02 4.900058e+01 -6.711317e+01 NA
## range 1.400000e+02 9.840000e+02 2.443036e+01 5.738440e+01 NA
## sum 1.344060e+07 1.058548e+07 5.594361e+07 -1.494827e+08 NA
## median 1.000000e+01 7.000000e+00 3.735113e+01 -9.438100e+01 NA
## mean 9.131755e+00 7.630812e+00 3.690056e+01 -9.859919e+01 NA
## SE.mean 2.381399e-03 4.786370e-03 4.195333e-03 1.502172e-02 NA
## CI.mean.0.95 4.667460e-03 9.381121e-03 8.222709e-03 2.944205e-02 NA
## var 8.346970e+00 3.177988e+01 2.668397e+01 3.421028e+02 NA
## std.dev 2.889112e+00 5.637364e+00 5.165653e+00 1.849602e+01 NA
## coef.var 3.163808e-01 7.387633e-01 1.399885e-01 -1.875880e-01 NA
## Pressure(in) Junction Sunrise_Sunset Distance(mi) Wind_Chill(F)
## nbr.val 1.479790e+06 NA NA 1.516064e+06 1.066748e+06
## nbr.null 1.000000e+00 NA NA 4.006130e+05 5.030000e+02
## nbr.na 3.627400e+04 NA NA 0.000000e+00 4.493160e+05
## min 0.000000e+00 NA NA 0.000000e+00 -8.900000e+01
## max 5.804000e+01 NA NA 1.551860e+02 1.130000e+02
## range 5.804000e+01 NA NA 1.551860e+02 2.020000e+02
## sum 4.373513e+07 NA NA 8.903264e+05 5.878823e+07
## median 2.988000e+01 NA NA 1.780000e-01 5.700000e+01
## mean 2.955495e+01 NA NA 5.872617e-01 5.510976e+01
## SE.mean 8.358277e-04 NA NA 1.325979e-03 2.045568e-02
## CI.mean.0.95 1.638194e-03 NA NA 2.598874e-03 4.009245e-02
## var 1.033793e+00 NA NA 2.665576e+00 4.463647e+02
## std.dev 1.016756e+00 NA NA 1.632659e+00 2.112735e+01
## coef.var 3.440222e-02 NA NA 2.780122e+00 3.833685e-01
## Wind_Direction Precipitation(in) State
## nbr.val NA 1.005515e+06 NA
## nbr.null NA 9.034290e+05 NA
## nbr.na NA 5.105490e+05 NA
## min NA 0.000000e+00 NA
## max NA 2.400000e+01 NA
## range NA 2.400000e+01 NA
## sum NA 8.524610e+03 NA
## median NA 0.000000e+00 NA
## mean NA 8.477855e-03 NA
## SE.mean NA 1.289617e-04 NA
## CI.mean.0.95 NA 2.527606e-04 NA
## var NA 1.672284e-02 NA
## std.dev NA 1.293168e-01 NA
## coef.var NA 1.525348e+01 NA
library(tidyr)
library(ggplot2)
library(KernSmooth)
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
library(purrr)
library(dplyr)
options(scipen=999)
hist(my_data$'Severity',
breaks=seq(0,5,1),
col="brown1",
main="Histogram of Severity",
xlab="Severity",
ylab="Frequency",
xlim=c(0,4)
)

hist(my_data$'Start_Time',
breaks = 40,
col="goldenrod1",
main="Histogram of Start_Time",
xlab="Start_Time",
ylab="Frequency",
)
## Warning in breaks[-1L] + breaks[-nB]: NAs produced by integer overflow

hist(my_data$'End_Time',
breaks = 40,
col="burlywood1",
main="Histogram of End_Time",
xlab="End_Time",
ylab="Frequency",
)
## Warning in breaks[-1L] + breaks[-nB]: NAs produced by integer overflow

hist(my_data$'Temperature(F)',
breaks=40,
col="cornflowerblue",
main="Histogram of Temperature",
xlab="Temperature (Deg. F)",
ylab="Frequency",
font.main = 3
)

hist(my_data$'Humidity(%)',
breaks=40,
col="lightblue",
main="Histogram of Humidity",
xlab="Humidity(%)",
ylab="Frequency",
)

hist(my_data$'Visibility(mi)',
breaks=40,
col="azure4",
main="Histogram of Visibility",
xlim=c(0,150),
xlab="Visibility(mi)",
ylab="Frequency",
)

hist(my_data$'Wind_Speed(mph)',
breaks=40,
col="coral",
main="Histogram of Wind_Speed",
xlab="Wind_Speed(mph)",
ylab="Frequency",
)

hist(my_data$'Pressure(in)',
breaks=40,
col="aquamarine3",
main="Histogram of Pressure",
xlab="Pressure(in)",
ylab="Frequency",
)

hist(my_data$'Distance(mi)',
breaks=40,
col="darkorchid1",
main="Histogram of Distance",
xlab="Distance(mi)",
ylab="Frequency",
)

hist(my_data$"Wind_Chill(F)",
breaks=40,
col="darkseagreen1",
main="Histogram of Wind_Chill",
xlab="Wind_Chill(F)",
ylab="Frequency",
)

hist(my_data$"Precipitation(in)",
breaks=40,
col="slategray1",
main="Histogram of Precipitation",
xlab="Precipitation(in)",
ylab="Frequency",
)

S1 <- my_data %>%
group_by(Wind_Direction) %>%
summarise(n = n())
S1
## # A tibble: 25 x 2
## Wind_Direction n
## <chr> <int>
## 1 Calm 79192
## 2 CALM 202870
## 3 E 52435
## 4 East 24064
## 5 ENE 51257
## 6 ESE 51295
## 7 N 53718
## 8 NE 48355
## 9 NNE 46509
## 10 NNW 68014
## # ... with 15 more rows
options(scipen=999)
ggplot(S1, aes(x=S1$Wind_Direction, y=S1$n)) + geom_bar(stat="identity", position = position_dodge(width=2)) +
labs(x="Wind_Direction", y="Frequency")
## Warning: Use of `S1$Wind_Direction` is discouraged. Use `Wind_Direction`
## instead.
## Warning: Use of `S1$n` is discouraged. Use `n` instead.

S2 <- my_data %>%
group_by(State) %>%
summarise(n = n())
S2
## # A tibble: 49 x 2
## State n
## <chr> <int>
## 1 AL 9375
## 2 AR 4373
## 3 AZ 30185
## 4 CA 448833
## 5 CO 19809
## 6 CT 15194
## 7 DC 3788
## 8 DE 2331
## 9 FL 153007
## 10 GA 31111
## # ... with 39 more rows
options(scipen=999)
ggplot(S2, aes(x=S2$State, y=S2$n)) + geom_bar(stat="identity", position = position_dodge(width=2)) +
labs(x="State", y="Frequency")
## Warning: Use of `S2$State` is discouraged. Use `State` instead.
## Warning: Use of `S2$n` is discouraged. Use `n` instead.

library(rgdal)
## Loading required package: sp
## Please note that rgdal will be retired by the end of 2023,
## plan transition to sf/stars/terra functions using GDAL and PROJ
## at your earliest convenience.
##
## rgdal: version: 1.5-28, (SVN revision 1158)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.2.1, released 2020/12/29
## Path to GDAL shared files: C:/Users/onecouple/Documents/R/win-library/4.1/rgdal/gdal
## GDAL binary built with GEOS: TRUE
## Loaded PROJ runtime: Rel. 7.2.1, January 1st, 2021, [PJ_VERSION: 721]
## Path to PROJ shared files: C:/Users/onecouple/Documents/R/win-library/4.1/rgdal/proj
## PROJ CDN enabled: FALSE
## Linking to sp version:1.4-6
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading sp or rgdal.
## Overwritten PROJ_LIB was C:/Users/onecouple/Documents/R/win-library/4.1/rgdal/proj
library(shiny)
library(purrr)
library(usmap)
library(ggplot2)
library(dplyr)
my_data %>% select(Start_Time, End_Time) %>% head(5)
## # A tibble: 5 x 2
## Start_Time End_Time
## <dttm> <dttm>
## 1 2016-02-08 00:37:08 2016-02-08 06:37:08
## 2 2016-02-08 05:56:20 2016-02-08 11:56:20
## 3 2016-02-08 06:15:39 2016-02-08 12:15:39
## 4 2016-02-08 06:15:39 2016-02-08 12:15:39
## 5 2016-02-08 06:51:45 2016-02-08 12:51:45
library(stringr)
library(tidyr)
library(dplyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
accidents_time <- my_data %>%
mutate(Duration = as.numeric(End_Time - Start_Time)) %>%
filter(!(Duration < 0)) %>%
separate(Start_Time, into = c("Date", "Time"), sep = " ") %>%
mutate("Year" = str_sub(Date, 1, 4), "Month" = str_sub(Date, 6, 7),
"Day" = str_sub(Date, 9, 10), "Wday" = as.character(wday(Date)),
"Hour" = str_sub(Time, 1, 2)) %>%
select(-c("Date", "Time", "End_Time")) %>%
select(Severity, Year, Month, Day, Hour, Wday, Duration)
head(accidents_time)
## # A tibble: 6 x 7
## Severity Year Month Day Hour Wday Duration
## <dbl> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 3 2016 02 08 00 2 360
## 2 2 2016 02 08 05 2 360
## 3 2 2016 02 08 06 2 360
## 4 2 2016 02 08 06 2 360
## 5 2 2016 02 08 06 2 360
## 6 3 2016 02 08 07 2 360
library(ggplot2)
accidents_happenHour <- accidents_time %>% count(Hour)
accidents_happenHour
## # A tibble: 24 x 2
## Hour n
## <chr> <int>
## 1 00 46125
## 2 01 43434
## 3 02 38801
## 4 03 32175
## 5 04 29478
## 6 05 40105
## 7 06 57859
## 8 07 73938
## 9 08 75802
## 10 09 59744
## # ... with 14 more rows
p <- ggplot(accidents_happenHour, aes(Hour, n))
p + geom_point(aes(color = n)) + labs(x = "Hour of a day", y = "Number of accidents")

accidents_severity <- accidents_time %>%
group_by(Hour) %>%
summarise(mean(Severity))
accidents_severity
## # A tibble: 24 x 2
## Hour `mean(Severity)`
## <chr> <dbl>
## 1 00 2.26
## 2 01 2.18
## 3 02 2.21
## 4 03 2.27
## 5 04 2.31
## 6 05 2.32
## 7 06 2.27
## 8 07 2.25
## 9 08 2.24
## 10 09 2.27
## # ... with 14 more rows
accident_summary <- merge(accidents_happenHour, accidents_severity)
accident_summary <- accident_summary %>% rename(Average_Severity = "mean(Severity)")
accident_summary
## Hour n Average_Severity
## 1 00 46125 2.256629
## 2 01 43434 2.179168
## 3 02 38801 2.206593
## 4 03 32175 2.272789
## 5 04 29478 2.306805
## 6 05 40105 2.320035
## 7 06 57859 2.272870
## 8 07 73938 2.248032
## 9 08 75802 2.242131
## 10 09 59744 2.272078
## 11 10 50442 2.285813
## 12 11 51884 2.266074
## 13 12 72283 2.222085
## 14 13 83700 2.198076
## 15 14 90162 2.204543
## 16 15 100074 2.223345
## 17 16 105559 2.227446
## 18 17 108011 2.225986
## 19 18 91413 2.235459
## 20 19 65190 2.243688
## 21 20 53515 2.248024
## 22 21 48988 2.231342
## 23 22 49156 2.232973
## 24 23 48226 2.232613
options(scipen=999)
ggplot(data = accident_summary) +
geom_col(mapping=aes(x=Hour, y=n, fill=Average_Severity)) +
scale_fill_distiller(palette="Reds", trans= "reverse") +
labs(
title = "Amount and Severity of Car Accidents by hour",
x = "Houro of a day",
y = "Number of accidents",
caption = "A Countrywide Traffic Accident Dataset, 2016-2020.",
fill = "Average Severity") +
scale_y_continuous(expand = expansion(mult = c(0, .1)))

library(stringr)
library(tidyr)
library(dplyr)
library(lubridate)
library(ggplot2)
accidents_day <- accidents_time %>% count(Wday)
accidents_day
## # A tibble: 7 x 2
## Wday n
## <chr> <int>
## 1 1 123775
## 2 2 235831
## 3 3 250900
## 4 4 255775
## 5 5 258036
## 6 6 254127
## 7 7 137620
options(scipen=999)
p <- ggplot(accidents_day, aes(Wday, n))
p + geom_point(aes(color = n))+
labs(x = "Day of a week", y = "Amount of accidents")

accidents_severity_day <- accidents_time %>%
group_by(Wday) %>%
summarise(mean(Severity))
accidents_severity_day
## # A tibble: 7 x 2
## Wday `mean(Severity)`
## <chr> <dbl>
## 1 1 2.27
## 2 2 2.24
## 3 3 2.23
## 4 4 2.23
## 5 5 2.23
## 6 6 2.24
## 7 7 2.26
accident_summary2 <- merge(accidents_day, accidents_severity_day)
accident_summary2 <- accident_summary2 %>% rename(Severity_mean = "mean(Severity)")
accident_summary2
## Wday n Severity_mean
## 1 1 123775 2.271436
## 2 2 235831 2.238637
## 3 3 250900 2.234189
## 4 4 255775 2.227208
## 5 5 258036 2.228050
## 6 6 254127 2.236960
## 7 7 137620 2.261357
options(scipen=999)
ggplot(data = accident_summary2) +
geom_col(mapping=aes(x=Wday, y=n, fill=Severity_mean)) +
scale_fill_distiller(palette="Blues", trans= "reverse") +
labs(
title = "Car Accidents each Day of the Week",
x = "Day of the week",
y = "Number of accidents",
caption = "A Countrywide Traffic Accident Dataset, 2016-2020.",
fill = "Severity_mean") +
scale_y_continuous(expand = expansion(mult = c(0, .1)))

library(stringr)
library(tidyr)
library(dplyr)
library(lubridate)
library(ggplot2)
accidents_year <- accidents_time %>% count(Year)
accidents_year
## # A tibble: 5 x 2
## Year n
## <chr> <int>
## 1 2016 129325
## 2 2017 170099
## 3 2018 166936
## 4 2019 261772
## 5 2020 787932
options(scipen=999)
p <- ggplot(accidents_year, aes(Year, n))
p + geom_point(aes(color = n)) + labs(x = "Year", y = "Amount of accidents")

accidents_severity_year <- accidents_time %>%
group_by(Year) %>%
summarise(mean(Severity))
accidents_severity_year
## # A tibble: 5 x 2
## Year `mean(Severity)`
## <chr> <dbl>
## 1 2016 2.39
## 2 2017 2.46
## 3 2018 2.49
## 4 2019 2.30
## 5 2020 2.09
options(scipen=999)
accident_summary3 <- merge(accidents_year, accidents_severity_year)
accident_summary3 <- accident_summary3 %>% rename(Severity_mean_year = "mean(Severity)")
accident_summary3
## Year n Severity_mean_year
## 1 2016 129325 2.393559
## 2 2017 170099 2.463213
## 3 2018 166936 2.485617
## 4 2019 261772 2.303665
## 5 2020 787932 2.090783
ggplot(data = accident_summary3) +
geom_col(mapping=aes(x=Year, y=n, fill=Severity_mean_year)) +
scale_fill_distiller(palette="Purples", trans= "reverse") +
labs(
title = "Car Accidents from 2016 to 2020",
x = "Year",
y = "Amount of accidents",
caption = "A Countrywide Traffic Accident Dataset, 2016-2020.",
fill = "Severity_mean_year") +
scale_y_continuous(expand = expansion(mult = c(0, .1)))

library(stringr)
library(tidyr)
library(dplyr)
library(lubridate)
library(ggplot2)
accidents_month <- accidents_time %>% count(Month)
accidents_month
## # A tibble: 12 x 2
## Month n
## <chr> <int>
## 1 01 88540
## 2 02 82419
## 3 03 96802
## 4 04 107007
## 5 05 108195
## 6 06 113048
## 7 07 53650
## 8 08 62903
## 9 09 122906
## 10 10 181074
## 11 11 222031
## 12 12 277489
options(scipen=999)
p <- ggplot(accidents_month, aes(Month, n))
p + geom_point(aes(color = n)) + labs(x = "Month", y = "Amount of accidents")

accidents_severity_month <- accidents_time %>%
group_by(Month) %>%
summarise(mean(Severity))
accidents_severity_month
## # A tibble: 12 x 2
## Month `mean(Severity)`
## <chr> <dbl>
## 1 01 2.33
## 2 02 2.31
## 3 03 2.30
## 4 04 2.28
## 5 05 2.32
## 6 06 2.34
## 7 07 2.47
## 8 08 2.40
## 9 09 2.22
## 10 10 2.17
## 11 11 2.14
## 12 12 2.12
options(scipen=999)
accident_summary4 <- merge(accidents_month, accidents_severity_month)
accident_summary4 <- accident_summary4 %>% rename(Severity_mean_month = "mean(Severity)")
accident_summary4
## Month n Severity_mean_month
## 1 01 88540 2.333860
## 2 02 82419 2.309067
## 3 03 96802 2.303041
## 4 04 107007 2.283514
## 5 05 108195 2.321013
## 6 06 113048 2.343447
## 7 07 53650 2.467866
## 8 08 62903 2.402350
## 9 09 122906 2.216320
## 10 10 181074 2.171383
## 11 11 222031 2.139354
## 12 12 277489 2.124484
ggplot(data = accident_summary4) +
geom_col(mapping=aes(x=Month, y=n, fill=Severity_mean_month)) +
scale_fill_distiller(palette="Oranges", trans= "reverse") +
labs(
title = "Car Accidents by Month",
x = "Month",
y = "Amount of accidents",
caption = "A Countrywide Traffic Accident Dataset, 2016-2020.",
fill = "Severity_mean_month") +
scale_y_continuous(expand = expansion(mult = c(0, .1)))

library(ggplot2)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(wordcloud2)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
accidents_weather <- df %>% count(Weather_Condition)
accidents_weather
## # A tibble: 117 x 2
## Weather_Condition n
## <chr> <int>
## 1 Blowing Dust 78
## 2 Blowing Dust / Windy 79
## 3 Blowing Snow 144
## 4 Blowing Snow / Windy 63
## 5 Clear 180223
## 6 Cloudy 161291
## 7 Cloudy / Windy 3300
## 8 Drifting Snow 1
## 9 Drizzle 1023
## 10 Drizzle / Windy 3
## # ... with 107 more rows
text <- df$Weather_Condition
docs <- Corpus(VectorSource(text))
dtm <- TermDocumentMatrix(docs)
matrix <- as.matrix(dtm)
words <- sort(rowSums(matrix),decreasing=TRUE)
df_weather <- data.frame(word = names(words),freq=words)
set.seed(1234)
wordcloud(words = df_weather$word, freq = df_weather$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,colors=brewer.pal(8, "Set2"))

library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(usmap)
library(ggplot2)
library(patchwork)
US <- map_data('state')
head(US)
## long lat group order region subregion
## 1 -87.46201 30.38968 1 1 alabama <NA>
## 2 -87.48493 30.37249 1 2 alabama <NA>
## 3 -87.52503 30.37249 1 3 alabama <NA>
## 4 -87.53076 30.33239 1 4 alabama <NA>
## 5 -87.57087 30.32665 1 5 alabama <NA>
## 6 -87.58806 30.32665 1 6 alabama <NA>
ggplot()+
geom_map(data = US, map = US, mapping = aes( map_id = region, x = long, y = lat, group = group), color = "darkgray", fill = "white", size = 0.5) +
geom_point(data = df, mapping=aes(x = Start_Lng, y = Start_Lat, color = Severity), size = 0.005)+
labs(title = "US Traffic Accidents",
subtitle = "Source: A Countrywide Traffic Accident Dataset, 2016-2020.") +
theme(legend.position = "right")
## Warning: Ignoring unknown aesthetics: x, y

RF Model Section
library(stringr)
library(tidyr)
library(dplyr)
library(lubridate)
library(maps)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
##
## boundary
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
accidents <-read.csv("US_Accidents_Dec20_filled.csv")
head(accidents)
## X ID Severity Date Time End_Time Start_Lat
## 1 1 A-2716600 3 2016-02-08 00:37:08 2016-02-08 06:37:08 40.10891
## 2 2 A-2716601 2 2016-02-08 05:56:20 2016-02-08 11:56:20 39.86542
## 3 3 A-2716602 2 2016-02-08 06:15:39 2016-02-08 12:15:39 39.10266
## 4 4 A-2716603 2 2016-02-08 06:15:39 2016-02-08 12:15:39 39.10148
## 5 5 A-2716604 2 2016-02-08 06:51:45 2016-02-08 12:51:45 41.06213
## 6 6 A-2716605 3 2016-02-08 07:53:43 2016-02-08 13:53:43 39.17239
## Start_Lng End_Lat End_Lng Distance.mi.
## 1 -83.09286 40.11206 -83.03187 3.230
## 2 -84.06280 39.86501 -84.04873 0.747
## 3 -84.52468 39.10209 -84.52396 0.055
## 4 -84.52341 39.09841 -84.52241 0.219
## 5 -81.53784 41.06217 -81.53547 0.123
## 6 -84.49279 39.17048 -84.50180 0.500
## Description
## 1 Between Sawmill Rd/Exit 20 and OH-315/Olentangy Riv Rd/Exit 22 - Accident.
## 2 At OH-4/OH-235/Exit 41 - Accident.
## 3 At I-71/US-50/Exit 1 - Accident.
## 4 At I-71/US-50/Exit 1 - Accident.
## 5 At Dart Ave/Exit 21 - Accident.
## 6 At Mitchell Ave/Exit 6 - Accident.
## Number Street Side City County State Zipcode Country
## 1 4000 Outerbelt E R Dublin Franklin OH 43017 US
## 2 4000 I-70 E R Dayton Montgomery OH 45424 US
## 3 4000 I-75 S R Cincinnati Hamilton OH 45203 US
## 4 4000 US-50 E R Cincinnati Hamilton OH 45202 US
## 5 4000 I-77 N R Akron Summit OH 44311 US
## 6 4000 I-75 S R Cincinnati Hamilton OH 45217 US
## Timezone Airport_Code Weather_Timestamp Temperature.F. Wind_Chill.F.
## 1 US/Eastern KOSU 2016-02-08 00:53:00 42.1 36.1
## 2 US/Eastern KFFO 2016-02-08 05:58:00 36.9 57.0
## 3 US/Eastern KLUK 2016-02-08 05:53:00 36.0 57.0
## 4 US/Eastern KLUK 2016-02-08 05:53:00 36.0 57.0
## 5 US/Eastern KAKR 2016-02-08 06:54:00 39.0 57.0
## 6 US/Eastern KLUK 2016-02-08 07:53:00 37.0 29.8
## Humidity... Pressure.in. Visibility.mi. Wind_Direction Wind_Speed.mph.
## 1 58 29.76 10 SW 10.4
## 2 91 29.68 10 Calm 7.0
## 3 97 29.70 10 Calm 7.0
## 4 97 29.70 10 Calm 7.0
## 5 55 29.65 10 Calm 7.0
## 6 93 29.69 10 WSW 10.4
## Precipitation.in. Weather_Condition Amenity Bump Crossing Give_Way Junction
## 1 0.00 Light Rain False False False False False
## 2 0.02 Light Rain False False False False False
## 3 0.02 Overcast False False False False True
## 4 0.02 Overcast False False False False True
## 5 0.00 Overcast False False False False False
## 6 0.01 Light Rain False False False False False
## No_Exit Railway Roundabout Station Stop Traffic_Calming Traffic_Signal
## 1 False False False False False False False
## 2 False False False False False False False
## 3 False False False False False False False
## 4 False False False False False False False
## 5 False False False False False False False
## 6 False False False False False False False
## Turning_Loop Sunrise_Sunset Civil_Twilight Nautical_Twilight
## 1 False Night Night Night
## 2 False Night Night Night
## 3 False Night Night Night
## 4 False Night Night Night
## 5 False Night Night Day
## 6 False Day Day Day
## Astronomical_Twilight Year Month Day Wday Hour X.Zipcode. X.Month.
## 1 Night 2016 2 8 2 0 Zipcode Month
## 2 Night 2016 2 8 2 5 Zipcode Month
## 3 Day 2016 2 8 2 6 Zipcode Month
## 4 Day 2016 2 8 2 6 Zipcode Month
## 5 Day 2016 2 8 2 6 Zipcode Month
## 6 Day 2016 2 8 2 7 Zipcode Month
str(accidents)
## 'data.frame': 1516064 obs. of 56 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ ID : chr "A-2716600" "A-2716601" "A-2716602" "A-2716603" ...
## $ Severity : int 3 2 2 2 2 3 2 2 2 2 ...
## $ Date : chr "2016-02-08" "2016-02-08" "2016-02-08" "2016-02-08" ...
## $ Time : chr "00:37:08" "05:56:20" "06:15:39" "06:15:39" ...
## $ End_Time : chr "2016-02-08 06:37:08" "2016-02-08 11:56:20" "2016-02-08 12:15:39" "2016-02-08 12:15:39" ...
## $ Start_Lat : num 40.1 39.9 39.1 39.1 41.1 ...
## $ Start_Lng : num -83.1 -84.1 -84.5 -84.5 -81.5 ...
## $ End_Lat : num 40.1 39.9 39.1 39.1 41.1 ...
## $ End_Lng : num -83 -84 -84.5 -84.5 -81.5 ...
## $ Distance.mi. : num 3.23 0.747 0.055 0.219 0.123 ...
## $ Description : chr "Between Sawmill Rd/Exit 20 and OH-315/Olentangy Riv Rd/Exit 22 - Accident." "At OH-4/OH-235/Exit 41 - Accident." "At I-71/US-50/Exit 1 - Accident." "At I-71/US-50/Exit 1 - Accident." ...
## $ Number : int 4000 4000 4000 4000 4000 4000 4000 1887 4000 4000 ...
## $ Street : chr "Outerbelt E" "I-70 E" "I-75 S" "US-50 E" ...
## $ Side : chr "R" "R" "R" "R" ...
## $ City : chr "Dublin" "Dayton" "Cincinnati" "Cincinnati" ...
## $ County : chr "Franklin" "Montgomery" "Hamilton" "Hamilton" ...
## $ State : chr "OH" "OH" "OH" "OH" ...
## $ Zipcode : chr "43017" "45424" "45203" "45202" ...
## $ Country : chr "US" "US" "US" "US" ...
## $ Timezone : chr "US/Eastern" "US/Eastern" "US/Eastern" "US/Eastern" ...
## $ Airport_Code : chr "KOSU" "KFFO" "KLUK" "KLUK" ...
## $ Weather_Timestamp : chr "2016-02-08 00:53:00" "2016-02-08 05:58:00" "2016-02-08 05:53:00" "2016-02-08 05:53:00" ...
## $ Temperature.F. : num 42.1 36.9 36 36 39 37 35.6 35.6 33.8 33.1 ...
## $ Wind_Chill.F. : num 36.1 57 57 57 57 29.8 29.2 29.2 57 30 ...
## $ Humidity... : int 58 91 97 97 55 93 100 100 100 92 ...
## $ Pressure.in. : num 29.8 29.7 29.7 29.7 29.6 ...
## $ Visibility.mi. : num 10 10 10 10 10 10 10 10 3 0.5 ...
## $ Wind_Direction : chr "SW" "Calm" "Calm" "Calm" ...
## $ Wind_Speed.mph. : num 10.4 7 7 7 7 10.4 8.1 8.1 2.3 3.5 ...
## $ Precipitation.in. : num 0 0.02 0.02 0.02 0 0.01 0 0 0 0.08 ...
## $ Weather_Condition : chr "Light Rain" "Light Rain" "Overcast" "Overcast" ...
## $ Amenity : chr "False" "False" "False" "False" ...
## $ Bump : chr "False" "False" "False" "False" ...
## $ Crossing : chr "False" "False" "False" "False" ...
## $ Give_Way : chr "False" "False" "False" "False" ...
## $ Junction : chr "False" "False" "True" "True" ...
## $ No_Exit : chr "False" "False" "False" "False" ...
## $ Railway : chr "False" "False" "False" "False" ...
## $ Roundabout : chr "False" "False" "False" "False" ...
## $ Station : chr "False" "False" "False" "False" ...
## $ Stop : chr "False" "False" "False" "False" ...
## $ Traffic_Calming : chr "False" "False" "False" "False" ...
## $ Traffic_Signal : chr "False" "False" "False" "False" ...
## $ Turning_Loop : chr "False" "False" "False" "False" ...
## $ Sunrise_Sunset : chr "Night" "Night" "Night" "Night" ...
## $ Civil_Twilight : chr "Night" "Night" "Night" "Night" ...
## $ Nautical_Twilight : chr "Night" "Night" "Night" "Night" ...
## $ Astronomical_Twilight: chr "Night" "Night" "Day" "Day" ...
## $ Year : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
## $ Month : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Day : int 8 8 8 8 8 8 8 8 8 8 ...
## $ Wday : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Hour : int 0 5 6 6 6 7 8 8 8 11 ...
## $ X.Zipcode. : chr "Zipcode" "Zipcode" "Zipcode" "Zipcode" ...
## $ X.Month. : chr "Month" "Month" "Month" "Month" ...
dim(accidents)
## [1] 1516064 56
accidents$Severity = as.factor(accidents$Severity)
accidents = accidents[-c(12,64,65)]
accidents2 <- sample(1:nrow(accidents), 50000, replace = FALSE)
accidents2 <- accidents[accidents2, ]
trainingIndices <- createDataPartition(accidents2$Severity, p = 0.7, list = FALSE)
training <- accidents2[trainingIndices, ]
testing <- accidents2[-trainingIndices, ]
head(accidents)
## X ID Severity Date Time End_Time Start_Lat
## 1 1 A-2716600 3 2016-02-08 00:37:08 2016-02-08 06:37:08 40.10891
## 2 2 A-2716601 2 2016-02-08 05:56:20 2016-02-08 11:56:20 39.86542
## 3 3 A-2716602 2 2016-02-08 06:15:39 2016-02-08 12:15:39 39.10266
## 4 4 A-2716603 2 2016-02-08 06:15:39 2016-02-08 12:15:39 39.10148
## 5 5 A-2716604 2 2016-02-08 06:51:45 2016-02-08 12:51:45 41.06213
## 6 6 A-2716605 3 2016-02-08 07:53:43 2016-02-08 13:53:43 39.17239
## Start_Lng End_Lat End_Lng Distance.mi. Number Street Side City
## 1 -83.09286 40.11206 -83.03187 3.230 4000 Outerbelt E R Dublin
## 2 -84.06280 39.86501 -84.04873 0.747 4000 I-70 E R Dayton
## 3 -84.52468 39.10209 -84.52396 0.055 4000 I-75 S R Cincinnati
## 4 -84.52341 39.09841 -84.52241 0.219 4000 US-50 E R Cincinnati
## 5 -81.53784 41.06217 -81.53547 0.123 4000 I-77 N R Akron
## 6 -84.49279 39.17048 -84.50180 0.500 4000 I-75 S R Cincinnati
## County State Zipcode Country Timezone Airport_Code Weather_Timestamp
## 1 Franklin OH 43017 US US/Eastern KOSU 2016-02-08 00:53:00
## 2 Montgomery OH 45424 US US/Eastern KFFO 2016-02-08 05:58:00
## 3 Hamilton OH 45203 US US/Eastern KLUK 2016-02-08 05:53:00
## 4 Hamilton OH 45202 US US/Eastern KLUK 2016-02-08 05:53:00
## 5 Summit OH 44311 US US/Eastern KAKR 2016-02-08 06:54:00
## 6 Hamilton OH 45217 US US/Eastern KLUK 2016-02-08 07:53:00
## Temperature.F. Wind_Chill.F. Humidity... Pressure.in. Visibility.mi.
## 1 42.1 36.1 58 29.76 10
## 2 36.9 57.0 91 29.68 10
## 3 36.0 57.0 97 29.70 10
## 4 36.0 57.0 97 29.70 10
## 5 39.0 57.0 55 29.65 10
## 6 37.0 29.8 93 29.69 10
## Wind_Direction Wind_Speed.mph. Precipitation.in. Weather_Condition Amenity
## 1 SW 10.4 0.00 Light Rain False
## 2 Calm 7.0 0.02 Light Rain False
## 3 Calm 7.0 0.02 Overcast False
## 4 Calm 7.0 0.02 Overcast False
## 5 Calm 7.0 0.00 Overcast False
## 6 WSW 10.4 0.01 Light Rain False
## Bump Crossing Give_Way Junction No_Exit Railway Roundabout Station Stop
## 1 False False False False False False False False False
## 2 False False False False False False False False False
## 3 False False False True False False False False False
## 4 False False False True False False False False False
## 5 False False False False False False False False False
## 6 False False False False False False False False False
## Traffic_Calming Traffic_Signal Turning_Loop Sunrise_Sunset Civil_Twilight
## 1 False False False Night Night
## 2 False False False Night Night
## 3 False False False Night Night
## 4 False False False Night Night
## 5 False False False Night Night
## 6 False False False Day Day
## Nautical_Twilight Astronomical_Twilight Year Month Day Wday Hour X.Zipcode.
## 1 Night Night 2016 2 8 2 0 Zipcode
## 2 Night Night 2016 2 8 2 5 Zipcode
## 3 Night Day 2016 2 8 2 6 Zipcode
## 4 Night Day 2016 2 8 2 6 Zipcode
## 5 Day Day 2016 2 8 2 6 Zipcode
## 6 Day Day 2016 2 8 2 7 Zipcode
## X.Month.
## 1 Month
## 2 Month
## 3 Month
## 4 Month
## 5 Month
## 6 Month
set.seed(1991)
rf <-randomForest(Severity~.,data=training, ntree=500)
print(rf)
##
## Call:
## randomForest(formula = Severity ~ ., data = training, ntree = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 7
##
## OOB estimate of error rate: 15.71%
## Confusion matrix:
## 1 2 3 4 class.error
## 1 238 395 31 1 0.64210526
## 2 71 27286 386 228 0.02448965
## 3 15 2400 1109 161 0.69905020
## 4 3 1602 206 870 0.67549422
floor(sqrt(ncol(training) - 1))
## [1] 7
mtry <- tuneRF(training[-1],training$Severity, ntreeTry=500,
stepFactor=1.5,improve=0.01, trace=TRUE, plot=TRUE)
## mtry = 7 OOB error = 0.05%
## Searching left ...
## mtry = 5 OOB error = 0.23%
## -3.263158 0.01
## Searching right ...
## mtry = 10 OOB error = 0.01%
## 0.7894737 0.01
## mtry = 15 OOB error = 0%
## 0.75 0.01
## mtry = 22 OOB error = 0%
## 0 0.01

best.m <- mtry[mtry[, 2] == min(mtry[, 2]), 1]
print(mtry)
## mtry OOBError
## 5.OOB 5 0.0023141535
## 7.OOB 7 0.0005428261
## 10.OOB 10 0.0001142792
## 15.OOB 15 0.0000285698
## 22.OOB 22 0.0000285698
print(best.m)
## 15.OOB 22.OOB
## 15 22
set.seed(1991)
rf <-randomForest(Severity~.,data=training, mtry=best.m, importance=TRUE,ntree=500)
print(rf)
##
## Call:
## randomForest(formula = Severity ~ ., data = training, mtry = best.m, importance = TRUE, ntree = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 15
##
## OOB estimate of error rate: 15.54%
## Confusion matrix:
## 1 2 3 4 class.error
## 1 257 369 37 2 0.61353383
## 2 91 27122 456 302 0.03035287
## 3 21 2310 1174 180 0.68141113
## 4 7 1447 217 1010 0.62327490
importance(rf)
## 1 2 3 4
## X 4.8219907 23.094654 0.8728140 -2.50926652
## ID 4.5437521 24.966109 -0.8004541 -5.53167148
## Date 23.1101450 37.705234 15.0145965 12.97410652
## Time 4.4714434 58.291412 17.1124820 9.03764621
## End_Time 24.3598337 39.338220 14.2190994 12.22482145
## Start_Lat 21.8552786 38.617527 25.3779456 -0.91088774
## Start_Lng 17.9490321 36.418698 25.1012961 13.16475281
## End_Lat 22.2398524 38.400894 23.7764888 1.15063439
## End_Lng 20.0345261 36.894517 25.0333443 12.68217770
## Distance.mi. 43.0977440 83.453903 38.7485496 100.84258239
## Number 2.9429016 29.456216 10.1122566 8.66523763
## Street 11.0463188 72.010604 37.3524610 36.47891251
## Side -1.8534833 20.474933 12.1687470 -4.37567012
## City 10.8067484 23.802297 13.3972536 0.11264732
## County 12.9720080 18.205068 18.7466795 1.19241609
## State 27.7484252 32.357843 28.5260808 8.19377614
## Zipcode 21.4116773 46.066835 27.0640303 15.24205124
## Country 0.0000000 0.000000 0.0000000 0.00000000
## Timezone 10.3936746 12.068947 13.4326463 7.06278768
## Airport_Code 15.6460774 28.809041 13.2758968 3.91855966
## Weather_Timestamp 22.0463762 37.468828 6.0861617 9.88567466
## Temperature.F. -0.2243507 34.662531 1.4381435 0.71433798
## Wind_Chill.F. 1.5934365 30.953809 -0.5815765 2.67372870
## Humidity... 7.7314547 35.103981 -2.2835100 6.64300098
## Pressure.in. 13.0991655 50.297480 10.5338585 -3.59281250
## Visibility.mi. 2.3668208 12.613300 0.3434382 3.72268283
## Wind_Direction -1.3495888 22.278323 2.4919653 3.14711078
## Wind_Speed.mph. -1.3169168 21.656553 0.3324858 3.40634282
## Precipitation.in. 1.1319857 3.066885 -0.7584083 -1.58458875
## Weather_Condition 3.0069967 11.619510 3.4555140 0.88091589
## Amenity 1.3369352 -1.108658 0.5347562 -0.84805607
## Bump 0.0000000 1.342974 -1.0010015 0.00000000
## Crossing 22.6849259 15.693086 -2.1852207 5.72203478
## Give_Way 0.0000000 1.995125 1.3687466 3.93988359
## Junction 6.9603393 11.909184 4.3518513 13.90415992
## No_Exit 0.0000000 -2.355656 2.6953638 -1.00100150
## Railway 1.4047352 1.987739 -7.3998584 -0.73173207
## Roundabout 0.0000000 0.000000 0.0000000 0.00000000
## Station 1.3515362 5.754554 -2.6072520 1.86639913
## Stop -0.3721685 4.152632 -0.2092429 -0.50548604
## Traffic_Calming 0.0000000 2.464307 -1.0010015 1.00100150
## Traffic_Signal 21.3611712 25.849222 7.0203346 1.36006407
## Turning_Loop 0.0000000 0.000000 0.0000000 0.00000000
## Sunrise_Sunset 4.5647691 19.746621 -0.3620615 3.44938717
## Civil_Twilight 6.3698016 21.264256 -2.0421620 4.74309569
## Nautical_Twilight 9.2288646 23.401505 -0.7773985 6.48003569
## Astronomical_Twilight 8.7853564 23.292451 -1.3710409 2.78211117
## Year 11.4839159 11.526351 2.3819272 4.67070751
## Month 10.5121548 11.338570 7.9936243 0.06417341
## Day 0.8785365 13.276897 1.0756971 2.48918164
## Wday 3.0721225 69.125418 -2.6674794 5.01903709
## Hour 2.0695031 38.931364 13.0768389 6.41194441
## X.Zipcode. 0.0000000 0.000000 0.0000000 0.00000000
## X.Month. 0.0000000 0.000000 0.0000000 0.00000000
## MeanDecreaseAccuracy MeanDecreaseGini
## X 22.7738559 348.070809435
## ID 24.4348813 340.781207824
## Date 37.2663329 616.129987807
## Time 62.8617757 470.574289907
## End_Time 38.3324955 652.565528474
## Start_Lat 41.8237692 463.535957066
## Start_Lng 37.5616967 638.094287925
## End_Lat 41.5653210 461.588597717
## End_Lng 38.1615925 613.749410721
## Distance.mi. 94.3693642 798.113664875
## Number 28.4707286 184.816421753
## Street 78.3445982 542.075785812
## Side 21.4767963 45.346733279
## City 29.1861937 301.879080602
## County 21.4102410 264.447085959
## State 35.1244244 303.575301236
## Zipcode 48.5106810 711.794170897
## Country 0.0000000 0.000000000
## Timezone 12.7823505 110.641953881
## Airport_Code 33.7442168 278.748656513
## Weather_Timestamp 35.4894543 495.988484170
## Temperature.F. 34.8900961 295.387156087
## Wind_Chill.F. 31.3031219 177.391466766
## Humidity... 33.6024974 314.994510519
## Pressure.in. 50.0883461 370.524956679
## Visibility.mi. 12.6957726 87.482718107
## Wind_Direction 20.9627149 227.656772556
## Wind_Speed.mph. 21.1497776 236.995711294
## Precipitation.in. 2.3263157 51.353763073
## Weather_Condition 12.7463850 166.440204304
## Amenity -0.8278739 6.069879578
## Bump 0.8202088 0.635493103
## Crossing 26.7349471 47.203790880
## Give_Way 3.4674894 5.384013985
## Junction 17.8862264 44.869257281
## No_Exit -1.8333017 1.381646405
## Railway -1.1283750 7.678839150
## Roundabout 0.0000000 0.003666667
## Station 5.0384145 8.421735855
## Stop 3.5539506 7.698152863
## Traffic_Calming 1.9038874 0.907204434
## Traffic_Signal 32.3704927 58.867172956
## Turning_Loop 0.0000000 0.000000000
## Sunrise_Sunset 17.9205827 34.703993458
## Civil_Twilight 19.7803978 42.625844420
## Nautical_Twilight 23.0657171 66.683851571
## Astronomical_Twilight 23.3549169 63.552287911
## Year 11.6438091 91.193110846
## Month 11.3376632 208.754838504
## Day 13.3949423 276.529311167
## Wday 65.0454712 248.635222001
## Hour 42.0426659 249.311308188
## X.Zipcode. 0.0000000 0.000000000
## X.Month. 0.0000000 0.000000000
varImpPlot(rf)
