Visualization and Discovery section

library(pastecs)
# Read the accident file with base R so there is a data frame to try pastecs on.
# read.csv() replaces read.table(): by default read.table() treats both " and '
# as quote characters and # as a comment character, whereas read.csv() quotes
# only on " and disables comment characters. The earlier read.table() call
# warned "EOF within quoted string" and returned 779138 rows, while
# readr::read_csv() finds 1516064 rows in the same file, so the quoting
# defaults are the presumed cause of the truncated read.
# NOTE(review): the recorded warning and summary() output that follow were
# produced by the earlier read.table() call and predate this change.
US_accident <- read.csv(
  "US_Accidents_Dec20_updated.csv",
  header = TRUE,
  fill = TRUE
)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## EOF within quoted string
# Print a per-column summary of the base-R import to check the parsed types.
summary(US_accident)
##       ID              Severity          Start_Time          End_Time        
##  Length:779138      Length:779138      Length:779138      Length:779138     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##   Start_Lat          Start_Lng           End_Lat            End_Lng         
##  Length:779138      Length:779138      Length:779138      Length:779138     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##  Distance.mi.       Description           Number             Street         
##  Length:779138      Length:779138      Length:779138      Length:779138     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##      Side               City              County             State          
##  Length:779138      Length:779138      Length:779138      Length:779138     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##    Zipcode            Country            Timezone         Airport_Code      
##  Length:779138      Length:779138      Length:779138      Length:779138     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##  Weather_Timestamp  Temperature.F.     Wind_Chill.F.      Humidity...       
##  Length:779138      Length:779138      Length:779138      Length:779138     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##  Pressure.in.       Visibility.mi.     Wind_Direction     Wind_Speed.mph.   
##  Length:779138      Length:779138      Length:779138      Length:779138     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##  Precipitation.in.  Weather_Condition    Amenity              Bump          
##  Length:779138      Length:779138      Length:779138      Length:779138     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##    Crossing           Give_Way           Junction           No_Exit         
##  Length:779138      Length:779138      Length:779138      Length:779138     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##    Railway           Roundabout          Station              Stop          
##  Length:779138      Length:779138      Length:779138      Length:779138     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##  Traffic_Calming    Traffic_Signal     Turning_Loop       Sunrise_Sunset    
##  Length:779138      Length:779138      Length:779138      Length:779138     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##  Civil_Twilight     Nautical_Twilight  Astronomical_Twilight
##  Length:779138      Length:779138      Length:779138        
##  Class :character   Class :character   Class :character     
##  Mode  :character   Mode  :character   Mode  :character
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:pastecs':
## 
##     first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v stringr 1.4.0
## v tidyr   1.2.0     v forcats 0.5.1
## v readr   2.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::extract() masks pastecs::extract()
## x dplyr::filter()  masks stats::filter()
## x dplyr::first()   masks pastecs::first()
## x dplyr::lag()     masks stats::lag()
## x dplyr::last()    masks pastecs::last()
library(readr)
# Load every column as text first, then let readr re-guess each column's type
# from the complete data via type_convert().
df <- type_convert(
  read_csv(
    "US_Accidents_Dec20_updated.csv",
    col_types = cols(.default = col_character())
  )
)
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_character(),
##   Severity = col_double(),
##   Start_Time = col_datetime(format = ""),
##   End_Time = col_datetime(format = ""),
##   Start_Lat = col_double(),
##   Start_Lng = col_double(),
##   End_Lat = col_double(),
##   End_Lng = col_double(),
##   `Distance(mi)` = col_double(),
##   Number = col_double(),
##   Weather_Timestamp = col_datetime(format = ""),
##   `Temperature(F)` = col_double(),
##   `Wind_Chill(F)` = col_double(),
##   `Humidity(%)` = col_double(),
##   `Pressure(in)` = col_double(),
##   `Visibility(mi)` = col_double(),
##   `Wind_Speed(mph)` = col_double(),
##   `Precipitation(in)` = col_double(),
##   Amenity = col_logical(),
##   Bump = col_logical(),
##   Crossing = col_logical()
##   # ... with 10 more columns
## )
## i Use `spec()` for the full column specifications.
# Columns that stay character after type_convert() include: Description,
# Street, Side, City, County, State, Zipcode, Country, Timezone, Airport_Code

# Preview the first six rows of the readr import.
head(df)
## # A tibble: 6 x 47
##   ID        Severity Start_Time          End_Time            Start_Lat Start_Lng
##   <chr>        <dbl> <dttm>              <dttm>                  <dbl>     <dbl>
## 1 A-2716600        3 2016-02-08 00:37:08 2016-02-08 06:37:08      40.1     -83.1
## 2 A-2716601        2 2016-02-08 05:56:20 2016-02-08 11:56:20      39.9     -84.1
## 3 A-2716602        2 2016-02-08 06:15:39 2016-02-08 12:15:39      39.1     -84.5
## 4 A-2716603        2 2016-02-08 06:15:39 2016-02-08 12:15:39      39.1     -84.5
## 5 A-2716604        2 2016-02-08 06:51:45 2016-02-08 12:51:45      41.1     -81.5
## 6 A-2716605        3 2016-02-08 07:53:43 2016-02-08 13:53:43      39.2     -84.5
## # ... with 41 more variables: End_Lat <dbl>, End_Lng <dbl>,
## #   `Distance(mi)` <dbl>, Description <chr>, Number <dbl>, Street <chr>,
## #   Side <chr>, City <chr>, County <chr>, State <chr>, Zipcode <chr>,
## #   Country <chr>, Timezone <chr>, Airport_Code <chr>,
## #   Weather_Timestamp <dttm>, `Temperature(F)` <dbl>, `Wind_Chill(F)` <dbl>,
## #   `Humidity(%)` <dbl>, `Pressure(in)` <dbl>, `Visibility(mi)` <dbl>,
## #   Wind_Direction <chr>, `Wind_Speed(mph)` <dbl>, ...
# Per-column summary of the typed data: ranges, quartiles and NA counts.
summary(df)
##       ID               Severity       Start_Time                 
##  Length:1516064     Min.   :1.000   Min.   :2016-02-08 00:37:08  
##  Class :character   1st Qu.:2.000   1st Qu.:2018-07-17 14:41:25  
##  Mode  :character   Median :2.000   Median :2020-01-24 11:16:33  
##                     Mean   :2.239   Mean   :2019-07-15 07:01:48  
##                     3rd Qu.:2.000   3rd Qu.:2020-10-22 13:01:30  
##                     Max.   :4.000   Max.   :2020-12-31 23:28:56  
##                                                                  
##     End_Time                     Start_Lat       Start_Lng      
##  Min.   :2016-02-08 06:37:08   Min.   :24.57   Min.   :-124.50  
##  1st Qu.:2018-07-17 17:13:14   1st Qu.:33.85   1st Qu.:-118.21  
##  Median :2020-01-24 13:38:15   Median :37.35   Median : -94.38  
##  Mean   :2019-07-15 11:42:20   Mean   :36.90   Mean   : -98.60  
##  3rd Qu.:2020-10-22 17:50:19   3rd Qu.:40.73   3rd Qu.: -80.87  
##  Max.   :2021-01-01 00:00:00   Max.   :49.00   Max.   : -67.11  
##                                                                 
##     End_Lat         End_Lng         Distance(mi)      Description       
##  Min.   :24.57   Min.   :-124.50   Min.   :  0.0000   Length:1516064    
##  1st Qu.:33.85   1st Qu.:-118.21   1st Qu.:  0.0000   Class :character  
##  Median :37.35   Median : -94.38   Median :  0.1780   Mode  :character  
##  Mean   :36.90   Mean   : -98.60   Mean   :  0.5873                     
##  3rd Qu.:40.73   3rd Qu.: -80.87   3rd Qu.:  0.5940                     
##  Max.   :49.08   Max.   : -67.11   Max.   :155.1860                     
##                                                                         
##      Number           Street              Side               City          
##  Min.   :      0   Length:1516064     Length:1516064     Length:1516064    
##  1st Qu.:   1212   Class :character   Class :character   Class :character  
##  Median :   4000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :   8908                                                           
##  3rd Qu.:  10100                                                           
##  Max.   :9999997                                                           
##  NA's   :1046095                                                           
##     County             State             Zipcode            Country         
##  Length:1516064     Length:1516064     Length:1516064     Length:1516064    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    Timezone         Airport_Code       Weather_Timestamp            
##  Length:1516064     Length:1516064     Min.   :2016-02-08 00:53:00  
##  Class :character   Class :character   1st Qu.:2018-07-10 10:55:30  
##  Mode  :character   Mode  :character   Median :2020-01-22 05:53:00  
##                                        Mean   :2019-07-12 00:02:11  
##                                        3rd Qu.:2020-10-21 04:54:00  
##                                        Max.   :2020-12-31 23:35:00  
##                                        NA's   :30264                
##  Temperature(F)   Wind_Chill(F)     Humidity(%)      Pressure(in)  
##  Min.   :-89.00   Min.   :-89.0    Min.   :  1.00   Min.   : 0.00  
##  1st Qu.: 47.00   1st Qu.: 40.8    1st Qu.: 48.00   1st Qu.:29.44  
##  Median : 61.00   Median : 57.0    Median : 68.00   Median :29.88  
##  Mean   : 59.58   Mean   : 55.1    Mean   : 64.66   Mean   :29.55  
##  3rd Qu.: 73.00   3rd Qu.: 71.0    3rd Qu.: 84.00   3rd Qu.:30.04  
##  Max.   :170.60   Max.   :113.0    Max.   :100.00   Max.   :58.04  
##  NA's   :43033    NA's   :449316   NA's   :45509    NA's   :36274  
##  Visibility(mi)   Wind_Direction     Wind_Speed(mph)  Precipitation(in)
##  Min.   :  0.00   Length:1516064     Min.   :  0.00   Min.   : 0       
##  1st Qu.: 10.00   Class :character   1st Qu.:  4.60   1st Qu.: 0       
##  Median : 10.00   Mode  :character   Median :  7.00   Median : 0       
##  Mean   :  9.13                      Mean   :  7.63   Mean   : 0       
##  3rd Qu.: 10.00                      3rd Qu.: 10.40   3rd Qu.: 0       
##  Max.   :140.00                      Max.   :984.00   Max.   :24       
##  NA's   :44211                       NA's   :128862   NA's   :510549   
##  Weather_Condition   Amenity           Bump          Crossing      
##  Length:1516064     Mode :logical   Mode :logical   Mode :logical  
##  Class :character   FALSE:1503661   FALSE:1515803   FALSE:1429681  
##  Mode  :character   TRUE :12403     TRUE :261       TRUE :86383    
##                                                                    
##                                                                    
##                                                                    
##                                                                    
##   Give_Way        Junction        No_Exit         Railway       
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:1512809   FALSE:1311566   FALSE:1514335   FALSE:1503480  
##  TRUE :3255      TRUE :204498    TRUE :1729      TRUE :12584    
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  Roundabout       Station           Stop         Traffic_Calming
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:1516013   FALSE:1487917   FALSE:1498368   FALSE:1515575  
##  TRUE :51        TRUE :28147     TRUE :17696     TRUE :489      
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  Traffic_Signal  Turning_Loop    Sunrise_Sunset     Civil_Twilight    
##  Mode :logical   Mode :logical   Length:1516064     Length:1516064    
##  FALSE:1346095   FALSE:1516064   Class :character   Class :character  
##  TRUE :169969                    Mode  :character   Mode  :character  
##                                                                       
##                                                                       
##                                                                       
##                                                                       
##  Nautical_Twilight  Astronomical_Twilight
##  Length:1516064     Length:1516064       
##  Class :character   Class :character     
##  Mode  :character   Mode  :character     
##                                          
##                                          
##                                          
## 
# Show the structure of df: dimensions, column types and leading values.
str(df)
## spec_tbl_df [1,516,064 x 47] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ID                   : chr [1:1516064] "A-2716600" "A-2716601" "A-2716602" "A-2716603" ...
##  $ Severity             : num [1:1516064] 3 2 2 2 2 3 2 2 2 2 ...
##  $ Start_Time           : POSIXct[1:1516064], format: "2016-02-08 00:37:08" "2016-02-08 05:56:20" ...
##  $ End_Time             : POSIXct[1:1516064], format: "2016-02-08 06:37:08" "2016-02-08 11:56:20" ...
##  $ Start_Lat            : num [1:1516064] 40.1 39.9 39.1 39.1 41.1 ...
##  $ Start_Lng            : num [1:1516064] -83.1 -84.1 -84.5 -84.5 -81.5 ...
##  $ End_Lat              : num [1:1516064] 40.1 39.9 39.1 39.1 41.1 ...
##  $ End_Lng              : num [1:1516064] -83 -84 -84.5 -84.5 -81.5 ...
##  $ Distance(mi)         : num [1:1516064] 3.23 0.747 0.055 0.219 0.123 ...
##  $ Description          : chr [1:1516064] "Between Sawmill Rd/Exit 20 and OH-315/Olentangy Riv Rd/Exit 22 - Accident." "At OH-4/OH-235/Exit 41 - Accident." "At I-71/US-50/Exit 1 - Accident." "At I-71/US-50/Exit 1 - Accident." ...
##  $ Number               : num [1:1516064] NA NA NA NA NA ...
##  $ Street               : chr [1:1516064] "Outerbelt E" "I-70 E" "I-75 S" "US-50 E" ...
##  $ Side                 : chr [1:1516064] "R" "R" "R" "R" ...
##  $ City                 : chr [1:1516064] "Dublin" "Dayton" "Cincinnati" "Cincinnati" ...
##  $ County               : chr [1:1516064] "Franklin" "Montgomery" "Hamilton" "Hamilton" ...
##  $ State                : chr [1:1516064] "OH" "OH" "OH" "OH" ...
##  $ Zipcode              : chr [1:1516064] "43017" "45424" "45203" "45202" ...
##  $ Country              : chr [1:1516064] "US" "US" "US" "US" ...
##  $ Timezone             : chr [1:1516064] "US/Eastern" "US/Eastern" "US/Eastern" "US/Eastern" ...
##  $ Airport_Code         : chr [1:1516064] "KOSU" "KFFO" "KLUK" "KLUK" ...
##  $ Weather_Timestamp    : POSIXct[1:1516064], format: "2016-02-08 00:53:00" "2016-02-08 05:58:00" ...
##  $ Temperature(F)       : num [1:1516064] 42.1 36.9 36 36 39 37 35.6 35.6 33.8 33.1 ...
##  $ Wind_Chill(F)        : num [1:1516064] 36.1 NA NA NA NA 29.8 29.2 29.2 NA 30 ...
##  $ Humidity(%)          : num [1:1516064] 58 91 97 97 55 93 100 100 100 92 ...
##  $ Pressure(in)         : num [1:1516064] 29.8 29.7 29.7 29.7 29.6 ...
##  $ Visibility(mi)       : num [1:1516064] 10 10 10 10 10 10 10 10 3 0.5 ...
##  $ Wind_Direction       : chr [1:1516064] "SW" "Calm" "Calm" "Calm" ...
##  $ Wind_Speed(mph)      : num [1:1516064] 10.4 NA NA NA NA 10.4 8.1 8.1 2.3 3.5 ...
##  $ Precipitation(in)    : num [1:1516064] 0 0.02 0.02 0.02 NA 0.01 NA NA NA 0.08 ...
##  $ Weather_Condition    : chr [1:1516064] "Light Rain" "Light Rain" "Overcast" "Overcast" ...
##  $ Amenity              : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Bump                 : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Crossing             : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Give_Way             : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Junction             : logi [1:1516064] FALSE FALSE TRUE TRUE FALSE FALSE ...
##  $ No_Exit              : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Railway              : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Roundabout           : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Station              : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Stop                 : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Traffic_Calming      : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Traffic_Signal       : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Turning_Loop         : logi [1:1516064] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Sunrise_Sunset       : chr [1:1516064] "Night" "Night" "Night" "Night" ...
##  $ Civil_Twilight       : chr [1:1516064] "Night" "Night" "Night" "Night" ...
##  $ Nautical_Twilight    : chr [1:1516064] "Night" "Night" "Night" "Night" ...
##  $ Astronomical_Twilight: chr [1:1516064] "Night" "Night" "Day" "Day" ...
##  - attr(*, "problems")=<externalptr>
# df has 1,516,064 rows and 47 columns.
# Percentage of missing values in each column.
100 * colMeans(is.na(df))
##                    ID              Severity            Start_Time 
##           0.000000000           0.000000000           0.000000000 
##              End_Time             Start_Lat             Start_Lng 
##           0.000000000           0.000000000           0.000000000 
##               End_Lat               End_Lng          Distance(mi) 
##           0.000000000           0.000000000           0.000000000 
##           Description                Number                Street 
##           0.000000000          69.000715009           0.000000000 
##                  Side                  City                County 
##           0.000000000           0.005474703           0.000000000 
##                 State               Zipcode               Country 
##           0.000000000           0.061672858           0.000000000 
##              Timezone          Airport_Code     Weather_Timestamp 
##           0.151840556           0.280199253           1.996221795 
##        Temperature(F)         Wind_Chill(F)           Humidity(%) 
##           2.838468561          29.637007409           3.001786204 
##          Pressure(in)        Visibility(mi)        Wind_Direction 
##           2.392643055           2.916169766           2.760965236 
##       Wind_Speed(mph)     Precipitation(in)     Weather_Condition 
##           8.499773097          33.675952994           2.902713870 
##               Amenity                  Bump              Crossing 
##           0.000000000           0.000000000           0.000000000 
##              Give_Way              Junction               No_Exit 
##           0.000000000           0.000000000           0.000000000 
##               Railway            Roundabout               Station 
##           0.000000000           0.000000000           0.000000000 
##                  Stop       Traffic_Calming        Traffic_Signal 
##           0.000000000           0.000000000           0.000000000 
##          Turning_Loop        Sunrise_Sunset        Civil_Twilight 
##           0.000000000           0.005474703           0.005474703 
##     Nautical_Twilight Astronomical_Twilight 
##           0.005474703           0.005474703
# Columns with the largest share of missing values (% NA):
#   Number            69.000715009
#   Wind_Chill(F)     29.637007409
#   Precipitation(in) 33.675952994
# Keep only the columns used in the rest of the analysis.
my_data <- df[c(
  "Severity", "Start_Time", "End_Time",
  "Temperature(F)", "Humidity(%)", "Visibility(mi)", "Wind_Speed(mph)",
  "Start_Lat", "Start_Lng", "Zipcode",
  "Pressure(in)", "Junction", "Sunrise_Sunset", "Distance(mi)",
  "Wind_Chill(F)", "Wind_Direction", "Precipitation(in)", "State"
)]
library(pastecs)
# Descriptive statistics for every selected column.
stat.desc(my_data)
##                  Severity   Start_Time     End_Time Temperature(F)  Humidity(%)
## nbr.val      1.516064e+06 1.516064e+06 1.516064e+06   1.473031e+06 1.470555e+06
## nbr.null     0.000000e+00 0.000000e+00 0.000000e+00   5.650000e+02 0.000000e+00
## nbr.na       0.000000e+00 0.000000e+00 0.000000e+00   4.303300e+04 4.550900e+04
## min          1.000000e+00 1.454892e+09 1.454913e+09  -8.900000e+01 1.000000e+00
## max          4.000000e+00 1.609457e+09 1.609459e+09   1.706000e+02 1.000000e+02
## range        3.000000e+00 1.545655e+08 1.545458e+08   2.596000e+02 9.900000e+01
## sum          3.393906e+06 2.369872e+15 2.369898e+15   8.776996e+07 9.508550e+07
## median       2.000000e+00 1.579865e+09 1.579873e+09   6.100000e+01 6.800000e+01
## mean         2.238630e+00 1.563174e+09 1.563191e+09   5.958460e+01 6.465960e+01
## SE.mean      4.939132e-04 3.640382e+04 3.639897e+04   1.505594e-02 1.918079e-02
## CI.mean.0.95 9.680528e-04 7.135024e+04 7.134074e+04   2.950912e-02 3.759369e-02
## var          3.698441e-01 2.009146e+15 2.008611e+15   3.339085e+02 5.410213e+02
## std.dev      6.081481e-01 4.482350e+07 4.481753e+07   1.827316e+01 2.325986e+01
## coef.var     2.716609e-01 2.867467e-02 2.867054e-02   3.066760e-01 3.597279e-01
##              Visibility(mi) Wind_Speed(mph)    Start_Lat     Start_Lng Zipcode
## nbr.val        1.471853e+06    1.387202e+06 1.516064e+06  1.516064e+06      NA
## nbr.null       1.410000e+03    2.028770e+05 0.000000e+00  0.000000e+00      NA
## nbr.na         4.421100e+04    1.288620e+05 0.000000e+00  0.000000e+00      NA
## min            0.000000e+00    0.000000e+00 2.457022e+01 -1.244976e+02      NA
## max            1.400000e+02    9.840000e+02 4.900058e+01 -6.711317e+01      NA
## range          1.400000e+02    9.840000e+02 2.443036e+01  5.738440e+01      NA
## sum            1.344060e+07    1.058548e+07 5.594361e+07 -1.494827e+08      NA
## median         1.000000e+01    7.000000e+00 3.735113e+01 -9.438100e+01      NA
## mean           9.131755e+00    7.630812e+00 3.690056e+01 -9.859919e+01      NA
## SE.mean        2.381399e-03    4.786370e-03 4.195333e-03  1.502172e-02      NA
## CI.mean.0.95   4.667460e-03    9.381121e-03 8.222709e-03  2.944205e-02      NA
## var            8.346970e+00    3.177988e+01 2.668397e+01  3.421028e+02      NA
## std.dev        2.889112e+00    5.637364e+00 5.165653e+00  1.849602e+01      NA
## coef.var       3.163808e-01    7.387633e-01 1.399885e-01 -1.875880e-01      NA
##              Pressure(in) Junction Sunrise_Sunset Distance(mi) Wind_Chill(F)
## nbr.val      1.479790e+06       NA             NA 1.516064e+06  1.066748e+06
## nbr.null     1.000000e+00       NA             NA 4.006130e+05  5.030000e+02
## nbr.na       3.627400e+04       NA             NA 0.000000e+00  4.493160e+05
## min          0.000000e+00       NA             NA 0.000000e+00 -8.900000e+01
## max          5.804000e+01       NA             NA 1.551860e+02  1.130000e+02
## range        5.804000e+01       NA             NA 1.551860e+02  2.020000e+02
## sum          4.373513e+07       NA             NA 8.903264e+05  5.878823e+07
## median       2.988000e+01       NA             NA 1.780000e-01  5.700000e+01
## mean         2.955495e+01       NA             NA 5.872617e-01  5.510976e+01
## SE.mean      8.358277e-04       NA             NA 1.325979e-03  2.045568e-02
## CI.mean.0.95 1.638194e-03       NA             NA 2.598874e-03  4.009245e-02
## var          1.033793e+00       NA             NA 2.665576e+00  4.463647e+02
## std.dev      1.016756e+00       NA             NA 1.632659e+00  2.112735e+01
## coef.var     3.440222e-02       NA             NA 2.780122e+00  3.833685e-01
##              Wind_Direction Precipitation(in) State
## nbr.val                  NA      1.005515e+06    NA
## nbr.null                 NA      9.034290e+05    NA
## nbr.na                   NA      5.105490e+05    NA
## min                      NA      0.000000e+00    NA
## max                      NA      2.400000e+01    NA
## range                    NA      2.400000e+01    NA
## sum                      NA      8.524610e+03    NA
## median                   NA      0.000000e+00    NA
## mean                     NA      8.477855e-03    NA
## SE.mean                  NA      1.289617e-04    NA
## CI.mean.0.95             NA      2.527606e-04    NA
## var                      NA      1.672284e-02    NA
## std.dev                  NA      1.293168e-01    NA
## coef.var                 NA      1.525348e+01    NA
library(tidyr)
library(ggplot2)
library(KernSmooth)
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
library(purrr)
library(dplyr)
# Penalise scientific notation so large axis values are printed in full.
options(scipen = 999)
# Severity histogram with unit-wide bins from 0 to 5; x axis limited to 0-4.
hist(
  my_data[["Severity"]],
  breaks = seq(0, 5, by = 1),
  col = "brown1",
  main = "Histogram of Severity",
  xlab = "Severity",
  ylab = "Frequency",
  xlim = c(0, 4)
)

# Histogram of accident start times in 40 equal-width bins.
# - hist() on a date-time column defaults to freq = FALSE (density), which
#   contradicts the "Frequency" axis label, so counts are requested explicitly.
# - Break points are given as date-times instead of `breaks = 40`; the numeric
#   form produced the recorded "NAs produced by integer overflow" warning, and
#   explicit date-time breaks should avoid it.
# - The stray trailing comma (an empty final argument) is removed.
hist(
  my_data[["Start_Time"]],
  breaks = seq(
    min(my_data[["Start_Time"]], na.rm = TRUE),
    max(my_data[["Start_Time"]], na.rm = TRUE),
    length.out = 41
  ),
  freq = TRUE,
  col = "goldenrod1",
  main = "Histogram of Start_Time",
  xlab = "Start_Time",
  ylab = "Frequency"
)
## Warning in breaks[-1L] + breaks[-nB]: NAs produced by integer overflow

# Histogram of accident end times in 40 equal-width bins.
# - hist() on a date-time column defaults to freq = FALSE (density), which
#   contradicts the "Frequency" axis label, so counts are requested explicitly.
# - Break points are given as date-times instead of `breaks = 40`; the numeric
#   form produced the recorded "NAs produced by integer overflow" warning, and
#   explicit date-time breaks should avoid it.
# - The stray trailing comma (an empty final argument) is removed.
hist(
  my_data[["End_Time"]],
  breaks = seq(
    min(my_data[["End_Time"]], na.rm = TRUE),
    max(my_data[["End_Time"]], na.rm = TRUE),
    length.out = 41
  ),
  freq = TRUE,
  col = "burlywood1",
  main = "Histogram of End_Time",
  xlab = "End_Time",
  ylab = "Frequency"
)
## Warning in breaks[-1L] + breaks[-nB]: NAs produced by integer overflow

# Histograms of the numeric weather and distance columns, each asking for
# roughly 40 bins. Columns are extracted with [[ ]] because their names contain
# non-syntactic characters, and the stray trailing commas (which passed an
# empty final argument to hist()) are removed.

# Temperature; font.main = 3 sets the title font to italic.
hist(
  my_data[["Temperature(F)"]],
  breaks = 40,
  col = "cornflowerblue",
  main = "Histogram of Temperature",
  xlab = "Temperature (Deg. F)",
  ylab = "Frequency",
  font.main = 3
)

# Relative humidity.
hist(
  my_data[["Humidity(%)"]],
  breaks = 40,
  col = "lightblue",
  main = "Histogram of Humidity",
  xlab = "Humidity(%)",
  ylab = "Frequency"
)

# Visibility; x axis fixed to 0-150 miles.
hist(
  my_data[["Visibility(mi)"]],
  breaks = 40,
  col = "azure4",
  main = "Histogram of Visibility",
  xlim = c(0, 150),
  xlab = "Visibility(mi)",
  ylab = "Frequency"
)

# Wind speed.
hist(
  my_data[["Wind_Speed(mph)"]],
  breaks = 40,
  col = "coral",
  main = "Histogram of Wind_Speed",
  xlab = "Wind_Speed(mph)",
  ylab = "Frequency"
)

# Air pressure.
hist(
  my_data[["Pressure(in)"]],
  breaks = 40,
  col = "aquamarine3",
  main = "Histogram of Pressure",
  xlab = "Pressure(in)",
  ylab = "Frequency"
)

# Distance(mi) column.
hist(
  my_data[["Distance(mi)"]],
  breaks = 40,
  col = "darkorchid1",
  main = "Histogram of Distance",
  xlab = "Distance(mi)",
  ylab = "Frequency"
)

# Wind chill.
hist(
  my_data[["Wind_Chill(F)"]],
  breaks = 40,
  col = "darkseagreen1",
  main = "Histogram of Wind_Chill",
  xlab = "Wind_Chill(F)",
  ylab = "Frequency"
)

# Precipitation.
hist(
  my_data[["Precipitation(in)"]],
  breaks = 40,
  col = "slategray1",
  main = "Histogram of Precipitation",
  xlab = "Precipitation(in)",
  ylab = "Frequency"
)

# Number of accidents recorded for each wind-direction label.
# NOTE(review): the recorded output lists both "Calm"/"CALM" and "E"/"East",
# so the labels probably need normalising before they are compared.
S1 <- summarise(group_by(my_data, Wind_Direction), n = n())
S1
## # A tibble: 25 x 2
##    Wind_Direction      n
##    <chr>           <int>
##  1 Calm            79192
##  2 CALM           202870
##  3 E               52435
##  4 East            24064
##  5 ENE             51257
##  6 ESE             51295
##  7 N               53718
##  8 NE              48355
##  9 NNE             46509
## 10 NNW             68014
## # ... with 15 more rows
options(scipen = 999)
# Bar chart of accident counts per wind-direction label.
# Columns are referenced by bare name inside aes(); the earlier `S1$...` form
# is what ggplot2 reports as discouraged in the recorded warnings.
ggplot(S1, aes(x = Wind_Direction, y = n)) +
  geom_bar(stat = "identity", position = position_dodge(width = 2)) +
  labs(x = "Wind_Direction", y = "Frequency")
## Warning: Use of `S1$Wind_Direction` is discouraged. Use `Wind_Direction`
## instead.
## Warning: Use of `S1$n` is discouraged. Use `n` instead.

# Number of accidents recorded in each state.
S2 <- summarise(group_by(my_data, State), n = n())
S2
## # A tibble: 49 x 2
##    State      n
##    <chr>  <int>
##  1 AL      9375
##  2 AR      4373
##  3 AZ     30185
##  4 CA    448833
##  5 CO     19809
##  6 CT     15194
##  7 DC      3788
##  8 DE      2331
##  9 FL    153007
## 10 GA     31111
## # ... with 39 more rows
options(scipen = 999)
# Bar chart of accident counts per state.
# Columns are referenced by bare name inside aes(); the earlier `S2$...` form
# is what ggplot2 reports as discouraged in the recorded warnings.
ggplot(S2, aes(x = State, y = n)) +
  geom_bar(stat = "identity", position = position_dodge(width = 2)) +
  labs(x = "State", y = "Frequency")
## Warning: Use of `S2$State` is discouraged. Use `State` instead.
## Warning: Use of `S2$n` is discouraged. Use `n` instead.

library(rgdal)
## Loading required package: sp
## Please note that rgdal will be retired by the end of 2023,
## plan transition to sf/stars/terra functions using GDAL and PROJ
## at your earliest convenience.
## 
## rgdal: version: 1.5-28, (SVN revision 1158)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.2.1, released 2020/12/29
## Path to GDAL shared files: C:/Users/onecouple/Documents/R/win-library/4.1/rgdal/gdal
## GDAL binary built with GEOS: TRUE 
## Loaded PROJ runtime: Rel. 7.2.1, January 1st, 2021, [PJ_VERSION: 721]
## Path to PROJ shared files: C:/Users/onecouple/Documents/R/win-library/4.1/rgdal/proj
## PROJ CDN enabled: FALSE
## Linking to sp version:1.4-6
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading sp or rgdal.
## Overwritten PROJ_LIB was C:/Users/onecouple/Documents/R/win-library/4.1/rgdal/proj
library(shiny)
library(purrr)
library(usmap)
library(ggplot2)
library(dplyr)
# Preview the first five start/end timestamps.
head(select(my_data, Start_Time, End_Time), 5)
## # A tibble: 5 x 2
##   Start_Time          End_Time           
##   <dttm>              <dttm>             
## 1 2016-02-08 00:37:08 2016-02-08 06:37:08
## 2 2016-02-08 05:56:20 2016-02-08 11:56:20
## 3 2016-02-08 06:15:39 2016-02-08 12:15:39
## 4 2016-02-08 06:15:39 2016-02-08 12:15:39
## 5 2016-02-08 06:51:45 2016-02-08 12:51:45
library(stringr)
library(tidyr)
library(dplyr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Build a per-accident table of calendar parts and duration.
# - Duration is the End_Time - Start_Time gap in minutes. The unit is fixed
#   explicitly; plain subtraction lets difftime choose the unit from the data.
#   The recorded output (360 for a six-hour gap) shows it was minutes before.
# - Rows with a negative duration are dropped.
# - Year/Month/Day/Hour are zero-padded strings taken directly from Start_Time
#   with format(), rather than by splitting its text form on a space.
#   NOTE(review): as.character() on date-times can omit the time part for
#   midnight values in newer R versions, which would break the split - confirm.
# - Wday is lubridate's day-of-week number, stored as a string.
accidents_time <- my_data %>%
  mutate(
    Duration = as.numeric(difftime(End_Time, Start_Time, units = "mins"))
  ) %>%
  filter(!(Duration < 0)) %>%
  mutate(
    Year = format(Start_Time, "%Y"),
    Month = format(Start_Time, "%m"),
    Day = format(Start_Time, "%d"),
    Wday = as.character(wday(Start_Time)),
    Hour = format(Start_Time, "%H")
  ) %>%
  select(Severity, Year, Month, Day, Hour, Wday, Duration)
head(accidents_time)
## # A tibble: 6 x 7
##   Severity Year  Month Day   Hour  Wday  Duration
##      <dbl> <chr> <chr> <chr> <chr> <chr>    <dbl>
## 1        3 2016  02    08    00    2          360
## 2        2 2016  02    08    05    2          360
## 3        2 2016  02    08    06    2          360
## 4        2 2016  02    08    06    2          360
## 5        2 2016  02    08    06    2          360
## 6        3 2016  02    08    07    2          360
library(ggplot2)
# Number of accidents that started in each hour of the day.
accidents_happenHour <- count(accidents_time, Hour)
accidents_happenHour
## # A tibble: 24 x 2
##    Hour      n
##    <chr> <int>
##  1 00    46125
##  2 01    43434
##  3 02    38801
##  4 03    32175
##  5 04    29478
##  6 05    40105
##  7 06    57859
##  8 07    73938
##  9 08    75802
## 10 09    59744
## # ... with 14 more rows
# Scatter plot of accident counts by hour; point colour also encodes the count.
p <- ggplot(data = accidents_happenHour, mapping = aes(x = Hour, y = n))
p +
  geom_point(mapping = aes(color = n)) +
  labs(x = "Hour of a day", y = "Number of accidents")

# Mean severity of the accidents that started in each hour.
# The summary column is left unnamed, so it is called `mean(Severity)`.
accidents_severity <- summarise(
  group_by(accidents_time, Hour),
  mean(Severity)
)
accidents_severity
## # A tibble: 24 x 2
##    Hour  `mean(Severity)`
##    <chr>            <dbl>
##  1 00                2.26
##  2 01                2.18
##  3 02                2.21
##  4 03                2.27
##  5 04                2.31
##  6 05                2.32
##  7 06                2.27
##  8 07                2.25
##  9 08                2.24
## 10 09                2.27
## # ... with 14 more rows
# Join the hourly counts and hourly mean severity on their shared Hour column,
# then give the mean column a readable name.
accident_summary <- merge(
  accidents_happenHour,
  accidents_severity,
  by = "Hour"
)
accident_summary <- rename(
  accident_summary,
  Average_Severity = `mean(Severity)`
)
accident_summary
##    Hour      n Average_Severity
## 1    00  46125         2.256629
## 2    01  43434         2.179168
## 3    02  38801         2.206593
## 4    03  32175         2.272789
## 5    04  29478         2.306805
## 6    05  40105         2.320035
## 7    06  57859         2.272870
## 8    07  73938         2.248032
## 9    08  75802         2.242131
## 10   09  59744         2.272078
## 11   10  50442         2.285813
## 12   11  51884         2.266074
## 13   12  72283         2.222085
## 14   13  83700         2.198076
## 15   14  90162         2.204543
## 16   15 100074         2.223345
## 17   16 105559         2.227446
## 18   17 108011         2.225986
## 19   18  91413         2.235459
## 20   19  65190         2.243688
## 21   20  53515         2.248024
## 22   21  48988         2.231342
## 23   22  49156         2.232973
## 24   23  48226         2.232613
options(scipen=999)
ggplot(data = accident_summary) + 
  geom_col(mapping=a