Chapter 3 EDA
3.1 Basic Plots
This section starts by looking at the FlightDelays dataset.
3.1.1 A basic barchart of the Carrier attribute
# Convert the FlightDelays data to a tibble; later chunks reuse `flights`.
flights <- as_tibble(FlightDelays)
# Number of flights per carrier (the outer parentheses keep the original
# print-the-result form).
(flights %>% group_by(Carrier) %>% summarise(n()))
## # A tibble: 2 x 2
## Carrier `n()`
## * <fct> <int>
## 1 AA 2906
## 2 UA 1123
# Bar chart of the number of flights per carrier, with y-axis breaks every 250.
flights %>%
  ggplot(mapping = aes(x = Carrier)) +
  geom_bar() +
  scale_y_continuous(breaks = seq(from = 0, to = 3000, by = 250))
A two-way contingency table between carrier and whether or not a flight was delayed by more than 30 minutes.
# Per-carrier summary of the Delayed30 flag.
# `status` is TRUE when Delayed30 == "Yes", so:
#   dl30min   = proportion of flights with status TRUE
#   dl30min_n = number of flights with status TRUE
#   dl30min_Y = number of flights with status FALSE
#   ct        = total number of flights for the carrier
# NOTE(review): the `_n` / `_Y` suffixes read as the reverse of what they
# count; the names are left unchanged so they match the printed output.
temp_stat <- flights %>%
  group_by(Carrier) %>%
  mutate(status = (Delayed30 == "Yes")) %>%
  summarise(
    dl30min = mean(status),
    dl30min_n = sum(status),
    dl30min_Y = sum(!status),
    ct = n()
  )
temp_stat
## # A tibble: 2 x 5
## Carrier dl30min dl30min_n dl30min_Y ct
## * <fct> <dbl> <int> <int> <int>
## 1 AA 0.135 393 2513 2906
## 2 UA 0.182 204 919 1123
3.1.2 Flight delay in various intervals
# Delay values for carrier UA, flattened from a one-column tibble to a vector.
delays <- flights %>%
  filter(Carrier == "UA") %>%
  dplyr::select(Delay) %>%
  unlist()
# Bin the delays into intervals of width 50 from -50 to 450 and count the
# flights falling in each interval.
temp_stat <- as.data.frame(cut(delays, seq(-50, 450, 50)) %>% table())
colnames(temp_stat) <- c("time interval", "number of flights")
# Render the counts as a striped, centred, non-full-width table.
kbl(temp_stat) %>%
  kable_styling(
    bootstrap_options = "striped",
    full_width = FALSE,
    position = "center"
  )
time interval | number of flights |
---|---|
(-50,0] | 722 |
(0,50] | 249 |
(50,100] | 86 |
(100,150] | 39 |
(150,200] | 14 |
(200,250] | 7 |
(250,300] | 3 |
(300,350] | 2 |
(350,400] | 1 |
(400,450] | 0 |
3.1.3 Histogram of the delay times
# Distribution of the Delay values for carrier UA.
# NOTE(review): geom_bar() draws one bar per distinct Delay value; if binned
# bars are wanted for this "histogram", geom_histogram() would be the usual
# choice - confirm the intended figure before changing it.
flights %>%
  dplyr::filter(Carrier == "UA") %>%
  dplyr::select(Delay) %>%
  ggplot(aes(Delay)) +
  geom_bar()
3.2 Numeric Summaries
# Mean, median and standard deviation of `delays`, in that order.
vapply(list(mean, median, sd), function(stat) stat(delays), numeric(1))
## [1] 15.98308 -1.00000 45.13895
# Minimum, quartiles, mean and maximum of the same vector.
summary(delays)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -17.00 -5.00 -1.00 15.98 12.50 377.00
# One-row table of summary statistics of Delay for carrier UA.
# `nd` now counts the distinct Delay values: the original called n_distinct()
# with no column, which is why the recorded table shows nd = 0 (and recent
# dplyr releases reject the empty call). The recorded table below still shows
# the old value until the document is re-rendered.
temp_stat <- flights %>%
  filter(Carrier == "UA") %>%
  dplyr::select(Delay) %>%
  summarise(
    n = n(),
    nd = n_distinct(Delay),
    sd(Delay),
    mad(Delay),
    IQR(Delay),
    var(Delay),
    min(Delay),
    max(Delay),
    quantile(Delay, 0.3)
  )
# Render the statistics as a striped, centred, non-full-width table.
kbl(temp_stat) %>%
  kable_styling(
    bootstrap_options = "striped",
    full_width = FALSE,
    position = "center"
  )
n | nd | sd(Delay) | mad(Delay) | IQR(Delay) | var(Delay) | min(Delay) | max(Delay) | quantile(Delay, 0.3) |
---|---|---|---|---|---|---|---|---|
1123 | 0 | 45.13895 | 7.413 | 17.5 | 2037.525 | -17 | 377 | -4 |
3.3 Box Plots
# Keep Delay and Day for carrier UA. Note that this rebinds `delays` from the
# earlier numeric vector to a two-column tibble.
delays <- flights %>%
  filter(Carrier == "UA") %>%
  dplyr::select(Delay, Day)
# Box plots of Delay against Day.
delays %>%
  ggplot(aes(x = Day, y = Delay)) +
  geom_boxplot()
3.4 Quantile Plots
# Q-Q plot of Weight from the NCBirths2004 data, with a reference line.
ncbirths <- as_tibble(NCBirths2004)
ncbirths %>%
  ggplot(aes(sample = Weight)) +
  geom_qq() +
  geom_qq_line()
# Same Q-Q plot for the Delay values of carrier UA.
delays <- flights %>%
  filter(Carrier == "UA") %>%
  dplyr::select(Delay, Day)
delays %>%
  ggplot(aes(sample = Delay)) +
  geom_qq() +
  geom_qq_line()
3.5 ECDF
# Empirical CDF of Beer as a step function, one coloured curve per Gender.
Beerwings %>%
  ggplot(mapping = aes(x = Beer, color = Gender)) +
  stat_ecdf(geom = "step")
3.6 Scatterplot
# Scatterplot of Beer (y) against Hotwings (x).
Beerwings %>%
  ggplot(mapping = aes(x = Hotwings, y = Beer)) +
  geom_point()