# TidyTuesday - Transit Costs

library(tidyverse)
library(ggplot2)
library(ggrepel)
library(ggdark)
library(ggtext)

# Read the raw transit-cost table and normalise the columns the rest of the
# script relies on, in a single pipeline.
transit_cost <-
  data.table::fread(
    "~/github/tidytues/tidytuesday/data/2021/2021-01-05/transit_cost.csv"
  ) %>%
  mutate(
    # Drop the "%" sign so the tunnel share can be converted to a number.
    tunnel_per = as.numeric(str_remove(tunnel_per, "%")),
    real_cost = as.numeric(real_cost),
    # Missing country codes are replaced by the placeholder "Unk".
    country = replace_na(country, "Unk"),
    # Country + city key; built after `country` has been filled in.
    cc_id = paste(country, city, sep = " ")
  )

# Per-city summary of the transit projects, restricted to Chinese and Indian
# cities that have more than one project.
#
# * `total_cost` is `real_cost` summed and divided by 1e3 (billions).
# * `tunnel_pc` is defined exactly once, as average tunnel length over average
#   line length. Each mean skips its own missing values, so the two averages
#   may be taken over different sets of projects.
# * `.groups = "drop"` returns an ungrouped tibble, so later `filter()` /
#   `mutate()` calls do not silently operate per country.
dat <- transit_cost %>%
  group_by(country, city) %>%
  summarise(
    total_projects = n(),
    total_stations = sum(stations, na.rm = TRUE),
    total_tunnel_len = sum(tunnel, na.rm = TRUE),
    total_len = sum(length, na.rm = TRUE),
    total_cost = sum(real_cost, na.rm = TRUE) / 1e3, # now in billions
    avg_stations = mean(stations, na.rm = TRUE),
    avg_tunnel_len = mean(tunnel, na.rm = TRUE),
    avg_len = mean(length, na.rm = TRUE),
    tunnel_pc = avg_tunnel_len / avg_len,
    avg_cost = mean(real_cost, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(
    total_len < 10000,
    total_projects > 1,
    country %in% c("CN", "IN")
  ) %>%
  mutate(
    # Build the key from the two-letter code before the code is replaced by
    # the display name, so it matches `transit_cost$cc_id`.
    cc_id = paste(country, city),
    country = ifelse(country == "CN", "China", "India")
  )
glimpse(dat)
## Rows: 35
## Columns: 13
## $ country          <chr> "China", "China", "China", "China", "China", "China"…
## $ city             <chr> "Beijing", "Changchun", "Changsha", "Chengdu", "Chon…
## $ total_projects   <int> 27, 7, 13, 11, 11, 7, 3, 10, 5, 11, 10, 8, 2, 5, 12,…
## $ total_stations   <int> 376, 81, 152, 152, 163, 66, 61, 121, 90, 154, 192, 1…
## $ total_tunnel_len <dbl> 450.0686, 95.5000, 164.5400, 225.1000, 156.3370, 151…
## $ total_len        <dbl> 721.973, 116.000, 216.860, 252.950, 273.220, 166.940…
## $ total_cost       <dbl> 138.86373, 16.92127, 38.33016, 44.59756, 41.87035, 2…
## $ avg_stations     <dbl> 13.9259259, 11.5714286, 11.6923077, 13.8181818, 14.8…
## $ avg_tunnel_len   <dbl> 20.457664, 13.642857, 13.711667, 20.463636, 19.54212…
## $ avg_len          <dbl> 26.739741, 16.571429, 16.681538, 22.995455, 24.83818…
## $ tunnel_pc        <dbl> 0.76506589, 0.82327586, 0.82196655, 0.88989919, 0.78…
## $ avg_cost         <dbl> 5143.1011, 2417.3243, 2948.4738, 4054.3236, 3806.395…
## $ cc_id            <chr> "CN Beijing", "CN Changchun", "CN Changsha", "CN Che…
# Indian cities that receive a text label in the plot: all of them except
# Gurgaon.
label_dat_india <- filter(dat, country == "India", city != "Gurgaon")
# Rows for Shanghai and Beijing; the first row anchors the
# "Shanghai & Beijing" annotation in the plot.
label_dat_china <- filter(dat, city == "Shanghai" | city == "Beijing")

# Line-level rows drawn as the faint background points of the plot:
# complete rows only, limited to the cities kept in `dat`, to China/India,
# and to lines shorter than 100.
to_plot <- transit_cost %>%
  tidyr::drop_na() %>%
  filter(cc_id %in% dat$cc_id) %>%
  filter(country %in% c("CN", "IN")) %>%
  filter(length < 100) %>%
  # Swap the two-letter code for the display name used in the legend.
  mutate(country = if_else(country == "CN", "China", "India"))

glimpse(to_plot)
## Rows: 192
## Columns: 21
## $ e                <int> 7288, 7289, 7290, 7291, 7296, 7297, 7298, 7299, 7304…
## $ country          <chr> "India", "India", "India", "India", "India", "India"…
## $ city             <chr> "Mumbai", "Mumbai", "Mumbai", "Mumbai", "Mumbai", "M…
## $ line             <chr> "Monorail", "Line 3", "Line 2A", "Line 2B", "Line 4"…
## $ start_year       <chr> "2009", "2016", "2016", "2018", "2018", "2019", "201…
## $ end_year         <chr> "2019", "2022", "2021", "2023", "2022", "2022", "202…
## $ rr               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ length           <dbl> 20.20, 33.50, 18.60, 23.50, 32.30, 2.70, 24.90, 14.5…
## $ tunnel_per       <dbl> 0.00, 100.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.…
## $ tunnel           <dbl> 0.000, 33.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.…
## $ stations         <int> 17, 27, 17, 22, 32, 2, 16, 13, 16, 11, 4, 10, 17, 46…
## $ source1          <chr> "Plan", "Media", "Media", "Plan", "Plan", "Plan", "P…
## $ cost             <dbl> 30000.00, 320000.00, 64100.00, 109860.00, 145490.00,…
## $ currency         <chr> "INR", "INR", "INR", "INR", "INR", "INR", "INR", "IN…
## $ year             <int> 2014, 2019, 2018, 2020, 2020, 2017, 2020, 2016, 2018…
## $ ppp_rate         <dbl> 0.0540, 0.0470, 0.0480, 0.0470, 0.0470, 0.0480, 0.04…
## $ real_cost        <dbl> 1620.000, 15040.000, 3076.800, 5163.420, 6838.030, 3…
## $ cost_km_millions <dbl> 80.19802, 448.95522, 165.41935, 219.72000, 211.70372…
## $ source2          <chr> "Media", "Trade", "Media", "Plan", "Plan", "Plan", "…
## $ reference        <chr> "https://indianexpress.com/article/cities/mumbai/ind…
## $ cc_id            <chr> "IN Mumbai", "IN Mumbai", "IN Mumbai", "IN Mumbai", …
# Colours used for India and China throughout the plot.
in_color <- "#2a9d8f"
cn_color <- "#fca311"

# Chennai's summary row, unpacked into scalars that position and word the
# Chennai call-out in the plot.
chennai <- filter(dat, city == "Chennai")
chennai_stations <- chennai[["total_stations"]]
chennai_projects <- chennai[["total_projects"]]
chennai_x <- chennai[["avg_len"]]
chennai_y <- chennai[["avg_stations"]]
chennai_cost <- chennai[["total_cost"]]

# Scatter plot of average line length vs. average station count per city
# (coloured by country, sized by total city cost), drawn over a faint layer of
# individual transit lines and decorated with hand-placed annotations.
#
# `dat$total_cost` is expressed in billions of dollars, so the size legend and
# the Chennai call-out carry a "B" suffix.
ggplot(to_plot, aes(x = length, y = stations)) +
  # Individual transit lines as faint background points.
  geom_point(color = "#8d99ae", size = 0.8, show.legend = FALSE, alpha = 0.3) +
  # Per-country linear trend through the city averages, leaving out Wenzhou.
  geom_smooth(
    data = dat %>% filter(city != "Wenzhou"),
    aes(avg_len, avg_stations, color = country),
    se = FALSE, linetype = "dashed", size = 0.3, method = "lm"
  ) +
  # One bubble per city; the exponent exaggerates differences in total cost.
  geom_point(
    data = dat,
    aes(avg_len, avg_stations, color = country, size = total_cost^1.3),
    shape = 19, alpha = 0.7
  ) +
  geom_text_repel(
    data = label_dat_india,
    aes(avg_len, avg_stations, label = city, color = country),
    min.segment.length = 1,
    box.padding = unit(0.5, "line"),
    nudge_x = -1,
    show.legend = FALSE
  ) +
  # "Shanghai & Beijing" call-out, anchored on the first row of
  # `label_dat_china`; the zero-length arrow draws the curve without a head.
  annotate(
    geom = "curve",
    xend = label_dat_china$avg_len[1],
    yend = label_dat_china$avg_stations[1],
    x = label_dat_china$avg_len[1] + 6,
    y = label_dat_china$avg_stations[1] - 6,
    curvature = .3, arrow = arrow(length = unit(0, "mm")), color = cn_color
  ) +
  annotate(
    geom = "text",
    x = label_dat_china$avg_len[1] + 5.5,
    y = label_dat_china$avg_stations[1] - 7,
    label = "Shanghai & Beijing", hjust = "left", size = 3.4, color = cn_color
  ) +
  # Explanatory note for the background points; it sits below y = 0, which is
  # why clipping is switched off in `coord_cartesian()` further down.
  annotate(
    geom = "curve",
    xend = 2, yend = 0,
    x = 2 + 7, y = 0 - 11,
    curvature = -.3, arrow = arrow(length = unit(2, "mm")),
    color = "#8d99ae", alpha = 0.6
  ) +
  annotate(
    geom = "text",
    x = 2 + 7.5, y = 0 - 11.5,
    label = "Each point is a transit line",
    hjust = "left", size = 3.4, color = "#8d99ae"
  ) +
  # Chennai call-out: a straight vertical leader (curvature 0) plus its label.
  annotate(
    geom = "curve",
    xend = chennai_x, yend = chennai_y + 2,
    x = chennai_x, y = chennai_y + 16,
    curvature = 0, arrow = arrow(length = unit(0, "mm")),
    color = in_color, alpha = 0.6
  ) +
  annotate(
    geom = "text",
    x = chennai_x + 0.5, y = chennai_y + 13,
    label = glue::glue(
      "{cost} over {stations} stations\nin {projects} lines",
      # `chennai_cost` is in billions, hence the "B" suffix.
      cost = scales::label_dollar(accuracy = 1, suffix = "B")(chennai_cost),
      stations = chennai_stations,
      projects = chennai_projects
    ),
    hjust = "left", size = 3.4, color = in_color
  ) +
  dark_theme_minimal() +
  scale_x_continuous(breaks = seq(10, 90, 20)) +
  scale_y_continuous(breaks = seq(10, 70, 20)) +
  # Breaks use the same ^1.3 transform as the size aesthetic; labels are in
  # billions to match `total_cost`.
  scale_size(
    name = "Total City Cost",
    breaks = c(20^1.3, 50^1.3, 80^1.3),
    labels = c("$20B", "$50B", "$80B"),
    range = c(1, 10)
  ) +
  coord_cartesian(ylim = c(0, 90), clip = "off") +
  # NOTE(review): the project counts and dollar totals in the subtitle are
  # hard-coded rather than computed from the data. The "$2B" India total looks
  # too small next to the per-line `real_cost` values -- please re-verify.
  labs(
    title = "The Cost of Transit in the 21<sup>st</sup> Century",
    subtitle = glue::glue("<span style='color:{cn_color};font-family:Inter-Medium;'>China: </span>253 projects in 28 cities, totaling $1T since 1998<br /><span style='color:{in_color};font-family:Inter-Medium;'>India: </span> 29 projects in 7 cities, totaling $2B since 2011<br /><span style='color:{in_color};font-family:Inter-Medium;'>India</span> has longer transit lines with more stations than <span style='color:{cn_color};font-family:Inter-Medium;'>China</span>, driving up costs <br />for each city."),
    x = "Average Length",
    y = "Average Stations",
    caption = "#tidytuesday\n@rsangole"
  ) +
  theme(
    legend.title = element_text(size = 10, color = "gray60"),
    legend.text = element_text(size = 10, color = "gray60"),
    axis.ticks.x.bottom = element_line(colour = "gray30", size = 0.5),
    axis.ticks.y.left = element_line(colour = "gray30"),
    axis.title.y = element_text(
      hjust = .9, size = 10, face = "italic", color = "gray60"
    ),
    axis.title.x = element_text(
      hjust = .9, size = 10, face = "italic", color = "gray60"
    ),
    plot.title = element_markdown(
      family = "Inter-Medium", color = "#f8f8f2", size = 22,
      margin = margin(0, 0, 0.5, 0, unit = "line")
    ),
    plot.title.position = "plot",
    plot.subtitle = element_markdown(
      color = "#f8f8f2", size = 12, lineheight = 1.2,
      margin = margin(0, 0, 1, 0, unit = "line")
    ),
    plot.margin = margin(1.5, 1.5, 1, 1.5, unit = "line"),
    legend.position = c(0.9, 0.1)
  ) +
  # Country colours; the colour legend is hidden because the subtitle already
  # colour-codes the two country names.
  scale_discrete_manual(
    aesthetics = "color",
    values = c("India" = in_color, "China" = cn_color),
    guide = "none"
  )