Plotting a regression line on a scatter plot of smoking and drinking data with ggplot2 (CC355)

April 17, 2025 • PD Schloss • 9 min read

Pat uses R to recreate a plot from Philip Bump showing the correlation between smoking and drinking across US counties using functions from the tidyverse. To pull this off he uses the dplyr, ggplot2, and showtext packages. The functions he uses from these packages include aes, annotate, coord_cartesian, drop_na, element_blank, element_line, element_text, filter, font_add_google, geom_abline, geom_hline, geom_label, geom_point, geom_smooth, geom_vline, ggplot, ggsave, inner_join, labs, library, lm, median, mutate, read_csv, round, select, showtext_auto, showtext_opts, starts_with, summarize, theme, tribble, and unit. The newsletter describing this visualization at a 30,000 ft view can be found here. You can find the original article here. The UWPHI data can be found here. If you have a figure that you would like to see me discuss in a future newsletter and episode of Code Club, email me at pat@riffomonas.org!

Code

library(tidyverse)
library(showtext)

font_add_google("Libre Franklin", "franklin")
showtext_opts(dpi = 300)
showtext_auto()

codes_of_interest <- tribble(
  ~fipscode, ~hjust, ~vjust,
  "02180",  -0.15,  0.5,
  "38079",  0.5,  -0.5, 
  "02070",  -0.15,  0.5,
  "19115",  -0.15,  0.5,
  "19019",  -0.15,  0.5,
  "55109",  0.5,  1.5,
  "06041",  0.5,  1.5,
  "51610",  0.5,  1.5,
  "49043",  0.5,  1.5,
  "49051",  0.5,  1.5,
  "49049",  1.15,  0.5,
  "09005",  0,  0,
  "24033",  1.15,  0.5,
  "49031",  1.15,  0.5,
  "28021",  1.15,  0.5,
  "28063",  1.15,  0.5,
  "28053",  1.15,  0.5,
  "21237",  1.15,  0.5,
  "46121",  1.15,  0.5,
  "38085",  -0.15,  0.5
)

smoking_drinking <- read_csv("https://www.countyhealthrankings.org/sites/default/files/media/document/analytic_data2025.csv",
    skip = 1) %>%
  filter(countycode != "000") %>%
  select(fipscode, state, county,
         smoking = v009_rawvalue, drinking = v049_rawvalue) %>%
  drop_na() %>%
  mutate(smoking = round(smoking, digits = 2),
         drinking = round(drinking, digits = 2))

median_values <- smoking_drinking %>%
  summarize(med_smoking = median(smoking),
            med_drinking = median(drinking))

interest <- inner_join(smoking_drinking, codes_of_interest, by = "fipscode")

model <- lm(smoking~drinking, data = smoking_drinking)
intercept <- model$coefficients[1]
slope <- model$coefficients[2]

smoking_drinking %>%
  ggplot(aes(x = drinking, y = smoking)) +
  geom_vline(xintercept = median_values$med_drinking, color = "gray80",
             linewidth = 0.2) +
  geom_hline(yintercept = median_values$med_smoking, color = "gray80",
             linewidth = 0.2) +
  geom_point(size = 1, alpha = 0.2, color = "#479126") +
  geom_point(data = interest, 
             aes(x = drinking, y = smoking), fill = "#499427", shape = 21) +
  geom_abline(slope = slope, intercept = intercept,
              linewidth = 0.4) +
  # geom_smooth(formula = y~x, method = "lm") +
  geom_label(data = interest, 
             aes(x = drinking, y = smoking, label = fipscode),
             hjust = interest$hjust, vjust = interest$vjust,
             family = "franklin", size = 8.5, size.unit = "pt",
             label.padding = unit(1, "pt"), label.size = unit(0, "pt")) +
  annotate(geom = "text",
           x = c(-0.03, 0.317),
           y = c(0.416, -0.035),
           hjust = c(-0.05, 1),
           vjust = c(1.2, -0.5),
           label = c("Adult smoking", "Excessive drinking"),
           family = "franklin", size = 8.5, size.unit = "pt") +
  annotate(geom = "text",
           x = c(0, 0.317),
           y = c(0.36, 0.01),
           hjust = c(0, 1),
           label = c("Less drinking,\nmore smokers",
                     "More drinking,\nfewer smokers"),
           family = "franklin", size = 9, size.unit = "pt",
           fontface = "bold", lineheight = 1) +
  coord_cartesian(clip = "off", expand = FALSE,
                  xlim = c(-0.03, 0.317), ylim = c(-0.035, 0.416)) +
  labs(
    title = "How smoking compares with excessive drinking",
    subtitle = "Median county values are shown with horizontal and vertical lines.",
    caption = "Source: University of Wisconsin Population Health Institute"
  ) +
  theme(
    text = element_text(family = "franklin"),
    panel.grid = element_blank(),
    panel.background = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    axis.line = element_line(linewidth = 0.3),
    plot.title = element_text(face = "bold", margin = margin(t = 3, b = 12)),
    plot.subtitle = element_text(margin = margin(b = 8)),
    plot.caption = element_text(hjust = 0, margin = margin(t = 7))
  )

ggsave("uwphi_biplot.png", width = 6, height = 6.39)