Unequal Recreational Losses?
  • Home
  • MDCEV
  • Random Utility
  • Source Code
  • Report a Bug

On this page

  • Data
    • Libraries
    • Welfare Table
    • Per participant
      • EJ
      • Environmental Vulnerabilities
  • Income
  • Low-income Welfare
  • Spread Across Models & DCA
  • Map
    • Grey Map
    • Color Map
    • Color RUM
  • Sensitivity Test
  • What Years?
  • By park site

Unequal Recreational Losses? Revealed Preference Evidence of Campsite Closures in Hawaiʻi

  • Show All Code
  • Hide All Code

  • View Source

This site provides a walk through on the results from Unequal Recreational Losses? Revealed Preference Evidence of Campsite Closures in Hawaiʻi. This tab covers the comparison by loading the data from the models. Tabs MDCEV and RUM are the codes that will run each model.

Data

Libraries

Show code
library(dplyr)
library(tidyr)
library(haven)
library(readr)
library(mlogit)
library(lmtest)
library(purrr)
library(stringr)
library(readr)
library(ggplot2)
library(stargazer)
library(readxl)
library(patchwork)
library(broom)
library(tidyverse)
library(sf)
library(tigris)
library(ggrepel)
library(ggspatial)
library(ggpattern)


final_comparison_master=read_csv("data/final_comparison_master.csv")


final_data_2018 <- read_csv("data/final_data_2018.csv")
final_data_2019 <- read_csv("data/final_data_2019.csv")
final_data_2020 <- read_csv("data/final_data_2020.csv")
final_data_2021 <- read_csv("data/final_data_2021.csv")
final_data_2022 <- read_csv("data/final_data_2022.csv")
final_data_2023 <- read_csv("data/final_data_2023.csv")

model_list_obs <- readRDS("data/model_list_obs.rds")
model_list_52 <- readRDS("data/model_list_52.rds")

Welfare Table

Show code
# Calculate Mean and 95% Confidence Intervals
summary_wide <- final_comparison_master %>%
  filter(year %in% c(2018, 2023)) %>%
  group_by(parkname, year) %>%
  summarise(
    n = n(),
    mean_val = mean(loss_rum_observed, na.rm = TRUE),
    sd_val   = sd(loss_rum_observed, na.rm = TRUE),
    # Calculate Standard Error and 95% CI bounds
    se_val   = sd_val / sqrt(n),
    LCI      = mean_val - (1.96 * se_val),
    UCI      = mean_val + (1.96 * se_val),
    .groups = "drop"
  ) %>%
  select(parkname, year, mean_val, LCI, UCI) %>%
  pivot_wider(
    names_from = year, 
    values_from = c(mean_val, LCI, UCI),
    names_glue = "{year}_{.value}" 
  ) %>%
  # 2018 Stats then 2023 Stats
  select(parkname, 
         contains("2018"), 
         contains("2023")) %>%
  as.data.frame()
Show code
stargazer(summary_wide, 
          type = "html", 
          out = "tables/Welfare_Comparison_Table.html",
          summary = FALSE, 
          rownames = FALSE,
          title = "Welfare Loss: Mean and 95% Confidence Intervals (2018 vs 2023)",
          digits = 2,
          # Groups the 3 columns under each year
          column.labels = c("Park Name", "2018", "2023"),
          column.separate = c(1, 3, 3), 
          covariate.labels = c("Park", 
                               "Mean", "Lower CI", "Upper CI", 
                               "Mean", "Lower CI", "Upper CI"))



# Calculate Mean and 95% Confidence Intervals
summary_wide <- final_comparison_master %>%
  filter(year %in% c(2018, 2023)) %>%
  group_by(parkname, year) %>%
  summarise(
    n = n(),
    mean_val = mean(loss_rum_52wk, na.rm = TRUE),
    sd_val   = sd(loss_rum_52wk, na.rm = TRUE),
    # Calculate Standard Error and 95% CI bounds
    se_val   = sd_val / sqrt(n),
    LCI      = mean_val - (1.96 * se_val),
    UCI      = mean_val + (1.96 * se_val),
    .groups = "drop"
  ) %>%
  select(parkname, year, mean_val, LCI, UCI) %>%
  pivot_wider(
    names_from = year, 
    values_from = c(mean_val, LCI, UCI),
    names_glue = "{year}_{.value}" 
  ) %>%
  # 2018 Stats then 2023 Stats
  select(parkname, 
         contains("2018"), 
         contains("2023")) %>%
  as.data.frame()
Show code
stargazer(summary_wide, 
          type = "html", 
          out = "tables/Welfare_Comparison_Table_frequency.html",
          summary = FALSE, 
          rownames = FALSE,
          title = "Welfare Loss Frequency: Mean and 95% Confidence Intervals (2018 vs 2023)",
          digits = 2,
          # Groups the 3 columns under each year
          column.labels = c("Park Name", "2018", "2023"),
          column.separate = c(1, 3, 3), 
          covariate.labels = c("Park", 
                               "Mean", "Lower CI", "Upper CI", 
                               "Mean", "Lower CI", "Upper CI"))
Show the Code
# Calculate Mean and 95% Confidence Intervals
summary_wide <- final_comparison_master %>%
  filter(year %in% c(2019, 2020)) %>%
  group_by(parkname, year) %>%
  summarise(
    n = n(),
    mean_val = mean(loss_rum_52wk, na.rm = TRUE),
    sd_val   = sd(loss_rum_52wk, na.rm = TRUE),
    # Calculate Standard Error and 95% CI bounds
    se_val   = sd_val / sqrt(n),
    LCI      = mean_val - (1.96 * se_val),
    UCI      = mean_val + (1.96 * se_val),
    .groups = "drop"
  ) %>%
  select(parkname, year, mean_val, LCI, UCI) %>%
  pivot_wider(
    names_from = year, 
    values_from = c(mean_val, LCI, UCI),
    names_glue = "{year}_{.value}" 
  ) %>%
  # 2018 Stats then 2023 Stats
  select(parkname, 
         contains("2019"), 
         contains("2020")) %>%
  as.data.frame()
Show code
stargazer(summary_wide, 
          type = "html", 
          out = "tables/Welfare_Comparison_Table_frequency_1920.html",
          summary = FALSE, 
          rownames = FALSE,
          title = "Welfare Loss Frequency: Mean and 95% Confidence Intervals (2019 & 2020)",
          digits = 2,
          # Groups the 3 columns under each year
          column.labels = c("Park Name", "2019", "2020"),
          column.separate = c(1, 3, 3), 
          covariate.labels = c("Park", 
                               "Mean", "Lower CI", "Upper CI", 
                               "Mean", "Lower CI", "Upper CI"))
Show code
# ── Build summary for both models ─────────────────────────────────────────────
calc_summary <- function(data, loss_col, years) {
    data %>%
        filter(year %in% years, parkname != "Stay_Home") %>%
        group_by(parkname, year) %>%
        summarise(
            n        = n(),
            mean_val = mean(.data[[loss_col]],  na.rm = TRUE),
            sd_val   = sd(.data[[loss_col]],    na.rm = TRUE),
            se_val   = sd_val / sqrt(n),
            LCI      = mean_val - 1.96 * se_val,
            UCI      = mean_val + 1.96 * se_val,
            .groups  = "drop"
        ) %>%
        select(parkname, year, mean_val, LCI, UCI)
}

# ── Frequency RUM ─────────────────────────────────────────────────────────────
freq_wide <- calc_summary(final_comparison_master, "loss_rum_52wk", c(2021, 2022)) %>%
    pivot_wider(
        names_from  = year,
        values_from = c(mean_val, LCI, UCI),
        names_glue  = "{year}_freq_{.value}"
    )

# ── Observed RUM ──────────────────────────────────────────────────────────────
obs_wide <- calc_summary(final_comparison_master, "loss_rum_observed", c(2021, 2022)) %>%
    pivot_wider(
        names_from  = year,
        values_from = c(mean_val, LCI, UCI),
        names_glue  = "{year}_obs_{.value}"
    )

# ── Join both models ──────────────────────────────────────────────────────────
summary_wide <- freq_wide %>%
    left_join(obs_wide, by = "parkname") %>%
    select(
        parkname,
        # 2019 — Observed then Frequency
        `2021_obs_mean_val`, `2021_obs_LCI`, `2021_obs_UCI`,
        `2021_freq_mean_val`, `2021_freq_LCI`, `2021_freq_UCI`,
        # 2020 — Observed then Frequency
        `2022_obs_mean_val`, `2022_obs_LCI`, `2022_obs_UCI`,
        `2022_freq_mean_val`, `2022_freq_LCI`, `2022_freq_UCI`
    ) %>%
    arrange(parkname) %>%
    as.data.frame()

# ── Export ────────────────────────────────────────────────────────────────────
stargazer(
    summary_wide,
    type       = "html",
    out        = "tables/Welfare_Comparison_Table_frequency_2122.html",
    summary    = FALSE,
    rownames   = FALSE,
    title      = "Welfare Loss: Observed and Frequency RUM — Mean and 95% CI (2021 & 2022)",
    digits     = 2,
    # 4 groups: 2020 Observed, 2020 Frequency, 2022 Observed, 2022 Frequency
    column.labels    = c("Park Name",
                         "2021 Observed", "2021 Frequency",
                         "2022 Observed", "2022 Frequency"),
    column.separate  = c(1, 3, 3, 3, 3),
    covariate.labels = c("Park",
                         "Mean", "Lower CI", "Upper CI",   # 2019 Observed
                         "Mean", "Lower CI", "Upper CI",   # 2019 Frequency
                         "Mean", "Lower CI", "Upper CI",   # 2020 Observed
                         "Mean", "Lower CI", "Upper CI"),  # 2020 Frequency
    notes       = "95% confidence intervals. Observed RUM conditions welfare on observed trips. Frequency RUM uses 52 weekly choice occasions.",
    notes.align = "l"
)

<table style="text-align:center"><caption><strong>Welfare Loss: Observed and Frequency RUM — Mean and 95% CI (2021 & 2022)</strong></caption>
<tr><td colspan="13" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Park</td><td>Mean</td><td>Lower CI</td><td>Upper CI</td><td>Mean</td><td>Lower CI</td><td>Upper CI</td><td>Mean</td><td>Lower CI</td><td>Upper CI</td><td>Mean</td><td>Lower CI</td><td>Upper CI</td></tr>
<tr><td colspan="13" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Ainapo Cabin</td><td>-2.18</td><td>-2.45</td><td>-1.92</td><td>-2.57</td><td>-2.83</td><td>-2.30</td><td>-4.85</td><td>-5.67</td><td>-4.03</td><td>-5.39</td><td>-5.98</td><td>-4.80</td></tr>
<tr><td style="text-align:left">Hipalau</td><td>-2.62</td><td>-2.85</td><td>-2.39</td><td>-3.13</td><td>-3.35</td><td>-2.90</td><td>-3.32</td><td>-3.61</td><td>-3.03</td><td>-4.07</td><td>-4.37</td><td>-3.77</td></tr>
<tr><td style="text-align:left">Kaluahaulu</td><td>-3.39</td><td>-3.68</td><td>-3.09</td><td>-4.02</td><td>-4.31</td><td>-3.72</td><td>-3.00</td><td>-3.26</td><td>-2.74</td><td>-3.68</td><td>-3.95</td><td>-3.41</td></tr>
<tr><td style="text-align:left">Kamananui Trail</td><td>-1.29</td><td>-1.36</td><td>-1.22</td><td>-1.56</td><td>-1.61</td><td>-1.52</td><td>-0.84</td><td>-0.89</td><td>-0.80</td><td>-1.05</td><td>-1.08</td><td>-1.02</td></tr>
<tr><td style="text-align:left">Kaunala Trail</td><td>-3.06</td><td>-3.22</td><td>-2.90</td><td>-3.68</td><td>-3.80</td><td>-3.57</td><td>-2.65</td><td>-2.79</td><td>-2.51</td><td>-3.28</td><td>-3.39</td><td>-3.17</td></tr>
<tr><td style="text-align:left">Kawaikoi</td><td>-12.55</td><td>-13.66</td><td>-11.44</td><td>-13.87</td><td>-14.88</td><td>-12.85</td><td>-17.47</td><td>-19.02</td><td>-15.91</td><td>-19.61</td><td>-21.06</td><td>-18.17</td></tr>
<tr><td style="text-align:left">Keanakolu Ranger</td><td>Bunkhouse Cabins</td><td>-2.08</td><td>-2.30</td><td>-1.85</td><td>-2.46</td><td>-2.70</td><td>-2.21</td><td>-5.01</td><td>-5.72</td><td>-4.30</td><td>-5.65</td><td>-6.24</td></tr>
<tr><td style="text-align:left">Kuaokala Trail</td><td>-35.34</td><td>-37.10</td><td>-33.58</td><td>-38.26</td><td>-39.42</td><td>-37.09</td><td>-28.93</td><td>-30.38</td><td>-27.48</td><td>-32.76</td><td>-33.85</td><td>-31.67</td></tr>
<tr><td style="text-align:left">Kulanaahane Trail</td><td>-1.57</td><td>-1.65</td><td>-1.48</td><td>-1.90</td><td>-1.95</td><td>-1.84</td><td>-1.05</td><td>-1.11</td><td>-1.00</td><td>-1.31</td><td>-1.35</td><td>-1.27</td></tr>
<tr><td style="text-align:left">Kuliouou Trail</td><td>-7.25</td><td>-7.66</td><td>-6.85</td><td>-8.59</td><td>-8.86</td><td>-8.33</td><td>-6.02</td><td>-6.35</td><td>-5.69</td><td>-7.35</td><td>-7.60</td><td>-7.11</td></tr>
<tr><td style="text-align:left">Lonomea</td><td>-7.69</td><td>-8.36</td><td>-7.02</td><td>-8.83</td><td>-9.47</td><td>-8.19</td><td>-12.36</td><td>-13.45</td><td>-11.28</td><td>-14.34</td><td>-15.38</td><td>-13.29</td></tr>
<tr><td style="text-align:left">Maakua Ridge Trail</td><td>-1.85</td><td>-1.95</td><td>-1.74</td><td>-2.23</td><td>-2.30</td><td>-2.16</td><td>-2.22</td><td>-2.35</td><td>-2.10</td><td>-2.76</td><td>-2.85</td><td>-2.67</td></tr>
<tr><td style="text-align:left">Manana Trail</td><td>-5.04</td><td>-5.30</td><td>-4.78</td><td>-6.03</td><td>-6.20</td><td>-5.85</td><td>-5.46</td><td>-5.74</td><td>-5.18</td><td>-6.70</td><td>-6.91</td><td>-6.48</td></tr>
<tr><td style="text-align:left">Peacock Flats Campsite</td><td>-98.03</td><td>-102.92</td><td>-93.13</td><td>-87.01</td><td>-89.67</td><td>-84.34</td><td>-106.41</td><td>-111.78</td><td>-101.04</td><td>-93.82</td><td>-96.96</td><td>-90.68</td></tr>
<tr><td style="text-align:left">Sugi Grove</td><td>-10.13</td><td>-11.02</td><td>-9.24</td><td>-11.40</td><td>-12.23</td><td>-10.57</td><td>-11.27</td><td>-12.26</td><td>-10.28</td><td>-13.15</td><td>-14.12</td><td>-12.19</td></tr>
<tr><td style="text-align:left">Waialae Cabin Campsite</td><td>-1.58</td><td>-1.71</td><td>-1.44</td><td>-1.90</td><td>-2.03</td><td>-1.76</td><td>-2.24</td><td>-2.43</td><td>-2.04</td><td>-2.76</td><td>-2.96</td><td>-2.56</td></tr>
<tr><td style="text-align:left">Waikoali</td><td>-8.53</td><td>-9.28</td><td>-7.78</td><td>-9.72</td><td>-10.43</td><td>-9.01</td><td>-9.12</td><td>-9.92</td><td>-8.32</td><td>-10.78</td><td>-11.57</td><td>-9.99</td></tr>
<tr><td style="text-align:left">Waikolu</td><td>-0.74</td><td>-0.80</td><td>-0.67</td><td>-0.89</td><td>-0.92</td><td>-0.86</td><td>-0.74</td><td>-0.80</td><td>-0.68</td><td>-0.92</td><td>-0.94</td><td>-0.89</td></tr>
<tr><td style="text-align:left">Waimano Trail</td><td>-2.59</td><td>-2.72</td><td>-2.45</td><td>-3.12</td><td>-3.22</td><td>-3.03</td><td>-1.80</td><td>-1.89</td><td>-1.71</td><td>-2.23</td><td>-2.30</td><td>-2.16</td></tr>
<tr><td style="text-align:left">Waimanu Campsite</td><td>-42.07</td><td>-47.35</td><td>-36.79</td><td>-28.33</td><td>-31.12</td><td>-25.53</td><td>-13.85</td><td>-15.96</td><td>-11.74</td><td>-13.04</td><td>-14.39</td><td>-11.69</td></tr>
<tr><td style="text-align:left">Wiliwili</td><td>-3.87</td><td>-4.20</td><td>-3.54</td><td>-4.58</td><td>-4.90</td><td>-4.25</td><td>-4.64</td><td>-5.04</td><td>-4.25</td><td>-5.65</td><td>-6.06</td><td>-5.24</td></tr>
<tr><td style="text-align:left">Wiliwilinui Ridge Trail</td><td>-1.20</td><td>-1.26</td><td>-1.13</td><td>-1.45</td><td>-1.49</td><td>-1.41</td><td>-1.58</td><td>-1.67</td><td>-1.50</td><td>-1.97</td><td>-2.03</td><td>-1.90</td></tr>
<tr><td colspan="13" style="border-bottom: 1px solid black"></td></tr><tr><td colspan="13" style="text-align:left">95% confidence intervals. Observed RUM conditions welfare on observed trips. Frequency RUM uses 52 weekly choice occasions.</td></tr>
</table>
Show code
message("Saved to Welfare_Comparison_Table_frequency_2122.html")

Plot

So this model takes the park mean of the welfare per person and shows the mean per park across years. The stability shows that will including a frequency trip in RUM shows the highest variation, the RUM opt out is more concise and then the MDCEV is always in the middle.

Show code
# Summarize and Reshape the data
plot_data <- final_comparison_master %>%
  group_by(parkname, year) %>%
  filter(parkname!="Stay_Home")%>%
  summarise(
    ObservedRUM = mean(loss_rum_observed, na.rm = TRUE),
    FrequencyRUM = mean(loss_rum_52wk, na.rm = TRUE),
    MDCEV = mean(loss_mdcev, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  pivot_longer(
    cols = c(ObservedRUM, FrequencyRUM, MDCEV),
    names_to = "Model_Type",
    values_to = "Mean_Welfare_Loss"
  )


# Create the Boxplot 
p=ggplot(plot_data, aes(x = factor(year), y = Mean_Welfare_Loss, fill = Model_Type)) +
  # Classic boxplot with black outlines
  geom_boxplot(outlier.shape = 21, outlier.fill = "white", color = "black", alpha = 0.8) +
  
  # Labels
  labs(
    title = "Distribution of Mean Park Welfare Loss from Site Closures by Year",
    subtitle = "Comparing Observed RUM and MDCEV Model",
    x = "Year",
    y = "Average Welfare Loss Across sites ($)",
    fill = "Model Type"
  ) +
  
  # Colors
scale_fill_manual(values = c(
    "ObservedRUM"  = "#D9D9D9",   # Light Grey
    "MDCEV"        = "#252525"    # Dark Grey / Black
  )) +
  
  # Theme Customization
  theme_bw() + # Starting with theme_bw gives a clean white background
  theme(
    # Set all text to Times New Roman (serif)
    text = element_text(family = "serif", size =20),
    
    # Title and axis styling
    plot.title = element_text(face = "bold", size =26, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.title = element_text(face = "bold"),
    axis.text = element_text(color = "black"),
    
    # Remove gridlines for the "Classic/Blank" look
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    
    # Legend styling
    legend.position = "bottom",
    legend.background = element_blank(),
    
    # Classic full border (Using 'size' instead of 'linewidth' for compatibility)
    panel.border = element_rect(colour = "black", fill = NA, size = 1)
  )

p

Show code
ggsave("images/comparision_perperson_2models.png", plot = p,width=12, height=10, dpi=300)

Per participant

Show code
full_year_clean=rbind(final_data_2018,final_data_2019,final_data_2020,final_data_2021,final_data_2022,final_data_2023)
# Identify participants by ensuring IDs match as characters
participants_only_welfare <- final_comparison_master %>%
    filter(parkname!="Stay_Home")%>%
  left_join(
    # Convert ID to character on the fly so the join works
    full_year_clean %>% 
      select(id, parkname, year, VisitCount), 
    by = c("id", "parkname", "year")
  ) %>%
  # Filter for people who actually visited the park in question
  filter(VisitCount > 0)

# Summary for the "Participants Only" group
participants_summary_table <- participants_only_welfare %>%
  group_by(year, parkname) %>%
  summarise(
    avg_loss_obs_participant   = mean(loss_rum_observed, na.rm = TRUE),
    avg_loss_52wk_participant  = mean(loss_rum_52wk, na.rm = TRUE),
    avg_loss_mdcev_participant = mean(loss_mdcev, na.rm = TRUE),
    num_participants           = n(),
    .groups = "drop"
  )

Plot

Show code
# Reshape the Participants Summary for plotting
plot_data_participants <- participants_summary_table %>%
  # Select the three mean columns we created earlier
  select(year, parkname, 
         ObservedRUM = avg_loss_obs_participant, 
         FrequencyRUM = avg_loss_52wk_participant, 
         MDCEV = avg_loss_mdcev_participant) %>%
  pivot_longer(
    cols = c(ObservedRUM, FrequencyRUM, MDCEV),
    names_to = "Model_Type",
    values_to = "Mean_Welfare_Loss"
  )

#  Create the Grayscale Boxplot
participants=ggplot(plot_data_participants, aes(x = factor(year), y = Mean_Welfare_Loss, fill = Model_Type)) +
  # Standard boxplot with black outlines and white outliers
  geom_boxplot(outlier.shape = 21, outlier.fill = "white", color = "black", alpha = 1) +
  
  # Labels
  labs(
    title = "Mean Annual Welfare Loss: Active Participants Only",
    subtitle = "Calculated for individuals with Observed Visits (VisitCount > 0)",
    x = "Year",
    y = "Mean Annual Welfare Loss ($/User)",
    fill = "Model Type"
  ) +
  
  # Shades of Grey
  scale_fill_manual(values = c(
    "ObservedRUM"  = "#D9D9D9",   # Light Grey
    "FrequencyRUM" = "#737373",   # Medium Grey
    "MDCEV"        = "#252525"    # Dark Grey / Black
  )) +
  
  # Theme Customization (Blank Background + Times New Roman)
  theme_bw() + 
  theme(
    text = element_text(family = "serif", size = 20),
    plot.title = element_text(face = "bold", size = 26, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.title = element_text(face = "bold"),
    axis.text = element_text(color = "black"),
    
    # Remove all gridlines for a completely blank white background
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    
    # Legend styling
    legend.position = "bottom",
    legend.background = element_blank(),
    
    # Classic full black border around the plot (using size for compatibility)
    panel.border = element_rect(colour = "black", fill = NA, size = 1)
  )

ggsave("images/comparision_perparticipate.png",plot=participants, width=12, height=10, dpi=300)

participants

EJ

Now that we have the per person welfare across individuals we can link the zip to a location with a higher EJ score and see how this spread spills over to EJ

EJScreen, developed by the U.S. Environmental Protection Agency (EPA), is a screening tool that provides percentile rankings for environmental and demographic indicators, such as air pollution, proximity to hazardous sites, poverty, and limited English proficiency, across census tracts in the U.S. It is primarily used for identifying areas with potential environmental justice concerns, but it does not designate any community as “disadvantaged.”

Instead, it enables comparisons across regions and supports risk assessments, permitting decisions, and community advocacy. In contrast, the Climate and Economic Justice Screening Tool (CEJST), developed by the White House Council on Environmental Quality (CEQ), is designed specifically to identify disadvantaged communities (DACs) eligible for benefits under the Justice40 Initiative, which aims to direct 40% of federal climate and infrastructure investments to these communities. CEJST is a binary approach labeling each census tract as either disadvantaged or not. While EJScreen is more detailed and flexible for analysis, CEJST provides a clear policy designation to guide funding and investment.

We use the State Percentiles here.

The U.S. percentiles are computed based on the block groups nationwide. The state percentiles are calculated within each individual state. EJScreen focuses on the U.S. percentiles as a way to visualize all results in common units. The U.S. percentile uses the U.S. block groups as the basis of comparison. The state percentile was also calculated based on the block groups of a given state (or the District of Columbia or Puerto Rico). The national or state mean value was calculated as the average of the block groups with data for that indicator, within the respective geographic scope.

Social Vulnerabilities

Using EJScreen gives us flexibility where we can both examine the social factors and environmental factors. Here we will look at the social vulnerabilities of 90th percentiles within the state who are identified using :

  • Percentile for Demographic Index

  • Percentile for Supplemental Demographic Index

  • Percentile for % people of color

  • Percentile for % low income

  • Percentile for % unemployed

  • Percentile for % persons with disabilities

  • Percentile for % limited English speaking

  • Percentile for % less than high school education

  • Percentile for % under age 5

  • Percentile for % over age 64

  • Percentile for Low Life Expectancy

Environmental Vulnerabilities

Here we will look at the environmental hazard of 90th percentiles within the state who are identified using :

  • -Percentile for Particulate Matter 2.5

  • Percentile for Ozone

  • Percentile for Diesel particulate matter

  • Percentile for Toxic Releases to Air

  • Percentile for Traffic proximity

  • Percentile for Lead paint

  • Percentile for Superfund proximity

  • Percentile for RMP facility proximity

  • Percentile for Hazardous waste proximity

  • Percentile for Underground storage tanks

  • Percentile for Wastewater discharge

  • Percentile for Nitrogen Dioxide (NO2)

  • Percentile for Drinking Water Non-Compliance

  • Percentile for Particulate Matter 2.5

  • Percentile for Ozone EJ Index

  • Percentile for Diesel particulate matter EJ Index

  • Percentile for Toxic Releases to Air EJ Index

  • Percentile for Traffic proximity EJ Index

  • Percentile for Lead paint EJ Index

  • Percentile for Superfund proximity EJ Index

  • Percentile for RMP Facility Proximity EJ Index

  • Percentile for Hazardous waste proximity EJ Index

  • Percentile for Underground storage tanks EJ Index

  • Percentile for Wastewater discharge EJ Index

  • Percentile for Nitrogen Dioxide (NO2) EJ Index

  • Percentile for Drinking Water Non-Compliance EJ Index

Show code
# Hud data

ZIP_TRACT_122025 <- read_excel("data/ZIP_TRACT_122025.xlsx")

# only hawaii

dishawaii <- readRDS("data/dishawaii.rds")

# clean tract
dishawaii <- dishawaii %>%
    mutate(TRACT = str_pad(as.character(ID), 11, "left", "0"))

# This taking the tact by zip code filtering hawaii  and making sure tract and zip in write order
# THIS is where you need to make changes
# It should be in an earlier chunk in your script

dishawaii_tract <- dishawaii %>%
  rowwise() %>%
  mutate(
    # ── CHANGE 1: threshold from >= 80 to >= 90 ───────────────────────────
    env_burden = any(c_across(c(
      P_PM25, P_OZONE, P_DSLPM, P_RSEI_AIR, P_PTRAF,
      P_LDPNT, P_PNPL, P_PRMP, P_PTSDF, P_UST, P_PWDIS,
      P_NO2, P_DWATER,
      P_D2_PM25, P_D5_PM25, P_D2_OZONE, P_D5_OZONE,
      P_D2_DSLPM, P_D5_DSLPM, P_D2_RSEI_AIR, P_D5_RSEI_AIR,
      P_D2_PTRAF, P_D5_PTRAF, P_D2_LDPNT, P_D5_LDPNT,
      P_D2_PNPL, P_D5_PNPL, P_D2_PRMP, P_D5_PRMP,
      P_D2_PTSDF, P_D5_PTSDF, P_D2_UST, P_D5_UST,
      P_D2_PWDIS, P_D5_PWDIS, P_D2_NO2, P_D5_NO2,
      P_D2_DWATER, P_D5_DWATER
    )) >= 90,   # <-- CHANGE THIS NUMBER (80 or 90)
    na.rm = TRUE),

    soc_vulnerable = any(c_across(c(
      P_PEOPCOLORPCT, P_LOWINCPCT, P_UNEMPPCT, P_DISABILITYPCT,
      P_LINGISOPCT, P_LESSHSPCT, P_UNDER5PCT, P_OVER64PCT, P_LIFEEXPPCT
    )) >= 90,   # <-- CHANGE THIS NUMBER (80 or 90)
    na.rm = TRUE),

    # ── CHANGE 2: AND vs OR logic ─────────────────────────────────────────
    DAC_flag = as.integer(env_burden | soc_vulnerable)
    #                               ^ change & to | for OR logic
  ) %>%
  ungroup()


hud_crosswalk <- ZIP_TRACT_122025 %>%
    rename_with(toupper) %>% 
    mutate(
        # Ensure standard 11-digit Tract and 5-digit ZIP
        TRACT = str_pad(as.character(TRACT), 11, "left", "0"),
        ZIP   = str_pad(as.character(ZIP), 5, "left", "0")
    ) %>%
    # Filter for Hawaii only: 
    # Either by ZIP prefix (967/968) OR Census Tract State prefix (15)
    filter(str_starts(TRACT, "15"))


#Define thursholds into the 90s
dishawaii_zip <- hud_crosswalk %>%
  left_join(dishawaii_tract, by = "TRACT") %>%
  group_by(ZIP) %>%
  summarise(
    
    # ── Binary population share flags ────────────────────────────────────────
    pct_pop_in_DAC  = sum(TOT_RATIO[DAC_flag == 1],          na.rm = TRUE),
    pct_pop_in_Envi = sum(TOT_RATIO[env_burden == TRUE],     na.rm = TRUE),
    pct_pop_in_Soci = sum(TOT_RATIO[soc_vulnerable == TRUE], na.rm = TRUE),
    
    # ── Environmental composite ───────────────────────────────────────────────
    # Use P_ columns — state percentiles, all on 0-100 scale
    # Excludes PM25 and OZONE (limited Hawaii data)
    env_composite = weighted.mean(
      rowMeans(cbind(
        P_DSLPM,    P_RSEI_AIR, P_PTRAF,  P_LDPNT,
        P_PNPL,     P_PRMP,     P_PTSDF,  P_UST,
        P_PWDIS,    P_NO2,      P_DWATER
      ), na.rm = TRUE),
      TOT_RATIO, na.rm = TRUE
    ),
    
    # ── Social vulnerability composite ────────────────────────────────────────
    # All 9 social P_ indicators — state percentiles
    soci_composite = weighted.mean(
      rowMeans(cbind(
        P_PEOPCOLORPCT, P_LOWINCPCT,    P_UNEMPPCT,
        P_DISABILITYPCT, P_LINGISOPCT,  P_LESSHSPCT,
        P_UNDER5PCT,    P_OVER64PCT,    P_LIFEEXPPCT
      ), na.rm = TRUE),
      TOT_RATIO, na.rm = TRUE
    ),
    
    # ── Also store the index scores EJScreen already computed ─────────────────
    # DEMOGIDX_2 is EJScreen's own demographic index — useful as a check
    demog_index = weighted.mean(P_DEMOGIDX_2, TOT_RATIO, na.rm = TRUE),
    
    total_ratio = sum(TOT_RATIO, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    
    # ── Binary flags ──────────────────────────────────────────────────────────
    is_disadvantaged_50      = ifelse(pct_pop_in_DAC  >= 1, 1, 0),
    is_disadvantaged_50_envi = ifelse(pct_pop_in_Envi >= 1, 1, 0),
    is_disadvantaged_50_soci = ifelse(pct_pop_in_Soci >= 1, 1, 0),
    
    # ── Environmental percentile blocks ───────────────────────────────────────
    # High env_pctile
    env_pctile = ntile(env_composite, 100),
    env_block  = cut(env_pctile,
      breaks = c(0, 20, 40, 60, 80, 90, 95, 100),
      labels = c(
        "0-20th\n(Lowest)",  "20-40th", "40-60th",
        "60-80th", "80-90th", "90-95th", "95-100th\n(Highest)"
      ),
      include.lowest = TRUE
    ),
    
    # ── Social vulnerability percentile blocks ────────────────────────────────
    # High soci_pctile = high vulnerability = most vulnerable
    soci_pctile = ntile(soci_composite, 100),
    soci_block  = cut(soci_pctile,
      breaks = c(0, 20, 40, 60, 80, 90, 95, 100),
      labels = c(
        "0-20th\n(Lowest)",  "20-40th", "40-60th",
        "60-80th", "80-90th", "90-95th", "95-100th\n(Highest)"
      ),
      include.lowest = TRUE
    )
  )

# ── Verification checks ───────────────────────────────────────────────────────

# 1. Composite scores should increase with block
dishawaii_zip %>%
  group_by(env_block) %>%
  summarise(
    mean_env  = round(mean(env_composite,  na.rm = TRUE), 1),
    mean_flag = round(mean(pct_pop_in_Envi, na.rm = TRUE), 2),
    n         = n()
  ) %>% arrange(env_block)
# A tibble: 7 × 4
  env_block             mean_env mean_flag     n
  <fct>                    <dbl>     <dbl> <int>
1 "0-20th\n(Lowest)"        11.6      0.28    40
2 "20-40th"                 20.3      0.33    29
3 "40-60th"                 34.8      0.57    20
4 "60-80th"                 58.3      0.93    20
5 "80-90th"                 66.8      0.99    10
6 "90-95th"                 67.6      1        5
7 "95-100th\n(Highest)"     68.3      1        5
Show code
dishawaii_zip %>%
  group_by(soci_block) %>%
  summarise(
    mean_soci = round(mean(soci_composite,  na.rm = TRUE), 1),
    mean_flag = round(mean(pct_pop_in_Soci, na.rm = TRUE), 2),
    n         = n()
  ) %>% arrange(soci_block)
# A tibble: 7 × 4
  soci_block            mean_soci mean_flag     n
  <fct>                     <dbl>     <dbl> <int>
1 "0-20th\n(Lowest)"         42.1      0.32    40
2 "20-40th"                  53.4      0.56    29
3 "40-60th"                  60.5      0.47    20
4 "60-80th"                  65.5      0.85    20
5 "80-90th"                  69.3      0.98    10
6 "90-95th"                  72.2      1        5
7 "95-100th\n(Highest)"      74.6      1        5
Show code
# 2. Check coverage
dishawaii_zip %>%
  summarise(
    total_zips      = n(),
    env_complete    = sum(!is.na(env_composite)),
    soci_complete   = sum(!is.na(soci_composite)),
    total_ratio_avg = round(mean(total_ratio), 2)  # should be close to 1.0
  )
# A tibble: 1 × 4
  total_zips env_complete soci_complete total_ratio_avg
       <int>        <int>         <int>           <dbl>
1        129          129           129               1
Show code
# ── Join to welfare data ──────────────────────────────────────────────────────
final_comparison_master1 <- final_comparison_master %>%
  left_join(
    dishawaii_zip %>%
      mutate(ZIP = as.numeric(ZIP)) %>%   # convert ZIP to numeric here
      select(ZIP,
             env_composite,  env_pctile,  env_block,
             soci_composite, soci_pctile, soci_block,
             demog_index,
             pct_pop_in_DAC, pct_pop_in_Envi, pct_pop_in_Soci,
             is_disadvantaged_50, is_disadvantaged_50_envi, is_disadvantaged_50_soci),
    by = c("zip" = "ZIP")
  )

Income

Show code
final_comparison_master1 <- final_comparison_master1 %>%
  mutate(
    income_pctile = 101 - ntile(income, 100),
    
    income_block = cut(income_pctile,
      breaks = c(0, 20, 40, 60, 80, 90, 95, 100),
      labels = c(
        "0-20th\n(Lowest)",  "20-40th",  "40-60th",
        "60-80th", "80-90th", "90-95th", "95-100th\n(Highest)"
      ),
      include.lowest = TRUE
    )
  )

avg_data <- final_comparison_master1 %>%
  filter(year %in% c(2018, 2023), parkname !="Stay_Home") %>%
  pivot_longer(
    cols      = c(loss_mdcev, loss_rum_observed,loss_rum_52wk),
    names_to  = "model",
    values_to = "loss"
  ) %>%
  mutate(
    model = recode(model,
      "loss_mdcev"        = "MDCEV",
      "loss_rum_observed" = "Observed RUM",
      "loss_rum_52wk" = "Frequency RUM"
    )
  ) %>%
  group_by(income_block, model, year) %>%
  summarise(
    mean_loss = mean(loss,           na.rm = TRUE),
    se        = sd(loss, na.rm = TRUE) / sqrt(n()),
    p25       = quantile(loss, 0.25, na.rm = TRUE),
    p75       = quantile(loss, 0.75, na.rm = TRUE),
    n         = n(),
    .groups   = "drop"
  ) %>%
  mutate(
    ci_low  = mean_loss - 1.96 * se,
    ci_high = mean_loss + 1.96 * se,
    year    = factor(year, labels = c("2018", "2023"))
  )


avg_data %>% distinct(income_block) %>% arrange(income_block)
# A tibble: 7 × 1
  income_block         
  <fct>                
1 "0-20th\n(Lowest)"   
2 "20-40th"            
3 "40-60th"            
4 "60-80th"            
5 "80-90th"            
6 "90-95th"            
7 "95-100th\n(Highest)"
Show code
# ── Shade the 90th+ blocks  ──────────────────
vulnerable_positions <- avg_data %>%
  filter(grepl("90-95th|95-100th", as.character(income_block))) %>%
  mutate(block_num = as.numeric(income_block)) %>%
  distinct(block_num) %>%
  pull(block_num)

shade_start <- min(vulnerable_positions) - 0.5
shade_end   <- max(vulnerable_positions) + 0.5

# ── Plot ──────────────────────────────────────────────────────────────────────
# ── Step 1: Build each indicator separately with identical structure ──────────

# Income — already built, just add indicator column
income_avg <- final_comparison_master1 %>%
    filter(year %in% c(2018, 2023), !is.na(income_block), parkname !="Stay_Home") %>%
    pivot_longer(
        cols      = c(loss_mdcev, loss_rum_observed,loss_rum_52wk ),
        names_to  = "model",
        values_to = "loss"
    ) %>%
    mutate(
        model        = recode(model,
                              "loss_mdcev"        = "MDCEV",
                              "loss_rum_observed" = "Observed RUM",
      "loss_rum_52wk" = "Frequency RUM"),
        pctile_block = income_block,
        indicator    = "Income\n(Least → Most Vulnerable)"
    )

# Environmental burden — high pctile = most vulnerable = already RIGHT
env_avg <- final_comparison_master1 %>%
    filter(year %in% c(2018, 2023), !is.na(env_block), parkname !="Stay_Home") %>%
    pivot_longer(
        cols      = c(loss_mdcev, loss_rum_observed,loss_rum_52wk),
        names_to  = "model",
        values_to = "loss"
    ) %>%
    mutate(
        model        = recode(model,
                              "loss_mdcev"        = "MDCEV",
                              "loss_rum_observed" = "Observed RUM",
      "loss_rum_52wk" = "Frequency RUM"),
        pctile_block = env_block,
        indicator    = "Environmental Burden\n(Least → Most Vulnerable)"
    )

# Social vulnerability — high pctile = most vulnerable = already RIGHT
soci_avg <- final_comparison_master1 %>%
    filter(year %in% c(2018, 2023), !is.na(soci_block), parkname !="Stay_Home") %>%
    pivot_longer(
        cols      = c(loss_mdcev, loss_rum_observed,loss_rum_52wk),
        names_to  = "model",
        values_to = "loss"
    ) %>%
    mutate(
        model        = recode(model,
                              "loss_mdcev"        = "MDCEV",
                              "loss_rum_observed" = "Observed RUM",
      "loss_rum_52wk" = "Frequency RUM"),
        pctile_block = soci_block,
        indicator    = "Social Vulnerability\n(Least → Most Vulnerable)"
    )

# ── Step 2: Combine and summarise ─────────────────────────────────────────────
all_avg <- bind_rows(income_avg, env_avg, soci_avg) %>%
    mutate(
        year = factor(year, labels = c("2018", "2023")),
        indicator = factor(indicator, levels = c(
            "Income\n(Least → Most Vulnerable)",
            "Social Vulnerability\n(Least → Most Vulnerable)",
            "Environmental Burden\n(Least → Most Vulnerable)"
        ))
    ) %>%
    group_by(indicator, pctile_block, model, year) %>%
    summarise(
        mean_loss = mean(loss,           na.rm = TRUE),
        se        = sd(loss, na.rm = TRUE) / sqrt(n()),
        p25       = quantile(loss, 0.25, na.rm = TRUE),
        p75       = quantile(loss, 0.75, na.rm = TRUE),
        n         = n(),
        .groups   = "drop"
    ) %>%
    mutate(
        ci_low  = mean_loss - 1.96 * se,
        ci_high = mean_loss + 1.96 * se
    )

# ── Step 3: Shading positions — 90th+ blocks are RIGHT in all three ───────────
vulnerable_positions <- all_avg %>%
    filter(grepl("90-95th|95-100th", as.character(pctile_block))) %>%
    mutate(block_num = as.numeric(pctile_block)) %>%
    distinct(block_num) %>%
    pull(block_num)

shade_start <- min(vulnerable_positions) - 0.5
shade_end   <- max(vulnerable_positions) + 0.5
Show code
# ── Step 4: Plot ──────────────────────────────────────────────────────────────
ggplot(all_avg,
       aes(x     = pctile_block,
           y     = mean_loss,
           color = model,
           group = model)) +
  
  geom_rect(
    data = tibble(xmin = shade_start, xmax = shade_end),
    aes(xmin = xmin, xmax = xmax, ymin = -Inf, ymax = Inf),
    fill        = "gray90",
    alpha       = 0.5,
    inherit.aes = FALSE
  ) +
  
  geom_errorbar(
    aes(ymin = ci_low, ymax = ci_high),
    width     = 0.2,
    linewidth = 0.4,
    position  = position_dodge(0.4)   # slightly wider dodge for 3 models
  ) +
  
  geom_line(linewidth = 0.8,  position = position_dodge(0.4)) +
  geom_point(size = 2.5,      position = position_dodge(0.4)) +
  
  geom_hline(
    yintercept = 0, linetype = "dashed",
    color = "gray40", linewidth = 0.4
  ) +
  
  geom_text(
    data = tibble(
      pctile_block = factor("90-95th",
                            levels = levels(all_avg$pctile_block)),
      mean_loss    = -2,
      year         = factor("2018"),
      indicator    = factor("Income\n(Least → Most Vulnerable)",
                            levels = levels(all_avg$indicator))
    ),
    aes(x = pctile_block, y = mean_loss),
    label       = "90th+\nVulnerable",
    inherit.aes = FALSE,
    size        = 3.5,
    color       = "gray40",
    hjust       = 0.5,
    family      = "Times New Roman"
  ) +
  
  facet_grid(indicator ~ year) +
  
  # ── Three model grayscale ─────────────────────────────────────────────────
  scale_color_manual(
    values = c(
      "MDCEV"         = "gray10",
      "Observed RUM"  = "gray55",
      "Frequency RUM" = "gray80"    # lightest gray for third model
    ),
    name = "Model"
  ) +
  scale_linetype_manual(
    values = c(
      "MDCEV"         = "solid",
      "Observed RUM"  = "solid",
      "Frequency RUM" = "dashed"    # dashed helps distinguish in grayscale
    ),
    name = "Model"
  ) +
  aes(linetype = model) +           # add linetype aesthetic
  
  scale_y_continuous(
    labels = scales::dollar_format(prefix = "$"),
    name   = "Mean Welfare Loss"
  ) +
  
  labs(
    x     = "Percentile Block",
    title = "Mean Welfare Loss Across Distributions"
  ) +
  
  theme_minimal(base_family = "Times New Roman") +
  theme(
    panel.grid.minor   = element_blank(),
    panel.grid.major.x = element_blank(),
    axis.text.x   = element_text(size = 16, angle = 20,
                                 hjust = 1, lineheight = 0.8),
    axis.text.y   = element_text(size = 20),
    strip.text.x  = element_text(face = "bold", size = 16),
    strip.text.y  = element_text(face = "bold", size = 20,
                                 angle = 0, hjust = 0),
    legend.position = "bottom",
    legend.title  = element_text(face = "bold", size = 20),
    legend.text   = element_text(size = 20),
    plot.title    = element_text(face = "bold", size = 20),
    panel.spacing = unit(1.2, "lines")
  )

Show code
ggsave("images/social_vulnerabilities.png", width=12, height=10, dpi=300)
Show code
income_lookup <- final_comparison_master1 %>%
mutate(id = as.character(id)) %>%
distinct(id, income_block, income) %>%
group_by(id) %>%
slice(1) %>%   # keep one row per person if duplicates exist
ungroup()

# ── Rebuild all_avg_appendix with three models ────────────────────────────────

income_avg_all <- final_comparison_master1 %>%
  filter(!is.na(income_block), parkname!="Stay_Home") %>%
  pivot_longer(
    cols      = c(loss_mdcev, loss_rum_observed, loss_rum_52wk),  # <-- added
    names_to  = "model",
    values_to = "loss"
  ) %>%
  mutate(
    model = recode(model,
      "loss_mdcev"        = "MDCEV",
      "loss_rum_observed" = "Observed RUM",
      "loss_rum_52wk"     = "Frequency RUM"   # <-- added
    ),
    pctile_block = income_block,
    indicator    = "Income\n(Least → Most Vulnerable)"
  )

env_avg_all <- final_comparison_master1 %>%
  filter(!is.na(env_block), parkname!="Stay_Home") %>%
  pivot_longer(
    cols      = c(loss_mdcev, loss_rum_observed, loss_rum_52wk),
    names_to  = "model",
    values_to = "loss"
  ) %>%
  mutate(
    model = recode(model,
      "loss_mdcev"        = "MDCEV",
      "loss_rum_observed" = "Observed RUM",
      "loss_rum_52wk"     = "Frequency RUM"
    ),
    pctile_block = env_block,
    indicator    = "Environmental Burden\n(Least → Most Vulnerable)"
  )

soci_avg_all <- final_comparison_master1 %>%
  filter(!is.na(soci_block), parkname!="Stay_Home") %>%
  pivot_longer(
    cols      = c(loss_mdcev, loss_rum_observed, loss_rum_52wk),
    names_to  = "model",
    values_to = "loss"
  ) %>%
  mutate(
    model = recode(model,
      "loss_mdcev"        = "MDCEV",
      "loss_rum_observed" = "Observed RUM",
      "loss_rum_52wk"     = "Frequency RUM"
    ),
    pctile_block = soci_block,
    indicator    = "Social Vulnerability\n(Least → Most Vulnerable)"
  )

all_avg_appendix <- bind_rows(income_avg_all, env_avg_all, soci_avg_all) %>%
  mutate(
    year      = factor(year),
    model     = factor(model, levels = c("MDCEV", "Observed RUM", "Frequency RUM")),
    indicator = factor(indicator, levels = c(
      "Income\n(Least → Most Vulnerable)",
      "Social Vulnerability\n(Least → Most Vulnerable)",
      "Environmental Burden\n(Least → Most Vulnerable)"
    ))
  ) %>%
  group_by(indicator, pctile_block, model, year) %>%
  summarise(
    mean_loss = mean(loss,           na.rm = TRUE),
    se        = sd(loss, na.rm = TRUE) / sqrt(n()),
    p25       = quantile(loss, 0.25, na.rm = TRUE),
    p75       = quantile(loss, 0.75, na.rm = TRUE),
    n         = n(),
    .groups   = "drop"
  ) %>%
  mutate(
    ci_low  = mean_loss - 1.96 * se,
    ci_high = mean_loss + 1.96 * se
  )

# ── Updated helper function with three models ─────────────────────────────────
make_appendix_plot <- function(data, years, title_suffix) {
  
  plot_data <- data %>% filter(year %in% years)
  
  vulnerable_positions <- plot_data %>%
    filter(grepl("90-95th|95-100th", as.character(pctile_block))) %>%
    mutate(block_num = as.numeric(pctile_block)) %>%
    distinct(block_num) %>%
    pull(block_num)
  
  shade_start <- min(vulnerable_positions) - 0.5
  shade_end   <- max(vulnerable_positions) + 0.5
  
  ggplot(plot_data,
         aes(x        = pctile_block,
             y        = mean_loss,
             color    = model,
             linetype = model,        # <-- added for readability
             group    = model)) +
    
    geom_rect(
      data = tibble(xmin = shade_start, xmax = shade_end),
      aes(xmin = xmin, xmax = xmax, ymin = -Inf, ymax = Inf),
      fill        = "gray90",
      alpha       = 0.5,
      inherit.aes = FALSE
    ) +
    
    geom_errorbar(
      aes(ymin = ci_low, ymax = ci_high),
      width     = 0.2,
      linewidth = 0.35,
      position  = position_dodge(0.4)   # slightly wider for 3 models
    ) +
    
    geom_line(linewidth = 0.7,  position = position_dodge(0.4)) +
    geom_point(size = 1.8,      position = position_dodge(0.4)) +
    
    geom_hline(
      yintercept = 0, linetype = "dashed",
      color = "gray40", linewidth = 0.4
    ) +
    
    facet_grid(indicator ~ year) +
    
    # ── Three model color and linetype scales ─────────────────────────────────
    scale_color_manual(
      values = c(
        "MDCEV"         = "gray10",
        "Observed RUM"  = "gray55",
        "Frequency RUM" = "gray75"    # <-- added
      ),
      name = "Model"
    ) +
    scale_linetype_manual(
      values = c(
        "MDCEV"         = "solid",
        "Observed RUM"  = "solid",
        "Frequency RUM" = "dashed"    # <-- dashed to distinguish from Observed RUM
      ),
      name = "Model"
    ) +
    scale_fill_manual(
      values = c(
        "MDCEV"         = "gray10",
        "Observed RUM"  = "gray55",
        "Frequency RUM" = "gray75"
      ),
      guide  = "none"
    ) +
    scale_y_continuous(
      labels = scales::dollar_format(prefix = "$"),
      name   = "Mean Welfare Loss"
    ) +
    
    labs(
      x     = "Percentile Block",
      title = paste("Mean Welfare Loss Across Distributions —", title_suffix)
    ) +
    
    theme_minimal(base_family = "Times New Roman") +
    theme(
      panel.grid.minor   = element_blank(),
      panel.grid.major.x = element_blank(),
      axis.text.x   = element_text(size = 12, angle = 30,
                                   hjust = 1, lineheight = 0.8),
      axis.text.y   = element_text(size = 20),
      strip.text.x  = element_text(face = "bold", size = 12),
      strip.text.y  = element_text(face = "bold", size = 14,
                                   angle = 0, hjust = 0),
      legend.position  = "bottom",
      legend.title     = element_text(face = "bold", size = 20),
      legend.text      = element_text(size = 20),
      plot.title       = element_text(face = "bold", size = 20),
      panel.spacing    = unit(0.8, "lines")
    )
}

# ── Build and save ────────────────────────────────────────────────────────────
appendix_a <- make_appendix_plot(
  all_avg_appendix,
  years        = c(2018, 2019, 2020),
  title_suffix = "2018–2020"
)

appendix_b <- make_appendix_plot(
  all_avg_appendix,
  years        = c(2021, 2022, 2023),
  title_suffix = "2021–2023"
)


ggsave(
  "images/appendix_A_welfare_2018_2020.png",
  plot   = appendix_a,
  width  = 14,
  height = 12,
  dpi    = 300,
  bg     = "white"
)

ggsave(
  "images/appendix_B_welfare_2021_2023.png",
  plot   = appendix_b,
  width  = 14,
  height = 12,
  dpi    = 300,
  bg     = "white"
)
print(appendix_a)

Low-income Welfare

So low income individuals live further away and choose sites closer to them then higher income individuals. Thats whats explained by the gap. We know we value there time less

Show code
# COMPARE: Average distance to all sites vs distance to actually visited sites


# Distance to ALL sites (potential distance) 
# This is the average distance each income block's zip codes are from
# every campsite regardless of whether they visited

distance_all_sites <- full_year_clean %>%
  filter(parkname != "Stay_Home") %>%
  mutate(id = as.character(id)) %>%
  left_join(income_lookup, by = "id") %>%
  filter(!is.na(income_block)) %>%
  group_by(income_block, id) %>%
  summarise(
    mean_dist_all_sites = mean(travel_distance_km, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  group_by(income_block) %>%
  summarise(
    mean_dist_potential  = round(mean(mean_dist_all_sites, na.rm = TRUE), 1),
    median_dist_potential= round(median(mean_dist_all_sites, na.rm = TRUE), 1),
    n_individuals        = n_distinct(id),
    .groups = "drop"
  )

# Distance to VISITED sites only ────────────────────────────────────
# Average distance to sites that were actually visited (VisitCount > 0)

distance_visited_sites <- full_year_clean %>%
  filter(parkname != "Stay_Home", VisitCount > 0) %>%
  mutate(id = as.character(id)) %>%
  left_join(income_lookup, by = "id") %>%
  filter(!is.na(income_block)) %>%
  group_by(income_block, id) %>%
  summarise(
    mean_dist_visited = mean(travel_distance_km, na.rm = TRUE),
    n_sites_visited   = n_distinct(parkname),
    total_trips       = sum(VisitCount,          na.rm = TRUE),
    .groups = "drop"
  ) %>%
  group_by(income_block) %>%
  summarise(
    mean_dist_visited   = round(mean(mean_dist_visited, na.rm = TRUE), 1),
    median_dist_visited = round(median(mean_dist_visited, na.rm = TRUE), 1),
    mean_sites_visited  = round(mean(n_sites_visited,   na.rm = TRUE), 2),
    mean_trips          = round(mean(total_trips,       na.rm = TRUE), 2),
    n_individuals       = n_distinct(id),
    .groups = "drop"
  )

# Combine and compare ───────────────────────────────────────────────
distance_comparison <- distance_all_sites %>%
  left_join(distance_visited_sites, by = "income_block",
            suffix = c("_all", "_visited")) %>%
  mutate(
    # Selection effect: do people choose closer sites?
    distance_gap    = round(mean_dist_visited - mean_dist_potential, 1),
    selection_ratio = round(mean_dist_visited / mean_dist_potential, 3)
  ) %>%
  arrange(income_block)

message("\n── Distance: potential vs visited by income block ──")
print(distance_comparison %>%
  select(income_block, mean_dist_potential, mean_dist_visited,
         distance_gap, selection_ratio, mean_sites_visited))
# A tibble: 7 × 6
  income_block          mean_dist_potential mean_dist_visited distance_gap
  <fct>                               <dbl>             <dbl>        <dbl>
1 "0-20th\n(Lowest)"                   155.              152          -3.3
2 "20-40th"                            163.              161.         -2.5
3 "40-60th"                            149.              153.          3.7
4 "60-80th"                            154.              155.          1.4
5 "80-90th"                            190.              172.        -17.3
6 "90-95th"                            275.              187.        -87.8
7 "95-100th\n(Highest)"                245.              199.        -45.3
# ℹ 2 more variables: selection_ratio <dbl>, mean_sites_visited <dbl>
Show code
# ── Site-level distance by income block 
# Which sites does each income block actually visit
# and how far are they from those sites?

site_distance_by_income <- full_year_clean %>%
  filter(parkname != "Stay_Home", VisitCount > 0) %>%
  mutate(id = as.character(id)) %>%
  left_join(income_lookup, by = "id") %>%
  filter(!is.na(income_block)) %>%
  group_by(income_block, parkname, island_park) %>%
  summarise(
    mean_distance = round(mean(travel_distance_km, na.rm = TRUE), 1),
    total_trips   = sum(VisitCount, na.rm = TRUE),
    n_visitors    = n_distinct(id),
    .groups       = "drop"
  ) %>%
  group_by(income_block) %>%
  mutate(
    pct_trips = round(total_trips / sum(total_trips) * 100, 1)
  ) %>%
  ungroup() %>%
  arrange(income_block, desc(pct_trips))

message("\n── Top 3 sites by income block with distances ──")
site_distance_by_income %>%
  group_by(income_block) %>%
  slice_max(pct_trips, n = 3) %>%
  print(n = 30)
# A tibble: 21 × 7
# Groups:   income_block [7]
   income_block        parkname island_park mean_distance total_trips n_visitors
   <fct>               <chr>    <chr>               <dbl>       <dbl>      <int>
 1 "0-20th\n(Lowest)"  Peacock… Oahu                 155.        1090        688
 2 "0-20th\n(Lowest)"  Kuaokal… Oahu                 135.         386        257
 3 "0-20th\n(Lowest)"  Waimanu… Hawaii               264.          95         83
 4 "20-40th"           Peacock… Oahu                 168.        1086        649
 5 "20-40th"           Kuaokal… Oahu                 145          347        218
 6 "20-40th"           Waimanu… Hawaii               204.         250        192
 7 "40-60th"           Peacock… Oahu                 168.         398        271
 8 "40-60th"           Kawaikoi Kauai                166.         370        233
 9 "40-60th"           Sugi Gr… Kauai                148.         345        246
10 "60-80th"           Peacock… Oahu                 149.         525        324
11 "60-80th"           Waimanu… Hawaii               185.         441        332
12 "60-80th"           Kuaokal… Oahu                 123.         187        123
13 "80-90th"           Peacock… Oahu                 175.         399        264
14 "80-90th"           Kuaokal… Oahu                 154.         178        110
15 "80-90th"           Waimanu… Hawaii               223.         127        111
16 "90-95th"           Waimanu… Hawaii               211.         278        191
17 "90-95th"           Keanako… Hawaii               158.         115         98
18 "90-95th"           Ainapo … Hawaii               175.          64         42
19 "95-100th\n(Highes… Waimanu… Hawaii               230.         148        119
20 "95-100th\n(Highes… Keanako… Hawaii               228.          65         55
21 "95-100th\n(Highes… Peacock… Oahu                 172.          54         40
# ℹ 1 more variable: pct_trips <dbl>
Show code
# ── Plot comparison ───────────────────────────────────────────────────
distance_comparison %>%
  pivot_longer(
    cols      = c(mean_dist_potential, mean_dist_visited),
    names_to  = "type",
    values_to = "distance"
  ) %>%
  mutate(
    type = case_when(
      type == "mean_dist_potential" ~ "Average Distance to All Sites\n(Potential)",
      type == "mean_dist_visited"   ~ "Average Distance to Visited Sites\n(Actual)"
    ),
    # ── Clean x-axis labels with income ranges ────────────────────────────────
    income_label = case_when(
      income_block == "0-20th\n(Lowest)"    ~ "> $102k",
      income_block == "20-40th"             ~ "$90k–$102k",
      income_block == "40-60th"             ~ "$79k–$90k",
      income_block == "60-80th"             ~ "$67k–$79k",
      income_block == "80-90th"             ~ "$63k–$67k",
      income_block == "90-95th"             ~ "$57k–$63k",
      income_block == "95-100th\n(Highest)" ~ "< $57k",
      TRUE ~ as.character(income_block)
    ),
    income_label = factor(income_label, levels = c(
      "< $57k", "$57k–$63k", "$63k–$67k", "$67k–$79k",
      "$79k–$90k", "$90k–$102k", "> $102k"
    ))
  ) %>%
  ggplot(aes(x     = income_label,
             y     = distance,
             color = type,
             group = type)) +

  geom_line(linewidth = 0.9) +
  geom_point(size = 3) +

  # ── High / Low income arrows on x axis ────────────────────────────────────
  annotate("segment",
           x = 0.5, xend = 1.8, y = -30, yend = -30,
           arrow = arrow(length = unit(0.2, "cm"), ends = "first"),
           color = "gray30", linewidth = 0.6) +
  annotate("text",
           x = 1.0, y = -45,
           label = "Lower Income",
           size = 4.5, color = "gray30",
           family = "Times New Roman",
           hjust = 0.5, fontface = "bold") +

  annotate("segment",
           x = 5.2, xend = 6.5, y = -30, yend = -30,
           arrow = arrow(length = unit(0.2, "cm"), ends = "last"),
           color = "gray30", linewidth = 0.6) +
  annotate("text",
           x = 6.0, y = -45,
           label = "Higher Income",
           size = 4.5, color = "gray30",
           family = "Times New Roman",
           hjust = 0.5, fontface = "bold") +

  scale_color_manual(
    values = c(
      "Average Distance to All Sites\n(Potential)"  = "gray70",
      "Average Distance to Visited Sites\n(Actual)" = "gray10"
    ),
    name = ""
  ) +
  scale_y_continuous(
    name   = "Mean Distance (km)",
    labels = scales::number_format(suffix = " km"),
    expand = expansion(mult = c(0.15, 0.05))  # extra space at bottom for arrows
  ) +
  labs(
    x     = "Household Income Range",
    title = "Potential vs Actual Distance by Income Block"
  ) +
  coord_cartesian(clip = "off") +   # allow annotations outside plot area
  theme_minimal(base_family = "Times New Roman") +
  theme(
    panel.grid.minor   = element_blank(),
    panel.grid.major.x = element_blank(),
    axis.text.x   = element_text(size = 14, angle = 20, hjust = 1),
    axis.text.y   = element_text(size = 16),
    axis.title    = element_text(size = 16),
    legend.position  = "bottom",
    legend.text   = element_text(size = 16),
    plot.title    = element_text(face = "bold", size = 20),
    plot.margin   = margin(t = 10, r = 10, b = 30, l = 10)  # bottom space for arrows
  )

Show code
ggsave("images/distance_sitesvisited.png", width=12, height=8, dpi=300)
Show code
# Test 1: Rank-based selection 
# For each person: rank all 22 sites by distance (1 = closest)
# Then check which rank sites they actually visited
# If constrained -> visit low-rank (nearby) sites disproportionately

rank_selection <- full_year_clean %>%
  filter(parkname != "Stay_Home") %>%
  mutate(id = as.character(id)) %>%
  left_join(income_lookup, by = "id") %>%
  filter(!is.na(income_block)) %>%
  group_by(id) %>%
  mutate(
    # Rank sites by distance for each individual — 1 = closest
    distance_rank = rank(travel_distance_km, ties.method = "first"),
    n_sites       = n()
  ) %>%
  ungroup()

# For visited sites: what is their average distance rank?
rank_visited <- rank_selection %>%
  filter(VisitCount > 0) %>%
  group_by(id, income_block) %>%
  summarise(
    mean_rank_visited    = mean(distance_rank, na.rm = TRUE),
    min_rank_visited     = min(distance_rank,  na.rm = TRUE),
    median_rank_visited  = median(distance_rank, na.rm = TRUE),
    n_sites_visited      = n(),
    .groups = "drop"
  )

rank_summary <- rank_visited %>%
  group_by(income_block) %>%
  summarise(
    mean_rank_of_visited   = round(mean(mean_rank_visited),   2),
    median_rank_of_visited = round(median(median_rank_visited), 2),
    mean_closest_visited   = round(mean(min_rank_visited),    2),
    pct_visit_top5_closest = round(mean(min_rank_visited <= 5) * 100, 1),
    pct_visit_top3_closest = round(mean(min_rank_visited <= 3) * 100, 1),
    .groups = "drop"
  ) %>%
  arrange(income_block)

message("── Test 1: Distance rank of visited sites ──")
message("Lower rank = visits closer sites | Higher rank = travels further")
print(rank_summary)
# A tibble: 7 × 6
  income_block  mean_rank_of_visited median_rank_of_visited mean_closest_visited
  <fct>                        <dbl>                  <dbl>                <dbl>
1 "0-20th\n(Lo…                13.3                      12                 12.5
2 "20-40th"                    13.6                      12                 12.8
3 "40-60th"                    16.6                      14                 15.6
4 "60-80th"                    14.8                      13                 14.0
5 "80-90th"                    11.5                      11                 10.9
6 "90-95th"                     6.06                      3                  5.5
7 "95-100th\n(…                11.1                      10                 10.7
# ℹ 2 more variables: pct_visit_top5_closest <dbl>,
#   pct_visit_top3_closest <dbl>
Show code
#Test 2: Probability of visiting top-N closest sites 
# Does low income visit their N closest sites more often?

visit_probability <- rank_selection %>%
  mutate(
    visited    = VisitCount > 0,
    top3       = distance_rank <= 3,
    top5       = distance_rank <= 5,
    top10      = distance_rank <= 10,
    bottom5    = distance_rank > 17   # furthest 5 sites
  ) %>%
  group_by(income_block) %>%
  summarise(
    prob_visit_top3    = round(mean(visited[top3],    na.rm = TRUE) * 100, 1),
    prob_visit_top5    = round(mean(visited[top5],    na.rm = TRUE) * 100, 1),
    prob_visit_top10   = round(mean(visited[top10],   na.rm = TRUE) * 100, 1),
    prob_visit_bottom5 = round(mean(visited[bottom5], na.rm = TRUE) * 100, 1),
    # Ratio: how much more likely to visit nearby vs far sites?
    near_far_ratio     = round(
      mean(visited[top5],    na.rm = TRUE) /
      mean(visited[bottom5], na.rm = TRUE), 2),
    .groups = "drop"
  ) %>%
  arrange(income_block)

message("\n── Test 2: Probability of visiting near vs far sites ──")
message("Higher near_far_ratio = stronger geographic constraint")
print(visit_probability)
# A tibble: 7 × 6
  income_block          prob_visit_top3 prob_visit_top5 prob_visit_top10
  <fct>                           <dbl>           <dbl>            <dbl>
1 "0-20th\n(Lowest)"                2.1             2.3              4.9
2 "20-40th"                         6               4.2              4.6
3 "40-60th"                         1.2             1.3              2.8
4 "60-80th"                         4.1             3.6              3.3
5 "80-90th"                         9.1             6.6              5.5
6 "90-95th"                        25.8            17.2              9.8
7 "95-100th\n(Highest)"            14.4             9.5              6  
# ℹ 2 more variables: prob_visit_bottom5 <dbl>, near_far_ratio <dbl>
Show code
# Test 3: Individual-level regression 
# Does distance predict visit probability differently across income groups?
# If low income is more constrained:
# β_distance should be more negative for low income


regression_by_income <- rank_selection %>%
  mutate(visited = as.integer(VisitCount > 0)) %>%
  group_by(income_block) %>%
  do(
    tidy(glm(visited ~ travel_distance_km,
             data   = .,
             family = binomial(link = "logit")))
  ) %>%
  filter(term == "travel_distance_km") %>%
  mutate(
    estimate  = round(estimate,   5),
    std.error = round(std.error,  5),
    p.value   = round(p.value,    4)
  ) %>%
  select(income_block, estimate, std.error, p.value) %>%
  arrange(income_block)

message("\n── Test 3: Logit coefficient on distance by income block ──")
message("More negative estimate = distance matters MORE for that income group")
message("= stronger geographic constraint on site selection")
print(regression_by_income)
# A tibble: 7 × 4
# Groups:   income_block [7]
  income_block          estimate std.error p.value
  <fct>                    <dbl>     <dbl>   <dbl>
1 "0-20th\n(Lowest)"    -0.00096   0.00033  0.0038
2 "20-40th"             -0.00077   0.0003   0.0104
3 "40-60th"              0.00061   0.00035  0.0826
4 "60-80th"              0.00008   0.00032  0.812 
5 "80-90th"             -0.00244   0.00039  0     
6 "90-95th"             -0.00712   0.00042  0     
7 "95-100th\n(Highest)" -0.00294   0.00042  0     
Show code
# Test 4: Substitution range 
# How spread out are the sites each person visits?
# Constrained campers visit a narrow geographic range
# Unconstrained campers visit sites spread across islands

substitution_range <- final_data_2018 %>%
  filter(parkname != "Stay_Home", VisitCount > 0) %>%
  mutate(id = as.character(id)) %>%
  left_join(income_lookup, by = "id") %>%
  filter(!is.na(income_block)) %>%
  group_by(id, income_block) %>%
  summarise(
    n_islands_visited   = n_distinct(island_park),
    n_sites_visited     = n_distinct(parkname),
    range_km            = max(travel_distance_km) - min(travel_distance_km),
    max_dist            = max(travel_distance_km),
    min_dist            = min(travel_distance_km),
    .groups = "drop"
  ) %>%
  group_by(income_block) %>%
  summarise(
    mean_islands_visited = round(mean(n_islands_visited),  2),
    pct_visit_own_island = round(mean(n_islands_visited == 1) * 100, 1),
    mean_range_km        = round(mean(range_km,            na.rm = TRUE), 1),
    mean_max_dist        = round(mean(max_dist,            na.rm = TRUE), 1),
    mean_min_dist        = round(mean(min_dist,            na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  arrange(income_block)

message("\n── Test 4: Geographic substitution range ──")
message("pct_visit_own_island: % who only visit sites on their home island")
message("mean_range_km: spread between closest and furthest site visited")
print(substitution_range)
# A tibble: 7 × 6
  income_block          mean_islands_visited pct_visit_own_island mean_range_km
  <fct>                                <dbl>                <dbl>         <dbl>
1 "0-20th\n(Lowest)"                    1.01                 99.4           2  
2 "20-40th"                             1.01                 98.6           2.4
3 "40-60th"                             1.02                 98.2           3.7
4 "60-80th"                             1.02                 97.5           3  
5 "80-90th"                             1.03                 96.9           3.4
6 "90-95th"                             1                   100             3.7
7 "95-100th\n(Highest)"                 1                   100             2  
# ℹ 2 more variables: mean_max_dist <dbl>, mean_min_dist <dbl>
Show code
# Test 5: Gravity model 
# Classic spatial interaction: visits ~ 1/distance
# If constrained: stronger distance decay for low income

gravity_by_income <- rank_selection %>%
  mutate(
    visited       = as.integer(VisitCount > 0),
    log_distance  = log(travel_distance_km + 1)
  ) %>%
  group_by(income_block) %>%
  do(
    tidy(glm(visited ~ log_distance,
             data   = .,
             family = binomial(link = "logit")))
  ) %>%
  filter(term == "log_distance") %>%
  mutate(
    estimate  = round(estimate,  3),
    std.error = round(std.error, 3),
    p.value   = round(p.value,   4)
  ) %>%
  select(income_block, estimate, std.error, p.value) %>%
  rename(log_distance_coef = estimate) %>%
  arrange(income_block)

message("\n── Test 5: Gravity model — log distance effect by income ──")
message("More negative coefficient = stronger distance decay = more constrained")
print(gravity_by_income)
# A tibble: 7 × 4
# Groups:   income_block [7]
  income_block          log_distance_coef std.error p.value
  <fct>                             <dbl>     <dbl>   <dbl>
1 "0-20th\n(Lowest)"                0.101     0.042  0.0163
2 "20-40th"                         0.101     0.037  0.0062
3 "40-60th"                         0.278     0.046  0     
4 "60-80th"                         0.096     0.038  0.0103
5 "80-90th"                        -0.075     0.052  0.145 
6 "90-95th"                        -0.649     0.057  0     
7 "95-100th\n(Highest)"            -0.203     0.064  0.0015
Show code
# ── Plot all four measures together 

p1 <- rank_summary %>%
  ggplot(aes(x = income_block, y = mean_rank_of_visited, group = 1)) +
  geom_line(linewidth = 0.8, color = "gray10") +
  geom_point(size = 3,       color = "gray10") +
  labs(x = "", y = "Mean Distance Rank\nof Visited Sites",
       title = "Test 1: Distance Rank") +
  theme_minimal(base_family = "Times New Roman") +
  theme(axis.text.x = element_text(angle = 20, hjust = 1, size = 9),
        panel.grid.minor = element_blank(),
        plot.title = element_text(face = "bold", size = 11))

p2 <- visit_probability %>%
  ggplot(aes(x = income_block, y = near_far_ratio, group = 1)) +
  geom_line(linewidth = 0.8, color = "gray10") +
  geom_point(size = 3,       color = "gray10") +
  geom_hline(yintercept = 1, linetype = "dashed",
             color = "gray50", linewidth = 0.4) +
  labs(x = "", y = "Near/Far Visit Ratio\n(top 5 / bottom 5)",
       title = "Test 2: Near/Far Ratio") +
  theme_minimal(base_family = "Times New Roman") +
  theme(axis.text.x = element_text(angle = 20, hjust = 1, size = 9),
        panel.grid.minor = element_blank(),
        plot.title = element_text(face = "bold", size = 11))

p3 <- regression_by_income %>%
  ggplot(aes(x = income_block, y = estimate, group = 1)) +
  geom_col(fill = "gray30", alpha = 0.8, width = 0.6) +
  geom_errorbar(aes(ymin = estimate - 1.96 * std.error,
                    ymax = estimate + 1.96 * std.error),
                width = 0.2, color = "gray10") +
  labs(x = "", y = "Distance Coefficient\n(logit)",
       title = "Test 3: Distance Sensitivity") +
  theme_minimal(base_family = "Times New Roman") +
  theme(axis.text.x = element_text(angle = 20, hjust = 1, size = 9),
        panel.grid.minor = element_blank(),
        plot.title = element_text(face = "bold", size = 11))

p4 <- substitution_range %>%
  ggplot(aes(x = income_block, y = pct_visit_own_island, group = 1)) +
  geom_line(linewidth = 0.8, color = "gray10") +
  geom_point(size = 3,       color = "gray10") +
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  labs(x = "", y = "% Visiting Own\nIsland Sites Only",
       title = "Test 4:  Remain on Home Island") +
  theme_minimal(base_family = "Times New Roman") +
  theme(axis.text.x = element_text(angle = 20, hjust = 1, size = 9),
        panel.grid.minor = element_blank(),
        plot.title = element_text(face = "bold", size = 11))

combined <- (p1 + p2) / (p3 + p4) +
  plot_annotation(
    title   = "Four Tests of Geographic Selection Constraint by Income Block",
    caption = "All four tests ask: does distance matter more for lower income communities?",
    theme   = theme(
      plot.title   = element_text(family = "Times New Roman",
                                  face = "bold", size = 14),
      plot.caption = element_text(family = "Times New Roman",
                                  size = 10, color = "gray40")
    )
  )

print(combined)

Show code
ggsave("images/geographic_constraint_tests.png",
       combined, width = 12, height = 10,
       dpi = 300, bg = "white")
Show code
all_years_raw <- map_df(2018:2023, function(yr) {
  get(paste0("final_data_", yr)) %>%
    mutate(year = yr, id = as.character(id))
})

site_by_income <- all_years_raw %>%
  filter(parkname != "Stay_Home", VisitCount > 0) %>%
  mutate(id = as.character(id)) %>%
  left_join(income_lookup, by = "id") %>%
  filter(!is.na(income_block)) %>%
  group_by(income_block, parkname) %>%
  summarise(
    total_trips = sum(VisitCount, na.rm = TRUE),
    .groups     = "drop"
  ) %>%
  group_by(income_block) %>%
  mutate(
    pct_trips = round(total_trips / sum(total_trips) * 100, 1)
  ) %>%
  ungroup() %>%
  arrange(income_block, desc(pct_trips))


# Check if high income disproportionately visits Peacock and Waimanu
site_by_income %>%
  filter(parkname %in% c("Peacock Flats Campsite", 
                          "Waimanu Campsite")) %>%
  select(income_block, parkname, total_trips, pct_trips) %>%
  arrange(parkname, income_block) %>%
  print(n = 20)
# A tibble: 14 × 4
   income_block          parkname               total_trips pct_trips
   <fct>                 <chr>                        <dbl>     <dbl>
 1 "0-20th\n(Lowest)"    Peacock Flats Campsite        1090      55.6
 2 "20-40th"             Peacock Flats Campsite        1086      49  
 3 "40-60th"             Peacock Flats Campsite         398      18.9
 4 "60-80th"             Peacock Flats Campsite         525      24.6
 5 "80-90th"             Peacock Flats Campsite         399      41.4
 6 "90-95th"             Peacock Flats Campsite          45       7.7
 7 "95-100th\n(Highest)" Peacock Flats Campsite          54      11.2
 8 "0-20th\n(Lowest)"    Waimanu Campsite                95       4.8
 9 "20-40th"             Waimanu Campsite               250      11.3
10 "40-60th"             Waimanu Campsite                61       2.9
11 "60-80th"             Waimanu Campsite               441      20.6
12 "80-90th"             Waimanu Campsite               127      13.2
13 "90-95th"             Waimanu Campsite               278      47.8
14 "95-100th\n(Highest)" Waimanu Campsite               148      30.6
Show code
# What share of each income group's trips go to these two sites?
site_by_income %>%
  mutate(
    high_value_site = parkname %in% c("Peacock Flats Campsite",
                                       "Waimanu Campsite")
  ) %>%
  group_by(income_block, high_value_site) %>%
  summarise(
    pct_trips = sum(pct_trips),
    .groups   = "drop"
  ) %>%
  filter(high_value_site) %>%
  arrange(income_block)
# A tibble: 7 × 3
  income_block          high_value_site pct_trips
  <fct>                 <lgl>               <dbl>
1 "0-20th\n(Lowest)"    TRUE                 60.4
2 "20-40th"             TRUE                 60.3
3 "40-60th"             TRUE                 21.8
4 "60-80th"             TRUE                 45.2
5 "80-90th"             TRUE                 54.6
6 "90-95th"             TRUE                 55.5
7 "95-100th\n(Highest)" TRUE                 41.8

Spread Across Models & DCA

The spread here show the value of park based on whether or not the person comes from a disadvantage community.

The MDCEV is fairly stable compared to the RUM and the spread is larger whe nconsidering each trip occasion.

For DCA are consistently less in Observed RUM and MDCEV but is greater in Frequency 52.

Show code
# 1. Prepare data: Group by Park, Year, and EJ Status
plot_data_three_panel <- final_comparison_master1 %>%
  filter(parkname!="Stay_Home") %>%
  group_by(year, parkname, is_disadvantaged_50) %>%
  summarise(
    Observed_RUM = mean(loss_rum_observed, na.rm = TRUE),
    Frequency_RUM = mean(loss_rum_52wk, na.rm = TRUE),
    MDCEV = mean(loss_mdcev, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Pivot to long format
  pivot_longer(
    cols = c(Observed_RUM, Frequency_RUM, MDCEV),
    names_to = "Model_Type",
    values_to = "Welfare_Loss"
  ) %>%
  # Clean up factor labels for the legend and facets
  mutate(
    Model_Type = factor(Model_Type, 
                        levels = c("Observed_RUM", "Frequency_RUM", "MDCEV"),
                        labels = c("Observed RUM", "Frequency RUM (52-Wk)", "MDCEV")),
    EJ_Group = factor(is_disadvantaged_50, 
                      levels = c(0, 1), 
                      labels = c("Non-Disadvantaged", "Disadvantaged (EJ)"))
  )

# 2. Create the Three-Panel Plot
ggplot(plot_data_three_panel, aes(x = factor(year), y = Welfare_Loss, fill = EJ_Group)) +
  # Boxplot with black outlines
  geom_boxplot(outlier.shape = 21, outlier.size = 1, color = "black", alpha = 0.8) +
  
  # CREATE THREE PANELS (One for each Model)
  facet_wrap(~ Model_Type, scales = "free_y") + 
  
  # Grayscale for the EJ Comparison
  scale_fill_manual(values = c(
    "Non-Disadvantaged" = "#F0F0F0", # Very Light Grey
    "Disadvantaged (EJ)" = "#636363"  # Dark Grey
  )) +
  
  # Labels
  labs(
    title = "Annual Welfare Loss by Model Type and EJ Status",
    subtitle = "Comparing Disadvantaged vs. Non-Disadvantaged ZIP Codes across Hawaii Parks",
    x = "Year",
    y = "Average Annual Welfare Loss ($/Park)",
    fill = "Community Status"
  ) +
  
  # Academic Theme
  theme_bw() +
  theme(
    text = element_text(family = "serif", size = 16),
    plot.title = element_text(face = "bold", size = 20, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5, size = 16),
    
    # Grid and Panel styling
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "white", color = "black"),
    strip.text = element_text(face = "bold", size = 16),
    
    # Legend at bottom
    legend.position = "bottom",
    panel.border = element_rect(colour = "black", fill = NA, size = 0.8)
  )

Show code
ggsave("images/comparision_EJstatus.png", width=12, height=10, dpi=300)
Show code
#| cache: true
# 1. Prepare data: Group by Park, Year, and EJ Status
plot_data_three_panel <- final_comparison_master1 %>%
  filter(parkname!="Stay_Home") %>%
  group_by(year, parkname, is_disadvantaged_50_envi) %>%
  summarise(
    Observed_RUM = mean(loss_rum_observed, na.rm = TRUE),
    Frequency_RUM = mean(loss_rum_52wk, na.rm = TRUE),
    MDCEV = mean(loss_mdcev, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Pivot to long format
  pivot_longer(
    cols = c(Observed_RUM, Frequency_RUM, MDCEV),
    names_to = "Model_Type",
    values_to = "Welfare_Loss"
  ) %>%
  # Clean up factor labels for the legend and facets
  mutate(
    Model_Type = factor(Model_Type, 
                        levels = c("Observed_RUM", "Frequency_RUM", "MDCEV"),
                        labels = c("Observed RUM", "Frequency RUM (52-Wk)", "MDCEV")),
    EJ_Group = factor(is_disadvantaged_50_envi, 
                      levels = c(0, 1), 
                      labels = c("Non-Disadvantaged", "Disadvantaged (EJ)"))
  )

# 2. Create the Three-Panel Plot
ggplot(plot_data_three_panel, aes(x = factor(year), y = Welfare_Loss, fill = EJ_Group)) +
  # Boxplot with black outlines
  geom_boxplot(outlier.shape = 21, outlier.size = 1, color = "black", alpha = 0.8) +
  
  # CREATE THREE PANELS (One for each Model)
  facet_wrap(~ Model_Type, scales = "free_y") + 
  
  # Grayscale for the EJ Comparison
  scale_fill_manual(values = c(
    "Non-Disadvantaged" = "#F0F0F0", # Very Light Grey
    "Disadvantaged (EJ)" = "#636363"  # Dark Grey
  )) +
  
  # Labels
  labs(
    title = "Annual Welfare Loss by Model Type and Environmental Burdens Status",
    subtitle = "Comparing Disadvantaged vs. Non-Disadvantaged ZIP Codes across Hawaii Parks",
    x = "Year",
    y = "Average Annual Welfare Loss ($/Park)",
    fill = "Community Status"
  ) +
  
  # Academic Theme
  theme_bw() +
  theme(
    text = element_text(family = "serif", size = 12),
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5, size = 11),
    
    # Grid and Panel styling
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "white", color = "black"),
    strip.text = element_text(face = "bold", size = 11),
    
    # Legend at bottom
    legend.position = "bottom",
    panel.border = element_rect(colour = "black", fill = NA, size = 0.8)
  )

Show code
ggsave("images/comparision_envir_status.png", width=12, height=10, dpi=300)

# 1. Prepare data: Group by Park, Year, and EJ Status
plot_data_three_panel <- final_comparison_master1 %>%
  filter(parkname!="Stay_Home") %>%
  group_by(year, parkname, is_disadvantaged_50_envi) %>%
  summarise(
    Observed_RUM = mean(loss_rum_observed, na.rm = TRUE),
    MDCEV = mean(loss_mdcev, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Pivot to long format
  pivot_longer(
    cols = c(Observed_RUM, MDCEV),
    names_to = "Model_Type",
    values_to = "Welfare_Loss"
  ) %>%
  # Clean up factor labels for the legend and facets
  mutate(
    Model_Type = factor(Model_Type, 
                        levels = c("Observed_RUM",  "MDCEV"),
                        labels = c("Observed RUM", "MDCEV")),
    EJ_Group = factor(is_disadvantaged_50_envi, 
                      levels = c(0, 1), 
                      labels = c("Non-Disadvantaged", "Disadvantaged"))
  )

# 2. Create the Three-Panel Plot
ggplot(plot_data_three_panel, aes(x = factor(year), y = Welfare_Loss, fill = EJ_Group)) +
  # Boxplot with black outlines
  geom_boxplot(outlier.shape = 21, outlier.size = 1, color = "black", alpha = 0.8) +
  
  # CREATE THREE PANELS (One for each Model)
  facet_wrap(~ Model_Type, scales = "free_y") + 
  
  # Grayscale for the EJ Comparison
  scale_fill_manual(values = c(
    "Non-Disadvantaged" = "#F0F0F0", # Very Light Grey
    "Disadvantaged" = "#636363"  # Dark Grey
  )) +
  
  # Labels
  labs(
    title = "Annual Welfare Loss by Model Type and Environmental Burdens Status",
    subtitle = "Comparing Disadvantaged vs. Non-Disadvantaged ZIP Codes across Hawaii Parks",
    x = "Year",
    y = "Average Annual Welfare Loss ($/Park)",
    fill = "Community Status"
  ) +
  
  # Academic Theme
  theme_bw() +
  theme(
    text = element_text(family = "serif", size = 20),
    plot.title = element_text(face = "bold", size =20, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5, size = 20),
    
    # Grid and Panel styling
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "white", color = "black"),
    strip.text = element_text(face = "bold", size = 20),
    
    # Legend at bottom
    legend.position = "bottom",
    panel.border = element_rect(colour = "black", fill = NA, size = 0.8)
  )

Show code
ggsave("images/comparision_envir_observed.png", width=12, height=10, dpi=300)
Show code
# 1. Prepare data: Group by Park, Year, and EJ Status
plot_data_three_panel <- final_comparison_master1 %>%
  filter(parkname!="Stay_Home") %>%
  group_by(year, parkname, is_disadvantaged_50_soci) %>%
  summarise(
    Observed_RUM = mean(loss_rum_observed, na.rm = TRUE),
    Frequency_RUM = mean(loss_rum_52wk, na.rm = TRUE),
    MDCEV = mean(loss_mdcev, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Pivot to long format
  pivot_longer(
    cols = c(Observed_RUM, Frequency_RUM, MDCEV),
    names_to = "Model_Type",
    values_to = "Welfare_Loss"
  ) %>%
  # Clean up factor labels for the legend and facets
  mutate(
    Model_Type = factor(Model_Type, 
                        levels = c("Observed_RUM", "Frequency_RUM", "MDCEV"),
                        labels = c("Observed RUM", "Frequency RUM (52-Wk)", "MDCEV")),
    EJ_Group = factor(is_disadvantaged_50_soci, 
                      levels = c(0, 1), 
                      labels = c("Non-Disadvantaged", "Disadvantaged (EJ)"))
  )

# 2. Create the Three-Panel Plot
ggplot(plot_data_three_panel, aes(x = factor(year), y = Welfare_Loss, fill = EJ_Group)) +
  # Boxplot with black outlines
  geom_boxplot(outlier.shape = 21, outlier.size = 1, color = "black", alpha = 0.8) +
  
  # CREATE THREE PANELS (One for each Model)
  facet_wrap(~ Model_Type, scales = "free_y") + 
  
  # Grayscale for the EJ Comparison
  scale_fill_manual(values = c(
    "Non-Disadvantaged" = "#F0F0F0", # Very Light Grey
    "Disadvantaged (EJ)" = "#636363"  # Dark Grey
  )) +
  
  # Labels
  labs(
    title = "Annual Welfare Loss by Model Type and Social Vulnerabilities Status",
    subtitle = "Comparing Disadvantaged vs. Non-Disadvantaged ZIP Codes across Hawaii Parks",
    x = "Year",
    y = "Average Annual Welfare Loss ($/Park)",
    fill = "Community Status"
  ) +
  
  # Academic Theme
  theme_bw() +
  theme(
    text = element_text(family = "serif", size = 12),
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5, size = 11),
    
    # Grid and Panel styling
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "white", color = "black"),
    strip.text = element_text(face = "bold", size = 11),
    
    # Legend at bottom
    legend.position = "bottom",
    panel.border = element_rect(colour = "black", fill = NA, size = 0.8)
  )

Show code
ggsave("images/comparision_social_vulnerabilities.png", width=12, height=10, dpi=300)



# 1. Prepare data: Group by Park, Year, and EJ Status
plot_data_three_panel <- final_comparison_master1 %>%
  filter(parkname!="Stay_Home") %>%
  group_by(year, parkname, is_disadvantaged_50_soci) %>%
  summarise(
    Observed_RUM = mean(loss_rum_observed, na.rm = TRUE),
    MDCEV = mean(loss_mdcev, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Pivot to long format
  pivot_longer(
    cols = c(Observed_RUM, MDCEV),
    names_to = "Model_Type",
    values_to = "Welfare_Loss"
  ) %>%
  # Clean up factor labels for the legend and facets
  mutate(
    Model_Type = factor(Model_Type, 
                        levels = c("Observed_RUM", "MDCEV"),
                        labels = c("Observed RUM", "MDCEV")),
    EJ_Group = factor(is_disadvantaged_50_soci, 
                      levels = c(0, 1), 
                      labels = c("Non-Disadvantaged", "Disadvantaged"))
  )

# 2. Create the Three-Panel Plot
ggplot(plot_data_three_panel, aes(x = factor(year), y = Welfare_Loss, fill = EJ_Group)) +
  # Boxplot with black outlines
  geom_boxplot(outlier.shape = 21, outlier.size = 1, color = "black", alpha = 0.8) +
  
  # CREATE THREE PANELS (One for each Model)
  facet_wrap(~ Model_Type, scales = "free_y") + 
  
  # Grayscale for the EJ Comparison
  scale_fill_manual(values = c(
    "Non-Disadvantaged" = "#F0F0F0", # Very Light Grey
    "Disadvantaged" = "#636363"  # Dark Grey
  )) +
  
  # Labels
  labs(
    title = "Annual Welfare Loss by Model Type and Social Vulnerabilities Status",
    subtitle = "Comparing Disadvantaged vs. Non-Disadvantaged ZIP Codes across Hawaii Parks",
    x = "Year",
    y = "Average Annual Welfare Loss ($/Park)",
    fill = "Community Status"
  ) +
  
  # Academic Theme
  theme_bw() +
  theme(
    text = element_text(family = "serif", size = 20),
    plot.title = element_text(face = "bold", size = 20, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5, size = 20),
    
    # Grid and Panel styling
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "white", color = "black"),
    strip.text = element_text(face = "bold", size = 20),
    
    # Legend at bottom
    legend.position = "bottom",
    panel.border = element_rect(colour = "black", fill = NA, size = 0.8)
  )

Show code
ggsave("images/comparision_social_observed.png", width=12, height=10, dpi=300)

Map

Grey Map

Show code
options(tigris_use_cache = TRUE)
# 1. Get Hawaii Outlines
islands_outline <- states(cb = TRUE) %>%
  filter(STUSPS == "HI")

# 2. Get ZCTAs
zip_shapes <- zctas(cb = TRUE, year = 2020) %>%
  filter(str_starts(ZCTA5CE20, "967") | str_starts(ZCTA5CE20, "968"))

# 3. Zip averages — fix zip type here
zip_averages <- final_comparison_master1 %>%
  filter(parkname == "Stay_Home") %>%
  group_by(zip) %>%
  summarise(
    loss_mdcev       = mean(loss_mdcev,        na.rm = TRUE),
    is_disadvantaged = ifelse(
      mean(is_disadvantaged_50, na.rm = TRUE) > 0.5, "Yes", "No"
    ),
    .groups = "drop"
  ) %>%
  mutate(zip = as.character(zip))   # <-- KEY FIX

# Quick check before join
message(paste("Zip averages rows:", nrow(zip_averages)))
message(paste("Zip type:", class(zip_averages$zip)))
message(paste("ZCTA5CE20 type:", class(zip_shapes$ZCTA5CE20)))

# 4. Join
map_data <- zip_shapes %>%
  inner_join(zip_averages, by = c("ZCTA5CE20" = "zip"))

message(paste("Map data rows after join:", nrow(map_data)))

# 5. Bins
map_data <- map_data %>%
  mutate(
    loss_mdcev = ifelse(is.na(loss_mdcev) | loss_mdcev > 0, 0, loss_mdcev),
    loss_bin   = cut(loss_mdcev,
      breaks = c(-Inf, -46.03, -30.09, -17.94, 0),
      labels = c(">-$100",
                 "-$100 to -$50",
                 "-$50 to -$20",
                 "0 to -$20"),
      include.lowest = TRUE,
      right          = TRUE
    )
  )

# 6. Island labels
island_labels <- tibble(
  name     = c("Oʻahu", "Maui", "Hawaiʻi", "Kauaʻi", "Molokaʻi", "Lānaʻi"),
  x_anchor = c(-157.98, -156.33, -155.55, -159.52, -157.02, -156.92),
  y_anchor = c( 21.47,   20.80,   19.60,   22.06,   21.13,   20.83),
  x_label  = c(-158.60, -155.10, -157.20, -160.50, -156.60, -157.60),
  y_label  = c( 20.90,   21.50,   19.60,   22.20,   21.60,   20.15)
) %>%
  mutate(
    shrink      = 0.4,
    x_seg_start = x_label + shrink * (x_anchor - x_label),
    y_seg_start = y_label + shrink * (y_anchor - y_label)
  )

# 7. Plot
hawaii_map <- ggplot() +
  geom_sf(data = islands_outline, fill = "gray95", color = "gray60", size = 0.4) +
  
  geom_sf_pattern(
    data             = filter(map_data, is_disadvantaged == "No"),
    aes(fill = loss_bin, pattern = is_disadvantaged),
    pattern_color    = NA,
    pattern_fill     = NA,
    pattern_density  = 0.01,
    color            = "black",
    size             = 0.1
  ) +
  
  geom_sf_pattern(
    data             = filter(map_data, is_disadvantaged == "Yes"),
    aes(fill = loss_bin, pattern = is_disadvantaged),
    pattern_color    = "grey60",
    pattern_fill     = NA,
    pattern_density  = 0.4,
    pattern_spacing  = 0.03,
    pattern_angle    = 45,
    pattern_size     = 0.6,
    color            = "grey60",
    size             = 0.2
  ) +
  
  geom_segment(
    data      = island_labels,
    aes(x = x_seg_start, y = y_seg_start, xend = x_anchor, yend = y_anchor),
    color     = "gray50",
    linewidth = 0.3
  ) +
  
  geom_point(
    data  = island_labels,
    aes(x = x_anchor, y = y_anchor),
    color = "gray30",
    size  = 1.0
  ) +
  
  # White halo
  geom_text(
    data   = island_labels,
    aes(x = x_label, y = y_label, label = name),
    size   = 12, fontface = "bold", color = "white",
    family = "Times New Roman", hjust = 0.5, vjust = 0.5
  ) +
  # Actual text
  geom_text(
    data   = island_labels,
    aes(x = x_label, y = y_label, label = name),
    size   = 12, fontface = "bold", color = "gray10",
    family = "Times New Roman", hjust = 0.5, vjust = 0.5
  ) +
  
  annotation_north_arrow(
    location    = "bl",
    which_north = "true",
    height = unit(2.0, "cm"),
    width  = unit(1.5, "cm"),
    style  = north_arrow_fancy_orienteering(
      fill        = c("gray10", "white"),
      line_col    = "gray10",
      text_col    = "gray10",
      text_family = "Times New Roman",
      text_size   = 16
    ),
    pad_x = unit(1.0, "cm"),
    pad_y = unit(0.9, "cm")
  ) +
  
  annotation_scale(
    location      = "bl",
    unit_category = "imperial",
    text_family   = "Times New Roman",
    text_cex      = 1,
    style         = "bar",
    bar_cols      = c("gray10", "white"),
    line_col      = "gray10",
    text_col      = "gray10",
    pad_x = unit(1.0, "cm"),
    pad_y = unit(0.3, "cm")
  ) +
  
  scale_fill_manual(
    values = c(
      ">-$100"        = "gray10",
      "-$100 to -$50" = "gray40",
      "-$50 to -$20"  = "gray70",
      "0 to -$20"     = "gray92"
    ),
    name     = "Avg Welfare Loss",
    na.value = "transparent",
    guide    = guide_legend(
      override.aes   = list(pattern = "none"),
      reverse        = FALSE,
      keyheight      = unit(5, "mm"),
      keywidth       = unit(5, "mm"),
      label.position = "right"
    )
  ) +
  
  scale_pattern_manual(
    values = c("No" = "none", "Yes" = "crosshatch"),
    name   = "Community Type",
    labels = c("No" = "Non-Disadvantaged", "Yes" = "Disadvantaged"),
    guide  = guide_legend(
      override.aes = list(
        fill            = c("white", "white"),
        color           = c("black", "black"),
        pattern_color   = c(NA, "black"),
        pattern_fill    = c(NA, NA),
        pattern_density = c(0.01, 0.4),
        pattern_spacing = c(0.03, 0.02),
        pattern_angle   = c(45, 45)
      ),
      keyheight      = unit(7, "mm"),
      keywidth       = unit(7, "mm"),
      label.position = "right"
    )
  ) +
  
  coord_sf(xlim = c(-161.5, -154.0), ylim = c(18.7, 22.5), expand = FALSE) +
  theme_minimal() +
  theme(
    panel.grid           = element_blank(),
    axis.text            = element_blank(),
    axis.title           = element_blank(),
    axis.ticks           = element_blank(),
    legend.position      = c(0.1, 0.15),
    legend.justification = c("left", "bottom"),
    legend.box           = "vertical",
    legend.background    = element_rect(fill = "white", color = NA),
    text                 = element_text(family = "Times New Roman"),
    plot.title           = element_text(family = "Times New Roman", face = "bold"),
    legend.title         = element_text(family = "Times New Roman",
                                        face = "bold", size = 14),
    legend.text          = element_text(family = "Times New Roman", size = 14)
  )

ggsave(
  filename = "images/hawaii_welfare_loss_map.png",
  plot     = hawaii_map,
  width    = 12,
  height   = 10,
  dpi      = 300,
  units    = "in",
  bg       = "white"
)

Color Map

Show code
# 1. Get Hawaii Outlines
islands_outline <- states(cb = TRUE) %>%
  filter(STUSPS == "HI")

# 2. Get ZCTAs
zip_shapes <- zctas(cb = TRUE, year = 2020) %>%
  filter(str_starts(ZCTA5CE20, "967") | str_starts(ZCTA5CE20, "968"))

# 3. Zip averages
zip_averages <- final_comparison_master1 %>%
  filter(parkname == "Stay_Home") %>%
  group_by(zip) %>%
  summarise(
    loss_mdcev       = mean(loss_mdcev,        na.rm = TRUE),
    is_disadvantaged = ifelse(
      mean(is_disadvantaged_50, na.rm = TRUE) > 0.5, "Yes", "No"
    ),
    .groups = "drop"
  ) %>%
  mutate(zip = as.character(zip))

# 4. Join
map_data <- zip_shapes %>%
  inner_join(zip_averages, by = c("ZCTA5CE20" = "zip"))

# 5. Bins
map_data <- map_data %>%
  mutate(
    loss_mdcev = ifelse(is.na(loss_mdcev) | loss_mdcev > 0, 0, loss_mdcev),
    loss_bin   = cut(loss_mdcev,
      breaks = c(-Inf, -46.03, -30.09, -17.94, 0),
      labels = c(">-$100",
                 "-$100 to -$50",
                 "-$50 to -$20",
                 "0 to -$20"),
      include.lowest = TRUE,
      right          = TRUE
    )
  )

# 6. Island labels
island_labels <- tibble(
  name     = c("Oʻahu", "Maui", "Hawaiʻi", "Kauaʻi", "Molokaʻi", "Lānaʻi"),
  x_anchor = c(-157.98, -156.33, -155.55, -159.52, -157.02, -156.92),
  y_anchor = c( 21.47,   20.80,   19.60,   22.06,   21.13,   20.83),
  x_label  = c(-158.60, -155.10, -157.20, -160.50, -156.60, -157.60),
  y_label  = c( 20.90,   21.50,   19.60,   22.20,   21.60,   20.15)
) %>%
  mutate(
    shrink      = 0.4,
    x_seg_start = x_label + shrink * (x_anchor - x_label),
    y_seg_start = y_label + shrink * (y_anchor - y_label)
  )

# 7. Color map — white to dark red
hawaii_map_color <- ggplot() +
  geom_sf(data = islands_outline, fill = "gray95", color = "gray60", size = 0.4) +
  
# Non-DAC layer
geom_sf_pattern(
    data             = filter(map_data, is_disadvantaged == "No"),
    aes(fill = loss_bin, pattern = is_disadvantaged),
    pattern_color    = NA,
    pattern_fill     = "transparent",   # <-- change from NA
    pattern_density  = 0.01,
    color            = "white",
    size             = 0.1
) +

# DAC layer
geom_sf_pattern(
    data             = filter(map_data, is_disadvantaged == "Yes"),
    aes(fill = loss_bin, pattern = is_disadvantaged),
    pattern_color    = "black",
    pattern_fill     = "transparent",   # <-- change from NA
    pattern_density  = 0.4,
    pattern_spacing  = 0.03,
    pattern_angle    = 45,
    pattern_size     = 0.6,
    color            = "white",
    size             = 0.2
) +
  
  geom_segment(
    data      = island_labels,
    aes(x = x_seg_start, y = y_seg_start, xend = x_anchor, yend = y_anchor),
    color     = "gray30",
    linewidth = 0.3
  ) +
  
  geom_point(
    data  = island_labels,
    aes(x = x_anchor, y = y_anchor),
    color = "gray20",
    size  = 1.0
  ) +
  
  # White halo
  geom_text(
    data   = island_labels,
    aes(x = x_label, y = y_label, label = name),
    size   = 12, fontface = "bold", color = "white",
    family = "Times New Roman", hjust = 0.5, vjust = 0.5
  ) +
  # Actual text
  geom_text(
    data   = island_labels,
    aes(x = x_label, y = y_label, label = name),
    size   = 11, fontface = "bold", color = "gray10",
    family = "Times New Roman", hjust = 0.5, vjust = 0.5
  ) +
  
  annotation_north_arrow(
    location    = "bl",
    which_north = "true",
    height = unit(2.0, "cm"),
    width  = unit(1.5, "cm"),
    style  = north_arrow_fancy_orienteering(
      fill        = c("gray10", "white"),
      line_col    = "gray10",
      text_col    = "gray10",
      text_family = "Times New Roman",
      text_size   = 16
    ),
    pad_x = unit(1.0, "cm"),
    pad_y = unit(0.9, "cm")
  ) +
  
  annotation_scale(
    location      = "bl",
    unit_category = "imperial",
    text_family   = "Times New Roman",
    text_cex      = 1,
    style         = "bar",
    bar_cols      = c("gray10", "white"),
    line_col      = "gray10",
    text_col      = "gray10",
    pad_x = unit(1.0, "cm"),
    pad_y = unit(0.3, "cm")
  ) +
  
  # ── COLOR SCALE: white (low loss) to dark red (high loss) ─────────────────
  scale_fill_manual(
    values = c(
      ">-$100"        = "#67000d",   # darkest red   — highest loss
      "-$100 to -$50" = "#cb181d",   # medium red
      "-$50 to -$20"  = "#fc8d59",   # light orange-red
      "0 to -$20"     = "#fff5f0"    # near white    — lowest loss
    ),
    name     = "Avg Welfare Loss",
    na.value = "transparent",
    guide    = guide_legend(
      override.aes   = list(pattern = "none"),
      reverse        = FALSE,
      keyheight      = unit(6, "mm"),
      keywidth       = unit(6, "mm"),
      label.position = "right"
    )
  ) +
  
  scale_pattern_manual(
    values = c("No" = "none", "Yes" = "crosshatch"),
    name   = "Community Type",
    labels = c("No" = "Non-Disadvantaged", "Yes" = "Disadvantaged"),
    guide  = guide_legend(
      override.aes = list(
        fill            = c("white", "white"),
        color           = c("black", "black"),
        pattern_color   = c(NA, "black"),
        pattern_fill    = c(NA, NA),
        pattern_density = c(0.01, 0.4),
        pattern_spacing = c(0.03, 0.02),
        pattern_angle   = c(45, 45)
      ),
      keyheight      = unit(7, "mm"),
      keywidth       = unit(7, "mm"),
      label.position = "right"
    )
  ) +
  
  coord_sf(xlim = c(-161.5, -154.0), ylim = c(18.7, 22.5), expand = FALSE) +
  theme_minimal() +
  theme(
    panel.grid           = element_blank(),
    axis.text            = element_blank(),
    axis.title           = element_blank(),
    axis.ticks           = element_blank(),
    legend.position      = c(0.1, 0.15),
    legend.justification = c("left", "bottom"),
    legend.box           = "vertical",
    legend.background    = element_rect(fill = "white", color = NA),
    text                 = element_text(family = "Times New Roman"),
    plot.title           = element_text(family = "Times New Roman", face = "bold"),
    legend.title         = element_text(family = "Times New Roman",
                                        face = "bold", size = 14),
    legend.text          = element_text(family = "Times New Roman", size = 14)
  )

ggsave(
  filename = "images/hawaii_welfare_loss_map_color.png",
  plot     = hawaii_map_color,
  width    = 12,
  height   = 10,
  dpi      = 300,
  units    = "in",
  bg       = "white"
)

Color RUM

Show code
# 1. Get Hawaii Outlines
islands_outline <- states(cb = TRUE) %>%
  filter(STUSPS == "HI")

# 2. Get ZCTAs
zip_shapes <- zctas(cb = TRUE, year = 2020) %>%
  filter(str_starts(ZCTA5CE20, "967") | str_starts(ZCTA5CE20, "968"))

plot_data <- final_comparison_master1 %>%
  group_by(zip, parkname, year,is_disadvantaged_50) %>%
  filter(parkname!="Stay_Home")%>%
  summarise(
    ObservedRUM = mean(loss_rum_observed, na.rm = TRUE),
    FrequencyRUM = mean(loss_rum_52wk, na.rm = TRUE),
    .groups = "drop"
  )


# 3. Zip averages
zip_averages <- plot_data %>%
  # Step 1: Sum welfare across all sites within each zip and year
  group_by(zip, year) %>%
  summarise(
    loss_rum_observed = sum(ObservedRUM, na.rm = TRUE),
    is_disadvantaged  = ifelse(
      mean(is_disadvantaged_50, na.rm = TRUE) > 0.5, "Yes", "No"
    ),
    .groups = "drop"
  ) %>%
  
  # Step 2: Average the yearly sums across all years
  group_by(zip) %>%
  summarise(
    loss_rum_observed = mean(loss_rum_observed, na.rm = TRUE),
    is_disadvantaged  = ifelse(          # <-- uses is_disadvantaged not is_disadvantaged_50
      mean(is_disadvantaged == "Yes", na.rm = TRUE) > 0.5, "Yes", "No"
    ),
    .groups = "drop"
  ) %>%
  mutate(zip = as.character(zip))

# 4. Join
map_data <- zip_shapes %>%
  inner_join(zip_averages, by = c("ZCTA5CE20" = "zip"))

# 5. Bins
map_data <- map_data %>%
  mutate(
    loss_rum_observed = ifelse(is.na(loss_rum_observed) | loss_rum_observed > 0, 0, loss_rum_observed),
    loss_bin   = cut(loss_rum_observed,
      breaks = c(-Inf, -400, -300, -200, -100),
      labels = c(">-$400",
                 "-$400 to -$300",
                 "-$300 to -$200",
                 "-$100 to -$200"),
      include.lowest = TRUE,
      right          = TRUE
    )
  )

# 6. Island labels
island_labels <- tibble(
  name     = c("Oʻahu", "Maui", "Hawaiʻi", "Kauaʻi", "Molokaʻi", "Lānaʻi"),
  x_anchor = c(-157.98, -156.33, -155.55, -159.52, -157.02, -156.92),
  y_anchor = c( 21.47,   20.80,   19.60,   22.06,   21.13,   20.83),
  x_label  = c(-158.60, -155.10, -157.20, -160.50, -156.60, -157.60),
  y_label  = c( 20.90,   21.50,   19.60,   22.20,   21.60,   20.15)
) %>%
  mutate(
    shrink      = 0.4,
    x_seg_start = x_label + shrink * (x_anchor - x_label),
    y_seg_start = y_label + shrink * (y_anchor - y_label)
  )

# 7. Color map — white to dark red
hawaii_map_color <- ggplot() +
  geom_sf(data = islands_outline, fill = "gray95", color = "gray60", size = 0.4) +
  
# Non-DAC layer
geom_sf_pattern(
    data             = filter(map_data, is_disadvantaged == "No"),
    aes(fill = loss_bin, pattern = is_disadvantaged),
    pattern_color    = NA,
    pattern_fill     = "transparent",   # <-- change from NA
    pattern_density  = 0.01,
    color            = "white",
    size             = 0.1
) +

# DAC layer
geom_sf_pattern(
    data             = filter(map_data, is_disadvantaged == "Yes"),
    aes(fill = loss_bin, pattern = is_disadvantaged),
    pattern_color    = "black",
    pattern_fill     = "transparent",   # <-- change from NA
    pattern_density  = 0.4,
    pattern_spacing  = 0.03,
    pattern_angle    = 45,
    pattern_size     = 0.6,
    color            = "white",
    size             = 0.2
) +
  
  geom_segment(
    data      = island_labels,
    aes(x = x_seg_start, y = y_seg_start, xend = x_anchor, yend = y_anchor),
    color     = "gray30",
    linewidth = 0.3
  ) +
  
  geom_point(
    data  = island_labels,
    aes(x = x_anchor, y = y_anchor),
    color = "gray20",
    size  = 1.0
  ) +
  
  # White halo
  geom_text(
    data   = island_labels,
    aes(x = x_label, y = y_label, label = name),
    size   = 12, fontface = "bold", color = "white",
    family = "Times New Roman", hjust = 0.5, vjust = 0.5
  ) +
  # Actual text
  geom_text(
    data   = island_labels,
    aes(x = x_label, y = y_label, label = name),
    size   = 11, fontface = "bold", color = "gray10",
    family = "Times New Roman", hjust = 0.5, vjust = 0.5
  ) +
  
  annotation_north_arrow(
    location    = "bl",
    which_north = "true",
    height = unit(2.0, "cm"),
    width  = unit(1.5, "cm"),
    style  = north_arrow_fancy_orienteering(
      fill        = c("gray10", "white"),
      line_col    = "gray10",
      text_col    = "gray10",
      text_family = "Times New Roman",
      text_size   = 16
    ),
    pad_x = unit(1.0, "cm"),
    pad_y = unit(0.9, "cm")
  ) +
  
  annotation_scale(
    location      = "bl",
    unit_category = "imperial",
    text_family   = "Times New Roman",
    text_cex      = 1,
    style         = "bar",
    bar_cols      = c("gray10", "white"),
    line_col      = "gray10",
    text_col      = "gray10",
    pad_x = unit(1.0, "cm"),
    pad_y = unit(0.3, "cm")
  ) +
  
  # ── COLOR SCALE: white (low loss) to dark red (high loss) ─────────────────
  scale_fill_manual(
    values = c(
      ">-$400"        = "#67000d",   # darkest red   — highest loss
      "-$400 to -$300" = "#cb181d",   # medium red
      "-$300 to -$200"  = "#fc8d59",   # light orange-red
      "-$100 to -$200"     = "#fff5f0"    # near white    — lowest loss
    ),
    name     = "Avg Welfare Loss",
    na.value = "transparent",
    guide    = guide_legend(
      override.aes   = list(pattern = "none"),
      reverse        = FALSE,
      keyheight      = unit(6, "mm"),
      keywidth       = unit(6, "mm"),
      label.position = "right"
    )
  ) +
  
  scale_pattern_manual(
    values = c("No" = "none", "Yes" = "crosshatch"),
    name   = "Community Type",
    labels = c("No" = "Non-Disadvantaged", "Yes" = "Disadvantaged"),
    guide  = guide_legend(
      override.aes = list(
        fill            = c("white", "white"),
        color           = c("black", "black"),
        pattern_color   = c(NA, "black"),
        pattern_fill    = c(NA, NA),
        pattern_density = c(0.01, 0.4),
        pattern_spacing = c(0.03, 0.02),
        pattern_angle   = c(45, 45)
      ),
      keyheight      = unit(7, "mm"),
      keywidth       = unit(7, "mm"),
      label.position = "right"
    )
  ) +
  
  coord_sf(xlim = c(-161.5, -154.0), ylim = c(18.7, 22.5), expand = FALSE) +
  theme_minimal() +
  theme(
    panel.grid           = element_blank(),
    axis.text            = element_blank(),
    axis.title           = element_blank(),
    axis.ticks           = element_blank(),
    legend.position      = c(0.1, 0.15),
    legend.justification = c("left", "bottom"),
    legend.box           = "vertical",
    legend.background    = element_rect(fill = "white", color = NA),
    text                 = element_text(family = "Times New Roman"),
    plot.title           = element_text(family = "Times New Roman", face = "bold"),
    legend.title         = element_text(family = "Times New Roman",
                                        face = "bold", size = 14),
    legend.text          = element_text(family = "Times New Roman", size = 14)
  )

ggsave(
  filename = "images/hawaii_welfare_loss_map_colorRUM.png",
  plot     = hawaii_map_color,
  width    = 12,
  height   = 10,
  dpi      = 300,
  units    = "in",
  bg       = "white"
)
Show code
print(hawaii_map_color)

Show code
print(hawaii_map)

Sensitivity Test

Show code
get_price_coef <- function(mle, yr, model_name) {
  
  tryCatch({
    # mlogit coefficients
    beta  <- coef(mle)["price"]
    
    # variance-covariance matrix
    se    <- sqrt(vcov(mle)["price", "price"])
    tibble(
      year       = yr,
      model      = model_name,
      beta_price = beta,
      se_price   = se,
      ci_low     = beta - 1.96 * se,
      ci_high    = beta + 1.96 * se
    )
  }, error = function(e) {
    message(paste("Failed for", model_name, yr, ":", e$message))
    tibble(
      year       = yr,
      model      = model_name,
      beta_price = NA_real_,
      se_price   = NA_real_,
      ci_low     = NA_real_,
      ci_high    = NA_real_
    )
  })
}



# PRice Stability
price_stability_all <- map_df(2018:2023, function(yr) {
  
  obs_key  <- paste0("Year_", yr)   
  freq_key <- paste0("52wk_", yr)   
  
  obs_mle  <- model_list_obs[[obs_key]]
  freq_mle <- model_list_52[[freq_key]]
  
  bind_rows(
    if (!is.null(obs_mle))
      get_price_coef(obs_mle,  yr, "Observed RUM"),
    if (!is.null(freq_mle))
      get_price_coef(freq_mle, yr, "Frequency RUM")
  )
})

print(price_stability_all)
# A tibble: 6 × 6
   year model        beta_price se_price   ci_low  ci_high
  <int> <chr>             <dbl>    <dbl>    <dbl>    <dbl>
1  2018 Observed RUM   -0.00608 0.000275 -0.00662 -0.00554
2  2019 Observed RUM   -0.00632 0.000290 -0.00689 -0.00575
3  2020 Observed RUM   -0.00667 0.000354 -0.00737 -0.00598
4  2021 Observed RUM   -0.00667 0.000240 -0.00714 -0.00620
5  2022 Observed RUM   -0.00664 0.000247 -0.00713 -0.00616
6  2023 Observed RUM   -0.00704 0.000264 -0.00755 -0.00652
Show code
year_pairs <- combn(2018:2023, 2, simplify = FALSE)
# Pairwise welfare stability — all three models 
test_year_pairs_model <- function(yr1, yr2, loss_col) {
  
  d1 <- final_comparison_master %>%
    filter(year == yr1, parkname != "Stay_Home") %>%
    pull({{ loss_col }})
  
  d2 <- final_comparison_master %>%
    filter(year == yr2, parkname != "Stay_Home") %>%
    pull({{ loss_col }})
  
  t_result <- t.test(d1, d2)
  
  tibble(
    year1   = yr1,
    year2   = yr2,
    mean1   = round(mean(d1, na.rm = TRUE), 2),
    mean2   = round(mean(d2, na.rm = TRUE), 2),
    t_stat  = round(t_result$statistic, 3),
    p_value = round(t_result$p.value,   4),
    stable  = t_result$p.value > 0.05
  )
}

stability_all <- bind_rows(
  map_df(year_pairs, ~test_year_pairs_model(.x[1], .x[2], loss_mdcev)) %>%
    mutate(model = "MDCEV"),
  map_df(year_pairs, ~test_year_pairs_model(.x[1], .x[2], loss_rum_observed)) %>%
    mutate(model = "Observed RUM"),
  map_df(year_pairs, ~test_year_pairs_model(.x[1], .x[2], loss_rum_52wk)) %>%
    mutate(model = "Frequency RUM")
)


# Test  Convergent validity across models — do they agree on rankings? 
site_welfare <- final_comparison_master %>%
    filter(parkname != "Stay_Home") %>%
    group_by(parkname) %>%
    summarise(
        mean_mdcev = mean(loss_mdcev,        na.rm = TRUE),
        mean_obs   = mean(loss_rum_observed, na.rm = TRUE),
        mean_52wk  = mean(loss_rum_52wk,     na.rm = TRUE),
        .groups    = "drop"
    )

# Spearman rank correlations across models
cross_model_cor <- site_welfare %>%
    select(mean_mdcev, mean_obs, mean_52wk) %>%
    cor(method = "spearman", use = "complete.obs")

colnames(cross_model_cor) <- c("MDCEV", "Observed RUM", "Frequency RUM")
rownames(cross_model_cor) <- c("MDCEV", "Observed RUM", "Frequency RUM")

print(round(cross_model_cor, 3))
              MDCEV Observed RUM Frequency RUM
MDCEV         1.000        0.912         0.912
Observed RUM  0.912        1.000         1.000
Frequency RUM 0.912        1.000         1.000
Show code
# Transfer error across models 
# Ji et al. transfer error as:
# TE = |predicted - actual| / |actual| × 100

cross_model_te <- site_welfare %>%
    mutate(
        # Use MDCEV to predict each RUM
        te_mdcev_to_obs  = abs((mean_mdcev - mean_obs)   / mean_obs)  * 100,
        te_mdcev_to_52wk = abs((mean_mdcev - mean_52wk)  / mean_52wk) * 100,
        # Use observed RUM to predict frequency RUM
        te_obs_to_52wk   = abs((mean_obs   - mean_52wk)  / mean_52wk) * 100
    ) %>%
    summarise(
        # Mean transfer error across sites
        mean_te_mdcev_obs  = round(mean(te_mdcev_to_obs,  na.rm = TRUE), 1),
        mean_te_mdcev_52wk = round(mean(te_mdcev_to_52wk, na.rm = TRUE), 1),
        mean_te_obs_52wk   = round(mean(te_obs_to_52wk,   na.rm = TRUE), 1),
        # Median transfer error
        med_te_mdcev_obs   = round(median(te_mdcev_to_obs,  na.rm = TRUE), 1),
        med_te_mdcev_52wk  = round(median(te_mdcev_to_52wk, na.rm = TRUE), 1),
        med_te_obs_52wk    = round(median(te_obs_to_52wk,   na.rm = TRUE), 1)
    )

print(cross_model_te)
# A tibble: 1 × 6
  mean_te_mdcev_obs mean_te_mdcev_52wk mean_te_obs_52wk med_te_mdcev_obs
              <dbl>              <dbl>            <dbl>            <dbl>
1              71.7               76.9             19.7             73.7
# ℹ 2 more variables: med_te_mdcev_52wk <dbl>, med_te_obs_52wk <dbl>
Show code
# Equality tests — are model welfare estimates significantly different from each other? 
# (Ji et al. Section 4)

# Paired t-test: MDCEV vs Observed RUM
t_mdcev_obs <- t.test(
    final_comparison_master$loss_mdcev,
    final_comparison_master$loss_rum_observed,
    paired = TRUE
)

# Paired t-test: MDCEV vs Frequency RUM
t_mdcev_52wk <- t.test(
    final_comparison_master$loss_mdcev,
    final_comparison_master$loss_rum_52wk,
    paired = TRUE
)

# Paired t-test: Observed RUM vs Frequency RUM
t_obs_52wk <- t.test(
    final_comparison_master$loss_rum_observed,
    final_comparison_master$loss_rum_52wk,
    paired = TRUE
)

equality_tests <- tibble(
    comparison     = c("MDCEV vs Observed RUM",
                       "MDCEV vs Frequency RUM",
                       "Observed RUM vs Frequency RUM"),
    mean_diff      = c(round(t_mdcev_obs$estimate,  2),
                       round(t_mdcev_52wk$estimate, 2),
                       round(t_obs_52wk$estimate,   2)),
    t_stat         = c(round(t_mdcev_obs$statistic,  3),
                       round(t_mdcev_52wk$statistic, 3),
                       round(t_obs_52wk$statistic,   3)),
    p_value        = c(round(t_mdcev_obs$p.value,  4),
                       round(t_mdcev_52wk$p.value, 4),
                       round(t_obs_52wk$p.value,   4)),
    significantly_different = c(
        t_mdcev_obs$p.value  < 0.05,
        t_mdcev_52wk$p.value < 0.05,
        t_obs_52wk$p.value   < 0.05
    )
)

print(equality_tests)
# A tibble: 3 × 5
  comparison                    mean_diff t_stat p_value significantly_different
  <chr>                             <dbl>  <dbl>   <dbl> <lgl>                  
1 MDCEV vs Observed RUM              6.81   69.3       0 TRUE                   
2 MDCEV vs Frequency RUM          1757.     91.2       0 TRUE                   
3 Observed RUM vs Frequency RUM   1751.     90.8       0 TRUE                   
Show code
# Transfer error by year pair AND model pair 
# Ji et al.'s most comprehensive test — applies both dimensions

te_full <- final_comparison_master %>%
    filter(parkname != "Stay_Home", year!=2020, year!=2021) %>%
    group_by(parkname, year) %>%
    summarise(
        mean_mdcev = mean(loss_mdcev,        na.rm = TRUE),
        mean_obs   = mean(loss_rum_observed, na.rm = TRUE),
        mean_52wk  = mean(loss_rum_52wk,     na.rm = TRUE),
        .groups    = "drop"
    ) %>%
    full_join(
        final_comparison_master %>%
        
            filter(parkname != "Stay_Home") %>%
            group_by(parkname, year) %>%
            summarise(
                pred_mdcev = mean(loss_mdcev,        na.rm = TRUE),
                pred_obs   = mean(loss_rum_observed, na.rm = TRUE),
                pred_52wk  = mean(loss_rum_52wk,     na.rm = TRUE),
                .groups    = "drop"
            ) %>%
            rename(year2 = year),
        by = "parkname"
    ) %>%
    filter(year != year2) %>%
    mutate(
        # Within model temporal transfer error
        te_mdcev_time = abs((pred_mdcev - mean_mdcev) / mean_mdcev) * 100,
        te_obs_time   = abs((pred_obs   - mean_obs)   / mean_obs)   * 100,
        te_52wk_time  = abs((pred_52wk  - mean_52wk)  / mean_52wk)  * 100,
        # Cross model transfer error (use MDCEV from year1 to predict RUM in year2)
        te_mdcev_to_obs_cross  = abs((pred_mdcev - mean_obs)   / mean_obs)   * 100,
        te_mdcev_to_52wk_cross = abs((pred_mdcev - mean_52wk)  / mean_52wk)  * 100
    ) %>%
    group_by(year, year2) %>%
    summarise(
        te_mdcev       = round(mean(te_mdcev_time,na.rm = TRUE), 1),
        te_obs         = round(mean(te_obs_time,na.rm = TRUE), 1),
        te_52wk        = round(mean(te_52wk_time,na.rm = TRUE), 1),
        te_cross_mdcev_obs  = round(mean(te_mdcev_to_obs_cross,  na.rm = TRUE), 1),
        te_cross_mdcev_52wk = round(mean(te_mdcev_to_52wk_cross, na.rm = TRUE), 1),
        .groups        = "drop"
    )

print(te_full, n = 30)
# A tibble: 20 × 7
    year year2 te_mdcev te_obs te_52wk te_cross_mdcev_obs te_cross_mdcev_52wk
   <dbl> <dbl>    <dbl>  <dbl>   <dbl>              <dbl>               <dbl>
 1  2018  2019     36.3   27.8    26.7               74.8                79.5
 2  2018  2020     76.4   76.5    68.9               69.3                71.7
 3  2018  2021     78     63.3    49.6               68.3                77  
 4  2018  2022     84.1   64.2    46.3               59.7                70.7
 5  2018  2023     91     62.7    48.1               64.2                71.3
 6  2019  2018     95.5   54.2    51.6               63.9                74  
 7  2019  2020    128.    99      84.1               67.8                69.7
 8  2019  2021    220.   109.     78.8               59.9                71.6
 9  2019  2022    172.   106.     80                 52.8                66.1
10  2019  2023    200.    95.4    70.9               59.4                68.2
11  2022  2018     82.5   71.4    69.2               77.6                82  
12  2022  2019     81.5   77.5    77.6               78.3                81  
13  2022  2020     51.7   44.5    46.1               76.7                78  
14  2022  2021     54.3   33.7    29.3               74.3                77.4
15  2022  2023     33.9   19.4    19.1               75.5                76.8
16  2023  2018     90.2   74.5    76.7               75.7                80.2
17  2023  2019     85.9   72.6    72.1               75.7                79  
18  2023  2020     48.5   47.1    50.1               69.2                70.9
19  2023  2021     78.2   44.7    39.1               70.4                74.1
20  2023  2022     32.8   21.1    21                 69                  71.6
Show code
# Summary table matching Ji et al. Table format
summary_ji <- tibble(
    Test               = c(
        "Mean Spearman Rank Correlation",
        "Mean Transfer Error — MDCEV",
        "Mean Transfer Error — Observed RUM",
        "Mean Transfer Error — Frequency RUM",
        "% Year Pairs Stable — MDCEV",
        "% Year Pairs Stable — Observed RUM",
        "% Year Pairs Stable — Frequency RUM"
    ),
    Value = c(
        round(mean(c(cross_model_cor[1,2],
                     cross_model_cor[1,3],
                     cross_model_cor[2,3])), 3),
        round(mean(te_full$te_mdcev,  na.rm = TRUE), 1),
        round(mean(te_full$te_obs,    na.rm = TRUE), 1),
        round(mean(te_full$te_52wk,   na.rm = TRUE), 1),
        round(mean(stability_all$stable[stability_all$model == "MDCEV"])         * 100, 1),
        round(mean(stability_all$stable[stability_all$model == "Observed RUM"])  * 100, 1),
        round(mean(stability_all$stable[stability_all$model == "Frequency RUM"]) * 100, 1)
    )
)

print(summary_ji)
# A tibble: 7 × 2
  Test                                 Value
  <chr>                                <dbl>
1 Mean Spearman Rank Correlation       0.941
2 Mean Transfer Error — MDCEV         91    
3 Mean Transfer Error — Observed RUM  63.3  
4 Mean Transfer Error — Frequency RUM 55.3  
5 % Year Pairs Stable — MDCEV         66.7  
6 % Year Pairs Stable — Observed RUM  26.7  
7 % Year Pairs Stable — Frequency RUM 33.3  

What Years?

Given those % pair stable years. What Years are stable across models? Is this just a covid issue? It does kind of look like it because its unstable for comparisons to 2020 then becomes more stable after 2021/2022 later comparsions.

Interesting MDCEV 2019 to years excluding 2020 stable… thats were the 66% comes from.

Show code
site_welfare <- final_comparison_master %>%
    filter(parkname != "Stay_Home") %>%
    group_by(parkname) %>%
    summarise(
        mean_mdcev = mean(loss_mdcev,        na.rm = TRUE),
        mean_obs   = mean(loss_rum_observed, na.rm = TRUE),
        mean_52wk  = mean(loss_rum_52wk,     na.rm = TRUE),
        .groups    = "drop"
    )
# Which year pairs are NOT significantly different (stable) by model 
stability_all %>%
  filter(stable == TRUE) %>%
  select(model, year1, year2, mean1, mean2, t_stat, p_value) %>%
  arrange(model, year1, year2) %>%
  print(n = 50)
# A tibble: 19 × 7
   model         year1 year2  mean1  mean2 t_stat p_value
   <chr>         <int> <int>  <dbl>  <dbl>  <dbl>   <dbl>
 1 Frequency RUM  2018  2019 -14.6  -14.0  -1.84   0.0653
 2 Frequency RUM  2020  2022 -11.6  -11.5  -0.692  0.489 
 3 Frequency RUM  2021  2022 -11.2  -11.5   1.49   0.135 
 4 Frequency RUM  2021  2023 -11.2  -11.1  -0.38   0.704 
 5 Frequency RUM  2022  2023 -11.5  -11.1  -1.83   0.0677
 6 MDCEV          2018  2019  -3.45  -3.05 -1.70   0.0884
 7 MDCEV          2019  2021  -3.05  -2.79 -1.42   0.155 
 8 MDCEV          2019  2022  -3.05  -2.84 -1.16   0.248 
 9 MDCEV          2019  2023  -3.05  -2.82 -1.21   0.226 
10 MDCEV          2020  2021  -2.57  -2.79  1.03   0.302 
11 MDCEV          2020  2022  -2.57  -2.84  1.27   0.203 
12 MDCEV          2020  2023  -2.57  -2.82  1.15   0.252 
13 MDCEV          2021  2022  -2.79  -2.84  0.279  0.78  
14 MDCEV          2021  2023  -2.79  -2.82  0.167  0.867 
15 MDCEV          2022  2023  -2.84  -2.82 -0.101  0.920 
16 Observed RUM   2018  2019 -13.0  -12.5  -1.29   0.198 
17 Observed RUM   2020  2023 -10.3  -10.9   1.84   0.0661
18 Observed RUM   2021  2022 -11.6  -11.1  -1.55   0.122 
19 Observed RUM   2022  2023 -11.1  -10.9  -0.914  0.361 
Show code
# ── Summary count of stable pairs per model 
stability_all %>%
  group_by(model) %>%
  summarise(
    n_pairs        = n(),
    n_stable       = sum(stable),
    n_unstable     = sum(!stable),
    pct_stable     = round(mean(stable) * 100, 1),
    stable_pairs   = paste(
      paste0(year1[stable], "–", year2[stable]),
      collapse = ", "
    ),
    .groups = "drop"
  ) %>%
  print()
# A tibble: 3 × 6
  model         n_pairs n_stable n_unstable pct_stable stable_pairs             
  <chr>           <int>    <int>      <int>      <dbl> <chr>                    
1 Frequency RUM      15        5         10       33.3 2018–2019, 2020–2022, 20…
2 MDCEV              15       10          5       66.7 2018–2019, 2019–2021, 20…
3 Observed RUM       15        4         11       26.7 2018–2019, 2020–2023, 20…
Show code
# ── Full table showing stable vs unstable clearly 
stability_all %>%
  mutate(
    year_pair  = paste0(year1, "–", year2),
    stability  = ifelse(stable, "Stable", "Unstable"),
    sig_stars  = case_when(
      p_value < 0.001 ~ "***",
      p_value < 0.01  ~ "**",
      p_value < 0.05  ~ "*",
      TRUE            ~ "n.s."
    )
  ) %>%
  select(model, year_pair, mean1, mean2, t_stat, p_value, sig_stars, stability) %>%
  arrange(model, year_pair) %>%
  print(n = 50)
# A tibble: 45 × 8
   model         year_pair  mean1  mean2  t_stat p_value sig_stars stability
   <chr>         <chr>      <dbl>  <dbl>   <dbl>   <dbl> <chr>     <chr>    
 1 Frequency RUM 2018–2019 -14.6  -14.0   -1.84   0.0653 n.s.      Stable   
 2 Frequency RUM 2018–2020 -14.6  -11.6  -10.4    0      ***       Unstable 
 3 Frequency RUM 2018–2021 -14.6  -11.2  -13.2    0      ***       Unstable 
 4 Frequency RUM 2018–2022 -14.6  -11.5  -11.9    0      ***       Unstable 
 5 Frequency RUM 2018–2023 -14.6  -11.1  -13.3    0      ***       Unstable 
 6 Frequency RUM 2019–2020 -14.0  -11.6   -8.34   0      ***       Unstable 
 7 Frequency RUM 2019–2021 -14.0  -11.2  -10.9    0      ***       Unstable 
 8 Frequency RUM 2019–2022 -14.0  -11.5   -9.62   0      ***       Unstable 
 9 Frequency RUM 2019–2023 -14.0  -11.1  -11.0    0      ***       Unstable 
10 Frequency RUM 2020–2021 -11.6  -11.2   -2.02   0.0435 *         Unstable 
11 Frequency RUM 2020–2022 -11.6  -11.5   -0.692  0.489  n.s.      Stable   
12 Frequency RUM 2020–2023 -11.6  -11.1   -2.31   0.021  *         Unstable 
13 Frequency RUM 2021–2022 -11.2  -11.5    1.49   0.135  n.s.      Stable   
14 Frequency RUM 2021–2023 -11.2  -11.1   -0.38   0.704  n.s.      Stable   
15 Frequency RUM 2022–2023 -11.5  -11.1   -1.83   0.0677 n.s.      Stable   
16 MDCEV         2018–2019  -3.45  -3.05  -1.70   0.0884 n.s.      Stable   
17 MDCEV         2018–2020  -3.45  -2.57  -3.56   0.0004 ***       Unstable 
18 MDCEV         2018–2021  -3.45  -2.79  -2.96   0.0031 **        Unstable 
19 MDCEV         2018–2022  -3.45  -2.84  -2.73   0.0063 **        Unstable 
20 MDCEV         2018–2023  -3.45  -2.82  -2.74   0.0061 **        Unstable 
21 MDCEV         2019–2020  -3.05  -2.57  -2.25   0.0246 *         Unstable 
22 MDCEV         2019–2021  -3.05  -2.79  -1.42   0.155  n.s.      Stable   
23 MDCEV         2019–2022  -3.05  -2.84  -1.16   0.248  n.s.      Stable   
24 MDCEV         2019–2023  -3.05  -2.82  -1.21   0.226  n.s.      Stable   
25 MDCEV         2020–2021  -2.57  -2.79   1.03   0.302  n.s.      Stable   
26 MDCEV         2020–2022  -2.57  -2.84   1.27   0.203  n.s.      Stable   
27 MDCEV         2020–2023  -2.57  -2.82   1.15   0.252  n.s.      Stable   
28 MDCEV         2021–2022  -2.79  -2.84   0.279  0.78   n.s.      Stable   
29 MDCEV         2021–2023  -2.79  -2.82   0.167  0.867  n.s.      Stable   
30 MDCEV         2022–2023  -2.84  -2.82  -0.101  0.920  n.s.      Stable   
31 Observed RUM  2018–2019 -13.0  -12.5   -1.29   0.198  n.s.      Stable   
32 Observed RUM  2018–2020 -13.0  -10.3   -7.83   0      ***       Unstable 
33 Observed RUM  2018–2021 -13.0  -11.6   -4.17   0      ***       Unstable 
34 Observed RUM  2018–2022 -13.0  -11.1   -5.68   0      ***       Unstable 
35 Observed RUM  2018–2023 -13.0  -10.9   -6.38   0      ***       Unstable 
36 Observed RUM  2019–2020 -12.5  -10.3   -6.49   0      ***       Unstable 
37 Observed RUM  2019–2021 -12.5  -11.6   -2.79   0.0053 **        Unstable 
38 Observed RUM  2019–2022 -12.5  -11.1   -4.26   0      ***       Unstable 
39 Observed RUM  2019–2023 -12.5  -10.9   -4.99   0      ***       Unstable 
40 Observed RUM  2020–2021 -10.3  -11.6    4.12   0      ***       Unstable 
41 Observed RUM  2020–2022 -10.3  -11.1    2.75   0.0059 **        Unstable 
42 Observed RUM  2020–2023 -10.3  -10.9    1.84   0.0661 n.s.      Stable   
43 Observed RUM  2021–2022 -11.6  -11.1   -1.55   0.122  n.s.      Stable   
44 Observed RUM  2021–2023 -11.6  -10.9   -2.40   0.0166 *         Unstable 
45 Observed RUM  2022–2023 -11.1  -10.9   -0.914  0.361  n.s.      Stable   

By park site

Show code
welfare_by_year <- final_comparison_master %>%
  filter(parkname != "Stay_Home") %>%
  group_by(parkname, year) %>%
  summarise(
    mean_mdcev = mean(loss_mdcev,        na.rm = TRUE),
    mean_obs   = mean(loss_rum_observed, na.rm = TRUE),
    mean_52wk  = mean(loss_rum_52wk,     na.rm = TRUE),
    .groups    = "drop"
  )

sequential_stability <- welfare_by_year %>%
  arrange(parkname, year) %>%
  group_by(parkname) %>%
  mutate(
    pct_change_mdcev = (mean_mdcev - lag(mean_mdcev)) / abs(lag(mean_mdcev)) * 100,
    pct_change_obs   = (mean_obs   - lag(mean_obs))   / abs(lag(mean_obs))   * 100,
    pct_change_52wk  = (mean_52wk  - lag(mean_52wk))  / abs(lag(mean_52wk))  * 100,
    year_pair        = paste0(lag(year), "→", year)
  ) %>%
  filter(!is.na(pct_change_mdcev)) %>%
  ungroup()

#Sequential year agfter year examination
sequential_pairs <- list(
  c(2018, 2019), c(2019, 2020), c(2020, 2021),
  c(2021, 2022), c(2022, 2023)
)

# Pct change by site 
pct_change_site <- welfare_by_year %>%
  arrange(parkname, year) %>%
  group_by(parkname) %>%
  mutate(
    pct_change_mdcev = (mean_mdcev - lag(mean_mdcev)) / abs(lag(mean_mdcev)) * 100,
    pct_change_obs   = (mean_obs   - lag(mean_obs))   / abs(lag(mean_obs))   * 100,
    pct_change_52wk  = (mean_52wk  - lag(mean_52wk))  / abs(lag(mean_52wk))  * 100
  ) %>%
  filter(!is.na(pct_change_mdcev))

# Average pct change across all sites and year pairs
pct_change_summary <- pct_change_site %>%
  group_by(parkname) %>%
  summarise(
    model = "MDCEV",
    mean_pct_change   = round(mean(abs(pct_change_mdcev), na.rm = TRUE), 1),
    median_pct_change = round(median(abs(pct_change_mdcev), na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  bind_rows(
    pct_change_site %>%
      summarise(
        model             = "Observed RUM",
        mean_pct_change   = round(mean(abs(pct_change_obs),  na.rm = TRUE), 1),
        median_pct_change = round(median(abs(pct_change_obs), na.rm = TRUE), 1)
      ),
    pct_change_site %>%
      summarise(
        model             = "Frequency RUM",
        mean_pct_change   = round(mean(abs(pct_change_52wk), na.rm = TRUE), 1),
        median_pct_change = round(median(abs(pct_change_52wk), na.rm = TRUE), 1)
      )
  )

print(pct_change_summary)
# A tibble: 66 × 4
   parkname                            model mean_pct_change median_pct_change
   <chr>                               <chr>           <dbl>             <dbl>
 1 Ainapo Cabin                        MDCEV            64.4              66.7
 2 Hipalau                             MDCEV            38.4              35  
 3 Kaluahaulu                          MDCEV            51.5              28.2
 4 Kamananui Trail                     MDCEV           156                66.4
 5 Kaunala Trail                       MDCEV            40.9              14.6
 6 Kawaikoi                            MDCEV            41.3              26.1
 7 Keanakolu Ranger & Bunkhouse Cabins MDCEV            38.1              25.6
 8 Kuaokala Trail                      MDCEV            41.9              25.2
 9 Kulanaahane Trail                   MDCEV            25.5              12  
10 Kuliouou Trail                      MDCEV            85.6              33.3
# ℹ 56 more rows
Show code
# Site level pct change table
pct_change_site %>%
  group_by(parkname) %>%
  summarise(
    mean_pct_mdcev = round(mean(abs(pct_change_mdcev), na.rm = TRUE), 1),
    mean_pct_obs   = round(mean(abs(pct_change_obs),   na.rm = TRUE), 1),
    mean_pct_52wk  = round(mean(abs(pct_change_52wk),  na.rm = TRUE), 1),
    .groups        = "drop"
  ) %>%
  arrange(mean_pct_mdcev) %>%
  print(n = 22)
# A tibble: 22 × 4
   parkname                            mean_pct_mdcev mean_pct_obs mean_pct_52wk
   <chr>                                        <dbl>        <dbl>         <dbl>
 1 Peacock Flats Campsite                        14.1         11.2           9.3
 2 Kulanaahane Trail                             25.5         31.8          26.9
 3 Keanakolu Ranger & Bunkhouse Cabins           38.1         56.7          55.3
 4 Hipalau                                       38.4         53.1          50.5
 5 Kaunala Trail                                 40.9         34.2          26.5
 6 Kawaikoi                                      41.3         32.8          32.2
 7 Kuaokala Trail                                41.9         27            19.3
 8 Wiliwili                                      45.6         32.6          32.3
 9 Kaluahaulu                                    51.5         52.8          48.5
10 Waimanu Campsite                              55.6         54.2          39.8
11 Maakua Ridge Trail                            57.3         35.7          29.3
12 Manana Trail                                  60.3         38.2          31.4
13 Waialae Cabin Campsite                        60.6         38.7          38.4
14 Sugi Grove                                    60.7         47.8          43.6
15 Ainapo Cabin                                  64.4         54.5          53.9
16 Kuliouou Trail                                85.6         48.4          39.8
17 Waikoali                                      87.9         75.4          68  
18 Lonomea                                      110.          62            57.5
19 Waimano Trail                                120.          77.3          67.2
20 Waikolu                                      128.          89.6          83.3
21 Kamananui Trail                              156           53.6          46.8
22 Wiliwilinui Ridge Trail                      178.          58.1          51.7

This looks at the linear welfare stability, which actually shows that MDCEV is less like we mention in paper but if we look pairwise comarision (i.e. which gives us aa really clear covid examination) across all years broadly it does have more stable. That is also interesting…

Show code
#| cache: true

site_change_ci <- sequential_stability %>%
  pivot_longer(
    cols      = c(pct_change_mdcev, pct_change_obs, pct_change_52wk),
    names_to  = "model",
    values_to = "pct_change"
  ) %>%
  mutate(
    model = case_when(
      model == "pct_change_mdcev" ~ "MDCEV",
      model == "pct_change_obs"   ~ "Observed RUM",
      model == "pct_change_52wk"  ~ "Frequency RUM"
    ),
    model = factor(model, levels = c("MDCEV", "Observed RUM", "Frequency RUM"))
  ) %>%
  group_by(model, year_pair) %>%
  summarise(
    mean_change = mean(pct_change,  na.rm = TRUE),
    sd_change   = sd(pct_change,    na.rm = TRUE),
    n           = n(),
    se          = sd_change / sqrt(n),
    ci_low      = mean_change - 1.96 * se,
    ci_high     = mean_change + 1.96 * se,
    .groups     = "drop"
  )

p_line <- ggplot(site_change_ci,
       aes(x     = year_pair,
           y     = mean_change,
           color = model,
           group = model)) +
  
  geom_hline(yintercept =  0,  linetype = "dashed",
             color = "gray40", linewidth = 0.4) +
  geom_hline(yintercept =  30, linetype = "dotted",
             color = "gray60", linewidth = 0.4) +
  geom_hline(yintercept = -30, linetype = "dotted",
             color = "gray60", linewidth = 0.4) +

  
  geom_ribbon(
    aes(ymin = ci_low, ymax = ci_high, fill = model),
    alpha = 0.15, color = NA
  ) +
  geom_line(linewidth = 0.9) +
  geom_point(size = 2.5) +
  
  scale_color_manual(
    values = c("MDCEV"         = "gray10",
               "Observed RUM"  = "gray50",
               "Frequency RUM" = "gray75"),
    name = "Model"
  ) +
  scale_fill_manual(
    values = c("MDCEV"         = "gray10",
               "Observed RUM"  = "gray50",
               "Frequency RUM" = "gray75"),
    guide  = "none"
  ) +
  scale_x_discrete(name = "Year Pair") +    # <-- fix here
  scale_y_continuous(
    name   = "Mean % Change Across Sites",
    labels = scales::percent_format(scale = 1)
  ) +
  labs(
    title    = "Sequential Welfare Stability Across Models",
    subtitle = "Mean % change across all sites | 95% CI "
  ) +
  theme_minimal(base_family = "Times New Roman") +
  theme(
    panel.grid.minor   = element_blank(),
    panel.grid.major.x = element_blank(),
    axis.text.x  = element_text(size = 11, angle = 20, hjust = 1),
    axis.text.y  = element_text(size = 11),
    legend.position  = "bottom",
    legend.title = element_text(face = "bold", size = 12),
    legend.text  = element_text(size = 11),
    plot.title   = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 11, color = "gray40")
  )

print(p_line)

Source Code
---
title: "Unequal Recreational Losses? Revealed Preference Evidence of Campsite Closures in Hawaiʻi"
format:
  html:
    code-fold: true        # Enables dropdown for code
    code-tools: true       # (Optional) Adds buttons like "Show Code"
    code-summary: "Show code"  # (Optional) Custom label for dropdown
    toc: true
    toc-location: left
    page-layout: full
editor: visual
---

This site provides a walk through on the results from Unequal Recreational Losses? Revealed Preference Evidence of Campsite Closures in Hawaiʻi. This tab covers the comparison by loading the data from the models. Tabs MDCEV and RUM are the codes that will run each model. 

# Data

## Libraries

```{r, message=FALSE}


library(dplyr)
library(tidyr)
library(haven)
library(readr)
library(mlogit)
library(lmtest)
library(purrr)
library(stringr)
library(readr)
library(ggplot2)
library(stargazer)
library(readxl)
library(patchwork)
library(broom)
library(tidyverse)
library(sf)
library(tigris)
library(ggrepel)
library(ggspatial)
library(ggpattern)


final_comparison_master=read_csv("data/final_comparison_master.csv")


final_data_2018 <- read_csv("data/final_data_2018.csv")
final_data_2019 <- read_csv("data/final_data_2019.csv")
final_data_2020 <- read_csv("data/final_data_2020.csv")
final_data_2021 <- read_csv("data/final_data_2021.csv")
final_data_2022 <- read_csv("data/final_data_2022.csv")
final_data_2023 <- read_csv("data/final_data_2023.csv")

model_list_obs <- readRDS("data/model_list_obs.rds")
model_list_52 <- readRDS("data/model_list_52.rds")

```



## Welfare Table

```{r}
#| cache: true

# Calculate Mean and 95% Confidence Intervals
summary_wide <- final_comparison_master %>%
  filter(year %in% c(2018, 2023)) %>%
  group_by(parkname, year) %>%
  summarise(
    n = n(),
    mean_val = mean(loss_rum_observed, na.rm = TRUE),
    sd_val   = sd(loss_rum_observed, na.rm = TRUE),
    # Calculate Standard Error and 95% CI bounds
    se_val   = sd_val / sqrt(n),
    LCI      = mean_val - (1.96 * se_val),
    UCI      = mean_val + (1.96 * se_val),
    .groups = "drop"
  ) %>%
  select(parkname, year, mean_val, LCI, UCI) %>%
  pivot_wider(
    names_from = year, 
    values_from = c(mean_val, LCI, UCI),
    names_glue = "{year}_{.value}" 
  ) %>%
  # 2018 Stats then 2023 Stats
  select(parkname, 
         contains("2018"), 
         contains("2023")) %>%
  as.data.frame()

```

```{r}
#| cache: true
#| eval: false

stargazer(summary_wide, 
          type = "html", 
          out = "tables/Welfare_Comparison_Table.html",
          summary = FALSE, 
          rownames = FALSE,
          title = "Welfare Loss: Mean and 95% Confidence Intervals (2018 vs 2023)",
          digits = 2,
          # Groups the 3 columns under each year
          column.labels = c("Park Name", "2018", "2023"),
          column.separate = c(1, 3, 3), 
          covariate.labels = c("Park", 
                               "Mean", "Lower CI", "Upper CI", 
                               "Mean", "Lower CI", "Upper CI"))



# Calculate Mean and 95% Confidence Intervals
summary_wide <- final_comparison_master %>%
  filter(year %in% c(2018, 2023)) %>%
  group_by(parkname, year) %>%
  summarise(
    n = n(),
    mean_val = mean(loss_rum_52wk, na.rm = TRUE),
    sd_val   = sd(loss_rum_52wk, na.rm = TRUE),
    # Calculate Standard Error and 95% CI bounds
    se_val   = sd_val / sqrt(n),
    LCI      = mean_val - (1.96 * se_val),
    UCI      = mean_val + (1.96 * se_val),
    .groups = "drop"
  ) %>%
  select(parkname, year, mean_val, LCI, UCI) %>%
  pivot_wider(
    names_from = year, 
    values_from = c(mean_val, LCI, UCI),
    names_glue = "{year}_{.value}" 
  ) %>%
  # 2018 Stats then 2023 Stats
  select(parkname, 
         contains("2018"), 
         contains("2023")) %>%
  as.data.frame()


```

```{r}
#| cache: true
#| eval: false
stargazer(summary_wide, 
          type = "html", 
          out = "tables/Welfare_Comparison_Table_frequency.html",
          summary = FALSE, 
          rownames = FALSE,
          title = "Welfare Loss Frequency: Mean and 95% Confidence Intervals (2018 vs 2023)",
          digits = 2,
          # Groups the 3 columns under each year
          column.labels = c("Park Name", "2018", "2023"),
          column.separate = c(1, 3, 3), 
          covariate.labels = c("Park", 
                               "Mean", "Lower CI", "Upper CI", 
                               "Mean", "Lower CI", "Upper CI"))
```

```{r}
#| cache: true
#| message: false
#| warning: false
#| code-fold: true
#| code-summary: "Show the Code"


# Calculate Mean and 95% Confidence Intervals
summary_wide <- final_comparison_master %>%
  filter(year %in% c(2019, 2020)) %>%
  group_by(parkname, year) %>%
  summarise(
    n = n(),
    mean_val = mean(loss_rum_52wk, na.rm = TRUE),
    sd_val   = sd(loss_rum_52wk, na.rm = TRUE),
    # Calculate Standard Error and 95% CI bounds
    se_val   = sd_val / sqrt(n),
    LCI      = mean_val - (1.96 * se_val),
    UCI      = mean_val + (1.96 * se_val),
    .groups = "drop"
  ) %>%
  select(parkname, year, mean_val, LCI, UCI) %>%
  pivot_wider(
    names_from = year, 
    values_from = c(mean_val, LCI, UCI),
    names_glue = "{year}_{.value}" 
  ) %>%
  # 2018 Stats then 2023 Stats
  select(parkname, 
         contains("2019"), 
         contains("2020")) %>%
  as.data.frame()

```

```{r}
#| cache: true
#| eval: false

stargazer(summary_wide, 
          type = "html", 
          out = "tables/Welfare_Comparison_Table_frequency_1920.html",
          summary = FALSE, 
          rownames = FALSE,
          title = "Welfare Loss Frequency: Mean and 95% Confidence Intervals (2019 & 2020)",
          digits = 2,
          # Groups the 3 columns under each year
          column.labels = c("Park Name", "2019", "2020"),
          column.separate = c(1, 3, 3), 
          covariate.labels = c("Park", 
                               "Mean", "Lower CI", "Upper CI", 
                               "Mean", "Lower CI", "Upper CI"))
```

```{r}
#| cache: true


# ── Build summary for both models ─────────────────────────────────────────────
calc_summary <- function(data, loss_col, years) {
    data %>%
        filter(year %in% years, parkname != "Stay_Home") %>%
        group_by(parkname, year) %>%
        summarise(
            n        = n(),
            mean_val = mean(.data[[loss_col]],  na.rm = TRUE),
            sd_val   = sd(.data[[loss_col]],    na.rm = TRUE),
            se_val   = sd_val / sqrt(n),
            LCI      = mean_val - 1.96 * se_val,
            UCI      = mean_val + 1.96 * se_val,
            .groups  = "drop"
        ) %>%
        select(parkname, year, mean_val, LCI, UCI)
}

# ── Frequency RUM ─────────────────────────────────────────────────────────────
freq_wide <- calc_summary(final_comparison_master, "loss_rum_52wk", c(2021, 2022)) %>%
    pivot_wider(
        names_from  = year,
        values_from = c(mean_val, LCI, UCI),
        names_glue  = "{year}_freq_{.value}"
    )

# ── Observed RUM ──────────────────────────────────────────────────────────────
obs_wide <- calc_summary(final_comparison_master, "loss_rum_observed", c(2021, 2022)) %>%
    pivot_wider(
        names_from  = year,
        values_from = c(mean_val, LCI, UCI),
        names_glue  = "{year}_obs_{.value}"
    )

# ── Join both models ──────────────────────────────────────────────────────────
summary_wide <- freq_wide %>%
    left_join(obs_wide, by = "parkname") %>%
    select(
        parkname,
        # 2019 — Observed then Frequency
        `2021_obs_mean_val`, `2021_obs_LCI`, `2021_obs_UCI`,
        `2021_freq_mean_val`, `2021_freq_LCI`, `2021_freq_UCI`,
        # 2020 — Observed then Frequency
        `2022_obs_mean_val`, `2022_obs_LCI`, `2022_obs_UCI`,
        `2022_freq_mean_val`, `2022_freq_LCI`, `2022_freq_UCI`
    ) %>%
    arrange(parkname) %>%
    as.data.frame()

# ── Export ────────────────────────────────────────────────────────────────────
stargazer(
    summary_wide,
    type       = "html",
    out        = "tables/Welfare_Comparison_Table_frequency_2122.html",
    summary    = FALSE,
    rownames   = FALSE,
    title      = "Welfare Loss: Observed and Frequency RUM — Mean and 95% CI (2021 & 2022)",
    digits     = 2,
    # 4 groups: 2020 Observed, 2020 Frequency, 2022 Observed, 2022 Frequency
    column.labels    = c("Park Name",
                         "2021 Observed", "2021 Frequency",
                         "2022 Observed", "2022 Frequency"),
    column.separate  = c(1, 3, 3, 3, 3),
    covariate.labels = c("Park",
                         "Mean", "Lower CI", "Upper CI",   # 2019 Observed
                         "Mean", "Lower CI", "Upper CI",   # 2019 Frequency
                         "Mean", "Lower CI", "Upper CI",   # 2020 Observed
                         "Mean", "Lower CI", "Upper CI"),  # 2020 Frequency
    notes       = "95% confidence intervals. Observed RUM conditions welfare on observed trips. Frequency RUM uses 52 weekly choice occasions.",
    notes.align = "l"
)

message("Saved to Welfare_Comparison_Table_frequency_2122.html")
```

#### Plot

So this model takes the park mean of the welfare per person and shows the mean per park across years. The stability shows that will including a frequency trip in RUM shows the highest variation, the RUM opt out is more concise and then the MDCEV is always in the middle.

```{r}
#| cache: true

# Summarize and Reshape the data
plot_data <- final_comparison_master %>%
  group_by(parkname, year) %>%
  filter(parkname!="Stay_Home")%>%
  summarise(
    ObservedRUM = mean(loss_rum_observed, na.rm = TRUE),
    FrequencyRUM = mean(loss_rum_52wk, na.rm = TRUE),
    MDCEV = mean(loss_mdcev, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  pivot_longer(
    cols = c(ObservedRUM, FrequencyRUM, MDCEV),
    names_to = "Model_Type",
    values_to = "Mean_Welfare_Loss"
  )


# Create the Boxplot 
p=ggplot(plot_data, aes(x = factor(year), y = Mean_Welfare_Loss, fill = Model_Type)) +
  # Classic boxplot with black outlines
  geom_boxplot(outlier.shape = 21, outlier.fill = "white", color = "black", alpha = 0.8) +
  
  # Labels
  labs(
    title = "Distribution of Mean Park Welfare Loss from Site Closures by Year",
    subtitle = "Comparing Observed RUM and MDCEV Model",
    x = "Year",
    y = "Average Welfare Loss Across sites ($)",
    fill = "Model Type"
  ) +
  
  # Colors
scale_fill_manual(values = c(
    "ObservedRUM"  = "#D9D9D9",   # Light Grey
    "MDCEV"        = "#252525"    # Dark Grey / Black
  )) +
  
  # Theme Customization
  theme_bw() + # Starting with theme_bw gives a clean white background
  theme(
    # Set all text to Times New Roman (serif)
    text = element_text(family = "serif", size =20),
    
    # Title and axis styling
    plot.title = element_text(face = "bold", size =26, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.title = element_text(face = "bold"),
    axis.text = element_text(color = "black"),
    
    # Remove gridlines for the "Classic/Blank" look
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    
    # Legend styling
    legend.position = "bottom",
    legend.background = element_blank(),
    
    # Classic full border (Using 'size' instead of 'linewidth' for compatibility)
    panel.border = element_rect(colour = "black", fill = NA, size = 1)
  )

p
ggsave("images/comparision_perperson_2models.png", plot = p,width=12, height=10, dpi=300)


```

## Per participant

```{r}
#| cache: true
full_year_clean=rbind(final_data_2018,final_data_2019,final_data_2020,final_data_2021,final_data_2022,final_data_2023)
# Identify participants by ensuring IDs match as characters
participants_only_welfare <- final_comparison_master %>%
    filter(parkname!="Stay_Home")%>%
  left_join(
    # Convert ID to character on the fly so the join works
    full_year_clean %>% 
      select(id, parkname, year, VisitCount), 
    by = c("id", "parkname", "year")
  ) %>%
  # Filter for people who actually visited the park in question
  filter(VisitCount > 0)

# Summary for the "Participants Only" group
participants_summary_table <- participants_only_welfare %>%
  group_by(year, parkname) %>%
  summarise(
    avg_loss_obs_participant   = mean(loss_rum_observed, na.rm = TRUE),
    avg_loss_52wk_participant  = mean(loss_rum_52wk, na.rm = TRUE),
    avg_loss_mdcev_participant = mean(loss_mdcev, na.rm = TRUE),
    num_participants           = n(),
    .groups = "drop"
  )
```

#### Plot

```{r}
#| cache: true
# Reshape the Participants Summary for plotting
plot_data_participants <- participants_summary_table %>%
  # Select the three mean columns we created earlier
  select(year, parkname, 
         ObservedRUM = avg_loss_obs_participant, 
         FrequencyRUM = avg_loss_52wk_participant, 
         MDCEV = avg_loss_mdcev_participant) %>%
  pivot_longer(
    cols = c(ObservedRUM, FrequencyRUM, MDCEV),
    names_to = "Model_Type",
    values_to = "Mean_Welfare_Loss"
  )

#  Create the Grayscale Boxplot
participants=ggplot(plot_data_participants, aes(x = factor(year), y = Mean_Welfare_Loss, fill = Model_Type)) +
  # Standard boxplot with black outlines and white outliers
  geom_boxplot(outlier.shape = 21, outlier.fill = "white", color = "black", alpha = 1) +
  
  # Labels
  labs(
    title = "Mean Annual Welfare Loss: Active Participants Only",
    subtitle = "Calculated for individuals with Observed Visits (VisitCount > 0)",
    x = "Year",
    y = "Mean Annual Welfare Loss ($/User)",
    fill = "Model Type"
  ) +
  
  # Shades of Grey
  scale_fill_manual(values = c(
    "ObservedRUM"  = "#D9D9D9",   # Light Grey
    "FrequencyRUM" = "#737373",   # Medium Grey
    "MDCEV"        = "#252525"    # Dark Grey / Black
  )) +
  
  # Theme Customization (Blank Background + Times New Roman)
  theme_bw() + 
  theme(
    text = element_text(family = "serif", size = 20),
    plot.title = element_text(face = "bold", size = 26, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    axis.title = element_text(face = "bold"),
    axis.text = element_text(color = "black"),
    
    # Remove all gridlines for a completely blank white background
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    
    # Legend styling
    legend.position = "bottom",
    legend.background = element_blank(),
    
    # Classic full black border around the plot (using size for compatibility)
    panel.border = element_rect(colour = "black", fill = NA, size = 1)
  )

ggsave("images/comparision_perparticipate.png",plot=participants, width=12, height=10, dpi=300)

participants

```

### EJ

Now that we have the per person welfare across individuals we can link the zip to a location with a higher EJ score and see how this spread spills over to EJ


EJScreen, developed by the U.S. Environmental Protection Agency (EPA), is a screening tool that provides percentile rankings for environmental and demographic indicators, such as air pollution, proximity to hazardous sites, poverty, and limited English proficiency, across census tracts in the U.S. It is primarily used for identifying areas with potential environmental justice concerns, but it does not designate any community as "disadvantaged."

Instead, it enables comparisons across regions and supports risk assessments, permitting decisions, and community advocacy. In contrast, the Climate and Economic Justice Screening Tool (CEJST), developed by the White House Council on Environmental Quality (CEQ), is designed specifically to identify disadvantaged communities (DACs) eligible for benefits under the Justice40 Initiative, which aims to direct 40% of federal climate and infrastructure investments to these communities. CEJST is a binary approach labeling each census tract as either disadvantaged or not. While EJScreen is more detailed and flexible for analysis, CEJST provides a clear policy designation to guide funding and investment.

We use the [**State Percentiles**]{.underline} here.

The U.S. percentiles are computed based on the block groups nationwide. The state percentiles are calculated within each individual state. EJScreen focuses on the U.S. percentiles as a way to visualize all results in common units. The U.S. percentile uses the U.S. block groups as the basis of comparison. The state percentile was also calculated based on the block groups of a given state (or the District of Columbia or Puerto Rico). The national or state mean value was calculated as the average of the block groups with data for that indicator, within the respective geographic scope.

#### Social Vulnerabilities

Using EJScreen gives us flexibility where we can both examine the social factors and environmental factors. Here we will look at the social vulnerabilities of 90th percentiles within the state who are identified using :

-   Percentile for Demographic Index

-   Percentile for Supplemental Demographic Index

-   Percentile for % people of color

-   Percentile for % low income

-   Percentile for % unemployed

-   Percentile for % persons with disabilities

-   Percentile for % limited English speaking

-   Percentile for % less than high school education

-   Percentile for % under age 5

-   Percentile for % over age 64

-   Percentile for Low Life Expectancy


### Environmental Vulnerabilities

Here we will look at the environmental hazard of 90th percentiles within the state who are identified using :

-   -Percentile for Particulate Matter 2.5

-   Percentile for Ozone

-   Percentile for Diesel particulate matter

-   Percentile for Toxic Releases to Air

-   Percentile for Traffic proximity

-   Percentile for Lead paint

-   Percentile for Superfund proximity

-   Percentile for RMP facility proximity

-   Percentile for Hazardous waste proximity

-   Percentile for Underground storage tanks

-   Percentile for Wastewater discharge

-   Percentile for Nitrogen Dioxide (NO2)

-   Percentile for Drinking Water Non-Compliance

-   Percentile for Particulate Matter 2.5

-   Percentile for Ozone EJ Index

-   Percentile for Diesel particulate matter EJ Index

-   Percentile for Toxic Releases to Air EJ Index

-   Percentile for Traffic proximity EJ Index

-   Percentile for Lead paint EJ Index

-   Percentile for Superfund proximity EJ Index

-   Percentile for RMP Facility Proximity EJ Index

-   Percentile for Hazardous waste proximity EJ Index

-   Percentile for Underground storage tanks EJ Index

-   Percentile for Wastewater discharge EJ Index

-   Percentile for Nitrogen Dioxide (NO2) EJ Index

-   Percentile for Drinking Water Non-Compliance EJ Index





```{r}
#| cache: true
# Hud data

ZIP_TRACT_122025 <- read_excel("data/ZIP_TRACT_122025.xlsx")

# only hawaii

dishawaii <- readRDS("data/dishawaii.rds")

# clean tract
dishawaii <- dishawaii %>%
    mutate(TRACT = str_pad(as.character(ID), 11, "left", "0"))

# This taking the tact by zip code filtering hawaii  and making sure tract and zip in write order
# THIS is where you need to make changes
# It should be in an earlier chunk in your script

dishawaii_tract <- dishawaii %>%
  rowwise() %>%
  mutate(
    # ── CHANGE 1: threshold from >= 80 to >= 90 ───────────────────────────
    env_burden = any(c_across(c(
      P_PM25, P_OZONE, P_DSLPM, P_RSEI_AIR, P_PTRAF,
      P_LDPNT, P_PNPL, P_PRMP, P_PTSDF, P_UST, P_PWDIS,
      P_NO2, P_DWATER,
      P_D2_PM25, P_D5_PM25, P_D2_OZONE, P_D5_OZONE,
      P_D2_DSLPM, P_D5_DSLPM, P_D2_RSEI_AIR, P_D5_RSEI_AIR,
      P_D2_PTRAF, P_D5_PTRAF, P_D2_LDPNT, P_D5_LDPNT,
      P_D2_PNPL, P_D5_PNPL, P_D2_PRMP, P_D5_PRMP,
      P_D2_PTSDF, P_D5_PTSDF, P_D2_UST, P_D5_UST,
      P_D2_PWDIS, P_D5_PWDIS, P_D2_NO2, P_D5_NO2,
      P_D2_DWATER, P_D5_DWATER
    )) >= 90,   # <-- CHANGE THIS NUMBER (80 or 90)
    na.rm = TRUE),

    soc_vulnerable = any(c_across(c(
      P_PEOPCOLORPCT, P_LOWINCPCT, P_UNEMPPCT, P_DISABILITYPCT,
      P_LINGISOPCT, P_LESSHSPCT, P_UNDER5PCT, P_OVER64PCT, P_LIFEEXPPCT
    )) >= 90,   # <-- CHANGE THIS NUMBER (80 or 90)
    na.rm = TRUE),

    # ── CHANGE 2: AND vs OR logic ─────────────────────────────────────────
    DAC_flag = as.integer(env_burden | soc_vulnerable)
    #                               ^ change & to | for OR logic
  ) %>%
  ungroup()


hud_crosswalk <- ZIP_TRACT_122025 %>%
    rename_with(toupper) %>% 
    mutate(
        # Ensure standard 11-digit Tract and 5-digit ZIP
        TRACT = str_pad(as.character(TRACT), 11, "left", "0"),
        ZIP   = str_pad(as.character(ZIP), 5, "left", "0")
    ) %>%
    # Filter for Hawaii only: 
    # Either by ZIP prefix (967/968) OR Census Tract State prefix (15)
    filter(str_starts(TRACT, "15"))


#Define thursholds into the 90s
dishawaii_zip <- hud_crosswalk %>%
  left_join(dishawaii_tract, by = "TRACT") %>%
  group_by(ZIP) %>%
  summarise(
    
    # ── Binary population share flags ────────────────────────────────────────
    pct_pop_in_DAC  = sum(TOT_RATIO[DAC_flag == 1],          na.rm = TRUE),
    pct_pop_in_Envi = sum(TOT_RATIO[env_burden == TRUE],     na.rm = TRUE),
    pct_pop_in_Soci = sum(TOT_RATIO[soc_vulnerable == TRUE], na.rm = TRUE),
    
    # ── Environmental composite ───────────────────────────────────────────────
    # Use P_ columns — state percentiles, all on 0-100 scale
    # Excludes PM25 and OZONE (limited Hawaii data)
    env_composite = weighted.mean(
      rowMeans(cbind(
        P_DSLPM,    P_RSEI_AIR, P_PTRAF,  P_LDPNT,
        P_PNPL,     P_PRMP,     P_PTSDF,  P_UST,
        P_PWDIS,    P_NO2,      P_DWATER
      ), na.rm = TRUE),
      TOT_RATIO, na.rm = TRUE
    ),
    
    # ── Social vulnerability composite ────────────────────────────────────────
    # All 9 social P_ indicators — state percentiles
    soci_composite = weighted.mean(
      rowMeans(cbind(
        P_PEOPCOLORPCT, P_LOWINCPCT,    P_UNEMPPCT,
        P_DISABILITYPCT, P_LINGISOPCT,  P_LESSHSPCT,
        P_UNDER5PCT,    P_OVER64PCT,    P_LIFEEXPPCT
      ), na.rm = TRUE),
      TOT_RATIO, na.rm = TRUE
    ),
    
    # ── Also store the index scores EJScreen already computed ─────────────────
    # DEMOGIDX_2 is EJScreen's own demographic index — useful as a check
    demog_index = weighted.mean(P_DEMOGIDX_2, TOT_RATIO, na.rm = TRUE),
    
    total_ratio = sum(TOT_RATIO, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    
    # ── Binary flags ──────────────────────────────────────────────────────────
    is_disadvantaged_50      = ifelse(pct_pop_in_DAC  >= 1, 1, 0),
    is_disadvantaged_50_envi = ifelse(pct_pop_in_Envi >= 1, 1, 0),
    is_disadvantaged_50_soci = ifelse(pct_pop_in_Soci >= 1, 1, 0),
    
    # ── Environmental percentile blocks ───────────────────────────────────────
    # High env_pctile
    env_pctile = ntile(env_composite, 100),
    env_block  = cut(env_pctile,
      breaks = c(0, 20, 40, 60, 80, 90, 95, 100),
      labels = c(
        "0-20th\n(Lowest)",  "20-40th", "40-60th",
        "60-80th", "80-90th", "90-95th", "95-100th\n(Highest)"
      ),
      include.lowest = TRUE
    ),
    
    # ── Social vulnerability percentile blocks ────────────────────────────────
    # High soci_pctile = high vulnerability = most vulnerable
    soci_pctile = ntile(soci_composite, 100),
    soci_block  = cut(soci_pctile,
      breaks = c(0, 20, 40, 60, 80, 90, 95, 100),
      labels = c(
        "0-20th\n(Lowest)",  "20-40th", "40-60th",
        "60-80th", "80-90th", "90-95th", "95-100th\n(Highest)"
      ),
      include.lowest = TRUE
    )
  )

# ── Verification checks ───────────────────────────────────────────────────────

# 1. Composite scores should increase with block
dishawaii_zip %>%
  group_by(env_block) %>%
  summarise(
    mean_env  = round(mean(env_composite,  na.rm = TRUE), 1),
    mean_flag = round(mean(pct_pop_in_Envi, na.rm = TRUE), 2),
    n         = n()
  ) %>% arrange(env_block)

dishawaii_zip %>%
  group_by(soci_block) %>%
  summarise(
    mean_soci = round(mean(soci_composite,  na.rm = TRUE), 1),
    mean_flag = round(mean(pct_pop_in_Soci, na.rm = TRUE), 2),
    n         = n()
  ) %>% arrange(soci_block)

# 2. Check coverage
dishawaii_zip %>%
  summarise(
    total_zips      = n(),
    env_complete    = sum(!is.na(env_composite)),
    soci_complete   = sum(!is.na(soci_composite)),
    total_ratio_avg = round(mean(total_ratio), 2)  # should be close to 1.0
  )

# ── Join to welfare data ──────────────────────────────────────────────────────
final_comparison_master1 <- final_comparison_master %>%
  left_join(
    dishawaii_zip %>%
      mutate(ZIP = as.numeric(ZIP)) %>%   # convert ZIP to numeric here
      select(ZIP,
             env_composite,  env_pctile,  env_block,
             soci_composite, soci_pctile, soci_block,
             demog_index,
             pct_pop_in_DAC, pct_pop_in_Envi, pct_pop_in_Soci,
             is_disadvantaged_50, is_disadvantaged_50_envi, is_disadvantaged_50_soci),
    by = c("zip" = "ZIP")
  )


```

# Income

```{r}
#| cache: true
final_comparison_master1 <- final_comparison_master1 %>%
  mutate(
    income_pctile = 101 - ntile(income, 100),
    
    income_block = cut(income_pctile,
      breaks = c(0, 20, 40, 60, 80, 90, 95, 100),
      labels = c(
        "0-20th\n(Lowest)",  "20-40th",  "40-60th",
        "60-80th", "80-90th", "90-95th", "95-100th\n(Highest)"
      ),
      include.lowest = TRUE
    )
  )

avg_data <- final_comparison_master1 %>%
  filter(year %in% c(2018, 2023), parkname !="Stay_Home") %>%
  pivot_longer(
    cols      = c(loss_mdcev, loss_rum_observed,loss_rum_52wk),
    names_to  = "model",
    values_to = "loss"
  ) %>%
  mutate(
    model = recode(model,
      "loss_mdcev"        = "MDCEV",
      "loss_rum_observed" = "Observed RUM",
      "loss_rum_52wk" = "Frequency RUM"
    )
  ) %>%
  group_by(income_block, model, year) %>%
  summarise(
    mean_loss = mean(loss,           na.rm = TRUE),
    se        = sd(loss, na.rm = TRUE) / sqrt(n()),
    p25       = quantile(loss, 0.25, na.rm = TRUE),
    p75       = quantile(loss, 0.75, na.rm = TRUE),
    n         = n(),
    .groups   = "drop"
  ) %>%
  mutate(
    ci_low  = mean_loss - 1.96 * se,
    ci_high = mean_loss + 1.96 * se,
    year    = factor(year, labels = c("2018", "2023"))
  )


avg_data %>% distinct(income_block) %>% arrange(income_block)

# ── Shade the 90th+ blocks  ──────────────────
vulnerable_positions <- avg_data %>%
  filter(grepl("90-95th|95-100th", as.character(income_block))) %>%
  mutate(block_num = as.numeric(income_block)) %>%
  distinct(block_num) %>%
  pull(block_num)

shade_start <- min(vulnerable_positions) - 0.5
shade_end   <- max(vulnerable_positions) + 0.5

# ── Plot ──────────────────────────────────────────────────────────────────────
# ── Step 1: Build each indicator separately with identical structure ──────────

# Income — already built, just add indicator column
income_avg <- final_comparison_master1 %>%
    filter(year %in% c(2018, 2023), !is.na(income_block), parkname !="Stay_Home") %>%
    pivot_longer(
        cols      = c(loss_mdcev, loss_rum_observed,loss_rum_52wk ),
        names_to  = "model",
        values_to = "loss"
    ) %>%
    mutate(
        model        = recode(model,
                              "loss_mdcev"        = "MDCEV",
                              "loss_rum_observed" = "Observed RUM",
      "loss_rum_52wk" = "Frequency RUM"),
        pctile_block = income_block,
        indicator    = "Income\n(Least → Most Vulnerable)"
    )

# Environmental burden — high pctile = most vulnerable = already RIGHT
env_avg <- final_comparison_master1 %>%
    filter(year %in% c(2018, 2023), !is.na(env_block), parkname !="Stay_Home") %>%
    pivot_longer(
        cols      = c(loss_mdcev, loss_rum_observed,loss_rum_52wk),
        names_to  = "model",
        values_to = "loss"
    ) %>%
    mutate(
        model        = recode(model,
                              "loss_mdcev"        = "MDCEV",
                              "loss_rum_observed" = "Observed RUM",
      "loss_rum_52wk" = "Frequency RUM"),
        pctile_block = env_block,
        indicator    = "Environmental Burden\n(Least → Most Vulnerable)"
    )

# Social vulnerability — high pctile = most vulnerable = already RIGHT
soci_avg <- final_comparison_master1 %>%
    filter(year %in% c(2018, 2023), !is.na(soci_block), parkname !="Stay_Home") %>%
    pivot_longer(
        cols      = c(loss_mdcev, loss_rum_observed,loss_rum_52wk),
        names_to  = "model",
        values_to = "loss"
    ) %>%
    mutate(
        model        = recode(model,
                              "loss_mdcev"        = "MDCEV",
                              "loss_rum_observed" = "Observed RUM",
      "loss_rum_52wk" = "Frequency RUM"),
        pctile_block = soci_block,
        indicator    = "Social Vulnerability\n(Least → Most Vulnerable)"
    )

# ── Step 2: Combine and summarise ─────────────────────────────────────────────
all_avg <- bind_rows(income_avg, env_avg, soci_avg) %>%
    mutate(
        year = factor(year, labels = c("2018", "2023")),
        indicator = factor(indicator, levels = c(
            "Income\n(Least → Most Vulnerable)",
            "Social Vulnerability\n(Least → Most Vulnerable)",
            "Environmental Burden\n(Least → Most Vulnerable)"
        ))
    ) %>%
    group_by(indicator, pctile_block, model, year) %>%
    summarise(
        mean_loss = mean(loss,           na.rm = TRUE),
        se        = sd(loss, na.rm = TRUE) / sqrt(n()),
        p25       = quantile(loss, 0.25, na.rm = TRUE),
        p75       = quantile(loss, 0.75, na.rm = TRUE),
        n         = n(),
        .groups   = "drop"
    ) %>%
    mutate(
        ci_low  = mean_loss - 1.96 * se,
        ci_high = mean_loss + 1.96 * se
    )

# ── Step 3: Shading positions — 90th+ blocks are RIGHT in all three ───────────
vulnerable_positions <- all_avg %>%
    filter(grepl("90-95th|95-100th", as.character(pctile_block))) %>%
    mutate(block_num = as.numeric(pctile_block)) %>%
    distinct(block_num) %>%
    pull(block_num)

shade_start <- min(vulnerable_positions) - 0.5
shade_end   <- max(vulnerable_positions) + 0.5
```

```{r}
#| cache: true
# ── Step 4: Plot ──────────────────────────────────────────────────────────────
ggplot(all_avg,
       aes(x     = pctile_block,
           y     = mean_loss,
           color = model,
           group = model)) +
  
  geom_rect(
    data = tibble(xmin = shade_start, xmax = shade_end),
    aes(xmin = xmin, xmax = xmax, ymin = -Inf, ymax = Inf),
    fill        = "gray90",
    alpha       = 0.5,
    inherit.aes = FALSE
  ) +
  
  geom_errorbar(
    aes(ymin = ci_low, ymax = ci_high),
    width     = 0.2,
    linewidth = 0.4,
    position  = position_dodge(0.4)   # slightly wider dodge for 3 models
  ) +
  
  geom_line(linewidth = 0.8,  position = position_dodge(0.4)) +
  geom_point(size = 2.5,      position = position_dodge(0.4)) +
  
  geom_hline(
    yintercept = 0, linetype = "dashed",
    color = "gray40", linewidth = 0.4
  ) +
  
  geom_text(
    data = tibble(
      pctile_block = factor("90-95th",
                            levels = levels(all_avg$pctile_block)),
      mean_loss    = -2,
      year         = factor("2018"),
      indicator    = factor("Income\n(Least → Most Vulnerable)",
                            levels = levels(all_avg$indicator))
    ),
    aes(x = pctile_block, y = mean_loss),
    label       = "90th+\nVulnerable",
    inherit.aes = FALSE,
    size        = 3.5,
    color       = "gray40",
    hjust       = 0.5,
    family      = "Times New Roman"
  ) +
  
  facet_grid(indicator ~ year) +
  
  # ── Three model grayscale ─────────────────────────────────────────────────
  scale_color_manual(
    values = c(
      "MDCEV"         = "gray10",
      "Observed RUM"  = "gray55",
      "Frequency RUM" = "gray80"    # lightest gray for third model
    ),
    name = "Model"
  ) +
  scale_linetype_manual(
    values = c(
      "MDCEV"         = "solid",
      "Observed RUM"  = "solid",
      "Frequency RUM" = "dashed"    # dashed helps distinguish in grayscale
    ),
    name = "Model"
  ) +
  aes(linetype = model) +           # add linetype aesthetic
  
  scale_y_continuous(
    labels = scales::dollar_format(prefix = "$"),
    name   = "Mean Welfare Loss"
  ) +
  
  labs(
    x     = "Percentile Block",
    title = "Mean Welfare Loss Across Distributions"
  ) +
  
  theme_minimal(base_family = "Times New Roman") +
  theme(
    panel.grid.minor   = element_blank(),
    panel.grid.major.x = element_blank(),
    axis.text.x   = element_text(size = 16, angle = 20,
                                 hjust = 1, lineheight = 0.8),
    axis.text.y   = element_text(size = 20),
    strip.text.x  = element_text(face = "bold", size = 16),
    strip.text.y  = element_text(face = "bold", size = 20,
                                 angle = 0, hjust = 0),
    legend.position = "bottom",
    legend.title  = element_text(face = "bold", size = 20),
    legend.text   = element_text(size = 20),
    plot.title    = element_text(face = "bold", size = 20),
    panel.spacing = unit(1.2, "lines")
  )
ggsave("images/social_vulnerabilities.png", width=12, height=10, dpi=300)

```

```{r}
#| cache: true
income_lookup <- final_comparison_master1 %>%
mutate(id = as.character(id)) %>%
distinct(id, income_block, income) %>%
group_by(id) %>%
slice(1) %>%   # keep one row per person if duplicates exist
ungroup()

# ── Rebuild all_avg_appendix with three models ────────────────────────────────

income_avg_all <- final_comparison_master1 %>%
  filter(!is.na(income_block), parkname!="Stay_Home") %>%
  pivot_longer(
    cols      = c(loss_mdcev, loss_rum_observed, loss_rum_52wk),  # <-- added
    names_to  = "model",
    values_to = "loss"
  ) %>%
  mutate(
    model = recode(model,
      "loss_mdcev"        = "MDCEV",
      "loss_rum_observed" = "Observed RUM",
      "loss_rum_52wk"     = "Frequency RUM"   # <-- added
    ),
    pctile_block = income_block,
    indicator    = "Income\n(Least → Most Vulnerable)"
  )

env_avg_all <- final_comparison_master1 %>%
  filter(!is.na(env_block), parkname!="Stay_Home") %>%
  pivot_longer(
    cols      = c(loss_mdcev, loss_rum_observed, loss_rum_52wk),
    names_to  = "model",
    values_to = "loss"
  ) %>%
  mutate(
    model = recode(model,
      "loss_mdcev"        = "MDCEV",
      "loss_rum_observed" = "Observed RUM",
      "loss_rum_52wk"     = "Frequency RUM"
    ),
    pctile_block = env_block,
    indicator    = "Environmental Burden\n(Least → Most Vulnerable)"
  )

soci_avg_all <- final_comparison_master1 %>%
  filter(!is.na(soci_block), parkname!="Stay_Home") %>%
  pivot_longer(
    cols      = c(loss_mdcev, loss_rum_observed, loss_rum_52wk),
    names_to  = "model",
    values_to = "loss"
  ) %>%
  mutate(
    model = recode(model,
      "loss_mdcev"        = "MDCEV",
      "loss_rum_observed" = "Observed RUM",
      "loss_rum_52wk"     = "Frequency RUM"
    ),
    pctile_block = soci_block,
    indicator    = "Social Vulnerability\n(Least → Most Vulnerable)"
  )

all_avg_appendix <- bind_rows(income_avg_all, env_avg_all, soci_avg_all) %>%
  mutate(
    year      = factor(year),
    model     = factor(model, levels = c("MDCEV", "Observed RUM", "Frequency RUM")),
    indicator = factor(indicator, levels = c(
      "Income\n(Least → Most Vulnerable)",
      "Social Vulnerability\n(Least → Most Vulnerable)",
      "Environmental Burden\n(Least → Most Vulnerable)"
    ))
  ) %>%
  group_by(indicator, pctile_block, model, year) %>%
  summarise(
    mean_loss = mean(loss,           na.rm = TRUE),
    se        = sd(loss, na.rm = TRUE) / sqrt(n()),
    p25       = quantile(loss, 0.25, na.rm = TRUE),
    p75       = quantile(loss, 0.75, na.rm = TRUE),
    n         = n(),
    .groups   = "drop"
  ) %>%
  mutate(
    ci_low  = mean_loss - 1.96 * se,
    ci_high = mean_loss + 1.96 * se
  )

# ── Updated helper function with three models ─────────────────────────────────
make_appendix_plot <- function(data, years, title_suffix) {
  
  plot_data <- data %>% filter(year %in% years)
  
  vulnerable_positions <- plot_data %>%
    filter(grepl("90-95th|95-100th", as.character(pctile_block))) %>%
    mutate(block_num = as.numeric(pctile_block)) %>%
    distinct(block_num) %>%
    pull(block_num)
  
  shade_start <- min(vulnerable_positions) - 0.5
  shade_end   <- max(vulnerable_positions) + 0.5
  
  ggplot(plot_data,
         aes(x        = pctile_block,
             y        = mean_loss,
             color    = model,
             linetype = model,        # <-- added for readability
             group    = model)) +
    
    geom_rect(
      data = tibble(xmin = shade_start, xmax = shade_end),
      aes(xmin = xmin, xmax = xmax, ymin = -Inf, ymax = Inf),
      fill        = "gray90",
      alpha       = 0.5,
      inherit.aes = FALSE
    ) +
    
    geom_errorbar(
      aes(ymin = ci_low, ymax = ci_high),
      width     = 0.2,
      linewidth = 0.35,
      position  = position_dodge(0.4)   # slightly wider for 3 models
    ) +
    
    geom_line(linewidth = 0.7,  position = position_dodge(0.4)) +
    geom_point(size = 1.8,      position = position_dodge(0.4)) +
    
    geom_hline(
      yintercept = 0, linetype = "dashed",
      color = "gray40", linewidth = 0.4
    ) +
    
    facet_grid(indicator ~ year) +
    
    # ── Three model color and linetype scales ─────────────────────────────────
    scale_color_manual(
      values = c(
        "MDCEV"         = "gray10",
        "Observed RUM"  = "gray55",
        "Frequency RUM" = "gray75"    # <-- added
      ),
      name = "Model"
    ) +
    scale_linetype_manual(
      values = c(
        "MDCEV"         = "solid",
        "Observed RUM"  = "solid",
        "Frequency RUM" = "dashed"    # <-- dashed to distinguish from Observed RUM
      ),
      name = "Model"
    ) +
    scale_fill_manual(
      values = c(
        "MDCEV"         = "gray10",
        "Observed RUM"  = "gray55",
        "Frequency RUM" = "gray75"
      ),
      guide  = "none"
    ) +
    scale_y_continuous(
      labels = scales::dollar_format(prefix = "$"),
      name   = "Mean Welfare Loss"
    ) +
    
    labs(
      x     = "Percentile Block",
      title = paste("Mean Welfare Loss Across Distributions —", title_suffix)
    ) +
    
    theme_minimal(base_family = "Times New Roman") +
    theme(
      panel.grid.minor   = element_blank(),
      panel.grid.major.x = element_blank(),
      axis.text.x   = element_text(size = 12, angle = 30,
                                   hjust = 1, lineheight = 0.8),
      axis.text.y   = element_text(size = 20),
      strip.text.x  = element_text(face = "bold", size = 12),
      strip.text.y  = element_text(face = "bold", size = 14,
                                   angle = 0, hjust = 0),
      legend.position  = "bottom",
      legend.title     = element_text(face = "bold", size = 20),
      legend.text      = element_text(size = 20),
      plot.title       = element_text(face = "bold", size = 20),
      panel.spacing    = unit(0.8, "lines")
    )
}

# ── Build and save ────────────────────────────────────────────────────────────
appendix_a <- make_appendix_plot(
  all_avg_appendix,
  years        = c(2018, 2019, 2020),
  title_suffix = "2018–2020"
)

appendix_b <- make_appendix_plot(
  all_avg_appendix,
  years        = c(2021, 2022, 2023),
  title_suffix = "2021–2023"
)


ggsave(
  "images/appendix_A_welfare_2018_2020.png",
  plot   = appendix_a,
  width  = 14,
  height = 12,
  dpi    = 300,
  bg     = "white"
)

ggsave(
  "images/appendix_B_welfare_2021_2023.png",
  plot   = appendix_b,
  width  = 14,
  height = 12,
  dpi    = 300,
  bg     = "white"
)
print(appendix_a)

```

# Low-income Welfare

So low income individuals live further away and choose sites closer to them then higher income individuals. Thats whats explained by the gap. We know we value there time less

```{r}
#| cache: true

# COMPARE: Average distance to all sites vs distance to actually visited sites


# Distance to ALL sites (potential distance) 
# This is the average distance each income block's zip codes are from
# every campsite regardless of whether they visited

distance_all_sites <- full_year_clean %>%
  filter(parkname != "Stay_Home") %>%
  mutate(id = as.character(id)) %>%
  left_join(income_lookup, by = "id") %>%
  filter(!is.na(income_block)) %>%
  group_by(income_block, id) %>%
  summarise(
    mean_dist_all_sites = mean(travel_distance_km, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  group_by(income_block) %>%
  summarise(
    mean_dist_potential  = round(mean(mean_dist_all_sites, na.rm = TRUE), 1),
    median_dist_potential= round(median(mean_dist_all_sites, na.rm = TRUE), 1),
    n_individuals        = n_distinct(id),
    .groups = "drop"
  )

# Distance to VISITED sites only ────────────────────────────────────
# Average distance to sites that were actually visited (VisitCount > 0)

distance_visited_sites <- full_year_clean %>%
  filter(parkname != "Stay_Home", VisitCount > 0) %>%
  mutate(id = as.character(id)) %>%
  left_join(income_lookup, by = "id") %>%
  filter(!is.na(income_block)) %>%
  group_by(income_block, id) %>%
  summarise(
    mean_dist_visited = mean(travel_distance_km, na.rm = TRUE),
    n_sites_visited   = n_distinct(parkname),
    total_trips       = sum(VisitCount,          na.rm = TRUE),
    .groups = "drop"
  ) %>%
  group_by(income_block) %>%
  summarise(
    mean_dist_visited   = round(mean(mean_dist_visited, na.rm = TRUE), 1),
    median_dist_visited = round(median(mean_dist_visited, na.rm = TRUE), 1),
    mean_sites_visited  = round(mean(n_sites_visited,   na.rm = TRUE), 2),
    mean_trips          = round(mean(total_trips,       na.rm = TRUE), 2),
    n_individuals       = n_distinct(id),
    .groups = "drop"
  )

# Combine and compare ───────────────────────────────────────────────
distance_comparison <- distance_all_sites %>%
  left_join(distance_visited_sites, by = "income_block",
            suffix = c("_all", "_visited")) %>%
  mutate(
    # Selection effect: do people choose closer sites?
    distance_gap    = round(mean_dist_visited - mean_dist_potential, 1),
    selection_ratio = round(mean_dist_visited / mean_dist_potential, 3)
  ) %>%
  arrange(income_block)

message("\n── Distance: potential vs visited by income block ──")
print(distance_comparison %>%
  select(income_block, mean_dist_potential, mean_dist_visited,
         distance_gap, selection_ratio, mean_sites_visited))

# ── Site-level distance by income block 
# Which sites does each income block actually visit
# and how far are they from those sites?

site_distance_by_income <- full_year_clean %>%
  filter(parkname != "Stay_Home", VisitCount > 0) %>%
  mutate(id = as.character(id)) %>%
  left_join(income_lookup, by = "id") %>%
  filter(!is.na(income_block)) %>%
  group_by(income_block, parkname, island_park) %>%
  summarise(
    mean_distance = round(mean(travel_distance_km, na.rm = TRUE), 1),
    total_trips   = sum(VisitCount, na.rm = TRUE),
    n_visitors    = n_distinct(id),
    .groups       = "drop"
  ) %>%
  group_by(income_block) %>%
  mutate(
    pct_trips = round(total_trips / sum(total_trips) * 100, 1)
  ) %>%
  ungroup() %>%
  arrange(income_block, desc(pct_trips))

message("\n── Top 3 sites by income block with distances ──")
site_distance_by_income %>%
  group_by(income_block) %>%
  slice_max(pct_trips, n = 3) %>%
  print(n = 30)

# ── Plot comparison ───────────────────────────────────────────────────
distance_comparison %>%
  pivot_longer(
    cols      = c(mean_dist_potential, mean_dist_visited),
    names_to  = "type",
    values_to = "distance"
  ) %>%
  mutate(
    type = case_when(
      type == "mean_dist_potential" ~ "Average Distance to All Sites\n(Potential)",
      type == "mean_dist_visited"   ~ "Average Distance to Visited Sites\n(Actual)"
    ),
    # ── Clean x-axis labels with income ranges ────────────────────────────────
    income_label = case_when(
      income_block == "0-20th\n(Lowest)"    ~ "> $102k",
      income_block == "20-40th"             ~ "$90k–$102k",
      income_block == "40-60th"             ~ "$79k–$90k",
      income_block == "60-80th"             ~ "$67k–$79k",
      income_block == "80-90th"             ~ "$63k–$67k",
      income_block == "90-95th"             ~ "$57k–$63k",
      income_block == "95-100th\n(Highest)" ~ "< $57k",
      TRUE ~ as.character(income_block)
    ),
    income_label = factor(income_label, levels = c(
      "< $57k", "$57k–$63k", "$63k–$67k", "$67k–$79k",
      "$79k–$90k", "$90k–$102k", "> $102k"
    ))
  ) %>%
  ggplot(aes(x     = income_label,
             y     = distance,
             color = type,
             group = type)) +

  geom_line(linewidth = 0.9) +
  geom_point(size = 3) +

  # ── High / Low income arrows on x axis ────────────────────────────────────
  annotate("segment",
           x = 0.5, xend = 1.8, y = -30, yend = -30,
           arrow = arrow(length = unit(0.2, "cm"), ends = "first"),
           color = "gray30", linewidth = 0.6) +
  annotate("text",
           x = 1.0, y = -45,
           label = "Lower Income",
           size = 4.5, color = "gray30",
           family = "Times New Roman",
           hjust = 0.5, fontface = "bold") +

  annotate("segment",
           x = 5.2, xend = 6.5, y = -30, yend = -30,
           arrow = arrow(length = unit(0.2, "cm"), ends = "last"),
           color = "gray30", linewidth = 0.6) +
  annotate("text",
           x = 6.0, y = -45,
           label = "Higher Income",
           size = 4.5, color = "gray30",
           family = "Times New Roman",
           hjust = 0.5, fontface = "bold") +

  scale_color_manual(
    values = c(
      "Average Distance to All Sites\n(Potential)"  = "gray70",
      "Average Distance to Visited Sites\n(Actual)" = "gray10"
    ),
    name = ""
  ) +
  scale_y_continuous(
    name   = "Mean Distance (km)",
    labels = scales::number_format(suffix = " km"),
    expand = expansion(mult = c(0.15, 0.05))  # extra space at bottom for arrows
  ) +
  labs(
    x     = "Household Income Range",
    title = "Potential vs Actual Distance by Income Block"
  ) +
  coord_cartesian(clip = "off") +   # allow annotations outside plot area
  theme_minimal(base_family = "Times New Roman") +
  theme(
    panel.grid.minor   = element_blank(),
    panel.grid.major.x = element_blank(),
    axis.text.x   = element_text(size = 14, angle = 20, hjust = 1),
    axis.text.y   = element_text(size = 16),
    axis.title    = element_text(size = 16),
    legend.position  = "bottom",
    legend.text   = element_text(size = 16),
    plot.title    = element_text(face = "bold", size = 20),
    plot.margin   = margin(t = 10, r = 10, b = 30, l = 10)  # bottom space for arrows
  )

ggsave("images/distance_sitesvisited.png", width=12, height=8, dpi=300)
```

```{r}
#| cache: true


# Test 1: Rank-based selection 
# For each person: rank all 22 sites by distance (1 = closest)
# Then check which rank sites they actually visited
# If constrained -> visit low-rank (nearby) sites disproportionately

rank_selection <- full_year_clean %>%
  filter(parkname != "Stay_Home") %>%
  mutate(id = as.character(id)) %>%
  left_join(income_lookup, by = "id") %>%
  filter(!is.na(income_block)) %>%
  group_by(id) %>%
  mutate(
    # Rank sites by distance for each individual — 1 = closest
    distance_rank = rank(travel_distance_km, ties.method = "first"),
    n_sites       = n()
  ) %>%
  ungroup()

# For visited sites: what is their average distance rank?
rank_visited <- rank_selection %>%
  filter(VisitCount > 0) %>%
  group_by(id, income_block) %>%
  summarise(
    mean_rank_visited    = mean(distance_rank, na.rm = TRUE),
    min_rank_visited     = min(distance_rank,  na.rm = TRUE),
    median_rank_visited  = median(distance_rank, na.rm = TRUE),
    n_sites_visited      = n(),
    .groups = "drop"
  )

rank_summary <- rank_visited %>%
  group_by(income_block) %>%
  summarise(
    mean_rank_of_visited   = round(mean(mean_rank_visited),   2),
    median_rank_of_visited = round(median(median_rank_visited), 2),
    mean_closest_visited   = round(mean(min_rank_visited),    2),
    pct_visit_top5_closest = round(mean(min_rank_visited <= 5) * 100, 1),
    pct_visit_top3_closest = round(mean(min_rank_visited <= 3) * 100, 1),
    .groups = "drop"
  ) %>%
  arrange(income_block)

message("── Test 1: Distance rank of visited sites ──")
message("Lower rank = visits closer sites | Higher rank = travels further")
print(rank_summary)

#Test 2: Probability of visiting top-N closest sites 
# Does low income visit their N closest sites more often?

visit_probability <- rank_selection %>%
  mutate(
    visited    = VisitCount > 0,
    top3       = distance_rank <= 3,
    top5       = distance_rank <= 5,
    top10      = distance_rank <= 10,
    bottom5    = distance_rank > 17   # furthest 5 sites
  ) %>%
  group_by(income_block) %>%
  summarise(
    prob_visit_top3    = round(mean(visited[top3],    na.rm = TRUE) * 100, 1),
    prob_visit_top5    = round(mean(visited[top5],    na.rm = TRUE) * 100, 1),
    prob_visit_top10   = round(mean(visited[top10],   na.rm = TRUE) * 100, 1),
    prob_visit_bottom5 = round(mean(visited[bottom5], na.rm = TRUE) * 100, 1),
    # Ratio: how much more likely to visit nearby vs far sites?
    near_far_ratio     = round(
      mean(visited[top5],    na.rm = TRUE) /
      mean(visited[bottom5], na.rm = TRUE), 2),
    .groups = "drop"
  ) %>%
  arrange(income_block)

message("\n── Test 2: Probability of visiting near vs far sites ──")
message("Higher near_far_ratio = stronger geographic constraint")
print(visit_probability)

# Test 3: Individual-level regression 
# Does distance predict visit probability differently across income groups?
# If low income is more constrained:
# β_distance should be more negative for low income


regression_by_income <- rank_selection %>%
  mutate(visited = as.integer(VisitCount > 0)) %>%
  group_by(income_block) %>%
  do(
    tidy(glm(visited ~ travel_distance_km,
             data   = .,
             family = binomial(link = "logit")))
  ) %>%
  filter(term == "travel_distance_km") %>%
  mutate(
    estimate  = round(estimate,   5),
    std.error = round(std.error,  5),
    p.value   = round(p.value,    4)
  ) %>%
  select(income_block, estimate, std.error, p.value) %>%
  arrange(income_block)

message("\n── Test 3: Logit coefficient on distance by income block ──")
message("More negative estimate = distance matters MORE for that income group")
message("= stronger geographic constraint on site selection")
print(regression_by_income)

# Test 4: Substitution range 
# How spread out are the sites each person visits?
# Constrained campers visit a narrow geographic range
# Unconstrained campers visit sites spread across islands

substitution_range <- final_data_2018 %>%
  filter(parkname != "Stay_Home", VisitCount > 0) %>%
  mutate(id = as.character(id)) %>%
  left_join(income_lookup, by = "id") %>%
  filter(!is.na(income_block)) %>%
  group_by(id, income_block) %>%
  summarise(
    n_islands_visited   = n_distinct(island_park),
    n_sites_visited     = n_distinct(parkname),
    range_km            = max(travel_distance_km) - min(travel_distance_km),
    max_dist            = max(travel_distance_km),
    min_dist            = min(travel_distance_km),
    .groups = "drop"
  ) %>%
  group_by(income_block) %>%
  summarise(
    mean_islands_visited = round(mean(n_islands_visited),  2),
    pct_visit_own_island = round(mean(n_islands_visited == 1) * 100, 1),
    mean_range_km        = round(mean(range_km,            na.rm = TRUE), 1),
    mean_max_dist        = round(mean(max_dist,            na.rm = TRUE), 1),
    mean_min_dist        = round(mean(min_dist,            na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  arrange(income_block)

message("\n── Test 4: Geographic substitution range ──")
message("pct_visit_own_island: % who only visit sites on their home island")
message("mean_range_km: spread between closest and furthest site visited")
print(substitution_range)

# Test 5: Gravity model 
# Classic spatial interaction: visits ~ 1/distance
# If constrained: stronger distance decay for low income

gravity_by_income <- rank_selection %>%
  mutate(
    visited       = as.integer(VisitCount > 0),
    log_distance  = log(travel_distance_km + 1)
  ) %>%
  group_by(income_block) %>%
  do(
    tidy(glm(visited ~ log_distance,
             data   = .,
             family = binomial(link = "logit")))
  ) %>%
  filter(term == "log_distance") %>%
  mutate(
    estimate  = round(estimate,  3),
    std.error = round(std.error, 3),
    p.value   = round(p.value,   4)
  ) %>%
  select(income_block, estimate, std.error, p.value) %>%
  rename(log_distance_coef = estimate) %>%
  arrange(income_block)

message("\n── Test 5: Gravity model — log distance effect by income ──")
message("More negative coefficient = stronger distance decay = more constrained")
print(gravity_by_income)

# ── Plot all four measures together 

p1 <- rank_summary %>%
  ggplot(aes(x = income_block, y = mean_rank_of_visited, group = 1)) +
  geom_line(linewidth = 0.8, color = "gray10") +
  geom_point(size = 3,       color = "gray10") +
  labs(x = "", y = "Mean Distance Rank\nof Visited Sites",
       title = "Test 1: Distance Rank") +
  theme_minimal(base_family = "Times New Roman") +
  theme(axis.text.x = element_text(angle = 20, hjust = 1, size = 9),
        panel.grid.minor = element_blank(),
        plot.title = element_text(face = "bold", size = 11))

p2 <- visit_probability %>%
  ggplot(aes(x = income_block, y = near_far_ratio, group = 1)) +
  geom_line(linewidth = 0.8, color = "gray10") +
  geom_point(size = 3,       color = "gray10") +
  geom_hline(yintercept = 1, linetype = "dashed",
             color = "gray50", linewidth = 0.4) +
  labs(x = "", y = "Near/Far Visit Ratio\n(top 5 / bottom 5)",
       title = "Test 2: Near/Far Ratio") +
  theme_minimal(base_family = "Times New Roman") +
  theme(axis.text.x = element_text(angle = 20, hjust = 1, size = 9),
        panel.grid.minor = element_blank(),
        plot.title = element_text(face = "bold", size = 11))

p3 <- regression_by_income %>%
  ggplot(aes(x = income_block, y = estimate, group = 1)) +
  geom_col(fill = "gray30", alpha = 0.8, width = 0.6) +
  geom_errorbar(aes(ymin = estimate - 1.96 * std.error,
                    ymax = estimate + 1.96 * std.error),
                width = 0.2, color = "gray10") +
  labs(x = "", y = "Distance Coefficient\n(logit)",
       title = "Test 3: Distance Sensitivity") +
  theme_minimal(base_family = "Times New Roman") +
  theme(axis.text.x = element_text(angle = 20, hjust = 1, size = 9),
        panel.grid.minor = element_blank(),
        plot.title = element_text(face = "bold", size = 11))

p4 <- substitution_range %>%
  ggplot(aes(x = income_block, y = pct_visit_own_island, group = 1)) +
  geom_line(linewidth = 0.8, color = "gray10") +
  geom_point(size = 3,       color = "gray10") +
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  labs(x = "", y = "% Visiting Own\nIsland Sites Only",
       title = "Test 4:  Remain on Home Island") +
  theme_minimal(base_family = "Times New Roman") +
  theme(axis.text.x = element_text(angle = 20, hjust = 1, size = 9),
        panel.grid.minor = element_blank(),
        plot.title = element_text(face = "bold", size = 11))

combined <- (p1 + p2) / (p3 + p4) +
  plot_annotation(
    title   = "Four Tests of Geographic Selection Constraint by Income Block",
    caption = "All four tests ask: does distance matter more for lower income communities?",
    theme   = theme(
      plot.title   = element_text(family = "Times New Roman",
                                  face = "bold", size = 14),
      plot.caption = element_text(family = "Times New Roman",
                                  size = 10, color = "gray40")
    )
  )

print(combined)

ggsave("images/geographic_constraint_tests.png",
       combined, width = 12, height = 10,
       dpi = 300, bg = "white")


```

```{r}
#| cache: true

all_years_raw <- map_df(2018:2023, function(yr) {
  get(paste0("final_data_", yr)) %>%
    mutate(year = yr, id = as.character(id))
})

site_by_income <- all_years_raw %>%
  filter(parkname != "Stay_Home", VisitCount > 0) %>%
  mutate(id = as.character(id)) %>%
  left_join(income_lookup, by = "id") %>%
  filter(!is.na(income_block)) %>%
  group_by(income_block, parkname) %>%
  summarise(
    total_trips = sum(VisitCount, na.rm = TRUE),
    .groups     = "drop"
  ) %>%
  group_by(income_block) %>%
  mutate(
    pct_trips = round(total_trips / sum(total_trips) * 100, 1)
  ) %>%
  ungroup() %>%
  arrange(income_block, desc(pct_trips))


# Check if high income disproportionately visits Peacock and Waimanu
site_by_income %>%
  filter(parkname %in% c("Peacock Flats Campsite", 
                          "Waimanu Campsite")) %>%
  select(income_block, parkname, total_trips, pct_trips) %>%
  arrange(parkname, income_block) %>%
  print(n = 20)

# What share of each income group's trips go to these two sites?
site_by_income %>%
  mutate(
    high_value_site = parkname %in% c("Peacock Flats Campsite",
                                       "Waimanu Campsite")
  ) %>%
  group_by(income_block, high_value_site) %>%
  summarise(
    pct_trips = sum(pct_trips),
    .groups   = "drop"
  ) %>%
  filter(high_value_site) %>%
  arrange(income_block)

```

# Spread Across Models & DCA

The spread here show the value of park based on whether or not the person comes from a disadvantage community.

The MDCEV is fairly stable compared to the RUM and the spread is larger whe nconsidering each trip occasion.

For DCA are consistently less in Observed RUM and MDCEV but is greater in Frequency 52.

```{r}
#| cache: true


# 1. Prepare data: Group by Park, Year, and EJ Status
plot_data_three_panel <- final_comparison_master1 %>%
  filter(parkname!="Stay_Home") %>%
  group_by(year, parkname, is_disadvantaged_50) %>%
  summarise(
    Observed_RUM = mean(loss_rum_observed, na.rm = TRUE),
    Frequency_RUM = mean(loss_rum_52wk, na.rm = TRUE),
    MDCEV = mean(loss_mdcev, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Pivot to long format
  pivot_longer(
    cols = c(Observed_RUM, Frequency_RUM, MDCEV),
    names_to = "Model_Type",
    values_to = "Welfare_Loss"
  ) %>%
  # Clean up factor labels for the legend and facets
  mutate(
    Model_Type = factor(Model_Type, 
                        levels = c("Observed_RUM", "Frequency_RUM", "MDCEV"),
                        labels = c("Observed RUM", "Frequency RUM (52-Wk)", "MDCEV")),
    EJ_Group = factor(is_disadvantaged_50, 
                      levels = c(0, 1), 
                      labels = c("Non-Disadvantaged", "Disadvantaged (EJ)"))
  )

# 2. Create the Three-Panel Plot
ggplot(plot_data_three_panel, aes(x = factor(year), y = Welfare_Loss, fill = EJ_Group)) +
  # Boxplot with black outlines
  geom_boxplot(outlier.shape = 21, outlier.size = 1, color = "black", alpha = 0.8) +
  
  # CREATE THREE PANELS (One for each Model)
  facet_wrap(~ Model_Type, scales = "free_y") + 
  
  # Grayscale for the EJ Comparison
  scale_fill_manual(values = c(
    "Non-Disadvantaged" = "#F0F0F0", # Very Light Grey
    "Disadvantaged (EJ)" = "#636363"  # Dark Grey
  )) +
  
  # Labels
  labs(
    title = "Annual Welfare Loss by Model Type and EJ Status",
    subtitle = "Comparing Disadvantaged vs. Non-Disadvantaged ZIP Codes across Hawaii Parks",
    x = "Year",
    y = "Average Annual Welfare Loss ($/Park)",
    fill = "Community Status"
  ) +
  
  # Academic Theme
  theme_bw() +
  theme(
    text = element_text(family = "serif", size = 16),
    plot.title = element_text(face = "bold", size = 20, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5, size = 16),
    
    # Grid and Panel styling
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "white", color = "black"),
    strip.text = element_text(face = "bold", size = 16),
    
    # Legend at bottom
    legend.position = "bottom",
    panel.border = element_rect(colour = "black", fill = NA, size = 0.8)
  )

ggsave("images/comparision_EJstatus.png", width=12, height=10, dpi=300)

```

```{r}
#| cache: true
# 1. Prepare data: Group by Park, Year, and EJ Status
plot_data_three_panel <- final_comparison_master1 %>%
  filter(parkname!="Stay_Home") %>%
  group_by(year, parkname, is_disadvantaged_50_envi) %>%
  summarise(
    Observed_RUM = mean(loss_rum_observed, na.rm = TRUE),
    Frequency_RUM = mean(loss_rum_52wk, na.rm = TRUE),
    MDCEV = mean(loss_mdcev, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Pivot to long format
  pivot_longer(
    cols = c(Observed_RUM, Frequency_RUM, MDCEV),
    names_to = "Model_Type",
    values_to = "Welfare_Loss"
  ) %>%
  # Clean up factor labels for the legend and facets
  mutate(
    Model_Type = factor(Model_Type, 
                        levels = c("Observed_RUM", "Frequency_RUM", "MDCEV"),
                        labels = c("Observed RUM", "Frequency RUM (52-Wk)", "MDCEV")),
    EJ_Group = factor(is_disadvantaged_50_envi, 
                      levels = c(0, 1), 
                      labels = c("Non-Disadvantaged", "Disadvantaged (EJ)"))
  )

# 2. Create the Three-Panel Plot
ggplot(plot_data_three_panel, aes(x = factor(year), y = Welfare_Loss, fill = EJ_Group)) +
  # Boxplot with black outlines
  geom_boxplot(outlier.shape = 21, outlier.size = 1, color = "black", alpha = 0.8) +
  
  # CREATE THREE PANELS (One for each Model)
  facet_wrap(~ Model_Type, scales = "free_y") + 
  
  # Grayscale for the EJ Comparison
  scale_fill_manual(values = c(
    "Non-Disadvantaged" = "#F0F0F0", # Very Light Grey
    "Disadvantaged (EJ)" = "#636363"  # Dark Grey
  )) +
  
  # Labels
  labs(
    title = "Annual Welfare Loss by Model Type and Environmental Burdens Status",
    subtitle = "Comparing Disadvantaged vs. Non-Disadvantaged ZIP Codes across Hawaii Parks",
    x = "Year",
    y = "Average Annual Welfare Loss ($/Park)",
    fill = "Community Status"
  ) +
  
  # Academic Theme
  theme_bw() +
  theme(
    text = element_text(family = "serif", size = 12),
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5, size = 11),
    
    # Grid and Panel styling
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "white", color = "black"),
    strip.text = element_text(face = "bold", size = 11),
    
    # Legend at bottom
    legend.position = "bottom",
    panel.border = element_rect(colour = "black", fill = NA, size = 0.8)
  )

ggsave("images/comparision_envir_status.png", width=12, height=10, dpi=300)

# 1. Prepare data: Group by Park, Year, and EJ Status
plot_data_three_panel <- final_comparison_master1 %>%
  filter(parkname!="Stay_Home") %>%
  group_by(year, parkname, is_disadvantaged_50_envi) %>%
  summarise(
    Observed_RUM = mean(loss_rum_observed, na.rm = TRUE),
    MDCEV = mean(loss_mdcev, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Pivot to long format
  pivot_longer(
    cols = c(Observed_RUM, MDCEV),
    names_to = "Model_Type",
    values_to = "Welfare_Loss"
  ) %>%
  # Clean up factor labels for the legend and facets
  mutate(
    Model_Type = factor(Model_Type, 
                        levels = c("Observed_RUM",  "MDCEV"),
                        labels = c("Observed RUM", "MDCEV")),
    EJ_Group = factor(is_disadvantaged_50_envi, 
                      levels = c(0, 1), 
                      labels = c("Non-Disadvantaged", "Disadvantaged"))
  )

# 2. Create the Three-Panel Plot
ggplot(plot_data_three_panel, aes(x = factor(year), y = Welfare_Loss, fill = EJ_Group)) +
  # Boxplot with black outlines
  geom_boxplot(outlier.shape = 21, outlier.size = 1, color = "black", alpha = 0.8) +
  
  # CREATE THREE PANELS (One for each Model)
  facet_wrap(~ Model_Type, scales = "free_y") + 
  
  # Grayscale for the EJ Comparison
  scale_fill_manual(values = c(
    "Non-Disadvantaged" = "#F0F0F0", # Very Light Grey
    "Disadvantaged" = "#636363"  # Dark Grey
  )) +
  
  # Labels
  labs(
    title = "Annual Welfare Loss by Model Type and Environmental Burdens Status",
    subtitle = "Comparing Disadvantaged vs. Non-Disadvantaged ZIP Codes across Hawaii Parks",
    x = "Year",
    y = "Average Annual Welfare Loss ($/Park)",
    fill = "Community Status"
  ) +
  
  # Academic Theme
  theme_bw() +
  theme(
    text = element_text(family = "serif", size = 20),
    plot.title = element_text(face = "bold", size =20, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5, size = 20),
    
    # Grid and Panel styling
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "white", color = "black"),
    strip.text = element_text(face = "bold", size = 20),
    
    # Legend at bottom
    legend.position = "bottom",
    panel.border = element_rect(colour = "black", fill = NA, size = 0.8)
  )

ggsave("images/comparision_envir_observed.png", width=12, height=10, dpi=300)


```

```{r}
#| cache: true

# 1. Prepare data: Group by Park, Year, and EJ Status
plot_data_three_panel <- final_comparison_master1 %>%
  filter(parkname!="Stay_Home") %>%
  group_by(year, parkname, is_disadvantaged_50_soci) %>%
  summarise(
    Observed_RUM = mean(loss_rum_observed, na.rm = TRUE),
    Frequency_RUM = mean(loss_rum_52wk, na.rm = TRUE),
    MDCEV = mean(loss_mdcev, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Pivot to long format
  pivot_longer(
    cols = c(Observed_RUM, Frequency_RUM, MDCEV),
    names_to = "Model_Type",
    values_to = "Welfare_Loss"
  ) %>%
  # Clean up factor labels for the legend and facets
  mutate(
    Model_Type = factor(Model_Type, 
                        levels = c("Observed_RUM", "Frequency_RUM", "MDCEV"),
                        labels = c("Observed RUM", "Frequency RUM (52-Wk)", "MDCEV")),
    EJ_Group = factor(is_disadvantaged_50_soci, 
                      levels = c(0, 1), 
                      labels = c("Non-Disadvantaged", "Disadvantaged (EJ)"))
  )

# 2. Create the Three-Panel Plot
ggplot(plot_data_three_panel, aes(x = factor(year), y = Welfare_Loss, fill = EJ_Group)) +
  # Boxplot with black outlines
  geom_boxplot(outlier.shape = 21, outlier.size = 1, color = "black", alpha = 0.8) +
  
  # CREATE THREE PANELS (One for each Model)
  facet_wrap(~ Model_Type, scales = "free_y") + 
  
  # Grayscale for the EJ Comparison
  scale_fill_manual(values = c(
    "Non-Disadvantaged" = "#F0F0F0", # Very Light Grey
    "Disadvantaged (EJ)" = "#636363"  # Dark Grey
  )) +
  
  # Labels
  labs(
    title = "Annual Welfare Loss by Model Type and Social Vulnerabilities Status",
    subtitle = "Comparing Disadvantaged vs. Non-Disadvantaged ZIP Codes across Hawaii Parks",
    x = "Year",
    y = "Average Annual Welfare Loss ($/Park)",
    fill = "Community Status"
  ) +
  
  # Academic Theme
  theme_bw() +
  theme(
    text = element_text(family = "serif", size = 12),
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5, size = 11),
    
    # Grid and Panel styling
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "white", color = "black"),
    strip.text = element_text(face = "bold", size = 11),
    
    # Legend at bottom
    legend.position = "bottom",
    panel.border = element_rect(colour = "black", fill = NA, size = 0.8)
  )
ggsave("images/comparision_social_vulnerabilities.png", width=12, height=10, dpi=300)



# 1. Prepare data: Group by Park, Year, and EJ Status
plot_data_three_panel <- final_comparison_master1 %>%
  filter(parkname!="Stay_Home") %>%
  group_by(year, parkname, is_disadvantaged_50_soci) %>%
  summarise(
    Observed_RUM = mean(loss_rum_observed, na.rm = TRUE),
    MDCEV = mean(loss_mdcev, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Pivot to long format
  pivot_longer(
    cols = c(Observed_RUM, MDCEV),
    names_to = "Model_Type",
    values_to = "Welfare_Loss"
  ) %>%
  # Clean up factor labels for the legend and facets
  mutate(
    Model_Type = factor(Model_Type, 
                        levels = c("Observed_RUM", "MDCEV"),
                        labels = c("Observed RUM", "MDCEV")),
    EJ_Group = factor(is_disadvantaged_50_soci, 
                      levels = c(0, 1), 
                      labels = c("Non-Disadvantaged", "Disadvantaged"))
  )

# 2. Create the Three-Panel Plot
ggplot(plot_data_three_panel, aes(x = factor(year), y = Welfare_Loss, fill = EJ_Group)) +
  # Boxplot with black outlines
  geom_boxplot(outlier.shape = 21, outlier.size = 1, color = "black", alpha = 0.8) +
  
  # CREATE THREE PANELS (One for each Model)
  facet_wrap(~ Model_Type, scales = "free_y") + 
  
  # Grayscale for the EJ Comparison
  scale_fill_manual(values = c(
    "Non-Disadvantaged" = "#F0F0F0", # Very Light Grey
    "Disadvantaged" = "#636363"  # Dark Grey
  )) +
  
  # Labels
  labs(
    title = "Annual Welfare Loss by Model Type and Social Vulnerabilities Status",
    subtitle = "Comparing Disadvantaged vs. Non-Disadvantaged ZIP Codes across Hawaii Parks",
    x = "Year",
    y = "Average Annual Welfare Loss ($/Park)",
    fill = "Community Status"
  ) +
  
  # Academic Theme
  theme_bw() +
  theme(
    text = element_text(family = "serif", size = 20),
    plot.title = element_text(face = "bold", size = 20, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5, size = 20),
    
    # Grid and Panel styling
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "white", color = "black"),
    strip.text = element_text(face = "bold", size = 20),
    
    # Legend at bottom
    legend.position = "bottom",
    panel.border = element_rect(colour = "black", fill = NA, size = 0.8)
  )
ggsave("images/comparision_social_observed.png", width=12, height=10, dpi=300)



```

# Map

## Grey Map

```{r}
#| cache: true


options(tigris_use_cache = TRUE)
# 1. Get Hawaii Outlines
islands_outline <- states(cb = TRUE) %>%
  filter(STUSPS == "HI")

# 2. Get ZCTAs
zip_shapes <- zctas(cb = TRUE, year = 2020) %>%
  filter(str_starts(ZCTA5CE20, "967") | str_starts(ZCTA5CE20, "968"))

# 3. Zip averages — fix zip type here
zip_averages <- final_comparison_master1 %>%
  filter(parkname == "Stay_Home") %>%
  group_by(zip) %>%
  summarise(
    loss_mdcev       = mean(loss_mdcev,        na.rm = TRUE),
    is_disadvantaged = ifelse(
      mean(is_disadvantaged_50, na.rm = TRUE) > 0.5, "Yes", "No"
    ),
    .groups = "drop"
  ) %>%
  mutate(zip = as.character(zip))   # <-- KEY FIX

# Quick check before join
message(paste("Zip averages rows:", nrow(zip_averages)))
message(paste("Zip type:", class(zip_averages$zip)))
message(paste("ZCTA5CE20 type:", class(zip_shapes$ZCTA5CE20)))

# 4. Join
map_data <- zip_shapes %>%
  inner_join(zip_averages, by = c("ZCTA5CE20" = "zip"))

message(paste("Map data rows after join:", nrow(map_data)))

# 5. Bins
map_data <- map_data %>%
  mutate(
    loss_mdcev = ifelse(is.na(loss_mdcev) | loss_mdcev > 0, 0, loss_mdcev),
    loss_bin   = cut(loss_mdcev,
      breaks = c(-Inf, -46.03, -30.09, -17.94, 0),
      labels = c(">-$100",
                 "-$100 to -$50",
                 "-$50 to -$20",
                 "0 to -$20"),
      include.lowest = TRUE,
      right          = TRUE
    )
  )

# 6. Island labels
island_labels <- tibble(
  name     = c("Oʻahu", "Maui", "Hawaiʻi", "Kauaʻi", "Molokaʻi", "Lānaʻi"),
  x_anchor = c(-157.98, -156.33, -155.55, -159.52, -157.02, -156.92),
  y_anchor = c( 21.47,   20.80,   19.60,   22.06,   21.13,   20.83),
  x_label  = c(-158.60, -155.10, -157.20, -160.50, -156.60, -157.60),
  y_label  = c( 20.90,   21.50,   19.60,   22.20,   21.60,   20.15)
) %>%
  mutate(
    shrink      = 0.4,
    x_seg_start = x_label + shrink * (x_anchor - x_label),
    y_seg_start = y_label + shrink * (y_anchor - y_label)
  )

# 7. Plot
hawaii_map <- ggplot() +
  geom_sf(data = islands_outline, fill = "gray95", color = "gray60", size = 0.4) +
  
  geom_sf_pattern(
    data             = filter(map_data, is_disadvantaged == "No"),
    aes(fill = loss_bin, pattern = is_disadvantaged),
    pattern_color    = NA,
    pattern_fill     = NA,
    pattern_density  = 0.01,
    color            = "black",
    size             = 0.1
  ) +
  
  geom_sf_pattern(
    data             = filter(map_data, is_disadvantaged == "Yes"),
    aes(fill = loss_bin, pattern = is_disadvantaged),
    pattern_color    = "grey60",
    pattern_fill     = NA,
    pattern_density  = 0.4,
    pattern_spacing  = 0.03,
    pattern_angle    = 45,
    pattern_size     = 0.6,
    color            = "grey60",
    size             = 0.2
  ) +
  
  geom_segment(
    data      = island_labels,
    aes(x = x_seg_start, y = y_seg_start, xend = x_anchor, yend = y_anchor),
    color     = "gray50",
    linewidth = 0.3
  ) +
  
  geom_point(
    data  = island_labels,
    aes(x = x_anchor, y = y_anchor),
    color = "gray30",
    size  = 1.0
  ) +
  
  # White halo
  geom_text(
    data   = island_labels,
    aes(x = x_label, y = y_label, label = name),
    size   = 12, fontface = "bold", color = "white",
    family = "Times New Roman", hjust = 0.5, vjust = 0.5
  ) +
  # Actual text
  geom_text(
    data   = island_labels,
    aes(x = x_label, y = y_label, label = name),
    size   = 12, fontface = "bold", color = "gray10",
    family = "Times New Roman", hjust = 0.5, vjust = 0.5
  ) +
  
  annotation_north_arrow(
    location    = "bl",
    which_north = "true",
    height = unit(2.0, "cm"),
    width  = unit(1.5, "cm"),
    style  = north_arrow_fancy_orienteering(
      fill        = c("gray10", "white"),
      line_col    = "gray10",
      text_col    = "gray10",
      text_family = "Times New Roman",
      text_size   = 16
    ),
    pad_x = unit(1.0, "cm"),
    pad_y = unit(0.9, "cm")
  ) +
  
  annotation_scale(
    location      = "bl",
    unit_category = "imperial",
    text_family   = "Times New Roman",
    text_cex      = 1,
    style         = "bar",
    bar_cols      = c("gray10", "white"),
    line_col      = "gray10",
    text_col      = "gray10",
    pad_x = unit(1.0, "cm"),
    pad_y = unit(0.3, "cm")
  ) +
  
  scale_fill_manual(
    values = c(
      ">-$100"        = "gray10",
      "-$100 to -$50" = "gray40",
      "-$50 to -$20"  = "gray70",
      "0 to -$20"     = "gray92"
    ),
    name     = "Avg Welfare Loss",
    na.value = "transparent",
    guide    = guide_legend(
      override.aes   = list(pattern = "none"),
      reverse        = FALSE,
      keyheight      = unit(5, "mm"),
      keywidth       = unit(5, "mm"),
      label.position = "right"
    )
  ) +
  
  scale_pattern_manual(
    values = c("No" = "none", "Yes" = "crosshatch"),
    name   = "Community Type",
    labels = c("No" = "Non-Disadvantaged", "Yes" = "Disadvantaged"),
    guide  = guide_legend(
      override.aes = list(
        fill            = c("white", "white"),
        color           = c("black", "black"),
        pattern_color   = c(NA, "black"),
        pattern_fill    = c(NA, NA),
        pattern_density = c(0.01, 0.4),
        pattern_spacing = c(0.03, 0.02),
        pattern_angle   = c(45, 45)
      ),
      keyheight      = unit(7, "mm"),
      keywidth       = unit(7, "mm"),
      label.position = "right"
    )
  ) +
  
  coord_sf(xlim = c(-161.5, -154.0), ylim = c(18.7, 22.5), expand = FALSE) +
  theme_minimal() +
  theme(
    panel.grid           = element_blank(),
    axis.text            = element_blank(),
    axis.title           = element_blank(),
    axis.ticks           = element_blank(),
    legend.position      = c(0.1, 0.15),
    legend.justification = c("left", "bottom"),
    legend.box           = "vertical",
    legend.background    = element_rect(fill = "white", color = NA),
    text                 = element_text(family = "Times New Roman"),
    plot.title           = element_text(family = "Times New Roman", face = "bold"),
    legend.title         = element_text(family = "Times New Roman",
                                        face = "bold", size = 14),
    legend.text          = element_text(family = "Times New Roman", size = 14)
  )

ggsave(
  filename = "images/hawaii_welfare_loss_map.png",
  plot     = hawaii_map,
  width    = 12,
  height   = 10,
  dpi      = 300,
  units    = "in",
  bg       = "white"
)
```

## Color Map

```{r}
#| cache: true


# 1. Get Hawaii Outlines
islands_outline <- states(cb = TRUE) %>%
  filter(STUSPS == "HI")

# 2. Get ZCTAs
zip_shapes <- zctas(cb = TRUE, year = 2020) %>%
  filter(str_starts(ZCTA5CE20, "967") | str_starts(ZCTA5CE20, "968"))

# 3. Zip averages
zip_averages <- final_comparison_master1 %>%
  filter(parkname == "Stay_Home") %>%
  group_by(zip) %>%
  summarise(
    loss_mdcev       = mean(loss_mdcev,        na.rm = TRUE),
    is_disadvantaged = ifelse(
      mean(is_disadvantaged_50, na.rm = TRUE) > 0.5, "Yes", "No"
    ),
    .groups = "drop"
  ) %>%
  mutate(zip = as.character(zip))

# 4. Join
map_data <- zip_shapes %>%
  inner_join(zip_averages, by = c("ZCTA5CE20" = "zip"))

# 5. Bins
map_data <- map_data %>%
  mutate(
    loss_mdcev = ifelse(is.na(loss_mdcev) | loss_mdcev > 0, 0, loss_mdcev),
    loss_bin   = cut(loss_mdcev,
      breaks = c(-Inf, -46.03, -30.09, -17.94, 0),
      labels = c(">-$100",
                 "-$100 to -$50",
                 "-$50 to -$20",
                 "0 to -$20"),
      include.lowest = TRUE,
      right          = TRUE
    )
  )

# 6. Island labels
island_labels <- tibble(
  name     = c("Oʻahu", "Maui", "Hawaiʻi", "Kauaʻi", "Molokaʻi", "Lānaʻi"),
  x_anchor = c(-157.98, -156.33, -155.55, -159.52, -157.02, -156.92),
  y_anchor = c( 21.47,   20.80,   19.60,   22.06,   21.13,   20.83),
  x_label  = c(-158.60, -155.10, -157.20, -160.50, -156.60, -157.60),
  y_label  = c( 20.90,   21.50,   19.60,   22.20,   21.60,   20.15)
) %>%
  mutate(
    shrink      = 0.4,
    x_seg_start = x_label + shrink * (x_anchor - x_label),
    y_seg_start = y_label + shrink * (y_anchor - y_label)
  )

# 7. Color map — white to dark red
hawaii_map_color <- ggplot() +
  geom_sf(data = islands_outline, fill = "gray95", color = "gray60", size = 0.4) +
  
# Non-DAC layer
geom_sf_pattern(
    data             = filter(map_data, is_disadvantaged == "No"),
    aes(fill = loss_bin, pattern = is_disadvantaged),
    pattern_color    = NA,
    pattern_fill     = "transparent",   # <-- change from NA
    pattern_density  = 0.01,
    color            = "white",
    size             = 0.1
) +

# DAC layer
geom_sf_pattern(
    data             = filter(map_data, is_disadvantaged == "Yes"),
    aes(fill = loss_bin, pattern = is_disadvantaged),
    pattern_color    = "black",
    pattern_fill     = "transparent",   # <-- change from NA
    pattern_density  = 0.4,
    pattern_spacing  = 0.03,
    pattern_angle    = 45,
    pattern_size     = 0.6,
    color            = "white",
    size             = 0.2
) +
  
  geom_segment(
    data      = island_labels,
    aes(x = x_seg_start, y = y_seg_start, xend = x_anchor, yend = y_anchor),
    color     = "gray30",
    linewidth = 0.3
  ) +
  
  geom_point(
    data  = island_labels,
    aes(x = x_anchor, y = y_anchor),
    color = "gray20",
    size  = 1.0
  ) +
  
  # White halo
  geom_text(
    data   = island_labels,
    aes(x = x_label, y = y_label, label = name),
    size   = 12, fontface = "bold", color = "white",
    family = "Times New Roman", hjust = 0.5, vjust = 0.5
  ) +
  # Actual text
  geom_text(
    data   = island_labels,
    aes(x = x_label, y = y_label, label = name),
    size   = 11, fontface = "bold", color = "gray10",
    family = "Times New Roman", hjust = 0.5, vjust = 0.5
  ) +
  
  annotation_north_arrow(
    location    = "bl",
    which_north = "true",
    height = unit(2.0, "cm"),
    width  = unit(1.5, "cm"),
    style  = north_arrow_fancy_orienteering(
      fill        = c("gray10", "white"),
      line_col    = "gray10",
      text_col    = "gray10",
      text_family = "Times New Roman",
      text_size   = 16
    ),
    pad_x = unit(1.0, "cm"),
    pad_y = unit(0.9, "cm")
  ) +
  
  annotation_scale(
    location      = "bl",
    unit_category = "imperial",
    text_family   = "Times New Roman",
    text_cex      = 1,
    style         = "bar",
    bar_cols      = c("gray10", "white"),
    line_col      = "gray10",
    text_col      = "gray10",
    pad_x = unit(1.0, "cm"),
    pad_y = unit(0.3, "cm")
  ) +
  
  # ── COLOR SCALE: white (low loss) to dark red (high loss) ─────────────────
  scale_fill_manual(
    values = c(
      ">-$100"        = "#67000d",   # darkest red   — highest loss
      "-$100 to -$50" = "#cb181d",   # medium red
      "-$50 to -$20"  = "#fc8d59",   # light orange-red
      "0 to -$20"     = "#fff5f0"    # near white    — lowest loss
    ),
    name     = "Avg Welfare Loss",
    na.value = "transparent",
    guide    = guide_legend(
      override.aes   = list(pattern = "none"),
      reverse        = FALSE,
      keyheight      = unit(6, "mm"),
      keywidth       = unit(6, "mm"),
      label.position = "right"
    )
  ) +
  
  scale_pattern_manual(
    values = c("No" = "none", "Yes" = "crosshatch"),
    name   = "Community Type",
    labels = c("No" = "Non-Disadvantaged", "Yes" = "Disadvantaged"),
    guide  = guide_legend(
      override.aes = list(
        fill            = c("white", "white"),
        color           = c("black", "black"),
        pattern_color   = c(NA, "black"),
        pattern_fill    = c(NA, NA),
        pattern_density = c(0.01, 0.4),
        pattern_spacing = c(0.03, 0.02),
        pattern_angle   = c(45, 45)
      ),
      keyheight      = unit(7, "mm"),
      keywidth       = unit(7, "mm"),
      label.position = "right"
    )
  ) +
  
  coord_sf(xlim = c(-161.5, -154.0), ylim = c(18.7, 22.5), expand = FALSE) +
  theme_minimal() +
  theme(
    panel.grid           = element_blank(),
    axis.text            = element_blank(),
    axis.title           = element_blank(),
    axis.ticks           = element_blank(),
    legend.position      = c(0.1, 0.15),
    legend.justification = c("left", "bottom"),
    legend.box           = "vertical",
    legend.background    = element_rect(fill = "white", color = NA),
    text                 = element_text(family = "Times New Roman"),
    plot.title           = element_text(family = "Times New Roman", face = "bold"),
    legend.title         = element_text(family = "Times New Roman",
                                        face = "bold", size = 14),
    legend.text          = element_text(family = "Times New Roman", size = 14)
  )

ggsave(
  filename = "images/hawaii_welfare_loss_map_color.png",
  plot     = hawaii_map_color,
  width    = 12,
  height   = 10,
  dpi      = 300,
  units    = "in",
  bg       = "white"
)




```

## Color RUM

```{r}
#| cache: true


# 1. Get Hawaii Outlines
islands_outline <- states(cb = TRUE) %>%
  filter(STUSPS == "HI")

# 2. Get ZCTAs
zip_shapes <- zctas(cb = TRUE, year = 2020) %>%
  filter(str_starts(ZCTA5CE20, "967") | str_starts(ZCTA5CE20, "968"))

plot_data <- final_comparison_master1 %>%
  group_by(zip, parkname, year,is_disadvantaged_50) %>%
  filter(parkname!="Stay_Home")%>%
  summarise(
    ObservedRUM = mean(loss_rum_observed, na.rm = TRUE),
    FrequencyRUM = mean(loss_rum_52wk, na.rm = TRUE),
    .groups = "drop"
  )


# 3. Zip averages
zip_averages <- plot_data %>%
  # Step 1: Sum welfare across all sites within each zip and year
  group_by(zip, year) %>%
  summarise(
    loss_rum_observed = sum(ObservedRUM, na.rm = TRUE),
    is_disadvantaged  = ifelse(
      mean(is_disadvantaged_50, na.rm = TRUE) > 0.5, "Yes", "No"
    ),
    .groups = "drop"
  ) %>%
  
  # Step 2: Average the yearly sums across all years
  group_by(zip) %>%
  summarise(
    loss_rum_observed = mean(loss_rum_observed, na.rm = TRUE),
    is_disadvantaged  = ifelse(          # <-- uses is_disadvantaged not is_disadvantaged_50
      mean(is_disadvantaged == "Yes", na.rm = TRUE) > 0.5, "Yes", "No"
    ),
    .groups = "drop"
  ) %>%
  mutate(zip = as.character(zip))

# 4. Join
map_data <- zip_shapes %>%
  inner_join(zip_averages, by = c("ZCTA5CE20" = "zip"))

# 5. Bins
map_data <- map_data %>%
  mutate(
    loss_rum_observed = ifelse(is.na(loss_rum_observed) | loss_rum_observed > 0, 0, loss_rum_observed),
    loss_bin   = cut(loss_rum_observed,
      breaks = c(-Inf, -400, -300, -200, -100),
      labels = c(">-$400",
                 "-$400 to -$300",
                 "-$300 to -$200",
                 "-$100 to -$200"),
      include.lowest = TRUE,
      right          = TRUE
    )
  )

# 6. Island labels
island_labels <- tibble(
  name     = c("Oʻahu", "Maui", "Hawaiʻi", "Kauaʻi", "Molokaʻi", "Lānaʻi"),
  x_anchor = c(-157.98, -156.33, -155.55, -159.52, -157.02, -156.92),
  y_anchor = c( 21.47,   20.80,   19.60,   22.06,   21.13,   20.83),
  x_label  = c(-158.60, -155.10, -157.20, -160.50, -156.60, -157.60),
  y_label  = c( 20.90,   21.50,   19.60,   22.20,   21.60,   20.15)
) %>%
  mutate(
    shrink      = 0.4,
    x_seg_start = x_label + shrink * (x_anchor - x_label),
    y_seg_start = y_label + shrink * (y_anchor - y_label)
  )

# 7. Color map — white to dark red
hawaii_map_color <- ggplot() +
  geom_sf(data = islands_outline, fill = "gray95", color = "gray60", size = 0.4) +
  
# Non-DAC layer
geom_sf_pattern(
    data             = filter(map_data, is_disadvantaged == "No"),
    aes(fill = loss_bin, pattern = is_disadvantaged),
    pattern_color    = NA,
    pattern_fill     = "transparent",   # <-- change from NA
    pattern_density  = 0.01,
    color            = "white",
    size             = 0.1
) +

# DAC layer
geom_sf_pattern(
    data             = filter(map_data, is_disadvantaged == "Yes"),
    aes(fill = loss_bin, pattern = is_disadvantaged),
    pattern_color    = "black",
    pattern_fill     = "transparent",   # <-- change from NA
    pattern_density  = 0.4,
    pattern_spacing  = 0.03,
    pattern_angle    = 45,
    pattern_size     = 0.6,
    color            = "white",
    size             = 0.2
) +
  
  geom_segment(
    data      = island_labels,
    aes(x = x_seg_start, y = y_seg_start, xend = x_anchor, yend = y_anchor),
    color     = "gray30",
    linewidth = 0.3
  ) +
  
  geom_point(
    data  = island_labels,
    aes(x = x_anchor, y = y_anchor),
    color = "gray20",
    size  = 1.0
  ) +
  
  # White halo
  geom_text(
    data   = island_labels,
    aes(x = x_label, y = y_label, label = name),
    size   = 12, fontface = "bold", color = "white",
    family = "Times New Roman", hjust = 0.5, vjust = 0.5
  ) +
  # Actual text
  geom_text(
    data   = island_labels,
    aes(x = x_label, y = y_label, label = name),
    size   = 11, fontface = "bold", color = "gray10",
    family = "Times New Roman", hjust = 0.5, vjust = 0.5
  ) +
  
  annotation_north_arrow(
    location    = "bl",
    which_north = "true",
    height = unit(2.0, "cm"),
    width  = unit(1.5, "cm"),
    style  = north_arrow_fancy_orienteering(
      fill        = c("gray10", "white"),
      line_col    = "gray10",
      text_col    = "gray10",
      text_family = "Times New Roman",
      text_size   = 16
    ),
    pad_x = unit(1.0, "cm"),
    pad_y = unit(0.9, "cm")
  ) +
  
  annotation_scale(
    location      = "bl",
    unit_category = "imperial",
    text_family   = "Times New Roman",
    text_cex      = 1,
    style         = "bar",
    bar_cols      = c("gray10", "white"),
    line_col      = "gray10",
    text_col      = "gray10",
    pad_x = unit(1.0, "cm"),
    pad_y = unit(0.3, "cm")
  ) +
  
  # ── COLOR SCALE: white (low loss) to dark red (high loss) ─────────────────
  scale_fill_manual(
    values = c(
      ">-$400"        = "#67000d",   # darkest red   — highest loss
      "-$400 to -$300" = "#cb181d",   # medium red
      "-$300 to -$200"  = "#fc8d59",   # light orange-red
      "-$100 to -$200"     = "#fff5f0"    # near white    — lowest loss
    ),
    name     = "Avg Welfare Loss",
    na.value = "transparent",
    guide    = guide_legend(
      override.aes   = list(pattern = "none"),
      reverse        = FALSE,
      keyheight      = unit(6, "mm"),
      keywidth       = unit(6, "mm"),
      label.position = "right"
    )
  ) +
  
  scale_pattern_manual(
    values = c("No" = "none", "Yes" = "crosshatch"),
    name   = "Community Type",
    labels = c("No" = "Non-Disadvantaged", "Yes" = "Disadvantaged"),
    guide  = guide_legend(
      override.aes = list(
        fill            = c("white", "white"),
        color           = c("black", "black"),
        pattern_color   = c(NA, "black"),
        pattern_fill    = c(NA, NA),
        pattern_density = c(0.01, 0.4),
        pattern_spacing = c(0.03, 0.02),
        pattern_angle   = c(45, 45)
      ),
      keyheight      = unit(7, "mm"),
      keywidth       = unit(7, "mm"),
      label.position = "right"
    )
  ) +
  
  coord_sf(xlim = c(-161.5, -154.0), ylim = c(18.7, 22.5), expand = FALSE) +
  theme_minimal() +
  theme(
    panel.grid           = element_blank(),
    axis.text            = element_blank(),
    axis.title           = element_blank(),
    axis.ticks           = element_blank(),
    legend.position      = c(0.1, 0.15),
    legend.justification = c("left", "bottom"),
    legend.box           = "vertical",
    legend.background    = element_rect(fill = "white", color = NA),
    text                 = element_text(family = "Times New Roman"),
    plot.title           = element_text(family = "Times New Roman", face = "bold"),
    legend.title         = element_text(family = "Times New Roman",
                                        face = "bold", size = 14),
    legend.text          = element_text(family = "Times New Roman", size = 14)
  )

ggsave(
  filename = "images/hawaii_welfare_loss_map_colorRUM.png",
  plot     = hawaii_map_color,
  width    = 12,
  height   = 10,
  dpi      = 300,
  units    = "in",
  bg       = "white"
)



```

```{r}
#| cache: true
print(hawaii_map_color)


print(hawaii_map)
```

# Sensitivity Test

```{r}
#| cache: true


get_price_coef <- function(mle, yr, model_name) {
  
  tryCatch({
    # mlogit coefficients
    beta  <- coef(mle)["price"]
    
    # variance-covariance matrix
    se    <- sqrt(vcov(mle)["price", "price"])
    tibble(
      year       = yr,
      model      = model_name,
      beta_price = beta,
      se_price   = se,
      ci_low     = beta - 1.96 * se,
      ci_high    = beta + 1.96 * se
    )
  }, error = function(e) {
    message(paste("Failed for", model_name, yr, ":", e$message))
    tibble(
      year       = yr,
      model      = model_name,
      beta_price = NA_real_,
      se_price   = NA_real_,
      ci_low     = NA_real_,
      ci_high    = NA_real_
    )
  })
}



# PRice Stability
price_stability_all <- map_df(2018:2023, function(yr) {
  
  obs_key  <- paste0("Year_", yr)   
  freq_key <- paste0("52wk_", yr)   
  
  obs_mle  <- model_list_obs[[obs_key]]
  freq_mle <- model_list_52[[freq_key]]
  
  bind_rows(
    if (!is.null(obs_mle))
      get_price_coef(obs_mle,  yr, "Observed RUM"),
    if (!is.null(freq_mle))
      get_price_coef(freq_mle, yr, "Frequency RUM")
  )
})

print(price_stability_all)
year_pairs <- combn(2018:2023, 2, simplify = FALSE)
# Pairwise welfare stability — all three models 
test_year_pairs_model <- function(yr1, yr2, loss_col) {
  
  d1 <- final_comparison_master %>%
    filter(year == yr1, parkname != "Stay_Home") %>%
    pull({{ loss_col }})
  
  d2 <- final_comparison_master %>%
    filter(year == yr2, parkname != "Stay_Home") %>%
    pull({{ loss_col }})
  
  t_result <- t.test(d1, d2)
  
  tibble(
    year1   = yr1,
    year2   = yr2,
    mean1   = round(mean(d1, na.rm = TRUE), 2),
    mean2   = round(mean(d2, na.rm = TRUE), 2),
    t_stat  = round(t_result$statistic, 3),
    p_value = round(t_result$p.value,   4),
    stable  = t_result$p.value > 0.05
  )
}

stability_all <- bind_rows(
  map_df(year_pairs, ~test_year_pairs_model(.x[1], .x[2], loss_mdcev)) %>%
    mutate(model = "MDCEV"),
  map_df(year_pairs, ~test_year_pairs_model(.x[1], .x[2], loss_rum_observed)) %>%
    mutate(model = "Observed RUM"),
  map_df(year_pairs, ~test_year_pairs_model(.x[1], .x[2], loss_rum_52wk)) %>%
    mutate(model = "Frequency RUM")
)


# Test  Convergent validity across models — do they agree on rankings? 
site_welfare <- final_comparison_master %>%
    filter(parkname != "Stay_Home") %>%
    group_by(parkname) %>%
    summarise(
        mean_mdcev = mean(loss_mdcev,        na.rm = TRUE),
        mean_obs   = mean(loss_rum_observed, na.rm = TRUE),
        mean_52wk  = mean(loss_rum_52wk,     na.rm = TRUE),
        .groups    = "drop"
    )

# Spearman rank correlations across models
cross_model_cor <- site_welfare %>%
    select(mean_mdcev, mean_obs, mean_52wk) %>%
    cor(method = "spearman", use = "complete.obs")

colnames(cross_model_cor) <- c("MDCEV", "Observed RUM", "Frequency RUM")
rownames(cross_model_cor) <- c("MDCEV", "Observed RUM", "Frequency RUM")

print(round(cross_model_cor, 3))

# Transfer error across models 
# Ji et al. transfer error as:
# TE = |predicted - actual| / |actual| × 100

cross_model_te <- site_welfare %>%
    mutate(
        # Use MDCEV to predict each RUM
        te_mdcev_to_obs  = abs((mean_mdcev - mean_obs)   / mean_obs)  * 100,
        te_mdcev_to_52wk = abs((mean_mdcev - mean_52wk)  / mean_52wk) * 100,
        # Use observed RUM to predict frequency RUM
        te_obs_to_52wk   = abs((mean_obs   - mean_52wk)  / mean_52wk) * 100
    ) %>%
    summarise(
        # Mean transfer error across sites
        mean_te_mdcev_obs  = round(mean(te_mdcev_to_obs,  na.rm = TRUE), 1),
        mean_te_mdcev_52wk = round(mean(te_mdcev_to_52wk, na.rm = TRUE), 1),
        mean_te_obs_52wk   = round(mean(te_obs_to_52wk,   na.rm = TRUE), 1),
        # Median transfer error
        med_te_mdcev_obs   = round(median(te_mdcev_to_obs,  na.rm = TRUE), 1),
        med_te_mdcev_52wk  = round(median(te_mdcev_to_52wk, na.rm = TRUE), 1),
        med_te_obs_52wk    = round(median(te_obs_to_52wk,   na.rm = TRUE), 1)
    )

print(cross_model_te)

# Equality tests — are model welfare estimates significantly different from each other? 
# (Ji et al. Section 4)

# Paired t-test: MDCEV vs Observed RUM
t_mdcev_obs <- t.test(
    final_comparison_master$loss_mdcev,
    final_comparison_master$loss_rum_observed,
    paired = TRUE
)

# Paired t-test: MDCEV vs Frequency RUM
t_mdcev_52wk <- t.test(
    final_comparison_master$loss_mdcev,
    final_comparison_master$loss_rum_52wk,
    paired = TRUE
)

# Paired t-test: Observed RUM vs Frequency RUM
t_obs_52wk <- t.test(
    final_comparison_master$loss_rum_observed,
    final_comparison_master$loss_rum_52wk,
    paired = TRUE
)

equality_tests <- tibble(
    comparison     = c("MDCEV vs Observed RUM",
                       "MDCEV vs Frequency RUM",
                       "Observed RUM vs Frequency RUM"),
    mean_diff      = c(round(t_mdcev_obs$estimate,  2),
                       round(t_mdcev_52wk$estimate, 2),
                       round(t_obs_52wk$estimate,   2)),
    t_stat         = c(round(t_mdcev_obs$statistic,  3),
                       round(t_mdcev_52wk$statistic, 3),
                       round(t_obs_52wk$statistic,   3)),
    p_value        = c(round(t_mdcev_obs$p.value,  4),
                       round(t_mdcev_52wk$p.value, 4),
                       round(t_obs_52wk$p.value,   4)),
    significantly_different = c(
        t_mdcev_obs$p.value  < 0.05,
        t_mdcev_52wk$p.value < 0.05,
        t_obs_52wk$p.value   < 0.05
    )
)

print(equality_tests)

# Transfer error by year pair AND model pair 
# Ji et al.'s most comprehensive test — applies both dimensions

te_full <- final_comparison_master %>%
    filter(parkname != "Stay_Home", year!=2020, year!=2021) %>%
    group_by(parkname, year) %>%
    summarise(
        mean_mdcev = mean(loss_mdcev,        na.rm = TRUE),
        mean_obs   = mean(loss_rum_observed, na.rm = TRUE),
        mean_52wk  = mean(loss_rum_52wk,     na.rm = TRUE),
        .groups    = "drop"
    ) %>%
    full_join(
        final_comparison_master %>%
        
            filter(parkname != "Stay_Home") %>%
            group_by(parkname, year) %>%
            summarise(
                pred_mdcev = mean(loss_mdcev,        na.rm = TRUE),
                pred_obs   = mean(loss_rum_observed, na.rm = TRUE),
                pred_52wk  = mean(loss_rum_52wk,     na.rm = TRUE),
                .groups    = "drop"
            ) %>%
            rename(year2 = year),
        by = "parkname"
    ) %>%
    filter(year != year2) %>%
    mutate(
        # Within model temporal transfer error
        te_mdcev_time = abs((pred_mdcev - mean_mdcev) / mean_mdcev) * 100,
        te_obs_time   = abs((pred_obs   - mean_obs)   / mean_obs)   * 100,
        te_52wk_time  = abs((pred_52wk  - mean_52wk)  / mean_52wk)  * 100,
        # Cross model transfer error (use MDCEV from year1 to predict RUM in year2)
        te_mdcev_to_obs_cross  = abs((pred_mdcev - mean_obs)   / mean_obs)   * 100,
        te_mdcev_to_52wk_cross = abs((pred_mdcev - mean_52wk)  / mean_52wk)  * 100
    ) %>%
    group_by(year, year2) %>%
    summarise(
        te_mdcev       = round(mean(te_mdcev_time,na.rm = TRUE), 1),
        te_obs         = round(mean(te_obs_time,na.rm = TRUE), 1),
        te_52wk        = round(mean(te_52wk_time,na.rm = TRUE), 1),
        te_cross_mdcev_obs  = round(mean(te_mdcev_to_obs_cross,  na.rm = TRUE), 1),
        te_cross_mdcev_52wk = round(mean(te_mdcev_to_52wk_cross, na.rm = TRUE), 1),
        .groups        = "drop"
    )

print(te_full, n = 30)

# Summary table matching Ji et al. Table format
summary_ji <- tibble(
    Test               = c(
        "Mean Spearman Rank Correlation",
        "Mean Transfer Error — MDCEV",
        "Mean Transfer Error — Observed RUM",
        "Mean Transfer Error — Frequency RUM",
        "% Year Pairs Stable — MDCEV",
        "% Year Pairs Stable — Observed RUM",
        "% Year Pairs Stable — Frequency RUM"
    ),
    Value = c(
        round(mean(c(cross_model_cor[1,2],
                     cross_model_cor[1,3],
                     cross_model_cor[2,3])), 3),
        round(mean(te_full$te_mdcev,  na.rm = TRUE), 1),
        round(mean(te_full$te_obs,    na.rm = TRUE), 1),
        round(mean(te_full$te_52wk,   na.rm = TRUE), 1),
        round(mean(stability_all$stable[stability_all$model == "MDCEV"])         * 100, 1),
        round(mean(stability_all$stable[stability_all$model == "Observed RUM"])  * 100, 1),
        round(mean(stability_all$stable[stability_all$model == "Frequency RUM"]) * 100, 1)
    )
)

print(summary_ji)


```

# What Years?

Given those % pair stable years. What Years are stable across models? Is this just a covid issue? It does kind of look like it because its unstable for comparisons to 2020 then becomes more stable after 2021/2022 later comparsions.

Interesting MDCEV 2019 to years excluding 2020 stable... thats were the 66% comes from.

```{r}
#| cache: true

site_welfare <- final_comparison_master %>%
    filter(parkname != "Stay_Home") %>%
    group_by(parkname) %>%
    summarise(
        mean_mdcev = mean(loss_mdcev,        na.rm = TRUE),
        mean_obs   = mean(loss_rum_observed, na.rm = TRUE),
        mean_52wk  = mean(loss_rum_52wk,     na.rm = TRUE),
        .groups    = "drop"
    )
# Which year pairs are NOT significantly different (stable) by model 
stability_all %>%
  filter(stable == TRUE) %>%
  select(model, year1, year2, mean1, mean2, t_stat, p_value) %>%
  arrange(model, year1, year2) %>%
  print(n = 50)

# ── Summary count of stable pairs per model 
stability_all %>%
  group_by(model) %>%
  summarise(
    n_pairs        = n(),
    n_stable       = sum(stable),
    n_unstable     = sum(!stable),
    pct_stable     = round(mean(stable) * 100, 1),
    stable_pairs   = paste(
      paste0(year1[stable], "–", year2[stable]),
      collapse = ", "
    ),
    .groups = "drop"
  ) %>%
  print()

# ── Full table showing stable vs unstable clearly 
stability_all %>%
  mutate(
    year_pair  = paste0(year1, "–", year2),
    stability  = ifelse(stable, "Stable", "Unstable"),
    sig_stars  = case_when(
      p_value < 0.001 ~ "***",
      p_value < 0.01  ~ "**",
      p_value < 0.05  ~ "*",
      TRUE            ~ "n.s."
    )
  ) %>%
  select(model, year_pair, mean1, mean2, t_stat, p_value, sig_stars, stability) %>%
  arrange(model, year_pair) %>%
  print(n = 50)


```

# By park site

```{r}
#| cache: true
welfare_by_year <- final_comparison_master %>%
  filter(parkname != "Stay_Home") %>%
  group_by(parkname, year) %>%
  summarise(
    mean_mdcev = mean(loss_mdcev,        na.rm = TRUE),
    mean_obs   = mean(loss_rum_observed, na.rm = TRUE),
    mean_52wk  = mean(loss_rum_52wk,     na.rm = TRUE),
    .groups    = "drop"
  )

sequential_stability <- welfare_by_year %>%
  arrange(parkname, year) %>%
  group_by(parkname) %>%
  mutate(
    pct_change_mdcev = (mean_mdcev - lag(mean_mdcev)) / abs(lag(mean_mdcev)) * 100,
    pct_change_obs   = (mean_obs   - lag(mean_obs))   / abs(lag(mean_obs))   * 100,
    pct_change_52wk  = (mean_52wk  - lag(mean_52wk))  / abs(lag(mean_52wk))  * 100,
    year_pair        = paste0(lag(year), "→", year)
  ) %>%
  filter(!is.na(pct_change_mdcev)) %>%
  ungroup()

#Sequential year agfter year examination
sequential_pairs <- list(
  c(2018, 2019), c(2019, 2020), c(2020, 2021),
  c(2021, 2022), c(2022, 2023)
)

# Pct change by site 
pct_change_site <- welfare_by_year %>%
  arrange(parkname, year) %>%
  group_by(parkname) %>%
  mutate(
    pct_change_mdcev = (mean_mdcev - lag(mean_mdcev)) / abs(lag(mean_mdcev)) * 100,
    pct_change_obs   = (mean_obs   - lag(mean_obs))   / abs(lag(mean_obs))   * 100,
    pct_change_52wk  = (mean_52wk  - lag(mean_52wk))  / abs(lag(mean_52wk))  * 100
  ) %>%
  filter(!is.na(pct_change_mdcev))

# Average pct change across all sites and year pairs
pct_change_summary <- pct_change_site %>%
  group_by(parkname) %>%
  summarise(
    model = "MDCEV",
    mean_pct_change   = round(mean(abs(pct_change_mdcev), na.rm = TRUE), 1),
    median_pct_change = round(median(abs(pct_change_mdcev), na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  bind_rows(
    pct_change_site %>%
      summarise(
        model             = "Observed RUM",
        mean_pct_change   = round(mean(abs(pct_change_obs),  na.rm = TRUE), 1),
        median_pct_change = round(median(abs(pct_change_obs), na.rm = TRUE), 1)
      ),
    pct_change_site %>%
      summarise(
        model             = "Frequency RUM",
        mean_pct_change   = round(mean(abs(pct_change_52wk), na.rm = TRUE), 1),
        median_pct_change = round(median(abs(pct_change_52wk), na.rm = TRUE), 1)
      )
  )

print(pct_change_summary)

# Site level pct change table
pct_change_site %>%
  group_by(parkname) %>%
  summarise(
    mean_pct_mdcev = round(mean(abs(pct_change_mdcev), na.rm = TRUE), 1),
    mean_pct_obs   = round(mean(abs(pct_change_obs),   na.rm = TRUE), 1),
    mean_pct_52wk  = round(mean(abs(pct_change_52wk),  na.rm = TRUE), 1),
    .groups        = "drop"
  ) %>%
  arrange(mean_pct_mdcev) %>%
  print(n = 22)
```

This looks at the linear welfare stability, which actually shows that MDCEV is less like we mention in paper but if we look pairwise comarision (i.e. which gives us aa really clear covid examination) across all years broadly it does have more stable. That is also interesting...

```{r}
#| cache: true

site_change_ci <- sequential_stability %>%
  pivot_longer(
    cols      = c(pct_change_mdcev, pct_change_obs, pct_change_52wk),
    names_to  = "model",
    values_to = "pct_change"
  ) %>%
  mutate(
    model = case_when(
      model == "pct_change_mdcev" ~ "MDCEV",
      model == "pct_change_obs"   ~ "Observed RUM",
      model == "pct_change_52wk"  ~ "Frequency RUM"
    ),
    model = factor(model, levels = c("MDCEV", "Observed RUM", "Frequency RUM"))
  ) %>%
  group_by(model, year_pair) %>%
  summarise(
    mean_change = mean(pct_change,  na.rm = TRUE),
    sd_change   = sd(pct_change,    na.rm = TRUE),
    n           = n(),
    se          = sd_change / sqrt(n),
    ci_low      = mean_change - 1.96 * se,
    ci_high     = mean_change + 1.96 * se,
    .groups     = "drop"
  )

p_line <- ggplot(site_change_ci,
       aes(x     = year_pair,
           y     = mean_change,
           color = model,
           group = model)) +
  
  geom_hline(yintercept =  0,  linetype = "dashed",
             color = "gray40", linewidth = 0.4) +
  geom_hline(yintercept =  30, linetype = "dotted",
             color = "gray60", linewidth = 0.4) +
  geom_hline(yintercept = -30, linetype = "dotted",
             color = "gray60", linewidth = 0.4) +

  
  geom_ribbon(
    aes(ymin = ci_low, ymax = ci_high, fill = model),
    alpha = 0.15, color = NA
  ) +
  geom_line(linewidth = 0.9) +
  geom_point(size = 2.5) +
  
  scale_color_manual(
    values = c("MDCEV"         = "gray10",
               "Observed RUM"  = "gray50",
               "Frequency RUM" = "gray75"),
    name = "Model"
  ) +
  scale_fill_manual(
    values = c("MDCEV"         = "gray10",
               "Observed RUM"  = "gray50",
               "Frequency RUM" = "gray75"),
    guide  = "none"
  ) +
  scale_x_discrete(name = "Year Pair") +    # <-- fix here
  scale_y_continuous(
    name   = "Mean % Change Across Sites",
    labels = scales::percent_format(scale = 1)
  ) +
  labs(
    title    = "Sequential Welfare Stability Across Models",
    subtitle = "Mean % change across all sites | 95% CI "
  ) +
  theme_minimal(base_family = "Times New Roman") +
  theme(
    panel.grid.minor   = element_blank(),
    panel.grid.major.x = element_blank(),
    axis.text.x  = element_text(size = 11, angle = 20, hjust = 1),
    axis.text.y  = element_text(size = 11),
    legend.position  = "bottom",
    legend.title = element_text(face = "bold", size = 12),
    legend.text  = element_text(size = 11),
    plot.title   = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 11, color = "gray40")
  )

print(p_line)


```