Code
| Row | rownames | Ozone | Solar | Wind | Temp | Month | Day |
|---|---|---|---|---|---|---|---|
| Int64 | Int64? | Int64? | Float64 | Int64 | Int64 | Int64 | |
| 1 | 2 | 36 | 118 | 8.0 | 72 | 5 | 2 |
| 2 | 3 | 12 | 149 | 12.6 | 74 | 5 | 3 |
| 3 | 4 | 18 | 313 | 11.5 | 62 | 5 | 4 |
| 4 | 5 | missing | missing | 14.3 | 56 | 5 | 5 |
Lecture 35
April 24, 2026
Imputing one value for a missing datum cannot be correct in general, because we don’t know what value to impute with certainty (if we did, it wouldn’t be missing).
—- Rubin (1987)
In both cases, important to include as much information as possible in the imputation model!
No need to be limited to linear regression!
| Row | rownames | Ozone | Solar | Wind | Temp | Month | Day |
|---|---|---|---|---|---|---|---|
| Int64 | Int64? | Int64? | Float64 | Int64 | Int64 | Int64 | |
| 1 | 2 | 36 | 118 | 8.0 | 72 | 5 | 2 |
| 2 | 3 | 12 | 149 | 12.6 | 74 | 5 | 3 |
| 3 | 4 | 18 | 313 | 11.5 | 62 | 5 | 4 |
| 4 | 5 | missing | missing | 14.3 | 56 | 5 | 5 |
Figure 2: Air quality dataset.
p1 = scatter(dat_ozone_complete.Temp, dat_ozone_complete.Wind, xlabel = "Temperature (°C)", ylabel="Wind (mph)", markersize=5, color=:blue, label="Not Missing")
scatter!(dat_ozone_missing.Temp, dat_ozone_missing.Wind, markersize=5, color=:orange, label="Missing")
plot!(size=(600, 450))
p2 = scatter(dat_ozone_complete.Month, dat_ozone_complete.Day, xlabel = "Month", ylabel="Day", markersize=5, color=:blue, label="Not Missing")
scatter!(dat_ozone_missing.Month, dat_ozone_missing.Day, markersize=5, color=:orange, label="Missing")
plot!(size=(600, 450))
display(p1)
display(p2)Figure 4: Air quality dataset.
p1 = scatter(dat_solar_complete.Temp, dat_solar_complete.Wind, xlabel = "Temperature (°C)", ylabel="Wind (mph)", markersize=5, color=:blue, label="Not Missing")
scatter!(dat_solar_missing.Temp, dat_solar_missing.Wind, markersize=5, color=:orange, label="Missing")
plot!(size=(600, 450))
p2 = scatter(dat_solar_complete.Month, dat_solar_complete.Day, xlabel = "Month", ylabel="Day", markersize=5, color=:blue, label="Not Missing")
scatter!(dat_solar_missing.Month, dat_solar_missing.Day, markersize=5, color=:orange, label="Missing")
plot!(size=(600, 450))
display(p1)
display(p2)Impute using available data:
\[ \begin{align*} \text{Ozone} &\sim f(\text{Wind}, \text{Temp}, \text{Month}, \text{Day}) \\ \text{Solar.R} &\sim g(\text{Wind}, \text{Temp}, \text{Month}, \text{Day}) \\ \end{align*} \]
# bootstrap linear models and get predictions
nboot = 100
function impute_bootstrap_model(dat_complete, dat_missing, model_formula)
idx = sample(1:nrow(dat_complete), nrow(dat_complete), replace=true)
dat_boot = dat_complete[idx, :]
mod = lm(model_formula, dat_boot)
return mod
end
function impute_predict_regression(dat_complete, dat_missing, nboot, model_formula)
impute_out = zeros(nrow(dat_missing), nboot)
for i = 1:nboot
mod = impute_bootstrap_model(dat_complete, dat_missing, model_formula)
impute_out[:, i] = predict(mod, dat_missing) .+ rand(Normal(0, GLM.dispersion(mod.model)), size(impute_out)[1])
end
return impute_out
end
## model for solar radiation
impute_solar = impute_predict_regression(dat_solar_complete, dat_solar_missing, nboot, @formula(Solar ~ Wind + Temp + Month + Day))
## model for ozone
impute_ozone = impute_predict_regression(dat_ozone_complete, dat_ozone_missing, nboot, @formula(Ozone ~ Wind + Temp + Month + Day))
# impute values into the complete-case dataset and plot
function impute_variables(dat, impute_ozone, impute_solar)
impute = deepcopy(dat)
impute[ismissing.(impute.Ozone), :Ozone] = round.(impute_ozone[:, 1]; digits=0)
impute[ismissing.(impute.Solar), :Solar] = round.(impute_solar[:, 1]; digits=0)
return impute
end
impute1 = impute_variables(dat, impute_ozone[:, 1], impute_solar[:, 1])
p1 = scatter(impute1.Solar[.!(impute1.Miss_Ozone) .& .!(impute1.Miss_Solar)], impute1.Ozone[.!(impute1.Miss_Ozone) .& .!(impute1.Miss_Solar)], color=:blue, markersize=5, xlabel="Solar Radiation (lang)", ylabel="Ozone (ppb)", label="Observed")
scatter!(impute1.Solar[impute1.Miss_Ozone .| impute1.Miss_Solar], impute1.Ozone[impute1.Miss_Ozone .| impute1.Miss_Solar], color=:orange, markersize=5, label="Imputed")
plot!(size=(600, 450))
impute2 = impute_variables(dat, impute_ozone[:, 2], impute_solar[:, 2])
p2 = scatter(impute2.Solar[.!(impute2.Miss_Ozone) .& .!(impute2.Miss_Solar)], impute2.Ozone[.!(impute2.Miss_Ozone) .& .!(impute2.Miss_Solar)], color=:blue, markersize=5, xlabel="Solar Radiation (lang)", ylabel="Ozone (ppb)", label="Observed")
scatter!(impute2.Solar[impute2.Miss_Ozone .| impute2.Miss_Solar], impute2.Ozone[impute2.Miss_Ozone .| impute2.Miss_Solar], color=:orange, markersize=5, label="Imputed")
plot!(size=(600, 450))
display(p1)
display(p2)p1 = plot(dat.rownames, dat.Ozone, lw=3, color=:blue, label="Observations", xlabel="Day Number", ylabel="Ozone (ppb)")
for i = 1:nrow(dat_ozone_missing)
label = i == 1 ? "Imputations" : false
boxplot!(p1, [dat_ozone_missing[i, :rownames]], impute_ozone[i, :], color=:orange, label=label)
end
plot!(size=(1200, 500))
display(p1)Figure 7: Air quality dataset.
p2 = plot(dat.rownames, dat.Solar, lw=3, color=:blue, label="Observations", xlabel="Day Number", ylabel="Solar (lang)")
for i = 1:nrow(dat_solar_missing)
label = i == 1 ? "Imputations" : false
boxplot!(p2, [dat_solar_missing[i, :rownames]], impute_solar[i, :], color=:orange, label=label)
end
plot!(size=(1200, 500))
display(p2)Figure 8: Air quality dataset.
β_boot = zeros(nboot)
σ_boot = zeros(nboot)
for i = 1:nboot
dat_boot = impute_variables(dat, impute_ozone[:, i], impute_solar[:, i])
model_boot = lm(@formula(Ozone ~ Solar), dat_boot)
β_boot[i] = coef(model_boot)[2]
σ_boot[i] = stderror(model_boot.model)[2]
end
β_est = mean(β_boot)
σ_est = sqrt(mean(σ_boot.^2) + (1 + 1/nboot) * var(β_boot))
# also get complete case estimates for later
cc_model = lm(@formula(Ozone ~ Solar), dropmissing(dat))
β_cc = coef(cc_model)[2]
σ_cc = stderror(cc_model.model)[2]
histogram(β_boot, xlabel=L"$\beta$", ylabel="Count", label=false)
vline!([β_cc], color=:red, label="Complete-Case")
plot!(size=(500, 450))Imputed:
Complete Case:
# bootstrap linear models and get predictions
nboot = 100 # number of bootstrap samples for parameter variabiity
k = 5 # number of nearest-neighbors to sample from
function impute_pmm(dat_complete, dat_missing, nboot, nneighbors, model_formula, target_name)
impute = zeros(nrow(dat_missing), nboot)
candidates = zeros(nrow(dat_missing), nboot, nneighbors)
for i = 1:nboot
mod = impute_bootstrap_model(dat_complete, dat_missing, model_formula)
ŷ = predict(mod, dat_missing) # get predicted value for missing data
y = predict(mod, dat_complete)
for j = 1:nrow(dat_missing)
d = abs.(y .- ŷ[j])
sort_idx = sortperm(d)
candidates[j, i, 1:nneighbors] = dat_complete[sort_idx[1:nneighbors], target_name]
impute[j, i] = sample(candidates[j, i, :])
end
end
return impute
end
impute_ozone_pmm = impute_pmm(dat_ozone_complete, dat_ozone_missing, 100, 5, @formula(Ozone ~ Wind + Temp + Month + Day), Symbol("Ozone"))
impute_solar_pmm = impute_pmm(dat_solar_complete, dat_solar_missing, 100, 5, @formula(Solar ~ Wind + Temp + Month + Day), Symbol("Solar"))
function impute_variables(dat, impute_ozone, impute_solar)
impute = deepcopy(dat)
impute[ismissing.(impute.Ozone), :Ozone] = round.(impute_ozone[:, 1]; digits=0)
impute[ismissing.(impute.Solar), :Solar] = round.(impute_solar[:, 1]; digits=0)
return impute
end
impute1 = impute_variables(dat, impute_ozone_pmm[:, 1], impute_solar_pmm[:, 1])
p1 = scatter(impute1.Solar[.!(impute1.Miss_Ozone) .& .!(impute1.Miss_Solar)], impute1.Ozone[.!(impute1.Miss_Ozone) .& .!(impute1.Miss_Solar)], color=:blue, markersize=5, xlabel=L"Solar Radiation (W/m$^2$)", ylabel="Ozone (ppb)", label="Observed")
scatter!(impute1.Solar[impute1.Miss_Ozone .| impute1.Miss_Solar], impute1.Ozone[impute1.Miss_Ozone .| impute1.Miss_Solar], color=:orange, markersize=5, label="Imputed")
plot!(size=(600, 450))
impute2 = impute_variables(dat, impute_ozone_pmm[:, 2], impute_solar_pmm[:, 2])
p2 = scatter(impute2.Solar[.!(impute2.Miss_Ozone) .& .!(impute2.Miss_Solar)], impute2.Ozone[.!(impute2.Miss_Ozone) .& .!(impute2.Miss_Solar)], color=:blue, markersize=5, xlabel=L"Solar Radiation (W/m$^2$)", ylabel="Ozone (ppb)", label="Observed")
scatter!(impute2.Solar[impute2.Miss_Ozone .| impute2.Miss_Solar], impute2.Ozone[impute2.Miss_Ozone .| impute2.Miss_Solar], color=:orange, markersize=5, label="Imputed")
plot!(size=(600, 450))
display(p1)
display(p2)p1mm = plot(dat.rownames, dat.Ozone, lw=3, color=:blue, label="Observations", xlabel="Day Number", ylabel="Ozone (ppb)")
for i = 1:nrow(dat_ozone_missing)
label = i == 1 ? "Imputations" : false
boxplot!(p1mm, [dat_ozone_missing[i, :rownames]], impute_ozone_pmm[i, :], color=:orange, label=label)
end
plot!(size=(1200, 500))
display(p1mm)Figure 11: Air quality dataset.
p2mm = plot(dat.rownames, dat.Solar, lw=3, color=:blue, label="Observations", xlabel="Day Number", ylabel=L"Solar (W/m$^2$)")
for i = 1:nrow(dat_solar_missing)
label = i == 1 ? "Imputations" : false
boxplot!(p2mm, [dat_solar_missing[i, :rownames]], impute_solar_pmm[i, :], color=:orange, label=label)
end
plot!(size=(1200, 500))
display(p2mm)Figure 12: Air quality dataset.
β_boot = zeros(nboot)
σ_boot = zeros(nboot)
for i = 1:nboot
dat_boot = impute_variables(dat, impute_ozone[:, i], impute_solar[:, i])
model_boot = lm(@formula(Ozone ~ Solar), dat_boot)
β_boot[i] = coef(model_boot)[2]
σ_boot[i] = stderror(model_boot.model)[2]
end
β_est = mean(β_boot)
σ_est = sqrt(mean(σ_boot.^2) + (1 + 1/nboot) * var(β_boot))
# also get complete case estimates for later
cc_model = lm(@formula(Ozone ~ Solar), dropmissing(dat))
β_cc = coef(cc_model)[2]
σ_cc = stderror(cc_model.model)[2]
histogram(β_boot, xlabel=L"$\beta$", ylabel="Count", label=false)
vline!([β_cc], color=:red, label="Complete-Case")
plot!(size=(500, 450))Imputed:
Complete Case:
impute_model_df = DataFrame(impute_solar', :auto)
impute_model_df_stk = stack(impute_model_df)
impute_model_df_stk.Method .= "Prediction"
impute_pmm_df = DataFrame(impute_solar_pmm', :auto)
impute_pmm_df_stk = stack(impute_pmm_df)
impute_pmm_df_stk.Method .= "PMM"
impute_all_df = vcat(impute_model_df_stk, impute_pmm_df_stk)
@df impute_all_df groupedboxplot(:variable, :value, group=:Method, xlabel="Imputed Case", ylabel=L"Solar Radiation (W/m$^2$)")
plot!(size=(1200, 500))Figure 14: Air quality dataset.
impute_model_df = DataFrame(impute_ozone', :auto)
impute_model_df_stk = stack(impute_model_df)
impute_model_df_stk.Method .= "Prediction"
impute_pmm_df = DataFrame(impute_ozone_pmm', :auto)
impute_pmm_df_stk = stack(impute_pmm_df)
impute_pmm_df_stk.Method .= "PMM"
impute_all_df = vcat(impute_model_df_stk, impute_pmm_df_stk)
@df impute_all_df groupedboxplot(:variable, :value, group=:Method, xlabel="Imputed Case", ylabel="Ozone (ppb)")
plot!(size=(1200, 500))Figure 15: Air quality dataset.
Friday: Finish Missing Data. Class in 205 Riley-Robb
Next Week: Complete Poll on Ed.
HW6 released, due 5/1.
Quiz 4: Due 5/1