Skip to content

Instantly share code, notes, and snippets.

@feyderm
Last active February 8, 2017 22:03
Show Gist options
  • Save feyderm/e3dd42aa092f2f8847faea3bcd3249cd to your computer and use it in GitHub Desktop.
Save feyderm/e3dd42aa092f2f8847faea3bcd3249cd to your computer and use it in GitHub Desktop.
Patterns of missing data

Data are from the freetrade dataset of the Amelia package.

Missing values are shown as filled circles and observed values as open circles. Each column of circles is one unique pattern of missing values across the variables of an observation. The bar chart on top shows the percentage of observations that have each pattern, and the bar chart to the right shows the percentage of values missing for each variable across all observations.

year country tariff polity pop gdp.pc intresmi signed fiveop usheg
0 0 0.339181286549708 0.0116959064327485 0 0 0.0760233918128655 0.0175438596491228 0.105263157894737 0
x.year x.country x.tariff x.polity x.pop x.gdp.pc x.intresmi x.signed x.fiveop x.usheg freq percent
TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE 96 0.56140350877193
TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE 52 0.304093567251462
TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE 9 0.0526315789473684
TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE 5 0.0292397660818713
TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE FALSE TRUE 4 0.0233918128654971
TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE 2 0.0116959064327485
TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE 2 0.0116959064327485
TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE 1 0.00584795321637427
<!DOCTYPE html>
<meta charset="utf-8">
<style>
/* applies to every <text> element in the chart */
text {
font-family: sans-serif;
}
/* variable-name labels: smaller text, vertically centered on the given y
   and right-aligned so the text ends at its x position */
text.var_labels {
font-size: 0.75em;
dominant-baseline: middle;
text-anchor: end;
}
/* dot for a missing value: filled */
circle.absent {
fill: #3f51b5;
}
/* dot for an observed value: white with a dark outline */
circle.present {
stroke: #1a1a1a;
fill: #FFFFFF;
}
/* bars of the per-variable "% missing" chart; same color as the
   missing-value dots */
.bar_absent {
stroke: #3f51b5;
fill: #3f51b5;
}
/* bars of the per-pattern "% of observations" chart */
.bar_combo {
stroke: #1a1a1a;
fill: #999999;
}
/* both axis groups are drawn slightly transparent */
.axis {
opacity: 0.8;
}
/* axis titles appended inside the axis groups.
   NOTE(review): the explicit fill is presumably needed because d3's axis
   generator sets fill to none on its container - confirm against d3-axis */
.axis_label {
fill: #000000;
}
</style>
<body>
<div id="block"></div>
<script src="https://d3js.org/d3.v4.min.js"></script>
<script type="text/javascript">
// variable labels, listed in the same order as the data columns; the label
// drawing code pairs label i with column i, so the order matters
const var_labels = [
  "Year",
  "Country",
  "Tariff rate",
  "Polity IV score",
  "Population",
  "GDP",
  "International reserves",
  "IMF agreement",
  "Financial openness",
  "US hegemony",
];

// dimensions (SVG user units) of the whole drawing and of the dot-plot grid
const svg_dx = 800;
const svg_dy = 700;
const dot_plot_dx = 200;
const dot_plot_dy = 300;

// padding inside the dot plot: 5% of its height at top/bottom and 10% of its
// width at left/right. Declared explicitly so it does not become an implicit
// global (a bare assignment is a ReferenceError in strict mode and modules).
const margin = {
  top: dot_plot_dy * 0.05,
  bottom: dot_plot_dy * 0.05,
  left: dot_plot_dx * 0.10,
  right: dot_plot_dx * 0.10,
};
// scales
// Only the output ranges are fixed here; each CSV callback assigns the
// domain it needs right before drawing.

// horizontal position of a missing-data pattern inside the dot plot
let cxScale = d3.scalePoint();
cxScale.range([margin.left, dot_plot_dx - margin.right]);

// vertical position of a variable inside the dot plot
let cyScale = d3.scalePoint();
cyScale.range([margin.bottom, dot_plot_dy - margin.top]);

// bar length, shared by both bar charts
let barScale = d3.scaleLinear();
barScale.range([0, 100]);

// axes
// Both axes are built on barScale and print ticks as whole percentages.
const percentFormat = d3.format(".0%");

let bar_combos_axis = d3.axisLeft(barScale);
bar_combos_axis.ticks(5).tickFormat(percentFormat);

let bar_element_axis = d3.axisBottom(barScale);
bar_element_axis.ticks(4).tickFormat(percentFormat);
// organize layout
// Root <svg> goes inside the #block container.
let svg = d3.select("#block").append("svg");
svg.attr("height", svg_dy)
  .attr("width", svg_dx);

// group holding the dot plot (one column of circles per pattern)
let circle_combos = svg.append("g");
circle_combos.attr("transform", "translate(400, 125)");

// group holding the bar chart above the dot plot
let bar_combos = svg.append("g");
bar_combos.attr("transform", "translate(400, 15)");

// group holding the bar chart to the right of the dot plot
let bar_elements = svg.append("g");
bar_elements.attr("transform", "translate(600, 125)");
// Dot plot of missing-data patterns ("combos") plus the bar chart of how
// often each pattern occurs.
// The callback takes (error, rows): d3 v4 passes the load/parse error first,
// so a failed request is reported instead of crashing on a null data value.
d3.csv("freetrade_missing_data_combos.csv", (error, rows) => {
  if (error) throw error;

  // variable (column) names, excluding the summary-stat columns
  const summary_cols = ["freq", "percent"];
  const col_names = rows.columns.filter((col) => !summary_cols.includes(col));
  cyScale.domain(col_names);

  // Every element of `rows` is a data row; the column-name list d3.csv adds
  // is a property of the array, not an element, so no filtering is needed.
  // combos across x-position
  cxScale.domain(d3.range(rows.length));

  const combo = circle_combos.selectAll("g")
    .data(rows)
    .enter()
    .append("g");

  // circle per combo per variable (column name)
  combo.each(function(row, i) {
    d3.select(this)
      .selectAll("circle")
      .data(col_names)
      .enter()
      .append("circle")
      .attr("cx", cxScale(i)) // combos spread across x
      .attr("cy", (col_name) => cyScale(col_name)) // variables spread across y
      .attr("r", 7)
      // the CSV stores "TRUE" where the value was observed
      .attr("class", (col_name) => (row[col_name] === "TRUE" ? "present" : "absent"));
  });

  // variable labels; label i is placed on the row of column i, so var_labels
  // must list the variables in the same order as the CSV columns
  circle_combos.append("g")
    .selectAll("text")
    .data(var_labels)
    .enter()
    .append("text")
    .attr("class", "var_labels")
    .text((label) => label)
    .attr("y", (label, i) => cyScale(col_names[i]));

  // bar chart of combo percentages
  // domain is reversed (max first) so larger values map to smaller y and the
  // bars grow upward from the baseline
  barScale.domain([d3.max(rows, (row) => +row.percent), 0]);
  const baseline = d3.max(barScale.range());

  bar_combos.selectAll("rect")
    .data(rows)
    .enter()
    .append("rect")
    .attr("class", "bar_combo")
    .attr("x", (row, i) => cxScale(i) - 10) // center the 20-wide bar on its column
    .attr("y", (row) => barScale(+row.percent))
    .attr("height", (row) => baseline - barScale(+row.percent))
    .attr("width", 20);

  // bar chart axis
  const axis_group = bar_combos.append("g")
    .attr("id", "bar_combos_axis")
    .attr("class", "axis")
    .call(bar_combos_axis);

  // axis label
  axis_group.append("text")
    .attr("class", "axis_label")
    .text("% of observations")
    .attr("x", 0)
    .attr("y", 0)
    .attr("transform", "translate(-35, 17) rotate(270)");
});
// bar chart of variable percentages
// The callback takes (error, rows): d3 v4 passes the load/parse error first,
// so a failed request is reported instead of crashing on a null data value.
d3.csv("freetrade_missing_data_by_variable.csv", (error, rows) => {
  if (error) throw error;

  // first item in array is object of variable names and % values
  const var_percents = rows[0];
  if (var_percents === undefined) {
    throw new Error("freetrade_missing_data_by_variable.csv contains no data row");
  }

  const col_names = Object.keys(var_percents);

  // d3.csv yields every value as a string; convert explicitly, because
  // d3.max compares strings in natural (lexicographic) order, not numerically
  const percent_of = (col_name) => +var_percents[col_name];

  cyScale.domain(col_names);
  barScale.domain([0, d3.max(col_names, percent_of)]);

  bar_elements.selectAll("rect")
    .data(col_names)
    .enter()
    .append("rect")
    .attr("class", "bar_absent")
    .attr("x", 0)
    .attr("y", (col_name) => cyScale(col_name) - 10) // center the 20-high bar on its row
    .attr("height", 20)
    .attr("width", (col_name) => barScale(percent_of(col_name)));

  // bar chart axis, placed below the bars
  const axis_group = bar_elements.append("g")
    .attr("id", "bar_vars_axis")
    .attr("class", "axis")
    .attr("transform", "translate(0, " + dot_plot_dy + ")")
    .call(bar_element_axis);

  // axis label
  axis_group.append("text")
    .attr("class", "axis_label")
    .text("% missing")
    .attr("x", 0)
    .attr("y", 0)
    .attr("transform", "translate(47, 30)");
});
</script>
</body>
library(Amelia)
library(magrittr)
library(plyr)
library(dplyr)
# load the freetrade example dataset
data(freetrade)

# fraction of missing values per variable
# n_NA counts the NA entries of every column of a data frame
n_NA <- plyr::colwise(function(x) sum(is.na(x)))
write.csv(n_NA(freetrade) / nrow(freetrade),
          "freetrade_missing_data_by_variable.csv",
          row.names = FALSE)

# count (freq) and share of total (percent) for each distinct pattern of
# observed/missing values across all observations, most frequent first;
# TRUE marks a value that is present (not NA)
(!is.na(freetrade)) %>%
  plyr::count() %>%
  dplyr::mutate(percent = freq / sum(freq)) %>%
  dplyr::arrange(desc(percent)) %>%
  write.csv("freetrade_missing_data_combos.csv", row.names = FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment