Skip to content

Instantly share code, notes, and snippets.

@MCodrescu
Created February 2, 2023 16:41
Show Gist options
  • Save MCodrescu/5af97b6b934b66a1a0960ed472972a3e to your computer and use it in GitHub Desktop.
Save MCodrescu/5af97b6b934b66a1a0960ed472972a3e to your computer and use it in GitHub Desktop.
Cleaning messy data with pandas and the tidyverse
---
title: "Cleaning messy data with pandas and the tidyverse"
format: html
---
```{python}
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
```
```{python}
housing_data = pd.read_excel(
"https://www2.census.gov/programs-surveys/mhs/tables/time-series/mhstabavgsls.xlsx",
header = 3
)
```
```{python}
housing_data_clean = (
housing_data
.iloc[1:117, :]
.rename(
columns = {
housing_data.columns[0]: "month"
}
)
.assign(
year = lambda x:
pd.to_numeric(x.month, errors = "coerce")
.fillna(method = "ffill")
)
.query(
"~ month.str.contains('[0-9]|Note', na = True)"
)
.melt(
id_vars = ["month", "year"],
var_name = "region",
value_name = "price"
)
.sort_values(
["year", "month"]
)
.assign(
region = lambda x:
x.region
.str.replace("Unnamed: \d+", "", regex = True)
.replace("", np.nan)
.fillna(method = "ffill"),
house_type = lambda x:
["total", "single", "double"] * int(len(x)/3),
date = lambda x:
pd.to_datetime(
x.month.str.cat(x.year.astype(int).astype(str), sep = " 01 "),
format = "%B %d %Y"
)
)
.reset_index(
drop = True
)
)
```
```{python}
sns.relplot(
data = (
housing_data_clean
.groupby(["date", "house_type"])
.agg(
med_price = ("price", np.median)
)
),
x = "date",
y = "med_price",
hue = "house_type",
kind = "line"
)
plt.show()
```
```{r}
library(tidyverse)
library(rio)
```
```{r}
housing_data <-
rio::import(
"https://www2.census.gov/programs-surveys/mhs/tables/time-series/mhstabavgsls.xlsx",
skip = 3
)
```
```{r}
housing_data_clean <-
housing_data |>
dplyr::slice(1:117) |>
dplyr::rename_with(
.cols = 1,
~ "month"
) |>
dplyr::mutate(
year = readr::parse_number(month)
) |>
tidyr::fill(year) |>
dplyr::filter(
!stringr::str_detect(month, "[0-9]|Note"),
) |>
tidyr::pivot_longer(
cols = -c("month", "year"),
names_to = "region",
values_to = "price",
values_transform = as.numeric
) |>
dplyr::arrange(
year,
month
) |>
dplyr::mutate(
region = stringr::str_replace(region, ".*\\d+", NA_character_)
) |>
tidyr::fill(region) |>
dplyr::mutate(
house_type = rep(c("total", "single", "double"), dplyr::n()/3),
date = lubridate::my(
paste(
month,
year
)
)
)
```
```{r}
housing_data_clean |>
dplyr::group_by(
date,
house_type
) |>
dplyr::summarise(
med_price = median(price)
) |>
ggplot2::ggplot(
ggplot2::aes(
x = date,
y = med_price,
color = house_type
)
) +
ggplot2::geom_line()
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment