MCodrescu/ComparingRAndPython.qmd

## ComparingRAndPython.qmd
---
title: "Cleaning messy data with pandas and the tidyverse"
format: html
---

```{python}
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
```

```{python}
housing_data = pd.read_excel(
  "https://www2.census.gov/programs-surveys/mhs/tables/time-series/mhstabavgsls.xlsx",
  header = 3
)
```

```{python}
housing_data_clean = (
  housing_data
  .iloc[1:117, :]
  .rename(
    columns = {
      housing_data.columns[0]: "month"
    }
  )
  .assign(
    year = lambda x:
      pd.to_numeric(x.month, errors = "coerce")
      .fillna(method = "ffill")
  )
  .query(
    "~ month.str.contains('[0-9]|Note', na = True)"
  )
  .melt(
    id_vars = ["month", "year"],
    var_name = "region",
    value_name = "price"
  )
  .sort_values(
    ["year", "month"]
  )
  .assign(
    region = lambda x:
      x.region
      .str.replace("Unnamed: \d+", "", regex = True)
      .replace("", np.nan)
      .fillna(method = "ffill"),

    house_type = lambda x:
      ["total", "single", "double"] * int(len(x)/3),

    date = lambda x:
      pd.to_datetime(
        x.month.str.cat(x.year.astype(int).astype(str), sep = " 01 "),
        format = "%B %d %Y"
      )
  )
  .reset_index(
    drop = True
  )
)
```

```{python}
sns.relplot(
  data = (
    housing_data_clean
    .groupby(["date", "house_type"])
    .agg(
      med_price = ("price", np.median)
    )
  ),
  x = "date",
  y = "med_price",
  hue = "house_type",
  kind = "line"
)

plt.show()
```

```{r}
library(tidyverse)
library(rio)
```

```{r}
housing_data <-
  rio::import(
    "https://www2.census.gov/programs-surveys/mhs/tables/time-series/mhstabavgsls.xlsx",
    skip = 3
  )
```

```{r}
housing_data_clean <-
  housing_data |>
  dplyr::slice(1:117) |>
  dplyr::rename_with(
    .cols = 1,
    ~ "month"
  ) |>
  dplyr::mutate(
    year = readr::parse_number(month)
  ) |>
  tidyr::fill(year) |>
  dplyr::filter(
    !stringr::str_detect(month, "[0-9]|Note"),
  ) |>
  tidyr::pivot_longer(
    cols = -c("month", "year"),
    names_to = "region",
    values_to = "price",
    values_transform = as.numeric
  ) |>
  dplyr::arrange(
    year,
    month
  ) |>
  dplyr::mutate(
    region = stringr::str_replace(region, ".*\\d+", NA_character_)
  ) |>
  tidyr::fill(region) |>
  dplyr::mutate(
    house_type = rep(c("total", "single", "double"), dplyr::n()/3),
    date = lubridate::my(
      paste(
        month,
        year
      )
    )
  )
```

```{r}
housing_data_clean |>
  dplyr::group_by(
    date,
    house_type
  ) |>
  dplyr::summarise(
    med_price = median(price)
  ) |>
  ggplot2::ggplot(
    ggplot2::aes(
      x = date,
      y = med_price,
      color = house_type
    )
  ) +
  ggplot2::geom_line()
```
	---
	title: "Cleaning messy data with pandas and the tidyverse"
	format: html
	---

	```{python}
	import pandas as pd
	import numpy as np
	import seaborn as sns
	import matplotlib.pyplot as plt
	```

	```{python}
	housing_data = pd.read_excel(
	"https://www2.census.gov/programs-surveys/mhs/tables/time-series/mhstabavgsls.xlsx",
	header = 3
	)
	```

	```{python}
	housing_data_clean = (
	housing_data
	.iloc[1:117, :]
	.rename(
	columns = {
	housing_data.columns[0]: "month"
	}
	)
	.assign(
	year = lambda x:
	pd.to_numeric(x.month, errors = "coerce")
	.fillna(method = "ffill")
	)
	.query(
	"~ month.str.contains('[0-9]\|Note', na = True)"
	)
	.melt(
	id_vars = ["month", "year"],
	var_name = "region",
	value_name = "price"
	)
	.sort_values(
	["year", "month"]
	)
	.assign(
	region = lambda x:
	x.region
	.str.replace("Unnamed: \d+", "", regex = True)
	.replace("", np.nan)
	.fillna(method = "ffill"),

	house_type = lambda x:
	["total", "single", "double"] * int(len(x)/3),

	date = lambda x:
	pd.to_datetime(
	x.month.str.cat(x.year.astype(int).astype(str), sep = " 01 "),
	format = "%B %d %Y"
	)
	)
	.reset_index(
	drop = True
	)
	)
	```

	```{python}
	sns.relplot(
	data = (
	housing_data_clean
	.groupby(["date", "house_type"])
	.agg(
	med_price = ("price", np.median)
	)
	),
	x = "date",
	y = "med_price",
	hue = "house_type",
	kind = "line"
	)

	plt.show()
	```

	```{r}
	library(tidyverse)
	library(rio)
	```

	```{r}
	housing_data <-
	rio::import(
	"https://www2.census.gov/programs-surveys/mhs/tables/time-series/mhstabavgsls.xlsx",
	skip = 3
	)
	```

	```{r}
	housing_data_clean <-
	housing_data \|>
	dplyr::slice(1:117) \|>
	dplyr::rename_with(
	.cols = 1,
	~ "month"
	) \|>
	dplyr::mutate(
	year = readr::parse_number(month)
	) \|>
	tidyr::fill(year) \|>
	dplyr::filter(
	!stringr::str_detect(month, "[0-9]\|Note"),
	) \|>
	tidyr::pivot_longer(
	cols = -c("month", "year"),
	names_to = "region",
	values_to = "price",
	values_transform = as.numeric
	) \|>
	dplyr::arrange(
	year,
	month
	) \|>
	dplyr::mutate(
	region = stringr::str_replace(region, ".*\\d+", NA_character_)
	) \|>
	tidyr::fill(region) \|>
	dplyr::mutate(
	house_type = rep(c("total", "single", "double"), dplyr::n()/3),
	date = lubridate::my(
	paste(
	month,
	year
	)
	)
	)
	```

	```{r}
	housing_data_clean \|>
	dplyr::group_by(
	date,
	house_type
	) \|>
	dplyr::summarise(
	med_price = median(price)
	) \|>
	ggplot2::ggplot(
	ggplot2::aes(
	x = date,
	y = med_price,
	color = house_type
	)
	) +
	ggplot2::geom_line()
	```