agoldst/nobel-genres.Rmd

## nobel-genres.Rmd
---
title: "Nobel genre tallies"
output:
  html_document:
    self_contained: false
...

```{r setup, include=F}
# Attach the packages the later chunks rely on: tidyverse for wrangling and
# plotting, rvest for scraping, knitr for the chunk options set below.
library(tidyverse)
library(rvest)
library(knitr)
# Do not echo chunk source into the rendered document. Spelled FALSE rather
# than F because F is an ordinary variable that can be reassigned.
opts_chunk$set(echo = FALSE)
```

```{r constants}
# Local tab-separated file holding the scraped laureate table.
data_file <- "nobel.tsv"

# Page whose laureate table gets scraped. The Nobel API also offers the data
# as JSON at
#   <http://api.nobelprize.org/2.1/nobelPrizes?nobelPrizeCategory=lit>
#   <http://api.nobelprize.org/2.1/laureates?nobelPrizeCategory=lit>
# but the Wikipedia table is simpler to scrape.
url <- "https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Literature"
```

```{r prize-retrieval}
# Scrape the laureate table and cache it as TSV. When the cache file already
# exists nothing is downloaded.
if (!file.exists(data_file)) {
    list_page <- read_html(url)
    # first table is the big list
    list_page %>%
        html_node("table") %>%
        # TRUE rather than T: T is a reassignable variable
        html_table(fill = TRUE) %>%
        # drop the Picture column when present; any_of() does not abort if
        # the page stops providing a column of that name
        select(-any_of("Picture")) %>%
        write_tsv(data_file)
}
```

```{r load-prize}
# Read the cached table and give the two columns used later short,
# syntactic names.
laureates <- rename(
    read_tsv(data_file),
    year = Year,
    genre = `Genre(s)`
)
```

```{r genre-tally}
# Build a wide table with one row per award year and one column per recoded
# genre; each cell counts that year's table rows credited with the genre.
genres <- laureates %>%
    # Tag every table row (assumed to be one laureate -- TODO confirm against
    # the scraped table) so the deduplication below works within a row and
    # does not merge co-laureates who share a year and a genre.
    mutate(laureate_id = row_number(),
           genre = str_split(genre, ", ")) %>%
    select(laureate_id, year, genre) %>%
    unnest(genre) %>%
    filter(genre != "Not awarded") %>%
    # recode genre
    mutate(genre = case_when(
        genre %in% c("novel", "short story") ~ "fiction",
        genre %in% c("memoirs", "biography", "autobiography",
                     "philosophy", "literary criticism", "philology",
                     "history", "law", "essay") ~ "non-fiction",
        genre %in% c("music", "songwriting") ~ "music",
        TRUE ~ genre)) %>%
    # deduplicate: recoding can map several genres of one row onto the same
    # category (e.g. "novel" and "short story" both become "fiction")
    distinct() %>%
    mutate(count = 1) %>%
    # the tag must go before widening, otherwise it would act as an id column
    select(-laureate_id) %>%
    # shared prizes may leave several rows per (year, genre); values_fn
    # adds them up, values_fill marks absent genres with 0
    pivot_wider(names_from = "genre", values_from = "count",
                values_fill = 0, values_fn = sum)
```

```{r genre-cum-plot}
# Running per-genre totals in long form (year, genre, count).
genres_cum <- genres %>%
    # cumsum() follows row order, so make the chronological order explicit
    arrange(year) %>%
    mutate(across(!year, cumsum)) %>%
    pivot_longer(-year, names_to = "genre", values_to = "count") %>%
    # reorder the genre levels by count as a function of year
    mutate(genre = fct_reorder2(genre, year, count))

# Stacked area chart of each genre's share of the running total.
genres_cum %>%
    group_by(year) %>%
    # rescale each year's running totals so they sum to 1
    mutate(count = count / sum(count)) %>%
    ungroup() %>%
    ggplot(aes(year, count, fill = genre)) +
        geom_area(position = "stack") +
        scale_x_continuous(breaks = seq(1910, 2020, by = 10)) +
        scale_y_continuous(labels = function(x) str_c(x * 100, "%")) +
        scale_fill_viridis_d() +
        # FALSE rather than F: F is a reassignable variable
        coord_cartesian(expand = FALSE) +
        labs(y = "cumulative proportion of laureates' genres",
             # year span taken from the data so the title cannot go stale
             title = str_c(
                 "Genres' cumulative share of the literature Nobel, ",
                 str_c(range(genres_cum$year), collapse = "–")),
             caption = str_wrap(
"Data from https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Literature. The height of each strip represents the cumulative proportion of laureates up to that date who worked in that genre. N.B. when a writer worked in multiple genres, that writer's prize is counted multiple times."))
```

```{r genre-cum-facet}
# One area panel per genre showing the raw running counts, restricted to
# the four genres named in the filter.
genres_cum %>%
    filter(genre %in% c("fiction", "poetry", "non-fiction", "drama")) %>%
    ggplot(aes(year, count, fill = genre)) +
        geom_area() +
        facet_wrap(~ genre) +
        # the facet strips already name the genre, so no legend is drawn
        scale_fill_viridis_d(guide = "none") +
        # FALSE rather than F: F is a reassignable variable
        coord_cartesian(expand = FALSE) +
        labs(y = "cumulative count of laureates' genres",
             title = "Cumulative tally of literature Nobel laureates' genres",
             caption = "Data as above, for the four most numerous genres only.")
```
	---
	title: "Nobel genre tallies"
	output:
	html_document:
	self_contained: false
	...

	```{r setup, include=F}
	# Attach the packages the later chunks rely on: tidyverse for wrangling and
	# plotting, rvest for scraping, knitr for the chunk options set below.
	library(tidyverse)
	library(rvest)
	library(knitr)
	# Do not echo chunk source into the rendered document. Spelled FALSE rather
	# than F because F is an ordinary variable that can be reassigned.
	opts_chunk$set(echo = FALSE)
	```

	```{r constants}
	# Local tab-separated file holding the scraped laureate table.
	data_file <- "nobel.tsv"

	# Page whose laureate table gets scraped. The Nobel API also offers the data
	# as JSON at
	#   <http://api.nobelprize.org/2.1/nobelPrizes?nobelPrizeCategory=lit>
	#   <http://api.nobelprize.org/2.1/laureates?nobelPrizeCategory=lit>
	# but the Wikipedia table is simpler to scrape.
	url <- "https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Literature"
	```

	```{r prize-retrieval}
	# Scrape the laureate table and cache it as TSV. When the cache file already
	# exists nothing is downloaded.
	if (!file.exists(data_file)) {
	    list_page <- read_html(url)
	    # first table is the big list
	    list_page %>%
	        html_node("table") %>%
	        # TRUE rather than T: T is a reassignable variable
	        html_table(fill = TRUE) %>%
	        # drop the Picture column when present; any_of() does not abort if
	        # the page stops providing a column of that name
	        select(-any_of("Picture")) %>%
	        write_tsv(data_file)
	}
	```

	```{r load-prize}
	# Read the cached table and give the two columns used later short,
	# syntactic names.
	laureates <- rename(
	    read_tsv(data_file),
	    year = Year,
	    genre = `Genre(s)`
	)
	```

	```{r genre-tally}
	# Build a wide table with one row per award year and one column per recoded
	# genre; each cell counts that year's table rows credited with the genre.
	genres <- laureates %>%
	    # Tag every table row (assumed to be one laureate -- TODO confirm against
	    # the scraped table) so the deduplication below works within a row and
	    # does not merge co-laureates who share a year and a genre.
	    mutate(laureate_id = row_number(),
	           genre = str_split(genre, ", ")) %>%
	    select(laureate_id, year, genre) %>%
	    unnest(genre) %>%
	    filter(genre != "Not awarded") %>%
	    # recode genre
	    mutate(genre = case_when(
	        genre %in% c("novel", "short story") ~ "fiction",
	        genre %in% c("memoirs", "biography", "autobiography",
	                     "philosophy", "literary criticism", "philology",
	                     "history", "law", "essay") ~ "non-fiction",
	        genre %in% c("music", "songwriting") ~ "music",
	        TRUE ~ genre)) %>%
	    # deduplicate: recoding can map several genres of one row onto the same
	    # category (e.g. "novel" and "short story" both become "fiction")
	    distinct() %>%
	    mutate(count = 1) %>%
	    # the tag must go before widening, otherwise it would act as an id column
	    select(-laureate_id) %>%
	    # shared prizes may leave several rows per (year, genre); values_fn
	    # adds them up, values_fill marks absent genres with 0
	    pivot_wider(names_from = "genre", values_from = "count",
	                values_fill = 0, values_fn = sum)
	```

	```{r genre-cum-plot}
	# Running per-genre totals in long form (year, genre, count).
	genres_cum <- genres %>%
	    # cumsum() follows row order, so make the chronological order explicit
	    arrange(year) %>%
	    mutate(across(!year, cumsum)) %>%
	    pivot_longer(-year, names_to = "genre", values_to = "count") %>%
	    # reorder the genre levels by count as a function of year
	    mutate(genre = fct_reorder2(genre, year, count))

	# Stacked area chart of each genre's share of the running total.
	genres_cum %>%
	    group_by(year) %>%
	    # rescale each year's running totals so they sum to 1
	    mutate(count = count / sum(count)) %>%
	    ungroup() %>%
	    ggplot(aes(year, count, fill = genre)) +
	        geom_area(position = "stack") +
	        scale_x_continuous(breaks = seq(1910, 2020, by = 10)) +
	        scale_y_continuous(labels = function(x) str_c(x * 100, "%")) +
	        scale_fill_viridis_d() +
	        # FALSE rather than F: F is a reassignable variable
	        coord_cartesian(expand = FALSE) +
	        labs(y = "cumulative proportion of laureates' genres",
	             # year span taken from the data so the title cannot go stale
	             title = str_c(
	                 "Genres' cumulative share of the literature Nobel, ",
	                 str_c(range(genres_cum$year), collapse = "–")),
	             caption = str_wrap(
	"Data from https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Literature. The height of each strip represents the cumulative proportion of laureates up to that date who worked in that genre. N.B. when a writer worked in multiple genres, that writer's prize is counted multiple times."))
	```

	```{r genre-cum-facet}
	# One area panel per genre showing the raw running counts, restricted to
	# the four genres named in the filter.
	genres_cum %>%
	    filter(genre %in% c("fiction", "poetry", "non-fiction", "drama")) %>%
	    ggplot(aes(year, count, fill = genre)) +
	        geom_area() +
	        facet_wrap(~ genre) +
	        # the facet strips already name the genre, so no legend is drawn
	        scale_fill_viridis_d(guide = "none") +
	        # FALSE rather than F: F is a reassignable variable
	        coord_cartesian(expand = FALSE) +
	        labs(y = "cumulative count of laureates' genres",
	             title = "Cumulative tally of literature Nobel laureates' genres",
	             caption = "Data as above, for the four most numerous genres only.")
	```