Skip to content

Instantly share code, notes, and snippets.

@jmclawson
Last active August 6, 2019 16:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jmclawson/79d95f5d10f4e5abd577fc5cf6e8e6ea to your computer and use it in GitHub Desktop.
Save jmclawson/79d95f5d10f4e5abd577fc5cf6e8e6ea to your computer and use it in GitHub Desktop.
building a corpus of titles from Wikipedia
base_beg <- "https://en.wikipedia.org/wiki/Category:"
base_end <- "th-century_novels"
get_cat_pages <- function(){
categories <<- data.frame(century=c(),
nation=c(),
url=c(),
stringsAsFactors = FALSE)
for (century in centuries){
cat_url <- paste0(base_beg,century,base_end)
page <- read_html(cat_url)
cat_list <- html_nodes(page, "div#mw-subcategories > div.mw-content-ltr ul > li")
for (nation in nations) {
if (length(grep(nation, html_text(cat_list))) > 0) {
this_url <-
cat_list[[grep(nation, html_text(cat_list))]] %>%
html_nodes("a") %>%
html_attr("href")
this_url <- paste0("https://en.wikipedia.org",this_url)
categories <<- rbind(categories,
data.frame(century=century,
nation=nation,
url=this_url,
stringsAsFactors = FALSE))
}
}
}
}
get_subcat_urls <- function(){
subcategories <<- data.frame(century=c(),
nation=c(),
year=c(),
url=c(),
stringsAsFactors = FALSE)
for (row in 1:nrow(categories)){
page <- read_html(categories[row,"url"])
sub_list <- html_nodes(page, "div#mw-subcategories")
if (length(sub_list)>0){
sub_list <- html_nodes(page, "div.mw-content-ltr ul > li")}
if (length(sub_list)>0){
index <- grep("[0-9]{4}",html_text(sub_list))
if (length(index) > 0) {
for (ind in index){
the_url <-
sub_list[[ind]] %>%
html_nodes("a") %>%
html_attr("href")
the_url <- paste0("https://en.wikipedia.org",the_url)
the_year <- gsub("[a-zA-Z ]","",
sub_list[[ind]] %>%
html_nodes("a") %>%
html_text()) %>%
as.numeric()
subcategories <<-
rbind(subcategories,
data.frame(century=categories[row,"century"],
nation=categories[row,"nation"],
year=the_year,
url=the_url,
stringsAsFactors = FALSE))
}
}
}
}
}
get_subcat_pages <- function() {
if(!dir.exists("year_lists")){dir.create("year_lists")}
for (row in 1:nrow(subcategories)){
id <- paste0(subcategories[row,"nation"],
subcategories[row,"year"])
this_url <- subcategories[row,"url"]
this_file <- paste0("year_lists/",id,".html")
if(!file.exists(this_file)){
download.file(this_url, destfile = this_file)
# Save the server! Wait before downloading more
randomtime <- sample(1:10,1)*sample(c(0.5,1,pi/2),1)
cat("Wait for",randomtime,"seconds")
Sys.sleep(randomtime)
}
}
}
parse_subcat_pages <- function(){
corpus_wikipedia <<- data.frame(titles=c(),
year=c(),
nation=c(),
stringsAsFactors = FALSE)
files <- list.files(path="year_lists/")
for (filename in files){
id <- gsub(".html","",filename)
this_length <- nchar(id)
this_year <- substr(id,this_length-3,this_length)
if (!is.na(as.numeric(this_year))) {
this_nation <- substr(id,1,this_length-4)
these_titles <- read_html(paste0("year_lists/",filename)) %>%
html_nodes("div#mw-pages") %>%
html_nodes("div.mw-content-ltr") %>%
html_nodes("li") %>%
html_text()
this_data <- data.frame(titles=as.character(these_titles),
year=as.numeric(this_year),
nation=this_nation,
stringsAsFactors = FALSE)
corpus_wikipedia <<- rbind(corpus_wikipedia,
this_data)
}
}
corpus_byyear <<- corpus_wikipedia %>%
group_by(year) %>%
summarize(count=n())
nation_byyear <<- corpus_wikipedia %>%
group_by(nation,year) %>%
summarize(count=n())
}
record_avg <- function(df=corpus_byyear,
window=3){
moving_avg <<- data.frame(avg=c(),
year=c(),
stringsAsFactors = FALSE)
start <- ceiling(window/2)
end <- nrow(df)-ceiling(window/2)
for (row in start:end){
this_start <- row-ceiling(window/2)
this_end <- row+ceiling(window/2)
this_avg <- mean(df$count[this_start:this_end])
if ("nation" %in% colnames(df)){
moving_avg <<- rbind(moving_avg,
data.frame(avg=this_avg,
year=df$year[row],
nation=df$nation[row]))
} else {
moving_avg <<- rbind(moving_avg,
data.frame(avg=this_avg,
year=df$year[row]))
}
}
}
@jmclawson
Copy link
Author

jmclawson commented Aug 6, 2019

These functions are explained further in a corresponding blog post here: https://jmclawson.net/blog/posts/selecting-a-better-corpus/

The resulting data is available for exploring here: https://jmclawson.net/projects/wiki-corpus.html

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment