Skip to content

Instantly share code, notes, and snippets.

@adrienshen
Created December 5, 2019 16:25
Show Gist options
  • Save adrienshen/90a9d8f0f5e2ad1b971ce69505ef6107 to your computer and use it in GitHub Desktop.
Save adrienshen/90a9d8f0f5e2ad1b971ce69505ef6107 to your computer and use it in GitHub Desktop.
Web scraping film scripts using R and PostgreSQL
# source and credits: https://www.mikejohnpage.com/blog/web-scraping-r-s-data-table-and-writing-to-postgresql-and-mysql/
library(tidyverse)
library(robotstxt)
library(data.table)
library(rvest) # scraping library
library(stringi)
library(tidytext)
library(RPostgreSQL)
rtxt <- robotstxt("https://www.imsdb.com")
rtxt$permissions
paths_allowed("https://www.imsdb.com/scripts/10-Things-I-Hate-About-You.html")
url <- "https://www.imsdb.com/all%20scripts/"
links <- read_html(url) %>%
html_nodes("a") %>%
html_attr("href")
names <- read_html(url) %>%
html_nodes("a") %>%
html_text()
movies_dt <- data.table(links = links, names = names)
head(movies_dt)
tail(movies_dt)
movies_dt <- movies_dt[str_detect(links, "^/Movie")][!1:5]
names_replace <- str_replace_all(movies_dt$names, " ", "-") %>%
paste0(".html")
movies_dt$script = NA
movies_dt.sample = movies_dt[20:25]
names_replace.sample = names_replace[20:25]
for (i in seq_along(names_replace.sample)) {
temp_url <- paste0("https://www.imsdb.com/scripts/", names_replace.sample[i])
if (html_session(temp_url)$response$status_code == 404) {
movies_dt.sample$script[i] <- NA
next
}
script_text <- temp_url %>%
read_html() %>%
html_nodes("pre") %>%
html_text()
if (length(script_text) == 0) {
movies_dt.sample$script[i] = NA
} else {
movies_dt.sample$script[i] <- script_text
}
}
# Inspect the first few lines of movies_dt after the scraping
substring(movies_dt.sample$scripts[1], 1, 1000)
cat(substring(movies_dt.sample$scripts[1], 1, 2000))
movies_dt.sample[, scripts := str_replace(movies_dt.sample$scripts, "\\\n|\\\t|\\\r", "")]
movies_dt.sample <- unique(movies_dt.sample)
# movies_dt.first10 <- na.omit(movies_dt.first10)
# title_of_medium_articles <- "Transition into Data Engineering from a SWE Series"
movies_dt.sample[, script := str_to_lower(script)]
movies_dt.sample
movies_dt.sample[, script := stri_trans_general(movies_dt.sample$script, "latin-ascii")]
movies_dt.sample[, script := str_replace_all(script, "[^[:alnum:]']", " ")]
cat(substring(movies_dt.sample$script[1:10], 1, 2000))
# make one row for each work in script
tidy_movies <- unnest_tokens(movies_dt.sample, word, script, token = "words")
# remove stop words
tidy_movies <- tidy_movies %>% anti_join(stop_words)
tidy_movies
## PostgreSQL write to
conn <- dbConnect(drv = dbDriver("PostgreSQL"),
dbname = "learnde",
host = "localhost",
port = 5432,
user = "adrienshen",
password = "")
dbWriteTable(conn=conn,
name="movie_scripts_words",
value=tidy_movies)
dbWriteTable(conn=conn,
name="movie_scripts",
value=movies_dt.sample)
dbDisconnect(conn)
# cat(substring(movies_dt.sample$script[1:10], 1, 2000))
conn <- dbConnect(drv = dbDriver("PostgreSQL"),
dbname = "learnde",
host = "localhost",
port = 5432,
user = "adrienshen",
password = "")
# send query
query <- dbSendQuery(conn=conn,
"select word, count(*) as count
from movie_scripts_words
where names = '500 Days of Summer'
group by word
order by count desc;")
tidy_movies_st <- dbFetch(query, -1)
# to be cont.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment