Created
January 11, 2017 03:39
-
-
Save ProQuestionAsker/97f4569aafdaae1230fddc5bc819d273 to your computer and use it in GitHub Desktop.
Parsing Transcript
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Loading Necessary Packages ----
# (library() attaches packages that are already installed; it does not
# install them.)
# For Web Scraping Transcripts
library(rvest)
library(curl)
# For Data Frame Manipulation
library(dplyr)
library(tidyr)
library(stringr)
library(stringi)
# Import Transcript (with formatting): one character element per line.
# The path is relative to the current working directory.
RO <- readLines("RogueOneTranscript.txt")
# Convert to a one-column data frame named "RO". The text is kept as
# character (not factor) explicitly, so the result is the same on R < 4.0,
# where strings were converted to factors by default.
RO <- data.frame(RO = RO, stringsAsFactors = FALSE)
# Remove empty rows (inside filter(), `RO` refers to the column)
RO <- RO %>%
  filter(RO != "")
# Separating Character from words
RO_full <- RO %>%
  # Split each line at its first ":" only; extra = "merge" keeps any later
  # colons inside Words. fill = "right" makes it explicit that a line with
  # no ":" gets Words = NA, instead of relying on the default fill = "warn",
  # which produces the same NA but also warns about every such row.
  separate(
    col = RO,
    into = c("Character", "Words"),
    sep = ":",
    extra = "merge",
    fill = "right"
  ) %>%
  # Eliminate script notes, i.e. rows where no ":" was found.
  # NOTE(review): a script note that itself contains ":" would be kept as
  # dialogue -- verify against the transcript.
  filter(!is.na(Words)) %>%
  # Trim white space and convert Character to factor
  mutate(
    Character = as.factor(str_trim(Character)),
    Words = str_trim(Words)
  )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment