Created
April 9, 2024 04:07
-
-
Save saketkc/c48c637b39160d464e13e17bc6607686 to your computer and use it in GitHub Desktop.
seenunseenbooks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
library(httr) | |
library(rvest) | |
library(xml2) | |
library(stringr) | |
library(DT) | |
library(htmltools) | |
# Domains whose links are dropped from the final table (streaming services,
# news sites, video hosts, the podcast's own site, ...). Matched exactly
# against the host part of each link with `%in%`, so every variant
# (with/without "www.") has to be listed explicitly.
# Each domain appears exactly once; keep it that way when adding entries.
domains_to_remove <- c(
  "soundcloud.com",
  "open.spotify.com",
  "www.hotstar.com",
  "mubi.com",
  "www.jiocinema.com",
  "en.wikipedia.org",
  "www.netflix.com",
  "www.primevideo.com",
  "www.youtube.com",
  "timesofindia.indiatimes.com",
  "www.business-standard.com",
  "www.livemint.com",
  "www.mxplayer.in",
  "youtu.be",
  "www.hindustantimes.com",
  "www.vudi.com",
  "seenunseen.in",
  "www.indiatoday.in",
  "primevideo.com",
  "vimeo.com",
  "www.hoichoi.tv",
  "tv.apple.com",
  "www.imdb.com",
  "www.deccanchronicle.com",
  "www.voot.com",
  "www.vudu.com",
  "www.zee5.com",
  "youtube.com",
  "www.hulu.com",
  "music.youtube.com",
  "indianexpress.com",
  "www.newindianexpress.com",
  "www.oppenheimermovie.com",
  "www.ft.com",
  "www.dailymotion.com",
  "watch.plex.tv",
  "scroll.in",
  "m.thewire.in",
  "theprint.in",
  "openthemagazine.com",
  "images.shulcloud.com",
  "criterion.com",
  "tvfplay.com",
  "music.apple.com",
  "magiclanternmovies.in",
  "www.instagram.com",
  "www.outlookindia.com",
  "www.thehindu.com",
  "www.sonyliv.com",
  "www.storytel.com",
  "blogs.intoday.in",
  "www.miramax.com",
  "www.theatlantic.com",
  "thenetworkstate.com",
  "www.bbc.co.uk",
  "www.metamorphosis.media",
  "alikazimi.ca",
  "www.criterion.com",
  "economictimes.indiatimes.com",
  "www.triangleofsadness.film",
  "indiauncut.com",
  "www.cultureunplugged.com"
)
# Strip the trailing run of characters that are not ASCII letters or digits
# from each element of `input_string` (e.g. "Title.," -> "Title").
# Vectorised; strings without such a trailing run are returned unchanged.
CleanStringEnds <- function(input_string) {
  trailing_junk <- "[^A-Za-z0-9]+$"
  sub(pattern = trailing_junk, replacement = "", x = input_string)
}
# Normalise a scraped title (vectorised over `string`):
#   1. drop em dashes,
#   2. drop carriage returns / line feeds,
#   3. drop control characters left at the end,
#   4. turn tabs into spaces and collapse whitespace runs to one space,
#   5. trim both ends and convert to title case.
CleanTitle <- function(string) {
  string %>%
    gsub(pattern = "—", replacement = "") %>%
    str_replace_all(pattern = "[\r\n]", replacement = "") %>%
    gsub(pattern = "[[:cntrl:]]+$", replacement = "") %>%
    gsub(pattern = "\t", replacement = " ") %>%
    gsub(pattern = "\\s+", replacement = " ") %>%
    trimws() %>%
    str_to_title()
}
# Normalise a scraped author string (vectorised over `string`): remove every
# character that is neither alphanumeric nor whitespace, collapse whitespace
# runs to a single space, trim both ends, and convert to title case.
CleanAuthor <- function(string) {
  string %>%
    gsub(pattern = "[^[:alnum:][:space:]]", replacement = "") %>%
    gsub(pattern = "\\s+", replacement = " ") %>%
    trimws() %>%
    str_to_title()
}
# Recover the target URL from a redirect-style link.
# Everything up to and including the first "q=" is removed, then everything
# from the first "&" onwards, and the remainder is percent-decoded.
# Links without "q=" or "&" only go through the decoding step.
CleanURL <- function(url) {
  after_query_key <- sub(".*?q=", "", url)
  without_extra_params <- sub("&.*", "", after_query_key)
  URLdecode(without_extra_params)
}
# Binary-search the highest feed page number that does not answer HTTP 404.
#
# Args:
#   base_url: feed URL prefix; the page number is appended to it.
#
# Returns:
#   The last page number in 1..1000 whose response was not a 404 (0 if even
#   page 1 is a 404). Any status other than 404 counts as "page exists".
#   The result is also printed. Pauses 0.1 s after every request.
FigureOutLastPage <- function(base_url = "https://seenunseen.in/feed/?paged=") {
  lo <- 1
  hi <- 1000 # Upper limit for the search range
  repeat {
    if (lo > hi) {
      break
    }
    probe <- lo + floor((hi - lo) / 2)
    reply <- GET(paste0(base_url, probe))
    if (status_code(reply) == 404) {
      # Probe is past the end: look in the lower half.
      hi <- probe - 1
    } else {
      # Probe exists: the last page is at or above it.
      lo <- probe + 1
    }
    Sys.sleep(0.1)
  }
  cat("Last valid page found:", hi, "\n")
  return(hi)
}
# Download one page of the RSS feed and list the episodes it contains.
#
# Args:
#   url: URL of a feed page.
#
# Returns:
#   A data frame with one row per <item> and columns `title` and `link`.
#   Rows whose link is the site root ("https://seenunseen.in") are dropped.
#   Pauses 0.1 s before returning.
FeedToEpLinks <- function(url) {
  response <- GET(url)
  xml_doc <- read_xml(response)
  items <- xml_find_all(xml_doc, "//item")
  # Look up <title>/<link> relative to each <item>. An absolute "//title"
  # would match every title in the document (channel and image metadata
  # included), so titles and links could be paired up incorrectly.
  # xml_find_first() yields exactly one value per item (NA when absent),
  # which keeps the two columns the same length.
  titles <- xml_text(xml_find_first(items, "./title"))
  links <- xml_text(xml_find_first(items, "./link"))
  data_df <- data.frame(title = titles, link = links) %>%
    filter(link != "https://seenunseen.in")
  Sys.sleep(0.1)
  return(data_df)
}
# Scrape one episode page and collect the works referenced in <i> tags.
#
# Args:
#   episode_url: URL of a single episode page.
#
# Returns:
#   A data frame with one row per distinct <i> text inside the page's
#   'entry-content' div and the columns:
#     name        - cleaned text of the <i> tag (CleanTitle)
#     author      - cleaned text node that follows the title (CleanAuthor)
#     book_url    - cleaned link target (CleanURL)
#     episode_url - the `episode_url` argument
#   Pauses 3 s before returning to go easy on the server.
FetchBooksFromEpisode <- function(episode_url) {
  # Fetch the HTML content from the webpage
  webpage <- read_html(GET(url = episode_url))
  # Extract content from the specific div class
  content_div <- html_node(webpage, xpath = "//div[contains(@class, 'entry-content')]")
  # Map every text node to the text node that follows it, so the text right
  # after a title can be looked up by the title's own text.
  content_div_strings <- html_text(html_nodes(content_div, xpath = ".//text()"), trim = TRUE)
  content_div_sequential <- setNames(content_div_strings[-1], content_div_strings[-length(content_div_strings)])
  # Iterate through all <i> tags to get book details
  books <- list()
  i_tags <- html_nodes(content_div, "i")
  for (i_tag in i_tags) {
    name <- html_text(i_tag)
    author <- content_div_sequential[name]
    # Extract the book URL. The value is taken from tryCatch()'s return value:
    # assigning inside the error handler would only set a variable local to
    # the handler and leave a stale (or undefined) book_url behind.
    book_url <- tryCatch(
      {
        link_node <- html_node(i_tag, "a")
        candidate <- html_attr(link_node, "data-saferedirecturl")
        # html_attr() reports a missing attribute as NA (not NULL), so test
        # for NA to make the fallback to the plain href actually happen.
        if (is.null(candidate) || isTRUE(is.na(candidate))) {
          candidate <- html_attr(link_node, "href")
        }
        candidate
      },
      error = function(e) {
        print(e)
        NA_character_
      }
    )
    # Keyed by title text: a title repeated on the page keeps its last entry.
    books[[name]] <- data.frame(name = name, author = as.character(author), book_url = book_url)
  }
  books.df <- bind_rows(books)
  books.df$name <- CleanTitle(books.df$name)
  books.df$author <- CleanAuthor(books.df$author)
  books.df$book_url <- CleanURL(books.df$book_url)
  books.df$episode_url <- episode_url
  Sys.sleep(3)
  return(books.df)
}
# ---- Fetch: feed pages -> episode links -> books per episode ----
base_url <- "https://seenunseen.in/feed/?paged="
# Number of feed pages to read (hard-coded).
last_valid_page <- 38
# One data frame of (title, link) per feed page.
dfs <- lapply(
  seq(1, last_valid_page),
  function(page) FeedToEpLinks(paste0(base_url, page))
)
dfs_combined <- bind_rows(dfs)
# One data frame of referenced works per episode page.
books.recommend <- lapply(dfs_combined$link, FetchBooksFromEpisode)
books.recommend.df <- bind_rows(books.recommend)
# ---- Clean names/authors and drop rows that are not real recommendations ----
books.recommend.df$name <- CleanTitle(books.recommend.df$name)
books.recommend.df$author <- CleanAuthor(books.recommend.df$author)
# Drop credits for the episode artwork.
books.recommend.df <- books.recommend.df %>%
  filter(!grepl(x = name, pattern = "Episode Art"))
books.recommend.df$name <- CleanStringEnds(books.recommend.df$name)
books.recommend.df.filtered <- books.recommend.df %>%
  filter(name != "") %>%
  filter(!grepl("The Illustration For This Episode", x = name))
books.recommend.df.filtered$author <- gsub(
  pattern = "Edited By ", replacement = "",
  x = books.recommend.df.filtered$author
)

# How often each title is mentioned.
books.recommend.df.grouped <- books.recommend.df.filtered %>%
  group_by(name) %>%
  tally() %>%
  arrange(name, desc(n))
# How often each (title, author) pair is mentioned.
books.recommend.df.grouped2 <- books.recommend.df.filtered %>%
  group_by(name, author) %>%
  tally() %>%
  ungroup() %>% # tally() leaves the result grouped by `name`
  arrange(author)
# Remove "authors" that are really connective text picked up after a title.
# Authors were whitespace-trimmed by CleanAuthor(), so the literal must be
# "And Then" with no trailing line break, otherwise it can never match.
books.recommend.df.grouped2.filtered <- books.recommend.df.grouped2 %>%
  filter(author != "") %>%
  filter(author != "And") %>%
  filter(author != "And Then") %>%
  filter(author != "And The")

# Host part of every link (scheme and path stripped).
domain <- sub("^(http[s]?://)?([^/]+).*", "\\2", books.recommend.df.filtered$book_url)
books.recommend.df.filtered$domain <- domain
# Keep only links to non-excluded domains; "NA" is what a missing link turns
# into after URL cleaning.
books.recommend.df.filtered2 <- books.recommend.df.filtered %>%
  filter(!domain %in% domains_to_remove) %>%
  filter(domain != "NA")
# ---- Attach episode info and one canonical author per title ----
episodes_df <- dfs_combined
colnames(episodes_df) <- c("episode_title", "episode_url")
# Text before the first ":" of the feed title.
episodes_df$episode <- stringr::str_split_fixed(
  string = episodes_df$episode_title, pattern = ":", n = 2
)[, 1]
# Join keys are spelled out so the match cannot silently change if either
# table gains a column (episode_url is the only column the tables share).
books.recommend.df.filtered2.joined <- left_join(
  books.recommend.df.filtered2, episodes_df,
  by = "episode_url"
)
# Distinct (title, author) pairs, authors in descending order within a title.
booktoauthor <- books.recommend.df.filtered2.joined %>%
  dplyr::select(name, author) %>%
  unique() %>%
  arrange(name, desc(author))
# Keep the first author per title and expose it as `author_cleaned`;
# ungroup so the lookup table does not carry grouping into later steps.
booktoauthor.cleaned <- booktoauthor %>%
  group_by(name) %>%
  filter(row_number() == 1) %>%
  ungroup() %>%
  rename(author_cleaned = author)
books.recommend.df.filtered2.joined2 <- left_join(
  books.recommend.df.filtered2.joined, booktoauthor.cleaned,
  by = "name"
)
# ---- Persist the row-level data, then build and persist the display table ----
# File names carry the current date() string with spaces turned into "_".
saveRDS(
  object = books.recommend.df.filtered2.joined2,
  file = paste0(
    "~/github/misc_projects/12_seenunseen_books/recommended_books_",
    gsub(pattern = " ", replacement = "_", date()),
    ".rds"
  )
)

# One row per (title, canonical author): an HTML link to the first recorded
# URL, the number of distinct episodes, and HTML links to every mention.
books.recommend.df.filtered2.grouped <- books.recommend.df.filtered2.joined2 %>%
  group_by(name, author_cleaned) %>%
  summarise(
    book_url = sprintf('<a href="%s">Book</a>', book_url[[1]]),
    n_episodes = n_distinct(episode),
    episodes = paste(
      sprintf('<a href="%s">%s</a>', episode_url, episode),
      collapse = ", "
    )
  ) %>%
  arrange(desc(n_episodes), name, author_cleaned)

# Human-readable column headers for the rendered table.
names(books.recommend.df.filtered2.grouped) <- c(
  "Book",
  "Author",
  "Link",
  "Total episodes",
  "Episodes"
)

saveRDS(
  object = books.recommend.df.filtered2.grouped,
  file = paste0(
    "~/github/misc_projects/12_seenunseen_books/books.recommend.df.filtered2_",
    gsub(pattern = " ", replacement = "_", date()),
    ".rds"
  )
)
# ---- Render the summary as a sortable, paginated HTML table ----
# escape = FALSE lets the <a> tags built into the columns render as links.
datatablex <- datatable(
  data = books.recommend.df.filtered2.grouped,
  caption = htmltools::tags$caption(
    style = "font-size:22pt;",
    htmltools::tags$div(
      htmltools::HTML("Books recommended on <a href='https://seenunseen.in/'>'The Seen and the Unseen' </a> podcast by Amit Varma | <a href='https://saket-choudhary.me/seenunseencap'>Generated captions </a>")
    )
  ),
  options = list(
    pageLength = 50,
    autoWidth = TRUE,
    ordering = TRUE
  ),
  escape = FALSE
)
# Write a single self-contained HTML file.
saveWidget(
  widget = datatablex,
  file = "~/github/seenunseenbooks/index.html",
  selfcontained = TRUE,
  title = "SeenUnseen Books"
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment