Gist by @saketkc, created April 9, 2024
seenunseenbooks
# Scrape book recommendations from 'The Seen and the Unseen' podcast show notes
# and publish them as a sortable HTML table.
library(dplyr)
library(httr)
library(rvest)
library(xml2)
library(stringr)
library(DT)
library(htmltools)
library(htmlwidgets) # saveWidget(); also re-exported by DT
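# Domains to exclude when keeping book links: streaming services, news sites,
# and other non-bookstore destinations that also appear in the show notes.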
domains_to_remove <- c(
"soundcloud.com",
"open.spotify.com",
"www.hotstar.com",
"mubi.com",
"www.jiocinema.com",
"en.wikipedia.org",
"www.netflix.com",
"www.primevideo.com",
"www.youtube.com",
"timesofindia.indiatimes.com",
"www.business-standard.com",
"www.livemint.com",
"www.mxplayer.in",
"youtu.be",
"www.hindustantimes.com",
"www.vudi.com",
"seenunseen.in",
"www.indiatoday.in",
"primevideo.com",
"vimeo.com",
"www.hoichoi.tv",
"tv.apple.com",
"www.imdb.com",
"www.deccanchronicle.com",
"www.voot.com",
"www.vudu.com",
"www.youtube.com",
"www.zee5.com",
"youtu.be",
"youtube.com",
"www.hulu.com",
"music.youtube.com",
"indianexpress.com",
"www.newindianexpress.com",
"www.oppenheimermovie.com",
"www.ft.com",
"www.dailymotion.com",
"watch.plex.tv",
"scroll.in",
"m.thewire.in",
"theprint.in",
"openthemagazine.com",
"images.shulcloud.com",
"criterion.com",
"watch.plex.tv",
"www.ft.com",
"tvfplay.com",
"scroll.in",
"music.apple.com",
"magiclanternmovies.in",
"www.instagram.com",
"www.oppenheimermovie.com",
"www.outlookindia.com",
"www.thehindu.com",
"indianexpress.com",
"www.sonyliv.com",
"www.storytel.com",
"blogs.intoday.in",
"www.miramax.com",
"www.theatlantic.com",
"thenetworkstate.com",
"www.bbc.co.uk",
"www.metamorphosis.media",
"alikazimi.ca",
"www.criterion.com",
"economictimes.indiatimes.com",
"www.triangleofsadness.film",
"indiauncut.com",
"www.cultureunplugged.com"
)
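# Trim trailing punctuation/whitespace left over after title extraction.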
CleanStringEnds <- function(input_string) {
  # Replace non-alphanumeric characters at the end of the string with an empty string
  cleaned_string <- sub("[^A-Za-z0-9]+$", "", input_string)
  return(cleaned_string)
}
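# Normalise a book title: drop dashes and control characters, collapse
# whitespace, and convert to title case.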
CleanTitle <- function(string) {
  clean_string <- gsub("—", "", string)
  clean_string <- str_replace_all(clean_string, "[\r\n]", "")
  clean_string <- gsub("[[:cntrl:]]+$", "", clean_string)
  clean_string <- gsub("\t", " ", clean_string)
  clean_string <- gsub("\\s+", " ", clean_string)
  clean_string <- trimws(x = clean_string)
  clean_string <- str_to_title(string = clean_string)
  return(clean_string)
}
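# Normalise an author credit: keep only alphanumerics and spaces, collapse
# whitespace, and convert to title case.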
CleanAuthor <- function(string) {
  clean_string <- gsub("[^[:alnum:][:space:]]", "", string)
  clean_string <- gsub("\\s+", " ", clean_string)
  clean_string <- trimws(x = clean_string)
  clean_string <- str_to_title(string = clean_string)
  return(clean_string)
}
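# Recover the target URL from a Google redirect link of the form
# "https://www.google.com/url?q=<target>&...".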
CleanURL <- function(url) {
  actual_url <- sub(".*?q=", "", url)
  actual_url <- sub("&.*", "", actual_url)
  # URLdecode() errors on NA input, so decode element-wise with an NA guard
  actual_url <- vapply(actual_url, function(u) {
    if (is.na(u)) NA_character_ else URLdecode(u)
  }, character(1), USE.NAMES = FALSE)
  return(actual_url)
}
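# Binary-search the paginated RSS feed for the highest page number that does
# not return a 404.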
FigureOutLastPage <- function(base_url = "https://seenunseen.in/feed/?paged=") {
  min_page <- 1
  max_page <- 1000 # Set an upper limit for the search range
  while (min_page <= max_page) {
    mid_page <- min_page + floor((max_page - min_page) / 2)
    url <- paste0(base_url, mid_page)
    response <- GET(url)
    if (status_code(response) == 404) {
      max_page <- mid_page - 1
    } else {
      min_page <- mid_page + 1
    }
    Sys.sleep(0.1)
  }
  last_valid_page <- max_page
  cat("Last valid page found:", last_valid_page, "\n")
  return(last_valid_page)
}
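# Parse one page of the RSS feed into a data frame of episode titles and links.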
FeedToEpLinks <- function(url) {
  xml_data <- GET(url)
  xml_doc <- read_xml(xml_data)
  items <- xml_find_all(xml_doc, "//item")
  # Extract <title> and <link> from each <item>; use a relative XPath (".//")
  # so the channel-level <title>/<link> are not swept in as well
  titles <- xml_text(xml_find_all(items, ".//title"))
  links <- xml_text(xml_find_all(items, ".//link"))
  # Create a data frame with titles and links
  data_df <- data.frame(title = titles, link = links) %>% filter(link != "https://seenunseen.in")
  Sys.sleep(0.1)
  return(data_df)
}
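# Scrape a single episode page: book titles appear in <i> tags, and the author
# credit is the text node that immediately follows each title.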
FetchBooksFromEpisode <- function(episode_url) {
  # Fetch the HTML content from the webpage
  webpage <- read_html(GET(url = episode_url))
  # Extract content from the specific div class
  content_div <- html_node(webpage, xpath = "//div[contains(@class, 'entry-content')]")
  # Extract all text nodes and build a "this text -> next text" mapping, so the
  # author credit can be looked up by the title that precedes it
  content_div_strings <- html_text(html_nodes(content_div, xpath = ".//text()"), trim = TRUE)
  content_div_sequential <- setNames(content_div_strings[-1], content_div_strings[-length(content_div_strings)])
  # Iterate through all <i> tags to get book details
  books <- list()
  i_tags <- html_nodes(content_div, "i")
  for (i_tag in i_tags) {
    name <- html_text(i_tag)
    if (is.na(name) || name == "") next
    author <- content_div_sequential[name]
    # Extract book URL; assign the tryCatch() result so the NA fallback actually
    # reaches this scope (an assignment inside the handler would be local to it).
    # html_attr() returns NA, not NULL, for a missing attribute, hence is.na().
    book_url <- tryCatch(
      {
        book_url <- html_attr(html_node(i_tag, "a"), "data-saferedirecturl")
        if (is.na(book_url)) {
          book_url <- html_attr(html_node(i_tag, "a"), "href")
        }
        book_url
      },
      error = function(e) {
        print(e)
        NA_character_
      }
    )
    books[[name]] <- data.frame(name = name, author = as.character(author), book_url = book_url)
  }
  books.df <- bind_rows(books)
  books.df$name <- CleanTitle(books.df$name)
  books.df$author <- CleanAuthor(books.df$author)
  books.df$book_url <- CleanURL(books.df$book_url)
  books.df$episode_url <- episode_url
  Sys.sleep(3)
  return(books.df)
}
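# Main pipeline: collect episode links from the feed, scrape every episode,
# then clean up the scraped titles and authors.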
base_url <- "https://seenunseen.in/feed/?paged="
last_valid_page <- 38 # as of April 2024; recompute with FigureOutLastPage() for a fresh run
dfs <- lapply(X = paste0(base_url, seq(1, last_valid_page)), FUN = FeedToEpLinks)
dfs_combined <- bind_rows(dfs)
books.recommend <- lapply(X = dfs_combined$link, FUN = FetchBooksFromEpisode)
books.recommend.df <- bind_rows(books.recommend)
books.recommend.df$name <- CleanTitle(books.recommend.df$name)
books.recommend.df$author <- CleanAuthor(books.recommend.df$author)
books.recommend.df <- books.recommend.df %>% filter(!grepl(x = name, pattern = "Episode Art"))
books.recommend.df$name <- CleanStringEnds(books.recommend.df$name)
books.recommend.df.filtered <- books.recommend.df %>%
  filter(name != "") %>%
  filter(!grepl("The Illustration For This Episode", x = name))
books.recommend.df.filtered$author <- gsub(pattern = "Edited By ", replacement = "", x = books.recommend.df.filtered$author)
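# Diagnostic tallies: how often each title (and each title/author pair) appears.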
books.recommend.df.grouped <- books.recommend.df.filtered %>%
  group_by(name) %>%
  tally() %>%
  arrange(name, desc(n))
books.recommend.df.grouped2 <- books.recommend.df.filtered %>%
  group_by(name, author) %>%
  tally() %>%
  arrange(author)
books.recommend.df.grouped2.filtered <- books.recommend.df.grouped2 %>%
  filter(author != "") %>%
  filter(author != "And") %>%
  filter(author != "And Then") %>%
  filter(author != "And The")
domain <- sub("^(http[s]?://)?([^/]+).*", "\\2", books.recommend.df.filtered$book_url)
books.recommend.df.filtered$domain <- domain
books.recommend.df.filtered2 <- books.recommend.df.filtered %>%
  filter(!domain %in% domains_to_remove) %>%
  filter(!is.na(domain)) # missing URLs yield NA, not the string "NA"
episodes_df <- dfs_combined
colnames(episodes_df) <- c("episode_title", "episode_url")
episodes_df$episode <- stringr::str_split_fixed(string = episodes_df$episode_title, pattern = ":", n = 2)[, 1]
books.recommend.df.filtered2.joined <- left_join(books.recommend.df.filtered2, episodes_df, by = "episode_url")
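# The same book can be credited slightly differently across episodes; keep one
# canonical author string per title (the first after sorting).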
booktoauthor <- books.recommend.df.filtered2.joined %>%
  dplyr::select(name, author) %>%
  unique() %>%
  arrange(name, desc(author))
booktoauthor.cleaned <- booktoauthor %>%
  group_by(name) %>%
  filter(row_number() == 1)
booktoauthor.cleaned$author_cleaned <- booktoauthor.cleaned$author
booktoauthor.cleaned$author <- NULL
books.recommend.df.filtered2.joined2 <- left_join(books.recommend.df.filtered2.joined, booktoauthor.cleaned, by = "name")
saveRDS(
  books.recommend.df.filtered2.joined2,
  paste0(
    "~/github/misc_projects/12_seenunseen_books/recommended_books_",
    gsub(pattern = " ", replacement = "_", date()), ".rds"
  )
)
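# One row per book for display: a store link, the number of distinct episodes,
# and links to each episode.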
books.recommend.df.filtered2.grouped <- books.recommend.df.filtered2.joined2 %>%
  group_by(name, author_cleaned) %>%
  summarise(
    book_url = sprintf('<a href="%s">Book</a>', book_url[1]),
    n_episodes = length(unique(episode)),
    episodes = paste0(sprintf('<a href="%s">%s</a>', episode_url, episode), collapse = ", ")
  ) %>%
  arrange(desc(n_episodes), name, author_cleaned)
colnames(books.recommend.df.filtered2.grouped) <- c("Book", "Author", "Link", "Total episodes", "Episodes")
saveRDS(
  books.recommend.df.filtered2.grouped,
  paste0(
    "~/github/misc_projects/12_seenunseen_books/books.recommend.df.filtered2_",
    gsub(pattern = " ", replacement = "_", date()), ".rds"
  )
)
# Render the table as a sortable HTML table
datatablex <- datatable(books.recommend.df.filtered2.grouped,
  escape = FALSE, options = list(
    pageLength = 50,
    autoWidth = TRUE,
    ordering = TRUE
  ), caption = htmltools::tags$caption(
    htmltools::withTags(
      div(HTML("Books recommended on <a href='https://seenunseen.in/'>'The Seen and the Unseen'</a> podcast by Amit Varma | <a href='https://saket-choudhary.me/seenunseencap'>Generated captions</a>"))
    ),
    style = "font-size:22pt;"
  )
)
saveWidget(widget = datatablex, "~/github/seenunseenbooks/index.html", selfcontained = TRUE, title = "SeenUnseen Books")