# saketkc/seenunseenbooks.R

## seenunseenbooks.R
library(dplyr)
library(httr)
library(rvest)
library(xml2)
library(stringr)
library(DT)
library(htmltools)


# Hosts whose links are excluded from the final table. The script compares
# this vector with `%in%` against the host part of each scraped URL, so an
# entry must equal the host exactly ("youtube.com" and "www.youtube.com" are
# separate entries).
# Every host appears once, in alphabetical order, so a new entry can be
# checked for duplicates at a glance.
# NOTE(review): "www.vudi.com" looks like a typo of "www.vudu.com"; it is kept
# in case it matches a real (mistyped) link - confirm before removing.
domains_to_remove <- c(
  "alikazimi.ca",
  "blogs.intoday.in",
  "criterion.com",
  "economictimes.indiatimes.com",
  "en.wikipedia.org",
  "images.shulcloud.com",
  "indianexpress.com",
  "indiauncut.com",
  "m.thewire.in",
  "magiclanternmovies.in",
  "mubi.com",
  "music.apple.com",
  "music.youtube.com",
  "open.spotify.com",
  "openthemagazine.com",
  "primevideo.com",
  "scroll.in",
  "seenunseen.in",
  "soundcloud.com",
  "thenetworkstate.com",
  "theprint.in",
  "timesofindia.indiatimes.com",
  "tv.apple.com",
  "tvfplay.com",
  "vimeo.com",
  "watch.plex.tv",
  "www.bbc.co.uk",
  "www.business-standard.com",
  "www.criterion.com",
  "www.cultureunplugged.com",
  "www.dailymotion.com",
  "www.deccanchronicle.com",
  "www.ft.com",
  "www.hindustantimes.com",
  "www.hoichoi.tv",
  "www.hotstar.com",
  "www.hulu.com",
  "www.imdb.com",
  "www.indiatoday.in",
  "www.instagram.com",
  "www.jiocinema.com",
  "www.livemint.com",
  "www.metamorphosis.media",
  "www.miramax.com",
  "www.mxplayer.in",
  "www.netflix.com",
  "www.newindianexpress.com",
  "www.oppenheimermovie.com",
  "www.outlookindia.com",
  "www.primevideo.com",
  "www.sonyliv.com",
  "www.storytel.com",
  "www.theatlantic.com",
  "www.thehindu.com",
  "www.triangleofsadness.film",
  "www.voot.com",
  "www.vudi.com",
  "www.vudu.com",
  "www.youtube.com",
  "www.zee5.com",
  "youtu.be",
  "youtube.com"
)

# Strip the trailing run of characters that are not ASCII letters or digits.
#
# input_string: character vector; each element is processed independently.
# Returns a character vector of the same length. Elements that already end in
# A-Z, a-z or 0-9 are returned unchanged; an element made up only of other
# characters becomes "".
CleanStringEnds <- function(input_string) {
  trailing_junk <- "[^A-Za-z0-9]+$"
  sub(pattern = trailing_junk, replacement = "", x = input_string)
}

# Normalise a scraped title for display and grouping.
#
# string: character vector of raw titles.
# Returns a character vector of the same length, produced by these steps in
# order:
#   1. delete em dashes,
#   2. delete carriage returns and line feeds,
#   3. delete any trailing control characters,
#   4. turn tabs into spaces and collapse whitespace runs to a single space,
#   5. trim both ends and convert to title case (stringr::str_to_title).
CleanTitle <- function(string) {
  title <- gsub(pattern = "—", replacement = "", x = string)
  title <- str_replace_all(string = title, pattern = "[\r\n]", replacement = "")
  title <- gsub(pattern = "[[:cntrl:]]+$", replacement = "", x = title)
  title <- gsub(pattern = "\t", replacement = " ", x = title)
  title <- gsub(pattern = "\\s+", replacement = " ", x = title)
  str_to_title(string = trimws(x = title))
}

# Normalise a scraped author string.
#
# string: character vector of raw author text.
# Returns a character vector of the same length in which every character that
# is neither alphanumeric nor whitespace has been deleted, whitespace runs are
# collapsed to one space, both ends are trimmed, and the result is title-cased
# (stringr::str_to_title).
CleanAuthor <- function(string) {
  stripped <- gsub(pattern = "[^[:alnum:][:space:]]", replacement = "", x = string)
  single_spaced <- gsub(pattern = "\\s+", replacement = " ", x = stripped)
  str_to_title(string = trimws(x = single_spaced))
}

# Unwrap a redirect-style link and percent-decode it.
#
# url: character vector of URLs.
# Everything up to and including the first "q=" is removed (when present),
# then everything from the first "&" onward is removed, and the remainder is
# passed through utils::URLdecode().
# NOTE(review): presumably aimed at Google redirect links of the form
# ".../url?q=<target>&..." - confirm. URLs without "q=" only lose what follows
# their first "&".
CleanURL <- function(url) {
  after_q <- sub(pattern = ".*?q=", replacement = "", x = url)
  before_amp <- sub(pattern = "&.*", replacement = "", x = after_q)
  URLdecode(before_amp)
}

# Find the highest feed page number that does not answer with HTTP 404.
#
# base_url: URL prefix to which the page number is appended.
# Binary-searches page numbers 1..1000, issuing one GET per probe with a 0.1 s
# pause after each. Any status other than 404 counts as "page exists". The
# search is only meaningful if every page after the first 404 is also a 404.
# Prints the result with cat() and returns it; the result is 0 when even
# page 1 returns 404.
FigureOutLastPage <- function(base_url = "https://seenunseen.in/feed/?paged=") {
  lo <- 1
  hi <- 1000 # upper limit of the search range

  repeat {
    if (lo > hi) {
      break
    }
    probe <- lo + floor((hi - lo) / 2)
    response <- GET(paste0(base_url, probe))

    if (status_code(response) != 404) {
      # Page exists: the answer is this page or a later one.
      lo <- probe + 1
    } else {
      # Page missing: the answer lies strictly before it.
      hi <- probe - 1
    }
    Sys.sleep(0.1)
  }

  last_valid_page <- hi
  cat("Last valid page found:", last_valid_page, "\n")
  return(last_valid_page)
}

# Download one page of the RSS feed and list the episodes on it.
#
# url: URL of a feed page.
# Returns a data frame with one row per <item> element and two columns:
#   title - text of the item's own <title> child,
#   link  - text of the item's own <link> child.
# Rows whose link is exactly "https://seenunseen.in", and rows with no link,
# are dropped. Sleeps 0.1 s before returning to space out requests.
FeedToEpLinks <- function(url) {
  feed_response <- GET(url)
  feed_doc <- read_xml(feed_response)

  items <- xml_find_all(feed_doc, "//item")

  # The paths are relative ("./title"): an absolute "//title" is evaluated
  # from the document root, so it would also return titles and links that sit
  # outside any <item>. xml_find_first() returns exactly one result per item
  # (NA text when the child is missing), which keeps the two vectors aligned.
  titles <- xml_text(xml_find_first(items, "./title"))
  links <- xml_text(xml_find_first(items, "./link"))

  # The site-root filter is kept from the earlier version as a safety net;
  # filter() also drops rows whose link is NA.
  data_df <- data.frame(title = titles, link = links) %>%
    filter(link != "https://seenunseen.in")
  Sys.sleep(0.1)
  return(data_df)
}


# Scrape the italicised titles (and their links) from one episode page.
#
# episode_url: URL of the episode page.
# Returns a data frame with columns name, author, book_url and episode_url,
# one row per distinct <i> text found inside the first <div> whose class
# contains "entry-content".
#   name     - text of the <i> element, passed through CleanTitle().
#   author   - the text node that follows the first text node equal to the
#              title, passed through CleanAuthor(); NA when no text node
#              equals the title (for example when the <i> text has surrounding
#              whitespace, since the lookup keys are trimmed).
#   book_url - the "data-saferedirecturl" attribute of the first <a> inside
#              the <i>, falling back to "href" when that attribute is absent;
#              then passed through CleanURL().
# Sleeps 3 s before returning to space out requests.
FetchBooksFromEpisode <- function(episode_url) {
  # Fetch and parse the page, then narrow to the post body.
  webpage <- read_html(GET(url = episode_url))
  content_div <- html_node(webpage, xpath = "//div[contains(@class, 'entry-content')]")

  # Lookup table: each trimmed text node maps to the text node right after it.
  content_div_strings <- html_text(html_nodes(content_div, xpath = ".//text()"), trim = TRUE)
  content_div_sequential <- setNames(
    content_div_strings[-1],
    content_div_strings[-length(content_div_strings)]
  )

  books <- list()
  i_tags <- html_nodes(content_div, "i")

  for (i_tag in i_tags) {
    name <- html_text(i_tag)
    author <- content_div_sequential[name]

    # The value of tryCatch() is assigned directly: an assignment made inside
    # the handler function would stay local to the handler and be lost.
    book_url <- tryCatch(
      {
        anchor <- html_node(i_tag, "a")
        candidate <- html_attr(anchor, "data-saferedirecturl")
        # html_attr() reports a missing attribute as NA (never NULL), so the
        # fallback has to test with is.na().
        if (is.na(candidate)) {
          candidate <- html_attr(anchor, "href")
        }
        candidate
      },
      error = function(e) {
        print(e)
        NA_character_
      }
    )

    # Keyed by title, so a title repeated on the page keeps its last entry.
    books[[name]] <- data.frame(name = name, author = as.character(author), book_url = book_url)
  }

  books.df <- bind_rows(books)
  books.df$name <- CleanTitle(books.df$name)
  books.df$author <- CleanAuthor(books.df$author)
  books.df$book_url <- CleanURL(books.df$book_url)
  books.df$episode_url <- episode_url
  Sys.sleep(3)

  return(books.df)
}

# ---- Crawl the feed pages, scrape every episode, clean the titles ----
base_url <- "https://seenunseen.in/feed/?paged="
# Number of feed pages to read; hard-coded here (FigureOutLastPage() in this
# file can discover it with live requests).
last_valid_page <- 38
dfs <- lapply(
  X = paste0(base_url, seq_len(last_valid_page)),
  FUN = FeedToEpLinks
)
dfs_combined <- bind_rows(dfs)

# One data frame of recommendations per episode link.
books.recommend <- lapply(X = dfs_combined$link, FUN = FetchBooksFromEpisode)

# Re-clean the combined table, drop "Episode Art" rows, then strip trailing
# punctuation from the titles.
books.recommend.df <- bind_rows(books.recommend) %>%
  mutate(name = CleanTitle(name), author = CleanAuthor(author)) %>%
  filter(!grepl(pattern = "Episode Art", x = name)) %>%
  mutate(name = CleanStringEnds(name))

# Drop empty titles and the illustration credit, and remove the "Edited By "
# prefix from authors.
books.recommend.df.filtered <- books.recommend.df %>%
  filter(name != "", !grepl("The Illustration For This Episode", x = name)) %>%
  mutate(author = gsub(pattern = "Edited By ", replacement = "", x = author))

# ---- Diagnostic tallies: rows per title and per (title, author) pair ----
books.recommend.df.grouped <- books.recommend.df.filtered %>%
  group_by(name) %>%
  tally() %>%
  arrange(name, desc(n))
books.recommend.df.grouped2 <- books.recommend.df.filtered %>%
  group_by(name, author) %>%
  tally() %>%
  arrange(author)
# Drop pairs whose "author" is empty or just a connective fragment.
# The author column has already been whitespace-normalised by CleanAuthor(),
# so the literals must not contain line breaks: the former "And Then" literal
# ended in a newline and therefore could never match.
books.recommend.df.grouped2.filtered <- books.recommend.df.grouped2 %>%
  filter(author != "") %>%
  filter(author != "And") %>%
  filter(author != "And Then") %>%
  filter(author != "And The")

# ---- Keep only links from wanted hosts, attach episode info ----
# Host part of each URL: optional scheme, then everything up to the first "/".
domain <- sub("^(http[s]?://)?([^/]+).*", "\\2", books.recommend.df.filtered$book_url)

books.recommend.df.filtered$domain <- domain
# The second filter removes rows whose host is the literal string "NA"
# (and, because of how filter() treats NA, rows whose host is a true NA).
books.recommend.df.filtered2 <- books.recommend.df.filtered %>%
  filter(!domain %in% domains_to_remove) %>%
  filter(domain != "NA")


# Episode lookup table; `episode` is the part of the title before the first ":".
episodes_df <- dfs_combined
colnames(episodes_df) <- c("episode_title", "episode_url")
episodes_df$episode <- stringr::str_split_fixed(string = episodes_df$episode_title, pattern = ":", n = 2)[, 1]


# Join keys are named explicitly so the join cannot silently change if either
# table later gains another column with a shared name.
books.recommend.df.filtered2.joined <- left_join(
  books.recommend.df.filtered2,
  episodes_df,
  by = "episode_url"
)

# One canonical author per title: after sorting authors in descending order
# within each title, keep the first row of each title.
booktoauthor <- books.recommend.df.filtered2.joined %>%
  dplyr::select(name, author) %>%
  unique() %>%
  arrange(name, desc(author))
booktoauthor.cleaned <- booktoauthor %>%
  group_by(name) %>%
  filter(row_number() == 1) %>%
  ungroup()

booktoauthor.cleaned$author_cleaned <- booktoauthor.cleaned$author
booktoauthor.cleaned$author <- NULL
books.recommend.df.filtered2.joined2 <- left_join(
  books.recommend.df.filtered2.joined,
  booktoauthor.cleaned,
  by = "name"
)


# ---- Persist the row-level table, then build and persist the display table ----
# File names carry the current date() string with spaces turned into "_".
saveRDS(
  object = books.recommend.df.filtered2.joined2,
  file = paste0(
    "~/github/misc_projects/12_seenunseen_books/recommended_books_",
    gsub(" ", "_", date(), fixed = TRUE), ".rds"
  )
)

# One row per (title, canonical author): a link built from the group's first
# URL, the number of distinct episodes, and one HTML link per row of the group.
books.recommend.df.filtered2.grouped <- books.recommend.df.filtered2.joined2 %>%
  group_by(name, author_cleaned) %>%
  summarise(
    book_url = sprintf('<a href="%s">Book</a>', book_url[1]),
    n_episodes = n_distinct(episode),
    episodes = paste(
      sprintf('<a href="%s">%s</a>', episode_url, episode),
      collapse = ", "
    )
  ) %>%
  arrange(desc(n_episodes), name, author_cleaned)

# Human-readable headers for the rendered table.
names(books.recommend.df.filtered2.grouped) <- c(
  "Book", "Author", "Link", "Total episodes", "Episodes"
)

saveRDS(
  object = books.recommend.df.filtered2.grouped,
  file = paste0(
    "~/github/misc_projects/12_seenunseen_books/books.recommend.df.filtered2_",
    gsub(" ", "_", date(), fixed = TRUE), ".rds"
  )
)

# Render the summary as a sortable, paginated HTML table (50 rows per page).
# escape = FALSE lets the <a> markup stored in the table cells render as links.
datatablex <- datatable(
  data = books.recommend.df.filtered2.grouped,
  options = list(pageLength = 50, autoWidth = TRUE, ordering = TRUE),
  caption = htmltools::tags$caption(
    htmltools::withTags(
      div(HTML("Books recommended on <a href='https://seenunseen.in/'>'The Seen and the Unseen' </a> podcast by Amit Varma | <a href='https://saket-choudhary.me/seenunseencap'>Generated captions </a>"))
    ),
    style = "font-size:22pt;"
  ),
  escape = FALSE
)

# Write the widget as a single self-contained HTML file.
saveWidget(
  widget = datatablex,
  file = "~/github/seenunseenbooks/index.html",
  selfcontained = TRUE,
  title = "SeenUnseen Books"
)
	library(dplyr)
	library(httr)
	library(rvest)
	library(xml2)
	library(stringr)
	library(DT)
	library(htmltools)


	# Hosts whose links are excluded from the final table. The script compares
	# this vector with `%in%` against the host part of each scraped URL, so an
	# entry must equal the host exactly ("youtube.com" and "www.youtube.com" are
	# separate entries).
	# Every host appears once, in alphabetical order, so a new entry can be
	# checked for duplicates at a glance.
	# NOTE(review): "www.vudi.com" looks like a typo of "www.vudu.com"; it is kept
	# in case it matches a real (mistyped) link - confirm before removing.
	domains_to_remove <- c(
	  "alikazimi.ca",
	  "blogs.intoday.in",
	  "criterion.com",
	  "economictimes.indiatimes.com",
	  "en.wikipedia.org",
	  "images.shulcloud.com",
	  "indianexpress.com",
	  "indiauncut.com",
	  "m.thewire.in",
	  "magiclanternmovies.in",
	  "mubi.com",
	  "music.apple.com",
	  "music.youtube.com",
	  "open.spotify.com",
	  "openthemagazine.com",
	  "primevideo.com",
	  "scroll.in",
	  "seenunseen.in",
	  "soundcloud.com",
	  "thenetworkstate.com",
	  "theprint.in",
	  "timesofindia.indiatimes.com",
	  "tv.apple.com",
	  "tvfplay.com",
	  "vimeo.com",
	  "watch.plex.tv",
	  "www.bbc.co.uk",
	  "www.business-standard.com",
	  "www.criterion.com",
	  "www.cultureunplugged.com",
	  "www.dailymotion.com",
	  "www.deccanchronicle.com",
	  "www.ft.com",
	  "www.hindustantimes.com",
	  "www.hoichoi.tv",
	  "www.hotstar.com",
	  "www.hulu.com",
	  "www.imdb.com",
	  "www.indiatoday.in",
	  "www.instagram.com",
	  "www.jiocinema.com",
	  "www.livemint.com",
	  "www.metamorphosis.media",
	  "www.miramax.com",
	  "www.mxplayer.in",
	  "www.netflix.com",
	  "www.newindianexpress.com",
	  "www.oppenheimermovie.com",
	  "www.outlookindia.com",
	  "www.primevideo.com",
	  "www.sonyliv.com",
	  "www.storytel.com",
	  "www.theatlantic.com",
	  "www.thehindu.com",
	  "www.triangleofsadness.film",
	  "www.voot.com",
	  "www.vudi.com",
	  "www.vudu.com",
	  "www.youtube.com",
	  "www.zee5.com",
	  "youtu.be",
	  "youtube.com"
	)

	# Strip the trailing run of characters that are not ASCII letters or digits.
	#
	# input_string: character vector; each element is processed independently.
	# Returns a character vector of the same length. Elements that already end in
	# A-Z, a-z or 0-9 are returned unchanged; an element made up only of other
	# characters becomes "".
	CleanStringEnds <- function(input_string) {
	  trailing_junk <- "[^A-Za-z0-9]+$"
	  sub(pattern = trailing_junk, replacement = "", x = input_string)
	}

	# Normalise a scraped title for display and grouping.
	#
	# string: character vector of raw titles.
	# Returns a character vector of the same length, produced by these steps in
	# order:
	#   1. delete em dashes,
	#   2. delete carriage returns and line feeds,
	#   3. delete any trailing control characters,
	#   4. turn tabs into spaces and collapse whitespace runs to a single space,
	#   5. trim both ends and convert to title case (stringr::str_to_title).
	CleanTitle <- function(string) {
	  title <- gsub(pattern = "—", replacement = "", x = string)
	  title <- str_replace_all(string = title, pattern = "[\r\n]", replacement = "")
	  title <- gsub(pattern = "[[:cntrl:]]+$", replacement = "", x = title)
	  title <- gsub(pattern = "\t", replacement = " ", x = title)
	  title <- gsub(pattern = "\\s+", replacement = " ", x = title)
	  str_to_title(string = trimws(x = title))
	}

	# Normalise a scraped author string.
	#
	# string: character vector of raw author text.
	# Returns a character vector of the same length in which every character that
	# is neither alphanumeric nor whitespace has been deleted, whitespace runs are
	# collapsed to one space, both ends are trimmed, and the result is title-cased
	# (stringr::str_to_title).
	CleanAuthor <- function(string) {
	  stripped <- gsub(pattern = "[^[:alnum:][:space:]]", replacement = "", x = string)
	  single_spaced <- gsub(pattern = "\\s+", replacement = " ", x = stripped)
	  str_to_title(string = trimws(x = single_spaced))
	}

	# Unwrap a redirect-style link and percent-decode it.
	#
	# url: character vector of URLs.
	# Everything up to and including the first "q=" is removed (when present),
	# then everything from the first "&" onward is removed, and the remainder is
	# passed through utils::URLdecode().
	# NOTE(review): presumably aimed at Google redirect links of the form
	# ".../url?q=<target>&..." - confirm. URLs without "q=" only lose what follows
	# their first "&".
	CleanURL <- function(url) {
	  after_q <- sub(pattern = ".*?q=", replacement = "", x = url)
	  before_amp <- sub(pattern = "&.*", replacement = "", x = after_q)
	  URLdecode(before_amp)
	}

	# Find the highest feed page number that does not answer with HTTP 404.
	#
	# base_url: URL prefix to which the page number is appended.
	# Binary-searches page numbers 1..1000, issuing one GET per probe with a 0.1 s
	# pause after each. Any status other than 404 counts as "page exists". The
	# search is only meaningful if every page after the first 404 is also a 404.
	# Prints the result with cat() and returns it; the result is 0 when even
	# page 1 returns 404.
	FigureOutLastPage <- function(base_url = "https://seenunseen.in/feed/?paged=") {
	  lo <- 1
	  hi <- 1000 # upper limit of the search range

	  repeat {
	    if (lo > hi) {
	      break
	    }
	    probe <- lo + floor((hi - lo) / 2)
	    response <- GET(paste0(base_url, probe))

	    if (status_code(response) != 404) {
	      # Page exists: the answer is this page or a later one.
	      lo <- probe + 1
	    } else {
	      # Page missing: the answer lies strictly before it.
	      hi <- probe - 1
	    }
	    Sys.sleep(0.1)
	  }

	  last_valid_page <- hi
	  cat("Last valid page found:", last_valid_page, "\n")
	  return(last_valid_page)
	}

	# Download one page of the RSS feed and list the episodes on it.
	#
	# url: URL of a feed page.
	# Returns a data frame with one row per <item> element and two columns:
	#   title - text of the item's own <title> child,
	#   link  - text of the item's own <link> child.
	# Rows whose link is exactly "https://seenunseen.in", and rows with no link,
	# are dropped. Sleeps 0.1 s before returning to space out requests.
	FeedToEpLinks <- function(url) {
	  feed_response <- GET(url)
	  feed_doc <- read_xml(feed_response)

	  items <- xml_find_all(feed_doc, "//item")

	  # The paths are relative ("./title"): an absolute "//title" is evaluated
	  # from the document root, so it would also return titles and links that sit
	  # outside any <item>. xml_find_first() returns exactly one result per item
	  # (NA text when the child is missing), which keeps the two vectors aligned.
	  titles <- xml_text(xml_find_first(items, "./title"))
	  links <- xml_text(xml_find_first(items, "./link"))

	  # The site-root filter is kept from the earlier version as a safety net;
	  # filter() also drops rows whose link is NA.
	  data_df <- data.frame(title = titles, link = links) %>%
	    filter(link != "https://seenunseen.in")
	  Sys.sleep(0.1)
	  return(data_df)
	}


	# Scrape the italicised titles (and their links) from one episode page.
	#
	# episode_url: URL of the episode page.
	# Returns a data frame with columns name, author, book_url and episode_url,
	# one row per distinct <i> text found inside the first <div> whose class
	# contains "entry-content".
	#   name     - text of the <i> element, passed through CleanTitle().
	#   author   - the text node that follows the first text node equal to the
	#              title, passed through CleanAuthor(); NA when no text node
	#              equals the title (for example when the <i> text has surrounding
	#              whitespace, since the lookup keys are trimmed).
	#   book_url - the "data-saferedirecturl" attribute of the first <a> inside
	#              the <i>, falling back to "href" when that attribute is absent;
	#              then passed through CleanURL().
	# Sleeps 3 s before returning to space out requests.
	FetchBooksFromEpisode <- function(episode_url) {
	  # Fetch and parse the page, then narrow to the post body.
	  webpage <- read_html(GET(url = episode_url))
	  content_div <- html_node(webpage, xpath = "//div[contains(@class, 'entry-content')]")

	  # Lookup table: each trimmed text node maps to the text node right after it.
	  content_div_strings <- html_text(html_nodes(content_div, xpath = ".//text()"), trim = TRUE)
	  content_div_sequential <- setNames(
	    content_div_strings[-1],
	    content_div_strings[-length(content_div_strings)]
	  )

	  books <- list()
	  i_tags <- html_nodes(content_div, "i")

	  for (i_tag in i_tags) {
	    name <- html_text(i_tag)
	    author <- content_div_sequential[name]

	    # The value of tryCatch() is assigned directly: an assignment made inside
	    # the handler function would stay local to the handler and be lost.
	    book_url <- tryCatch(
	      {
	        anchor <- html_node(i_tag, "a")
	        candidate <- html_attr(anchor, "data-saferedirecturl")
	        # html_attr() reports a missing attribute as NA (never NULL), so the
	        # fallback has to test with is.na().
	        if (is.na(candidate)) {
	          candidate <- html_attr(anchor, "href")
	        }
	        candidate
	      },
	      error = function(e) {
	        print(e)
	        NA_character_
	      }
	    )

	    # Keyed by title, so a title repeated on the page keeps its last entry.
	    books[[name]] <- data.frame(name = name, author = as.character(author), book_url = book_url)
	  }

	  books.df <- bind_rows(books)
	  books.df$name <- CleanTitle(books.df$name)
	  books.df$author <- CleanAuthor(books.df$author)
	  books.df$book_url <- CleanURL(books.df$book_url)
	  books.df$episode_url <- episode_url
	  Sys.sleep(3)

	  return(books.df)
	}

	# ---- Crawl the feed pages, scrape every episode, clean the titles ----
	base_url <- "https://seenunseen.in/feed/?paged="
	# Number of feed pages to read; hard-coded here (FigureOutLastPage() in this
	# file can discover it with live requests).
	last_valid_page <- 38
	dfs <- lapply(
	  X = paste0(base_url, seq_len(last_valid_page)),
	  FUN = FeedToEpLinks
	)
	dfs_combined <- bind_rows(dfs)

	# One data frame of recommendations per episode link.
	books.recommend <- lapply(X = dfs_combined$link, FUN = FetchBooksFromEpisode)

	# Re-clean the combined table, drop "Episode Art" rows, then strip trailing
	# punctuation from the titles.
	books.recommend.df <- bind_rows(books.recommend) %>%
	  mutate(name = CleanTitle(name), author = CleanAuthor(author)) %>%
	  filter(!grepl(pattern = "Episode Art", x = name)) %>%
	  mutate(name = CleanStringEnds(name))

	# Drop empty titles and the illustration credit, and remove the "Edited By "
	# prefix from authors.
	books.recommend.df.filtered <- books.recommend.df %>%
	  filter(name != "", !grepl("The Illustration For This Episode", x = name)) %>%
	  mutate(author = gsub(pattern = "Edited By ", replacement = "", x = author))

	# ---- Diagnostic tallies: rows per title and per (title, author) pair ----
	books.recommend.df.grouped <- books.recommend.df.filtered %>%
	  group_by(name) %>%
	  tally() %>%
	  arrange(name, desc(n))
	books.recommend.df.grouped2 <- books.recommend.df.filtered %>%
	  group_by(name, author) %>%
	  tally() %>%
	  arrange(author)
	# Drop pairs whose "author" is empty or just a connective fragment.
	# The author column has already been whitespace-normalised by CleanAuthor(),
	# so the literals must not contain line breaks: the former "And Then" literal
	# ended in a newline and therefore could never match.
	books.recommend.df.grouped2.filtered <- books.recommend.df.grouped2 %>%
	  filter(author != "") %>%
	  filter(author != "And") %>%
	  filter(author != "And Then") %>%
	  filter(author != "And The")

	# ---- Keep only links from wanted hosts, attach episode info ----
	# Host part of each URL: optional scheme, then everything up to the first "/".
	domain <- sub("^(http[s]?://)?([^/]+).*", "\\2", books.recommend.df.filtered$book_url)

	books.recommend.df.filtered$domain <- domain
	# The second filter removes rows whose host is the literal string "NA"
	# (and, because of how filter() treats NA, rows whose host is a true NA).
	books.recommend.df.filtered2 <- books.recommend.df.filtered %>%
	  filter(!domain %in% domains_to_remove) %>%
	  filter(domain != "NA")


	# Episode lookup table; `episode` is the part of the title before the first ":".
	episodes_df <- dfs_combined
	colnames(episodes_df) <- c("episode_title", "episode_url")
	episodes_df$episode <- stringr::str_split_fixed(string = episodes_df$episode_title, pattern = ":", n = 2)[, 1]


	# Join keys are named explicitly so the join cannot silently change if either
	# table later gains another column with a shared name.
	books.recommend.df.filtered2.joined <- left_join(
	  books.recommend.df.filtered2,
	  episodes_df,
	  by = "episode_url"
	)

	# One canonical author per title: after sorting authors in descending order
	# within each title, keep the first row of each title.
	booktoauthor <- books.recommend.df.filtered2.joined %>%
	  dplyr::select(name, author) %>%
	  unique() %>%
	  arrange(name, desc(author))
	booktoauthor.cleaned <- booktoauthor %>%
	  group_by(name) %>%
	  filter(row_number() == 1) %>%
	  ungroup()

	booktoauthor.cleaned$author_cleaned <- booktoauthor.cleaned$author
	booktoauthor.cleaned$author <- NULL
	books.recommend.df.filtered2.joined2 <- left_join(
	  books.recommend.df.filtered2.joined,
	  booktoauthor.cleaned,
	  by = "name"
	)


	# ---- Persist the row-level table, then build and persist the display table ----
	# File names carry the current date() string with spaces turned into "_".
	saveRDS(
	  object = books.recommend.df.filtered2.joined2,
	  file = paste0(
	    "~/github/misc_projects/12_seenunseen_books/recommended_books_",
	    gsub(" ", "_", date(), fixed = TRUE), ".rds"
	  )
	)

	# One row per (title, canonical author): a link built from the group's first
	# URL, the number of distinct episodes, and one HTML link per row of the group.
	books.recommend.df.filtered2.grouped <- books.recommend.df.filtered2.joined2 %>%
	  group_by(name, author_cleaned) %>%
	  summarise(
	    book_url = sprintf('<a href="%s">Book</a>', book_url[1]),
	    n_episodes = n_distinct(episode),
	    episodes = paste(
	      sprintf('<a href="%s">%s</a>', episode_url, episode),
	      collapse = ", "
	    )
	  ) %>%
	  arrange(desc(n_episodes), name, author_cleaned)

	# Human-readable headers for the rendered table.
	names(books.recommend.df.filtered2.grouped) <- c(
	  "Book", "Author", "Link", "Total episodes", "Episodes"
	)

	saveRDS(
	  object = books.recommend.df.filtered2.grouped,
	  file = paste0(
	    "~/github/misc_projects/12_seenunseen_books/books.recommend.df.filtered2_",
	    gsub(" ", "_", date(), fixed = TRUE), ".rds"
	  )
	)

	# Render the summary as a sortable, paginated HTML table (50 rows per page).
	# escape = FALSE lets the <a> markup stored in the table cells render as links.
	datatablex <- datatable(books.recommend.df.filtered2.grouped,
	  escape = FALSE, options = list(
	    pageLength = 50,
	    autoWidth = TRUE,
	    ordering = TRUE
	  ), caption = htmltools::tags$caption(
	    htmltools::withTags(
	      # The separator is a plain "|": "\|" is not a valid escape sequence in
	      # an R string literal and prevents the file from parsing.
	      div(HTML("Books recommended on <a href='https://seenunseen.in/'>'The Seen and the Unseen' </a> podcast by Amit Varma | <a href='https://saket-choudhary.me/seenunseencap'>Generated captions </a>"))
	    ),
	    style = "font-size:22pt;"
	  )
	)
	# Write the widget as a single self-contained HTML file.
	saveWidget(widget = datatablex, "~/github/seenunseenbooks/index.html", selfcontained = TRUE, title = "SeenUnseen Books")