# thedivtagguy/library.r

## library.r
library(tidyverse)
library(gdata)
library(stringr)
library(readr)
library(gsubfn)
# Read in File
# This is the fixed file from Excel, but we want to do everything in R
# data <-
#   read.csv("D:/Srishti/Year 2 Semester 1/Co-Create/books.csv", header = FALSE)


# Read the raw fixed-width export.
# The first 10 lines are skipped and each remaining line lands in a single
# column called Title. The URL is only needed here, so it stays local.
data <- local({
  raw_url <-
    "https://raw.githubusercontent.com/thedivtagguy/srishtilibrary/main/books_raw.txt"
  read_fwf(raw_url, fwf_empty(raw_url, col_names = c("Title")), skip = 10)
})

# Split the single text column into Account_No, Title and Date.
# Each step works on the Title produced by the step before it:
#   1. Date       <- first d/d/d pattern, then all such patterns are removed
#   2. Account_No <- first 4-5 digit run followed by whitespace, then removed
# Finally the columns are put in the order Account_No, Title, Date.
data <- data %>%
  mutate(
    Date = str_extract(Title, "\\d+/\\d+/\\d+"),
    Title = gsub("\\d+/\\d+/\\d+", "", Title),
    Account_No = str_extract(Title, "\\d{4,5}\\s"),
    Title = trimws(gsub("\\d{4,5}\\s", "", Title))
  ) %>%
  dplyr::select(Account_No, Title, Date)

# Merge continuation rows into the row above them.
#
# A row counts as a continuation when the column to the left of `col_numb`
# is NA while column `col_numb` itself holds text. Its text is appended
# (space-separated) to the previous row and the continuation cell is set to
# NA. Rows are processed bottom-up so that several consecutive continuation
# rows all end up in the same parent row.
#
# Arguments:
#   df       data frame / tibble to repair.
#   col_numb position (>= 2) of the text column; column `col_numb - 1` is the
#            key column whose NA marks a continuation row.
# Returns `df` with column `col_numb` rewritten; no rows are removed.
delim <- function(df, col_numb) {
  stopifnot(
    is.numeric(col_numb), length(col_numb) == 1,
    col_numb >= 2, col_numb <= ncol(df)
  )

  n_rows <- nrow(df)
  # With fewer than two rows there is nothing to merge into; `n_rows:2`
  # would count upwards and touch rows that do not exist.
  if (n_rows < 2) {
    return(df)
  }

  # Work on plain vectors and write the column back once.
  key_missing <- is.na(df[[col_numb - 1]])
  text <- df[[col_numb]]

  for (i in n_rows:2) {
    if (key_missing[i] && !is.na(text[i])) {
      # If the row above is empty, move the text up as-is instead of
      # producing the literal string "NA <text>".
      text[i - 1] <- if (is.na(text[i - 1])) {
        text[i]
      } else {
        paste(text[i - 1], text[i], sep = " ")
      }
      text[i] <- NA
    }
  }

  df[[col_numb]] <- text
  df
}

# Fix Rows
# 1. Turn empty strings into NA in every column.
# 2. delim(2) appends the text of rows whose first column is NA to the row
#    above them (column 2 is the text column).
# 3. Drop every row that still has an NA in any column.
# The lambda form is used because passing extra arguments through across()'s
# `...` is deprecated in dplyr >= 1.1.
data <- data %>%
  as_tibble() %>%
  mutate(across(everything(), ~ na_if(.x, ""))) %>%
  delim(2) %>%
  drop_na()


# Derive Author and Book_ID from the combined Title text and clean both
# columns. The steps run top to bottom; later steps see earlier results.
data <- data %>%
  mutate(
    # Collapse every run of whitespace into one space.
    Title = gsub("\\s+", " ", Title),
    # Author is whatever follows the last "/" (the whole text if there is none).
    Author = str_extract(Title, "[^/]+$"),
    # Book_ID is the first number (with optional decimal part) inside Author.
    Book_ID = str_extract(Author, "\\d+\\.*\\d*"),
    # Remove all such numbers from both columns.
    Author = gsub("\\d+\\.*\\d*", "", Author),
    Title = gsub("\\d+\\.*\\d*", "", Title),
    # Whitespace was already collapsed, so a double space can only be the gap
    # left by a removed number; everything from that gap onwards is dropped.
    Title = gsub("  .*", "", Title),
    # Strip, in this exact order, the trailing remainder, "et al" variants,
    # editor markers, parentheses, " ." fragments and the "Fic" marker.
    Author = Reduce(
      function(text, pattern) gsub(pattern, "", text),
      c(
        "  .*", "Et al", "ED", "\\(", "\\)",
        "Et. al.,", "ET AL", "& Et", "\\s\\.", "Fic"
      ),
      init = Author
    ),
    # Remove a single trailing full stop or comma.
    Author = gsub("\\.$|\\,$", "", Author),
    # Ensure a space follows each comma, then trim the ends.
    Author = trimws(textclean::add_comma_space(Author))
  )

# Classify Books
# Book_Type is looked up from the first two characters of Book_ID. Any
# Book_ID that is NA or whose prefix is not in the table becomes
# "Uncategorized". The result is stored as a factor.
# NOTE(review): the labels look like Dewey Decimal divisions - confirm.
data <- local({
  division_labels <- c(
    "00" = "Computer science, knowledge & systems",
    "01" = "Bibliographies",
    "02" = "Library & Information Sciences",
    "03" = "Encyclopedias & books of facts",
    "04" = "Unassigned",
    "05" = "Magazines, journals & serials",
    "06" = "Associations, organizations & museums",
    "07" = "News media, journalism & publishing",
    "08" = "Quotations",
    "09" = "Manuscripts & rare books",
    "10" = "Philosophy",
    "11" = "Metaphysics",
    "12" = "Epistemology",
    "13" = "Parapsychology & occultism",
    "14" = "Philosophical schools of thought",
    "15" = "Psychology",
    "16" = "Philosophical logic",
    "17" = "Ethics",
    "18" = "Ancient, medieval, eastern philosophy",
    "19" = "Modern Western philosophy",
    "20" = "Religion",
    "21" = "Philosophy & theory of religion",
    "22" = "Bible",
    "23" = "Christianity",
    "24" = "Christian practice & observance",
    "25" = "Christian orders & local church",
    "26" = "Social & ecclesiastical theology",
    "27" = "History of Christianity",
    "28" = "Christian denominations",
    "29" = "Other religions",
    "30" = "Social sciences, sociology & anthropology",
    "31" = "Statistics",
    "32" = "Political science",
    "33" = "Economics",
    "34" = "Law",
    "35" = "Public administration & military science",
    "36" = "Social problems & services",
    "37" = "Education",
    "38" = "Commerce, communications & transportation",
    "39" = "Customs, etiquette, folklore",
    "40" = "Language",
    "41" = "Linguistics",
    "42" = "English & Old English languages",
    "43" = "German and related languages",
    "44" = "French & related languages",
    "45" = "Italian, Romanian & related languages",
    "46" = "Spanish, Portuguese, Galician",
    "47" = "Latin & related Italic languages",
    "48" = "Classical & modern Greek languages",
    "49" = "Other languages",
    "50" = "Science",
    "51" = "Mathematics",
    "52" = "Astronomy",
    "53" = "Physics",
    "54" = "Chemistry",
    "55" = "Earth sciences",
    "56" = "Fossils & prehistoric life",
    "57" = "Biology",
    "58" = "Plants",
    "59" = "Animals",
    "60" = "Technology",
    "61" = "Medicine",
    "62" = "Engineering",
    "63" = "Agriculture",
    "64" = "Home & family management",
    "65" = "Management & Public Relations",
    "66" = "Chemical Engineering",
    "67" = "Manufacturing",
    "68" = "Manufacture for specific uses",
    "69" = "Construction of buildings",
    "70" = "Arts",
    "71" = "Area planning & landscape architecture",
    "72" = "Architecture",
    "73" = "Sculpture, ceramics & metalwork",
    "74" = "Graphic arts & decorative arts",
    "75" = "Painting",
    "76" = "Printmaking & prints",
    "77" = "Photography, computer art, film, video",
    "78" = "Music",
    "79" = "Outline of sports, games & entertainment",
    "80" = "Literature, rhetoric & criticism",
    "81" = "American literature in English",
    "82" = "English & Old English literatures",
    "83" = "German & related literatures",
    "84" = "French & related literatures",
    "85" = "Italian, Romanian & related literatures",
    "86" = "Spanish, Portuguese, Galician literatures",
    "87" = "Latin & Italic literatures",
    "88" = "Classical & modern Greek literatures",
    "89" = "Other literatures",
    "90" = "History",
    "91" = "Geography & travel",
    "92" = "Biography & genealogy",
    "93" = "History of ancient world",
    "94" = "History of Europe",
    "95" = "History of Asia",
    "96" = "History of Africa",
    "97" = "History of North America",
    "98" = "History of South America",
    "99" = "History of other areas"
  )

  # Indexing by name yields NA for unknown prefixes and for NA Book_IDs.
  book_type <- unname(division_labels[substr(data$Book_ID, 1, 2)])
  book_type[is.na(book_type)] <- "Uncategorized"

  data %>%
    mutate(Book_Type = factor(book_type))
})

# General Categories
# Category is chosen from the first character of Book_ID; a Book_ID that is
# NA or does not start with a digit 0-9 falls through to "Uncategorized".
# The new column itself is converted to a factor (previously Book_Type was
# converted a second time and Category was left as character).
data <- data %>%
  mutate(
    Category = case_when(
      startsWith(Book_ID, "0") ~ "Computer science, information & general works",
      startsWith(Book_ID, "1") ~ "Philosophy & psychology",
      startsWith(Book_ID, "2") ~ "Religion",
      startsWith(Book_ID, "3") ~ "Social Sciences",
      startsWith(Book_ID, "4") ~ "Language",
      startsWith(Book_ID, "5") ~ "Science",
      startsWith(Book_ID, "6") ~ "Technology",
      startsWith(Book_ID, "7") ~ "Arts & recreation",
      startsWith(Book_ID, "8") ~ "Literature",
      startsWith(Book_ID, "9") ~ "History & geography",
      TRUE ~ "Uncategorized"
    ),
    Category = factor(Category)
  )

# Clean Book Title
# Keep only the text before the first "/" and title-case both columns.
data$Title <- str_to_title(sub("/.*", "", data$Title))
data$Author <- str_to_title(data$Author)

# Blank out authors that merely repeat the title.
# The comparison is vectorised over every row (length(data) is the number of
# columns, not rows), and which() drops NA comparisons so rows with a missing
# Title or Author are left untouched instead of aborting the script.
data$Author[which(data$Title == data$Author)] <- NA
	library(tidyverse)
	library(gdata)
	library(stringr)
	library(readr)
	library(gsubfn)
	# Read in File
	# This is the fixed file from Excel, but we want to do everything in R
	# data <-
	# read.csv("D:/Srishti/Year 2 Semester 1/Co-Create/books.csv", header = FALSE)


	# Read the raw fixed-width export.
	# The first 10 lines are skipped and each remaining line lands in a single
	# column called Title. The URL is only needed here, so it stays local.
	data <- local({
	  raw_url <-
	    "https://raw.githubusercontent.com/thedivtagguy/srishtilibrary/main/books_raw.txt"
	  read_fwf(raw_url, fwf_empty(raw_url, col_names = c("Title")), skip = 10)
	})

	# Split the single text column into Account_No, Title and Date.
	# Each step works on the Title produced by the step before it:
	#   1. Date       <- first d/d/d pattern, then all such patterns are removed
	#   2. Account_No <- first 4-5 digit run followed by whitespace, then removed
	# Finally the columns are put in the order Account_No, Title, Date.
	data <- data %>%
	  mutate(
	    Date = str_extract(Title, "\\d+/\\d+/\\d+"),
	    Title = gsub("\\d+/\\d+/\\d+", "", Title),
	    Account_No = str_extract(Title, "\\d{4,5}\\s"),
	    Title = trimws(gsub("\\d{4,5}\\s", "", Title))
	  ) %>%
	  dplyr::select(Account_No, Title, Date)

	# Merge continuation rows into the row above them.
	#
	# A row counts as a continuation when the column to the left of `col_numb`
	# is NA while column `col_numb` itself holds text. Its text is appended
	# (space-separated) to the previous row and the continuation cell is set to
	# NA. Rows are processed bottom-up so that several consecutive continuation
	# rows all end up in the same parent row.
	#
	# Arguments:
	#   df       data frame / tibble to repair.
	#   col_numb position (>= 2) of the text column; column `col_numb - 1` is
	#            the key column whose NA marks a continuation row.
	# Returns `df` with column `col_numb` rewritten; no rows are removed.
	delim <- function(df, col_numb) {
	  stopifnot(
	    is.numeric(col_numb), length(col_numb) == 1,
	    col_numb >= 2, col_numb <= ncol(df)
	  )

	  n_rows <- nrow(df)
	  # With fewer than two rows there is nothing to merge into; `n_rows:2`
	  # would count upwards and touch rows that do not exist.
	  if (n_rows < 2) {
	    return(df)
	  }

	  # Work on plain vectors and write the column back once.
	  key_missing <- is.na(df[[col_numb - 1]])
	  text <- df[[col_numb]]

	  for (i in n_rows:2) {
	    if (key_missing[i] && !is.na(text[i])) {
	      # If the row above is empty, move the text up as-is instead of
	      # producing the literal string "NA <text>".
	      text[i - 1] <- if (is.na(text[i - 1])) {
	        text[i]
	      } else {
	        paste(text[i - 1], text[i], sep = " ")
	      }
	      text[i] <- NA
	    }
	  }

	  df[[col_numb]] <- text
	  df
	}

	# Fix Rows
	# 1. Turn empty strings into NA in every column.
	# 2. delim(2) appends the text of rows whose first column is NA to the row
	#    above them (column 2 is the text column).
	# 3. Drop every row that still has an NA in any column.
	# The lambda form is used because passing extra arguments through across()'s
	# `...` is deprecated in dplyr >= 1.1.
	data <- data %>%
	  as_tibble() %>%
	  mutate(across(everything(), ~ na_if(.x, ""))) %>%
	  delim(2) %>%
	  drop_na()


	# Derive Author and Book_ID from the combined Title text and clean both
	# columns. The steps run top to bottom; later steps see earlier results.
	data <- data %>%
	  mutate(
	    # Collapse every run of whitespace into one space.
	    Title = gsub("\\s+", " ", Title),
	    # Author is whatever follows the last "/" (the whole text if none).
	    Author = str_extract(Title, "[^/]+$"),
	    # Book_ID is the first number inside Author. The pattern accepts whole
	    # numbers and any number of decimals so that the full number is
	    # captured and no stray digits are left behind when it is removed.
	    Book_ID = str_extract(Author, "\\d+\\.*\\d*"),
	    Author = gsub("\\d+\\.*\\d*", "", Author),
	    Title = gsub("\\d+\\.*\\d*", "", Title),
	    # Whitespace was already collapsed, so a double space can only be the
	    # gap left by a removed number; drop everything from that gap onwards.
	    # (A single-space pattern would cut Title down to its first word and
	    # wipe any Author that starts with a space.)
	    Title = gsub("  .*", "", Title),
	    # Strip, in this exact order, the trailing remainder, "et al" variants,
	    # editor markers, parentheses, " ." fragments and the "Fic" marker.
	    Author = Reduce(
	      function(text, pattern) gsub(pattern, "", text),
	      c(
	        "  .*", "Et al", "ED", "\\(", "\\)",
	        "Et. al.,", "ET AL", "& Et", "\\s\\.", "Fic"
	      ),
	      init = Author
	    ),
	    # Remove a single trailing full stop or comma. The alternation bar must
	    # not be backslash-escaped: "\|" is not a valid R string escape and
	    # stops the file from parsing.
	    Author = gsub("\\.$|\\,$", "", Author),
	    # Ensure a space follows each comma, then trim the ends.
	    Author = trimws(textclean::add_comma_space(Author))
	  )

	# Classify Books
	# Book_Type is looked up from the first two characters of Book_ID. Any
	# Book_ID that is NA or whose prefix is not in the table becomes
	# "Uncategorized". The result is stored as a factor.
	# NOTE(review): the labels look like Dewey Decimal divisions - confirm.
	data <- local({
	  division_labels <- c(
	    "00" = "Computer science, knowledge & systems",
	    "01" = "Bibliographies",
	    "02" = "Library & Information Sciences",
	    "03" = "Encyclopedias & books of facts",
	    "04" = "Unassigned",
	    "05" = "Magazines, journals & serials",
	    "06" = "Associations, organizations & museums",
	    "07" = "News media, journalism & publishing",
	    "08" = "Quotations",
	    "09" = "Manuscripts & rare books",
	    "10" = "Philosophy",
	    "11" = "Metaphysics",
	    "12" = "Epistemology",
	    "13" = "Parapsychology & occultism",
	    "14" = "Philosophical schools of thought",
	    "15" = "Psychology",
	    "16" = "Philosophical logic",
	    "17" = "Ethics",
	    "18" = "Ancient, medieval, eastern philosophy",
	    "19" = "Modern Western philosophy",
	    "20" = "Religion",
	    "21" = "Philosophy & theory of religion",
	    "22" = "Bible",
	    "23" = "Christianity",
	    "24" = "Christian practice & observance",
	    "25" = "Christian orders & local church",
	    "26" = "Social & ecclesiastical theology",
	    "27" = "History of Christianity",
	    "28" = "Christian denominations",
	    "29" = "Other religions",
	    "30" = "Social sciences, sociology & anthropology",
	    "31" = "Statistics",
	    "32" = "Political science",
	    "33" = "Economics",
	    "34" = "Law",
	    "35" = "Public administration & military science",
	    "36" = "Social problems & services",
	    "37" = "Education",
	    "38" = "Commerce, communications & transportation",
	    "39" = "Customs, etiquette, folklore",
	    "40" = "Language",
	    "41" = "Linguistics",
	    "42" = "English & Old English languages",
	    "43" = "German and related languages",
	    "44" = "French & related languages",
	    "45" = "Italian, Romanian & related languages",
	    "46" = "Spanish, Portuguese, Galician",
	    "47" = "Latin & related Italic languages",
	    "48" = "Classical & modern Greek languages",
	    "49" = "Other languages",
	    "50" = "Science",
	    "51" = "Mathematics",
	    "52" = "Astronomy",
	    "53" = "Physics",
	    "54" = "Chemistry",
	    "55" = "Earth sciences",
	    "56" = "Fossils & prehistoric life",
	    "57" = "Biology",
	    "58" = "Plants",
	    "59" = "Animals",
	    "60" = "Technology",
	    "61" = "Medicine",
	    "62" = "Engineering",
	    "63" = "Agriculture",
	    "64" = "Home & family management",
	    "65" = "Management & Public Relations",
	    "66" = "Chemical Engineering",
	    "67" = "Manufacturing",
	    "68" = "Manufacture for specific uses",
	    "69" = "Construction of buildings",
	    "70" = "Arts",
	    "71" = "Area planning & landscape architecture",
	    "72" = "Architecture",
	    "73" = "Sculpture, ceramics & metalwork",
	    "74" = "Graphic arts & decorative arts",
	    "75" = "Painting",
	    "76" = "Printmaking & prints",
	    "77" = "Photography, computer art, film, video",
	    "78" = "Music",
	    "79" = "Outline of sports, games & entertainment",
	    "80" = "Literature, rhetoric & criticism",
	    "81" = "American literature in English",
	    "82" = "English & Old English literatures",
	    "83" = "German & related literatures",
	    "84" = "French & related literatures",
	    "85" = "Italian, Romanian & related literatures",
	    "86" = "Spanish, Portuguese, Galician literatures",
	    "87" = "Latin & Italic literatures",
	    "88" = "Classical & modern Greek literatures",
	    "89" = "Other literatures",
	    "90" = "History",
	    "91" = "Geography & travel",
	    "92" = "Biography & genealogy",
	    "93" = "History of ancient world",
	    "94" = "History of Europe",
	    "95" = "History of Asia",
	    "96" = "History of Africa",
	    "97" = "History of North America",
	    "98" = "History of South America",
	    "99" = "History of other areas"
	  )

	  # Indexing by name yields NA for unknown prefixes and for NA Book_IDs.
	  book_type <- unname(division_labels[substr(data$Book_ID, 1, 2)])
	  book_type[is.na(book_type)] <- "Uncategorized"

	  data %>%
	    mutate(Book_Type = factor(book_type))
	})

	# General Categories
	# Category is chosen from the first character of Book_ID; a Book_ID that is
	# NA or does not start with a digit 0-9 falls through to "Uncategorized".
	# The new column itself is converted to a factor (previously Book_Type was
	# converted a second time and Category was left as character).
	data <- data %>%
	  mutate(
	    Category = case_when(
	      startsWith(Book_ID, "0") ~ "Computer science, information & general works",
	      startsWith(Book_ID, "1") ~ "Philosophy & psychology",
	      startsWith(Book_ID, "2") ~ "Religion",
	      startsWith(Book_ID, "3") ~ "Social Sciences",
	      startsWith(Book_ID, "4") ~ "Language",
	      startsWith(Book_ID, "5") ~ "Science",
	      startsWith(Book_ID, "6") ~ "Technology",
	      startsWith(Book_ID, "7") ~ "Arts & recreation",
	      startsWith(Book_ID, "8") ~ "Literature",
	      startsWith(Book_ID, "9") ~ "History & geography",
	      TRUE ~ "Uncategorized"
	    ),
	    Category = factor(Category)
	  )

	# Clean Book Title
	# Keep only the text before the first "/" and title-case both columns.
	data$Title <- str_to_title(sub("/.*", "", data$Title))
	data$Author <- str_to_title(data$Author)

	# Blank out authors that merely repeat the title.
	# The comparison is vectorised over every row (length(data) is the number
	# of columns, not rows), and which() drops NA comparisons so rows with a
	# missing Title or Author are left untouched instead of aborting the script.
	data$Author[which(data$Title == data$Author)] <- NA