Created
February 12, 2021 09:40
-
-
Save thedivtagguy/441a12beef3e4d50756c09c3082d87d0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(gdata) | |
library(stringr) | |
library(readr) | |
library(gsubfn) | |
# Load the raw catalogue ----
# An Excel-fixed CSV export also exists, but the goal is to do all of the
# cleaning in R, so that route stays disabled:
# data <-
#   read.csv("D:/Srishti/Year 2 Semester 1/Co-Create/books.csv", header = FALSE)
#
# The plain-text export is read as a fixed-width file. The first 10 lines are
# skipped and the column layout is guessed by fwf_empty() with the single
# column name "Title".
data <- local({
  books_url <-
    "https://raw.githubusercontent.com/thedivtagguy/srishtilibrary/main/books_raw.txt"
  column_layout <- fwf_empty(books_url, col_names = c("Title"))
  read_fwf(books_url, column_layout, skip = 10)
})
# Peel the date and the account number off the raw Title text ----
# mutate() evaluates its arguments in order, so each step sees the Title
# left behind by the previous one.
data <- data %>%
  mutate(
    # First slash-separated run of digits (d/m/y-like) becomes Date ...
    Date = str_extract(Title, "\\d+/\\d+/\\d+"),
    # ... and every such run is removed from the text.
    Title = gsub("\\d+/\\d+/\\d+", "", Title),
    # First 4-5 digit run followed by whitespace becomes Account_No
    # (the matched whitespace character is kept in the value).
    Account_No = str_extract(Title, "\\d{4,5}\\s"),
    Title = trimws(gsub("\\d{4,5}\\s", "", Title))
  ) %>%
  # Columns are now Title, Date, Account_No; reorder by position to
  # Account_No, Title, Date.
  select(3, 1, 2)
# Merge continuation rows into the row above them.
#
# A row counts as a continuation when the column immediately to the left of
# `col_numb` is NA while `col_numb` itself holds a value. Its text is appended
# (space-separated) to the same column of the previous row and the
# continuation cell is set to NA. Rows are walked bottom-up so that several
# consecutive continuation rows all collapse into the first row above them.
#
# Args:
#   df:       data frame or tibble to repair.
#   col_numb: position of the text column; the column at `col_numb - 1` is
#             the one tested for NA.
#
# Returns:
#   `df` with the text column rewritten; the number of rows is unchanged.
delim <- function(df, col_numb) {
  n_rows <- nrow(df)
  # With fewer than two rows there is nothing to merge, and `n_rows:2` would
  # count upward past the end of the data.
  if (n_rows < 2) {
    return(df)
  }

  key <- df[[col_numb - 1]]
  text <- df[[col_numb]]

  for (i in n_rows:2) {
    if (is.na(key[i]) && !is.na(text[i])) {
      # Do not glue the literal string "NA" in front of the continuation
      # when the row above has no text of its own.
      text[i - 1] <- if (is.na(text[i - 1])) {
        text[i]
      } else {
        paste(text[i - 1], text[i], sep = " ")
      }
      text[i] <- NA
    }
  }

  df[[col_numb]] <- text
  df
}
# Fix Rows ----
# 1. Turn empty strings into NA in every column so delim() and drop_na()
#    can recognise them.
# 2. Fold continuation rows into the row above (text column is column 2).
# 3. Drop every row that still contains an NA.
# 4. Collapse runs of whitespace in Title to a single space.
data <- data %>%
  as_tibble() %>%
  # Use an explicit lambda: passing extra arguments through across(...)
  # is deprecated in dplyr >= 1.1.0.
  mutate(across(everything(), ~ na_if(.x, ""))) %>%
  delim(2) %>%
  drop_na() %>%
  mutate(Title = gsub("\\s+", " ", Title))
# Split Author and Book_ID out of Title ----
# Author is whatever follows the last "/" in Title; Book_ID is the first
# number (optionally with dots and further digits) found inside that text.
data <- data %>%
  mutate(
    Author = str_extract(Title, "[^/]+$"),
    Book_ID = str_extract(Author, "\\d+\\.*\\d*")
  )

# Scrub the Author text. The patterns are removed one after another, in this
# exact order: numbers, everything from the first " " onward, then the
# assorted "et al." / editor / fiction markers, brackets and stray dots.
# NOTE(review): "ED" and "Fic" are matched anywhere, including inside names;
# confirm that is intended.
# NOTE(review): the hosting page warns about hidden Unicode characters;
# confirm the separator in " .*" against the raw file.
data$Author <- Reduce(
  function(text, pattern) gsub(pattern, "", text),
  c(
    "\\d+\\.*\\d*",
    " .*",
    "Et al",
    "ED",
    "\\(",
    "\\)",
    "Et. al.,",
    "ET AL",
    "& Et",
    "\\s\\.",
    "Fic"
  ),
  data$Author
)

# Title gets the same number removal and the same cut at " ".
data$Title <- Reduce(
  function(text, pattern) gsub(pattern, "", text),
  c("\\d+\\.*\\d*", " .*"),
  data$Title
)

# Strip a trailing full stop or comma from Author.
data$Author <-
  gsub("\\.$|\\,$", "", data$Author, ignore.case = TRUE)

# Put a space after commas in Author, then trim the ends.
data$Author <- trimws(textclean::add_comma_space(data$Author))
# Classify Books ----
# Book_Type is looked up from the first two characters of Book_ID. Any
# Book_ID whose two-character prefix is not in the table (including NA and
# one-digit IDs) is labelled "Uncategorized". The result is stored as a
# factor.
dewey_divisions <- c(
  "00" = "Computer science, knowledge & systems",
  "01" = "Bibliographies",
  "02" = "Library & Information Sciences",
  "03" = "Encyclopedias & books of facts",
  "04" = "Unassigned",
  "05" = "Magazines, journals & serials",
  "06" = "Associations, organizations & museums",
  "07" = "News media, journalism & publishing",
  "08" = "Quotations",
  "09" = "Manuscripts & rare books",
  "10" = "Philosophy",
  "11" = "Metaphysics",
  "12" = "Epistemology",
  "13" = "Parapsychology & occultism",
  "14" = "Philosophical schools of thought",
  "15" = "Psychology",
  "16" = "Philosophical logic",
  "17" = "Ethics",
  "18" = "Ancient, medieval, eastern philosophy",
  "19" = "Modern Western philosophy",
  "20" = "Religion",
  "21" = "Philosophy & theory of religion",
  "22" = "Bible",
  "23" = "Christianity",
  "24" = "Christian practice & observance",
  "25" = "Christian orders & local church",
  "26" = "Social & ecclesiastical theology",
  "27" = "History of Christianity",
  "28" = "Christian denominations",
  "29" = "Other religions",
  "30" = "Social sciences, sociology & anthropology",
  "31" = "Statistics",
  "32" = "Political science",
  "33" = "Economics",
  "34" = "Law",
  "35" = "Public administration & military science",
  "36" = "Social problems & services",
  "37" = "Education",
  "38" = "Commerce, communications & transportation",
  "39" = "Customs, etiquette, folklore",
  "40" = "Language",
  "41" = "Linguistics",
  "42" = "English & Old English languages",
  "43" = "German and related languages",
  "44" = "French & related languages",
  "45" = "Italian, Romanian & related languages",
  "46" = "Spanish, Portuguese, Galician",
  "47" = "Latin & related Italic languages",
  "48" = "Classical & modern Greek languages",
  "49" = "Other languages",
  "50" = "Science",
  "51" = "Mathematics",
  "52" = "Astronomy",
  "53" = "Physics",
  "54" = "Chemistry",
  "55" = "Earth sciences",
  "56" = "Fossils & prehistoric life",
  "57" = "Biology",
  "58" = "Plants",
  "59" = "Animals",
  "60" = "Technology",
  "61" = "Medicine",
  "62" = "Engineering",
  "63" = "Agriculture",
  "64" = "Home & family management",
  "65" = "Management & Public Relations",
  "66" = "Chemical Engineering",
  "67" = "Manufacturing",
  "68" = "Manufacture for specific uses",
  "69" = "Construction of buildings",
  "70" = "Arts",
  "71" = "Area planning & landscape architecture",
  "72" = "Architecture",
  "73" = "Sculpture, ceramics & metalwork",
  "74" = "Graphic arts & decorative arts",
  "75" = "Painting",
  "76" = "Printmaking & prints",
  "77" = "Photography, computer art, film, video",
  "78" = "Music",
  "79" = "Outline of sports, games & entertainment",
  "80" = "Literature, rhetoric & criticism",
  "81" = "American literature in English",
  "82" = "English & Old English literatures",
  "83" = "German & related literatures",
  "84" = "French & related literatures",
  "85" = "Italian, Romanian & related literatures",
  "86" = "Spanish, Portuguese, Galician literatures",
  "87" = "Latin & Italic literatures",
  "88" = "Classical & modern Greek literatures",
  "89" = "Other literatures",
  "90" = "History",
  "91" = "Geography & travel",
  "92" = "Biography & genealogy",
  "93" = "History of ancient world",
  "94" = "History of Europe",
  "95" = "History of Asia",
  "96" = "History of Africa",
  "97" = "History of North America",
  "98" = "History of South America",
  "99" = "History of other areas"
)

data <- data %>%
  mutate(
    # Exact-name lookup: prefixes missing from the table come back as NA.
    Book_Type = unname(dewey_divisions[substr(Book_ID, 1, 2)]),
    Book_Type = factor(coalesce(Book_Type, "Uncategorized"))
  )
# General Categories ----
# Category is chosen from the first character of Book_ID; anything that
# matches none of the ten digits (including NA) is "Uncategorized".
# The new column is then converted to a factor.
data <- data %>%
  mutate(
    Category = case_when(
      startsWith(Book_ID, "0") ~ "Computer science, information & general works",
      startsWith(Book_ID, "1") ~ "Philosophy & psychology",
      startsWith(Book_ID, "2") ~ "Religion",
      startsWith(Book_ID, "3") ~ "Social Sciences",
      startsWith(Book_ID, "4") ~ "Language",
      startsWith(Book_ID, "5") ~ "Science",
      startsWith(Book_ID, "6") ~ "Technology",
      startsWith(Book_ID, "7") ~ "Arts & recreation",
      startsWith(Book_ID, "8") ~ "Literature",
      startsWith(Book_ID, "9") ~ "History & geography",
      # TRUE rather than T: T is an ordinary variable and can be reassigned.
      TRUE ~ "Uncategorized"
    ),
    # Convert the column created here; the earlier code re-factored
    # Book_Type instead and left Category as plain character.
    Category = factor(Category)
  )
# Clean Book Title ----
# Keep only the part of Title before the first "/", then title-case both
# Title and Author.
data$Title <- str_to_title(sub("/.*", "", data$Title))
data$Author <- str_to_title(data$Author)

# Blank out Author wherever it merely repeats Title.
# The comparison is vectorised over every row: length(data) on a data frame
# is the number of columns, so a 1:length(data) loop only visits the first
# few rows. which() drops NA comparisons, so rows with a missing Author are
# skipped instead of raising an error.
data$Author[which(data$Title == data$Author)] <- NA_character_
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment