Skip to content

Instantly share code, notes, and snippets.

@thedivtagguy
Created February 12, 2021 07:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thedivtagguy/74de5ae9d853828530f3414e6b52df95 to your computer and use it in GitHub Desktop.
Save thedivtagguy/74de5ae9d853828530f3414e6b52df95 to your computer and use it in GitHub Desktop.
Library Parser
library(shiny)
library(tidyverse)
library(gdata)
library(stringr)
library(readr)
library(gsubfn)
library(waiter)
# Define UI for data upload app ----
# Address of the animated image displayed while the app is working.
gif <- paste0(
  "https://media1.tenor.com/images",
  "/cb27704982766b4f02691ea975d9a259/tenor.gif?itemid=11365139"
)

# Content of the loading overlay: a short grey caption above the animation.
# Built inside local() so the intermediate tags do not leak into the
# global environment.
loading_screen <- local({
  caption <- h3("Bear with me a second.", style = "color:gray;")
  animation <- img(src = gif, height = "200px")
  tagList(caption, animation)
})
# Page definition: a waiter loading overlay, a title, a sidebar holding the
# file/skip inputs, and a main panel holding the parsed table.
ui <- fluidPage(
  # Loading overlay (content comes from `loading_screen`) ----
  use_waiter(),
  waiter_on_busy(html = loading_screen, color = "white"),

  # App title ----
  titlePanel("Srishti Library Database"),

  # Sidebar layout with input and output definitions ----
  # sidebarPanel()/mainPanel() are wrapped in sidebarLayout() so both
  # columns sit inside one row, as the Shiny layout helpers expect.
  sidebarLayout(
    # Sidebar panel for inputs ----
    sidebarPanel(
      # Input: Select a file ----
      fileInput(
        "file1", "Choose Library File",
        multiple = TRUE,
        accept = c(
          "text/csv",
          "text/comma-separated-values,text/plain",
          ".csv"
        )
      ),

      # Input: use the bundled default file instead of an upload ----
      checkboxInput("checkbox", label = "Or Use default file", value = FALSE),

      # Horizontal line ----
      tags$hr(),

      # Input: number of leading lines to skip in the uploaded file ----
      numericInput(
        "skiplines", "Number of Lines to Skip:", 10,
        min = 1, max = 100
      ),

      # Horizontal line ----
      # No trailing comma after the last child: an empty trailing argument
      # is not accepted by every htmltools version.
      tags$hr()
    ),

    # Main panel for displaying outputs ----
    mainPanel(
      # Output: Data file ----
      dataTableOutput("myTable")
    )
  )
)
# Define server logic to read selected file ----
# Server logic: read the raw library listing (uploaded file or the default
# file on GitHub), split each line into account number, title, author, date
# and book id, classify every book from the leading digits of its id, and
# render the result as a data table.
#
# Args:
#   input:  Shiny input list; reads `checkbox`, `file1` and `skiplines`.
#   output: Shiny output list; assigns `myTable`.
server <- function(input, output) {
  default_url <- "https://raw.githubusercontent.com/thedivtagguy/srishtilibrary/main/books_raw.txt"

  # Fold continuation rows into the row above them.
  #
  # A row counts as a continuation when column `col_numb - 1` is NA while
  # column `col_numb` is not. Its text is appended (space separated) to the
  # previous row and then blanked. Rows are walked bottom-up so a run of
  # several continuation rows collapses into the first one.
  #
  # Args:
  #   df:       Data frame / tibble to repair.
  #   col_numb: Position of the text column; the key column sits just
  #             before it.
  # Returns: `df` with the text column repaired; the row count is unchanged.
  delim <- function(df, col_numb) {
    n_rows <- nrow(df)
    # `n_rows:2` would count upward for fewer than two rows, so bail out.
    if (n_rows < 2) {
      return(df)
    }
    key <- df[[col_numb - 1]]
    text <- df[[col_numb]]
    for (i in n_rows:2) {
      if (is.na(key[i]) && !is.na(text[i])) {
        text[i - 1] <- paste(text[i - 1], text[i], sep = " ")
        text[i] <- NA
      }
    }
    df[[col_numb]] <- text
    df
  }

  output$myTable <- renderDataTable({
    # input$file1 is NULL until the user uploads a file; req() holds the
    # output back until then, unless the default file was requested.
    if (isTRUE(input$checkbox)) {
      data <- readr::read_fwf(
        default_url,
        fwf_empty(default_url, col_names = c("Title")),
        skip = 10
      )
    } else {
      req(input$file1)
      # A cleared numeric input yields NA; wait for a usable value.
      req(input$skiplines)
      data <- readr::read_fwf(
        input$file1$datapath,
        fwf_empty(input$file1$datapath),
        skip = input$skiplines
      )
    }

    # Name the first column "Title" by position: on the default-file branch
    # the column is already called "Title" and no "X1" exists to rename.
    data <- data %>% rename(Title = 1)

    # Time to separate those columns.
    # Move the date component into its own column ...
    data <- data %>% mutate(Date = str_extract(Title, "\\d+/\\d+/\\d+"))
    # ... and delete it from the main column.
    data$Title <- gsub("\\d+/\\d+/\\d+", "", data$Title)

    # Extract the account number (4-5 digits followed by whitespace).
    data <- data %>% mutate(Account_No = str_extract(Title, "\\d{4,5}\\s"))
    data$Title <- trimws(gsub("\\d{4,5}\\s", "", data$Title))

    # Keep Account_No, Title, Date in that order. Selecting by name stays
    # correct even if the reader produced additional columns.
    data <- data %>% select(Account_No, Title, Date)

    # Fix rows: blank strings become NA, wrapped titles are merged into the
    # row above, and rows that still have a missing field are dropped.
    data <- data %>%
      as_tibble() %>%
      mutate(across(everything(), ~ na_if(.x, ""))) %>%
      delim(2) %>%
      drop_na()

    # Separate authors from the title column: everything after the last "/".
    data <- data %>%
      dplyr::mutate(Author = str_extract(Title, "[^/]+$"))

    # Extract the book id (first number found in the author part).
    data <- data %>% mutate(Book_ID = str_extract(Author, "\\d+\\.*\\d*"))

    # Delete the book id from the author and title columns.
    data$Author <- gsub("\\d+\\.*\\d*", "", data$Author)
    data$Title <- gsub("\\d+\\.*\\d*", "", data$Title)

    # Delete the author abbreviations and weird "Et al" leftovers.
    # NOTE(review): as written, " .*" is a single space followed by ".*",
    # which removes everything from the first space onward. The pattern may
    # originally have contained a longer run of spaces - confirm.
    data$Author <- gsub(" .*", "", data$Author)
    data$Title <- gsub(" .*", "", data$Title)
    data$Author <- gsub("Et al", "", data$Author)
    data$Author <- gsub("ED", "", data$Author)
    data$Author <- gsub("\\(", "", data$Author)
    data$Author <- gsub("\\)", "", data$Author)
    data$Author <- gsub("Et. al.,", "", data$Author)
    data$Author <- gsub("ET AL", "", data$Author)
    data$Author <- gsub("& Et", "", data$Author)
    data$Author <- gsub("\\s\\.", "", data$Author)
    data$Author <- gsub("Fic", "", data$Author)
    data$Author <- gsub("\\s([A-Z]+[A-Za-z]{2})\\s", "", data$Author)

    # Blank the author wherever it merely repeats the title. Vectorised over
    # every row (length(data) is the column count, not the row count) and
    # NA-safe (comparing against a missing author yields NA, not TRUE).
    repeats_title <- !is.na(data$Author) & !is.na(data$Title) &
      data$Title == data$Author
    data$Author[repeats_title] <- NA

    # Remove trailing punctuation in the author column.
    data$Author <- gsub("\\.$|\\,$", "", data$Author, ignore.case = TRUE)

    # Add a space after commas in author names.
    data$Author <- textclean::add_comma_space(data$Author) %>% trimws()

    # Classify books by the first two digits of the book id.
    data <- data %>%
      mutate(
        Book_Type = case_when(
          startsWith(Book_ID, "00") ~ "Computer science, knowledge & systems",
          startsWith(Book_ID, "01") ~ "Bibliographies",
          startsWith(Book_ID, "02") ~ "Library & Information Sciences",
          startsWith(Book_ID, "03") ~ "Encyclopedias & books of facts",
          startsWith(Book_ID, "04") ~ "Unassigned",
          startsWith(Book_ID, "05") ~ "Magazines, journals & serials",
          startsWith(Book_ID, "06") ~ "Associations, organizations & museums",
          startsWith(Book_ID, "07") ~ "News media, journalism & publishing",
          startsWith(Book_ID, "08") ~ "Quotations",
          startsWith(Book_ID, "09") ~ "Manuscripts & rare books",
          startsWith(Book_ID, "10") ~ "Philosophy",
          startsWith(Book_ID, "11") ~ "Metaphysics",
          startsWith(Book_ID, "12") ~ "Epistemology",
          startsWith(Book_ID, "13") ~ "Parapsychology & occultism",
          startsWith(Book_ID, "14") ~ "Philosophical schools of thought",
          startsWith(Book_ID, "15") ~ "Psychology",
          startsWith(Book_ID, "16") ~ "Philosophical logic",
          startsWith(Book_ID, "17") ~ "Ethics",
          startsWith(Book_ID, "18") ~ "Ancient, medieval, eastern philosophy",
          startsWith(Book_ID, "19") ~ "Modern Western philosophy",
          startsWith(Book_ID, "20") ~ "Religion",
          startsWith(Book_ID, "21") ~ "Philosophy & theory of religion",
          startsWith(Book_ID, "22") ~ "Bible",
          startsWith(Book_ID, "23") ~ "Christianity",
          startsWith(Book_ID, "24") ~ "Christian practice & observance",
          startsWith(Book_ID, "25") ~ "Christian orders & local church",
          startsWith(Book_ID, "26") ~ "Social & ecclesiastical theology",
          startsWith(Book_ID, "27") ~ "History of Christianity",
          startsWith(Book_ID, "28") ~ "Christian denominations",
          startsWith(Book_ID, "29") ~ "Other religions",
          startsWith(Book_ID, "30") ~ "Social sciences, sociology & anthropology",
          startsWith(Book_ID, "31") ~ "Statistics",
          startsWith(Book_ID, "32") ~ "Political science",
          startsWith(Book_ID, "33") ~ "Economics",
          startsWith(Book_ID, "34") ~ "Law",
          startsWith(Book_ID, "35") ~ "Public administration & military science",
          startsWith(Book_ID, "36") ~ "Social problems & services",
          startsWith(Book_ID, "37") ~ "Education",
          startsWith(Book_ID, "38") ~ "Commerce, communications & transportation",
          startsWith(Book_ID, "39") ~ "Customs, etiquette, folklore",
          startsWith(Book_ID, "40") ~ "Language",
          startsWith(Book_ID, "41") ~ "Linguistics",
          startsWith(Book_ID, "42") ~ "English & Old English languages",
          startsWith(Book_ID, "43") ~ "German and related languages",
          startsWith(Book_ID, "44") ~ "French & related languages",
          startsWith(Book_ID, "45") ~ "Italian, Romanian & related languages",
          startsWith(Book_ID, "46") ~ "Spanish, Portuguese, Galician",
          startsWith(Book_ID, "47") ~ "Latin & related Italic languages",
          startsWith(Book_ID, "48") ~ "Classical & modern Greek languages",
          startsWith(Book_ID, "49") ~ "Other languages",
          startsWith(Book_ID, "50") ~ "Science",
          startsWith(Book_ID, "51") ~ "Mathematics",
          startsWith(Book_ID, "52") ~ "Astronomy",
          startsWith(Book_ID, "53") ~ "Physics",
          startsWith(Book_ID, "54") ~ "Chemistry",
          startsWith(Book_ID, "55") ~ "Earth sciences",
          startsWith(Book_ID, "56") ~ "Fossils & prehistoric life",
          startsWith(Book_ID, "57") ~ "Biology",
          startsWith(Book_ID, "58") ~ "Plants",
          startsWith(Book_ID, "59") ~ "Animals",
          startsWith(Book_ID, "60") ~ "Technology",
          startsWith(Book_ID, "61") ~ "Medicine",
          startsWith(Book_ID, "62") ~ "Engineering",
          startsWith(Book_ID, "63") ~ "Agriculture",
          startsWith(Book_ID, "64") ~ "Home & family management",
          startsWith(Book_ID, "65") ~ "Management & Public Relations",
          startsWith(Book_ID, "66") ~ "Chemical Engineering",
          startsWith(Book_ID, "67") ~ "Manufacturing",
          startsWith(Book_ID, "68") ~ "Manufacture for specific uses",
          startsWith(Book_ID, "69") ~ "Construction of buildings",
          startsWith(Book_ID, "70") ~ "Arts",
          startsWith(Book_ID, "71") ~ "Area planning & landscape architecture",
          startsWith(Book_ID, "72") ~ "Architecture",
          startsWith(Book_ID, "73") ~ "Sculpture, ceramics & metalwork",
          startsWith(Book_ID, "74") ~ "Graphic arts & decorative arts",
          startsWith(Book_ID, "75") ~ "Painting",
          startsWith(Book_ID, "76") ~ "Printmaking & prints",
          startsWith(Book_ID, "77") ~ "Photography, computer art, film, video",
          startsWith(Book_ID, "78") ~ "Music",
          startsWith(Book_ID, "79") ~ "Outline of sports, games & entertainment",
          startsWith(Book_ID, "80") ~ "Literature, rhetoric & criticism",
          startsWith(Book_ID, "81") ~ "American literature in English",
          startsWith(Book_ID, "82") ~ "English & Old English literatures",
          startsWith(Book_ID, "83") ~ "German & related literatures",
          startsWith(Book_ID, "84") ~ "French & related literatures",
          startsWith(Book_ID, "85") ~ "Italian, Romanian & related literatures",
          startsWith(Book_ID, "86") ~ "Spanish, Portuguese, Galician literatures",
          startsWith(Book_ID, "87") ~ "Latin & Italic literatures",
          startsWith(Book_ID, "88") ~ "Classical & modern Greek literatures",
          startsWith(Book_ID, "89") ~ "Other literatures",
          startsWith(Book_ID, "90") ~ "History",
          startsWith(Book_ID, "91") ~ "Geography & travel",
          startsWith(Book_ID, "92") ~ "Biography & genealogy",
          startsWith(Book_ID, "93") ~ "History of ancient world",
          startsWith(Book_ID, "94") ~ "History of Europe",
          startsWith(Book_ID, "95") ~ "History of Asia",
          startsWith(Book_ID, "96") ~ "History of Africa",
          startsWith(Book_ID, "97") ~ "History of North America",
          startsWith(Book_ID, "98") ~ "History of South America",
          startsWith(Book_ID, "99") ~ "History of other areas",
          # Missing or unmatched ids fall through to here.
          TRUE ~ "Uncategorized"
        )
      ) %>%
      mutate(Book_Type = factor(Book_Type))

    # General categories, keyed on the first digit of the book id.
    data <- data %>%
      mutate(
        Category = case_when(
          startsWith(Book_ID, "0") ~ "Computer science, information & general works",
          startsWith(Book_ID, "1") ~ "Philosophy & psychology",
          startsWith(Book_ID, "2") ~ "Religion",
          startsWith(Book_ID, "3") ~ "Social Sciences",
          startsWith(Book_ID, "4") ~ "Language",
          startsWith(Book_ID, "5") ~ "Science",
          startsWith(Book_ID, "6") ~ "Technology",
          startsWith(Book_ID, "7") ~ "Arts & recreation",
          startsWith(Book_ID, "8") ~ "Literature",
          startsWith(Book_ID, "9") ~ "History & geography",
          TRUE ~ "Uncategorized"
        )
      ) %>%
      # Convert the column that was just created (not Book_Type again).
      mutate(Category = factor(Category))

    # Clean book title: drop everything from the first "/" onward, then
    # title-case both text columns.
    data$Title <- sub("/.*", "", data$Title)
    data$Title <- str_to_title(data$Title)
    data$Author <- str_to_title(data$Author)

    data
  },
  # NOTE(review): `extensions` / `buttons` are DataTables "Buttons" settings;
  # whether the renderDataTable in scope here honours them is not visible
  # from this file - confirm (DT::renderDataTable may have been intended).
  options = list(
    extensions = "Buttons",
    buttons = list(
      "copy",
      list(
        extend = "collection",
        buttons = c("csv", "excel", "pdf"),
        text = "Download"
      )
    ),
    pageLength = 10,
    info = FALSE,
    searchHighlight = TRUE
  ))
}
# Launch the application from the UI and server objects defined above ----
shinyApp(ui = ui, server = server)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment