Skip to content

Instantly share code, notes, and snippets.

@GeorgeOduor
Last active October 16, 2022 18:39
Show Gist options
  • Save GeorgeOduor/1819d88e26a86950349aef84460a9c85 to your computer and use it in GitHub Desktop.
Save GeorgeOduor/1819d88e26a86950349aef84460a9c85 to your computer and use it in GitHub Desktop.
rm(list = ls())
library(pdftools)
library(magick)
library(mbanalytics)
library(R6)
statement = "E:/Reports/improptu/Oct/KaribuStatement.pdf"
# pdf_file <- pdf_convert(statement,pages = 1,dpi = 100)
file = pdf_text(pdf = statement)
statement_cleaned = function(file) {
tryCatch(
expr = {
# prepare dataset ---------------------------------------------------------
print("Preparing Statement")
ftext = file %>%
map_df(
~ enframe(.) %>%
# mutate(value = trimws(value)) %>%
separate_rows(value, sep = "\n") %>% #view("raw_text") %>%
slice(which(grepl(
"Tran", value
)):which(grepl(
"Note:", value
))) %>%
mutate(x = which(grepl(
"Note:", value
))) %>%
select(-x) %>%
filter(
value != "",
trimws(value) != "Id",
!grepl("Note:", value),
trimws(value) != "_"
) %>%
mutate(value = trimws(value, which = "both")) %>%
separate(value, into = paste(1:7), sep = "\\s\\s+") %>%
row_to_names(1) %>% clean_names("upper_camel") %>%
mutate(
TranParticulars = ifelse(is.na(TranParticulars), TranDate, TranParticulars),
TranDate = ifelse(is.na(ValueDate), NA, TranDate)
)
) %>%
rowid_to_column() %>%
slice(-nrow(.))
# clean financials --------------------------------------------------------
print("Cleaning Statement Financials")
details <-
ftext %>% select(rowid, TranDate, ValueDate, TranParticulars)
financials2 = ftext %>%
filter(!is.na(ValueDate)) %>%
mutate_at(c('Debit', 'Credit', 'Balance', 'Instrument'),
parse_number) %>%
mutate(
Balance = ifelse(is.na(Credit), Debit, Credit),
Debit = ifelse(Balance == Debit, 0, Debit),
Debit = ifelse(
Debit == 0 &
(Instrument + Balance) == lag(Balance),
Instrument,
Debit
),
Credit = ifelse(
is.na(Credit) & Debit == 0,
Instrument,
ifelse(Credit == Balance, 0, Credit)
),
Credit = ifelse(is.na(Credit), 0, Credit)
) %>%
select(-TranDate, -ValueDate, -TranParticulars, -X1) %>%
mutate(transaction_id = rowid) %>%
mutate_all(
.funs = function(x)
ifelse(x == 0, NA, x)
)
combi = details %>% left_join(financials2,by = "rowid")
combi_missing <-
nrow(filter(combi, is.na(transaction_id)))
# final clean -------------------------------------------------------------
print("Cleaning Statement Transaction Particulars")
while (combi_missing > 0) {
combi <-
combi %>% mutate(transaction_id = ifelse(
is.na(transaction_id),
lag(transaction_id),
transaction_id
))
combi_missing <-
nrow(filter(combi, is.na(transaction_id)))
}
clean_file <- combi %>%
select(-TranParticulars) %>%
left_join(
combi %>%
select(transaction_id, TranParticulars) %>%
group_by(transaction_id) %>%
mutate(id = row_number()) %>%
reshape2::dcast(transaction_id ~ id, value.var = 'TranParticulars') %>%
mutate_all(
.funs = function(x)
ifelse(is.na(x), "", x)
) %>%
unite("TranParticulars", names(select(
., -transaction_id
))) %>%
mutate(TranParticulars = gsub("__", "", TranParticulars)) ,
by = c("rowid" = "transaction_id")
) %>%
select(names(combi)) %>%
filter(!is.na(Balance)) %>% select(-rowid, -transaction_id) %>%
mutate_at(
c('TranDate', 'ValueDate'),
.funs = function(x)
dmy(x)
)
summaries = clean_file %>%
summarise_at(
c("Debit", "Credit"),
.funs = function(x)
sum(x, na.rm = T)
)%>% mutate_all(.funs = function(x)format(x,big.mark=","))
print("Done...exiting!")
return(list("clean_statement"=clean_file,"summaries"=summaries))
},
error = function(e) {
print(e)
}
)
}
report <- statement_cleaned(file)
report$summaries
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment