Skip to content

Instantly share code, notes, and snippets.

@erikcs
Last active November 27, 2016 23:15
Show Gist options
  • Save erikcs/58186bb2b42bacb02b572b9059160bbb to your computer and use it in GitHub Desktop.
Save erikcs/58186bb2b42bacb02b572b9059160bbb to your computer and use it in GitHub Desktop.
combine stuff with data.table
library(readr)
library(data.table)
# convert file encoding before reading into R AND remove all the commas
# $ for file in *.txt; do iconv -f UTF-16LE -t UTF-8 "$file" | sed 's/,//g' > "$file.utf8.txt"; done
process = function(df) {
# remove duplicate columns
df[, grep('.*_[0-9]$', names(df)) := NULL]
# remove all ...yr columns
df[, grep('yr$', names(df)) := NULL]
# remove the first column
df[, names(df)[1] := NULL]
# remove punctuation from col names
setnames(df, gsub('\\.', "", names(df)))
# and delimit the time variable names with a dot to make reshape happy
setnames(df, gsub('([0-9]{4})$', ".\\1", names(df)))
# remove rows with missing company name or isin
df = na.omit(df, cols = c("Company name", 'ISIN number'))
reshape(df,
direction = 'long',
sep = '.',
timevar = 'year',
idvar = '_rownum', # not used
varying = grep('.*[0-9]{4}$', names(df), value = TRUE))
}
# get file list in relevant working directory
files = list.files(pattern = "*.utf8.txt")
merged = rbindlist(lapply(
files,
function(f)
process(as.data.table(read_tsv(
file = f,
# treat these tokens as missing values
na = c('n.a.', 'Credit needed', ''))))))
# remove last column
merged[, `_rownum` := NULL]
# remove rows with year < last year
merged = merged[ !`Last year` < year ]
fwrite(merged, 'MERGED.txt', sep = '\t')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment