erikcs/combine.R

## combine.R
library(readr)
library(data.table)

# convert file encoding before reading into R AND remove all the commas
# $ for file in *.txt; do iconv -f UTF-16LE -t UTF-8 "$file" | sed 's/,//g' > "$file.utf8.txt"; done

process = function(df) {
  # remove duplicate columns
  df[, grep('.*_[0-9]$', names(df)) := NULL]
  # remove all ...yr columns
  df[, grep('yr$', names(df)) := NULL]
  # remove the first column
  df[, names(df)[1] := NULL]
  # remove punctuation from col names
  setnames(df, gsub('\\.', "", names(df)))
  # and delimit the time variable names with a dot to make reshape happy
  setnames(df, gsub('([0-9]{4})$', ".\\1", names(df)))
  # remove rows with missing company name or isin
  df = na.omit(df, cols = c("Company name", 'ISIN number'))

  reshape(df,
          direction = 'long',
          sep = '.',
          timevar = 'year',
          idvar = '_rownum', # not used
          varying = grep('.*[0-9]{4}$', names(df), value = TRUE))
}

# get file list in relevant working directory
files = list.files(pattern = "*.utf8.txt")

merged = rbindlist(lapply(
  files,
   function(f)
    process(as.data.table(read_tsv(
      file = f,
      # treat these tokens as missing values
      na = c('n.a.', 'Credit needed', ''))))))

# remove last column
merged[, `_rownum` := NULL]
# remove rows with year < last year
merged = merged[ !`Last year` < year ]

fwrite(merged, 'MERGED.txt', sep = '\t')
	library(readr)
	library(data.table)

	# convert file encoding before reading into R AND remove all the commas
	# $ for file in *.txt; do iconv -f UTF-16LE -t UTF-8 "$file" \| sed 's/,//g' > "$file.utf8.txt"; done

	process = function(df) {
	# remove duplicate columns
	df[, grep('.*_[0-9]$', names(df)) := NULL]
	# remove all ...yr columns
	df[, grep('yr$', names(df)) := NULL]
	# remove the first column
	df[, names(df)[1] := NULL]
	# remove punctuation from col names
	setnames(df, gsub('\\.', "", names(df)))
	# and delimit the time variable names with a dot to make reshape happy
	setnames(df, gsub('([0-9]{4})$', ".\\1", names(df)))
	# remove rows with missing company name or isin
	df = na.omit(df, cols = c("Company name", 'ISIN number'))

	reshape(df,
	direction = 'long',
	sep = '.',
	timevar = 'year',
	idvar = '_rownum', # not used
	varying = grep('.*[0-9]{4}$', names(df), value = TRUE))
	}

	# get file list in relevant working directory
	files = list.files(pattern = "*.utf8.txt")

	merged = rbindlist(lapply(
	files,
	function(f)
	process(as.data.table(read_tsv(
	file = f,
	# treat these tokens as missing values
	na = c('n.a.', 'Credit needed', ''))))))

	# remove last column
	merged[, `_rownum` := NULL]
	# remove rows with year < last year
	merged = merged[ !`Last year` < year ]

	fwrite(merged, 'MERGED.txt', sep = '\t')