Skip to content

Instantly share code, notes, and snippets.

@iki
Last active August 29, 2015 14:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save iki/fa53c7a22bb9dab68910 to your computer and use it in GitHub Desktop.
Save iki/fa53c7a22bb9dab68910 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
from __future__ import print_function
import re
import os
import sys
import glob
LF = '\n'

# Config
# Column names of one record, in the order they appear on a line.
fieldNames = 'timestamp id title empty url text source'.split()
# Columns that must be present and non-empty for a record to be kept.
requiredFieldNames = 'timestamp id'.split()
# Column into which surplus separator-split pieces are merged back.
replaceSepFieldName = 'text'
sep = '\t'
# A physical line starts a new record when it begins with digits followed by
# the separator.  The pattern is a raw string (so `\d` is not an invalid string
# escape) and the separator is escaped so that a separator which happens to be
# a regex metacharacter is still matched literally.
valid = re.compile(r'^\d+' + re.escape(sep))
quote = '"'
# Replacement for a line break when continuation lines are joined to a record.
replaceLF = ' '
# Glue used when surplus fields are merged into the replaceSepFieldName column.
replaceSep = ' '
# When true, records with too few fields are padded with empty fields.
appendEmptyFields = False

# Init
totalFields = len(fieldNames)
# Maps the column index of every required field to its name.
requiredFields = {fieldNames.index(name): name for name in requiredFieldNames}
replaceSepField = fieldNames.index(replaceSepFieldName)
# Methods
def log(msg, flog=None):
    """Write ``msg`` plus a newline to ``flog``, or to stderr if ``flog`` is falsy."""
    stream = flog if flog else sys.stderr
    stream.write('{0}\n'.format(msg))
def error(msg, ferr=None):
    """Write ``msg`` prefixed with ``Error: `` to ``ferr``, or to stderr if ``ferr`` is falsy."""
    stream = ferr if ferr else sys.stderr
    text = 'Error: {0}\n'.format(msg)
    stream.write(text)
def check(line, lineNo = 0, ferr = None):
    """Validate one record and repair it where possible.

    The record is split on ``sep`` and then:

    * rejected (``None`` is returned and an error is reported) when any
      required field is missing or empty;
    * if it has more than ``totalFields`` fields, the surplus pieces are
      merged into the ``replaceSepField`` column, joined with ``replaceSep``;
    * if it has fewer fields and ``appendEmptyFields`` is set, it is padded
      with empty fields;
    * every field that opens with ``quote`` but does not close with it
      (including a field that is just a single quote) gets a closing quote.

    Args:
        line: the record text, without a trailing line break.
        lineNo: line number used in diagnostics only.
        ferr: stream for diagnostics; ``log``/``error`` fall back to stderr.

    Returns:
        The original ``line`` if nothing had to be fixed, the repaired record
        if something was fixed, or ``None`` if the record was rejected.
    """
    fields = line.split(sep)
    width = len(fields)
    fixed = False

    # A record without its required fields cannot be repaired: drop it.
    for index in requiredFields:
        if index >= width or not fields[index]:
            error('Missing line {lineNo} field {field}: \'{line}\''.format(
                lineNo = lineNo, field = requiredFields[index], line = line), ferr)
            return None

    if width > totalFields:
        log('Join {current} - {total} = {joined} {name} fields on line {lineNo}: \'{line}\''.format(
            lineNo = lineNo, total = totalFields, current = width + 1, joined = width + 1 - totalFields, line = line,
            name = fieldNames[replaceSepField]), ferr)
        # The surplus (width - totalFields) pieces plus the field itself are
        # collapsed into one, hence the "+ 1" in the slice end.
        replaceToField = replaceSepField + width + 1 - totalFields
        fields[replaceSepField:replaceToField] = [replaceSep.join(fields[replaceSepField:replaceToField])]
        width = len(fields)
        fixed = True
    elif appendEmptyFields and width < totalFields:
        log('Append {total} - {current} = {added} empty fields on line {lineNo}: \'{line}\''.format(
            lineNo = lineNo, total = totalFields, current = width, added = totalFields - width, line = line), ferr)
        fields.extend([''] * (totalFields - width))
        width = len(fields)
        fixed = True

    for index, field in enumerate(fields):
        # Compare by value: identity (`is`) on strings only works when the
        # interpreter happens to share the object, which is not guaranteed.
        # A lone quote both starts and ends with the quote character, so it
        # needs the explicit equality test to be recognised as unterminated.
        if field == quote or (field.startswith(quote) and not field.endswith(quote)):
            fields[index] += quote
            fixed = True
            log('End-quote field {name} on line {lineNo}: \'{field}\''.format(
                lineNo = lineNo, field = field, line = line, name = fieldNames[index]), ferr)

    return sep.join(fields) if fixed else line
def write(fout, line, lineNo=0, ferr=None):
    """Check ``line`` and, if it survives, write it to ``fout``.

    Returns 1 when a record was written and 0 when ``line`` was empty or
    was rejected by ``check``, so callers can sum the results to count
    the records written.
    """
    if not line:
        return 0
    checked = check(line, lineNo, ferr)
    if not checked:
        return 0
    fout.write(checked + LF)
    return 1
def files(masks, process):
    """Expand each glob mask and call ``process`` on every matching path.

    A mask that matches nothing is reported through ``error`` and skipped.
    """
    for mask in masks:
        matches = glob.glob(mask)
        if matches:
            for path in matches:
                process(path)
        else:
            error('No files matching \'{0}\' found.'.format(mask))
def process(infile):
    """Normalize one input file into ``<name>-normalized<ext>``.

    Physical lines are stripped and blank ones skipped.  A line matching
    ``valid`` starts a new record; any other line is treated as a
    continuation and appended to the current record with ``replaceLF`` in
    place of the line break.  Each completed record is passed to ``write``,
    whose diagnostics go to ``<name>-errors<ext>``.  A non-matching line seen
    before any record has started is reported through ``error`` (to stderr)
    and dropped.  A summary of lines read and records written is printed at
    the end.

    Args:
        infile: path of the file to normalize.
    """
    base, ext = os.path.splitext(infile)
    outfile = '{0}-normalized{1}'.format(base, ext)
    errfile = '{0}-errors{1}'.format(base, ext)
    # `open` instead of the Python-2-only `file` builtin, so the script also
    # runs on Python 3.
    with open(infile) as fin, open(outfile, 'w') as fout, open(errfile, 'w') as ferr:
        linesIn = linesOut = 0
        multiline = ''
        # Line number on which the pending record started, so diagnostics
        # point at the record itself rather than at whichever line happened
        # to trigger the flush.
        multilineNo = 0
        for line in fin:
            linesIn += 1
            line = line.strip()
            if not line:
                continue
            if valid.match(line):
                # A new record begins: flush the one collected so far.
                linesOut += write(fout, multiline, multilineNo, ferr)
                multiline = line
                multilineNo = linesIn
            elif multiline:
                multiline += replaceLF + line
            else:
                error('Invalid line {lineNo}: \'{line}\''.format(lineNo = linesIn, line = line))
        # Flush the last pending record, if any.
        linesOut += write(fout, multiline, multilineNo, ferr)
    print('{name}: {linesIn} -> {linesOut}'.format(name = outfile, linesIn = linesIn, linesOut = linesOut))
# Run
if __name__ == '__main__':
    # Every command-line argument is a glob mask of input files to normalize.
    masks = sys.argv[1:]
    files(masks, process)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment