andreasvc/preprocess.py

## preprocess.py
# -*- coding: UTF-8 -*-
"""Preprocessing of text files.
Writes one paragraph per line, and normalizes punctuation & whitespace.
No sentence or word tokenization.

Usage: preprocess.py [FILE]
or: preprocess.py --batch FILES...

By default, produce cleaned version given a single filename to standard output.
Diagnostic information is written to standard error.

Options:
    --batch
         Enable batch mode: convert all files and write them to a directory
         called "cleaned/" (must not exist yet). Hyphenation changes will be
         written to a logfile 'hyphchanges.txt'.
    --paratext FILE
         Specify a CSV file or excel sheet with line numbers of front and back
         matter of each text in columns "start" and "end"; filenames in column
         "Label"; will be removed and saved to a separate directory.
    --manual
         Apply corrections specified in files manual-patterns.txt and
         manual-corrections.txt. These files contain lines of the form
         filename:linenumber:text (same format as output of grep -n). The text
         in manual-patterns.txt (which can be part of a line) will be replaced
         with the corresponding text in manual-corrections.txt (i.e., both
         files must be in same order).
"""
from __future__ import print_function, unicode_literals, division
import io
import os
import sys
from getopt import gnu_getopt, GetoptError
from itertools import islice, product  # cartesian product
from collections import Counter, OrderedDict
import numpy
import pandas
try:
	import re2 as re
except ImportError:
	import re

# Manual corrections, filled in by readcorrections():
# filename => [(lineno, orig, repl), ...]
CORRECTIONS = {}
# Log of hyphenation decisions, filled in by dehyphenate():
# original => (dehyphenated, [alt1, alt2, ...], eol)
HYPHCHANGES = {}

# In Dutch, a hyphen between these combinations of letters cannot be removed.
VOWELCLASHES = {
		'aa', 'ae', 'ee', 'ie', 'oe', 'ai', 'ei', 'oi',
		'ui', 'oo', 'au', 'eu', 'ou', 'uu', 'ii', 'ij',
		}

# Leading whitespace of a line.
INDENTRE = re.compile(r'^\s+')

# Single-character ligatures and the letter sequences they stand for.
LIGATURES = dict([
		('ﬁ', 'fi'),
		('ﬀ', 'ff'),
		('ﬂ', 'fl'),
		('ﬃ', 'ffi'),
		('ﬄ', 'ffl'),
		('Ĳ', 'IJ'),
		('ĳ', 'ij'),
		('Æ', 'AE'),
		('æ', 'ae'),
		])

# Words that follow the contracted genitive determiner 'des'
# (e.g., 's avonds, 's nachts, etc).
TEMPORALCONTRACTIONS = {
		'ochtends', 'morgens', 'middags', 'avonds', 'nachts', 'winters',
		'zomers', 'zaterdags', 'zondags', 'maandags', 'dinsdags',
		'woensdags', 'donderdags', 'vrijdags', 'lands', 'werelds',
		'hemelsnaam', 'namiddags',
		}

# Acronyms that are restored to upper case by clean().
ACRONYMS = {
		'fbi', 'cia', 'dea', 'dna', 'dhs', 'ups', 'pcp', 'suv', 'kgb',
		'navo', 'swat', 'seal', 'bmw', 'csi', 'dos', 'mpeg', 'hd', 'usb',
		'gsm', 'fsc', 'klpd', 'psv', 'aex', 'kno', 'np', 'tec', 'swot',
		'gba', 'bbc', 'avro', 'adsl', 'rsa',
		}


def single(args, dictionary, paratext):
	"""Clean one file and print the result to standard output.

	:param args: positional command line arguments; the first one is used
		as input filename, '/dev/stdin' is read when there are none.
	:param dictionary: word counts, passed on to clean().
	:param paratext: table of paratext line numbers or None, passed on to
		clean().
	"""
	source = args[0] if args else '/dev/stdin'
	cleaned = clean(source, dictionary, paratext)
	print(cleaned)


def batch(args, dictionary, paratext):
	"""Clean every file given on the command line and store the results.

	Creates the directory 'cleaned/' (and 'paratext/' when a paratext table
	is given); os.mkdir fails if they already exist. Each cleaned text is
	written to 'cleaned/' under the basename of its input file; diagnostics
	go to standard error.

	:param args: list of input filenames.
	:param dictionary: word counts, passed on to clean().
	:param paratext: table of paratext line numbers or None, passed on to
		clean().
	"""
	os.mkdir('cleaned/')
	if paratext is not None:
		os.mkdir('paratext/')

	print('Dictionary has %d types and a total frequency mass of %d tokens.'
			% (len(dictionary), dictionary.sum()), file=sys.stderr)

	if CORRECTIONS:
		# compare the files mentioned in the manual corrections
		# with the files that are actually being converted.
		given = set(os.path.basename(a) for a in args)
		corrected = set(CORRECTIONS)
		orphaned = ' '.join(sorted(corrected - given)) or '{}'
		untouched = ' '.join(sorted(given - corrected)) or '{}'
		print('Corrections without corresponding files:', orphaned,
				'\n\nFiles with no corrections:', untouched,
				file=sys.stderr)

	for filename in args:
		if isinstance(filename, bytes):
			filename = filename.decode(sys.getfilesystemencoding())
		text = clean(filename, dictionary, paratext)

		# write to file
		target = 'cleaned/%s' % os.path.basename(filename)
		with io.open(target, 'w', encoding='utf8') as out:
			out.write(text)

		# report characters beyond ASCII, most frequent first
		special = ''.join(
				char for char, _ in Counter(text).most_common()
				if char > '~')
		print('Special characters:', special, file=sys.stderr)


def readtext(filename, debug=False):
	"""Read file, decode it, and apply any manual corrections.

	The file is decoded as UTF-8, falling back to windows-1252. When
	CORRECTIONS has an entry for the basename of the file, each correction
	(lineno, orig, fix) replaces ``orig`` with ``fix`` on that line.

	:param filename: path of the file to read.
	:param debug: if True, print each applied correction to standard error.
	:returns: the decoded (and possibly corrected) text.
	:raises UnicodeDecodeError: if the file is neither valid UTF-8 nor
		valid windows-1252.
	:raises ValueError: if a correction refers to a line that does not
		exist, or its pattern is not found on the given line.
	"""
	# make sure the file handle is closed again
	with open(filename, 'rb') as inp:
		data = inp.read()

	# try to decode
	try:  # we are optimistic
		text = data.decode('utf8')
	except UnicodeDecodeError:
		# if this fails as well, let the decoding error propagate
		# instead of masking it with an unbound variable error.
		text = data.decode('windows-1252')

	fname = os.path.basename(filename)
	if fname not in CORRECTIONS:
		return text

	# text.splitlines() gives wrong results: it also splits on characters
	# other than '\n', which would shift the line numbers.
	lines = io.StringIO(text).readlines()
	for lineno, orig, fix in CORRECTIONS[fname]:
		# line numbers are 1-based; 0 would silently wrap to the last line
		if not 1 <= lineno <= len(lines):
			raise ValueError('cannot apply correction, no such line:'
					'\n%s:%s: %r => %r\nfile has %d lines' % (
					fname, lineno, orig, fix, len(lines)))
		if orig not in lines[lineno - 1]:
			raise ValueError('cannot apply correction, pattern not found:'
					'\n%s:%s: %r => %r\nactual line: %r' % (
					fname, lineno, orig, fix, lines[lineno - 1]))
		lines[lineno - 1] = lines[lineno - 1].replace(orig, fix)
		if debug:
			print(fname, lineno, '\n', orig, '\n', fix, '\n',
					lines[lineno - 1], '\n', file=sys.stderr)
	return ''.join(lines)


def stripparatext(filename, text, paratext):
	"""Strip front and back matter using given table of line numbers.

	Front and back matter is written to separate directory,
	remaining core text is returned for further processing.

	:param filename: path of the text; its basename up to the first '.' is
		the label that is looked up in the index of ``paratext``.
	:param text: the complete text.
	:param paratext: pandas DataFrame with columns 'start' and 'end'
		holding the 1-based, inclusive line numbers of the core text.
	:returns: the lines from 'start' up to and including 'end'.
	:raises KeyError: if the label is not in the index of ``paratext``.
	"""
	label = os.path.basename(filename).split('.')[0]
	# label-based lookup; DataFrame.ix no longer exists in current pandas
	row = paratext.loc[label]
	start = int(row['start']) - 1
	end = int(row['end']) - 1

	# text.splitlines() gives wrong results: it also splits on characters
	# other than '\n', which would shift the line numbers.
	lines = io.StringIO(text).readlines()

	with io.open('paratext/%s_front.txt' % label, 'w',
			encoding='utf8') as out:
		out.writelines(lines[:start])
	with io.open('paratext/%s_back.txt' % label, 'w',
			encoding='utf8') as out:
		out.writelines(lines[end + 1:])
	return ''.join(lines[start:end + 1])


def clean(filename, dictionary, paratext):
	"""Apply all the filters to given file.

	The order of the steps matters: hard line breaks are detected before
	double newlines are collapsed and before dehyphenation, and paragraphs
	are only restored after page numbers and hyphenation have been handled.

	:param filename: path of the text to clean; also printed to standard
		error.
	:param dictionary: word counts, passed on to dehyphenate().
	:param paratext: table of paratext line numbers; if not None, front and
		back matter are stripped with stripparatext().
	:returns: the cleaned text, with paragraphs separated by single
		newlines.
	"""
	print(filename, file=sys.stderr)
	text = readtext(filename)
	if paratext is not None:
		text = stripparatext(filename, text, paratext)

	# get rid of carriage returns, keep line feeds;
	# when there are no line feeds at all, carriage returns are the line
	# separators and are turned into line feeds instead.
	if '\n' in text:
		text = re.sub('\r', '', text)
	else:
		text = re.sub('\r', '\n', text)

	text = expandligatures(text)
	text = simplifyunicodespacepunct(text)

	# normalize dashes: runs of two or more hyphens become a single hyphen
	text = re.sub('--+', '-', text)

	# replace square brackets because Alpino uses them for bracketed input
	text = text.replace('[', '(').replace(']', ')')

	text = fixellipses(text)

	# detect fixed-width formatting; the result is used further below to
	# decide whether paragraphs need to be restored.
	threshold = detecthardbreaks(io.StringIO(text))

	# detect double-spaced lines
	# (the detection is disabled; double newlines are always collapsed)
	# if threshold is not None and text.count('\n\n') > 0.4 * text.count('\n'):
	# 	print('detected double spacing.', text.count('\n\n'),
	# 			0.4 * text.count('\n'), text.count('\n'), file=sys.stderr)
	text = text.replace('\n\n', '\n')

	# remove page numbers, then hyphenation; NB: runningheads() is not
	# applied here, and dehyphenate() gets a fixed line length threshold
	# of 40 rather than the threshold detected above.
	text = dehyphenate(
				pagenumbers(text),
			dictionary, threshold=40, allhyphens=True)

	# remove separators, empty lines
	text = re.sub(r'\n[ \t]*(==+|\*+|\*(?: \*)*|~|\.\.|)[ \t]*(?=\n)',
			'\n', text)

	# restore paragraphs if hard line breaks are detected
	if threshold is None:
		print('No hard line breaks detected.', file=sys.stderr)
	else:
		indent = detectindent(io.StringIO(text))
		print('Removing hard line breaks; threshold=%d; indent=%d.' % (
				threshold, indent), file=sys.stderr)
		# any detected indent is passed on as a minimum indent of 2
		text = '\n'.join(fixparagraphs(io.StringIO(text),
				threshold=threshold, indent=2 if indent else 0))

	# add space after closing quotes and clause-ending punctuation
	text = fixpunctspacing(text)

	# capitalize acronyms (small caps may have been converted to
	# lower case)
	text = re.sub('\\b(%s)\\b' % '|'.join(ACRONYMS),
			lambda x: x.group().upper(),
			text)

	text = fixcontractions(text)

	# space after dashes at start of line (dialogue/list)
	text = re.sub(r'\n[ \t]*-([^ ])', r'\n- \1', text)

	# normalize whitespace
	# no leading or trailing whitespace;
	# collapse spaces and tabs to single space
	text = re.sub('\n[ \t]+', '\n', text)
	text = re.sub('[ \t]+\n', '\n', text)
	text = re.sub('[ \t]+', ' ', text)

	text = removespuriousparagraphbreaks(text)

	# one paragraph per line
	text = re.sub('\n\n+', '\n', text)

	return text


def simplifyunicodespacepunct(text):
	"""Turn various unicode whitespace and punctuation characters into simple
	ASCII equivalents where appropriate, and discard control characters.

	NB: this discards some information (e.g., left vs right quotes, dash vs
	hyphens), but given that such information is not consistently encoded
	across languages and texts, it is more reliable to normalize to a common
	denominator.

	:param text: the text to normalize.
	:returns: the text with spaces, hyphens, dashes, slashes and quotes
		replaced by ASCII characters.

	>>> simplifyunicodespacepunct('‘De verraders’, riep de sjah.')
	"'De verraders', riep de sjah."
	"""
	# Some exotic control codes not handled:
	# U+0085    NEL: Next Line
	# U+2028	LINE SEPARATOR
	# U+2029	PARAGRAPH SEPARATOR

	# Normalize spaces
	# U+00A0 NO-BREAK SPACE
	# U+2000 EN QUAD
	# U+2001 EM QUAD
	# U+2002 EN SPACE
	# U+2003 EM SPACE
	# U+2004 THREE-PER-EM SPACE
	# U+2005 FOUR-PER-EM SPACE
	# U+2006 SIX-PER-EM SPACE
	# U+2007 FIGURE SPACE
	# U+2008 PUNCTUATION SPACE
	# U+2009 THIN SPACE
	# U+200A HAIR SPACE
	text = re.sub('[\u00a0\u2000-\u200a]', ' ', text)

	# remove discretionary hyphen, soft space
	# special case: treat soft hyphen at end of line as a regular hyphen,
	# to ensure that it will be dehyphenated properly.
	text = re.sub('\u00ad+\n', '-\n', text)
	#      8 BACKSPACE
	# U+00AD SOFT HYPHEN
	# U+200B ZERO WIDTH SPACE
	# U+2027 HYPHENATION POINT
	text = re.sub('[\b\u00ad\u200b\u2027]', '', text)

	# hyphens
	# U+00B7 MIDDLE DOT
	# U+2010 HYPHEN
	# U+2011 NON-BREAKING HYPHEN
	# U+2212 MINUS SIGN
	text = re.sub('[\u00b7\u2010\u2011\u2212]', '-', text)

	# dashes/bullet points
	# U+2012 FIGURE DASH
	# U+2013 EN DASH
	# U+2014 EM DASH
	# U+2015 HORIZONTAL BAR
	# U+2022 BULLET
	# U+2043 HYPHEN BULLET
	text = re.sub('[\u2012-\u2015\u2022\u2043]', ' - ', text)

	# U+2044 FRACTION SLASH
	# U+2215 DIVISION SLASH
	# NB: this is a character class, so it needs a regex substitution;
	# str.replace() would look for the literal bracketed string.
	text = re.sub('[\u2044\u2215]', '/', text)  # e.g., 'he/she'

	# single quotes:
	# U+2018 left single quotation mark
	# U+2019 right single quotation mark
	# U+201A single low-9 quotation mark
	# U+201B single high-reversed-9 quotation mark
	# U+2039 single left-pointing angle quotation mark
	# U+203A single right-pointing angle quotation mark
	# U+02BC modifier letter apostrophe
	text = re.sub('[\u2018-\u201b\u2039\u203a\u02bc]', "'", text)

	# double quotes (including ASCII angle brackets and doubled single
	# quotes):
	# U+201C left double quotation mark
	# U+201D right double quotation mark
	# U+201E double low-9 quotation mark
	# U+201F double high-reversed-9 quotation mark
	# U+00AB left-pointing double angle quotation mark
	# U+00BB right-pointing double angle quotation mark
	text = re.sub("[\u201c-\u201f\u00ab\u00bb<>]|''", '"', text)

	return text


def fixpunctspacing(text):
	"""Insert missing spaces after closing quotes and after punctuation that
	ends a clause; strip footnote numbers attached to a period.

	>>> fixpunctspacing("'Maar natuurlijk!'zei hij.")
	"'Maar natuurlijk!' zei hij."
	>>> fixpunctspacing('P.S.:zie www.nos.nl om 12:47 etc.Ongeever 42.7 %.')
	'P.S.: zie www.nos.nl om 12:47 etc. Ongeever 42.7 %.'
	>>> fixpunctspacing(' Hoezo,waarom.En toen. Dank u.Voor de tweede kans. ')
	' Hoezo, waarom. En toen. Dank u. Voor de tweede kans. '
	>>> fixpunctspacing('NHK-collecteurs mogen niet ... het huis binnengaan.2')
	'NHK-collecteurs mogen niet ... het huis binnengaan.'
	"""
	# The rules are applied in this order. In the patterns, '[^\W\d]' stands
	# for a word character that is not a digit. A space is only inserted
	# after a single period or comma between words of two or more letters,
	# so that URLs, acronyms such as P.S., and numbers such as 1.5 are
	# preserved.
	rules = (
			# closing quote after sentence-ending punctuation
			(r"([.!?])(['\"])(\S)", r"\1\2 \3"),
			# comma: "like,this" => "like, this"
			(r'(\s[^\W\d][^\W\d]+,)([^\W\d][^\W\d]+)\b', r'\1 \2'),
			# period: "like.This" => "like. This"
			(r'([^\W\dA-Z]\.)([^\W\da-z])', r'\1 \2'),
			# enumerations
			(r'(\d+\.)([^\W\da-z])', r'\1 \2'),
			# footnote/endnote numbers directly after a period are removed
			(r'([^\W\dA-Z]\.)\d+', r'\1'),
			# colon
			(r':([^\W\d])', r': \1'),
			# other punctuation
			(r'([?!;])(\w)', r'\1 \2'),
			)
	for pattern, replacement in rules:
		text = re.sub(pattern, replacement, text, flags=re.UNICODE)
	return text


def fixellipses(text):
	"""Normalize ellipses to three periods with consistent spacing.

	>>> fixellipses('Er was eens . . . een prin...')
	'Er was eens ... een prin... '
	>>> fixellipses('Dus. ... en toen.')
	'Dus. <<< ... en toen.'
	"""
	# spaced periods and the ellipsis character become three periods
	for variant in ('. . .', '…'):
		text = text.replace(variant, '...')

	rules = (
			# space before an ellipsis, unless it follows a period,
			# a word character or a space
			(r'(?<![\.\w ])\.\.\.', ' ...'),
			# space after an ellipsis, unless a period or space follows
			(r'\.\.\.(?![\. ])', '... '),
			# ellipsis at start of new sentence erroneously gets merged
			# with any preceding sentence ending punctuation during
			# tokenization, so insert '<<<' as a separator to be removed
			# after tokenization.
			(r'([.!?;]) \.\.\.', r'\1 <<< ...'),
			)
	for pattern, replacement in rules:
		text = re.sub(pattern, replacement, text)
	return text


def expandligatures(text):
	"""Replace each ligature character listed in LIGATURES with the sequence
	of plain letters it stands for.

	>>> expandligatures('ﬁlosoof')
	'filosoof'
	"""
	# a character class matching any single ligature
	charclass = '[%s]' % ''.join(LIGATURES)

	def expand(match):
		"""Look up the expansion of the matched ligature."""
		return LIGATURES[match.group()]

	return re.sub(charclass, expand, text)


def pagenumbers(text, threshold=50):
	"""Remove page numbers from the text.

	Numbers of the form |23| are always removed. Lines consisting of only a
	number (optionally surrounded by dashes) are removed as well, but only
	if there are at least ``threshold`` of them; this avoids removing
	chapter numbers.

	:param text: the text to filter.
	:param threshold: minimum number of number-only lines required before
		they are removed.
	:returns: the filtered text."""
	# remove page numbers of the form |23| in running text;
	text = re.sub(r'\|[0-9]+\|', '', text)
	# Remove other page numbers only when on their own line.
	pagenumber = re.compile(r'\n+[\t ]*-*[\t ]*[0-9]+[\t ]*-*[\t ]*\n+[\f\v]*')
	# no need to count beyond the threshold
	hits = sum(1 for _ in islice(pagenumber.finditer(text), threshold))
	if hits < threshold:
		return text
	return pagenumber.sub('\n', text)


def runningheads(text):
	"""Remove running heads consisting of a file name, page number and/or
	time stamp, as left behind by several typesetting programs.

	Examples::

	De stiefmoeder midprice.pdf 11

	03-09-12 14:12
	---
	een diep gevoel van beschaving dat lieden elkaar slechts 9

	IJsland rev.indd | Sander Pinkse Boekproductie | 14-10-10 / 11:10 | Pag. 10
	---
	Medusa 001-384: Medusa 001-384 04-04-2012 08:15 Pagina 6
	---
	BW-Het geluid van de nacht-11e.indd 5 12-07-13 09:58
	"""
	# each known format is removed in turn
	rules = (
			# 'name.pdf 11' followed by a line with date and time
			(r'.*\.pdf \d+\n\d{1,2}-\d{1,2}-\d{2,4} \d{1,2}:\d{2}\n',
				'\n'),
			# '... | dd-mm-yy / hh:mm | Pag. 10', including a page number
			# at the end of the preceding line
			(r'(?: \d+)?\n'
				r'.* \| \d{2}-\d{2}-\d{2} / \d{2}:\d{2} \| Pag\. \d+\b',
				''),
			# '... dd-mm-yyyy hh:mm Pagina 6'
			(r'\n.* \d{1,2}-\d{1,2}-\d{2,4} +\d{1,2}:\d{2} +Pagina \d+',
				''),
			# 'name.indd 5 dd-mm-yy hh:mm'
			(r'\n.*\.indd \d+ +\d{1,2}-\d{1,2}-\d{2,4}[ \n]+'
				r'\d{1,2}:\d{2}(?::\d\d)?',
				''),
			)
	for pattern, replacement in rules:
		text = re.sub(pattern, replacement, text)
	# NB: repeating lines that are identical except for digits are not
	# removed; detecting them works (count each line with its digits
	# replaced by a wildcard and take patterns that occur 50 times or
	# more), but such lines should be removed in chunks, together with any
	# page numbers on the preceding line.
	return text


def vowelclash(seq):
	"""Decide whether joining the given parts would produce a clash.

	``seq`` alternates between word parts and separators; a separator is
	either a hyphen or the empty string (the hyphen is to be removed).
	Removing a hyphen is a clash when it would join a lower case letter to
	an upper case letter, when a digit is adjacent, or when the two adjacent
	letters form one of the combinations in VOWELCLASHES.

	>>> vowelclash(['zee', '', 'egel'])
	True
	>>> vowelclash(['zee', '-', 'egel'])
	False
	>>> vowelclash(['zee', '', 'paard'])
	False
	>>> vowelclash(['0', '', '2'])
	True"""
	# walk over (part, separator, next part) triples
	for left, joint, right in zip(seq[::2], seq[1::2], seq[2::2]):
		if joint != '':
			continue  # the hyphen is kept, nothing can clash here
		if ((left[-1].islower() and right[0].isupper())
				or left[-1].isdigit() or right[0].isdigit()
				or (left[-1] + right[0]).lower() in VOWELCLASHES):
			return True
	return False


def dehyphenate(text, dictionary, threshold=0, allhyphens=False, debug=False):
	r"""Remove line breaks due to hyphenation and remove the hyphen where
	appropriate.

	The decision is based on a dictionary of word counts; in the absence of
	data, leave the hyphen(s) unchanged.
	Hyphens next to numbers are always left unchanged.

	Every decision is recorded in the module-level HYPHCHANGES.

	:param text: the text to process.
	:param dictionary: mapping of lower case words to counts; it is queried
		with ``dictionary.get(word, 0)``. The word counts of ``text`` itself
		are used as back off.
	:param threshold: if > 0, only consider hyphens at the end of line when the
		line is at least this long.
	:param allhyphens: if True, hyphens not at the end of a line are also
		considered for removal.
	:param debug: if True, print each decision with the counts of all
		options to standard error.
	:returns: the text with hyphenation resolved.

	>>> d = Counter({'geweldig': 2, '06-nummer': 3,
	...		'noord-holland': 5, '123': 5, '1-2-3': 1})
	>>> print(dehyphenate('Ge-wel-dig 06-nummer, Noord-Hol-\nland 1-2-3',
	...		d, allhyphens=True))
	Geweldig 06-nummer, Noord-Holland 1-2-3
	"""

	def repl(match, eol=False):
		"""Produce replacement for hyphenated match.

		Replacement will not have newlines, and hyphens are removed based on
		word frequencies. If the word has more than one hyphen, all
		possibilities are considered.

		:param eol: whether this an end-of-line hyphen.
			If True, will default to removing the hyphen in the absence of
			evidence.
		"""
		# try dehyphenated variants first, or try hyphenated ones first
		sep = ('', '-') if eol else ('-', '')
		# the hyphenated word, but without newlines
		hyphenated = '%s-%s' % match.group(1, 3)
		# the exact form, including hyphens and newlines
		original = ''.join(match.group(1, 2, 3))
		# all possibilities of removing hyphens or not, e.g.:
		# 'Noord-Hol-\nland'
		# => ['NoordHolland', 'NoordHol-land', 'Noord-Holland', 'Noord-Hol-land']
		# 'abra-ca-dabra'
		# => ['abracadabra', 'abraca-dabra', 'abra-cadabra', 'abra-ca-dabra']
		components = re.findall(r'[^-]+|-+', hyphenated)
		# the number of options doubles with each hyphen, hence the limit
		if len(components) <= 15:  # i.e., max 7 hyphens.
			# options that would produce a clash (see vowelclash) are skipped
			options = [''.join(x) for x in product(*[
				sep if a[0] == '-' else (a, ) for a in components])
				if not vowelclash(x)]
			# use the one with the highest frequency; when none of the options
			# is in the dictionary, use the first option, determined by 'eol'.
			result = max(options, key=lambda x: dictionary.get(x.lower(), 0)
					or thistext.get(x.lower(), 0))
			# record the decision, keyed on the exact original form
			HYPHCHANGES[original] = result, options, eol
			if debug:
				print(original, result, eol,
						[(x, dictionary.get(x.lower(),
							1 if x.lower() in thistext else 0))
						for x in options], file=sys.stderr)
			return result
		# too many hyphens: only the newline is removed
		return hyphenated

	def repleol(match):
		"""Decide whether to dehyphenate a given end-of-line match."""
		# a hyphen may indicate the end of paragraph, when a sentence is
		# interrupted; therefore, leave line alone if the second part is
		# capitalized, unless it is common token in the dictionary
		# (geographical names).
		hyphenated = ('%s-%s' % match.group(1, 3)).lower()
		if (match.group(1)[-1].islower() != match.group(3)[0].islower()
				and match.group(1) != 'Mc'
				and (dictionary.get(hyphenated, 0)
					or thistext.get(hyphenated, 0)) < 10):
			return match.group()
		# Test if the length of this line meets the threshold
		# a: first newline from the hyphen onwards, within 'threshold' chars
		# b: last newline before the match, within 'threshold' chars
		# (with threshold=0 both searches come up empty, so the test is
		# skipped)
		a = match.string.find('\n', match.start(2), match.start(2) + threshold)
		b = match.string.rfind(
				'\n', max(0, match.start() - threshold), match.start())
		if a != -1 and b != -1 and a - b < threshold:
			return match.group()
		# no subsequent word was matched, so there is nothing to pad
		if match.group(4) is None:
			return repl(match, eol=True)
		# add original whitespace after hyphen and padding so that paragraph
		# detection based on line length is not affected; add padding after
		# first word to avoid padding being taken for start of paragraph
		# indent.
		# Before        After (where A, -, B, C are groups 1-4)
		# ....A-        ....AB
		# B C D.        C   D.
		return '%s%s%s%s' % (repl(match, eol=True), match.group(2).lstrip('-'),
				(match.group(4) or '')[1:], ' ' * len(match.group(1)))

	# collect word counts from this text that will be used as back off for the
	# main dictionary.
	thistext = pandas.Series(Counter(
			re.findall(r'[\w-]+', text.lower(), flags=re.UNICODE)))

	if allhyphens:
		# match hyphenated words on a single line of the form "A-B"
		# (but not "A- B" or "A -B" or "A--B")
		text = re.sub(r'\b(\w\w+)(-)(\w[-\w]+)\b', repl, text, flags=re.UNICODE)
	# match hyphenated words at the end of lines: A-\nB
	text = re.sub((
			'(\\w[-\\w]+)'  # (1) hyphenated word at end of line
			'(-[\\t ]*\\n[\\t ]*)'  # (2) hyphen, any whitespace
			'([-\\w\']+)'  # (3) 2nd part of hyphenated word on next line
			'( [-\\S]+ )?'),  # (4) subsequent word
			repleol, text, flags=re.UNICODE)
	return text


def detecthardbreaks(lines, threshold=0.8, maxlen=100):
	"""Use histogram of line lengths to determine whether at least 'threshold'
	fraction of lines have a length between 0 and the most common line length,
	indicating hard line breaks (i.e., line breaks that do not signify a
	paragraph break but are used for formatting). When the most common line
	length is greater than 'maxlen', assume there is no fixed-width formatting.

	:param lines: an iterable of lines.
	:param threshold: minimum fraction of the counted lines whose length is
		at most the most common line length plus a margin.
	:param maxlen: upper bound (exclusive) on the most common line length.
	:returns: the most common line length minus a margin when hard line
		breaks are detected; None otherwise, including when no line has a
		length that is counted.
	"""
	minlinelength = 20  # Ignore lines of at most 20 chars
	maxlinelength = 1900  # Discard larger than this to stay in range
	binwidth = 10  # Margin around the most common line length

	# Build the line length histogram
	hist = Counter()
	numlines = 0
	for line in lines:
		length = len(line)
		if minlinelength < length < maxlinelength:
			hist[length] += 1
			numlines += 1
	if not numlines:
		# Callers test the result with 'is None'; returning False here
		# would be taken for a threshold of 0.
		return None

	# create a bucket for the interval [0, mode + binwidth]
	mode = max(hist, key=hist.get)
	freqmass = sum(hist[a] for a in hist
			if 0 < a <= mode + binwidth)
	proportion = freqmass / float(numlines)
	# are at least 'threshold' fraction of lines in biggest bucket?
	if mode < maxlen and proportion >= threshold:
		print('line length counts:', hist, file=sys.stderr)
		print('lines in [0, %d + %d]: %d of %d (%g >= %g)' % (
				mode, binwidth, freqmass, numlines,
				proportion, threshold), file=sys.stderr)
		return mode - binwidth
	print('freqmass %g; mode %d; proportion %g' % (
			freqmass, mode, proportion),
			file=sys.stderr)
	return None


def indentlen(line):
	"""Measure the indentation of a line: each leading space counts as 1 and
	each leading tab as 8; other leading whitespace counts as 0."""
	leading = INDENTRE.match(line)
	if leading is None:
		return 0
	whitespace = leading.group()
	spaces = whitespace.count(' ')
	tabs = whitespace.count('\t')
	return spaces + 8 * tabs


def detectindent(lines, threshold=0.05):
	"""Determine whether the text appears to use indentation as paragraph
	markers. Returns 0 if no suitable indent is detected.

	The absolute difference in indentation between each pair of consecutive
	lines is counted; the five most frequent differences are considered.

	:param lines: an iterable of lines.
	:param threshold: minimum proportion of lines that should have the
		indentation.
	:returns: the relative indent that indicates a paragraph start, i.e.,
		the most frequent difference greater than 1 that meets the
		threshold."""
	x = numpy.array([indentlen(line) for line in lines])
	# value_counts() orders the differences by decreasing frequency
	hist = pandas.Series(x[:-1] - x[1:]).abs().value_counts()
	print('indent counts:', Counter(hist.to_dict()), file=sys.stderr)
	total = hist.sum()
	# .iloc makes the positional slice explicit (the index holds numbers);
	# .items() replaces .iteritems(), which current pandas no longer has.
	for a, b in hist.iloc[:5].items():
		if a > 1 and b > threshold * total:
			return a
	return 0


def fixparagraphs(lines, threshold=45, indent=0):
	"""Join fixed-width lines into paragraphs and yield them one by one.

	:param threshold: A line shorter than this ends the current paragraph.
	:param indent: A line indented by `indent` spaces or more (a tab equals
		8 spaces) starts a new paragraph. A value of 0 disables this
		feature. Assumes that any margins have already been trimmed.

	>>> list(fixparagraphs([  # doctest: +ELLIPSIS
	...     'A new paragraph also starts',
	...     'when the current line is ',
	...     'indented by at least this ',
	...     'number of spaces (a tab equals',
	...     '8 spaces).',
	...     'A value < 0 disables this ',
	...     'feature.'], threshold=20))
	['A new paragraph ... equals 8 spaces).', 'A value ... this feature.']
	>>> list(fixparagraphs([  # doctest: +ELLIPSIS
	...     '   A new paragraph also ',
	...     'starts when the current ',
	...     'line is indented by at ',
	...     'least this number of spaces ',
	...     '(a tab equals 8 spaces).',
	...     '   A value < 0 disables this ',
	...     'feature.'], threshold=20, indent=3))
	['A new paragraph ... equals 8 spaces).', 'A value ... this feature.']
	"""
	pending = []  # stripped lines of the paragraph being collected
	for rawline in lines:
		depth = indentlen(rawline)
		content = rawline.strip()
		if indent and depth >= indent:
			# an indented line opens a new paragraph;
			# emit what was collected so far.
			if pending:
				yield ' '.join(pending)
			pending = [content]
			continue
		pending.append(content)
		if len(rawline) < threshold:
			# a short line closes the paragraph
			yield ' '.join(pending)
			pending = []
	# whatever remains after the last line
	if pending:
		yield ' '.join(pending)


def fixcontractions(text):
	"""Normalize the spacing around 's, both as contraction of the Dutch
	genitive determiner 'des' and as plural marker of acronyms.

	>>> fixcontractions("Het was 'savonds laat.")
	"Het was 's avonds laat."
	>>> fixcontractions("Een enorme berg TFT 's en CRT 's.")
	"Een enorme berg TFT's en CRT's."
	>>> fixcontractions("Bedenk 's wat nieuws.")
	"Bedenk 's wat nieuws."
	"""
	words = '|'.join(TEMPORALCONTRACTIONS)

	# Insert a space between 's and a directly following word from the list
	# ('s avonds, 's nachts, etc).
	separated = re.sub(
			"('s)(%s)" % words, r'\1 \2', text, flags=re.IGNORECASE)

	# Attach 's to a preceding upper case acronym, unless one of the listed
	# words follows. This cannot be done for all capitalized words,
	# e.g. "Bedenk ’s wat nieuws".
	return re.sub(
			"\\b([A-Z]+) +'s\\b(?! (?:%s))" % words,
			r"\1's", separated, flags=re.UNICODE)


def removespuriousparagraphbreaks(text, minlength=100):
	r"""Texts may contain spurious line breaks inside paragraphs, especially
	texts without fixed-width formatting. This function applies the
	heuristic of removing paragraph breaks without sentence ending punctuation.

	A line break is replaced by a space when the line before it has more
	than ``minlength`` characters and does not end in one of ``.!?:;'")-``,
	and the line after it is not empty and does not start with a quote or
	dash.

	:param text: the text to filter.
	:param minlength: line breaks are only removed after lines longer than
		this; avoids affecting titles and other non-paragraphs.
	:returns: the filtered text.

	>>> removespuriousparagraphbreaks(
	...     'Once upon a time, in a land far far away, there was\n'
	...     'a king.', minlength=40)
	'Once upon a time, in a land far far away, there was a king.'
	>>> removespuriousparagraphbreaks(
	...     'Once upon a time, in a land far far away, --- wait!\n'
	...     'Not this again.', minlength=40)
	'Once upon a time, in a land far far away, --- wait!\nNot this again.'
	>>> removespuriousparagraphbreaks(
	...     'Once upon a time, in a land far far away, there was\n'
	...     'a king.')
	'Once upon a time, in a land far far away, there was\na king.'
	"""
	# Use a threshold to avoid affecting titles and other non-paragraphs.
	# Avoid merging paragraphs when 2nd part starts with dialogue/list dash.
	# The look-behind has a fixed width of minlength + 1 characters.
	pattern = r'(?<=[^\n]{%d}[^.!?:;\'")\n-])\n([^\n\'-])' % minlength
	return re.sub(pattern, r' \1', text)


def writehyphchanges(dictionary):
	"""Write a logfile of hyphenation changes.

	Each entry of HYPHCHANGES is written as a line to 'hyphchanges.txt',
	showing the original and the chosen form, whether it was an end-of-line
	hyphen, and the dictionary count of every option that was considered.

	:param dictionary: mapping of lower case words to counts; it is queried
		with ``dictionary.get(word, 0)``.
	"""
	# explicit encoding, as for the other output files; the platform default
	# may not be able to represent all words.
	with io.open('hyphchanges.txt', 'w', encoding='utf8') as out:
		for orig in sorted(HYPHCHANGES):
			new, opts, eol = HYPHCHANGES[orig]
			# option => count, in the order the options were considered
			counts = OrderedDict((a.lower(), dictionary.get(a.lower(), 0))
					for a in opts)
			out.write('%s %s{%s}\n' % (
					new if new == orig else '%s => %s' % (orig, new),
					'(eol) ' if eol else '',
					', '.join('%r: %s' % (a, b) for a, b in counts.items())))


def readcorrections():
	"""Read the parallel files describing manual corrections.

	Reads 'manual-patterns.txt' and 'manual-corrections.txt' from the
	current directory. Both contain lines of the form
	filename:linenumber:text; line n of the first file gives the text to be
	replaced by the text on line n of the second file. The result is added
	to the module-level CORRECTIONS.

	:raises ValueError: if the files do not have the same number of lines,
		a line is not of the expected form, or corresponding lines refer to
		a different file or line number.
	"""
	with io.open('manual-patterns.txt', encoding='utf8') as inp:
		before = inp.read().splitlines()
	with io.open('manual-corrections.txt', encoding='utf8') as inp:
		after = inp.read().splitlines()
	# explicit checks instead of assert statements,
	# which are skipped when Python runs with -O.
	if len(before) != len(after):
		raise ValueError('manual-patterns.txt has %d lines but '
				'manual-corrections.txt has %d lines'
				% (len(before), len(after)))
	for n, (orig, fix) in enumerate(zip(before, after), 1):
		origfields = orig.split(':', 2)
		fixfields = fix.split(':', 2)
		if len(origfields) != 3 or len(fixfields) != 3:
			raise ValueError('line %d of manual-patterns.txt or '
					'manual-corrections.txt is not of the form '
					'filename:linenumber:text' % n)
		filename1, lineno1, text1 = origfields
		filename2, lineno2, text2 = fixfields
		if filename1 != filename2 or lineno1 != lineno2:
			raise ValueError('line %d: pattern is for %s:%s but '
					'correction is for %s:%s'
					% (n, filename1, lineno1, filename2, lineno2))
		CORRECTIONS.setdefault(filename1, []).append(
				(int(lineno1), text1, text2))


def main():
	"""Load dictionary and parse CLI arguments.

	Reads the word counts from 'sonar-word.freqsort.lower.gz' in the current
	directory and the word list from '/usr/share/dict/dutch', then converts
	the given files in single or batch mode.
	"""
	try:
		opts, args = gnu_getopt(
				sys.argv[1:], 'h', ('help', 'batch', 'manual', 'paratext='))
	except GetoptError as err:
		print('error:', err, file=sys.stderr)
		print(__doc__)
		sys.exit(2)
	opts = dict(opts)
	# the short option is accepted by gnu_getopt above, so honor it as well
	if '--help' in opts or '-h' in opts:
		print(__doc__)
		return
	if '--batch' not in opts and not args:
		print('reading from stdin; run preprocess.py --help for help.',
				file=sys.stderr)
		sys.stdout.flush()

	# Sonar 500 corpus word counts (case folded)
	dictionary = pandas.read_table('sonar-word.freqsort.lower.gz',
			encoding='utf8', index_col=0, header=None)
	# List of Dutch words from the OpenTaal project, version 2.10-2
	with io.open('/usr/share/dict/dutch', encoding='utf8') as inp:
		wordlist = {a.rstrip().lower() for a in inp}
	# Add the words of the word list that are missing from the corpus
	# counts, with a count of 1. Index.union() is used because the '|'
	# operator on an Index is no longer a set union in current pandas.
	dictionary = dictionary[1].reindex(  # pylint: disable=no-member
			index=dictionary.index.union(pandas.Index(sorted(wordlist))),
			fill_value=1)

	# Manual corrections (format is output of grep)
	if '--manual' in opts:
		readcorrections()

	if '--paratext' in opts:
		# line numbers of front and back matter
		if opts['--paratext'].endswith(('.xlsx', '.xls')):
			paratext = pandas.read_excel(opts['--paratext'], index_col='Label')
		else:
			paratext = pandas.read_csv(opts['--paratext'], index_col='Label')
		# drop the texts for which no start line is given
		paratext = paratext[~paratext['start'].isnull()]
	else:
		paratext = None

	if '--batch' in opts:
		batch(args, dictionary, paratext)
		writehyphchanges(dictionary)
	else:
		single(args, dictionary, paratext)

# Run the command line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
	main()