Skip to content

Instantly share code, notes, and snippets.

@andreasvc
Last active February 8, 2022 09:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save andreasvc/889c4acb3e9a77f44e91 to your computer and use it in GitHub Desktop.
Save andreasvc/889c4acb3e9a77f44e91 to your computer and use it in GitHub Desktop.
# -*- coding: UTF-8 -*-
"""Preprocessing of text files.

Writes one paragraph per line, and normalizes punctuation & whitespace.
No sentence or word tokenization.

Usage: preprocess.py [FILE]
or:    preprocess.py --batch FILES...

By default, the cleaned version of a single file is written to standard
output. Diagnostic information is written to standard error.

Options:
    --batch
        Enable batch mode: convert all files and write them to a directory
        called "cleaned/" (must not exist yet). Hyphenation changes will be
        written to a logfile 'hyphchanges.txt'.
    --paratext FILE
        Specify a CSV file or Excel sheet with line numbers of front and back
        matter of each text in columns "start" and "end"; filenames in column
        "Label". Front and back matter will be removed and saved to a
        separate directory.
    --manual
        Apply corrections specified in files manual-patterns.txt and
        manual-corrections.txt. These files contain lines of the form
        filename:linenumber:text (same format as output of grep -n). The text
        in manual-patterns.txt (which can be part of a line) will be replaced
        with the corresponding text in manual-corrections.txt (i.e., both
        files must be in the same order).
"""
from __future__ import print_function, unicode_literals, division
import io
import os
import sys
from getopt import gnu_getopt, GetoptError
from itertools import islice, product # cartesian product
from collections import Counter, OrderedDict
import numpy
import pandas
try:
import re2 as re
except ImportError:
import re
# Manual corrections, filled by readcorrections():
# filename => [(lineno, orig, repl), ...]
CORRECTIONS = {}
# Log of hyphenation decisions, filled by dehyphenate():
# original => (dehyphenated, [alt1, alt2, ...], eol)
HYPHCHANGES = {}
# In Dutch, a hyphen between these combinations of letters cannot be removed.
VOWELCLASHES = set('aa ae ee ie oe ai ei oi ui oo au eu ou uu ii ij'.split())
# Leading whitespace of a line.
INDENTRE = re.compile(r'^\s+')
# Single code point ligatures => ASCII expansion. The keys are written as
# escapes so that they cannot be silently normalized to plain ASCII letters
# by an editor or text extraction tool; each key must be exactly one
# character.
LIGATURES = {
    '\ufb01': 'fi',   # LATIN SMALL LIGATURE FI
    '\ufb00': 'ff',   # LATIN SMALL LIGATURE FF
    '\ufb02': 'fl',   # LATIN SMALL LIGATURE FL
    '\ufb03': 'ffi',  # LATIN SMALL LIGATURE FFI
    '\ufb04': 'ffl',  # LATIN SMALL LIGATURE FFL
    '\u0132': 'IJ',   # LATIN CAPITAL LIGATURE IJ
    '\u0133': 'ij',   # LATIN SMALL LIGATURE IJ
    '\u00c6': 'AE',   # LATIN CAPITAL LETTER AE
    '\u00e6': 'ae',   # LATIN SMALL LETTER AE
}
# contractions of genitive determiner 'des' (e.g., 's avonds, 's nachts, etc)
TEMPORALCONTRACTIONS = set('ochtends morgens middags avonds nachts winters '
        'zomers zaterdags zondags maandags dinsdags woensdags donderdags vrijdags '
        'lands werelds hemelsnaam namiddags'.split())
# Lower-cased acronyms that clean() converts back to upper case.
ACRONYMS = set('fbi cia dea dna dhs ups pcp suv kgb navo swat '
        'seal bmw csi dos mpeg hd usb gsm fsc klpd psv aex kno np tec '
        'swot gba bbc avro adsl rsa'.split())
def single(args, dictionary, paratext):
    """Clean one file and print the result to standard output.

    :param args: positional command line arguments; the first one is the
        file to convert. When empty, '/dev/stdin' is read instead.
    :param dictionary: word counts, passed on to clean().
    :param paratext: table of front/back matter line numbers or None,
        passed on to clean().
    """
    source = args[0] if args else '/dev/stdin'
    cleaned = clean(source, dictionary, paratext)
    print(cleaned)
def batch(args, dictionary, paratext):
    """Clean every file given on the command line into 'cleaned/'.

    Creates the directory 'cleaned/' (and 'paratext/' when a paratext table
    is given); os.mkdir fails if they already exist. Progress and
    diagnostics go to standard error.

    :param args: list of filenames to convert.
    :param dictionary: word counts, passed on to clean().
    :param paratext: table of front/back matter line numbers, or None.
    """
    os.mkdir('cleaned/')
    if paratext is not None:
        os.mkdir('paratext/')
    summary = 'Dictionary has %d types and a total frequency mass of %d tokens.'
    print(summary % (len(dictionary), dictionary.sum()), file=sys.stderr)
    if CORRECTIONS:
        # Report mismatches between the corrections table and the input.
        basenames = set(os.path.basename(name) for name in args)
        orphaned = ' '.join(sorted(set(CORRECTIONS) - basenames)) or '{}'
        uncorrected = ' '.join(sorted(basenames - set(CORRECTIONS))) or '{}'
        print('Corrections without corresponding files:', orphaned,
                '\n\nFiles with no corrections:', uncorrected,
                file=sys.stderr)
    for filename in args:
        if isinstance(filename, bytes):
            filename = filename.decode(sys.getfilesystemencoding())
        text = clean(filename, dictionary, paratext)
        target = 'cleaned/%s' % os.path.basename(filename)
        with io.open(target, 'w', encoding='utf8') as out:
            out.write(text)
        # Characters sorting after '~', most frequent first.
        special = [char for char, _ in Counter(text).most_common()
                if char > '~']
        print('Special characters:', ''.join(special), file=sys.stderr)
def readtext(filename, debug=False):
    """Read a file, decode it, and apply any manual corrections.

    The file is decoded as UTF-8; if that fails, as windows-1252.
    If CORRECTIONS has an entry for the basename of the file, each
    (lineno, orig, fix) correction replaces `orig` by `fix` on the given
    (1-based) line.

    :param filename: path of the file to read.
    :param debug: if True, print each applied correction to standard error.
    :returns: the decoded (and possibly corrected) text.
    :raises ValueError: if a correction refers to a line that does not exist
        or whose text does not contain the pattern.
    :raises UnicodeDecodeError: if the file is neither valid UTF-8 nor
        valid windows-1252.
    """
    with open(filename, 'rb') as inp:  # make sure the handle is closed
        data = inp.read()
    try:  # we are optimistic
        text = data.decode('utf8')
    except UnicodeDecodeError:
        # A failure here propagates as is, instead of being masked by an
        # unbound variable in a finally clause.
        text = data.decode('windows-1252')
    fname = os.path.basename(filename)
    if fname not in CORRECTIONS:
        return text
    # text.splitlines() gives wrong results: it also splits on characters
    # other than '\n', so its line numbers would not match those of grep -n.
    lines = io.StringIO(text).readlines()
    for lineno, orig, fix in CORRECTIONS[fname]:
        if not 1 <= lineno <= len(lines):
            # Without this check, line number 0 would silently address the
            # last line through negative indexing.
            raise ValueError('cannot apply correction, no such line:'
                    '\n%s:%s: %r => %r\nnumber of lines: %d' % (
                    fname, lineno, orig, fix, len(lines)))
        if orig not in lines[lineno - 1]:
            raise ValueError('cannot apply correction, pattern not found:'
                    '\n%s:%s: %r => %r\nactual line: %r' % (
                    fname, lineno, orig, fix, lines[lineno - 1]))
        lines[lineno - 1] = lines[lineno - 1].replace(orig, fix)
        if debug:
            print(fname, lineno, '\n', orig, '\n', fix, '\n',
                    lines[lineno - 1], '\n', file=sys.stderr)
    return ''.join(lines)
def stripparatext(filename, text, paratext):
    """Strip front and back matter using given table of line numbers.

    Front and back matter are written to 'paratext/<label>_front.txt' and
    'paratext/<label>_back.txt' (the directory must exist); the remaining
    core text is returned for further processing.

    :param filename: path of the text; the basename up to the first '.'
        is the label looked up in the index of `paratext`.
    :param text: the full text of the file.
    :param paratext: pandas table indexed by label, with 1-based, inclusive
        line numbers of the core text in columns 'start' and 'end'.
    :returns: the lines from 'start' up to and including 'end'.
    :raises KeyError: if the label is not in the table.
    """
    label = os.path.basename(filename).split('.')[0]
    # .loc is the label-based replacement for the removed .ix indexer.
    row = paratext.loc[label]
    first = int(row['start']) - 1
    last = int(row['end']) - 1
    # text.splitlines() gives wrong results: it splits on more than '\n'.
    lines = io.StringIO(text).readlines()
    with io.open('paratext/%s_front.txt' % label, 'w',
            encoding='utf8') as out:
        out.writelines(lines[:first])
    with io.open('paratext/%s_back.txt' % label, 'w',
            encoding='utf8') as out:
        out.writelines(lines[last + 1:])
    return ''.join(lines[first:last + 1])
def clean(filename, dictionary, paratext):
    """Apply all the filters to given file.

    The order of the steps matters: later steps rely on the normalization
    done by earlier ones.

    :param filename: path of the file to clean; echoed to standard error.
    :param dictionary: word counts used by dehyphenate().
    :param paratext: table of front/back matter line numbers, or None to
        keep the whole text.
    :returns: the cleaned text, one paragraph per line.
    """
    print(filename, file=sys.stderr)
    text = readtext(filename)
    if paratext is not None:
        text = stripparatext(filename, text, paratext)
    # get rid of carriage returns, keep line feeds; when there are no line
    # feeds at all, carriage returns are the line separators.
    if '\n' in text:
        text = re.sub('\r', '', text)
    else:
        text = re.sub('\r', '\n', text)
    text = expandligatures(text)
    text = simplifyunicodespacepunct(text)
    # normalize dashes
    text = re.sub('--+', '-', text)
    # replace square brackets because Alpino uses them for bracketed input
    text = text.replace('[', '(').replace(']', ')')
    text = fixellipses(text)
    # detect fixed-width formatting
    threshold = detecthardbreaks(io.StringIO(text))
    # NB: double-spaced lines are collapsed unconditionally.
    text = text.replace('\n\n', '\n')
    # remove page numbers and hyphenation (runningheads() is not applied
    # here); dehyphenate() uses a fixed line length threshold of 40.
    text = dehyphenate(
            pagenumbers(text),
            dictionary, threshold=40, allhyphens=True)
    # remove separators, empty lines
    text = re.sub(r'\n[ \t]*(==+|\*+|\*(?: \*)*|~|\.\.|)[ \t]*(?=\n)',
            '\n', text)
    # restore paragraphs if hard line breaks are detected. Any falsy-by-type
    # result (None or False) means "nothing detected"; treating False as a
    # threshold of 0 would merge the whole text into a single paragraph.
    if threshold is None or threshold is False:
        print('No hard line breaks detected.', file=sys.stderr)
    else:
        indent = detectindent(io.StringIO(text))
        print('Removing hard line breaks; threshold=%d; indent=%d.' % (
                threshold, indent), file=sys.stderr)
        text = '\n'.join(fixparagraphs(io.StringIO(text),
                threshold=threshold, indent=2 if indent else 0))
    # add space after closing quotes and clause-ending punctuation
    text = fixpunctspacing(text)
    # capitalize acronyms (small caps may have been converted to
    # lower case)
    text = re.sub('\\b(%s)\\b' % '|'.join(ACRONYMS),
            lambda x: x.group().upper(),
            text)
    text = fixcontractions(text)
    # space after dashes at start of line (dialogue/list)
    text = re.sub(r'\n[ \t]*-([^ ])', r'\n- \1', text)
    # normalize whitespace
    # no leading or trailing whitespace;
    # collapse spaces and tabs to single space
    text = re.sub('\n[ \t]+', '\n', text)
    text = re.sub('[ \t]+\n', '\n', text)
    text = re.sub('[ \t]+', ' ', text)
    text = removespuriousparagraphbreaks(text)
    # one paragraph per line
    text = re.sub('\n\n+', '\n', text)
    return text
def simplifyunicodespacepunct(text):
    """Turn various unicode whitespace and punctuation characters into simple
    ASCII equivalents where appropriate, and discard control characters.

    NB: this discards some information (e.g., left vs right quotes, dash vs
    hyphens), but given that such information is not consistently encoded
    across languages and texts, it is more reliable to normalize to a common
    denominator.

    :param text: a unicode string.
    :returns: the normalized string.

    >>> simplifyunicodespacepunct('‘De verraders’, riep de sjah.')
    "'De verraders', riep de sjah."
    """
    # Some exotic control codes not handled:
    # U+0085 NEL: Next Line
    # U+2028 LINE SEPARATOR
    # U+2029 PARAGRAPH SEPARATOR

    # Normalize spaces
    # U+00A0 NO-BREAK SPACE
    # U+2000 EN QUAD
    # U+2001 EM QUAD
    # U+2002 EN SPACE
    # U+2003 EM SPACE
    # U+2004 THREE-PER-EM SPACE
    # U+2005 FOUR-PER-EM SPACE
    # U+2006 SIX-PER-EM SPACE
    # U+2007 FIGURE SPACE
    # U+2008 PUNCTUATION SPACE
    # U+2009 THIN SPACE
    # U+200A HAIR SPACE
    text = re.sub('[\u00a0\u2000-\u200a]', ' ', text)
    # remove discretionary hyphen, soft space
    # special case: treat soft hyphen at end of line as a regular hyphen,
    # to ensure that it will be dehyphenated properly. This must happen
    # before the remaining soft hyphens are deleted below.
    text = re.sub('\u00ad+\n', '-\n', text)
    # 8 BACKSPACE
    # U+00AD SOFT HYPHEN
    # U+200B ZERO WIDTH SPACE
    # U+2027 HYPHENATION POINT
    text = re.sub('[\b\u00ad\u200b\u2027]', '', text)
    # hyphens
    # U+00B7 MIDDLE DOT
    # U+2010 HYPHEN
    # U+2011 NON-BREAKING HYPHEN
    # U+2212 MINUS SIGN
    text = re.sub('[\u00b7\u2010\u2011\u2212]', '-', text)
    # dashes/bullet points
    # U+2012 FIGURE DASH
    # U+2013 EN DASH
    # U+2014 EM DASH
    # U+2015 HORIZONTAL BAR
    # U+2022 BULLET
    # U+2043 HYPHEN BULLET
    text = re.sub('[\u2012-\u2015\u2022\u2043]', ' - ', text)
    # U+2044 FRACTION SLASH
    # U+2215 DIVISION SLASH
    # NB: this has to be a regex substitution; str.replace() would look for
    # the literal text of the character class and never match.
    text = re.sub('[\u2044\u2215]', '/', text)  # e.g., 'he/she'
    # single quotes:
    # U+2018 left single quotation mark
    # U+2019 right single quotation mark
    # U+201A single low-9 quotation mark
    # U+201B single high-reversed-9 quotation mark
    # U+2039 single left-pointing angle quotation mark
    # U+203A single right-pointing angle quotation mark
    # U+02BC modifier letter apostrophe
    text = re.sub('[\u2018-\u201b\u2039\u203a\u02bc]', "'", text)
    # double quotes:
    # U+201C left double quotation mark
    # U+201D right double quotation mark
    # U+201E double low-9 quotation mark
    # U+201F double high-reversed-9 quotation mark
    # U+00AB left-pointing double angle quotation mark
    # U+00BB right-pointing double angle quotation mark
    # ASCII '<', '>' and two consecutive single quotes are treated as double
    # quotes as well.
    text = re.sub("[\u201c-\u201f\u00ab\u00bb<>]|''", '"', text)
    return text
def fixpunctspacing(text):
    """Insert missing spaces after closing quotes and punctuation.

    Also drops footnote/endnote numbers glued to a sentence-final period.

    >>> fixpunctspacing("'Maar natuurlijk!'zei hij.")
    "'Maar natuurlijk!' zei hij."
    >>> fixpunctspacing('P.S.:zie www.nos.nl om 12:47 etc.Ongeever 42.7 %.')
    'P.S.: zie www.nos.nl om 12:47 etc. Ongeever 42.7 %.'
    >>> fixpunctspacing(' Hoezo,waarom.En toen. Dank u.Voor de tweede kans. ')
    ' Hoezo, waarom. En toen. Dank u. Voor de tweede kans. '
    >>> fixpunctspacing('NHK-collecteurs mogen niet ... het huis binnengaan.2')
    'NHK-collecteurs mogen niet ... het huis binnengaan.'
    """
    # Applied strictly in this order. In the patterns, '[^\W\d]' stands for
    # "\w minus digits", i.e., a letter (or underscore).
    substitutions = (
        # whitespace after a closing quote that follows [.!?]
        (r"([.!?])(['\"])(\S)", r"\1\2 \3"),
        # comma between two words of 2 or more letters each
        (r'(\s[^\W\d][^\W\d]+,)([^\W\d][^\W\d]+)\b', r'\1 \2'),
        # period: "like.This" => "like. This"; requires a non-uppercase
        # letter before and a non-lowercase one after the period, which
        # preserves URLs, acronyms such as P.S., and numbers such as 1.5
        (r'([^\W\dA-Z]\.)([^\W\da-z])', r'\1 \2'),
        # enumerations: "1.Item" => "1. Item"
        (r'(\d+\.)([^\W\da-z])', r'\1 \2'),
        # footnote/endnote numbers directly after a period
        (r'([^\W\dA-Z]\.)\d+', r'\1'),
        # colon followed by a letter
        (r':([^\W\d])', r': \1'),
        # other punctuation followed by a word character
        (r'([?!;])(\w)', r'\1 \2'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text, flags=re.UNICODE)
    return text
def fixellipses(text):
    """Normalize ellipses to three periods set off by spaces.

    >>> fixellipses('Er was eens . . . een prin...')
    'Er was eens ... een prin... '
    >>> fixellipses('Dus. ... en toen.')
    'Dus. <<< ... en toen.'
    """
    # spaced-out periods and the single ellipsis character
    text = text.replace('. . .', '...').replace('…', '...')
    # space before an ellipsis, unless preceded by a period, word character
    # or space
    text = re.sub(r'(?<![\.\w ])\.\.\.', ' ...', text)
    # space after an ellipsis, unless followed by a period or space
    text = re.sub(r'\.\.\.(?![\. ])', '... ', text)
    # An ellipsis at the start of a new sentence erroneously gets merged with
    # any preceding sentence-ending punctuation during tokenization, so
    # insert '<<<' as a separator to be removed after tokenization.
    return re.sub(r'([.!?;]) \.\.\.', r'\1 <<< ...', text)
def expandligatures(text):
    """Expand unicode ligatures into multiple ASCII characters.

    Every key of the LIGATURES table that occurs in `text` is replaced by
    its value; e.g., if the table maps the code point U+FB01 (LATIN SMALL
    LIGATURE FI) to 'fi', that single character becomes two letters.

    :param text: a unicode string.
    :returns: the string with all table keys replaced.
    """
    if not LIGATURES:
        return text  # an empty alternation would match everywhere
    # Build an alternation of escaped keys instead of a character class:
    # a character class silently breaks (KeyError in the callback, or a
    # corrupted class) as soon as a key is longer than one code point or
    # contains a character that is special inside [...]. Longer keys go
    # first so that they take precedence over keys that are their prefix.
    pattern = '|'.join(re.escape(key)
            for key in sorted(LIGATURES, key=len, reverse=True))
    return re.sub(pattern, lambda match: LIGATURES[match.group()], text)
def pagenumbers(text, threshold=50):
    """Remove page numbers from a text.

    Numbers of the form |23| are always removed, wherever they occur.
    Lines consisting only of a number (optionally flanked by dashes) are
    removed only if there are at least `threshold` of them, to avoid
    removing chapter numbers.

    :param text: the text to process.
    :param threshold: minimum number of number-only lines required before
        any of them is removed.
    :returns: the text without page numbers.
    """
    text = re.sub(r'\|[0-9]+\|', '', text)
    standalone = re.compile(
            r'\n+[\t ]*-*[\t ]*[0-9]+[\t ]*-*[\t ]*\n+[\f\v]*')
    # Stop counting as soon as the threshold is reached.
    hits = sum(1 for _ in islice(standalone.finditer(text), threshold))
    if hits < threshold:
        return text
    return standalone.sub('\n', text)
def runningheads(text):
    """Strip running heads (file name, page number, time stamp).

    Examples of the four supported layouts::

        De stiefmoeder midprice.pdf 11
        03-09-12 14:12
        ---
        een diep gevoel van beschaving dat lieden elkaar slechts 9
        IJsland rev.indd | Sander Pinkse Boekproductie | 14-10-10 / 11:10 | Pag. 10
        ---
        Medusa 001-384: Medusa 001-384 04-04-2012 08:15 Pagina 6
        ---
        BW-Het geluid van de nacht-11e.indd 5 12-07-13 09:58

    :param text: the text to process.
    :returns: the text with matching running heads removed.
    """
    # (pattern, replacement) pairs, applied in this order.
    layouts = (
        # "<title>.pdf <page>" followed by a line with date and time
        (r'.*\.pdf \d+\n\d{1,2}-\d{1,2}-\d{2,4} \d{1,2}:\d{2}\n',
            '\n'),
        # "... | dd-mm-yy / hh:mm | Pag. <page>", including a page number
        # at the end of the preceding line
        (r'(?: \d+)?\n'
            r'.* \| \d{2}-\d{2}-\d{2} / \d{2}:\d{2} \| Pag\. \d+\b',
            ''),
        # "... <date> <time> Pagina <page>"
        (r'\n.* \d{1,2}-\d{1,2}-\d{2,4} +\d{1,2}:\d{2} +Pagina \d+',
            ''),
        # "<title>.indd <page> <date> <time>"
        (r'\n.*\.indd \d+ +\d{1,2}-\d{1,2}-\d{2,4}[ \n]+'
            r'\d{1,2}:\d{2}(?::\d\d)?',
            ''),
    )
    for pattern, replacement in layouts:
        text = re.sub(pattern, replacement, text)
    # NB: lines that repeat identically except for digits are not removed.
    # Detecting them is possible, but they should be removed in chunks,
    # together with any page numbers on the preceding line.
    return text
def vowelclash(seq):
    """Test whether joining two words in the sequence would be invalid.

    :param seq: an alternating sequence of words and separators
        [word, sep, word, sep, ...], where a separator is either '-' (keep
        the hyphen) or '' (join the words).
    :returns: True if any pair of words joined by '' clashes: a lower case
        letter followed by an upper case letter, a digit on either side, or
        a letter combination listed in VOWELCLASHES.

    >>> vowelclash(['zee', '', 'egel'])
    True
    >>> vowelclash(['zee', '-', 'egel'])
    False
    >>> vowelclash(['zee', '', 'paard'])
    False
    >>> vowelclash(['0', '', '2'])
    True"""
    for left, joint, right in zip(seq[::2], seq[1::2], seq[2::2]):
        if joint != '':
            continue  # the hyphen stays, so nothing can clash here
        last, first = left[-1], right[0]
        if last.islower() and first.isupper():
            return True
        if last.isdigit() or first.isdigit():
            return True
        if (last + first).lower() in VOWELCLASHES:
            return True
    return False
def dehyphenate(text, dictionary, threshold=0, allhyphens=False, debug=False):
    r"""Remove line breaks due to hyphenation and remove the hyphen where
    appropriate.

    The decision is based on a dictionary of word counts, with the word
    counts of the text itself as back off; in the absence of data, leave
    the hyphen(s) unchanged (except for end-of-line hyphens, which are then
    removed). Hyphens next to numbers are always left unchanged.
    Every decision is recorded in the global HYPHCHANGES.

    :param text: the text to process.
    :param dictionary: mapping of lower case words to counts, queried with
        .get(word, 0).
    :param threshold: if > 0, only consider hyphens at the end of line when the
        line is at least this long.
    :param allhyphens: if True, hyphens not at the end of a line are also
        considered for removal.
    :param debug: if True, print each decision to standard error.
    :returns: the dehyphenated text.

    >>> d = Counter({'geweldig': 2, '06-nummer': 3,
    ...     'noord-holland': 5, '123': 5, '1-2-3': 1})
    >>> print(dehyphenate('Ge-wel-dig 06-nummer, Noord-Hol-\nland 1-2-3',
    ...     d, allhyphens=True))
    Geweldig 06-nummer, Noord-Holland 1-2-3
    """
    def repl(match, eol=False):
        """Produce replacement for hyphenated match.

        Replacement will not have newlines, and hyphens are removed based on
        word frequencies. If the word has more than one hyphen, all
        possibilities are considered.

        :param match: a regex match whose groups 1-3 are the first part,
            the hyphen (with any whitespace/newline), and the second part.
        :param eol: whether this is an end-of-line hyphen.
            If True, will default to removing the hyphen in the absence of
            evidence.
        """
        # try dehyphenated variants first, or try hyphenated ones first
        sep = ('', '-') if eol else ('-', '')
        # the hyphenated word, but without newlines
        hyphenated = '%s-%s' % match.group(1, 3)
        # the exact form, including hyphens and newlines
        original = ''.join(match.group(1, 2, 3))
        # all possibilities of removing hyphens or not, e.g.:
        # 'Noord-Hol-\nland'
        # => ['NoordHolland', 'NoordHol-land', 'Noord-Holland', 'Noord-Hol-land']
        # 'abra-ca-dabra'
        # => ['abracadabra', 'abraca-dabra', 'abra-cadabra', 'abra-ca-dabra']
        components = re.findall(r'[^-]+|-+', hyphenated)
        # the number of options doubles with each hyphen, so give up on
        # words with too many of them.
        if len(components) <= 15:  # i.e., max 7 hyphens.
            options = [''.join(x) for x in product(*[
                    sep if a[0] == '-' else (a, ) for a in components])
                    if not vowelclash(x)]
            # use the one with the highest frequency; when none of the options
            # is in the dictionary, use the first option, determined by 'eol'.
            # The counts of this text are only consulted when the dictionary
            # count is zero.
            result = max(options, key=lambda x: dictionary.get(x.lower(), 0)
                    or thistext.get(x.lower(), 0))
            HYPHCHANGES[original] = result, options, eol
            if debug:
                print(original, result, eol,
                        [(x, dictionary.get(x.lower(),
                            1 if x.lower() in thistext else 0))
                        for x in options], file=sys.stderr)
            return result
        return hyphenated

    def repleol(match):
        """Decide whether to dehyphenate a given end-of-line match.

        Returns the matched text unchanged when the hyphen is left alone.
        """
        # a hyphen may indicate the end of paragraph, when a sentence is
        # interrupted; therefore, leave line alone if the second part is
        # capitalized, unless it is common token in the dictionary
        # (geographical names).
        hyphenated = ('%s-%s' % match.group(1, 3)).lower()
        if (match.group(1)[-1].islower() != match.group(3)[0].islower()
                and match.group(1) != 'Mc'
                and (dictionary.get(hyphenated, 0)
                    or thistext.get(hyphenated, 0)) < 10):
            return match.group()
        # Test if the length of this line meets the threshold:
        # a is the newline at the end of this line, b the newline before it;
        # both are only searched for within 'threshold' characters.
        a = match.string.find('\n', match.start(2), match.start(2) + threshold)
        b = match.string.rfind(
                '\n', max(0, match.start() - threshold), match.start())
        if a != -1 and b != -1 and a - b < threshold:
            return match.group()
        if match.group(4) is None:
            return repl(match, eol=True)
        # add original whitespace after hyphen and padding so that paragraph
        # detection based on line length is not affected; add padding after
        # first word to avoid padding being taken for start of paragraph
        # indent.
        # Before        After (where A, -, B, C are groups 1-4)
        # ....A-        ....AB
        # B C D.        C D.
        return '%s%s%s%s' % (repl(match, eol=True), match.group(2).lstrip('-'),
                (match.group(4) or '')[1:], ' ' * len(match.group(1)))

    # collect word counts from this text that will be used as back off for the
    # main dictionary.
    thistext = pandas.Series(Counter(
            re.findall(r'[\w-]+', text.lower(), flags=re.UNICODE)))
    if allhyphens:
        # match hyphenated words on a single line of the form "A-B"
        # (but not "A- B" or "A -B" or "A--B")
        text = re.sub(r'\b(\w\w+)(-)(\w[-\w]+)\b', repl, text, flags=re.UNICODE)
    # match hyphenated words at the end of lines: A-\nB
    text = re.sub((
            '(\\w[-\\w]+)'  # (1) hyphenated word at end of line
            '(-[\\t ]*\\n[\\t ]*)'  # (2) hyphen, any whitespace
            '([-\\w\']+)'  # (3) 2nd part of hyphenated word on next line
            '( [-\\S]+ )?'),  # (4) subsequent word
            repleol, text, flags=re.UNICODE)
    return text
def detecthardbreaks(lines, threshold=0.8, maxlen=100):
    """Detect hard line breaks from a histogram of line lengths.

    Hard line breaks are line breaks that do not signify a paragraph break
    but are used for formatting. They are assumed when at least `threshold`
    fraction of the measured lines is no longer than the most common line
    length (plus a margin), and that most common length is below `maxlen`.

    :param lines: an iterable of lines (e.g., a file object); line lengths
        include any trailing newline.
    :param threshold: minimum fraction of lines that must fall in the
        interval [0, most common length + margin].
    :param maxlen: when the most common line length is this long or longer,
        assume there is no fixed-width formatting.
    :returns: the line length below which a line is taken to end a
        paragraph (most common length minus the margin), or None if no hard
        line breaks are detected; this includes the case where no line has
        a measurable length.
    """
    minlinelength = 20  # Ignore lines of up to 20 chars (typical of spaces)
    maxlinelength = 1900  # Discard this long or longer to stay in range
    binwidth = 10  # Margin around the most common line length
    # Build the line length histogram.
    # NOTE(review): only lines inside the length window are counted towards
    # the total; confirm against the original, indented source.
    hist = Counter(
            length for length in (len(line) for line in lines)
            if minlinelength < length < maxlinelength)
    numlines = sum(hist.values())
    if not numlines:
        # Nothing to measure. Return None like the other negative outcome,
        # so that callers testing 'is None' do not mistake False for a
        # threshold of 0.
        return None
    # create a bucket for the interval [0, mode + binwidth]
    mode = max(hist, key=hist.get)
    freqmass = sum(count for length, count in hist.items()
            if 0 < length <= mode + binwidth)
    proportion = freqmass / float(numlines)
    # are at least 'threshold' fraction of lines in biggest bucket?
    if mode < maxlen and proportion >= threshold:
        print('line length counts:', hist, file=sys.stderr)
        print('lines in [0, %d + %d]: %d of %d (%g >= %g)' % (
                mode, binwidth, freqmass, numlines,
                proportion, threshold), file=sys.stderr)
        return mode - binwidth
    print('freqmass %g; mode %d; proportion %g' % (
            freqmass, mode, proportion),
            file=sys.stderr)
    return None
def indentlen(line):
    """Return the width of the indentation of a line.

    Spaces in the leading whitespace count as 1, tabs as 8; any other
    leading whitespace does not add to the width.
    """
    leading = INDENTRE.match(line)
    if leading is None:
        return 0
    whitespace = leading.group()
    return whitespace.count(' ') + 8 * whitespace.count('\t')
def detectindent(lines, threshold=0.05):
    """Determine whether the text appears to use indentation as paragraph
    markers.

    Looks at the absolute difference in indentation between each pair of
    consecutive lines, and considers the 5 most common differences.

    :param lines: an iterable of lines.
    :param threshold: minimum proportion of line pairs that should have the
        indentation difference.
    :returns: the relative indent that indicates a paragraph start,
        or 0 if no suitable indent is detected.
    """
    indents = numpy.array([indentlen(line) for line in lines])
    # value_counts() sorts by frequency, most common first.
    hist = pandas.Series(indents[:-1] - indents[1:]).abs().value_counts()
    print('indent counts:', Counter(hist.to_dict()), file=sys.stderr)
    total = hist.sum()
    # .iloc makes the slice explicitly positional (the index consists of
    # numbers); .items() replaces .iteritems(), which pandas 2 removed.
    for difference, count in hist.iloc[:5].items():
        if difference > 1 and count > threshold * total:
            return difference
    return 0
def fixparagraphs(lines, threshold=45, indent=0):
    """Yield paragraphs given an iterable of fixed-width lines.

    Each paragraph is yielded as a single string: its stripped lines joined
    by single spaces.

    :param threshold: A paragraph ends with the first line whose length
        (including any trailing newline) is below this threshold.
    :param indent: A new paragraph also starts when the indentation of
        the current line is `indent` spaces or more (a tab equals 8 spaces).
        A value of 0 disables this feature. Assumes that any margins have
        already been trimmed.

    >>> list(fixparagraphs([  # doctest: +ELLIPSIS
    ...     'A new paragraph also starts',
    ...     'when the current line is ',
    ...     'indented by at least this ',
    ...     'number of spaces (a tab equals',
    ...     '8 spaces).',
    ...     'A value < 0 disables this ',
    ...     'feature.'], threshold=20))
    ['A new paragraph ... equals 8 spaces).', 'A value ... this feature.']
    >>> list(fixparagraphs([  # doctest: +ELLIPSIS
    ...     '    A new paragraph also ',
    ...     'starts when the current ',
    ...     'line is indented by at ',
    ...     'least this number of spaces ',
    ...     '(a tab equals 8 spaces).',
    ...     '    A value < 0 disables this ',
    ...     'feature.'], threshold=20, indent=3))
    ['A new paragraph ... equals 8 spaces).', 'A value ... this feature.']
    """
    pending = []  # stripped lines of the paragraph being collected
    for line in lines:
        width = indentlen(line)
        content = line.strip()
        if indent and width >= indent:
            # An indented line starts a paragraph: flush the previous one.
            if pending:
                yield ' '.join(pending)
            pending = [content]
            continue
        pending.append(content)
        if len(line) < threshold:
            # A short line is the last line of its paragraph.
            yield ' '.join(pending)
            pending = []
    if pending:
        yield ' '.join(pending)
def fixcontractions(text):
    """Repair the spacing around "'s" in Dutch text.

    Inserts the space in contractions of the genitive determiner 'des'
    ('s avonds, 's nachts, etc.), and attaches plural markers to preceding
    acronyms.

    >>> fixcontractions("Het was 'savonds laat.")
    "Het was 's avonds laat."
    >>> fixcontractions("Een enorme berg TFT 's en CRT 's.")
    "Een enorme berg TFT's en CRT's."
    >>> fixcontractions("Bedenk 's wat nieuws.")
    "Bedenk 's wat nieuws."
    """
    words = '|'.join(TEMPORALCONTRACTIONS)
    # 'savonds => 's avonds
    separated = re.sub(
            "('s)(%s)" % words, r'\1 \2', text, flags=re.IGNORECASE)
    # TFT 's => TFT's, unless the 's belongs to a following contraction.
    # This cannot be done for all capitalized words,
    # e.g. "Bedenk 's wat nieuws".
    return re.sub(
            "\\b([A-Z]+) +'s\\b(?! (?:%s))" % words,
            r"\1's", separated, flags=re.UNICODE)
def removespuriousparagraphbreaks(text):
    r"""Join lines that were broken in the middle of a sentence.

    Texts may contain spurious line breaks inside paragraphs, especially
    texts without fixed-width formatting. A line break is replaced by a
    space when the line before it has more than 100 characters and does not
    end in sentence-ending punctuation, a quote, closing parenthesis or
    dash, and the next line is not empty and does not start with a quote or
    dash.

    >>> removespuriousparagraphbreaks('a' * 101 + '\nb') == 'a' * 101 + ' b'
    True
    >>> removespuriousparagraphbreaks('a' * 101 + '.\nb') == 'a' * 101 + '.\nb'
    True
    >>> removespuriousparagraphbreaks('A short title\nText.')
    'A short title\nText.'
    """
    # The length requirement avoids affecting titles and other
    # non-paragraphs; the excluded dash avoids merging with a line that
    # starts with a dialogue/list dash.
    brokenline = r'(?<=[^\n]{100}[^.!?:;\'")\n-])\n([^\n\'-])'
    return re.sub(brokenline, r' \1', text)
def writehyphchanges(dictionary):
    """Write a logfile of hyphenation changes to 'hyphchanges.txt'.

    One line is written per entry of HYPHCHANGES, sorted by original form:
    the change ("original => new", or just the form if unchanged), the
    marker "(eol)" for end-of-line hyphens, and the options that were
    considered with their dictionary counts.

    :param dictionary: mapping of lower case words to counts, queried with
        .get(word, 0).
    """
    # The log contains words from the texts, so the encoding must be given
    # explicitly instead of depending on the locale, as for all other
    # output files.
    with io.open('hyphchanges.txt', 'w', encoding='utf8') as out:
        for orig in sorted(HYPHCHANGES):
            new, opts, eol = HYPHCHANGES[orig]
            counts = OrderedDict((a.lower(), dictionary.get(a.lower(), 0))
                    for a in opts)
            out.write('%s %s{%s}\n' % (
                    new if new == orig else '%s => %s' % (orig, new),
                    '(eol) ' if eol else '',
                    ', '.join('%r: %s' % (a, b) for a, b in counts.items())))
def readcorrections():
    """Read the parallel files describing manual corrections.

    Reads 'manual-patterns.txt' and 'manual-corrections.txt' from the
    current directory. Both contain lines of the form
    filename:linenumber:text, in the same order. The result is stored in
    the global CORRECTIONS as filename => [(lineno, pattern, correction)].

    :raises ValueError: if the files do not have the same number of lines,
        a line does not have three fields, a line number is not an integer,
        or the filename or line number of a pair of lines differ.
    """
    with io.open('manual-patterns.txt', encoding='utf8') as inp:
        before = inp.read().splitlines()
    with io.open('manual-corrections.txt', encoding='utf8') as inp:
        after = inp.read().splitlines()
    # Explicit checks instead of assert statements, which are skipped when
    # Python runs with -O.
    if len(before) != len(after):
        raise ValueError('manual-patterns.txt has %d lines, but '
                'manual-corrections.txt has %d' % (len(before), len(after)))
    for entry, (orig, fix) in enumerate(zip(before, after), 1):
        pattern = orig.split(':', 2)
        correction = fix.split(':', 2)
        if len(pattern) != 3 or len(correction) != 3:
            raise ValueError('entry %d is not of the form '
                    'filename:linenumber:text: %r / %r' % (entry, orig, fix))
        filename1, lineno1, text1 = pattern
        filename2, lineno2, text2 = correction
        if filename1 != filename2 or lineno1 != lineno2:
            raise ValueError('entry %d: pattern and correction refer to '
                    'different lines: %s:%s vs %s:%s' % (
                    entry, filename1, lineno1, filename2, lineno2))
        CORRECTIONS.setdefault(filename1, []).append(
                (int(lineno1), text1, text2))
def main():
    """Parse CLI arguments, load the dictionary, and run the conversion.

    Reads word counts from 'sonar-word.freqsort.lower.gz' in the current
    directory and a word list from '/usr/share/dict/dutch'. Exits with
    status 2 on invalid options.
    """
    try:
        opts, args = gnu_getopt(
                sys.argv[1:], 'h', ('help', 'batch', 'manual', 'paratext='))
    except GetoptError as err:
        print('error:', err, file=sys.stderr)
        print(__doc__)
        sys.exit(2)
    opts = dict(opts)
    # -h is accepted by the option parser, so honor it like --help.
    if '--help' in opts or '-h' in opts:
        print(__doc__)
        return
    if '--batch' not in opts and not args:
        print('reading from stdin; run preprocess.py --help for help.',
                file=sys.stderr)
        sys.stdout.flush()
    # Sonar 500 corpus word counts (case folded)
    dictionary = pandas.read_table('sonar-word.freqsort.lower.gz',
            encoding='utf8', index_col=0, header=None)
    # List of Dutch words from the OpenTaal project, version 2.10-2
    with io.open('/usr/share/dict/dutch', encoding='utf8') as inp:
        wordlist = {a.rstrip().lower() for a in inp}
    # Words that are only in the word list get a count of 1. Index.union()
    # is the explicit set union; the | operator on an Index is no longer a
    # set operation in pandas 2.
    dictionary = dictionary[1].reindex(  # pylint: disable=no-member
            index=dictionary.index.union(pandas.Index(sorted(wordlist))),
            fill_value=1)
    # Manual corrections (format is output of grep)
    if '--manual' in opts:
        readcorrections()
    if '--paratext' in opts:
        # line numbers of front and back matter
        if opts['--paratext'].endswith(('.xlsx', '.xls')):
            paratext = pandas.read_excel(opts['--paratext'], index_col='Label')
        else:
            paratext = pandas.read_csv(opts['--paratext'], index_col='Label')
        # drop texts for which no start line is given
        paratext = paratext[~paratext['start'].isnull()]
    else:
        paratext = None
    if '--batch' in opts:
        batch(args, dictionary, paratext)
        writehyphchanges(dictionary)
    else:
        single(args, dictionary, paratext)
# Run the command line interface only when executed as a script,
# not when imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment