Last active
February 8, 2022 09:28
-
-
Save andreasvc/889c4acb3e9a77f44e91 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: UTF-8 -*- | |
"""Preprocessing of text files. | |

Writes one paragraph per line, and normalizes punctuation & whitespace.
No sentence or word tokenization.

Usage: preprocess.py [FILE]
  or:  preprocess.py --batch FILES...

By default, produce cleaned version given a single filename to standard output.
Diagnostic information is written to standard error.

Options:
  --batch
      Enable batch mode: convert all files and write them to a directory
      called "cleaned/" (must not exist yet). Hyphenation changes will be
      written to a logfile 'hyphchanges.txt'.
  --paratext FILE
      Specify a CSV file or Excel sheet with line numbers of front and back
      matter of each text in columns "start" and "end"; filenames in column
      "Label"; front and back matter will be removed and saved to a
      separate directory.
  --manual
      Apply corrections specified in files manual-patterns.txt and
      manual-corrections.txt. These files contain lines of the form
      filename:linenumber:text (same format as output of grep -n). The text
      in manual-patterns.txt (which can be part of a line) will be replaced
      with the corresponding text in manual-corrections.txt (i.e., both
      files must be in the same order).
""" | |
from __future__ import print_function, unicode_literals, division | |
import io | |
import os | |
import sys | |
from getopt import gnu_getopt, GetoptError | |
from itertools import islice, product # cartesian product | |
from collections import Counter, OrderedDict | |
import numpy | |
import pandas | |
try: | |
import re2 as re | |
except ImportError: | |
import re | |
# Manual corrections; filled in by readcorrections(), applied by readtext().
CORRECTIONS = {}  # filename => [(lineno, orig, repl), ...]
# Log of hyphenation decisions; filled in by dehyphenate().
HYPHCHANGES = {}  # original => (dehyphenated, [alt1, alt2, ...], eol)
# In Dutch, a hyphen between these combinations of letters cannot be removed
VOWELCLASHES = set('aa ae ee ie oe ai ei oi ui oo au eu ou uu ii ij'.split())
INDENTRE = re.compile(r'^\s+')
# Single-codepoint ligatures and their multi-letter expansions. The keys are
# written as escapes so that they survive editors and tools that apply
# Unicode compatibility normalization (which would silently turn each key
# into its own expansion).
LIGATURES = {
    '\ufb01': 'fi',  # LATIN SMALL LIGATURE FI
    '\ufb00': 'ff',  # LATIN SMALL LIGATURE FF
    '\ufb02': 'fl',  # LATIN SMALL LIGATURE FL
    '\ufb03': 'ffi',  # LATIN SMALL LIGATURE FFI
    '\ufb04': 'ffl',  # LATIN SMALL LIGATURE FFL
    '\u0132': 'IJ',  # LATIN CAPITAL LIGATURE IJ
    '\u0133': 'ij',  # LATIN SMALL LIGATURE IJ
    '\u00c6': 'AE',  # LATIN CAPITAL LETTER AE
    '\u00e6': 'ae',  # LATIN SMALL LETTER AE
}
# contractions of genitive determiner 'des' (e.g., 's avonds, 's nachts, etc)
TEMPORALCONTRACTIONS = set('ochtends morgens middags avonds nachts winters '
        'zomers zaterdags zondags maandags dinsdags woensdags donderdags vrijdags '
        'lands werelds hemelsnaam namiddags'.split())
# Acronyms that clean() converts back to upper case.
ACRONYMS = set('fbi cia dea dna dhs ups pcp suv kgb navo swat '
        'seal bmw csi dos mpeg hd usb gsm fsc klpd psv aex kno np tec '
        'swot gba bbc avro adsl rsa'.split())
def single(args, dictionary, paratext):
    """Clean a single text and print the result to standard output.

    :param args: positional command line arguments; the first one is used as
        the filename. When empty, '/dev/stdin' is read instead.
    :param dictionary: word counts, passed on to clean().
    :param paratext: table of front/back matter line numbers or None,
        passed on to clean()."""
    source = args[0] if args else '/dev/stdin'
    print(clean(source, dictionary, paratext))
def batch(args, dictionary, paratext):
    """Clean every file in ``args`` and write the results to 'cleaned/'.

    Creates the directory 'cleaned/' (and 'paratext/' when a paratext table
    is given); os.mkdir fails if they already exist. Each cleaned text is
    written as UTF-8 under the basename of its input file. Diagnostics go to
    standard error.

    :param args: sequence of filenames (bytes filenames are decoded with
        the filesystem encoding).
    :param dictionary: word counts; must support len() and .sum().
    :param paratext: table of front/back matter line numbers, or None."""
    os.mkdir('cleaned/')
    if paratext is not None:
        os.mkdir('paratext/')
    print('Dictionary has %d types and a total frequency mass of %d tokens.'
            % (len(dictionary), dictionary.sum()), file=sys.stderr)
    if CORRECTIONS:
        # report mismatches between the manual corrections and the input
        given = set(os.path.basename(a) for a in args)
        corrected = set(CORRECTIONS)
        print('Corrections without corresponding files:',
                ' '.join(sorted(corrected - given)) or '{}',
                '\n\nFiles with no corrections:',
                ' '.join(sorted(given - corrected)) or '{}',
                file=sys.stderr)
    for filename in args:
        if isinstance(filename, bytes):
            filename = filename.decode(sys.getfilesystemencoding())
        text = clean(filename, dictionary, paratext)
        target = 'cleaned/%s' % os.path.basename(filename)
        with io.open(target, 'w', encoding='utf8') as out:
            out.write(text)
        # list the characters beyond '~' (i.e., non-ASCII), most frequent first
        special = [char for char, _ in Counter(text).most_common()
                if char > '~']
        print('Special characters:', ''.join(special), file=sys.stderr)
def readtext(filename, debug=False):
    """Read a file, decode it, and apply any manual corrections.

    The contents are decoded as UTF-8; when that fails, as windows-1252.
    If neither works, the UnicodeDecodeError of the second attempt
    propagates.

    :param filename: path of the file to read.
    :param debug: if True, print each applied correction to standard error.
    :returns: the decoded text, with the entries of CORRECTIONS registered
        under the file's basename applied.
    :raises ValueError: when a correction refers to a line number outside
        the file, or its pattern does not occur on the indicated line."""
    with open(filename, 'rb') as inp:
        data = inp.read()
    try:  # we are optimistic
        text = data.decode('utf8')
    except UnicodeDecodeError:
        text = data.decode('windows-1252')
    fname = os.path.basename(filename)
    if fname not in CORRECTIONS:
        return text
    # text.splitlines() gives wrong results: it also splits on characters
    # such as form feed, so line numbers would no longer agree with grep.
    lines = io.StringIO(text).readlines()
    for lineno, orig, fix in CORRECTIONS[fname]:
        if not 1 <= lineno <= len(lines):
            raise ValueError('cannot apply correction, line number out of '
                    'range:\n%s:%s: %r => %r\nfile has %d lines' % (
                    fname, lineno, orig, fix, len(lines)))
        current = lines[lineno - 1]
        if orig not in current:
            raise ValueError('cannot apply correction, pattern not found:'
                    '\n%s:%s: %r => %r\nactual line: %r' % (
                    fname, lineno, orig, fix, current))
        lines[lineno - 1] = current.replace(orig, fix)
        if debug:
            print(fname, lineno, '\n', orig, '\n', fix, '\n',
                    lines[lineno - 1], '\n', file=sys.stderr)
    return ''.join(lines)
def stripparatext(filename, text, paratext):
    """Strip front and back matter using given table of line numbers.

    Front and back matter is written to the (existing) directory
    'paratext/' as '<label>_front.txt' and '<label>_back.txt';
    the remaining core text is returned for further processing.

    :param filename: path of the text; the basename up to the first '.' is
        the label that is looked up in the index of ``paratext``.
    :param text: the full text as a single string.
    :param paratext: pandas DataFrame indexed by label, with 1-based,
        inclusive line numbers of the core text in columns 'start' and 'end'.
    :returns: lines ``start`` through ``end`` of the text.
    :raises KeyError: when the label does not occur in ``paratext``."""
    label = os.path.basename(filename).split('.')[0]
    # label-based lookup; DataFrame.ix no longer exists in current pandas
    row = paratext.loc[label]
    start = int(row['start']) - 1
    end = int(row['end']) - 1
    # text.splitlines() gives wrong results
    lines = io.StringIO(text).readlines()
    with io.open('paratext/%s_front.txt' % label, 'w',
            encoding='utf8') as out:
        out.writelines(lines[:start])
    with io.open('paratext/%s_back.txt' % label, 'w',
            encoding='utf8') as out:
        out.writelines(lines[end + 1:])
    return ''.join(lines[start:end + 1])
def clean(filename, dictionary, paratext):
    """Run the complete cleaning pipeline on one file.

    :param filename: path of the text to read; also echoed to standard error.
    :param dictionary: word counts used for dehyphenation.
    :param paratext: table of front/back matter line numbers, or None to
        keep the whole text.
    :returns: the cleaned text, one paragraph per line."""
    print(filename, file=sys.stderr)
    text = readtext(filename)
    if paratext is not None:
        text = stripparatext(filename, text, paratext)

    # Get rid of carriage returns, keep line feeds. When the text has no
    # line feeds at all, the carriage returns become the line feeds.
    text = text.replace('\r', '' if '\n' in text else '\n')

    text = simplifyunicodespacepunct(expandligatures(text))
    # normalize dashes
    text = re.sub('--+', '-', text)
    # replace square brackets because Alpino uses them for bracketed input
    text = text.replace('[', '(').replace(']', ')')
    text = fixellipses(text)

    # detect fixed-width formatting (before blank lines are removed)
    hardbreak = detecthardbreaks(io.StringIO(text))
    # NB: double-spaced lines are collapsed unconditionally.
    text = text.replace('\n\n', '\n')

    # remove page numbers and hyphenation
    text = pagenumbers(text)
    text = dehyphenate(text, dictionary, threshold=40, allhyphens=True)
    # remove separators, empty lines
    text = re.sub(r'\n[ \t]*(==+|\*+|\*(?: \*)*|~|\.\.|)[ \t]*(?=\n)',
            '\n', text)

    # restore paragraphs if hard line breaks are detected
    if hardbreak is not None:
        paraindent = detectindent(io.StringIO(text))
        print('Removing hard line breaks; threshold=%d; indent=%d.' % (
                hardbreak, paraindent), file=sys.stderr)
        paragraphs = fixparagraphs(io.StringIO(text),
                threshold=hardbreak, indent=2 if paraindent else 0)
        text = '\n'.join(paragraphs)
    else:
        print('No hard line breaks detected.', file=sys.stderr)

    # add space after closing quotes and clause-ending punctuation
    text = fixpunctspacing(text)
    # capitalize acronyms (small caps may have been converted to
    # lower case)
    acronyms = '\\b(%s)\\b' % '|'.join(ACRONYMS)
    text = re.sub(acronyms, lambda x: x.group().upper(), text)
    text = fixcontractions(text)
    # space after dashes at start of line (dialogue/list)
    text = re.sub(r'\n[ \t]*-([^ ])', r'\n- \1', text)

    # normalize whitespace: no leading or trailing whitespace;
    # collapse spaces and tabs to single space
    for pattern, replacement in (
            ('\n[ \t]+', '\n'),
            ('[ \t]+\n', '\n'),
            ('[ \t]+', ' ')):
        text = re.sub(pattern, replacement, text)
    text = removespuriousparagraphbreaks(text)
    # one paragraph per line
    return re.sub('\n\n+', '\n', text)
def simplifyunicodespacepunct(text):
    """Turn various unicode whitespace and punctuation characters into simple
    ASCII equivalents where appropriate, and discard control characters.

    NB: this discards some information (e.g., left vs right quotes, dash vs
    hyphens), but given that such information is not consistently encoded
    across languages and texts, it is more reliable to normalize to a common
    denominator.

    :param text: a unicode string.
    :returns: the normalized string.

    >>> simplifyunicodespacepunct('‘De verraders’, riep de sjah.')
    "'De verraders', riep de sjah."
    """
    # Some exotic control codes not handled:
    # U+0085 NEL: Next Line
    # U+2028 LINE SEPARATOR
    # U+2029 PARAGRAPH SEPARATOR

    # Normalize spaces
    # U+00A0 NO-BREAK SPACE
    # U+2000 EN QUAD
    # U+2001 EM QUAD
    # U+2002 EN SPACE
    # U+2003 EM SPACE
    # U+2004 THREE-PER-EM SPACE
    # U+2005 FOUR-PER-EM SPACE
    # U+2006 SIX-PER-EM SPACE
    # U+2007 FIGURE SPACE
    # U+2008 PUNCTUATION SPACE
    # U+2009 THIN SPACE
    # U+200A HAIR SPACE
    text = re.sub('[\u00a0\u2000-\u200a]', ' ', text)

    # remove discretionary hyphen, soft space
    # special case: treat soft hyphen at end of line as a regular hyphen,
    # to ensure that it will be dehyphenated properly.
    text = re.sub('\u00ad+\n', '-\n', text)
    # 8 BACKSPACE ('\b' in a non-raw string is the backspace character)
    # U+00AD SOFT HYPHEN
    # U+200B ZERO WIDTH SPACE
    # U+2027 HYPHENATION POINT
    text = re.sub('[\b\u00ad\u200b\u2027]', '', text)

    # hyphens
    # U+00B7 MIDDLE DOT
    # U+2010 HYPHEN
    # U+2011 NON-BREAKING HYPHEN
    # U+2212 MINUS SIGN
    text = re.sub('[\u00b7\u2010\u2011\u2212]', '-', text)

    # dashes/bullet points
    # U+2012 FIGURE DASH
    # U+2013 EN DASH
    # U+2014 EM DASH
    # U+2015 HORIZONTAL BAR
    # U+2022 BULLET
    # U+2043 HYPHEN BULLET
    text = re.sub('[\u2012-\u2015\u2022\u2043]', ' - ', text)

    # U+2044 FRACTION SLASH
    # U+2215 DIVISION SLASH
    # NB: this needs re.sub; str.replace would look for the literal
    # five-character string including the square brackets.
    text = re.sub('[\u2044\u2215]', '/', text)  # e.g., 'he/she'

    # single quotes:
    # U+2018 left single quotation mark
    # U+2019 right single quotation mark
    # U+201A single low-9 quotation mark
    # U+201B single high-reversed-9 quotation mark
    # U+2039 single left-pointing angle quotation mark
    # U+203A single right-pointing angle quotation mark
    # U+02BC modifier letter apostrophe
    text = re.sub('[\u2018-\u201b\u2039\u203a\u02bc]', "'", text)

    # double quotes:
    # U+201C left double quotation mark
    # U+201D right double quotation mark
    # U+201E double low-9 quotation mark
    # U+201F double high-reversed-9 quotation mark
    # U+00AB left-pointing double angle quotation mark
    # U+00BB right-pointing double angle quotation mark
    # ASCII '<', '>' and two consecutive apostrophes are mapped as well.
    text = re.sub("[\u201c-\u201f\u00ab\u00bb<>]|''", '"', text)
    return text
def fixpunctspacing(text):
    """Insert missing spaces after closing quotes and after clause-ending
    punctuation; strip note numbers directly following a period.

    >>> fixpunctspacing("'Maar natuurlijk!'zei hij.")
    "'Maar natuurlijk!' zei hij."
    >>> fixpunctspacing('P.S.:zie www.nos.nl om 12:47 etc.Ongeever 42.7 %.')
    'P.S.: zie www.nos.nl om 12:47 etc. Ongeever 42.7 %.'
    >>> fixpunctspacing(' Hoezo,waarom.En toen. Dank u.Voor de tweede kans. ')
    ' Hoezo, waarom. En toen. Dank u. Voor de tweede kans. '
    >>> fixpunctspacing('NHK-collecteurs mogen niet ... het huis binnengaan.2')
    'NHK-collecteurs mogen niet ... het huis binnengaan.'
    """
    # Substitutions are applied in this order; '[^\W\d]' is \w minus digits.
    rules = (
        # ensure whitespace after closing quote
        (r"([.!?])(['\"])(\S)", r"\1\2 \3"),
        # comma between two words of at least two letters each
        (r'(\s[^\W\d][^\W\d]+,)([^\W\d][^\W\d]+)\b', r'\1 \2'),
        # period: "like.This" => "like. This"; the character before the
        # period may not be in A-Z and the one after may not be in a-z,
        # which preserves URLs, acronyms (P.S.) and numbers (1.5)
        (r'([^\W\dA-Z]\.)([^\W\da-z])', r'\1 \2'),
        # enumerations
        (r'(\d+\.)([^\W\da-z])', r'\1 \2'),
        # remove footnotes/endnotes
        (r'([^\W\dA-Z]\.)\d+', r'\1'),
        # colon
        (r':([^\W\d])', r': \1'),
        # other punctuation
        (r'([?!;])(\w)', r'\1 \2'),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text, flags=re.UNICODE)
    return text
def fixellipses(text):
    """Normalize ellipses to three periods with consistent spacing.

    >>> fixellipses('Er was eens . . . een prin...')
    'Er was eens ... een prin... '
    >>> fixellipses('Dus. ... en toen.')
    'Dus. <<< ... en toen.'
    """
    # spaced-out periods and the single-character ellipsis become '...'
    text = text.replace('. . .', '...').replace('…', '...')
    # space before an ellipsis, unless preceded by period, word char or space
    text = re.sub(r'(?<![\.\w ])\.\.\.', ' ...', text)
    # space after an ellipsis, unless followed by a period or space
    text = re.sub(r'\.\.\.(?![\. ])', '... ', text)
    # ellipsis at start of new sentence erroneously gets merged with any
    # preceding sentence ending punctuation during tokenization, so insert
    # '<<<' as a separator to be removed after tokenization.
    return re.sub(r'([.!?;]) \.\.\.', r'\1 <<< ...', text)
def expandligatures(text):
    r"""Expand single unicode ligatures into multiple ascii characters.

    Every key of LIGATURES that occurs in the text is replaced by its value.
    The keys are matched as an alternation of escaped literals (longest
    first), so the lookup of the matched text in LIGATURES cannot fail, also
    when a key is longer than one character.

    :param text: a unicode string.
    :returns: the text with all ligatures expanded.

    >>> expandligatures('\ufb01losoof')
    'filosoof'
    """
    if not LIGATURES:
        return text
    pattern = '|'.join(re.escape(key)
            for key in sorted(LIGATURES, key=len, reverse=True))
    return re.sub(pattern, lambda match: LIGATURES[match.group()], text)
def pagenumbers(text, threshold=50):
    """Remove page numbers from the text.

    Numbers of the form |23| are always removed. Lines consisting of only a
    number (optionally surrounded by dashes) are removed only when at least
    ``threshold`` such lines are found, to avoid removing chapter numbers.

    :param text: the text as a single string.
    :param threshold: minimum number of number-only lines required before
        any of them is removed.
    :returns: the text without page numbers."""
    # remove page numbers of the form |23| in running text;
    text = re.sub(r'\|[0-9]+\|', '', text)
    # Remove other page numbers only when on their own line.
    pagenumber = re.compile(r'\n+[\t ]*-*[\t ]*[0-9]+[\t ]*-*[\t ]*\n+[\f\v]*')
    # count matches, but stop looking once the threshold has been reached
    found = sum(1 for _ in islice(pagenumber.finditer(text), threshold))
    if found < threshold:
        return text
    return pagenumber.sub('\n', text)
def runningheads(text):
    """Strip running heads (file name, page number, time stamp).

    Examples of the four supported layouts::

        De stiefmoeder midprice.pdf 11
        03-09-12 14:12
        ---
        een diep gevoel van beschaving dat lieden elkaar slechts 9
        IJsland rev.indd | Sander Pinkse Boekproductie | 14-10-10 / 11:10 | Pag. 10
        ---
        Medusa 001-384: Medusa 001-384 04-04-2012 08:15 Pagina 6
        ---
        BW-Het geluid van de nacht-11e.indd 5 12-07-13 09:58

    :param text: the text as a single string.
    :returns: the text with matching running heads removed.
    """
    rules = (
        # '<title>.pdf <page>' followed by a line with date and time
        (r'.*\.pdf \d+\n\d{1,2}-\d{1,2}-\d{2,4} \d{1,2}:\d{2}\n',
            '\n'),
        # '... | <date> / <time> | Pag. <page>', together with a page
        # number at the end of the preceding line
        (r'(?: \d+)?\n'
            r'.* \| \d{2}-\d{2}-\d{2} / \d{2}:\d{2} \| Pag\. \d+\b',
            ''),
        # '... <date> <time> Pagina <page>'
        (r'\n.* \d{1,2}-\d{1,2}-\d{2,4} +\d{1,2}:\d{2} +Pagina \d+',
            ''),
        # '<title>.indd <page> <date> <time>'
        (r'\n.*\.indd \d+ +\d{1,2}-\d{1,2}-\d{2,4}[ \n]+'
            r'\d{1,2}:\d{2}(?::\d\d)?',
            ''),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    # NB: removal of arbitrary repeating lines (lines that are identical
    # except for digits) is not implemented: such lines can be detected by
    # counting lines with digits replaced by a wildcard, but they should be
    # removed in chunks, together with any page numbers on the preceding
    # line.
    return text
def vowelclash(seq):
    """Test whether joining words without a hyphen is not allowed.

    :param seq: sequence alternating between words (even positions) and
        separators (odd positions); a separator is '' (joined) or '-'.
    :returns: True when two words that are joined directly go from lower to
        upper case, have a digit at the boundary, or have letters at the
        boundary that form one of the VOWELCLASHES.

    >>> vowelclash(['zee', '', 'egel'])
    True
    >>> vowelclash(['zee', '-', 'egel'])
    False
    >>> vowelclash(['zee', '', 'paard'])
    False
    >>> vowelclash(['0', '', '2'])
    True"""
    # visit each separator together with the words on both sides of it
    for pos in range(1, len(seq) - 1, 2):
        if seq[pos] != '':
            continue
        last, first = seq[pos - 1][-1], seq[pos + 1][0]
        if ((last.islower() and first.isupper())
                or last.isdigit() or first.isdigit()
                or (last + first).lower() in VOWELCLASHES):
            return True
    return False
def dehyphenate(text, dictionary, threshold=0, allhyphens=False, debug=False):
    r"""Remove line breaks due to hyphenation and remove the hyphen where
    appropriate.

    The decision is based on a dictionary of word counts, with the word
    counts of the text itself as back off. In the absence of data, a hyphen
    within a line is left unchanged and a hyphen at the end of a line is
    removed. Hyphens next to numbers are always left unchanged.
    Every decision is recorded in the module-level dict HYPHCHANGES.

    :param text: the text as a single string.
    :param dictionary: word counts for lower-cased words; only its
        ``.get(word, default)`` method is used.
    :param threshold: if > 0, only consider hyphens at the end of line when the
        line is at least this long.
    :param allhyphens: if True, hyphens not at the end of a line are also
        considered for removal.
    :param debug: if True, print each decision with the counts of all
        alternatives to standard error.
    :returns: the text with hyphenation removed where appropriate.

    >>> d = Counter({'geweldig': 2, '06-nummer': 3,
    ...         'noord-holland': 5, '123': 5, '1-2-3': 1})
    >>> print(dehyphenate('Ge-wel-dig 06-nummer, Noord-Hol-\nland 1-2-3',
    ...         d, allhyphens=True))
    Geweldig 06-nummer, Noord-Holland 1-2-3
    """
    def repl(match, eol=False):
        """Produce replacement for hyphenated match.

        Replacement will not have newlines, and hyphens are removed based on
        word frequencies. If the word has more than one hyphen, all
        possibilities are considered.

        :param match: regex match with the first part of the word in group
            1, the hyphen (plus any line break) in group 2, and the rest of
            the word in group 3.
        :param eol: whether this an end-of-line hyphen.
            If True, will default to removing the hyphen in the absence of
            evidence.
        """
        # try dehyphenated variants first, or try hyphenated ones first
        sep = ('', '-') if eol else ('-', '')
        # the hyphenated word, but without newlines
        hyphenated = '%s-%s' % match.group(1, 3)
        # the exact form, including hyphens and newlines
        original = ''.join(match.group(1, 2, 3))
        # all possibilities of removing hyphens or not, e.g.:
        # 'Noord-Hol-\nland'
        # => ['NoordHolland', 'NoordHol-land', 'Noord-Holland', 'Noord-Hol-land']
        # 'abra-ca-dabra'
        # => ['abracadabra', 'abraca-dabra', 'abra-cadabra', 'abra-ca-dabra']
        components = re.findall(r'[^-]+|-+', hyphenated)
        if len(components) <= 15:  # i.e., max 7 hyphens.
            # vowelclash() filters out the variants in which removing a
            # hyphen is not allowed; the variant that keeps every hyphen
            # always remains.
            options = [''.join(x) for x in product(*[
                    sep if a[0] == '-' else (a, ) for a in components])
                    if not vowelclash(x)]
            # use the one with the highest frequency; when none of the options
            # is in the dictionary, use the first option, determined by 'eol'.
            result = max(options, key=lambda x: dictionary.get(x.lower(), 0)
                    or thistext.get(x.lower(), 0))
            HYPHCHANGES[original] = result, options, eol
            if debug:
                print(original, result, eol,
                        [(x, dictionary.get(x.lower(),
                            1 if x.lower() in thistext else 0))
                            for x in options], file=sys.stderr)
            return result
        # too many hyphens to enumerate: only remove the line break
        return hyphenated

    def repleol(match):
        """Decide whether to dehyphenate a given end-of-line match."""
        # a hyphen may indicate the end of paragraph, when a sentence is
        # interrupted; therefore, leave line alone if the second part is
        # capitalized, unless it is common token in the dictionary
        # (geographical names).
        hyphenated = ('%s-%s' % match.group(1, 3)).lower()
        if (match.group(1)[-1].islower() != match.group(3)[0].islower()
                and match.group(1) != 'Mc'
                and (dictionary.get(hyphenated, 0)
                    or thistext.get(hyphenated, 0)) < 10):
            return match.group()
        # Test if the length of this line meets the threshold
        a = match.string.find('\n', match.start(2), match.start(2) + threshold)
        b = match.string.rfind(
                '\n', max(0, match.start() - threshold), match.start())
        if a != -1 and b != -1 and a - b < threshold:
            return match.group()
        if match.group(4) is None:
            return repl(match, eol=True)
        # add original whitespace after hyphen and padding so that paragraph
        # detection based on line length is not affected; add padding after
        # first word to avoid padding being taken for start of paragraph
        # indent.
        # Before        After  (where A, -, B, C are groups 1-4)
        # ....A-        ....AB
        # B C D.        C      D.
        return '%s%s%s%s' % (repl(match, eol=True), match.group(2).lstrip('-'),
                (match.group(4) or '')[1:], ' ' * len(match.group(1)))

    # collect word counts from this text that will be used as back off for the
    # main dictionary.
    thistext = pandas.Series(Counter(
            re.findall(r'[\w-]+', text.lower(), flags=re.UNICODE)))
    if allhyphens:
        # match hyphenated words on a single line of the form "A-B"
        # (but not "A- B" or "A -B" or "A--B")
        text = re.sub(r'\b(\w\w+)(-)(\w[-\w]+)\b', repl, text, flags=re.UNICODE)
    # match hyphenated words at the end of lines: A-\nB
    text = re.sub((
            '(\\w[-\\w]+)'  # (1) hyphenated word at end of line
            '(-[\\t ]*\\n[\\t ]*)'  # (2) hyphen, any whitespace
            '([-\\w\']+)'  # (3) 2nd part of hyphenated word on next line
            '( [-\\S]+ )?'),  # (4) subsequent word
            repleol, text, flags=re.UNICODE)
    return text
def detecthardbreaks(lines, threshold=0.8, maxlen=100):
    """Use histogram of line lengths to determine whether at least 'threshold'
    fraction of lines have a length between 0 and the most common line length,
    indicating hard line breaks (i.e., line breaks that do not signify a
    paragraph break but are used for formatting). When the most common line
    length is greater than 'maxlen', assume there is no fixed-width formatting.

    :param lines: iterable of lines; lengths include any line terminator.
    :param threshold: minimum fraction of the counted lines that must be no
        longer than the most common line length plus a margin.
    :param maxlen: the most common line length must be below this value.
    :returns: None when no hard line breaks are detected (including when
        there are no lines of suitable length); otherwise the most common
        line length minus a margin, for use as paragraph threshold.
    """
    minlinelength = 20  # Ignore lines of at most 20 chars (typical of spaces)
    maxlinelength = 1900  # Discard larger than this to stay in range
    binwidth = 10  # margin around the most common line length
    # Build the line length histogram
    hist = Counter()
    numlines = 0
    for line in lines:
        length = len(line)
        if minlinelength < length < maxlinelength:
            hist[length] += 1
            numlines += 1
    if not numlines:
        # Nothing to base a decision on. Return None (not False), so that
        # callers testing 'is None' treat this as 'no hard line breaks'.
        return None
    # create a bucket for the interval (0, mode + binwidth]
    mode = max(hist, key=hist.get)
    freqmass = sum(hist[a] for a in hist
            if 0 < a <= mode + binwidth)
    proportion = freqmass / float(numlines)
    # are at least 'threshold' fraction of lines in biggest bucket?
    if mode < maxlen and proportion >= threshold:
        print('line length counts:', hist, file=sys.stderr)
        print('lines in [0, %d + %d]: %d of %d (%g >= %g)' % (
                mode, binwidth, freqmass, numlines,
                proportion, threshold), file=sys.stderr)
        return mode - binwidth
    print('freqmass %g; mode %d; proportion %g' % (
            freqmass, mode, proportion),
            file=sys.stderr)
    return None
def indentlen(line):
    """Return the width of the leading whitespace of a line.

    Each space counts as 1 and each tab as 8; other whitespace characters
    matched by INDENTRE do not contribute to the result."""
    leading = INDENTRE.match(line)
    if leading is None:
        return 0
    whitespace = leading.group()
    return whitespace.count(' ') + 8 * whitespace.count('\t')
def detectindent(lines, threshold=0.05):
    """Determine whether the text appears to use indentation as paragraph
    markers. Returns 0 if no suitable indent is detected.

    The absolute differences in indentation between consecutive lines are
    counted; the counts are printed to standard error. Only the five most
    frequent differences are considered.

    :param lines: iterable of lines.
    :param threshold: minimum proportion of lines that should have the
        indentation.
    :returns: the relative indent that indicates a paragraph start."""
    x = numpy.array([indentlen(line) for line in lines])
    hist = pandas.Series(x[:-1] - x[1:]).abs().value_counts()
    print('indent counts:', Counter(hist.to_dict()), file=sys.stderr)
    total = hist.sum()
    # head(5) selects by position, independent of the integer index labels;
    # Series.items() replaces iteritems(), which is gone from current pandas.
    for a, b in hist.head(5).items():
        if a > 1 and b > threshold * total:
            return a
    return 0
def fixparagraphs(lines, threshold=45, indent=0):
    """Yield paragraphs given a list of fixed-width lines.

    The lines of a paragraph are stripped and joined with single spaces.

    :param threshold: A paragraph ends with a line that is shorter than
        the given threshold.
    :param indent: A new paragraph also starts when the indentation of
        the current line is `indent` spaces or more (a tab equals 8 spaces).
        A value of 0 disables this feature. Assumes that any margins have
        already been trimmed.

    >>> list(fixparagraphs([  # doctest: +ELLIPSIS
    ...     'A new paragraph also starts',
    ...     'when the current line is ',
    ...     'indented by at least this ',
    ...     'number of spaces (a tab equals',
    ...     '8 spaces).',
    ...     'A value < 0 disables this ',
    ...     'feature.'], threshold=20))
    ['A new paragraph ... equals 8 spaces).', 'A value ... this feature.']
    >>> list(fixparagraphs([  # doctest: +ELLIPSIS
    ...     '   A new paragraph also ',
    ...     'starts when the current ',
    ...     'line is indented by at ',
    ...     'least this number of spaces ',
    ...     '(a tab equals 8 spaces).',
    ...     '   A value < 0 disables this ',
    ...     'feature.'], threshold=20, indent=3))
    ['A new paragraph ... equals 8 spaces).', 'A value ... this feature.']
    """
    pending = []  # stripped lines of the paragraph being collected
    for line in lines:
        content = line.strip()
        if indent and indentlen(line) >= indent:
            # indented line: flush what came before, start a new paragraph
            if pending:
                yield ' '.join(pending)
            pending = [content]
            continue
        pending.append(content)
        if len(line) < threshold:
            # shorter than threshold; end of paragraph
            yield ' '.join(pending)
            pending = []
    if pending:
        yield ' '.join(pending)
def fixcontractions(text):
    """Repair spacing around 's: the contracted Dutch genitive determiner
    'des', and the plural marker of acronyms.

    >>> fixcontractions("Het was 'savonds laat.")
    "Het was 's avonds laat."
    >>> fixcontractions("Een enorme berg TFT 's en CRT 's.")
    "Een enorme berg TFT's en CRT's."
    >>> fixcontractions("Bedenk 's wat nieuws.")
    "Bedenk 's wat nieuws."
    """
    words = '|'.join(TEMPORALCONTRACTIONS)
    # Separate contractions of genitive determiner 'des'
    # ('s avonds, 's nachts, etc).
    separate = "('s)(%s)" % words
    # Attach plural markers to preceding acronyms, unless the 's belongs to
    # a following contraction. This cannot be done for all capitalized
    # words, e.g. "Bedenk 's wat nieuws".
    attach = "\\b([A-Z]+) +'s\\b(?! (?:%s))" % words
    text = re.sub(separate, r'\1 \2', text, flags=re.IGNORECASE)
    return re.sub(attach, r"\1's", text, flags=re.UNICODE)
def removespuriousparagraphbreaks(text, minlength=100):
    r"""Texts may contain spurious line breaks inside paragraphs, especially
    texts without fixed-width formatting. This function applies the
    heuristic of removing paragraph breaks without sentence ending punctuation.

    A line break is replaced by a space when the line before it has more
    than ``minlength`` characters and does not end in one of ``.!?:;'")-``,
    and the line after it does not start with a quote or dash.

    :param text: the text, one (candidate) paragraph per line.
    :param minlength: minimum number of characters that must precede the
        last character of a line for its line break to be removed; avoids
        affecting titles and other non-paragraphs.
    :returns: the text with spurious breaks replaced by spaces.

    >>> removespuriousparagraphbreaks(
    ...     'Once upon a time, in a land far far away, there was\n'
    ...     'a king.', minlength=20)
    'Once upon a time, in a land far far away, there was a king.'
    >>> removespuriousparagraphbreaks(
    ...     'Once upon a time, in a land far far away, --- wait!\n'
    ...     'Not this again.', minlength=20)
    'Once upon a time, in a land far far away, --- wait!\nNot this again.'
    """
    # Use a threshold to avoid affecting titles and other non-paragraphs.
    # Avoid merging paragraphs when 2nd part starts with dialogue/list dash.
    pattern = r'(?<=[^\n]{%d}[^.!?:;\'")\n-])\n([^\n\'-])' % minlength
    return re.sub(pattern, r' \1', text)
def writehyphchanges(dictionary):
    """Write a logfile of hyphenation changes.

    Writes one line to 'hyphchanges.txt' for every entry of HYPHCHANGES, in
    sorted order: the original and the chosen form (or just the form when
    nothing changed), an '(eol)' marker for end-of-line hyphens, and each
    alternative with its count in the dictionary.

    :param dictionary: word counts for lower-cased words; only its
        ``.get(word, default)`` method is used."""
    # Explicit encoding, like all other output of this module: the words
    # may contain non-ASCII characters, which the locale's default encoding
    # is not guaranteed to handle.
    with io.open('hyphchanges.txt', 'w', encoding='utf8') as out:
        for orig in sorted(HYPHCHANGES):
            new, opts, eol = HYPHCHANGES[orig]
            opts = OrderedDict((a.lower(), dictionary.get(a.lower(), 0))
                    for a in opts)
            out.write('%s %s{%s}\n' % (
                    new if new == orig else '%s => %s' % (orig, new),
                    '(eol) ' if eol else '',
                    ', '.join('%r: %s' % (a, b) for a, b in opts.items())))
def readcorrections():
    """Read the parallel files describing manual corrections.

    Reads 'manual-patterns.txt' and 'manual-corrections.txt' from the
    current directory. Line n of both files must be of the form
    filename:linenumber:text with identical filename and line number.
    The result is added to the module-level dict CORRECTIONS.

    :raises ValueError: when the files differ in length, a line is not of
        the expected form, or corresponding lines do not refer to the same
        filename and line number."""
    def readlines(filename):
        """Return the lines of a file without their line feeds."""
        # Do not use splitlines(): it also splits on characters such as
        # form feed that may be part of a correction.
        with io.open(filename, encoding='utf8') as inp:
            return [line.rstrip('\n') for line in inp]

    before = readlines('manual-patterns.txt')
    after = readlines('manual-corrections.txt')
    # explicit checks instead of assert, which is disabled by python -O
    if len(before) != len(after):
        raise ValueError('manual-patterns.txt has %d lines, but '
                'manual-corrections.txt has %d lines.' % (
                len(before), len(after)))
    for n, (orig, fix) in enumerate(zip(before, after), 1):
        try:
            filename1, lineno1, text1 = orig.split(':', 2)
            filename2, lineno2, text2 = fix.split(':', 2)
        except ValueError:
            raise ValueError('line %d of manual corrections is not of the '
                    'form filename:linenumber:text\n%r\n%r' % (n, orig, fix))
        if filename1 != filename2 or lineno1 != lineno2:
            raise ValueError('line %d of manual corrections: pattern and '
                    'correction refer to different locations: %s:%s vs. %s:%s'
                    % (n, filename1, lineno1, filename2, lineno2))
        CORRECTIONS.setdefault(filename1, []).append(
                (int(lineno1), text1, text2))
def main():
    """Load dictionary and parse CLI arguments.

    Reads the word counts from 'sonar-word.freqsort.lower.gz' in the
    current directory and a word list from '/usr/share/dict/dutch', then
    runs either batch() or single() depending on the options. Exits with
    status 2 on invalid options."""
    try:
        opts, args = gnu_getopt(
                sys.argv[1:], 'h', ('help', 'batch', 'manual', 'paratext='))
    except GetoptError as err:
        print('error:', err, file=sys.stderr)
        print(__doc__)
        sys.exit(2)
    opts = dict(opts)
    # '-h' is accepted by the option parser, so honor it like '--help'
    if '--help' in opts or '-h' in opts:
        print(__doc__)
        return
    if '--batch' not in opts and not args:
        print('reading from stdin; run preprocess.py --help for help.',
                file=sys.stderr)
        sys.stdout.flush()

    # Sonar 500 corpus word counts (case folded)
    dictionary = pandas.read_table('sonar-word.freqsort.lower.gz',
            encoding='utf8', index_col=0, header=None)
    # List of Dutch words from the OpenTaal project, version 2.10-2
    with io.open('/usr/share/dict/dutch', encoding='utf8') as inp:
        wordlist = {a.rstrip().lower() for a in inp}
    # Words that only occur in the word list get a count of 1.
    # Index.union is used because the '|' operator on an Index is no longer
    # a set operation in current pandas.
    dictionary = dictionary[1].reindex(  # pylint: disable=no-member
            index=dictionary.index.union(pandas.Index(sorted(wordlist))),
            fill_value=1)

    # Manual corrections (format is output of grep)
    if '--manual' in opts:
        readcorrections()

    if '--paratext' in opts:
        # line numbers of front and back matter
        if (opts['--paratext'].endswith('.xlsx')
                or opts['--paratext'].endswith('.xls')):
            paratext = pandas.read_excel(opts['--paratext'], index_col='Label')
        else:
            paratext = pandas.read_csv(opts['--paratext'], index_col='Label')
        # skip rows for which no start line is given
        paratext = paratext[~paratext['start'].isnull()]
    else:
        paratext = None

    if '--batch' in opts:
        batch(args, dictionary, paratext)
        writehyphchanges(dictionary)
    else:
        single(args, dictionary, paratext)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment