@Cilyan
Created March 15, 2015 03:30
A formatter that cleans a text's whitespacing around punctuation, quotes, parenthesis, brackets or curly brackets.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re # Oh yeah :)
class WhiteFormater:
"""
A formatter that cleans a text's whitespacing around punctuation,
quotes, parenthesis, brackets or curly brackets. There should be no
spaces before a punctuation or a closing delimiter, but at least a space
after them. There should be at least a space before an opening delimiter
and none after them. Quotes are handled in pair. The first encountered
is the opening one, the second is a closing one.
`strip`: If set, starting and trailing whitespaces are removed
`keependline`: If set, if the string contains an ending newline,
it will be preserved, even if `strip` is set.
`reduce_whitespace`: If set, multiple whitespaces are merged into one
single: a single space, except if the chain contains a tab. In
this case, the tab will be kept.
"""
    # List of tokens the lexer will understand: a name and its associated regex
    tokens = [
        ("word", re.compile(r"[\w\-_\d]+")),
        ("punct", re.compile(r"[\.,;:!?$]")),
        ("open", re.compile(r"[(\[\{]")),
        ("close", re.compile(r"[)\]\}]")),
        ("white", re.compile(r"\s+")),
        ("quote", re.compile(r'"'))
    ]
    def __init__(self, strip=True, keependline=True, reduce_whitespace=False):
        self.strip = strip
        self.keependline = keependline
        self.reduce_whitespace = reduce_whitespace
    def _do_reduce_whitespace(self, whitespace):
        """
        Merge a run of whitespace characters according to the settings
        and the priority of whitespace types.
        """
        # Do we need to take action?
        if len(whitespace) > 1 and self.reduce_whitespace:
            # Give higher priority to a tab
            if "\t" in whitespace:
                return "\t"
            else:
                return " "
        else:
            return whitespace
    def _tokenize(self, source, initpos=0):
        """
        This is the lexer. It is responsible for "eating" the source by
        matching tokens one after the other. It does only that. Tokens
        are yielded and will be processed by other functions.
        Objects yielded are 2-tuples containing the token name (or
        identifier) followed by the matched text.
        This format is retained by all filters, so that they can be
        chained.
        """
        pos = initpos
        end = len(source)
        # Till the end of the source
        while pos < end:
            # Try all the regexes to find one that matches
            for token_name, regex in self.tokens:
                match = regex.match(source, pos)
                if match:
                    # Advance the reading cursor to after the match
                    pos = match.end()
                    # Push the token and matched text to the parser
                    yield (token_name, match.group())
                    break
            else:
                # If no regex matches, this usually indicates that the
                # text contains a syntax error. In our case, it may just
                # be a character that is not a letter or a digit but is
                # still valid (e.g. / or + or *...). Just push a fake
                # "unknown" token that will not be taken into account.
                yield ("unknown", source[pos])
                # Advance just by one character in the source
                pos += 1
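    # Example (illustrative): _tokenize("a/b .") yields
    #     ("word", "a"), ("unknown", "/"), ("word", "b"),
    #     ("white", " "), ("punct", ".")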
    def _quote_sorter(self, tokenizer):
        """
        This is a filter that sorts quotes. The first one encountered is
        transformed into an opening delimiter, the second one into a
        closing one.
        """
        in_quote = False
        # Process all matched tokens
        for token, matched in tokenizer:
            # Transform quote tokens
            if token == "quote":
                if in_quote:
                    yield "close", matched
                else:
                    yield "open", matched
                in_quote = not in_quote
            # Other tokens are left untouched
            else:
                yield token, matched
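    # Example (illustrative): the quote tokens of '"hi"' come out as
    #     ("open", '"'), ("word", "hi"), ("close", '"')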
    def _correcter(self, tokenizer):
        """
        The main filter that cleans whitespace in the text.
        The main idea of the algorithm is that it looks at the previous
        and the next token to take a decision about the current token.
        """
        # At the beginning, there is no previous token
        prev_token, prev_matched = (None, None)
        # Initialise the current token with the first token.
        # Then, these "registers" will be shifted at each iteration.
        # ---
        # Note: if the string is empty, next() raises StopIteration,
        # which means our generator must also end. Since PEP 479 a
        # StopIteration leaking out of a generator body becomes a
        # RuntimeError, so we return explicitly instead of letting the
        # exception propagate to the caller.
        try:
            token, matched = next(tokenizer)
        except StopIteration:
            return
        # Iterate over tokens, but the one we fetch is actually the "next"
        # one relative to the token we are analysing. Then it is shifted.
        # So in a way, the for loop is one step ahead of the analysed token.
        for next_token, next_matched in tokenizer:
            if token == "white":
                # A whitespace is accepted if it is not after an opening
                # delimiter, and not before a punctuation mark or a closing
                # delimiter. It is also not accepted at the start of the
                # string if self.strip is set.
                if (
                    (prev_token != "open") and
                    (next_token not in ["close", "punct"]) and
                    ((prev_token is not None) or (not self.strip))
                ):
                    # Call _do_reduce_whitespace to merge whitespace if needed
                    yield token, self._do_reduce_whitespace(matched)
                # Else, reject the token
            elif token == "open":
                # If there was no whitespace before an opening delimiter,
                # we need to add one.
                if prev_token not in ["white", None]:
                    yield "white", " "
                # Output the opening delimiter itself
                yield token, matched
            elif token in ["close", "punct"]:
                # First output the delimiter
                yield token, matched
                # Then check that it is followed by a whitespace, else add one
                if next_token not in ["white", "close", "punct"]:
                    yield "white", " "
            else:
                # Other tokens ("word", "unknown") are passed through untouched
                yield token, matched
            # Shift the token reading position: previous = current, current = next
            prev_token, prev_matched = token, matched
            token, matched = next_token, next_matched
        # Handle the last token (remember, the for loop is one step ahead)
        if token != "white":
            # Only whitespace must be processed in the last token.
            # Is it? I think so. Maybe not... I don't see a corner case yet :)
            yield token, matched
        else:
            if self.strip:
                # If strip is set, we should remove the trailing whitespace,
                # but still retain the newline if the option is set.
                if matched.endswith("\n") and self.keependline:
                    yield "white", "\n"
            else:
                # If we do not strip, still check whether the option says to
                # drop the newline. Merge the whitespace if needed.
                if matched.endswith("\n") and not self.keependline:
                    yield "white", self._do_reduce_whitespace(matched[:-1])
                else:
                    yield "white", self._do_reduce_whitespace(matched)
    def format_line(self, source):
        """
        Format a single line of text, returning the cleaned text as a
        string.
        Note that multi-line text will probably result in badly cleaned
        text around the newlines. If `reduce_whitespace` is set, it will
        even merge lines together.
        """
        # Create the tokenizer by chaining the filters
        tokenizer = self._correcter(self._quote_sorter(self._tokenize(source)))
        # Join all the pieces together into a single string
        return "".join(matched for token, matched in tokenizer)
# Test functions using the formatter
def test_simple_string():
    s = (
        ' Hello ( this is )not a "proper " sentence : punctuation , '
        'and (parenthesis) or brackets like [and/or ] are not '
        '{ correctly}spaced :please correct " if you can " ! \n'
    )
    formatter = WhiteFormater()
    print(formatter.format_line(s))
    # Hello (this is) not a "proper" sentence: punctuation, and (parenthesis)
    # or brackets like [and/or] are not {correctly} spaced: please correct
    # "if you can"!
def test_file():
    formatter = WhiteFormater(reduce_whitespace=True)
    with open("test_in.txt", "r") as fin, open("test_out.txt", "w") as fout:
        for line in fin:
            fout.write(formatter.format_line(line))


if __name__ == "__main__":
    test_file()
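To try the formatter without providing a test_in.txt for test_file(), call test_simple_string() instead, or use the class directly from a REPL. A minimal sketch (the module name whiteformater is an assumption about how the file is saved):

>>> from whiteformater import WhiteFormater  # assumes the gist is saved as whiteformater.py
>>> WhiteFormater().format_line(' Hello ( this is )not a "proper " sentence !')
'Hello (this is) not a "proper" sentence!'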