incremental development of a script to tokenize DigitalNyssa
# Opens the file with the given filename for reading and puts the resultant
# file object in the variable `f`.
f = open("OCR Output linebreaks removed.txt")

# `f.read()` reads the file and returns a string.
# `.split()` splits that string on whitespace and returns a list of strings.
# `for A in B:` iterates over the list B and runs the indented block with each
# list item in the variable A.
for token in f.read().split():
    # in this case, each list item is put in the variable `token` and we just
    # print it on a line of its own.
    print(token)

# Technically, this is redundant as Python will close the file at the end of
# the script anyway. This is more important to do when writing files to make
# sure that writing to disk actually finishes.
f.close()

# Note this is printing to STDOUT so if you want the result in a file, you'll
# want to do something like `python3 tokenize_01.py > output.txt`. We _could_
# hardcode that output to a file of a particular name in this script but
# writing to STDOUT puts more flexibility in the hands of the user of the
# script in what they want to do with the output.

# We'll see an improved way of opening files in tokenize_02.py.
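Before moving on, if you want to see for yourself what `f.read().split()` produces, a sketch like this (the sample string is made up for illustration rather than read from the OCR file) shows the whitespace-based tokenization:

text = "Τοῦ ἐν ἁγίοις  πατρὸς ἡμῶν\nΓρηγορίου"  # made-up sample with spaces and a newline
tokens = text.split()                           # splits on any run of whitespace
print(tokens)  # ['Τοῦ', 'ἐν', 'ἁγίοις', 'πατρὸς', 'ἡμῶν', 'Γρηγορίου']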
# This time, we're going to use a `with` statement that will take care of
# certain things like file closing for us.

# This is the same as our earlier `f = open(...)` except that it executes
# the indented code that follows in a special context that knows what to do
# when it's finished or if there's an error.
with open("OCR Output linebreaks removed.txt") as f:
    for token in f.read().split():
        print(token)

# For the most part, this is the same as the previous script but is a little
# more idiomatic Python (in a good sense of "idiomatic", i.e. it's how the
# locals talk).
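Roughly speaking, the `with` block above behaves like the explicit try/finally sketched below (this is the general idea rather than exactly what Python does under the hood), which is why we no longer need to call `f.close()` ourselves:

f = open("OCR Output linebreaks removed.txt")
try:
    for token in f.read().split():
        print(token)
finally:
    # this runs whether or not an error occurred in the block above
    f.close()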
# Okay, now let's strip off punctuation and make sure we're normalizing to
# Unicode Normalization Form C. Maybe the input file was already normalized
# but there's no harm (other than a fraction of a second of extra work) in
# doing it here to be sure.

# Unicode normalization is done via `normalize` in the `unicodedata` module
# so we need to import it first.
import unicodedata

with open("OCR Output linebreaks removed.txt") as f:
    for token in f.read().split():
        # `.strip()` will strip any of the given characters from the start
        # or end of the string and return a new string without them. Note that
        # I'm deliberately not stripping middle-dot in order to make a point
        # later on!
        token = token.strip(".,")
        # Now let's normalize it to NFC
        token = unicodedata.normalize("NFC", token)
        print(token)

# If you look at the output, you'll notice we didn't strip out the middle dot.
# However, here's where it gets a little interesting and there's an important
# lesson here. If you copied the middle dot from the output and pasted it into
# the `strip` it wouldn't work. You might initially think: what's going on, I
# pasted it from the output into `strip` and yet it's not stripping it!

# The issue is that the output is normalized to NFC whereas we're stripping
# before normalization. It turns out that what was in the input file was
# U+0387 (GREEK ANO TELEIA) but under NFC that becomes U+00B7 (MIDDLE DOT).

# We could strip U+0387 instead of U+00B7. Or we could strip both. But perhaps
# the cleanest thing would be to just normalize BEFORE we strip. Then we only
# need to worry about the NFC versions of what we want to strip.

# So in subsequent versions of this script, we'll do:
#     token = unicodedata.normalize("NFC", token).strip(".,\u00B7")

# Note that `\uXXXX` is the way of getting a Unicode character in a string
# by its code point without needing to work out how to enter it directly.
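To convince yourself of the ano teleia behaviour, a quick check like the following makes the ordering issue concrete (the one-word sample is made up; the code points are the ones discussed above):

import unicodedata

token = "λόγος\u0387"  # a made-up token ending in U+0387 (GREEK ANO TELEIA)

# stripping U+00B7 before normalization does nothing: the dot is still U+0387
print(token.strip("\u00B7"))

# normalizing first turns U+0387 into U+00B7, so the strip now removes it
print(unicodedata.normalize("NFC", token).strip("\u00B7"))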
# Next up, I want to add one more sanity check which is just making sure we
# don't have any stray characters. To do this, I'm going to make sure each
# of our tokens matches a particular regular expression.

# First we need to import the `re` module
import re
import unicodedata

# We're going to count the number of errors in the regular expression matching
# we get so let's initialize a variable.
error_count = 0

with open("OCR Output linebreaks removed.txt") as f:
    for token in f.read().split():
        # You'll notice I've added `\uFEFF` to the stripping. That's because
        # I noticed when I first ran this script (with the regular expression
        # test below) that the file starts with a \uFEFF which I don't want
        # as part of the word token.
        token = unicodedata.normalize("NFC", token).strip(".,;\u00B7\uFEFF")

        # Now this could (and, in fact in our case DOES) lead to a token that
        # is an empty string. We don't want to include those tokens in our
        # output so we're just going to skip them with the `continue` statement
        # which, in a `for` loop, just goes to the next item in the iteration.
        if token == "":
            continue

        # `re.match` tests if the given pattern is a match for the given
        # string. Technically `re.match` is returning a "match object" (if
        # there is a match) or `None` but for testing "truth", a match object
        # counts as `True` and `None` counts as `False`.
        #
        # I'll explain the regular expression in more detail at the end.
        if re.match(r"[\u0370-\u03FF\u1F00-\u1FFF]+\u2019?$", token):
            print(token)
        else:  # we got a `None` from `re.match` so it didn't match
            # increment our count
            error_count += 1
            # A quick little way to help us find the errors in the output
            print("*** ERROR ***")
            print(token)
            # As well as printing the token, let's actually iterate over each
            # character (if you try a `for` loop over a string it will
            # iterate over each character, in this case putting it in `ch`)
            for ch in token:
                # `ord()` converts a character to its Unicode code point
                # `hex()` converts an integer to its hexadecimal representation
                print(ch, hex(ord(ch)))

# When we're all done, print out the number of errors we got (if any) so
# we can easily see at the bottom of our output if anything went wrong.
#
# This `if` statement works because `0` counts as `False` and all other
# numbers count as `True`
if error_count:
    print("***", error_count, "errors")

# In our case there were a couple of errors:
# - a stray `|` in κακῶν|
# - a Latin `o` in Ἐντoλὴν
# Let me explain the regular expression:
#
#     [\u0370-\u03FF\u1F00-\u1FFF]+\u2019?$
#
# Here's what you need to know:
#
# 1. remember that `\uXXXX` just means the Unicode code point. It is
#    equivalent to literally typing in the equivalent character
# 2. `[`...`]` means match one of the characters in the brackets. `-` is used
#    to indicate ranges of characters. So `[\u0370-\u03FF\u1F00-\u1FFF]` means
#    "match one character in the range U+0370 to U+03FF or U+1F00 to U+1FFF"
#    (which are the Greek areas in Unicode, assuming precomposed characters)
# 3. The `+` means "allow what precedes to occur one or more times". So
#    `[\u0370-\u03FF\u1F00-\u1FFF]+` means "one or more Greek characters"
# 4. The `?` means "what precedes is optional, i.e. allow it to occur zero or
#    one time". So `\u2019?` matches U+2019 if it's there but also won't
#    complain if it isn't
# 5. The `$` means "end of the string"
# 6. So all in all, this regular expression is saying "match one or more Greek
#    characters, optionally followed by a single U+2019, followed by the end
#    of the string"
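If the "truthiness" of `re.match` seems abstract, a small check like the following makes it concrete (the first two sample tokens are made up; the last two mirror the errors reported above):

import re

pattern = r"[\u0370-\u03FF\u1F00-\u1FFF]+\u2019?$"

# a match object is truthy and `None` is falsy, so `bool()` shows which is which
for sample in ["λόγος", "ἀλλ\u2019", "κακῶν|", "Ἐντoλὴν"]:
    print(sample, bool(re.match(pattern, sample)))

# expected: the first two print True; the stray `|` and the Latin `o` print False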
# It would be nice if errors were written to STDERR not STDOUT. In this
# version, we do that.
import re
# we're going to need `sys` for the special file object `sys.stderr`
import sys
import unicodedata

error_count = 0

with open("OCR Output linebreaks removed.txt") as f:
    for token in f.read().split():
        token = unicodedata.normalize("NFC", token).strip(".,;\u00B7\uFEFF")
        if token == "":
            continue
        if re.match(r"[\u0370-\u03FF\u1F00-\u1FFF]+\u2019?$", token):
            print(token)
        else:
            error_count += 1
            # you can give `print()` a `file` argument to print to something
            # other than STDOUT
            print("*** ERROR ***", file=sys.stderr)
            print(token, file=sys.stderr)
            for ch in token:
                print(ch, hex(ord(ch)), file=sys.stderr)

if error_count:
    print("***", error_count, "errors", file=sys.stderr)