kylebgorman/normcheck.py

## normcheck.py
#!/usr/bin/env python

"""Applies a given normalization form to file and detects changes.

This script reads text files line by line, decoding them into Unicode using a
specified encoding (by default, UTF-8), and then applying a specified Unicode
normalization (by default, NFC). If, for any line, this normalization is not
a no-op (i.e., if it changes the line), it logs a fatal error with the
filename and affected line number.

Therefore, this script can be used to detect if a file is not in a specified
encoding/normalization pair.

E.g., if this fails:

  ./normcheck --encoding=utf16 --form=NFC yourfile

it means that (some portion of) `yourfile` is not a UTF-16-encoded file in NFC.
"""


import argparse
import sys
import unicodedata


# Refuse to run on interpreters older than Python 3.
assert sys.version_info.major >= 3, "Python 3 only"


def main(args):
  """Checks that every line of every input file is already normalized.

  Each path in `args.paths` is opened as text using `args.encoding`, and each
  line is passed through `unicodedata.normalize` with `args.form`. Processing
  stops at the first line that normalization would change; later lines and
  later files are not examined.

  Args:
    args: namespace providing `paths` (iterable of file paths), `encoding`
        (codec name handed to `open`) and `form` (normalization form name
        handed to `unicodedata.normalize`).

  Raises:
    SystemExit: carrying a message with the path, 1-based line number and
        form for the first line that is not already in the requested form.
        Errors raised by `open` or by decoding are not caught here.
  """
  for path in args.paths:
    with open(path, "r", encoding=args.encoding) as source:
      for (linenum, line) in enumerate(source, 1):
        if unicodedata.normalize(args.form, line) != line:
          # Use `sys.exit` rather than the bare `exit` helper: the latter is
          # injected by the `site` module and is absent under `python -S`.
          sys.exit("{} (line {}) does not match in {}".format(
              path, linenum, args.form))


if __name__ == "__main__":
  # Command-line entry point: one or more input paths, plus the normalization
  # form and text encoding to check them against.
  parser = argparse.ArgumentParser()
  parser.add_argument("paths", nargs="+", help="Input path(s)")
  parser.add_argument("--form", choices=("NFC", "NFKC", "NFD", "NFKD"),
                      default="NFC", help="Normalization form")
  parser.add_argument("--encoding", default="utf8", help="Encoding")

  args = parser.parse_args()
  # Actually run the check; without this call the script parses its
  # arguments and exits without reading any file.
  main(args)
	#!/usr/bin/env python

	"""Applies a given normalization form to file and detects changes.

	This script reads text files line by line, decoding them into Unicode using a
	specified encoding (by default, UTF-8), and then applying a specified Unicode
	normalization (by default, NFC). If, for any line, this normalization is not
	a no-op (i.e., if it changes the line), it logs a fatal error with the
	filename and affected line number.

	Therefore, this script can be used to detect if a file is not in a specified
	encoding/normalization pair.

	E.g., if this fails:

	./normcheck --encoding=utf16 --form=NFC yourfile

	it means that (some portion of) `yourfile` is not a UTF-16-encoded file in NFC.
	"""


	import argparse
	import sys
	import unicodedata


	# Refuse to run on interpreters older than Python 3.
	assert sys.version_info.major >= 3, "Python 3 only"


	def main(args):
	  """Checks that every line of every input file is already normalized.

	  Each path in `args.paths` is opened as text using `args.encoding`, and
	  each line is passed through `unicodedata.normalize` with `args.form`.
	  Processing stops at the first line that normalization would change.

	  Args:
	    args: namespace providing `paths` (iterable of file paths), `encoding`
	        (codec name handed to `open`) and `form` (normalization form name
	        handed to `unicodedata.normalize`).

	  Raises:
	    SystemExit: carrying a message with the path, 1-based line number and
	        form for the first line that is not already in the requested form.
	  """
	  for path in args.paths:
	    with open(path, "r", encoding=args.encoding) as source:
	      for (linenum, line) in enumerate(source, 1):
	        if unicodedata.normalize(args.form, line) != line:
	          # Use `sys.exit` rather than the bare `exit` helper: the latter
	          # is injected by the `site` module and is absent under
	          # `python -S`.
	          sys.exit("{} (line {}) does not match in {}".format(
	              path, linenum, args.form))


	if __name__ == "__main__":
	parser = argparse.ArgumentParser()
	parser.add_argument("paths", nargs="+", help="Input path(s)")
	parser.add_argument("--form", choices=("NFC", "NFKC", "NFD", "NFKD"),
	default="NFC", help="Normalization form")
	parser.add_argument("--encoding", default="utf8", help="Encoding")

	args = parser.parse_args()