"""Evaluate spaCy's Swedish tokenizer against the UD Swedish-Talbanken treebank.

Reconstructs each sentence's raw text from the CoNLL-U files, re-tokenizes it
with spaCy, and categorizes every mismatch against the gold tokens.
"""
import bz2
import os
import re
from collections import defaultdict
from enum import Enum, auto

from conllu import parse
from spacy.lang.sv import Swedish

def get_num_sentences_from_corpus(path):
    """Count the blank-line-separated sentence blocks in a bzip2-compressed CoNLL-U file."""
    with bz2.open(path, "rt", encoding="utf-8") as fp:
        # Filter out the empty block a trailing blank line would otherwise add.
        return len([block for block in fp.read().split("\n\n") if block.strip()])

def get_sentence_from_corpus(path):
    """Yield one CoNLL-U sentence block at a time from a bzip2-compressed file."""
    with bz2.open(path, "rt", encoding="utf-8") as fp:
        lines = ""
        for line in fp:
            if line != "\n":
                lines += line
            else:
                yield lines
                lines = ""
        # Don't drop the last sentence if the file lacks a trailing blank line.
        if lines.strip():
            yield lines

def get_original_sentence(parsed_sentence):
    """Reconstruct the raw sentence text from the gold tokens, honoring SpaceAfter=No."""
    out = []
    for token in parsed_sentence:
        no_space_after = bool(
            token.get("misc") and token["misc"].get("SpaceAfter") == "No"
        )
        out.append(token["form"] + ("" if no_space_after else " "))
    return "".join(out)

def print_stats(stats):
    print(f"""
Correct sentences: {stats["sentence_correct"]}
Incorrect sentences: {stats["sentence_incorrect"]}
""")

ABBREVIATIONS = [
    # From UD_Swedish-Talbanken README about exceptions
    "bl a", "d v s", "e d", "f n", "fr o m", "m fl",
    "m m", "o s v", "s k", "t ex", "t o m", "t v",
    # From UD_Swedish-Talbanken abbreviations
    "tel", "sid", "kungl", "prof", "proc", "doc", "f", "milj", "fig",
    "kap", "mt", "mos", "kor", "t h", "vol",
    # From SpaCy tokenizer_exceptions.py
    "ang", "anm", "bil", "bl a", "dvs", "e kr", "el", "e d", "eng",
    "etc", "exkl", "f d", "fid", "f kr", "forts", "fr o m", "f ö",
    "förf", "inkl", "jur", "kl", "kr", "lat", "m a o", "max", "m fl",
    "min", "m m", "obs", "o d", "osv", "p g a", "ref", "resp", "s a s",
    "s k", "st", "s t", "t ex", "t o m", "ung", "äv", "övers"
]
ABBREVIATIONS += [abbr.replace(" ", ".") for abbr in ABBREVIATIONS]
ABBREVIATIONS += [abbr + "." for abbr in ABBREVIATIONS]
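
# The two expansions above generate every written variant of each entry. A
# minimal sanity sketch, using only entries already defined in this file:
# "bl a" gains "bl.a" from the dot expansion, then "bl a." and "bl.a." from
# the trailing-period expansion.
assert {"bl a", "bl.a", "bl a.", "bl.a."} <= set(ABBREVIATIONS)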

class ERROR_TYPES(Enum):
    """Categories of spaCy-vs-UD tokenization mismatches."""
    UNKNOWN = auto()
    DASH = auto()
    ABBR = auto()
    GENITIVE = auto()
    SINGLELETTER = auto()
    LISTS = auto()
    PARENTHESISEQUAL = auto()

def categorize_error(spacy_token, ud_token):
    """Classify a tokenization mismatch between a spaCy token and the UD gold token."""
    # spaCy split off the part before a hyphen that UD keeps as one token.
    if "-" in ud_token and ud_token.split("-")[0] == spacy_token:
        return ERROR_TYPES.DASH
    # Colon genitives (e.g. "SJ:s") or a trailing apostrophe.
    if (":" in ud_token and ud_token.split(":")[0] == spacy_token) or \
            ud_token[-1] == "'":
        return ERROR_TYPES.GENITIVE
    if ud_token.lower() in ABBREVIATIONS:
        return ERROR_TYPES.ABBR
    # A single letter followed by a period.
    if len(spacy_token) == 2 and spacy_token[1] == ".":
        return ERROR_TYPES.SINGLELETTER
    # List markers such as "1)", "a." or "(b)".
    if re.match(r"^\(?[1-9a-z](\)|\.)?$", ud_token):
        return ERROR_TYPES.LISTS
    if spacy_token == "(=":
        return ERROR_TYPES.PARENTHESISEQUAL
    return ERROR_TYPES.UNKNOWN
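
# Illustrative calls (hypothetical token pairs, not taken from the corpus),
# showing how a mismatch maps to a category:
#     categorize_error("TV", "TV-apparat")  -> ERROR_TYPES.DASH
#     categorize_error("SJ", "SJ:s")        -> ERROR_TYPES.GENITIVE
#     categorize_error("t", "t ex")         -> ERROR_TYPES.ABBR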

def main():
    nlp = Swedish()
    stats = {"sentence_correct": 0, "sentence_incorrect": 0}
    incorrect_types = defaultdict(int)

    base_path = os.path.expanduser("~/Downloads/")
    corpus_paths = [
        base_path + "sv_talbanken-ud-train.conllu.bz2",
        base_path + "sv_talbanken-ud-dev.conllu.bz2",
        base_path + "sv_talbanken-ud-test.conllu.bz2",
    ]
    num_sentences = sum(
        get_num_sentences_from_corpus(corpus_path) for corpus_path in corpus_paths
    )

    sentence_count = 0
    for corpus_path in corpus_paths:
        for conllu_sentence in get_sentence_from_corpus(corpus_path):
            sentence_count += 1
            if sentence_count % 100 == 0:
                print(f"{sentence_count}/{num_sentences} sentences parsed.")

            sentence = parse(conllu_sentence)[0]
            doc = nlp(get_original_sentence(sentence))

            sentence_correct = True
            for j, token in enumerate(doc):
                spacy_token = token.text
                ud_token = sentence[j]["form"]
                if spacy_token != ud_token:
                    sentence_correct = False
                    error_type = categorize_error(spacy_token, ud_token)
                    # assert spacy_token != "gälleräven", (ud_token, error_type)
                    incorrect_types[error_type] += 1
                    if error_type == ERROR_TYPES.UNKNOWN:
                        print(f"UNKNOWN TYPE. SpaCy: '{spacy_token}', UD: '{ud_token}'")
                    # Indices are misaligned after the first mismatch, so stop here.
                    break
            stats["sentence_correct" if sentence_correct else "sentence_incorrect"] += 1

    print("Done.")
    print_stats(stats)
    print(f"{100 * stats['sentence_correct'] / num_sentences:.2f}% were correctly tokenized.")
    print("\n".join(f"{key}: {value}" for key, value in incorrect_types.items()))

if __name__ == '__main__':
    main()
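
# Usage sketch. Assumptions: the three sv_talbanken-ud-*.conllu.bz2 files are in
# ~/Downloads, and "tokenizer_eval.py" is a placeholder filename (the gist does
# not name the file):
#
#     pip install spacy conllu
#     python tokenizer_eval.py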