Skip to content

Instantly share code, notes, and snippets.

@alix-tz
Created November 23, 2023 16:07
Show Gist options
  • Save alix-tz/64677e95bb5944e9ce71a1d69ed98941 to your computer and use it in GitHub Desktop.
Save alix-tz/64677e95bb5944e9ce71a1d69ed98941 to your computer and use it in GitHub Desktop.
Python script to explore the lexical variety (token frequencies) of the French-language "Recensement du Valais" dataset.
import os
import unicodedata
from collections import Counter
from spacy.lang.fr import French
from tqdm import tqdm
import pandas as pd
import lxml.etree as ET
def check_paths(list_of_paths):
    """Print an error line for every path in *list_of_paths* missing on disk."""
    missing = (p for p in list_of_paths if not os.path.exists(p))
    for p in missing:
        print("ERROR: ", p, " does not exist")
def load_alto_xml(path: str):
    """Parse an ALTO XML file with lxml.

    Returns the parsed ``ElementTree`` on success, or ``False`` when the
    file cannot be read or is not well-formed XML.  The ``False`` sentinel
    is kept for backward compatibility with existing truthiness checks.
    (The original annotated the return as ``ET.ElementTree`` even though
    it could return ``False``; the annotation was removed as misleading.)
    """
    try:
        return ET.parse(path, ET.XMLParser(encoding="utf-8"))
    except (OSError, ET.XMLSyntaxError):
        # Narrowed from a bare `except Exception:` so genuine programming
        # errors are no longer silently swallowed; only I/O and parse
        # failures map to the False sentinel.
        return False
def get_strings_in_alto(xml: ET.ElementTree) -> list:
    """Collect every <String> element nested inside a <TextLine> of *xml*."""
    ns = {"a": "http://www.loc.gov/standards/alto/ns-v4#"}
    collected = []
    for text_line in xml.xpath("//a:TextLine", namespaces=ns):
        # extend keeps document order, same as appending one by one
        collected.extend(text_line.xpath("./a:String", namespaces=ns))
    return collected
def spacy_count_tokens(data):
    """Tokenize *data* with spaCy's French tokenizer and count each token text."""
    tokenizer = French().tokenizer
    return Counter(token.text for token in tokenizer(data))
def snippet_from_counted_tokens(counted_tokens, dataset):
    """Print summary statistics about the token counts of a dataset.

    Parameters
    ----------
    counted_tokens : collections.Counter
        Token-text -> occurrence-count mapping.
    dataset : str
        Label used in the printed messages.
    """
    print(f"Found {sum(counted_tokens.values())} tokens in {dataset}")
    # Counter keys are already unique; no need to wrap them in a set.
    print(f"Found {len(counted_tokens)} different tokens in {dataset}")
    unica = sum(1 for c in counted_tokens.values() if c == 1)
    print(f"Found {unica} unica (token appearing only once) in {dataset}")
    if counted_tokens:
        # Guard: most_common(1)[0] raises IndexError on an empty Counter.
        # Compute it once instead of twice as the original did.
        top_token, top_count = counted_tokens.most_common(1)[0]
        print(f"The most frequent token in {dataset} is {top_token} with {top_count} occurrences")
def main(dataset):
    """Build a token-frequency report for all ALTO XML files under *dataset*.

    Walks the directory tree, concatenates the NFC-normalized CONTENT of
    every <String> element, tokenizes the whole text with spaCy, prints
    summary statistics and the full table of tokens occurring more than once.
    """
    print(f"Processing {dataset}")
    list_xml_files = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(dataset)
        for file in files
        if file.endswith(".xml")
    ]
    check_paths(list_xml_files)
    print(f"[DEBUG] Found {len(list_xml_files)} XML files.")
    # Accumulate pieces in a list and join once: repeated `data += ...`
    # is quadratic in the total corpus size.
    pieces = []
    for xml_path in tqdm(list_xml_files, desc="Loading XML files"):
        tree = load_alto_xml(xml_path)
        if not tree:
            # load_alto_xml returns False on unreadable/malformed files;
            # the original crashed here when that happened. Skip instead.
            continue
        for line in get_strings_in_alto(tree):
            pieces.append(unicodedata.normalize("NFC", line.get("CONTENT", "")).strip())
    # Original prefixed every piece with a single space; reproduce that.
    data = "".join(" " + piece for piece in pieces)
    counted_tokens = spacy_count_tokens(data)
    snippet_from_counted_tokens(counted_tokens, dataset)
    # no_unicas keeps only the tokens that appear more than once
    no_unicas = Counter({k: v for k, v in counted_tokens.items() if v > 1})
    df = pd.DataFrame.from_records(no_unicas.most_common(), columns=['token', 'count'])
    print(df.to_string())
if __name__ == "__main__":
    # Guard so the analysis only runs when executed as a script, not on import.
    main("valais-recensement/data/fr")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment