Skip to content

Instantly share code, notes, and snippets.

@alix-tz
Created November 23, 2023 16:07
Show Gist options
  • Save alix-tz/64677e95bb5944e9ce71a1d69ed98941 to your computer and use it in GitHub Desktop.
Save alix-tz/64677e95bb5944e9ce71a1d69ed98941 to your computer and use it in GitHub Desktop.
Python script to explore the lexical variety (token frequencies) of the French-language "Recensement du Valais" dataset.
import os
import unicodedata
from collections import Counter
from spacy.lang.fr import French
from tqdm import tqdm
import pandas as pd
import lxml.etree as ET
def check_paths(list_of_paths):
    """Print an error line for every path in *list_of_paths* missing on disk."""
    missing = (p for p in list_of_paths if not os.path.exists(p))
    for p in missing:
        print("ERROR: ", p, " does not exist")
def load_alto_xml(path: str):
    """Parse an ALTO XML file with lxml.

    Returns the parsed ``ElementTree`` on success, or ``False`` when the
    file cannot be read or is not well-formed XML.  The ``False`` sentinel
    is kept for backward compatibility with existing truthiness checks.
    (The original annotated the return as ``ET.ElementTree`` even though
    it could return ``False``; the annotation was removed as misleading.)
    """
    try:
        return ET.parse(path, ET.XMLParser(encoding="utf-8"))
    except (OSError, ET.XMLSyntaxError):
        # Narrowed from a bare `except Exception:` so genuine programming
        # errors are no longer silently swallowed; only I/O and parse
        # failures map to the False sentinel.
        return False
def get_strings_in_alto(xml: ET.ElementTree) -> list:
    """Collect every <String> element nested inside a <TextLine> of *xml*."""
    ns = {"a": "http://www.loc.gov/standards/alto/ns-v4#"}
    collected = []
    for text_line in xml.xpath("//a:TextLine", namespaces=ns):
        # extend keeps document order, same as appending one by one
        collected.extend(text_line.xpath("./a:String", namespaces=ns))
    return collected
def spacy_count_tokens(data):
    """Tokenize *data* with spaCy's French tokenizer and count each token text."""
    tokenizer = French().tokenizer
    return Counter(token.text for token in tokenizer(data))
def snippet_from_counted_tokens(counted_tokens, dataset):
    """Print summary statistics about the token counts of a dataset.

    Parameters
    ----------
    counted_tokens : collections.Counter
        Token-text -> occurrence-count mapping.
    dataset : str
        Label used in the printed messages.
    """
    print(f"Found {sum(counted_tokens.values())} tokens in {dataset}")
    # Counter keys are already unique; no need to wrap them in a set.
    print(f"Found {len(counted_tokens)} different tokens in {dataset}")
    unica = sum(1 for c in counted_tokens.values() if c == 1)
    print(f"Found {unica} unica (token appearing only once) in {dataset}")
    if counted_tokens:
        # Guard: most_common(1)[0] raises IndexError on an empty Counter.
        # Compute it once instead of twice as the original did.
        top_token, top_count = counted_tokens.most_common(1)[0]
        print(f"The most frequent token in {dataset} is {top_token} with {top_count} occurrences")
def main(dataset):
    """Build a token-frequency report for all ALTO XML files under *dataset*.

    Walks the directory tree, concatenates the NFC-normalized CONTENT of
    every <String> element, tokenizes the whole text with spaCy, prints
    summary statistics and the full table of tokens occurring more than once.
    """
    print(f"Processing {dataset}")
    list_xml_files = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(dataset)
        for file in files
        if file.endswith(".xml")
    ]
    check_paths(list_xml_files)
    print(f"[DEBUG] Found {len(list_xml_files)} XML files.")
    # Accumulate pieces in a list and join once: repeated `data += ...`
    # is quadratic in the total corpus size.
    pieces = []
    for xml_path in tqdm(list_xml_files, desc="Loading XML files"):
        tree = load_alto_xml(xml_path)
        if not tree:
            # load_alto_xml returns False on unreadable/malformed files;
            # the original crashed here when that happened. Skip instead.
            continue
        for line in get_strings_in_alto(tree):
            pieces.append(unicodedata.normalize("NFC", line.get("CONTENT", "")).strip())
    # Original prefixed every piece with a single space; reproduce that.
    data = "".join(" " + piece for piece in pieces)
    counted_tokens = spacy_count_tokens(data)
    snippet_from_counted_tokens(counted_tokens, dataset)
    # no_unicas keeps only the tokens that appear more than once
    no_unicas = Counter({k: v for k, v in counted_tokens.items() if v > 1})
    df = pd.DataFrame.from_records(no_unicas.most_common(), columns=['token', 'count'])
    print(df.to_string())
if __name__ == "__main__":
    # Guard so the analysis only runs when executed as a script, not on import.
    main("valais-recensement/data/fr")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment