Skip to content

Instantly share code, notes, and snippets.

@Foadsf
Created May 6, 2024 19:07
Show Gist options
  • Save Foadsf/93b91180adcfae2aeb0663e6b1782993 to your computer and use it in GitHub Desktop.
Save Foadsf/93b91180adcfae2aeb0663e6b1782993 to your computer and use it in GitHub Desktop.
from pylatexenc.latex2text import LatexNodes2Text
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import argparse
import nltk
def analyze_word_frequency(filename, words_to_check):
    """Count how often selected words occur in a LaTeX document.

    The file is converted to plain text, lowercased, tokenized, and stripped
    of English stopwords and non-alphanumeric tokens before counting.

    Note: tokenizing and stopword filtering rely on NLTK data packages
    (tokenizer models and the stopwords corpus) being installed locally.

    Args:
        filename (str): Path to the LaTeX file (read as UTF-8).
        words_to_check (list | None): Words to look up. Matching is
            case-insensitive because the text is lowercased before counting.
            If None, the counts of every remaining token are returned.

    Returns:
        dict: Maps each entry of words_to_check (exactly as given) to its
            count, or every counted token to its count when words_to_check
            is None. Stopwords and tokens that are not purely alphanumeric
            are filtered out, so such words always report 0.
    """
    with open(filename, "r", encoding="utf-8") as f:
        latex_code = f.read()

    # Convert LaTeX to plain text
    plain_text = LatexNodes2Text().latex_to_text(latex_code)

    # Tokenize and lowercase the text
    tokens = nltk.word_tokenize(plain_text.lower())

    # Remove stopwords and punctuation-like tokens
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [w for w in tokens if w not in stop_words and w.isalnum()]

    # Calculate word frequencies
    word_dist = FreqDist(filtered_tokens)

    # No explicit word list (e.g. the CLI was run without -w): report all
    # counted tokens instead of failing on iteration over None.
    if words_to_check is None:
        return dict(word_dist)

    # Tokens were lowercased above, so look each word up in lowercase while
    # keeping the caller's original spelling as the result key.
    return {word: word_dist[word.lower()] for word in words_to_check}
if __name__ == "__main__":
    # Command-line entry point: read the LaTeX file path and the optional
    # list of words, then print the resulting frequency mapping.
    cli = argparse.ArgumentParser(
        description="Analyze word frequency in LaTeX documents."
    )
    cli.add_argument("filename", help="Path to the LaTeX file.")
    cli.add_argument(
        "-w",
        "--words",
        nargs="+",
        help="Words to check frequency for.",
    )
    options = cli.parse_args()
    print(analyze_word_frequency(options.filename, options.words))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment