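"""
Count word frequencies per page across a folder of PDF magazines.

Extracts the text of every page with PyMuPDF, lemmatizes it with spaCy,
counts the remaining content words and appends the results to a CSV, so
the script can be re-run and resume where it left off.
"""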
import os
from collections import Counter

import fitz  # PyMuPDF
import pandas as pd
import spacy
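# The transformer pipeline ships separately from spaCy and has to be installed
# once, e.g.: python -m spacy download en_core_web_trf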
nlp = spacy.load('en_core_web_trf')
magazin_fol = "myfol/magazines/"
out_full_fp = "myfol/word_count_full.csv"
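# The output CSV holds one row per (magazine, page, word) combination, with the
# columns word, word_cnt_page, name, page and date (built below)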
# Load the output csv file if it already exists. This lets you resume where you
# left off if you have a lot of pdfs to parse.
if os.path.isfile(out_full_fp):
    out_df = pd.read_csv(out_full_fp, encoding="utf8", sep=";", decimal=",")
else:
    out_df = pd.DataFrame()
# iterate over every magazine, extract its words and put them into a pandas dataframe
for mag in os.listdir(magazin_fol):
    # skip magazines that were already parsed in a previous run
    if len(out_df) > 0 and mag in out_df["name"].unique():
        continue

    doc = fitz.open(magazin_fol + mag)
    mag_df = pd.DataFrame()
    for page in doc:
        # re-join words that were hyphenated at line breaks, then flatten newlines
        text = page.get_text().replace("-\n", "").replace("\n", " ")
        nlp_txt = nlp(text)

        # Turn fully UPPER CASE words into Capitalized ones, then lemmatize all words.
        # This is purely done because some words in the magazine are fully upper case.
        # Capitalizing still preserves the German convention of writing nouns with a
        # leading upper case character (which is important for lemmatization).
        tok_lower_txt = nlp(" ".join([token.lower_.capitalize() if token.is_upper else token.text
                                      for token in nlp_txt]))
        tok_lem_txt = nlp(" ".join([token.lemma_ for token in tok_lower_txt]))
        # Keep tokens that are actual words (that is what 'is_alpha' checks),
        # are not stopwords and are longer than 2 characters; collect their lemmas.
        tok_list = []
        for tok in tok_lem_txt:
            if tok.is_alpha and not tok.is_stop and len(tok) > 2:
                tok_list.append(tok.lemma_)
        # count occurrences of every word on this page
        count_words_dict = Counter(tok_list)
        page_df = pd.DataFrame.from_dict(count_words_dict, orient="index").reset_index()
        if len(page_df) == 0:
            continue

        page_df.columns = ["word", "word_cnt_page"]
        page_df["name"] = mag
        page_df["page"] = page.number
        page_df["date"] = mag[:10].replace("_", "-")  # my magazines had their issue date as part of the filename
        mag_df = pd.concat([mag_df, page_df])
    # append this magazine's counts and write the csv after every issue,
    # so an interrupted run can resume
    out_df = pd.concat([out_df, mag_df])
    out_df.to_csv(out_full_fp, encoding="utf8", sep=";", decimal=",", index=False)
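# A possible follow-up (a sketch, not part of the original workflow): aggregate the
# per-page counts into one total per word and issue. 'word_totals' is just an
# illustrative name.
word_totals = out_df.groupby(["name", "word"], as_index=False)["word_cnt_page"].sum()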