Last active
October 7, 2022 17:29
-
-
Save jdegene/1ea18ae7137a9c1d00d89f43c6799d25 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
from collections import Counter

import pandas as pd
import fitz  # PyMuPDF
import spacy

# spaCy transformer pipeline, used for tokenization and lemmatization.
nlp = spacy.load('en_core_web_trf')

magazin_fol = "myfol/magazines/"
out_full_fp = "myfol/word_count_full.csv"

# Load output csv file. This lets you resume where you left off if you have a lot of pdfs to parse.
if os.path.isfile(out_full_fp):
    out_df = pd.read_csv(out_full_fp, encoding="utf8", sep=";", decimal=",")
else:
    out_df = pd.DataFrame()

# Iterate every magazine, extract its words and put them into a pandas dataframe.
for mag in os.listdir(magazin_fol):
    # Skip magazines already present in the output from a previous run.
    if len(out_df) > 0 and mag in out_df["name"].unique():
        continue

    doc = fitz.open(os.path.join(magazin_fol, mag))
    try:
        mag_df = pd.DataFrame()

        for page in doc:
            # Re-join words hyphenated across line breaks, then flatten remaining newlines.
            text = page.get_text().replace("-\n", "").replace("\n", " ")
            nlp_txt = nlp(text)

            # Make all upper case words lower case, then lemmatize all words.
            # This is purely done because some words in the magazine are fully UPPER CASE. This still
            # preserves the german case, where nouns are written with a leading upper case character
            # (which is important for lemmatization).
            tok_lower_txt = nlp(" ".join(token.lower_.capitalize() if token.is_upper else token.text
                                         for token in nlp_txt))
            tok_lem_txt = nlp(" ".join(token.lemma_ for token in tok_lower_txt))

            # Keep tokens that are actual words (that is what 'is_alpha' does), are not stopwords
            # and are longer than 2 chars; collect their lemmas.
            tok_list = [tok.lemma_ for tok in tok_lem_txt
                        if tok.is_alpha and not tok.is_stop and len(tok) > 2]

            # Count occurrences per page.
            count_words_dict = Counter(tok_list)
            page_df = pd.DataFrame.from_dict(count_words_dict, orient="index").reset_index()
            if len(page_df) == 0:
                continue
            page_df.columns = ['word', 'word_cnt_page']
            page_df["name"] = mag
            page_df["page"] = page.number
            page_df["date"] = mag[:10].replace("_", "-")  # my magazines had their issue date as part of the filename
            mag_df = pd.concat([mag_df, page_df])
    finally:
        # Close the PDF explicitly so file handles don't accumulate across many magazines.
        doc.close()

    out_df = pd.concat([out_df, mag_df])
    # Persist after every magazine so an interruption loses at most one magazine of work.
    out_df.to_csv(out_full_fp, encoding="utf8", sep=";", decimal=",", index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment