Last active
October 7, 2022 17:29
-
-
Save jdegene/1ea18ae7137a9c1d00d89f43c6799d25 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
from collections import Counter

import pandas as pd
import fitz  # PyMuPDF
import spacy

# spaCy transformer pipeline, used for tokenization and lemmatization.
nlp = spacy.load('en_core_web_trf')

magazin_fol = "myfol/magazines/"
out_full_fp = "myfol/word_count_full.csv"

# Load output csv file. This lets you resume where you left off if you have a lot of pdfs to parse.
if os.path.isfile(out_full_fp):
    out_df = pd.read_csv(out_full_fp, encoding="utf8", sep=";", decimal=",")
else:
    out_df = pd.DataFrame()

# Iterate every magazine, extract its words and put them into a pandas dataframe.
for mag in os.listdir(magazin_fol):
    # Skip magazines already present in the output from a previous run.
    if len(out_df) > 0 and mag in out_df["name"].unique():
        continue

    doc = fitz.open(os.path.join(magazin_fol, mag))
    try:
        mag_df = pd.DataFrame()

        for page in doc:
            # Re-join words hyphenated across line breaks, then flatten remaining newlines.
            text = page.get_text().replace("-\n", "").replace("\n", " ")
            nlp_txt = nlp(text)

            # Make all upper case words lower case, then lemmatize all words.
            # This is purely done because some words in the magazine are fully UPPER CASE. This still
            # preserves the german case, where nouns are written with a leading upper case character
            # (which is important for lemmatization).
            tok_lower_txt = nlp(" ".join(token.lower_.capitalize() if token.is_upper else token.text
                                         for token in nlp_txt))
            tok_lem_txt = nlp(" ".join(token.lemma_ for token in tok_lower_txt))

            # Keep tokens that are actual words (that is what 'is_alpha' does), are not stopwords
            # and are longer than 2 chars; collect their lemmas.
            tok_list = [tok.lemma_ for tok in tok_lem_txt
                        if tok.is_alpha and not tok.is_stop and len(tok) > 2]

            # Count occurrences per page.
            count_words_dict = Counter(tok_list)
            page_df = pd.DataFrame.from_dict(count_words_dict, orient="index").reset_index()
            if len(page_df) == 0:
                continue
            page_df.columns = ['word', 'word_cnt_page']
            page_df["name"] = mag
            page_df["page"] = page.number
            page_df["date"] = mag[:10].replace("_", "-")  # my magazines had their issue date as part of the filename
            mag_df = pd.concat([mag_df, page_df])
    finally:
        # Close the PDF explicitly so file handles don't accumulate across many magazines.
        doc.close()

    out_df = pd.concat([out_df, mag_df])
    # Persist after every magazine so an interruption loses at most one magazine of work.
    out_df.to_csv(out_full_fp, encoding="utf8", sep=";", decimal=",", index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment