jdegene/process_pdf_word_data.py

## process_pdf_word_data.py
import pandas as pd

data_fol = "/myLocal/dataFol/"

# wc = wordcount. Make words lower case to make interpretation easier. Remove words that might still have <2 characters.
wc_df = pd.read_csv(data_fol + "word_count_full.csv", sep=";", decimal=",", encoding="utf8")
wc_df["word"] = wc_df["word"].str.lower()
wc_df = wc_df[wc_df["word"].str.len() > 2]

# convert str date column to pandas datetime format and extract months & years...
wc_df["date"] = pd.to_datetime(wc_df["date"], format="%Y-%m-%d")
wc_df["month"] = wc_df["date"].dt.month
wc_df["year"] = wc_df["date"].dt.year

wc_df['total_count'] = wc_df.groupby("word")["word_cnt_page"].transform("sum")
wc_df['cnt_in_magazine'] = wc_df.groupby(["word","name"])["word_cnt_page"].transform("sum")

wc_df['count_per_month'] = wc_df.groupby(["word", "month"])["word_cnt_page"].transform("sum")
wc_df['count_per_year'] = wc_df.groupby(["word", "year"])["word_cnt_page"].transform("sum")

# get number of available magazine per time unit
wc_df['magazines_in_month'] = wc_df.groupby(["month"])["name"].transform("nunique")
wc_df['magazines_in_year'] = wc_df.groupby(["year"])["name"].transform("nunique")

wc_df.to_csv(data_fol + "word_count_full_enh.csv", sep=";", decimal=",", encoding="utf8")

# group by date. Only keep words that occur >5 times
wc_date_df = wc_df[wc_df["total_count"] > 5].groupby(["word", "date"])['word_cnt_page'].sum().reset_index()
wc_date_df.to_csv(data_fol + "word_count_byDate.csv", sep=";", decimal=",", encoding="utf8", index=False)

# group by month. Only keep words that occur >5 times
wc_month_df = wc_df[wc_df["total_count"] > 5].groupby(["word", "month"])['count_per_month', 'magazines_in_month'].sum().reset_index()
wc_month_df['avg. occurences per magazine'] = wc_month_df["count_per_month"] / wc_month_df['magazines_in_month']
wc_month_df.to_csv(data_fol + "word_count_byMonth.csv", sep=";", decimal=",", encoding="utf8", index=False)

# group by year. Only keep words that occur >5 times
wc_year_df = wc_df[wc_df["total_count"] > 5].groupby(["word", "year"])['count_per_year', 'magazines_in_year'].sum().reset_index()
wc_year_df['avg. occurences per magazine'] = wc_year_df["count_per_year"] / wc_year_df['magazines_in_year']
wc_year_df.to_csv(data_fol + "word_count_byYear.csv", sep=";", decimal=",", encoding="utf8", index=False)
	import pandas as pd

	data_fol = "/myLocal/dataFol/"

	# wc = wordcount. Make words lower case to make interpretation easier. Remove words that might still have <2 characters.
	wc_df = pd.read_csv(data_fol + "word_count_full.csv", sep=";", decimal=",", encoding="utf8")
	wc_df["word"] = wc_df["word"].str.lower()
	wc_df = wc_df[wc_df["word"].str.len() > 2]

	# convert str date column to pandas datetime format and extract months & years...
	wc_df["date"] = pd.to_datetime(wc_df["date"], format="%Y-%m-%d")
	wc_df["month"] = wc_df["date"].dt.month
	wc_df["year"] = wc_df["date"].dt.year

	wc_df['total_count'] = wc_df.groupby("word")["word_cnt_page"].transform("sum")
	wc_df['cnt_in_magazine'] = wc_df.groupby(["word","name"])["word_cnt_page"].transform("sum")

	wc_df['count_per_month'] = wc_df.groupby(["word", "month"])["word_cnt_page"].transform("sum")
	wc_df['count_per_year'] = wc_df.groupby(["word", "year"])["word_cnt_page"].transform("sum")

	# get number of available magazine per time unit
	wc_df['magazines_in_month'] = wc_df.groupby(["month"])["name"].transform("nunique")
	wc_df['magazines_in_year'] = wc_df.groupby(["year"])["name"].transform("nunique")

	wc_df.to_csv(data_fol + "word_count_full_enh.csv", sep=";", decimal=",", encoding="utf8")

	# group by date. Only keep words that occur >5 times
	wc_date_df = wc_df[wc_df["total_count"] > 5].groupby(["word", "date"])['word_cnt_page'].sum().reset_index()
	wc_date_df.to_csv(data_fol + "word_count_byDate.csv", sep=";", decimal=",", encoding="utf8", index=False)

	# group by month. Only keep words that occur >5 times
	wc_month_df = wc_df[wc_df["total_count"] > 5].groupby(["word", "month"])['count_per_month', 'magazines_in_month'].sum().reset_index()
	wc_month_df['avg. occurences per magazine'] = wc_month_df["count_per_month"] / wc_month_df['magazines_in_month']
	wc_month_df.to_csv(data_fol + "word_count_byMonth.csv", sep=";", decimal=",", encoding="utf8", index=False)

	# group by year. Only keep words that occur >5 times
	wc_year_df = wc_df[wc_df["total_count"] > 5].groupby(["word", "year"])['count_per_year', 'magazines_in_year'].sum().reset_index()
	wc_year_df['avg. occurences per magazine'] = wc_year_df["count_per_year"] / wc_year_df['magazines_in_year']
	wc_year_df.to_csv(data_fol + "word_count_byYear.csv", sep=";", decimal=",", encoding="utf8", index=False)