Skip to content

Instantly share code, notes, and snippets.

@jdegene
Last active September 24, 2022 21:57
Show Gist options
  • Save jdegene/2810b28375cf6672ad9dccda66236aa8 to your computer and use it in GitHub Desktop.
Save jdegene/2810b28375cf6672ad9dccda66236aa8 to your computer and use it in GitHub Desktop.
import pandas as pd
data_fol = "/myLocal/dataFol/"
# wc = wordcount. Make words lower case to make interpretation easier. Remove words that might still have <2 characters.
wc_df = pd.read_csv(data_fol + "word_count_full.csv", sep=";", decimal=",", encoding="utf8")
wc_df["word"] = wc_df["word"].str.lower()
wc_df = wc_df[wc_df["word"].str.len() > 2]
# convert str date column to pandas datetime format and extract months & years...
wc_df["date"] = pd.to_datetime(wc_df["date"], format="%Y-%m-%d")
wc_df["month"] = wc_df["date"].dt.month
wc_df["year"] = wc_df["date"].dt.year
wc_df['total_count'] = wc_df.groupby("word")["word_cnt_page"].transform("sum")
wc_df['cnt_in_magazine'] = wc_df.groupby(["word","name"])["word_cnt_page"].transform("sum")
wc_df['count_per_month'] = wc_df.groupby(["word", "month"])["word_cnt_page"].transform("sum")
wc_df['count_per_year'] = wc_df.groupby(["word", "year"])["word_cnt_page"].transform("sum")
# get number of available magazine per time unit
wc_df['magazines_in_month'] = wc_df.groupby(["month"])["name"].transform("nunique")
wc_df['magazines_in_year'] = wc_df.groupby(["year"])["name"].transform("nunique")
wc_df.to_csv(data_fol + "word_count_full_enh.csv", sep=";", decimal=",", encoding="utf8")
# group by date. Only keep words that occur >5 times
wc_date_df = wc_df[wc_df["total_count"] > 5].groupby(["word", "date"])['word_cnt_page'].sum().reset_index()
wc_date_df.to_csv(data_fol + "word_count_byDate.csv", sep=";", decimal=",", encoding="utf8", index=False)
# group by month. Only keep words that occur >5 times
wc_month_df = wc_df[wc_df["total_count"] > 5].groupby(["word", "month"])['count_per_month', 'magazines_in_month'].sum().reset_index()
wc_month_df['avg. occurences per magazine'] = wc_month_df["count_per_month"] / wc_month_df['magazines_in_month']
wc_month_df.to_csv(data_fol + "word_count_byMonth.csv", sep=";", decimal=",", encoding="utf8", index=False)
# group by year. Only keep words that occur >5 times
wc_year_df = wc_df[wc_df["total_count"] > 5].groupby(["word", "year"])['count_per_year', 'magazines_in_year'].sum().reset_index()
wc_year_df['avg. occurences per magazine'] = wc_year_df["count_per_year"] / wc_year_df['magazines_in_year']
wc_year_df.to_csv(data_fol + "word_count_byYear.csv", sep=";", decimal=",", encoding="utf8", index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment