Created
September 16, 2022 17:30
-
-
Save jacobobryant/f19b451af55a9541a1ac016a24e32981 to your computer and use it in GitHub Desktop.
Newsletter topic extraction
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Usage: python tfidf.py extract_keywords | |
Reads from storage/keywords/corpus.csv, which has columns `<ID>,<Title>,<Description>`. Writes | |
keywords to storage/keywords/output.csv | |
I use this for newsletter topic modeling at https://thesample.ai/. | |
""" | |
import sys | |
import nltk | |
import re | |
import csv | |
import random | |
import numpy as np | |
import json | |
import shutil | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
import pandas as pd | |
import fasttext | |
# Generic newsletter-marketing words that are never useful as topic keywords.
# extract_keywords() drops any term found here before choosing the top terms.
# Keep a trailing comma on every entry: adjacent string literals without a
# comma are silently concatenated by Python (e.g. 'people' 'free' would
# become the single entry 'peoplefree').
exclusions = [
    'subscribe',
    'newsletter',
    'weekly',
    'things',
    'daily',
    'day',
    'ideas',
    'week',
    'inbox',
    'delivered',
    'interesting',
    'better',
    'best',
    'thoughts',
    'latest',
    'new',
    'analysis',
    'good',
    'people',
    'free',
    'like',
    'notes',
]
def extract_keywords():
    """Write the top TF-IDF keywords for every newsletter in the corpus.

    Reads ``storage/keywords/corpus.csv`` (rows of ``id, title, description``),
    fits a TF-IDF model over each row's title + description, and writes one
    row per newsletter to ``storage/keywords/output.csv``: the id followed by
    up to five of its highest-scoring terms that are not in ``exclusions``.

    The output is first written to ``output.csv.tmp`` and then copied over
    ``output.csv`` so a failure mid-run does not leave a truncated result.

    Raises:
        ValueError: if the corpus file contains no rows.
    """
    try:
        nltk.corpus.stopwords.words('english')
    except LookupError:
        # NLTK raises LookupError when the stopwords corpus is not installed.
        nltk.download('stopwords')

    with open('storage/keywords/corpus.csv', newline='') as csvfile:
        nls = [
            {'id': row[0], 'text': row[1] + ' ' + row[2]}
            for row in csv.reader(csvfile)
            if row  # csv.reader yields [] for blank lines; skip them
        ]
    if not nls:
        # Without this guard, min_df below fails with a ZeroDivisionError.
        raise ValueError('storage/keywords/corpus.csv contains no rows')
    random.shuffle(nls)

    tfidf_vectorizer = TfidfVectorizer(
        use_idf=True,
        max_df=0.8,              # ignore terms present in >80% of documents
        min_df=3/len(nls),       # proportion equivalent to ~3 documents
        stop_words='english'
    )
    texts = [nl['text'] for nl in nls]
    matrix = tfidf_vectorizer.fit_transform(texts)

    # The vocabulary is fixed after fitting, so look it up once rather than
    # per document. get_feature_names() was removed in scikit-learn 1.2 in
    # favour of get_feature_names_out(); fall back for older installs.
    if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
        feature_names = tfidf_vectorizer.get_feature_names_out()
    else:
        feature_names = tfidf_vectorizer.get_feature_names()
    excluded = set(exclusions)

    # newline='' lets the csv module control line endings itself.
    with open('storage/keywords/output.csv.tmp', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for nl, v in zip(nls, matrix):
            # v is one sparse row: v.data holds the non-zero TF-IDF scores
            # and v.indices the matching vocabulary positions.
            scored = sorted(zip(v.data, v.indices), key=lambda pair: -pair[0])
            keywords = [feature_names[i]
                        for _, i in scored
                        if feature_names[i] not in excluded]
            writer.writerow([nl['id']] + keywords[:5])
    shutil.copyfile('storage/keywords/output.csv.tmp', 'storage/keywords/output.csv')
def main():
    """Run the module-level function named by the first command-line argument.

    ``sys.argv[1]`` is the command name (e.g. ``extract_keywords``); any
    further arguments are passed to that function as positional strings.

    The name is looked up in this module's globals instead of being passed to
    ``eval``, so only an existing public callable can be invoked -- arbitrary
    expressions on the command line are rejected. Exits with an error message
    when the command is missing or unknown.
    """
    if len(sys.argv) < 2:
        sys.exit('Usage: python tfidf.py <command> [args...]')
    command = sys.argv[1]
    target = globals().get(command)
    if command.startswith('_') or not callable(target):
        sys.exit('Unknown command: {!r}'.format(command))
    target(*sys.argv[2:])


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Nice, I'll give it a try if what I got now doesn't work very well.