Skip to content

Instantly share code, notes, and snippets.

@dennisseah
Created November 28, 2022 04:13
Show Gist options
  • Save dennisseah/ed7627863b6560630f65b74e41248462 to your computer and use it in GitHub Desktop.
Save dennisseah/ed7627863b6560630f65b74e41248462 to your computer and use it in GitHub Desktop.
# TF-IDF with PyPDF2
#
# Tested with the following package versions:
# nltk==3.7
# pandas==1.5.2
# PyPDF2==2.11.2
# scikit-learn==1.1.3
from PyPDF2 import PdfReader
import io
import pandas as pd
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
def tokenizeText(text: str) -> str:
    """Lower-case, tokenize and Porter-stem *text*.

    Args:
        text: Raw text to normalize.

    Returns:
        The stemmed tokens joined back together with single spaces.

    Note:
        ``word_tokenize`` relies on NLTK's "punkt" tokenizer data, which
        must be downloaded separately (``nltk.download("punkt")``).
    """
    # One stemmer per call instead of one per token: the instance is
    # loop-invariant, so there is no reason to rebuild it for every word.
    stemmer = PorterStemmer()
    tokens = word_tokenize(text.lower())
    return " ".join(stemmer.stem(token) for token in tokens)
def main():
    """Download a PDF, compute TF-IDF weights for its terms and print them.

    The PDF is fetched over HTTP, every page's text is stemmed with
    ``tokenizeText``, and the combined text is fed to ``TfidfVectorizer``
    as a single document. The per-term weights are printed to stdout as a
    JSON list of ``{"word": ..., "count": ...}`` records, sorted by
    ascending weight.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond in time.
    """
    response = requests.get(
        "https://www.databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf",
        # Without a timeout requests may block indefinitely on a stalled server.
        timeout=30,
    )
    # Fail here with a clear HTTP error rather than handing an error page
    # to the PDF parser.
    response.raise_for_status()

    reader = PdfReader(io.BytesIO(response.content))
    text = " ".join(
        tokenizeText(page.extract_text().replace("\n", " "))
        for page in reader.pages
    )

    vectorizer = TfidfVectorizer()
    # toarray() yields a plain ndarray; todense() would return the legacy
    # np.matrix type.
    matrix = vectorizer.fit_transform([text]).toarray()
    df_matrix = pd.DataFrame(matrix, columns=vectorizer.get_feature_names_out())

    # Only one document is vectorized, so the column sum is simply that
    # document's weight for each term.
    words = df_matrix.sum(axis=0).sort_values(ascending=True)

    # NOTE: the column keeps its original name "count" for output
    # compatibility, although the values are TF-IDF weights.
    df_words = words.rename_axis("word").reset_index(name="count")

    # No squeeze() here: on a one-row frame it would collapse the frame to
    # a Series and change the JSON shape from a list of records to a flat
    # list of values.
    print(df_words.to_json(orient="records"))
# Run the TF-IDF extraction only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment