Skip to content

Instantly share code, notes, and snippets.

@michaelguia
Created June 7, 2018 01:53
Show Gist options
  • Save michaelguia/ce18117805523b4fd6221e19861ff9f0 to your computer and use it in GitHub Desktop.
Save michaelguia/ce18117805523b4fd6221e19861ff9f0 to your computer and use it in GitHub Desktop.
import pandas as pd
import unicodedata
import string
import numpy as np
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
# Sample data: three short, one-sentence documents.
docs = [
    'code PYTHON code.',
    'Python Students Students.',
    'Jupiter is a planets.',
]
# `corpus` is read below but was never defined, so build it from `docs`.
# Each document is expected to be one sentence ending in sentence-final
# punctuation, so sent_tokenize() should split the joined text back into
# one row per document.
corpus = ' '.join(docs)

# NFKD-decompose accented characters, then drop every code point that has
# no ASCII representation (the 'ignore' error handler discards them).
normalize = (
    unicodedata.normalize('NFKD', corpus)
    .encode('ASCII', 'ignore')
    .decode('utf-8')
)

# One list of lower-cased word tokens per sentence. The normalised text is
# tokenised here; previously the raw corpus was used and `normalize` was
# computed but never consumed.
tokens = [
    word_tokenize(sentence.lower())
    for sentence in sent_tokenize(normalize)
]
# NLTK's English stop-word list and the ASCII punctuation characters.
sw = stopwords.words('english')
pt = string.punctuation

# Remove stop words and punctuation from every token row.
# NOTE: `pt` is a str, so `token in pt` is a substring test rather than an
# exact-match lookup.
filtered = [
    [token for token in row if not (token in sw or token in pt)]
    for row in tokens
]
# Replace every remaining token with its English Snowball stem.
stemmer_snowball = SnowballStemmer('english')
tokens_stemsnowball = [
    [stemmer_snowball.stem(token) for token in row]
    for row in filtered
]
# Each document is its stemmed unigrams followed by its bigrams, where a
# bigram is written as the two stems joined by a hyphen.
documents = [
    row + ['-'.join(pair) for pair in ngrams(row, 2)]
    for row in tokens_stemsnowball
]
# Distinct terms across all documents.
vocabulary = {token for row in documents for token in row}

# Map each term to a column index. Sorting fixes the column order; plain
# set iteration order for strings can differ between interpreter runs
# because of hash randomisation.
vocabulary_lookup = {word: i for i, word in enumerate(sorted(vocabulary))}

# Term-count matrix: one row per document, one column per vocabulary term.
matrix = np.zeros((len(documents), len(vocabulary)))
for doc_id, document in enumerate(documents):
    for word in document:
        word_id = vocabulary_lookup[word]
        matrix[doc_id, word_id] += 1
# Term frequency: each count divided by the total count of its row.
# keepdims=True keeps the row totals as a column vector for any number of
# documents, instead of a reshape hard-coded to three rows.
row_totals = matrix.sum(axis=1, keepdims=True)
# A row with no terms would give 0/0 (NaN); such rows are left at zero.
tf = np.divide(
    matrix,
    row_totals,
    out=np.zeros_like(matrix),
    where=row_totals > 0,
)

# Document frequency: number of rows in which each column is non-zero.
doc_freq = np.sum(matrix > 0, axis=0)

# Inverse document frequency: log(N / df), with N the number of rows.
idf = np.log(matrix.shape[0] / doc_freq)

# TF-IDF weights; `idf` broadcasts across the rows of `tf`.
tfidf = tf * idf
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment