# michaelguia/tfidf.py

## tfidf.py
import pandas as pd
import unicodedata
import string
import numpy as np
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
# Hand-rolled TF-IDF over a tiny corpus: normalize the text, split it into
# sentences (each sentence is treated as one document), drop stop words and
# punctuation, stem, append bigrams, count terms, then weight the counts.
docs = []
docs.append('code PYTHON code.')
docs.append('Python Students Students.')
docs.append('Jupiter is a planets.')

# NOTE(review): `corpus` was referenced below but never defined anywhere in
# this script. Joining the sample documents into one text is assumed to be the
# intent, since the text is later split back apart with sent_tokenize --
# confirm against the original notebook.
corpus = ' '.join(docs)

# NFKD decomposes accented characters into base character + combining mark;
# encoding to ASCII with 'ignore' then drops everything that is not ASCII.
normalize = (
    unicodedata.normalize('NFKD', corpus)
    .encode('ASCII', 'ignore')
    .decode('ASCII')
)

# One list of lower-cased tokens per sentence. The normalized text is used
# here so that the normalization step above actually takes effect.
tokens = [
    word_tokenize(sentence.lower()) for sentence in sent_tokenize(normalize)
]

sw = stopwords.words('english')
pt = string.punctuation
# Set copy of the stop-word list for O(1) membership tests in the filter.
stopword_set = frozenset(sw)
# `pt` is a string, so `token not in pt` is a substring test: a token is
# dropped when it occurs verbatim inside string.punctuation.
filtered = [
    [token for token in row if token not in stopword_set and token not in pt]
    for row in tokens
]

stemmer_snowball = SnowballStemmer('english')
tokens_stemsnowball = [
    [stemmer_snowball.stem(token) for token in row] for row in filtered
]

# Each document is its stemmed unigrams followed by its bigrams, with the two
# words of a bigram joined by '-'.
documents = [
    row + ['-'.join(ng) for ng in ngrams(row, 2)]
    for row in tokens_stemsnowball
]

vocabulary = {token for row in documents for token in row}
# Sort before numbering so column indices are the same on every run; set
# iteration order for strings is not stable across interpreter invocations.
vocabulary_lookup = {word: i for i, word in enumerate(sorted(vocabulary))}

# matrix[d, t] = number of times term t occurs in document d.
matrix = np.zeros((len(documents), len(vocabulary)))
for doc_id, document in enumerate(documents):
    for word in document:
        matrix[doc_id, vocabulary_lookup[word]] += 1

# Term frequency: each row divided by that document's total term count.
# keepdims keeps the (n_docs, 1) shape for any corpus size, and the `where`
# guard leaves an all-zero row at 0 instead of producing NaN from 0/0.
term_totals = matrix.sum(axis=1, keepdims=True)
tf = np.divide(
    matrix,
    term_totals,
    out=np.zeros_like(matrix),
    where=term_totals > 0,
)

# Document frequency: in how many documents each term occurs at least once.
# Every vocabulary term comes from some document, so this is never zero.
doc_freq = np.sum(matrix > 0, axis=0)

# Inverse document frequency: log(number of documents / document frequency).
idf = np.log(matrix.shape[0] / doc_freq)

tfidf = tf * idf