Created
June 7, 2018 01:53
-
-
Save michaelguia/ce18117805523b4fd6221e19861ff9f0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import unicodedata | |
import string | |
import numpy as np | |
from nltk.util import ngrams | |
from nltk.tokenize import sent_tokenize, word_tokenize | |
from nltk.corpus import stopwords | |
from nltk.stem.snowball import SnowballStemmer | |
# Toy corpus: three short, one-sentence documents.
docs = [
    'code PYTHON code.',
    'Python Students Students.',
    'Jupiter is a planets.',
]

# `corpus` was read below without ever being defined (NameError), while
# `docs` was built and never used.  Joining the documents with a space keeps
# each one a separate sentence, so sentence splitting should give one row
# per entry of `docs`.
corpus = ' '.join(docs)

# Fold the text to plain ASCII: NFKD decomposes accented characters into a
# base character plus combining marks, and encoding with 'ignore' drops
# every code point that has no ASCII representation.
normalize = (
    unicodedata.normalize('NFKD', corpus)
    .encode('ASCII', 'ignore')
    .decode('utf-8')
)

# One list of lower-cased word tokens per sentence.  The normalised text is
# tokenised here; previously `normalize` was computed and then ignored in
# favour of the raw `corpus`.
tokens = [
    word_tokenize(sentence.lower())
    for sentence in sent_tokenize(normalize)
]
# Stop words and punctuation to discard.  `sw` and `pt` keep their original
# types; the private sets below give O(1) membership tests per token.
sw = stopwords.words('english')
pt = string.punctuation
_stop_words = set(sw)
_punct_chars = set(pt)


def _is_content_token(token):
    """Return True if *token* is neither a stop word nor pure punctuation."""
    if token in _stop_words:
        return False
    # `token not in pt` was a substring test against the punctuation string,
    # so multi-character punctuation tokens (e.g. '...') were kept.  Checking
    # every character rejects those too; an empty token is rejected as before.
    return not all(ch in _punct_chars for ch in token)


# Drop stop words and punctuation, sentence by sentence.
filtered = [
    [token for token in row if _is_content_token(token)]
    for row in tokens
]

# Reduce every remaining token to its Snowball (English) stem.
stemmer_snowball = SnowballStemmer('english')
tokens_stemsnowball = [
    [stemmer_snowball.stem(token) for token in row]
    for row in filtered
]

# Each document is its list of stems followed by its bigrams, where a bigram
# is written as the two stems joined by a hyphen.
documents = [
    row + ['-'.join(pair) for pair in ngrams(row, 2)]
    for row in tokens_stemsnowball
]
# Every distinct term (stem or bigram) that occurs in any document.  Built
# with a set comprehension instead of a list comprehension that was run only
# for its side effects.
vocabulary = {token for row in documents for token in row}

# Term -> column index.  Sorting makes the column order reproducible; the
# iteration order of a set of strings can differ from one interpreter run
# to the next.
vocabulary_lookup = {word: i for i, word in enumerate(sorted(vocabulary))}

# Raw term counts: one row per document, one column per vocabulary term.
matrix = np.zeros((len(documents), len(vocabulary)))
for doc_id, document in enumerate(documents):
    for word in document:
        word_id = vocabulary_lookup[word]
        matrix[doc_id, word_id] += 1
# Term frequency: each count divided by the total number of terms in its
# document.  keepdims=True keeps the totals as a column vector for any
# number of documents (previously hard-coded as reshape(3, 1)).
doc_lengths = np.sum(matrix, axis=1, keepdims=True)
# A document with no terms has a total of 0; leave its row at zero rather
# than producing NaN from 0/0.
tf = np.divide(
    matrix,
    doc_lengths,
    out=np.zeros_like(matrix),
    where=doc_lengths > 0,
)
tf  # bare expressions are kept for notebook-style display

# Document frequency: number of documents in which each term appears.
doc_freq = np.sum(matrix > 0, axis=0)
doc_freq

# Inverse document frequency: log(number of documents / document frequency).
# NOTE(review): assumes every column of `matrix` is non-zero in at least one
# row (true when the vocabulary is built from these same documents);
# otherwise this divides by zero.
idf = np.log(matrix.shape[0] / doc_freq)
idf

# TF-IDF weights; `idf` broadcasts across the rows of `tf`.
tfidf = tf * idf
tfidf
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment