# vporiz

## TF_IDF_Sklearn.py
from sklearn.feature_extraction.text import TfidfVectorizer
# Small demonstration corpus: each string is treated as one document.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Fit the vectorizer on the corpus and transform the same corpus in one call;
# X holds the resulting document-term matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    # tolist() converts the returned array to a plain list so the printed
    # output keeps the list form the old method produced.
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

## TF_IDF_PySpark.py
from pyspark.mllib.feature import HashingTF, IDF

# Read the input file (one document per line) and split every line on single
# spaces, so each document becomes a list of terms.
# NOTE(review): `sc` is not defined in this file; presumably an existing
# SparkContext (e.g. the one the pyspark shell provides) -- confirm.
raw_lines = sc.textFile("data/mllib/kmeans_data.txt")
documents = raw_lines.map(lambda text: text.split(" "))

# Hash the terms of every document into a term-frequency representation.
hashingTF = HashingTF()
tf = hashingTF.transform(documents)

# While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
# First to compute the IDF vector and second to scale the term frequencies by IDF.

# NOTE(review): the statements below repeat the scikit-learn and PySpark
# snippets found earlier in this file. They were tab-indented at module level,
# which Python rejects with "IndentationError: unexpected indent" before any
# code runs; they are dedented here so the file parses. Confirm whether the
# repetition is intentional.
from sklearn.feature_extraction.text import TfidfVectorizer

# Small demonstration corpus: each string is treated as one document.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Fit the vectorizer on the corpus and transform the same corpus in one call.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    # tolist() keeps the printed output in plain-list form.
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)
from pyspark.mllib.feature import HashingTF, IDF

# Load documents (one per line).
# NOTE(review): `sc` is not defined in this file; presumably an existing
# SparkContext (e.g. the one the pyspark shell provides) -- confirm.
documents = sc.textFile("data/mllib/kmeans_data.txt").map(lambda line: line.split(" "))

hashingTF = HashingTF()
tf = hashingTF.transform(documents)

# While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
# First to compute the IDF vector and second to scale the term frequencies by IDF.