Skip to content

Instantly share code, notes, and snippets.

@otknoy
Created October 11, 2015 11:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save otknoy/bbd10a902b015fd59061 to your computer and use it in GitHub Desktop.
Save otknoy/bbd10a902b015fd59061 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
def tf(doc):
    """Build a term-frequency (raw count) matrix for a collection of documents.

    Args:
        doc: Iterable of documents, each a single string of
            whitespace-separated tokens.

    Returns:
        A ``(features, terms)`` tuple: ``features`` is the document-term
        count matrix returned by ``CountVectorizer.fit_transform`` and
        ``terms`` is the list of vocabulary terms, one per matrix column.
    """
    # The custom token pattern accepts single-character tokens; the
    # scikit-learn default pattern only keeps tokens of 2+ characters.
    vectorizer = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
    features = vectorizer.fit_transform(doc)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on old releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out().tolist()
    else:
        terms = vectorizer.get_feature_names()
    return features, terms
def tfidf(docs):
    """Build a TF-IDF weighted document-term matrix.

    Args:
        docs: Iterable of documents, each a single string of
            whitespace-separated tokens.

    Returns:
        A ``(features, terms)`` tuple: ``features`` is the TF-IDF matrix
        returned by ``TfidfVectorizer.fit_transform`` and ``terms`` is the
        list of vocabulary terms, one per matrix column.
    """
    # min_df / max_df are integers here, i.e. absolute document counts:
    # a term must occur in at least 1 and at most 50 documents to be kept.
    # The token pattern accepts single-character tokens.
    vectorizer = TfidfVectorizer(min_df=1, max_df=50,
                                 token_pattern=r'(?u)\b\w+\b')
    features = vectorizer.fit_transform(docs)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on old releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out().tolist()
    else:
        terms = vectorizer.get_feature_names()
    return features, terms
def reduction(x, dim=10):
    """Reduce the dimensionality of a feature matrix using LSA.

    Applies truncated SVD and then rescales every row of the result to
    unit length using ``Normalizer``.

    Args:
        x: Document-term feature matrix (e.g. the output of ``tfidf``).
        dim: Number of components to keep. scikit-learn raises
            ``ValueError`` when this is too large for the number of
            features in ``x``.

    Returns:
        The reduced, row-normalized matrix.
    """
    # Pass the requested dimensionality through; without n_components the
    # estimator silently falls back to its own default and ignores `dim`.
    lsa = TruncatedSVD(n_components=dim)
    x = lsa.fit_transform(x)
    # copy=False lets the normalizer rescale the SVD output in place.
    x = Normalizer(copy=False).fit_transform(x)
    return x
if __name__ == '__main__':
    # Small demo corpus: each document is a list of pre-segmented tokens.
    docs = [['山下', 'さん', 'は', '山下', 'くん', 'と', '東京特許許可局', 'へ', '行く', 'た', '。'],
            ['山下', 'さん', 'は', '山下', 'くん', 'と', '北海道', 'へ', '行く', 'た', '。'],
            ['山下', 'さん', 'は', '下山', 'くん', 'と', 'New York', 'へ', '行く', 'た', '。'],
            ['山上', 'さん', 'は', '山下', 'くん', 'と', '東京特許許可局', 'へ', '行く', 'た', '。'],]
    # The vectorizers expect one string per document, so join the tokens
    # with spaces.
    # NOTE(review): 'New York' contains a space and will therefore be split
    # into two terms by the tokenizer — confirm this is acceptable.
    docs = [' '.join(d) for d in docs]

    # Raw term frequencies. The original called tfidf() twice and never used
    # tf(), printing the same TF-IDF output two times.
    features, terms = tf(docs)
    print(terms)
    print(features.toarray())

    # TF-IDF weights.
    features, terms = tfidf(docs)
    print(terms)
    print(features.toarray())

    # Project the TF-IDF vectors onto 2 latent dimensions.
    features = reduction(features, dim=2)
    print(features)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment