Skip to content

Instantly share code, notes, and snippets.

@rasika
Forked from quwubin/cluster_example.py
Last active August 29, 2015 14:01
Show Gist options
  • Save rasika/692fae312c7776c672e3 to your computer and use it in GitHub Desktop.
Save rasika/692fae312c7776c672e3 to your computer and use it in GitHub Desktop.
import sys
from nltk import numpy
from nltk.cluster import kmeans, gaac, euclidean_distance
import nltk.corpus
from nltk import decorators
import nltk.stem
# Bound ``stem`` method of a Snowball English stemmer; normalize_word calls it
# on every lowercased token.
stemmer_func = nltk.stem.snowball.EnglishStemmer().stem
# English stopword list from the NLTK corpus; vectorspaced uses it to zero out
# vocabulary entries that are stopwords.
stopwords = nltk.corpus.stopwords.words("english")
@decorators.memoize
def normalize_word(word):
    """Return the normalized form of ``word``: lowercased, then stemmed.

    The function is wrapped in ``decorators.memoize``.
    """
    lowered = word.lower()
    return stemmer_func(lowered)
def get_words(job_titles):
    """Return the distinct normalized words found across ``job_titles``.

    Every title is split on whitespace and each token is passed through
    ``normalize_word``. Duplicates are collapsed; the returned list has no
    guaranteed order.
    """
    vocabulary = {
        normalize_word(token)
        for title in job_titles
        for token in title.split()
    }
    return list(vocabulary)
def vectorspaced(title, words):
    """Encode ``title`` as a binary vector over the vocabulary ``words``.

    Args:
        title: A whitespace-separated title string.
        words: The vocabulary, a sequence of normalized words; its order
            defines the order of the vector's components.

    Returns:
        A ``numpy.short`` array with one entry per item of ``words``. An
        entry is 1 when that word occurs among the normalized words of
        ``title`` and is not in ``stopwords``, otherwise 0.
    """
    # Build the title's words as a set once, so each vocabulary membership
    # test below is a hash lookup instead of a scan over a list.
    title_components = {normalize_word(word) for word in title.split()}
    # NOTE(review): vocabulary words are stemmed while ``stopwords`` holds
    # unstemmed forms, so a stopword whose stem differs from its surface form
    # may not be filtered here -- confirm whether that is intended.
    return numpy.array(
        [word in title_components and word not in stopwords for word in words],
        numpy.short,
    )
def doCluster(clusterer='gaac', filename='example.txt', num_clusters=5):
    """Cluster the titles listed in ``filename`` and print the assignments.

    The file is read as one title per line; surrounding whitespace is
    stripped and blank lines are ignored. Each title is turned into a binary
    bag-of-words vector with ``vectorspaced``, the chosen clusterer is
    trained on those vectors, and every title is then printed as
    ``<cluster_id> <title>``, sorted by cluster id and then by title.

    Args:
        clusterer: ``'gaac'`` for ``GAAClusterer`` or ``'kmeans'`` for
            ``KMeansClusterer`` (with ``euclidean_distance``).
        filename: Path of the text file holding the titles.
        num_clusters: Number of clusters requested from the clusterer.

    Raises:
        ValueError: If ``clusterer`` is not one of the supported names.
    """
    # ``with`` guarantees the file handle is released even if reading fails.
    with open(filename) as title_file:
        stripped_lines = [line.strip() for line in title_file]
    # Drop blank lines once, up front, so that training and classification
    # operate on exactly the same titles (a blank title would otherwise be
    # classified from an all-zero vector it was never trained on).
    job_titles = [title for title in stripped_lines if title]
    words = get_words(job_titles)

    # Honour the requested clusterer; the selection must not be overwritten
    # afterwards, otherwise the ``clusterer`` argument has no effect.
    if clusterer == 'gaac':
        model = nltk.cluster.gaac.GAAClusterer(num_clusters)
    elif clusterer == 'kmeans':
        model = nltk.cluster.kmeans.KMeansClusterer(num_clusters, euclidean_distance)
    else:
        raise ValueError(
            "unknown clusterer %r (expected 'gaac' or 'kmeans')" % (clusterer,)
        )

    vectors = [vectorspaced(title, words) for title in job_titles]
    model.cluster(vectors)

    # NOTE: This is inefficient, classify should really just be called when
    # you are classifying previously unseen examples!
    classified_examples = [model.classify(vector) for vector in vectors]

    for cluster_id, title in sorted(zip(classified_examples, job_titles)):
        # %-formatting through print(...) gives the same "id title" output on
        # both Python 2 and Python 3.
        print("%s %s" % (cluster_id, title))
if __name__ == '__main__':
    # A single command-line argument, when given, replaces the default
    # titles file; any other argument count falls back to 'example.txt'.
    filename = sys.argv[1] if len(sys.argv) == 2 else 'example.txt'
    doCluster('gaac', filename)
Not so skilled worker
Skilled worker
Banana picker
Police officer
Office worker
Fireman
IT consultant
Rapist of old ladies
Engineer
Stupid bastard son
Genious computer analyst
Computer banana peeler
Potato peeler
CEO of a major business
Business economist
Data analyst
Economist analyst bastard
Psychologist data enumerator
Psychologist genious
Evil genious
Murderer and rapist of cats
Cat psychologist
Top Software Engineer in IT with NLTK experience
xim
fission6
@rasika
Copy link
Author

rasika commented May 14, 2014

Changes made to comply with NLTK 3.0.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment