{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/anshul/cube26/problem_3\n"
]
}
],
"source": [
"cd /home/anshul/cube26/problem_3"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### importing required files"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append(\"/usr/local/lib/python2.7/dist-packages\")\n",
"import nltk\n",
"import json \n",
"from nltk.tokenize import RegexpTokenizer\n",
"from nltk.stem import PorterStemmer\n",
"from nltk.corpus import stopwords\n",
"import numpy as np\n",
"\n",
"import os\n",
"#import matplotlib.pyplot as plt\n",
"from sklearn.feature_extraction.text import TfidfTransformer\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.externals import joblib\n",
"transformer = TfidfTransformer()\n",
"ps=PorterStemmer()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Tokenizer"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"class Splitter(object):\n",
" def __init__(self):\n",
" self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')\n",
" self.nltk_tokenizer=RegexpTokenizer(r'\\w+')\n",
"\n",
" def split(self, text):\n",
" \"\"\"\n",
" input format: a paragraph of text\n",
" output format: a list of lists of words.\n",
" e.g.: ['this', 'is', 'a', 'sentence', 'this', 'is', 'another', 'one']\n",
" \"\"\"\n",
" #sentences = self.nltk_splitter.tokenize(text)\n",
" tokenized_sentences = [self.nltk_tokenizer.tokenize(sent) for sent in text]\n",
" \n",
" \"\"\"filtered_sentences=[]\n",
" for list_words in tokenized_sentences:\n",
" filtered_sentence=[w for w in list_words if not w in stop_words]\n",
" filtered_sentences.append(filtered_sentence)\"\"\"\n",
" return tokenized_sentences"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### This is text preprocessor(stemming, stopwords removal)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"class POSTagger(object):\n",
" def __init__(self):\n",
" pass\n",
" \n",
" def pos_tag(self, sentences, training):\n",
" \n",
" stemmed_scores= open(\"AFINN-111.txt\")\n",
" wordlist=[]\n",
" for line in stemmed_scores:\n",
" term ,score = line.split(\"\\t\")\n",
" wordlist.append(term)\n",
"\n",
" if training==1:\n",
" all_words=[]\n",
" for sentence in sentences:\n",
" for w in sentence:\n",
" all_words.append(ps.stem(w).lower())\n",
" \n",
" remove_list=set(wordlist + nltk.corpus.stopwords.words())\n",
" \n",
"\n",
" pos=[[((word).encode('utf-8').lower(),ps.stem(word).encode('utf-8').lower(),[]) for word in sentence if word not in remove_list] for sentence in sentences]\n",
" \n",
" #pos = [[(word, [postag]) for word in sentence] for sentence in pos]\n",
" if training==1:\n",
" return pos,all_words\n",
" else:\n",
" return pos\n"
]
},
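{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small sketch (not part of the original notebook) of what the two helpers above produce on a hypothetical document, assuming the NLTK `punkt` and `stopwords` data and the `AFINN-111.txt` file are available in the working directory."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"demo_docs = [\"Running quickly is better than walking slowly\"]  # hypothetical input\n",
"demo_tokens = Splitter().split(demo_docs)\n",
"print demo_tokens  # [['Running', 'quickly', 'is', 'better', 'than', 'walking', 'slowly']]\n",
"\n",
"# training mode also returns the stemmed, lowercased vocabulary of the corpus\n",
"demo_tagged, demo_vocab = POSTagger().pos_tag(demo_tokens, 1)\n",
"print demo_tagged  # (word, stem, []) triples, with stop words and AFINN terms removed"
]
},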
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Logic:\n",
"### Create Tf-idf matrix \n",
"### Apply kmeans and save the model\n",
"### for visualization apply PCA after kmeans and build a scatter plot"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"class TfidfCluster():\n",
" def __init__(self,fname,data):\n",
" self.test_data=data\n",
" self.fname= fname\n",
"\n",
"\n",
" def find_features(self,document):\n",
" words=set(document)\n",
" features=[]\n",
" for w in word_features:\n",
" features.append(w in words)\n",
" return features\n",
"\n",
" def find_TfidfFeatures(self,document,word_features):\n",
" words=document\n",
" features=[]\n",
" for w in word_features:\n",
" features.append(words.count(w))\n",
"\n",
" return features\n",
"\n",
" def clustering_code(self):\n",
" splitter = Splitter()\n",
" postagger = POSTagger()\n",
" \"\"\"Creating Training Set\"\"\"\n",
" document=[]\n",
" doc= pd.read_csv('SampleData.csv')\n",
" for sentence in doc[\"full_article\"][:200]:\n",
" document.append(str(sentence))\n",
" #document=''.join(document)\n",
" listSentences=splitter.split(document)\t\n",
" #print listSentences\n",
" sentences_tags,all_words= postagger.pos_tag(listSentences,1)\n",
" all_words=nltk.FreqDist(all_words)\n",
"\n",
" col=1000\n",
" word_features= [w for (w,c) in all_words.most_common(col)]\n",
"\n",
" def create_tfidf_set(sentences_tags,word_features):\n",
"\n",
" feature_sets=np.zeros((len(sentences_tags),col))\n",
" i=0\n",
" for sent in sentences_tags:\n",
" feature_sets[i,]= self.find_TfidfFeatures([word[1] for word in sent],word_features)\n",
" i=i+1\n",
" tfidf=transformer.fit_transform(feature_sets)\n",
" feature_sets= tfidf.toarray()\n",
" return pd.DataFrame(feature_sets,columns=word_features)\n",
"\n",
" feature_sets= create_tfidf_set(sentences_tags,word_features)\n",
"\n",
"\n",
" if not os.path.isfile('kmeans_model.pkl'):\n",
" kmeans_model = KMeans(n_clusters=3, random_state=2)\n",
" good_columns = feature_sets._get_numeric_data().dropna(axis=1)\n",
" kmeans_model.fit(good_columns)\n",
" train_labels = kmeans_model.labels_\n",
" cluster_centers= kmeans_model.cluster_centers_\n",
" \n",
" from sklearn.decomposition import PCA\n",
" pca_2 = PCA(2)\n",
" plot = pca_2.fit(feature_sets)\n",
" plot_columns= plot.transform(feature_sets)\n",
" import matplotlib.pyplot as plt\n",
"\n",
" fig = plt.figure()\n",
" fig.set_size_inches(7,7)\n",
"\n",
" plt.scatter(x=plot_columns[:,0], y=plot_columns[:,1], c=train_labels)\n",
" plt.show()\t\t\t\n",
" \"\"\"saving the model\"\"\"\n",
" joblib.dump(kmeans_model, 'kmeans_model.pkl') \n",
"\n",
" else:\n",
" kmeans_model = joblib.load('kmeans_model.pkl') \n",
"\n",
" \"\"\" Creating testing data\"\"\"\n",
"\n",
"\n",
"\n",
" listSentences=splitter.split(self.test_data)\n",
"\n",
" test_sentences= postagger.pos_tag(listSentences,0)\n",
" test_set=create_tfidf_set(test_sentences,word_features)\n",
" predicted_labels=kmeans_model.predict(test_set)\n",
" \n",
"\n",
" train_labels=list(train_labels)\n",
"\n",
" indices=[[j for j, x in enumerate(train_labels) if x == i] for i in range(10)] \n",
"\n",
" sentences_set=[]\n",
" for index,sentence in enumerate(sentences_tags):\n",
" concat_sentence=' '\n",
" for word in sentence:\n",
" concat_sentence=concat_sentence + word[0]+ ' '\n",
" sentences_set.append(concat_sentence)\n",
" sentences_set =np.array(sentences_set) \n",
"\n",
" for i in range(10):\n",
" print \"\\nThis is cluster label \" + str(i) +\"\\n\" \n",
" \n",
" print doc[\"heading\"][indices[i]]"
]
},
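{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Usage sketch (not from the original notebook)\n",
"A minimal sketch of how `TfidfCluster` might be invoked, assuming SampleData.csv (with `full_article` and `heading` columns), AFINN-111.txt, and the NLTK data used above are available, and that the test data, like the training documents, is a list of article strings. The example articles below are hypothetical placeholders."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# hypothetical test articles; replace with real article text\n",
"test_articles = [\n",
"    \"The central bank raised interest rates again this quarter.\",\n",
"    \"The home side won the final after a late goal in extra time.\"\n",
"]\n",
"\n",
"# the first run fits and saves kmeans_model.pkl; later runs reuse the saved model\n",
"cluster = TfidfCluster('kmeans_model.pkl', test_articles)\n",
"cluster.clustering_code()"
]
},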
{
"cell_type": "markdown",
"metadata": {},
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}