{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"### Python的文本挖掘\n",
"- 本文主要演练三种文本挖掘方法\n",
" - 使用的是sogou的语料库http://www.sogou.com/labs/dl/c.html\n",
" - 常规的词袋模型用于分类\n",
" - 使用word2vec得到词向量,再对词汇进行聚类,用类编号作为特征再进行分类\n",
" - 使用word2vec得到词向量,对文档中的词向量平均化作为文档向量,用文档向量作为特征进行分类"
]
},
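{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cells below assume a Python 2 environment with `jieba`, `gensim`, `pandas`, `numpy` and `scikit-learn` available; a minimal setup sketch (the pip commands are an assumption about your environment):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# run once in a shell to install the third-party packages used below\n",
"# pip install jieba gensim pandas numpy scikit-learn"
]
},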
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from os import path\n",
"import os\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['SogouC.reduced/Reduced/C000008',\n",
" 'SogouC.reduced/Reduced/C000010',\n",
" 'SogouC.reduced/Reduced/C000013',\n",
" 'SogouC.reduced/Reduced/C000014',\n",
" 'SogouC.reduced/Reduced/C000016',\n",
" 'SogouC.reduced/Reduced/C000020',\n",
" 'SogouC.reduced/Reduced/C000022',\n",
" 'SogouC.reduced/Reduced/C000023',\n",
" 'SogouC.reduced/Reduced/C000024']"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rootdir = 'SogouC.reduced/Reduced'\n",
"dirs = os.listdir(rootdir)\n",
"dirs = [path.join(rootdir,f) for f in dirs if f.startswith('C')]\n",
"dirs"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def load_txt(x):\n",
" with open(x) as f:\n",
" res = [t.decode('gbk','ignore') for t in f]\n",
" return ''.join(res)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"text_t = {}\n",
"for i, d in enumerate(dirs):\n",
" files = os.listdir(d)\n",
" files = [path.join(d, x) for x in files if x.endswith('txt') and not x.startswith('.')]\n",
" text_t[i] = [load_txt(f) for f in files]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# to dataframe\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"flen = [len(t) for t in text_t.values()]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"labels = np.repeat(text_t.keys(),flen)"
]
},
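{
"cell_type": "markdown",
"metadata": {},
"source": [
"`np.repeat` expands each class id by the number of documents in its folder; a toy illustration (made-up counts, not corpus data):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# toy example: class 0 has 2 documents, class 1 has 3\n",
"np.repeat([0, 1], [2, 3])  # -> array([0, 0, 1, 1, 1])"
]
},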
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# flatter nested list\n",
"import itertools\n",
"merged = list(itertools.chain.from_iterable(text_t.values()))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>label</th>\n",
" <th>txt</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> 0</td>\n",
" <td>   本报记者陈雪频实习记者唐翔发自上海\\r\\n  一家刚刚成立两年的网络支付公司,它的目标是...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> 0</td>\n",
" <td> 证券通:百联股份未来5年有能力保持高速增长\\r\\n\\r\\n 深度报告 权威内参...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> 0</td>\n",
" <td> 5月09日消息快评\\r\\n\\r\\n 深度报告 权威内参 来自“证券通”www....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> 0</td>\n",
" <td> 5月09日消息快评\\r\\n\\r\\n 深度报告 权威内参 来自“证券通”www....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> 0</td>\n",
" <td> 5月09日消息快评\\r\\n\\r\\n 深度报告 权威内参 来自“证券通”www....</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" label txt\n",
"0 0   本报记者陈雪频实习记者唐翔发自上海\\r\\n  一家刚刚成立两年的网络支付公司,它的目标是...\n",
"1 0 证券通:百联股份未来5年有能力保持高速增长\\r\\n\\r\\n 深度报告 权威内参...\n",
"2 0 5月09日消息快评\\r\\n\\r\\n 深度报告 权威内参 来自“证券通”www....\n",
"3 0 5月09日消息快评\\r\\n\\r\\n 深度报告 权威内参 来自“证券通”www....\n",
"4 0 5月09日消息快评\\r\\n\\r\\n 深度报告 权威内参 来自“证券通”www...."
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame({'label': labels, 'txt': merged})\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Building Trie..., from /Users/xiaokai/anaconda/lib/python2.7/site-packages/jieba/dict.txt\n",
"DEBUG:jieba:Building Trie..., from /Users/xiaokai/anaconda/lib/python2.7/site-packages/jieba/dict.txt\n",
"dumping model to file cache /var/folders/7t/2bxpnffn2r9fdg94r9fk6t4h0000gn/T/jieba.cache\n",
"DEBUG:jieba:dumping model to file cache /var/folders/7t/2bxpnffn2r9fdg94r9fk6t4h0000gn/T/jieba.cache\n",
"loading model cost 3.71529507637 seconds.\n",
"DEBUG:jieba:loading model cost 3.71529507637 seconds.\n",
"Trie has been built succesfully.\n",
"DEBUG:jieba:Trie has been built succesfully.\n"
]
}
],
"source": [
"# cut word\n",
"import jieba\n",
"jieba.enable_parallel(4)\n",
"def cutword_1(x):\n",
" words = jieba.cut(x)\n",
" return ' '.join(words)"
]
},
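{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick look at what the segmentation step produces, using a made-up sentence rather than corpus text:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# jieba splits the sentence into space-joined words\n",
"print cutword_1(u'我爱自然语言处理')"
]
},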
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df['seg_word'] = df.txt.map(cutword_1)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from cPickle import dump,load\n",
"#dump(df, open('df.pickle', 'wb'))\n",
"df = load(open('df.pickle','rb'))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(17903, 10000)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# model \n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"vect = TfidfVectorizer(ngram_range=(1,1), min_df = 2, max_features = 10000)\n",
"xvec = vect.fit_transform(df.seg_word)\n",
"xvec.shape"
]
},
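{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small sanity check on the fitted vectorizer, assuming the cell above has run: peek at the first few terms of the learned vocabulary."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# the first ten terms in the TF-IDF vocabulary\n",
"print vect.get_feature_names()[:10]"
]
},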
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"y = df.label"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.cross_validation import train_test_split\n",
"train_X, test_X, train_y, test_y = train_test_split(xvec, y , train_size=0.7, random_state=1)\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"clf = MultinomialNB()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf.fit(train_X, train_y)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.91 0.88 0.89 576\n",
" 1 0.86 0.83 0.84 604\n",
" 2 0.88 0.83 0.86 616\n",
" 3 0.99 0.97 0.98 580\n",
" 4 0.87 0.88 0.88 597\n",
" 5 0.88 0.80 0.83 607\n",
" 6 0.78 0.89 0.83 599\n",
" 7 0.74 0.79 0.76 613\n",
" 8 0.92 0.93 0.92 579\n",
"\n",
"avg / total 0.87 0.86 0.87 5371\n",
"\n"
]
}
],
"source": [
"from sklearn import metrics\n",
"pre = clf.predict(test_X)\n",
"print metrics.classification_report(test_y, pre)\n",
"#print metrics.confusion_matrix(test_y, pre)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# word2vec\n",
"txt = df.seg_word.values\n",
"txtlist = []\n",
"for sent in txt:\n",
" temp = [w for w in sent.split()]\n",
" txtlist.append(temp)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"num_features = 100\n",
"min_word_count = 10\n",
"num_workers = 4\n",
"context = 20\n",
"epoch = 20\n",
"sample = 1e-5"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from gensim.models import word2vec"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"model = word2vec.Word2Vec(txtlist, workers = num_workers,\n",
" sample = sample,\n",
" size = num_features,\n",
" min_count=min_word_count,\n",
" window = context,\n",
" iter = epoch)"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(57675, 100)"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.syn0.shape"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"在线 0.809367001057\n",
"网络 0.792132735252\n",
"网民 0.789814114571\n",
"网站 0.766795158386\n",
"网络广告 0.763081729412\n",
"门户网站 0.757833242416\n",
"互联网内容 0.728336572647\n",
"访问量 0.703088879585\n",
"商业模式 0.701648652554\n",
"Web2 0.698530614376\n"
]
}
],
"source": [
"for w in model.most_similar(u'互联网'):\n",
" print w[0], w[1]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#model.save('sogo_wv')\n",
"model = word2vec.Word2Vec.load('sogo_wv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# kmeans based on word_vec\n",
"from sklearn.cluster import KMeans\n",
"word_vectors = model.syn0\n",
"num_clusters = word_vectors.shape[0]//20\n",
"kmeans_clustering = KMeans(n_clusters = num_clusters)\n",
"idx = kmeans_clustering.fit_predict(word_vectors)"
]
},
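{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick check of how evenly k-means spread the vocabulary across clusters, assuming `idx` from the cell above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# summary statistics of cluster sizes\n",
"print pd.Series(idx).value_counts().describe()"
]
},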
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"word_centroid_map = dict(zip(model.index2word, idx))\n",
"word_centroid_df = pd.DataFrame(zip( model.index2word, idx )) \n",
"word_centroid_df.columns = ['word','cluster'] \n",
"word_centroid_df.head() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# 观察前十个群的效果 \n",
"for cluster in xrange(10): \n",
" print \"\\nCluster %d\" % cluster \n",
" words = word_centroid_df.ix[word_centroid_df.cluster==cluster,'word'].values \n",
" print ' '.join(words) "
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# 观察有很多词的大群 \n",
"big_cluster = word_centroid_df.groupby('cluster').apply(lambda x: len(x.word)).reset_index() \n",
"big_cluster.columns = ['cluster','word_num'] \n",
"key_cluster = big_cluster.ix[big_cluster['word_num']>=10,'cluster'].values \n",
"\n",
"\n",
"def create_bag_of_centroids( wordlist, word_centroid_map ): \n",
" # 从词到类别编号的映射函数 \n",
" # wordlist是文本中的词,word_centroid_map是诩到编号的dict \n",
" num_centroids = max( word_centroid_map.values() ) + 1 \n",
" bag_of_centroids = np.zeros( num_centroids, dtype=\"float32\" ) \n",
" for word in wordlist: \n",
" if word in word_centroid_map: \n",
" index = word_centroid_map[word] \n",
" if index in key_cluster: \n",
" bag_of_centroids[index] += 1 \n",
" return bag_of_centroids "
]
},
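{
"cell_type": "markdown",
"metadata": {},
"source": [
"An illustrative check, assuming the objects defined above: map the first document's words into cluster space and count how many of the kept clusters it touches."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"bag0 = create_bag_of_centroids(txtlist[0], word_centroid_map)\n",
"print bag0.shape, (bag0 > 0).sum()"
]
},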
{
"cell_type": "code",
"execution_count": 112,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# 从原始文本映射成群编号 \n",
"train_centroids = np.zeros( (len(txtlist), num_clusters),dtype=\"float32\" ) \n",
"for i, review in enumerate(txtlist): \n",
" train_centroids[i] = create_bag_of_centroids( review,word_centroid_map ) \n",
"# 变为0-1特征 \n",
"train_centroids = np.where(train_centroids>0,1,0) \n",
"train_centroids_df = pd.DataFrame(train_centroids) \n",
"train_centroids_df= train_centroids_df.ix[:,train_centroids.sum(axis=0)!=0] "
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(17910, 1429)"
]
},
"execution_count": 113,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_centroids_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.linear_model import SGDClassifier\n",
"from sklearn.cross_validation import train_test_split\n",
"train_X, test_X, train_y, test_y = train_test_split(train_centroids_df.values, y , train_size=0.7, random_state=1)\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"clf = SGDClassifier()"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.88 0.91 0.90 577\n",
" 1 0.74 0.86 0.80 603\n",
" 2 0.83 0.86 0.84 619\n",
" 3 0.85 0.73 0.78 584\n",
" 4 0.82 0.75 0.78 570\n",
" 5 0.75 0.74 0.75 600\n",
" 6 0.82 0.82 0.82 600\n",
" 7 0.99 0.94 0.96 615\n",
" 8 0.71 0.75 0.73 605\n",
"\n",
"avg / total 0.82 0.82 0.82 5373\n",
"\n",
"[[525 1 1 1 9 4 3 0 33]\n",
" [ 0 519 9 13 22 18 2 1 19]\n",
" [ 3 14 533 2 4 47 10 0 6]\n",
" [ 1 52 30 425 13 22 11 0 30]\n",
" [ 8 53 6 21 429 5 10 1 37]\n",
" [ 26 19 26 18 10 447 31 1 22]\n",
" [ 7 8 19 5 9 27 491 1 33]\n",
" [ 1 3 11 1 1 9 6 578 5]\n",
" [ 23 29 8 14 26 14 37 1 453]]\n"
]
}
],
"source": [
"clf.fit(train_X, train_y)\n",
"from sklearn import metrics\n",
"pre = clf.predict(test_X)\n",
"print metrics.classification_report(test_y, pre)\n",
"print metrics.confusion_matrix(test_y, pre)"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(17910, 100)"
]
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 将词向量平均化为文档向量 \n",
"def sentvec(sent,m=num_features,model=model): \n",
" res = np.zeros(m) \n",
" words = sent.split() \n",
" num = 0 \n",
" for w in words: \n",
" if w in model.index2word: \n",
" res += model[w] \n",
" num += 1.0 \n",
" if num == 0: return np.zeros(m) \n",
" else: return res/num \n",
" \n",
"n = df.shape[0] \n",
"sent_matrix = np.zeros([n,num_features],float) \n",
"for i ,sent in enumerate(df.seg_word.values): \n",
" sent_matrix[i,:] = sentvec(sent) \n",
"sent_matrix.shape "
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.ensemble import GradientBoostingClassifier\n",
"from sklearn.cross_validation import train_test_split\n",
"train_X, test_X, train_y, test_y = train_test_split(sent_matrix, y , train_size=0.7, random_state=1)\n",
"clf = GradientBoostingClassifier()"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.93 0.93 0.93 577\n",
" 1 0.83 0.84 0.84 603\n",
" 2 0.91 0.85 0.88 619\n",
" 3 0.85 0.87 0.86 584\n",
" 4 0.85 0.83 0.84 570\n",
" 5 0.83 0.80 0.81 600\n",
" 6 0.88 0.88 0.88 600\n",
" 7 0.97 0.96 0.97 615\n",
" 8 0.76 0.83 0.80 605\n",
"\n",
"avg / total 0.87 0.87 0.87 5373\n",
"\n",
"[[539 1 2 3 5 3 6 1 17]\n",
" [ 0 507 4 20 20 9 4 1 38]\n",
" [ 3 10 529 11 6 35 12 3 10]\n",
" [ 0 25 8 509 11 13 5 1 12]\n",
" [ 6 27 4 17 472 9 5 2 28]\n",
" [ 15 12 22 15 12 477 19 4 24]\n",
" [ 6 7 8 9 3 13 530 3 21]\n",
" [ 0 1 0 2 5 4 5 592 6]\n",
" [ 9 20 6 10 22 12 19 2 505]]\n"
]
}
],
"source": [
"clf.fit(train_X, train_y)\n",
"from sklearn import metrics\n",
"pre = clf.predict(test_X)\n",
"print metrics.classification_report(test_y, pre)\n",
"print metrics.confusion_matrix(test_y, pre)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 0
}