Skip to content

Instantly share code, notes, and snippets.

@mikeyee
Last active November 4, 2018 10:04
Show Gist options
  • Save mikeyee/dedb7f938cadb3e55074a8d01e52013c to your computer and use it in GitHub Desktop.
Save mikeyee/dedb7f938cadb3e55074a8d01e52013c to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"source": [
"import re\n",
"\n",
"import emoji\n",
"import jieba\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"from gensim.models.keyedvectors import KeyedVectors\n",
"\n",
"from emo_utils import *\n",
"\n%matplotlib inline"
],
"outputs": [],
"execution_count": 1,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [
"#讀入訓練及測試用的留言庫\n",
"\n",
"X_train, Y_train = read_csv('data/train_emoji_ch.csv')\n",
"X_test, Y_test = read_csv('data/tesss_ch.csv')"
],
"outputs": [],
"execution_count": 2,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [
"#測試emoji的參數\n",
"label_to_emoji(7)"
],
"outputs": [
{
"output_type": "execute_result",
"execution_count": 3,
"data": {
"text/plain": [
"'😡'"
]
},
"metadata": {}
}
],
"execution_count": 3,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# Find the longest training sentence and show how jieba segments it.\n",
"\n",
"longest_sentence = max(X_train, key=len)\n",
"text = list(jieba.cut(longest_sentence, cut_all=False))\n",
"\n",
"# Words missing from the embedding vocabulary are later split into single\n",
"# characters, so the worst-case token count of a sentence is its number of\n",
"# non-whitespace characters. Derive it from the data instead of hard-coding 18.\n",
"maxLen = max(len(''.join(sentence.split())) for sentence in X_train)\n",
"print(maxLen)\n",
"text"
],
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"Building prefix dict from the default dictionary ...\n",
"Loading model from cache /var/folders/bv/2q78w3811yg71lmsppx6lxrw0000gn/T/jieba.cache\n",
"Loading model cost 0.947 seconds.\n",
"Prefix dict has been built succesfully.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"18\n"
]
},
{
"output_type": "execute_result",
"execution_count": 4,
"data": {
"text/plain": [
"['我', '的', '程式', '明明', '可以', '執行', '但', '導師', '仍給', '我', '零分']"
]
},
"metadata": {}
}
],
"execution_count": 4,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [
"#閱讀訓練留言庫的留言\n",
"index = 7\n",
"print(X_train[index], label_to_emoji(Y_train[index]))"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"這功課太多了 😞\n"
]
}
],
"execution_count": 5,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [
"Y_oh_train = convert_to_one_hot(Y_train, C = 8)\n",
"Y_oh_test = convert_to_one_hot(Y_test, C = 8)"
],
"outputs": [],
"execution_count": 6,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [
"#將訓練留言的正確答案化為one hot 陣列\n",
"index = 0\n",
"print(Y_train[index], \"is converted into one hot\", Y_oh_train[index])"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"3 is converted into one hot [0. 0. 0. 1. 0. 0. 0. 0.]\n"
]
}
],
"execution_count": 7,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [
"#讀入facebook預先訓練好的中文文字向量\n",
"word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('data/wiki.zh.vec')"
],
"outputs": [],
"execution_count": 8,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# Look up one word in the loaded vectors: word -> index -> word, and the vector length\n",
"word = \"差\"\n",
"# Take the index from the vocabulary rather than hard-coding 235052, which is only\n",
"# valid for one particular vector file.\n",
"index = word_to_index[word]\n",
"print(\"the index of\", word, \"in the vocabulary is\", word_to_index[word])\n",
"print(\"the\", str(index) + \"th word in the vocabulary is\", index_to_word[index])\n",
"len(word_to_vec_map[word])"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"the index of 差 in the vocabulary is 235052\n",
"the 235052th word in the vocabulary is 差\n"
]
},
{
"output_type": "execute_result",
"execution_count": 9,
"data": {
"text/plain": [
"300"
]
},
"metadata": {}
}
],
"execution_count": 9,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# Average the word vectors of a comment\n",
"\n",
"def sentence_to_avg(sentence, word_to_vec_map):\n",
"    \"\"\"\n",
"    Average the word vectors of a sentence into a single sentence vector.\n",
"\n",
"    The sentence is segmented with jieba. A segment found in ``word_to_vec_map``\n",
"    is used as one token; a segment that is not found is split into its individual\n",
"    characters. Space tokens and characters that have no vector are skipped.\n",
"\n",
"    Arguments:\n",
"    sentence -- string, one example from X\n",
"    word_to_vec_map -- dictionary mapping each vocabulary word to its vector\n",
"                       (300-dimensional for the vectors loaded in this notebook)\n",
"\n",
"    Returns:\n",
"    avg -- numpy array of shape (d,), the mean of the token vectors, where d is the\n",
"           embedding dimension; all zeros when no token of the sentence has a vector\n",
"    \"\"\"\n",
"    tokens = []\n",
"    for segment in jieba.cut(sentence, cut_all=False):\n",
"        # Keep known words whole; fall back to single characters otherwise.\n",
"        pieces = [segment] if segment in word_to_vec_map else list(segment)\n",
"        # Dropping characters without a vector avoids a KeyError in the sum below.\n",
"        tokens.extend(p for p in pieces if p != ' ' and p in word_to_vec_map)\n",
"\n",
"    if not tokens:\n",
"        # Nothing to average: return a zero vector instead of dividing by zero.\n",
"        sample = next(iter(word_to_vec_map.values()), None)\n",
"        return np.zeros((300 if sample is None else len(sample),))\n",
"\n",
"    # Size the accumulator from the data rather than assuming 300 dimensions.\n",
"    avg = np.zeros((len(word_to_vec_map[tokens[0]]),))\n",
"    for token in tokens:\n",
"        avg += word_to_vec_map[token]\n",
"\n",
"    return avg / len(tokens)"
],
"outputs": [],
"execution_count": 10,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [
"#測試把留言的字詞向量加總及平均\n",
"\n",
"avg = sentence_to_avg(\"這是我一生最差勁的一天 \", word_to_vec_map)\n",
"print(\"avg = \", avg)"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"avg = [-1.13861875e-01 1.04477125e+00 -5.89520000e-01 3.24456250e-01\n",
" 7.38658750e-01 -4.64008750e-01 -5.60848750e-01 2.58263750e-02\n",
" 4.23486250e-01 -1.34124125e-01 9.67411250e-01 1.42290625e-01\n",
" 1.35261125e+00 -9.27590000e-01 2.08043500e-01 -6.24710000e-01\n",
" 9.57933750e-01 -4.19742250e-01 -1.12124625e+00 -6.56251250e-01\n",
" -9.99676250e-01 1.06239875e+00 1.18810875e+00 8.31875000e-03\n",
" 5.75907500e-01 -4.11583750e-01 9.39983750e-01 1.99731000e-01\n",
" 7.24997500e-01 2.47968175e-01 7.64845000e-01 -1.00287625e+00\n",
" -9.05686250e-01 -5.49693750e-01 -8.29576250e-01 1.19548875e+00\n",
" 1.05869750e+00 -5.47840000e-01 1.93496250e-01 1.26827750e+00\n",
" 7.34667500e-01 1.07951875e+00 1.17510625e+00 -1.09544750e+00\n",
" -3.72637425e-01 -4.29343750e-01 -7.19981250e-01 -3.67023750e-01\n",
" 5.13111375e-01 -8.94367500e-01 1.25980625e+00 -3.23177875e-01\n",
" -3.98428750e-01 -3.33925750e-01 4.74337750e-02 -9.53198750e-01\n",
" -1.10110875e+00 -1.26326375e+00 -3.35262500e+00 3.98726250e-01\n",
" 1.14972250e+00 -1.73711250e-01 8.78507500e-01 1.17542625e+00\n",
" 4.49652500e-01 8.36718750e-01 -9.26055000e-01 -7.36686250e-01\n",
" -5.71937500e-01 -3.44220563e-01 1.33897750e+00 -8.14458750e-01\n",
" 1.17143875e+00 -6.51095000e-01 -8.13095000e-01 4.27236625e-01\n",
" -1.26325313e-01 -8.62573750e-01 -5.51665000e-01 -1.07435750e+00\n",
" -3.31313750e-01 2.76122500e-03 7.34642500e-01 -7.46127500e-01\n",
" -6.92805000e-01 -9.29848750e-01 2.51736750e-01 5.15750000e-01\n",
" 8.19151250e-02 -1.02911625e+00 -6.78373750e-01 7.21015000e-01\n",
" -2.91189750e-01 -2.18004375e-01 -9.07598750e-01 1.01732000e+00\n",
" 6.72615000e-01 7.39838750e-01 3.13162500e-02 1.33909000e-01\n",
" 5.16993750e-01 7.64466250e-01 -5.07446250e-01 8.62035000e-01\n",
" -5.14123750e-01 6.94323750e-01 -6.69157500e-02 -6.91896250e-01\n",
" -8.54900000e-01 8.69995000e-02 3.28381250e-01 -7.18932500e-01\n",
" -1.10166787e-01 5.39598750e-01 2.46171625e-01 4.52513750e-01\n",
" -5.27893750e-01 2.25719250e-01 -8.03552500e-01 -9.48873750e-01\n",
" -8.62896250e-01 9.29968750e-01 6.87881250e-01 -7.01957500e-02\n",
" -6.60365000e-01 -8.36058750e-01 8.01417500e-01 8.33808750e-01\n",
" -8.51356250e-01 3.72643750e-01 8.79275000e-01 3.20965000e-01\n",
" -9.38807500e-01 9.33356250e-01 -1.07136500e+00 4.09523750e-01\n",
" -4.92283750e-01 -2.83116625e-01 -7.83887500e-01 -4.53477500e-01\n",
" 1.90866250e-02 7.95182500e-01 1.12467125e+00 -1.14985250e+00\n",
" -8.80171250e-01 3.48727375e-01 -1.42132000e+00 -4.00085000e-02\n",
" -7.22596250e-01 -2.97139738e-01 5.76632500e-01 7.57280000e-01\n",
" -2.39040000e-01 1.53809625e-01 1.01213625e+00 -7.82800000e-01\n",
" 3.56789875e-01 -4.20626250e-01 -4.67568000e-02 6.97733750e-01\n",
" 5.33088750e-01 2.95906500e-01 3.64505750e-01 -1.32033875e-01\n",
" -4.67241125e-02 3.72525000e-01 -2.80784750e-01 8.81593750e-01\n",
" 5.25787500e-01 3.54515000e-01 5.13618750e-01 6.84813750e-01\n",
" 1.07395750e+00 -1.06314563e-01 5.69411125e-02 8.81600000e-02\n",
" 6.70105000e-01 7.66377500e-01 -1.22160150e-01 -4.73730000e-01\n",
" -6.71396250e-01 -5.60345000e-01 2.17737125e-01 1.59381812e-01\n",
" -2.97697750e-01 8.97773750e-01 -5.75956375e-02 -6.32732500e-01\n",
" -4.58300000e-01 8.10427500e-01 -3.30479000e-01 -8.91516250e-02\n",
" -5.06517875e-01 -4.16750000e-01 -2.51331125e-01 -6.24966250e-01\n",
" -8.69408750e-01 -8.17258750e-01 3.27987500e-03 4.15771250e-01\n",
" 3.45661250e-02 -5.34625000e-01 -6.72638750e-02 -6.20112500e-01\n",
" -9.17050000e-03 1.18512750e-01 3.74020000e-01 -4.08962500e-01\n",
" -1.78093500e-01 4.28253750e-01 -9.24425000e-01 2.97356375e-01\n",
" -5.55003750e-01 5.52548750e-01 -6.87260000e-01 1.47368000e-01\n",
" 4.36581250e-01 -2.34388250e-01 1.02227375e+00 6.80735000e-01\n",
" -5.53666250e-01 7.15455000e-01 3.31547375e-01 -2.03136875e-01\n",
" -9.04048750e-01 7.41556250e-01 -1.52710375e-01 -3.47773750e-01\n",
" -9.16613750e-01 2.28735000e-02 -7.97330000e-01 -4.90311250e-01\n",
" 3.36850750e-01 -5.14460000e-01 -4.94883625e-02 1.04841000e+00\n",
" 3.49311625e-01 -9.82771250e-01 -2.46683125e-01 -5.05893750e-01\n",
" 7.70962500e-01 -7.81355000e-01 5.95733750e-01 -1.43723687e-01\n",
" -3.12608750e-01 -5.29705000e-03 -4.71833750e-01 -1.94240000e-01\n",
" -6.85587500e-01 4.58510000e-01 -3.72441250e-01 -1.05118125e-01\n",
" -8.62170000e-01 1.70604925e-01 -5.23317500e-01 9.52750000e-04\n",
" 4.80752500e-01 6.40448750e-01 -1.55436250e+00 -5.59417500e-02\n",
" 3.91768750e-01 -2.51005000e-01 -4.59052500e-01 -3.99057500e-02\n",
" -3.77393892e-01 -3.03079500e-01 -6.06856250e-01 -9.01596250e-01\n",
" -1.79545000e+00 -1.00723125e+00 7.72862500e-03 3.44258750e-01\n",
" 1.05848500e+00 -6.01907500e-01 -3.07066625e-02 -2.38434125e-01\n",
" -4.34837500e-02 8.50942500e-01 7.36421250e-02 -4.72792500e-01\n",
" 6.98252500e-01 1.26219250e+00 1.41322875e-01 6.67311250e-01\n",
" -2.27989913e-01 -2.26622500e-01 5.46474750e-01 3.89776250e-01\n",
" -9.58172500e-01 6.50233500e-02 -9.52265000e-01 -1.01913250e+00\n",
" -5.33670000e-01 -4.93423750e-01 -2.16531000e-02 -8.96391250e-01\n",
" 2.50648500e-01 3.73640362e-01 1.06248250e+00 2.75682000e-01]\n"
]
}
],
"execution_count": 11,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# The model parameters learning_rate and num_iterations can both be changed\n",
"\n",
"def model(X, Y, word_to_vec_map, learning_rate = 0.005, num_iterations = 1000):\n",
"    \"\"\"\n",
"    Train a softmax classifier on averaged word vectors with stochastic gradient descent.\n",
"\n",
"    Arguments:\n",
"    X -- input data, array of sentences as strings; X[i] is the i'th sentence\n",
"    Y -- labels, array of integers between 0 and 7 with one entry per sentence\n",
"    word_to_vec_map -- dictionary mapping each vocabulary word to its vector\n",
"    learning_rate -- learning rate for the stochastic gradient descent updates\n",
"    num_iterations -- number of passes over the training examples\n",
"\n",
"    Returns:\n",
"    pred -- predictions returned by predict() for X using the final W and b\n",
"    W -- weight matrix of the softmax layer, of shape (n_y, n_h)\n",
"    b -- bias of the softmax layer, of shape (n_y,)\n",
"\n",
"    Side effects:\n",
"    Seeds numpy's global random generator with 1. Prints the cost of the last\n",
"    training example every 100 iterations, followed by whatever predict() prints,\n",
"    and calls predict() once more after training.\n",
"    \"\"\"\n",
"    np.random.seed(1)\n",
"\n",
"    m = Y.shape[0]                          # number of training examples\n",
"    n_y = 8                                 # number of classes\n",
"\n",
"    # The sentence vectors do not depend on W or b, so compute them once instead\n",
"    # of re-segmenting every sentence in every iteration.\n",
"    X_avg = [sentence_to_avg(X[i], word_to_vec_map) for i in range(m)]\n",
"    n_h = len(X_avg[0]) if X_avg else 300   # dimension of the word vectors\n",
"\n",
"    # Scale the random initial weights by 1/sqrt(n_h)\n",
"    W = np.random.randn(n_y, n_h) / np.sqrt(n_h)\n",
"    b = np.zeros((n_y,))\n",
"\n",
"    Y_oh = convert_to_one_hot(Y, C = n_y)\n",
"\n",
"    for t in range(num_iterations):         # loop over the number of iterations\n",
"        for i in range(m):                  # loop over the training examples\n",
"            avg = X_avg[i]\n",
"\n",
"            # Forward propagate the average through the softmax layer\n",
"            z = np.dot(W, avg) + b\n",
"            a = softmax(z)\n",
"\n",
"            # Cross-entropy cost of the i'th example against its one-hot label\n",
"            cost = -np.sum(Y_oh[i] * np.log(a))\n",
"\n",
"            # Gradients of the cost with respect to z, W and b\n",
"            dz = a - Y_oh[i]\n",
"            dW = np.dot(dz.reshape(n_y,1), avg.reshape(1, n_h))\n",
"            db = dz\n",
"\n",
"            # Stochastic gradient descent update\n",
"            W = W - learning_rate * dW\n",
"            b = b - learning_rate * db\n",
"\n",
"        if t % 100 == 0:\n",
"            # Progress report only; the return value is not kept here because it\n",
"            # would be out of date by the time training finishes.\n",
"            print(\"Epoch: \" + str(t) + \" --- cost = \" + str(cost))\n",
"            predict(X, Y, W, b, word_to_vec_map)\n",
"\n",
"    # Predict with the final parameters so that pred matches the returned W and b\n",
"    # (and is defined even when num_iterations is 0).\n",
"    pred = predict(X, Y, W, b, word_to_vec_map)\n",
"\n",
"    return pred, W, b"
],
"outputs": [],
"execution_count": 12,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [
"#訓練模型\n",
"pred, W, b = model(X_train, Y_train, word_to_vec_map)\n",
"print(pred)"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Epoch: 0 --- cost = 2.838557314655528\n",
"Accuracy: 0.059602649006622516\n",
"Epoch: 100 --- cost = 1.208787957239081\n",
"Accuracy: 0.5960264900662252\n",
"Epoch: 200 --- cost = 0.619723973651847\n",
"Accuracy: 0.7218543046357616\n",
"Epoch: 300 --- cost = 0.4027164591549059\n",
"Accuracy: 0.7682119205298014\n",
"Epoch: 400 --- cost = 0.3002130588259839\n",
"Accuracy: 0.8013245033112583\n",
"Epoch: 500 --- cost = 0.23925132641522678\n",
"Accuracy: 0.8211920529801324\n",
"Epoch: 600 --- cost = 0.19674781080695078\n",
"Accuracy: 0.8609271523178808\n",
"Epoch: 700 --- cost = 0.1649123704857175\n",
"Accuracy: 0.8807947019867549\n",
"Epoch: 800 --- cost = 0.14047599147368353\n",
"Accuracy: 0.9139072847682119\n",
"Epoch: 900 --- cost = 0.12151579557825612\n",
"Accuracy: 0.9403973509933775\n",
"[[3.]\n",
" [2.]\n",
" [3.]\n",
" [0.]\n",
" [0.]\n",
" [3.]\n",
" [2.]\n",
" [3.]\n",
" [1.]\n",
" [3.]\n",
" [3.]\n",
" [1.]\n",
" [3.]\n",
" [2.]\n",
" [3.]\n",
" [2.]\n",
" [3.]\n",
" [1.]\n",
" [2.]\n",
" [7.]\n",
" [0.]\n",
" [2.]\n",
" [2.]\n",
" [4.]\n",
" [2.]\n",
" [2.]\n",
" [0.]\n",
" [7.]\n",
" [4.]\n",
" [2.]\n",
" [0.]\n",
" [3.]\n",
" [2.]\n",
" [2.]\n",
" [3.]\n",
" [4.]\n",
" [2.]\n",
" [2.]\n",
" [0.]\n",
" [2.]\n",
" [3.]\n",
" [0.]\n",
" [3.]\n",
" [2.]\n",
" [4.]\n",
" [7.]\n",
" [7.]\n",
" [4.]\n",
" [2.]\n",
" [1.]\n",
" [1.]\n",
" [1.]\n",
" [2.]\n",
" [0.]\n",
" [3.]\n",
" [4.]\n",
" [4.]\n",
" [2.]\n",
" [1.]\n",
" [2.]\n",
" [0.]\n",
" [3.]\n",
" [2.]\n",
" [2.]\n",
" [0.]\n",
" [0.]\n",
" [3.]\n",
" [2.]\n",
" [1.]\n",
" [2.]\n",
" [2.]\n",
" [4.]\n",
" [3.]\n",
" [3.]\n",
" [2.]\n",
" [4.]\n",
" [0.]\n",
" [0.]\n",
" [0.]\n",
" [3.]\n",
" [7.]\n",
" [3.]\n",
" [2.]\n",
" [0.]\n",
" [1.]\n",
" [2.]\n",
" [7.]\n",
" [0.]\n",
" [2.]\n",
" [2.]\n",
" [2.]\n",
" [3.]\n",
" [2.]\n",
" [2.]\n",
" [2.]\n",
" [4.]\n",
" [1.]\n",
" [3.]\n",
" [3.]\n",
" [4.]\n",
" [1.]\n",
" [1.]\n",
" [3.]\n",
" [1.]\n",
" [0.]\n",
" [4.]\n",
" [0.]\n",
" [3.]\n",
" [3.]\n",
" [4.]\n",
" [4.]\n",
" [4.]\n",
" [7.]\n",
" [2.]\n",
" [7.]\n",
" [0.]\n",
" [4.]\n",
" [4.]\n",
" [0.]\n",
" [3.]\n",
" [3.]\n",
" [7.]\n",
" [7.]\n",
" [3.]\n",
" [3.]\n",
" [2.]\n",
" [5.]\n",
" [5.]\n",
" [5.]\n",
" [5.]\n",
" [5.]\n",
" [5.]\n",
" [6.]\n",
" [7.]\n",
" [6.]\n",
" [5.]\n",
" [5.]\n",
" [1.]\n",
" [5.]\n",
" [7.]\n",
" [7.]\n",
" [7.]\n",
" [7.]\n",
" [7.]\n",
" [7.]\n",
" [7.]\n",
" [3.]\n",
" [0.]\n",
" [6.]\n",
" [6.]\n",
" [2.]]\n"
]
}
],
"execution_count": 13,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [
"print(\"Training set:\")\n",
"pred_train = predict(X_train, Y_train, W, b, word_to_vec_map)\n",
"print('Test set:')\n",
"pred_test = predict(X_test, Y_test, W, b, word_to_vec_map)"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Training set:\n",
"Accuracy: 0.9536423841059603\n",
"Test set:\n",
"Accuracy: 0.5882352941176471\n"
]
}
],
"execution_count": 14,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [
"#輸入多個留言看結果\n",
"X_my_sentences = np.array([\"政府可恥\", \"我憎恨你\", \"這位同學是白癡\", \"曼聯快搶攻\", \"真想吃薯條\", \"我可不可以去廁所\"])\n",
"Y_my_labels = np.array([[7], [7], [2], [1], [4],[6]])\n",
"\n",
"pred = predict(X_my_sentences, Y_my_labels , W, b, word_to_vec_map)\n",
"print_predictions(X_my_sentences, pred)"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Accuracy: 0.6666666666666666\n",
"\n",
"政府可恥 😡\n",
"我憎恨你 😞\n",
"這位同學是白癡 😄\n",
"曼聯快搶攻 ⚾\n",
"真想吃薯條 🍴\n",
"我可不可以去廁所 😡\n"
]
}
],
"execution_count": 15,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# Try a single sentence\n",
"\n",
"my_sentence=\"我訂了三文魚\"\n",
"X_my_sentences = np.array([my_sentence])\n",
"# predict() scores the predictions against the labels it is given, so supply one\n",
"# label for this one sentence (4 = 🍴) instead of reusing the six labels left over\n",
"# from the previous cell, which made the printed accuracy meaningless.\n",
"Y_my_labels = np.array([[4]])\n",
"pred = predict(X_my_sentences, Y_my_labels , W, b, word_to_vec_map)\n",
"print_predictions(X_my_sentences, pred)"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Accuracy: 0.16666666666666666\n",
"\n",
"我訂了三文魚 🍴\n"
]
}
],
"execution_count": 16,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [
"print(Y_test.shape)\n",
"print(' '+ label_to_emoji(0)+ ' ' + label_to_emoji(1) + ' ' + label_to_emoji(2)+ ' ' + label_to_emoji(3)+' ' + label_to_emoji(4)+' ' + label_to_emoji(5)+' ' + label_to_emoji(6)+' ' + label_to_emoji(7))\n",
"# Flatten the predictions with reshape(-1) so the cell works for any test-set size\n",
"# (the size used to be hard-coded as 51).\n",
"print(pd.crosstab(Y_test, pred_test.reshape(-1), rownames=['Actual'], colnames=['Predicted'], margins=True))\n",
"plot_confusion_matrix(Y_test, pred_test)"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"(51,)\n",
" ❤️ ⚾ 😄 😞 🍴 😭 💩 😡\n",
"Predicted 0.0 1.0 2.0 3.0 4.0 7.0 All\n",
"Actual \n",
"0 6 0 0 0 1 0 7\n",
"1 1 4 0 0 0 1 6\n",
"2 5 0 7 2 0 4 18\n",
"3 1 1 1 8 0 3 14\n",
"4 0 0 1 0 5 0 6\n",
"All 13 5 9 10 6 8 51\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/core/generic.py:7460: RuntimeWarning: '<' not supported between instances of 'str' and 'float', sort order is undefined for incomparable objects\n",
" return_indexers=True)\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"<Figure size 336x288 with 2 Axes>"
],
"image/png": [
"\n"
]
},
"metadata": {}
}
],
"execution_count": 17,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [
"# Load the same vectors with gensim to explore how word vectors relate to each other\n",
"# (KeyedVectors is imported in the first cell).\n",
"\n",
"word_vectors = KeyedVectors.load_word2vec_format(\"data/wiki.zh.vec\", binary = False)"
],
"outputs": [],
"execution_count": 18,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [
"word_vectors.most_similar('小貓', topn = 10)"
],
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
" if np.issubdtype(vec.dtype, np.int):\n"
]
},
{
"output_type": "execute_result",
"execution_count": 19,
"data": {
"text/plain": [
"[('愛犬', 0.9518550038337708),\n",
" ('花貓', 0.9500274062156677),\n",
" ('養貓', 0.9474766850471497),\n",
" ('貓', 0.947167158126831),\n",
" ('貓咪', 0.9470548033714294),\n",
" ('小臉', 0.9433143734931946),\n",
" ('猴兒', 0.9423208236694336),\n",
" ('貓叫', 0.9421137571334839),\n",
" ('小輩', 0.9417970180511475),\n",
" ('柴犬', 0.9417864680290222)]"
]
},
"metadata": {}
}
],
"execution_count": 19,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [
"word_vectors.similarity(\"電視\", \"劉德華\")"
],
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
" if np.issubdtype(vec.dtype, np.int):\n"
]
},
{
"output_type": "execute_result",
"execution_count": 20,
"data": {
"text/plain": [
"0.85784405"
]
},
"metadata": {}
}
],
"execution_count": 20,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [],
"outputs": [],
"execution_count": 21,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
},
{
"cell_type": "code",
"source": [],
"outputs": [],
"execution_count": 21,
"metadata": {
"collapsed": false,
"outputHidden": false,
"inputHidden": false
}
}
],
"metadata": {
"kernel_info": {
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.6.5",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"kernelspec": {
"name": "python3",
"language": "python",
"display_name": "Python 3"
},
"nteract": {
"version": "0.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment