Last active
May 16, 2022 08:15
-
-
Save YHTerrance/3370b729c23d1b333e695ed4cd0e9b08 to your computer and use it in GitHub Desktop.
TFIDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "5a876275-54f0-4d6a-8530-7c83e1e2277c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import os\n", | |
"from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n", | |
"import codecs\n", | |
"import jieba" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "772d9dde-ced1-4f60-acb6-65e18f31563e", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"hotelFiles = os.listdir(\"./data/comments/keyPlayers\")\n", | |
"stopwords_file = \"./data/stopwords.dat\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "47027439-b3c8-443c-bbdb-931c59f0f2b8", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"## 讀取檔案\n", | |
"def readfile(filepath):\n", | |
" fp = codecs.open(filepath, \"r\", encoding=\"utf-8\")\n", | |
" content = fp.read()\n", | |
" fp.close()\n", | |
" return content\n", | |
"\n", | |
"def read_words_list(filepath):\n", | |
" wordslist = readfile(filepath).splitlines()\n", | |
" return wordslist\n", | |
"\n", | |
"stopwords = read_words_list(stopwords_file) + [\" \"]\n", | |
"\n", | |
"def jieba_cut(sentence):\n", | |
" return [str(x) for x in jieba.cut(sentence, cut_all=False) if str(x) not in stopwords]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "19d35a77-1e49-4d3f-afc1-eff566834959", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"Building prefix dict from the default dictionary ...\n", | |
"Loading model from cache /tmp/jieba.cache\n", | |
"Loading model cost 0.683 seconds.\n", | |
"Prefix dict has been built successfully.\n" | |
] | |
} | |
], | |
"source": [ | |
"corpus = []\n", | |
"\n", | |
"for hotelFile in hotelFiles:\n", | |
" df = pd.read_csv(f\"./data/comments/keyPlayers/{hotelFile}\")\n", | |
" commentsOfHotel = \"\".join(df.loc[:,\"好留言\"].dropna())\n", | |
" commentsOfHotel = \" \".join(jieba_cut(commentsOfHotel))\n", | |
" corpus.append(commentsOfHotel) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "dee4a818-3958-45c8-a787-7022167fc026", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Bag-of-words term counts: one row per document in `corpus`, one column per vocabulary term.\n", | |
"# NOTE(review): CountVectorizer's default token_pattern keeps only tokens of 2+ characters,\n", | |
"# so single-character Chinese words are dropped — confirm this is intended.\n", | |
"vectorizer = CountVectorizer()\n", | |
"X = vectorizer.fit_transform(corpus)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "57304119", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import warnings\n", | |
"warnings.filterwarnings('ignore')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "76645943-025d-423a-8570-34003866b52c", | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"word = vectorizer.get_feature_names()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "b9b1fc62-b9e8-4ec9-bec5-76a8e830e381", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[0, 0, 0, ..., 0, 0, 0],\n", | |
" [0, 0, 0, ..., 0, 0, 0],\n", | |
" [0, 0, 0, ..., 0, 0, 0],\n", | |
" ...,\n", | |
" [0, 0, 0, ..., 1, 0, 0],\n", | |
" [0, 0, 0, ..., 0, 0, 0],\n", | |
" [0, 0, 0, ..., 0, 0, 0]])" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Convert the count matrix to a dense array so the cell output shows its values.\n", | |
"X.toarray()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "9c5a8957-a381-4710-ad22-31e1104bf96c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# TF-IDF re-weighting with scikit-learn's default settings.\n", | |
"transformer = TfidfTransformer()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "53d690f3-2741-4f86-8780-8d9ccd137bd6", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Fit the IDF weights on the count matrix X and transform it into TF-IDF scores.\n", | |
"tfidf = transformer.fit_transform(X)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "fcddd9f6-2b1c-458f-bd0b-d454d3e8e0b6", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Dense array of TF-IDF scores; row i corresponds to hotelFiles[i], column j to word[j].\n", | |
"weight = tfidf.toarray()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "984f2e48-d183-4a58-adda-24043b83cd23", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"余舍行旅.csv\n", | |
"[('早餐', 0.46456436851633826), ('房間', 0.21776454774203355), ('牛排', 0.178824851899184), ('很棒', 0.1596940016774913), ('親切', 0.1596940016774913), ('好吃', 0.1451763651613557), ('服務人員', 0.1451763651613557), ('排餐', 0.134118638924388), ('飯店', 0.13065872864522013), ('乾淨', 0.11614109212908456), ('餐盒', 0.10607622222922101), ('親子', 0.10393006246572759), ('停車場', 0.101623455612949), ('大廳', 0.09978649910051496), ('自助', 0.0983219813427562), ('咖啡', 0.09347050138099551), ('一口', 0.089412425949592), ('上菜', 0.089412425949592), ('茶籽堂', 0.089412425949592), ('環境', 0.08710581909681342)]\n", | |
"達斯旅店.csv\n", | |
"[('夜市', 0.5815335351298657), ('逢甲', 0.40779162170053185), ('位置', 0.2148579512185598), ('地點', 0.1885488143346545), ('房間', 0.1885488143346545), ('停車場', 0.17539424589270186), ('下樓', 0.16415002803540593), ('乾淨', 0.16223967745074924), ('停車位', 0.1588597467717163), ('親切', 0.11839111597757376), ('樓下', 0.11052392248070454), ('車位', 0.09668631675415902), ('舒適', 0.09646683524098602), ('麥當勞', 0.09452035722037896), ('很近', 0.09443706980741409), ('停車', 0.08868911729086176), ('服務人員', 0.08769712294635093), ('便宜', 0.08438966231444466), ('很棒', 0.07892741065171584), ('免費', 0.07468557245546253)]\n", | |
"順天環匯酒店.csv\n", | |
"[('泳池', 0.5109325859178896), ('房間', 0.3017022852274329), ('游泳池', 0.23224208450813164), ('早餐', 0.20437896741213196), ('舒適', 0.1849143038490718), ('飯店', 0.16544964028601158), ('健身房', 0.1600005773919892), ('頂樓', 0.1542246999429643), ('乾淨', 0.1362526449414213), ('很棒', 0.1362526449414213), ('整體', 0.12432530498316274), ('無邊際', 0.11988058714181878), ('設施', 0.11396486290123252), ('舒服', 0.10705564959683103), ('浴缸', 0.10014594127910999), ('迎賓', 0.09886962069702977), ('環境', 0.09732331781530094), ('設備', 0.09732331781530094), ('給你', 0.08991044035636409), ('空間', 0.08288353665544183)]\n", | |
"探索私旅 Explore Hotel.csv\n", | |
"[('房間', 0.3499046985890953), ('早餐', 0.30470542794729677), ('電影', 0.2639782117841078), ('親切', 0.2556096684570674), ('乾淨', 0.25327177514800886), ('夜市', 0.25264052703648654), ('逢甲', 0.22833424651805106), ('服務人員', 0.19716233573060382), ('舒適', 0.15975604278566713), ('飯店', 0.15430095839786384), ('服務', 0.14728727847068823), ('藍光', 0.14638791744391433), ('好吃', 0.10754309221669299), ('很棒', 0.1067637944470068), ('人員', 0.10442590113794827), ('入住', 0.1028673055985759), ('環境', 0.1028673055985759), ('電視', 0.09207583234251927), ('停車', 0.09125517447942881), ('停車場', 0.090398541283597)]\n", | |
"台中裕元花園酒店.csv\n", | |
"[('房間', 0.2993563679843881), ('早餐', 0.2993563679843881), ('泳池', 0.20835278709188892), ('健身房', 0.20506044867877746), ('飯店', 0.17462454799089305), ('服務', 0.16215136599154353), ('三溫', 0.15364173922292004), ('親切', 0.14967818399219404), ('很棒', 0.13720500199284455), ('spa', 0.13341197887164383), ('舒適', 0.12473181999349503), ('態度', 0.11950364447520563), ('游泳池', 0.11905873548107938), ('好吃', 0.11225863799414554), ('服務人員', 0.11225863799414554), ('夜景', 0.10559466467404838), ('舒服', 0.09978545599479603), ('浴缸', 0.09167811403643177), ('高樓', 0.08929405161080954), ('乾淨', 0.08731227399544653)]\n", | |
"葉綠宿旅館 - 逢甲館.csv\n", | |
"[('逢甲', 0.37158231393381086), ('夜市', 0.3655700003443324), ('乾淨', 0.2523955339927772), ('房間', 0.21032961166064765), ('早餐', 0.1963076375499378), ('空間', 0.17912317899893238), ('舒適', 0.16125270227316318), ('很近', 0.15099630449005033), ('親切', 0.14723072816245336), ('環境', 0.1332087540517435), ('公共', 0.12498811426560215), ('舒服', 0.11918677994103366), ('背包客', 0.11711195213254598), ('地點', 0.11217579288567875), ('服務人員', 0.11217579288567875), ('位置', 0.0981538187749689), ('植物', 0.08635969914575813), ('盆栽', 0.08635969914575813), ('簡單', 0.08244941132773287), ('提供', 0.082098123707844)]\n", | |
"文華道會館 - In One City Inn.csv\n", | |
"[('夜市', 0.3987908408229342), ('逢甲', 0.22867831294441332), ('泡腳', 0.2166773835576271), ('房間', 0.1934970340298882), ('停車', 0.18725913313698495), ('停車場', 0.17590639457262564), ('早餐', 0.17590639457262564), ('不錯', 0.15831575511536308), ('乾淨', 0.15831575511536308), ('態度', 0.14980730650958796), ('位置', 0.1407251156581005), ('服務人員', 0.1407251156581005), ('親切', 0.1407251156581005), ('很近', 0.13957679428802697), ('文化', 0.10833869177881356), ('開業', 0.10833869177881356), ('cp', 0.10621574330105843), ('整體', 0.09362956656849247), ('好吃', 0.08795319728631282), ('舒服', 0.08795319728631282)]\n", | |
"Hotel 7 逢甲.csv\n", | |
"[('房間', 0.3729969493455771), ('夜市', 0.24325695288121096), ('早餐', 0.24014872081153593), ('逢甲', 0.2299296263089174), ('空間', 0.1958151554852874), ('乾淨', 0.19416279554975246), ('不錯', 0.1839437010471339), ('頂樓', 0.18218053197070327), ('舒適', 0.1635055120418968), ('閣樓', 0.15850748931777447), ('地點', 0.1430673230366597), ('服務人員', 0.1430673230366597), ('很棒', 0.13795777578535043), ('親切', 0.1226291340314226), ('整體', 0.10334688761723503), ('環境', 0.1021909450261855), ('公共', 0.1012114066503907), ('落地窗', 0.09754307034939969), ('整潔', 0.09708139777487623), ('走路', 0.09255735595313337)]\n", | |
"台中逢甲圖樂文旅.csv\n", | |
"[('爛爛爛', 0.965978409431068), ('夜市', 0.11413460412394073), ('房間', 0.09875307258284957), ('逢甲', 0.07067621861321588), ('乾淨', 0.06002637745232033), ('早餐', 0.0590582100740571), ('親切', 0.056153707939267404), ('很近', 0.0493851652459359), ('舒適', 0.04163119726531894), ('服務人員', 0.038726695130529246), ('地點', 0.03775852775226601), ('位置', 0.034854025617476324), ('舒服', 0.03291769086094986), ('很大', 0.03194952348268663), ('不錯', 0.02614051921310724), ('環境', 0.02614051921310724), ('服務', 0.022267849700054316), ('可愛的', 0.020710854025830002), ('很棒', 0.020331514943527855), ('入住', 0.019363347565264623)]\n", | |
"星享道酒店 .csv\n", | |
"[('早餐', 0.3365549034645201), ('夜市', 0.30201730916484315), ('逢甲', 0.23839305662070173), ('腳池', 0.2159169064134771), ('泡腳池', 0.18748747509731575), ('地點', 0.1542543307545717), ('房間', 0.1542543307545717), ('位置', 0.14023120977688336), ('舒適', 0.14023120977688336), ('親切', 0.14023120977688336), ('環境', 0.12620808879919504), ('浴缸', 0.12368423238332318), ('不錯', 0.11218496782150669), ('舒服', 0.11218496782150669), ('飯店', 0.11218496782150669), ('很近', 0.11126953495546851), ('停車', 0.10449706727125115), ('好吃', 0.09816184684381836), ('很棒', 0.09816184684381836), ('服務', 0.09816184684381836)]\n", | |
"逢甲金瑞文旅.csv\n", | |
"[('房間', 0.4158825117791048), ('乾淨', 0.37152171052266697), ('夜市', 0.31427725516343535), ('逢甲', 0.25507460722451764), ('地點', 0.18853340533986085), ('早餐', 0.18853340533986085), ('很大', 0.1663530047116419), ('舒適', 0.15526280439753246), ('整潔', 0.13862750392636827), ('空間', 0.12986539985651663), ('舒服', 0.12199220345520408), ('cp', 0.12053666216335177), ('環境', 0.10535690298403988), ('不錯', 0.09981180282698515), ('位置', 0.09426670266993042), ('浴室', 0.09426670266993042), ('試營', 0.0853790588820486), ('很近', 0.08171208634249319), ('服務人員', 0.07763140219876623), ('值高', 0.07336185619861398)]\n", | |
"MINI HOTELS(逢甲館).csv\n", | |
"[('早餐', 0.3341528192873411), ('房間', 0.3318323135978457), ('夜市', 0.31564392549682513), ('逢甲', 0.2761401770499555), ('乾淨', 0.17403792671215682), ('空間', 0.16797820325910096), ('舒適', 0.1508328698172026), ('不錯', 0.13690983568023005), ('八樓', 0.1357713707407958), ('親切', 0.1345893299907346), ('設計', 0.13170969879084818), ('舒服', 0.12298680154325749), ('飲料', 0.12251101659081028), ('地點', 0.12066629585376207), ('很棒', 0.11834579016426665), ('整體', 0.1161025816643786), ('位置', 0.11602528447477121), ('好吃', 0.1044227560272941), ('cp', 0.10088402460575606), ('環境', 0.09282022757981698)]\n", | |
"KUN Hotel逢甲.csv\n", | |
"[('夜市', 0.32209491668465196), ('逢甲', 0.3157241546986178), ('房間', 0.2981839238820279), ('早餐', 0.2280230006156684), ('乾淨', 0.20697472363576055), ('親切', 0.18241840049253474), ('舒適', 0.17540230816589877), ('位置', 0.1613701235126269), ('地點', 0.1613701235126269), ('不錯', 0.15435403118599092), ('停車', 0.15311246126601477), ('很棒', 0.13681380036940105), ('服務', 0.13681380036940105), ('飯店', 0.13681380036940105), ('停車場', 0.13330575420608307), ('服務人員', 0.13330575420608307), ('環境', 0.12628966187944712), ('很近', 0.1232708940398051), ('人員', 0.09822529257290331), ('服務態度', 0.09709570714430205)]\n", | |
"台中萬楓酒店.csv\n", | |
"[('房間', 0.3281621995926408), ('早餐', 0.3281621995926408), ('服務人員', 0.2187747997284272), ('親切', 0.2187747997284272), ('多一些', 0.1684260516391013), ('乾淨', 0.1640810997963204), ('舒服', 0.1640810997963204), ('飯店', 0.1640810997963204), ('保全', 0.14624966468329412), ('整潔', 0.136734249830267), ('舒適', 0.136734249830267), ('服務態度', 0.1164471008825341), ('好吃', 0.1093873998642136), ('品質', 0.10564214531794666), ('水準', 0.09990780975416028), ('專業', 0.09260448383443831), ('隔音', 0.08733532566190057), ('好好', 0.08616249382515542), ('以同', 0.08421302581955065), ('制服', 0.08421302581955065)]\n", | |
"豐邑逢甲商旅.csv\n", | |
"[('早餐', 0.5317758238674191), ('夜市', 0.3750659672308816), ('逢甲', 0.24816205113812886), ('房間', 0.24225343087293533), ('好吃', 0.16544136742541923), ('舒適', 0.16544136742541923), ('地點', 0.1595327471602257), ('入住', 0.12408102556906443), ('服務人員', 0.12408102556906443), ('乾淨', 0.1181724053038709), ('親切', 0.1181724053038709), ('電視', 0.10239051362800133), ('服務', 0.09453792424309672), ('選擇', 0.09434930818608929), ('很近', 0.0937664918077204), ('位置', 0.08862930397790317), ('很大', 0.08862930397790317), ('環境', 0.08272068371270962), ('泡澡', 0.08192777506363293), ('飯店', 0.07681206344751608)]\n" | |
] | |
} | |
], | |
"source": [ | |
"for i in range(len(weight)):\n", | |
" # print(hotelFiles[i])\n", | |
" keywords = []\n", | |
" for j in range(len(word)):\n", | |
" keywords.append((word[j], weight[i][j]))\n", | |
" \n", | |
" keywords = sorted(keywords, key=lambda x: -x[1])[:20]\n", | |
" print(hotelFiles[i])\n", | |
" print(keywords) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "934ddca7-6f19-4e79-bdf9-77377a554921", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.7" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment