Skip to content

Instantly share code, notes, and snippets.

@YHTerrance
Last active May 16, 2022 08:15
Show Gist options
  • Save YHTerrance/3370b729c23d1b333e695ed4cd0e9b08 to your computer and use it in GitHub Desktop.
Save YHTerrance/3370b729c23d1b333e695ed4cd0e9b08 to your computer and use it in GitHub Desktop.
TFIDF
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "5a876275-54f0-4d6a-8530-7c83e1e2277c",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import os\n",
"from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
"import codecs\n",
"import jieba"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "772d9dde-ced1-4f60-acb6-65e18f31563e",
"metadata": {},
"outputs": [],
"source": [
"hotelFiles = os.listdir(\"./data/comments/keyPlayers\")\n",
"stopwords_file = \"./data/stopwords.dat\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "47027439-b3c8-443c-bbdb-931c59f0f2b8",
"metadata": {},
"outputs": [],
"source": [
"## 讀取檔案\n",
"def readfile(filepath):\n",
" fp = codecs.open(filepath, \"r\", encoding=\"utf-8\")\n",
" content = fp.read()\n",
" fp.close()\n",
" return content\n",
"\n",
"def read_words_list(filepath):\n",
" wordslist = readfile(filepath).splitlines()\n",
" return wordslist\n",
"\n",
"stopwords = read_words_list(stopwords_file) + [\" \"]\n",
"\n",
"def jieba_cut(sentence):\n",
" return [str(x) for x in jieba.cut(sentence, cut_all=False) if str(x) not in stopwords]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "19d35a77-1e49-4d3f-afc1-eff566834959",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Building prefix dict from the default dictionary ...\n",
"Loading model from cache /tmp/jieba.cache\n",
"Loading model cost 0.683 seconds.\n",
"Prefix dict has been built successfully.\n"
]
}
],
"source": [
"corpus = []\n",
"\n",
"for hotelFile in hotelFiles:\n",
" df = pd.read_csv(f\"./data/comments/keyPlayers/{hotelFile}\")\n",
" commentsOfHotel = \"\".join(df.loc[:,\"好留言\"].dropna())\n",
" commentsOfHotel = \" \".join(jieba_cut(commentsOfHotel))\n",
" corpus.append(commentsOfHotel) "
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "dee4a818-3958-45c8-a787-7022167fc026",
"metadata": {},
"outputs": [],
"source": [
"vectorizer = CountVectorizer()\n",
"X = vectorizer.fit_transform(corpus)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "57304119",
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "76645943-025d-423a-8570-34003866b52c",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"word = vectorizer.get_feature_names()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "b9b1fc62-b9e8-4ec9-bec5-76a8e830e381",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0, 0, 0, ..., 0, 0, 0],\n",
" [0, 0, 0, ..., 0, 0, 0],\n",
" [0, 0, 0, ..., 0, 0, 0],\n",
" ...,\n",
" [0, 0, 0, ..., 1, 0, 0],\n",
" [0, 0, 0, ..., 0, 0, 0],\n",
" [0, 0, 0, ..., 0, 0, 0]])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X.toarray()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "9c5a8957-a381-4710-ad22-31e1104bf96c",
"metadata": {},
"outputs": [],
"source": [
"transformer = TfidfTransformer()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "53d690f3-2741-4f86-8780-8d9ccd137bd6",
"metadata": {},
"outputs": [],
"source": [
"tfidf = transformer.fit_transform(X)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "fcddd9f6-2b1c-458f-bd0b-d454d3e8e0b6",
"metadata": {},
"outputs": [],
"source": [
"weight = tfidf.toarray()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "984f2e48-d183-4a58-adda-24043b83cd23",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"余舍行旅.csv\n",
"[('早餐', 0.46456436851633826), ('房間', 0.21776454774203355), ('牛排', 0.178824851899184), ('很棒', 0.1596940016774913), ('親切', 0.1596940016774913), ('好吃', 0.1451763651613557), ('服務人員', 0.1451763651613557), ('排餐', 0.134118638924388), ('飯店', 0.13065872864522013), ('乾淨', 0.11614109212908456), ('餐盒', 0.10607622222922101), ('親子', 0.10393006246572759), ('停車場', 0.101623455612949), ('大廳', 0.09978649910051496), ('自助', 0.0983219813427562), ('咖啡', 0.09347050138099551), ('一口', 0.089412425949592), ('上菜', 0.089412425949592), ('茶籽堂', 0.089412425949592), ('環境', 0.08710581909681342)]\n",
"達斯旅店.csv\n",
"[('夜市', 0.5815335351298657), ('逢甲', 0.40779162170053185), ('位置', 0.2148579512185598), ('地點', 0.1885488143346545), ('房間', 0.1885488143346545), ('停車場', 0.17539424589270186), ('下樓', 0.16415002803540593), ('乾淨', 0.16223967745074924), ('停車位', 0.1588597467717163), ('親切', 0.11839111597757376), ('樓下', 0.11052392248070454), ('車位', 0.09668631675415902), ('舒適', 0.09646683524098602), ('麥當勞', 0.09452035722037896), ('很近', 0.09443706980741409), ('停車', 0.08868911729086176), ('服務人員', 0.08769712294635093), ('便宜', 0.08438966231444466), ('很棒', 0.07892741065171584), ('免費', 0.07468557245546253)]\n",
"順天環匯酒店.csv\n",
"[('泳池', 0.5109325859178896), ('房間', 0.3017022852274329), ('游泳池', 0.23224208450813164), ('早餐', 0.20437896741213196), ('舒適', 0.1849143038490718), ('飯店', 0.16544964028601158), ('健身房', 0.1600005773919892), ('頂樓', 0.1542246999429643), ('乾淨', 0.1362526449414213), ('很棒', 0.1362526449414213), ('整體', 0.12432530498316274), ('無邊際', 0.11988058714181878), ('設施', 0.11396486290123252), ('舒服', 0.10705564959683103), ('浴缸', 0.10014594127910999), ('迎賓', 0.09886962069702977), ('環境', 0.09732331781530094), ('設備', 0.09732331781530094), ('給你', 0.08991044035636409), ('空間', 0.08288353665544183)]\n",
"探索私旅 Explore Hotel.csv\n",
"[('房間', 0.3499046985890953), ('早餐', 0.30470542794729677), ('電影', 0.2639782117841078), ('親切', 0.2556096684570674), ('乾淨', 0.25327177514800886), ('夜市', 0.25264052703648654), ('逢甲', 0.22833424651805106), ('服務人員', 0.19716233573060382), ('舒適', 0.15975604278566713), ('飯店', 0.15430095839786384), ('服務', 0.14728727847068823), ('藍光', 0.14638791744391433), ('好吃', 0.10754309221669299), ('很棒', 0.1067637944470068), ('人員', 0.10442590113794827), ('入住', 0.1028673055985759), ('環境', 0.1028673055985759), ('電視', 0.09207583234251927), ('停車', 0.09125517447942881), ('停車場', 0.090398541283597)]\n",
"台中裕元花園酒店.csv\n",
"[('房間', 0.2993563679843881), ('早餐', 0.2993563679843881), ('泳池', 0.20835278709188892), ('健身房', 0.20506044867877746), ('飯店', 0.17462454799089305), ('服務', 0.16215136599154353), ('三溫', 0.15364173922292004), ('親切', 0.14967818399219404), ('很棒', 0.13720500199284455), ('spa', 0.13341197887164383), ('舒適', 0.12473181999349503), ('態度', 0.11950364447520563), ('游泳池', 0.11905873548107938), ('好吃', 0.11225863799414554), ('服務人員', 0.11225863799414554), ('夜景', 0.10559466467404838), ('舒服', 0.09978545599479603), ('浴缸', 0.09167811403643177), ('高樓', 0.08929405161080954), ('乾淨', 0.08731227399544653)]\n",
"葉綠宿旅館 - 逢甲館.csv\n",
"[('逢甲', 0.37158231393381086), ('夜市', 0.3655700003443324), ('乾淨', 0.2523955339927772), ('房間', 0.21032961166064765), ('早餐', 0.1963076375499378), ('空間', 0.17912317899893238), ('舒適', 0.16125270227316318), ('很近', 0.15099630449005033), ('親切', 0.14723072816245336), ('環境', 0.1332087540517435), ('公共', 0.12498811426560215), ('舒服', 0.11918677994103366), ('背包客', 0.11711195213254598), ('地點', 0.11217579288567875), ('服務人員', 0.11217579288567875), ('位置', 0.0981538187749689), ('植物', 0.08635969914575813), ('盆栽', 0.08635969914575813), ('簡單', 0.08244941132773287), ('提供', 0.082098123707844)]\n",
"文華道會館 - In One City Inn.csv\n",
"[('夜市', 0.3987908408229342), ('逢甲', 0.22867831294441332), ('泡腳', 0.2166773835576271), ('房間', 0.1934970340298882), ('停車', 0.18725913313698495), ('停車場', 0.17590639457262564), ('早餐', 0.17590639457262564), ('不錯', 0.15831575511536308), ('乾淨', 0.15831575511536308), ('態度', 0.14980730650958796), ('位置', 0.1407251156581005), ('服務人員', 0.1407251156581005), ('親切', 0.1407251156581005), ('很近', 0.13957679428802697), ('文化', 0.10833869177881356), ('開業', 0.10833869177881356), ('cp', 0.10621574330105843), ('整體', 0.09362956656849247), ('好吃', 0.08795319728631282), ('舒服', 0.08795319728631282)]\n",
"Hotel 7 逢甲.csv\n",
"[('房間', 0.3729969493455771), ('夜市', 0.24325695288121096), ('早餐', 0.24014872081153593), ('逢甲', 0.2299296263089174), ('空間', 0.1958151554852874), ('乾淨', 0.19416279554975246), ('不錯', 0.1839437010471339), ('頂樓', 0.18218053197070327), ('舒適', 0.1635055120418968), ('閣樓', 0.15850748931777447), ('地點', 0.1430673230366597), ('服務人員', 0.1430673230366597), ('很棒', 0.13795777578535043), ('親切', 0.1226291340314226), ('整體', 0.10334688761723503), ('環境', 0.1021909450261855), ('公共', 0.1012114066503907), ('落地窗', 0.09754307034939969), ('整潔', 0.09708139777487623), ('走路', 0.09255735595313337)]\n",
"台中逢甲圖樂文旅.csv\n",
"[('爛爛爛', 0.965978409431068), ('夜市', 0.11413460412394073), ('房間', 0.09875307258284957), ('逢甲', 0.07067621861321588), ('乾淨', 0.06002637745232033), ('早餐', 0.0590582100740571), ('親切', 0.056153707939267404), ('很近', 0.0493851652459359), ('舒適', 0.04163119726531894), ('服務人員', 0.038726695130529246), ('地點', 0.03775852775226601), ('位置', 0.034854025617476324), ('舒服', 0.03291769086094986), ('很大', 0.03194952348268663), ('不錯', 0.02614051921310724), ('環境', 0.02614051921310724), ('服務', 0.022267849700054316), ('可愛的', 0.020710854025830002), ('很棒', 0.020331514943527855), ('入住', 0.019363347565264623)]\n",
"星享道酒店 .csv\n",
"[('早餐', 0.3365549034645201), ('夜市', 0.30201730916484315), ('逢甲', 0.23839305662070173), ('腳池', 0.2159169064134771), ('泡腳池', 0.18748747509731575), ('地點', 0.1542543307545717), ('房間', 0.1542543307545717), ('位置', 0.14023120977688336), ('舒適', 0.14023120977688336), ('親切', 0.14023120977688336), ('環境', 0.12620808879919504), ('浴缸', 0.12368423238332318), ('不錯', 0.11218496782150669), ('舒服', 0.11218496782150669), ('飯店', 0.11218496782150669), ('很近', 0.11126953495546851), ('停車', 0.10449706727125115), ('好吃', 0.09816184684381836), ('很棒', 0.09816184684381836), ('服務', 0.09816184684381836)]\n",
"逢甲金瑞文旅.csv\n",
"[('房間', 0.4158825117791048), ('乾淨', 0.37152171052266697), ('夜市', 0.31427725516343535), ('逢甲', 0.25507460722451764), ('地點', 0.18853340533986085), ('早餐', 0.18853340533986085), ('很大', 0.1663530047116419), ('舒適', 0.15526280439753246), ('整潔', 0.13862750392636827), ('空間', 0.12986539985651663), ('舒服', 0.12199220345520408), ('cp', 0.12053666216335177), ('環境', 0.10535690298403988), ('不錯', 0.09981180282698515), ('位置', 0.09426670266993042), ('浴室', 0.09426670266993042), ('試營', 0.0853790588820486), ('很近', 0.08171208634249319), ('服務人員', 0.07763140219876623), ('值高', 0.07336185619861398)]\n",
"MINI HOTELS(逢甲館).csv\n",
"[('早餐', 0.3341528192873411), ('房間', 0.3318323135978457), ('夜市', 0.31564392549682513), ('逢甲', 0.2761401770499555), ('乾淨', 0.17403792671215682), ('空間', 0.16797820325910096), ('舒適', 0.1508328698172026), ('不錯', 0.13690983568023005), ('八樓', 0.1357713707407958), ('親切', 0.1345893299907346), ('設計', 0.13170969879084818), ('舒服', 0.12298680154325749), ('飲料', 0.12251101659081028), ('地點', 0.12066629585376207), ('很棒', 0.11834579016426665), ('整體', 0.1161025816643786), ('位置', 0.11602528447477121), ('好吃', 0.1044227560272941), ('cp', 0.10088402460575606), ('環境', 0.09282022757981698)]\n",
"KUN Hotel逢甲.csv\n",
"[('夜市', 0.32209491668465196), ('逢甲', 0.3157241546986178), ('房間', 0.2981839238820279), ('早餐', 0.2280230006156684), ('乾淨', 0.20697472363576055), ('親切', 0.18241840049253474), ('舒適', 0.17540230816589877), ('位置', 0.1613701235126269), ('地點', 0.1613701235126269), ('不錯', 0.15435403118599092), ('停車', 0.15311246126601477), ('很棒', 0.13681380036940105), ('服務', 0.13681380036940105), ('飯店', 0.13681380036940105), ('停車場', 0.13330575420608307), ('服務人員', 0.13330575420608307), ('環境', 0.12628966187944712), ('很近', 0.1232708940398051), ('人員', 0.09822529257290331), ('服務態度', 0.09709570714430205)]\n",
"台中萬楓酒店.csv\n",
"[('房間', 0.3281621995926408), ('早餐', 0.3281621995926408), ('服務人員', 0.2187747997284272), ('親切', 0.2187747997284272), ('多一些', 0.1684260516391013), ('乾淨', 0.1640810997963204), ('舒服', 0.1640810997963204), ('飯店', 0.1640810997963204), ('保全', 0.14624966468329412), ('整潔', 0.136734249830267), ('舒適', 0.136734249830267), ('服務態度', 0.1164471008825341), ('好吃', 0.1093873998642136), ('品質', 0.10564214531794666), ('水準', 0.09990780975416028), ('專業', 0.09260448383443831), ('隔音', 0.08733532566190057), ('好好', 0.08616249382515542), ('以同', 0.08421302581955065), ('制服', 0.08421302581955065)]\n",
"豐邑逢甲商旅.csv\n",
"[('早餐', 0.5317758238674191), ('夜市', 0.3750659672308816), ('逢甲', 0.24816205113812886), ('房間', 0.24225343087293533), ('好吃', 0.16544136742541923), ('舒適', 0.16544136742541923), ('地點', 0.1595327471602257), ('入住', 0.12408102556906443), ('服務人員', 0.12408102556906443), ('乾淨', 0.1181724053038709), ('親切', 0.1181724053038709), ('電視', 0.10239051362800133), ('服務', 0.09453792424309672), ('選擇', 0.09434930818608929), ('很近', 0.0937664918077204), ('位置', 0.08862930397790317), ('很大', 0.08862930397790317), ('環境', 0.08272068371270962), ('泡澡', 0.08192777506363293), ('飯店', 0.07681206344751608)]\n"
]
}
],
"source": [
"for i in range(len(weight)):\n",
" # print(hotelFiles[i])\n",
" keywords = []\n",
" for j in range(len(word)):\n",
" keywords.append((word[j], weight[i][j]))\n",
" \n",
" keywords = sorted(keywords, key=lambda x: -x[1])[:20]\n",
" print(hotelFiles[i])\n",
" print(keywords) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "934ddca7-6f19-4e79-bdf9-77377a554921",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment