Created
April 11, 2021 06:24
-
-
Save yumaueno/df87c7893b61d6d5b5e71a2eca6648ce to your computer and use it in GitHub Desktop.
MeCab×LightGBMでNishikaのテキストデータを分類予測
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
!apt install aptitude | |
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y | |
!pip install mecab-python3==0.7 | |
# Standard library first, then third-party, alphabetized within each group.
# (Scrape artifacts "| |" removed; no imports were dropped.)
import collections

import lightgbm as lgb
import MeCab
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
# Load the Nishika training data and tokenize each document body with MeCab,
# producing one word-frequency table per document plus per-document token counts.
df = pd.read_csv("/content/drive/MyDrive/Stabiz/python/data-science/nishika_text_data/train.csv")

m = MeCab.Tagger("")

text_list = []    # per-document word-frequency DataFrames (index = word)
length_list = []  # per-document total token counts

# Morphologically analyze each document
for sentence in df["body"]:
    ma = m.parse(sentence)
    word_list = []
    # Each output line is "surface\tfeatures"; keep only the surface form.
    for line in ma.split("\n"):
        # Fix: MeCab output ends with an "EOS" line and a trailing empty
        # string; the original counted both as words. Skip them.
        if line == "" or line == "EOS":
            continue
        word_list.append(line.split("\t")[0])
    # Total number of tokens in this document
    length_list.append(len(word_list))
    # Word -> occurrence count for this document
    data = collections.Counter(word_list)
    text_data = pd.DataFrame.from_dict(data, orient='index')
    text_list.append(text_data)
# Combine the per-document frequency tables: rows = words (union over the
# corpus), one column per document.
feature = pd.concat(text_list, axis=1)
# A word absent from a document shows up as NaN; treat as zero occurrences.
feature = feature.fillna(0)
# Total occurrences of each word across the whole corpus.
feature_temp = feature.values.sum(axis=1)
# Number of top-frequency words to keep as features.
K = 100
# Indices of the K most frequent words (argpartition avoids a full sort).
indices = np.argpartition(-feature_temp, K)[:K]
## For each document, divide its counts of the globally top-K words by the
## document's total token count, yielding relative frequencies as features.
modi_feature = []
for index, row in feature.iloc[indices].T.reset_index(drop=True).iterrows():
    modi_feature_temp = row / length_list[index]
    modi_feature.append(modi_feature_temp)
modi_feature = pd.concat(modi_feature, axis=1).T
# Append the engineered features to the original frame (row-aligned by position).
df = pd.concat([df, modi_feature], axis=1)
## Train a binary LightGBM classifier on the engineered features.
# Drop the identifier and raw-text columns; only numeric features remain.
df = df.drop(["writing_id", "body"], axis=1)

# NOTE(review): no random_state is set, so the split (and the resulting score)
# is not reproducible across runs — confirm whether that is intended.
df_train, df_val = train_test_split(df, test_size=0.2)

col = "author"
train_y = df_train[col]
train_x = df_train.drop(col, axis=1)
val_y = df_val[col]
val_x = df_val.drop(col, axis=1)

trains = lgb.Dataset(train_x.values, train_y)
valids = lgb.Dataset(val_x.values, val_y)

params = {
    "objective": "binary",
    # Fix: the canonical parameter name is "metric" ("metrics" is only a
    # LightGBM alias); use the documented spelling.
    "metric": "binary_logloss",
}

# NOTE(review): lightgbm >= 4.0 removed the early_stopping_rounds keyword from
# lgb.train; if upgrading, pass callbacks=[lgb.early_stopping(100)] instead.
model = lgb.train(params, trains, valid_sets=valids, num_boost_round=1000, early_stopping_rounds=100)
## Predict: model.predict returns probabilities for the positive class, so
## threshold at 0.5 to obtain hard 0/1 labels (same rule as the original
## append loop, written as a comprehension; p == 0.5 maps to 0).
predict_list = [int(p > 0.5) for p in model.predict(val_x)]
# F1 score of the hard predictions on the validation split.
f1_score(val_y, predict_list)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment