@yumaueno
Created April 11, 2021 06:24
Classification of Nishika text data with MeCab × LightGBM
# Install MeCab and its Python bindings (run in Colab)
!apt install aptitude
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
!pip install mecab-python3==0.7
import pandas as pd
import numpy as np
import collections
import MeCab
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
df = pd.read_csv("/content/drive/MyDrive/Stabiz/python/data-science/nishika_text_data/train.csv")
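# Quick check (added, not in the original): the script assumes train.csv has
# "writing_id", "body" (document text), and a binary "author" label, all of
# which are referenced below.
print(df.shape)
print(df.columns.tolist())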
m = MeCab.Tagger("")
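# For reference (added): Tagger.parse returns one token per line in the form
# "surface\tfeatures", terminated by an "EOS" line, e.g.:
print(m.parse("すもももももももものうち"))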
text_list = []
length_list = []
# Run morphological analysis on each document
for sentence in df["body"]:
    ma = m.parse(sentence)
    word_list = []
    # Keep only the surface form of each token, skipping the terminating
    # "EOS" marker and the trailing empty line
    for text in ma.split("\n"):
        if text in ("EOS", ""):
            continue
        word_list.append(text.split("\t")[0])
    # Record the document length in tokens
    length_list.append(len(word_list))
    # Count the frequency of each token
    data = collections.Counter(word_list)
    text_data = pd.DataFrame.from_dict(data, orient='index')
    text_list.append(text_data)
feature = pd.concat(text_list, axis=1)
# Replace NaN (word absent from a document) with 0
feature = feature.fillna(0)
# Total count of each word across all documents
feature_temp = feature.values.sum(axis=1)
# Number of top words to keep
K = 100
# Indices of the K most frequent words
indices = np.argpartition(-feature_temp, K)[:K]
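# Illustration (added, not in the original): np.argpartition returns the
# positions of the K largest values without fully sorting them.
demo = np.array([3, 1, 4, 1, 5])
print(np.argpartition(-demo, 2)[:2])  # positions of 5 and 4, e.g. [4 2]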
## For each document, use as features the counts of the globally most frequent
## top-K words divided by that document's total token count ##
modi_feature = []
for index, row in feature.iloc[indices].T.reset_index(drop=True).iterrows():
    modi_feature_temp = row / length_list[index]
    modi_feature.append(modi_feature_temp)
modi_feature = pd.concat(modi_feature, axis=1).T
# Join the engineered features back onto the original dataframe
df = pd.concat([df, modi_feature], axis=1)
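# Sanity check (added): modi_feature should be (n_documents, K)
print(modi_feature.shape)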
## Implementation with LightGBM ##
df = df.drop(["writing_id", "body"], axis=1)
df_train, df_val = train_test_split(df, test_size=0.2)
col = "author"
train_y = df_train[col]
train_x = df_train.drop(col, axis=1)
val_y = df_val[col]
val_x = df_val.drop(col, axis=1)
trains = lgb.Dataset(train_x.values, train_y)
valids = lgb.Dataset(val_x.values, val_y)
params = {
    "objective": "binary",
    "metric": "binary_logloss"
}
model = lgb.train(params, trains, valid_sets=valids, num_boost_round=1000, early_stopping_rounds=100)
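# Note (added): in LightGBM >= 4.0 the early_stopping_rounds argument was
# removed from lgb.train; the equivalent call there is
# lgb.train(params, trains, valid_sets=valids, num_boost_round=1000,
#           callbacks=[lgb.early_stopping(100)])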
## Predict: the model returns probabilities, so threshold at 0.5 to map to 0/1
predict_list = []
for i in model.predict(val_x):
    if i > 0.5:
        predict = 1
    else:
        predict = 0
    predict_list.append(predict)
f1_score(val_y, predict_list)
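# Vectorized alternative (added for reference, not in the original gist):
# the loop above is equivalent to thresholding the whole score array at once.
predict_arr = (model.predict(val_x) > 0.5).astype(int)
f1_score(val_y, predict_arr)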