Skip to content

Instantly share code, notes, and snippets.

@simoncos
Last active October 9, 2017 06:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save simoncos/88ffadcde31c03d6eadfa411649eba39 to your computer and use it in GitHub Desktop.
Save simoncos/88ffadcde31c03d6eadfa411649eba39 to your computer and use it in GitHub Desktop.
neu_feature_to_sklearn
# -*- coding: utf-8 -*-
import json
import pandas as pd
from pandas.io.json import json_normalize
from sklearn import feature_extraction
from sklearn.ensemble import AdaBoostClassifier
# f = open("data/neu_features.txt")
# for line in f:
# json.loads(line)
# f.close()
from itertools import islice
# Train a small AdaBoost classifier on the first few documents of the n-gram
# feature file, then classify one hand-written sample document.

# Each line of the file is parsed as one JSON document.  Read it explicitly as
# UTF-8: the n-grams are non-ASCII text, and the platform default encoding is
# not guaranteed to be UTF-8.
with open("neu_features.txt", encoding="utf-8") as myfile:
    # Only the first 5 lines are used as training data.
    head = list(islice(myfile, 5))

data_list = []    # one {ngram: count} mapping per document
target_list = []  # the 'label' value of each document, in the same order
for line in head:
    if not line.strip():
        # A whitespace-only line is not a JSON document; json.loads would
        # raise on it, so skip it.
        continue
    parsed_json = json.loads(line)
    doc_dict = {}
    # Merge the three n-gram lists into one flat mapping.  On a duplicate
    # n-gram the later list wins: skipgram, then bigram, then unigram.
    for gram_kind in ('skipgram', 'bigram', 'unigram'):
        for gram in parsed_json[gram_kind]:
            doc_dict[gram['ngram']] = gram['count']
    data_list.append(doc_dict)
    target_list.append(parsed_json['label'])
print(data_list)
print(target_list)

# Turn the per-document dicts into a dense feature matrix
# (one column per distinct n-gram seen in data_list).
v = feature_extraction.DictVectorizer(sparse=False)
X = v.fit_transform(data_list)

clf = AdaBoostClassifier()
clf.fit(X, target_list)

# Classify one sample, vectorized with the vocabulary learned above.
# NOTE(review): this sample appears to mirror one of the training documents,
# so it is a sanity check rather than a held-out test -- confirm intent.
X_test = v.transform([
    {
        '服>力親為': 1,
        '阿拉丁金>服': 1,
        '親>力親為': 1,
        '服>親': 1,
        '服': 1,
        '親': 1,
        '阿拉丁金>親': 1,
        '力親為': 1,
        '阿拉丁金': 1,
    },
])
print(clf.predict(X_test))
{"bigram":[{"count":1,"ngram":"透明>之旅"},{"count":1,"ngram":"赴>春天"},{"count":1,"ngram":"春天>約會"},{"count":1,"ngram":"之旅>廣東"},{"count":1,"ngram":"站>經濟"},{"count":1,"ngram":"經濟>赴"},{"count":1,"ngram":"廣東>站"}],"label":"中性","skipgram":[{"count":1,"ngram":"之旅>站"},{"count":1,"ngram":"透明>廣東"},{"count":1,"ngram":"站>赴"},{"count":1,"ngram":"赴>約會"},{"count":1,"ngram":"廣東>經濟"},{"count":1,"ngram":"經濟>春天"}],"unigram":[{"count":1,"ngram":"約會"},{"count":1,"ngram":"之旅"},{"count":1,"ngram":"經濟"},{"count":1,"ngram":"赴"},{"count":1,"ngram":"廣東"},{"count":1,"ngram":"春天"},{"count":1,"ngram":"站"},{"count":1,"ngram":"透明"}]}
{"bigram":[{"count":1,"ngram":"約會>組"},{"count":1,"ngram":"透明>之旅"},{"count":1,"ngram":"赴>春天"},{"count":1,"ngram":"春天>約會"},{"count":1,"ngram":"之旅>廣東"},{"count":1,"ngram":"站>經濟"},{"count":1,"ngram":"組>圖"},{"count":1,"ngram":"經濟>赴"},{"count":1,"ngram":"廣東>站"}],"label":"中性","skipgram":[{"count":1,"ngram":"之旅>站"},{"count":1,"ngram":"約會>圖"},{"count":1,"ngram":"透明>廣東"},{"count":1,"ngram":"站>赴"},{"count":1,"ngram":"赴>約會"},{"count":1,"ngram":"春天>組"},{"count":1,"ngram":"廣東>經濟"},{"count":1,"ngram":"經濟>春天"}],"unigram":[{"count":1,"ngram":"約會"},{"count":1,"ngram":"之旅"},{"count":1,"ngram":"經濟"},{"count":1,"ngram":"赴"},{"count":1,"ngram":"組"},{"count":1,"ngram":"廣東"},{"count":1,"ngram":"圖"},{"count":1,"ngram":"春天"},{"count":1,"ngram":"站"},{"count":1,"ngram":"透明"}]}
{"bigram":[{"count":1,"ngram":"透明>之旅"},{"count":1,"ngram":"赴>春天"},{"count":1,"ngram":"春天>約會"},{"count":1,"ngram":"經濟>赴"},{"count":1,"ngram":"之旅>經濟"}],"label":"中性","skipgram":[{"count":1,"ngram":"赴>約會"},{"count":1,"ngram":"之旅>赴"},{"count":1,"ngram":"透明>經濟"},{"count":1,"ngram":"經濟>春天"}],"unigram":[{"count":1,"ngram":"約會"},{"count":1,"ngram":"之旅"},{"count":1,"ngram":"經濟"},{"count":1,"ngram":"赴"},{"count":1,"ngram":"春天"},{"count":1,"ngram":"透明"}]}
{"bigram":[{"count":1,"ngram":"親>力親為"},{"count":1,"ngram":"服>親"},{"count":1,"ngram":"阿拉丁金>服"}],"label":"中性","skipgram":[{"count":1,"ngram":"阿拉丁金>親"},{"count":1,"ngram":"服>力親為"}],"unigram":[{"count":1,"ngram":"阿拉丁金"},{"count":1,"ngram":"親"},{"count":1,"ngram":"力親為"},{"count":1,"ngram":"服"}]}
{"bigram":[{"count":1,"ngram":"ppmoney>有利"},{"count":1,"ngram":"民>蘊"},{"count":1,"ngram":"財富>ppmoney"},{"count":1,"ngram":"蘊>財富"},{"count":1,"ngram":"有利>網"}],"label":"中性","skipgram":[{"count":1,"ngram":"蘊>ppmoney"},{"count":1,"ngram":"ppmoney>網"},{"count":1,"ngram":"民>財富"},{"count":1,"ngram":"財富>有利"}],"unigram":[{"count":1,"ngram":"民"},{"count":1,"ngram":"網"},{"count":1,"ngram":"蘊"},{"count":1,"ngram":"財富"},{"count":1,"ngram":"ppmoney"},{"count":1,"ngram":"有利"}]}
{"bigram":[{"count":1,"ngram":"深圳>停工"},{"count":1,"ngram":"海馬>深圳"},{"count":1,"ngram":"停工>安穩"},{"count":1,"ngram":"受>影響"},{"count":1,"ngram":"安穩>受"}],"label":"中性","skipgram":[{"count":1,"ngram":"海馬>停工"},{"count":1,"ngram":"深圳>安穩"},{"count":1,"ngram":"停工>受"},{"count":1,"ngram":"安穩>影響"}],"unigram":[{"count":1,"ngram":"影響"},{"count":1,"ngram":"停工"},{"count":1,"ngram":"受"},{"count":1,"ngram":"海馬"},{"count":1,"ngram":"安穩"},{"count":1,"ngram":"深圳"}]}
{"bigram":[{"count":1,"ngram":"金融>充滿"},{"count":1,"ngram":"獅子座>金融"},{"count":1,"ngram":"范>獅子座"},{"count":1,"ngram":"充滿>潛力"},{"count":1,"ngram":"理財>范"}],"label":"中性","skipgram":[{"count":1,"ngram":"范>金融"},{"count":1,"ngram":"金融>潛力"},{"count":1,"ngram":"理財>獅子座"},{"count":1,"ngram":"獅子座>充滿"}],"unigram":[{"count":1,"ngram":"獅子座"},{"count":1,"ngram":"充滿"},{"count":1,"ngram":"范"},{"count":1,"ngram":"理財"},{"count":1,"ngram":"潛力"},{"count":1,"ngram":"金融"}]}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment