Skip to content

Instantly share code, notes, and snippets.

@wannaphong
Last active December 11, 2018 12:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wannaphong/f386fd8b0520ebdbf2eddf8651a45b36 to your computer and use it in GitHub Desktop.
Save wannaphong/f386fd8b0520ebdbf2eddf8651a45b36 to your computer and use it in GitHub Desktop.
from sklearn_crfsuite import scorers,metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate,train_test_split
import sklearn_crfsuite
def doc2features(doc, i):
word = doc[i][0]
postag = doc[i][1]
# Features from current word
features={
'word.word': word,
'word.isspace':word.isspace(),
'postag':postag,
'word.isdigit()': word.isdigit()
}
if i > 0:
prevword = doc[i-1][0]
postag1 = doc[i-1][1]
features['word.prevword'] = prevword
features['word.previsspace']=prevword.isspace()
features['word.prepostag'] = postag1
features['word.prevwordisdigit'] = prevword.isdigit()
else:
features['BOS'] = True # Special "Beginning of Sequence" tag
# Features from next word
if i < len(doc)-1:
nextword = doc[i+1][0]
postag1 = doc[i+1][1]
features['word.nextword'] = nextword
features['word.nextisspace']=nextword.isspace()
features['word.nextpostag'] = postag1
features['word.nextwordisdigit'] = nextword.isdigit()
else:
features['EOS'] = True # Special "End of Sequence" tag
return features
def extract_features(doc):
return [doc2features(doc, i) for i in range(len(doc))]
def get_labels(doc):
return [tag for (token,postag,tag) in doc]
X_data = [extract_features(doc) for doc in datatofile] # เอา คำ แยกออกมา
y_data = [get_labels(doc) for doc in datatofile] # เอา tag แยกออกมา
X, X_test, y, y_test = train_test_split(X_data, y_data, test_size=0.1) # แบ่ง 0.1 หรือ 10%
crf = sklearn_crfsuite.CRF(
algorithm='lbfgs',
c1=0.1,
c2=0.1,
max_iterations=500,
all_possible_transitions=True,
model_filename=file_name+"-pos.model0" # ตั้งชื่อโมเดล
)
crf.fit(X, y); # train
labels = list(crf.classes_)
labels.remove('O')
y_pred = crf.predict(X_test)
e=metrics.flat_f1_score(y_test, y_pred,
average='weighted', labels=labels)
print(e) # โชว์ประสิทธิภาพ
sorted_labels = sorted(
labels,
key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
y_test, y_pred, labels=sorted_labels, digits=3
))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment