# wannaphong/train.py

## train.py
from sklearn_crfsuite import scorers,metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate,train_test_split
import sklearn_crfsuite
def doc2features(doc, i):
    """Build the feature dict for the token at position ``i`` of ``doc``.

    Each item of ``doc`` is indexed as ``item[0]`` (the word, a string) and
    ``item[1]`` (its POS tag). The result describes the current token and,
    when they exist, its left and right neighbours.

    Args:
        doc: Sequence of token records for one sentence/document.
        i: Index of the token to describe.

    Returns:
        dict mapping feature names to values. ``'BOS'`` is set when there is
        no previous token and ``'EOS'`` when there is no next token.
    """
    token, pos = doc[i][0], doc[i][1]

    # Features of the token itself.
    feats = {
        'word.word': token,
        'word.isspace': token.isspace(),
        'postag': pos,
        'word.isdigit()': token.isdigit(),
    }

    # Left context, or the "Beginning of Sequence" marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        left_token, left_pos = doc[i - 1][0], doc[i - 1][1]
        feats.update({
            'word.prevword': left_token,
            'word.previsspace': left_token.isspace(),
            'word.prepostag': left_pos,
            'word.prevwordisdigit': left_token.isdigit(),
        })

    # Right context, or the "End of Sequence" marker.
    if i >= len(doc) - 1:
        feats['EOS'] = True
    else:
        right_token, right_pos = doc[i + 1][0], doc[i + 1][1]
        feats.update({
            'word.nextword': right_token,
            'word.nextisspace': right_token.isspace(),
            'word.nextpostag': right_pos,
            'word.nextwordisdigit': right_token.isdigit(),
        })

    return feats

def extract_features(doc):
    """Return the list of per-token feature dicts for ``doc``, in token order.

    Calls ``doc2features(doc, position)`` once for every index of ``doc``.
    """
    rows = []
    for position in range(len(doc)):
        rows.append(doc2features(doc, position))
    return rows

def get_labels(doc):
    """Return the tag (third field) of every record in ``doc``, in order.

    Every record must unpack into exactly three values
    ``(token, postag, tag)``; only the last one is kept.
    """
    collected = []
    for _word, _pos, label in doc:
        collected.append(label)
    return collected


# NOTE(review): `datatofile` and `file_name` are not defined in this part of
# the file; they are assumed to be provided earlier -- confirm.
X_data = [extract_features(doc) for doc in datatofile]  # per-token feature dicts
y_data = [get_labels(doc) for doc in datatofile]  # per-token tags

# Hold out 0.1 (10%) of the documents for evaluation.
X, X_test, y, y_test = train_test_split(X_data, y_data, test_size=0.1)
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=500,
    all_possible_transitions=True,
    model_filename=file_name+"-pos.model0"  # name of the model file
)
crf.fit(X, y)  # train

# Score every learned label except 'O'. Filtering instead of
# `labels.remove('O')` avoids a ValueError when the fitted model has no 'O'
# class at all.
labels = [label for label in crf.classes_ if label != 'O']
y_pred = crf.predict(X_test)
e = metrics.flat_f1_score(y_test, y_pred,
                          average='weighted', labels=labels)
print(e)  # show the weighted F1 score

# Order labels by everything after their first character, then by that first
# character, so labels sharing a suffix are listed next to each other.
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))
	from sklearn_crfsuite import scorers,metrics
	from sklearn.metrics import make_scorer
	from sklearn.model_selection import cross_validate,train_test_split
	import sklearn_crfsuite
	def doc2features(doc, i):
	    """Build the feature dict for the token at position ``i`` of ``doc``.

	    Each item of ``doc`` is indexed as ``item[0]`` (the word, a string)
	    and ``item[1]`` (its POS tag).

	    Args:
	        doc: Sequence of token records for one sentence/document.
	        i: Index of the token to describe.

	    Returns:
	        dict of features for the token and its immediate neighbours;
	        ``'BOS'`` / ``'EOS'`` mark a missing previous / next token.
	    """
	    word = doc[i][0]
	    postag = doc[i][1]
	    # Features from current word
	    features = {
	        'word.word': word,
	        'word.isspace': word.isspace(),
	        'postag': postag,
	        'word.isdigit()': word.isdigit(),
	    }
	    # Features from previous word
	    if i > 0:
	        prevword = doc[i-1][0]
	        prevpostag = doc[i-1][1]
	        features['word.prevword'] = prevword
	        features['word.previsspace'] = prevword.isspace()
	        features['word.prepostag'] = prevpostag
	        features['word.prevwordisdigit'] = prevword.isdigit()
	    else:
	        features['BOS'] = True  # Special "Beginning of Sequence" tag
	    # Features from next word
	    if i < len(doc)-1:
	        nextword = doc[i+1][0]
	        nextpostag = doc[i+1][1]
	        features['word.nextword'] = nextword
	        features['word.nextisspace'] = nextword.isspace()
	        features['word.nextpostag'] = nextpostag
	        features['word.nextwordisdigit'] = nextword.isdigit()
	    else:
	        features['EOS'] = True  # Special "End of Sequence" tag
	    return features

	def extract_features(doc):
	    """Return the list of per-token feature dicts for ``doc``, in order.

	    Calls ``doc2features(doc, i)`` once for every index ``i`` of ``doc``.
	    """
	    return [doc2features(doc, i) for i in range(len(doc))]

	def get_labels(doc):
	    """Return the tag (third field) of every record in ``doc``, in order.

	    Every record must unpack into exactly three values
	    ``(token, postag, tag)``.
	    """
	    return [tag for (token, postag, tag) in doc]


	# NOTE(review): `datatofile` and `file_name` are not defined in this part
	# of the file; they are assumed to be provided earlier -- confirm.
	X_data = [extract_features(doc) for doc in datatofile]  # per-token feature dicts
	y_data = [get_labels(doc) for doc in datatofile]  # per-token tags

	# Hold out 0.1 (10%) of the documents for evaluation.
	X, X_test, y, y_test = train_test_split(X_data, y_data, test_size=0.1)
	crf = sklearn_crfsuite.CRF(
	    algorithm='lbfgs',
	    c1=0.1,
	    c2=0.1,
	    max_iterations=500,
	    all_possible_transitions=True,
	    model_filename=file_name+"-pos.model0"  # name of the model file
	)
	crf.fit(X, y)  # train

	# Score every learned label except 'O'. Filtering instead of
	# `labels.remove('O')` avoids a ValueError when the fitted model has no
	# 'O' class at all.
	labels = [label for label in crf.classes_ if label != 'O']
	y_pred = crf.predict(X_test)
	e = metrics.flat_f1_score(y_test, y_pred,
	                          average='weighted', labels=labels)
	print(e)  # show the weighted F1 score

	# Order labels by everything after their first character, then by that
	# first character, so labels sharing a suffix are listed together.
	sorted_labels = sorted(
	    labels,
	    key=lambda name: (name[1:], name[0])
	)
	print(metrics.flat_classification_report(
	    y_test, y_pred, labels=sorted_labels, digits=3
	))