# Created Nov 23, 2017
#
# OOV recognition trick in convseg
# -*- coding:utf-8 -*-
# Filename:
# Author:hankcs
# Date: 2017-11-21 17:51
def load_words(path, dict):
    """Add every whitespace-separated token in the file at *path* to *dict*.

    NOTE(review): in the reviewed copy of this file the function stopped at
    ``for line in src:`` with no loop body. The body below was reconstructed
    from the call sites, which pass a ``set`` and later take set differences
    over words -- confirm against the original script.

    Args:
        path: Path of a text file. Presumably one segmented sentence per
            line with words separated by whitespace -- TODO confirm.
        dict: Set-like collection (anything providing ``update``) that
            receives the words; it is modified in place. The name shadows
            the builtin but is kept so keyword callers keep working.

    Returns:
        None. The result is delivered through *dict*.
    """
    # NOTE(review): the file is decoded with the platform default encoding;
    # pass an explicit ``encoding=`` here once the corpus encoding is confirmed.
    with open(path) as src:
        for line in src:
            # split() without arguments drops the trailing newline and
            # collapses runs of whitespace, so blank lines add nothing.
            dict.update(line.split())
# Input locations: the gold test set, the train/dev splits, and the
# pre-trained word-embedding file.
gold_file = 'data/datasets/sighan2005-pku/test.txt'
train_file = 'data/datasets/sighan2005-pku/train.txt'
dev_file = 'data/datasets/sighan2005-pku/dev.txt'
embeddings_file = 'data/embeddings/news_tensite.pku.words.w2v50'

# Vocabularies are sets so the OOV counts below are plain set differences.
gold_dict = set()
train_dict = set()
embeddings_dict = set()

load_words(gold_file, gold_dict)
load_words(train_file, train_dict)
# Dev words are merged into the training vocabulary on purpose: a test word
# seen in either split does not count as out-of-vocabulary.
load_words(dev_file, train_dict)

# NOTE(review): in the reviewed copy the body of this loop and its ``try:``
# line were missing; only ``except UnicodeDecodeError:`` survived. The
# reconstruction assumes each line starts with the word followed by its
# vector, that the file is UTF-8, and that undecodable lines are meant to be
# skipped -- confirm all three against the original script.
#
# The file is read as bytes and decoded line by line so that a single bad
# line raises inside the ``try`` instead of aborting the whole iteration.
# The handle gets its own name so ``embeddings_file`` keeps holding the path.
with open(embeddings_file, 'rb') as embeddings_src:
    for raw_line in embeddings_src:
        try:
            fields = raw_line.decode('utf-8').split()
        except UnicodeDecodeError:
            continue
        if fields:
            embeddings_dict.add(fields[0])

# Word types in the test set that never occur in train/dev ...
print('Test Word OOV Type : %d' % len(gold_dict - train_dict))
# ... and those still unknown once the embedding vocabulary is added.
print('Test Word OOV Type (+WE) : %d' % len(gold_dict - (train_dict | embeddings_dict)))
