hankcs/OOV.py

## OOV.py
# -*- coding:utf-8 -*-
# Filename: OOV.py
# Author：hankcs
# Date: 2017-11-21 17:51

def load_words(path, dict, encoding='utf-8'):
    """Add every whitespace-separated token of a text file to a set.

    The file is read line by line; each line is split on runs of whitespace
    and the resulting tokens are merged into ``dict``.

    :param path: path of the text file to read.
    :param dict: set-like container updated in place (it only needs an
        ``update(iterable)`` method). The name shadows the builtin but is
        kept so existing keyword callers keep working.
    :param encoding: codec used to decode the file. Defaults to UTF-8 rather
        than the platform locale, so the collected tokens do not depend on
        the machine the script runs on.
    :return: ``None``; the result is the mutation of ``dict``.
    """
    with open(path, encoding=encoding) as src:
        for line in src:
            # split() with no argument drops the newline and empty fields.
            dict.update(line.split())


# Corpus and embedding locations, relative to the current working directory.
gold_file = 'data/datasets/sighan2005-pku/test.txt'
train_file = 'data/datasets/sighan2005-pku/train.txt'
dev_file = 'data/datasets/sighan2005-pku/dev.txt'
embeddings_file = 'data/embeddings/news_tensite.pku.words.w2v50'

# Token sets: test split, train+dev combined, and the embedding vocabulary.
gold_dict = set()
train_dict = set()
embeddings_dict = set()
load_words(gold_file, gold_dict)
load_words(train_file, train_dict)
load_words(dev_file, train_dict)  # dev tokens count as "seen" alongside train

# Undecodable bytes are replaced instead of raising: previously the first
# UnicodeDecodeError aborted the loop and silently dropped every remaining
# line of the embedding file. A separate handle name keeps `embeddings_file`
# bound to the path rather than to the file object.
# NOTE(review): if the file starts with a word2vec "<count> <dim>" header, the
# count is collected as a token as well -- presumably harmless; confirm.
with open(embeddings_file, encoding='utf-8', errors='replace') as embeddings_src:
    for line in embeddings_src:
        fields = line.split()
        if fields:  # a blank line has no first field to take
            embeddings_dict.add(fields[0])

# Test-set word types absent from train+dev, then absent from both train+dev
# and the embedding vocabulary.
print('Test Word OOV Type : %d' % len(gold_dict - train_dict))
print('Test Word OOV Type (+WE) : %d' % len(gold_dict - (train_dict | embeddings_dict)))
	# -*- coding:utf-8 -*-
	# Filename: OOV.py
	# Author：hankcs
	# Date: 2017-11-21 17:51

	def load_words(path, dict, encoding='utf-8'):
		"""Add every whitespace-separated token of a text file to a set.

		The file is read line by line; each line is split on runs of
		whitespace and the resulting tokens are merged into ``dict``.

		:param path: path of the text file to read.
		:param dict: set-like container updated in place (it only needs an
			``update(iterable)`` method). The name shadows the builtin but is
			kept so existing keyword callers keep working.
		:param encoding: codec used to decode the file. Defaults to UTF-8
			rather than the platform locale, so the collected tokens do not
			depend on the machine the script runs on.
		:return: ``None``; the result is the mutation of ``dict``.
		"""
		with open(path, encoding=encoding) as src:
			for line in src:
				# split() with no argument drops the newline and empty fields.
				dict.update(line.split())


	# Corpus and embedding locations, relative to the current working directory.
	gold_file = 'data/datasets/sighan2005-pku/test.txt'
	train_file = 'data/datasets/sighan2005-pku/train.txt'
	dev_file = 'data/datasets/sighan2005-pku/dev.txt'
	embeddings_file = 'data/embeddings/news_tensite.pku.words.w2v50'

	# Token sets: test split, train+dev combined, and the embedding vocabulary.
	gold_dict = set()
	train_dict = set()
	embeddings_dict = set()
	load_words(gold_file, gold_dict)
	load_words(train_file, train_dict)
	load_words(dev_file, train_dict)  # dev tokens count as "seen" alongside train

	# Undecodable bytes are replaced instead of raising: previously the first
	# UnicodeDecodeError aborted the loop and silently dropped every remaining
	# line of the embedding file. A separate handle name keeps `embeddings_file`
	# bound to the path rather than to the file object.
	# NOTE(review): if the file starts with a word2vec "<count> <dim>" header,
	# the count is collected as a token as well -- presumably harmless; confirm.
	with open(embeddings_file, encoding='utf-8', errors='replace') as embeddings_src:
		for line in embeddings_src:
			fields = line.split()
			if fields:  # a blank line has no first field to take
				embeddings_dict.add(fields[0])

	# Test-set word types absent from train+dev, then absent from both
	# train+dev and the embedding vocabulary.
	print('Test Word OOV Type : %d' % len(gold_dict - train_dict))
	print('Test Word OOV Type (+WE) : %d' % len(gold_dict - (train_dict | embeddings_dict)))