# vadimkantorov/find_domain_words.py

## find_domain_words.py
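# Usage: python3 find_domain_words.py --ours chats.arpa --theirs ru_wiyalen_no_punkt.arpa.binary > domain_words.txt
# Note: --ours is also read line by line as text to extract the vocabulary, so it must be a text ARPA file (not a binary model).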

import argparse
import kenlm

# Rank the unigrams of "our" language model by how much more probable they are
# under it than under a reference ("their") model, and print them best-first.
# Words near the top are the ones most specific to our domain.
parser = argparse.ArgumentParser()
parser.add_argument('--ours', required = True)    # text ARPA file; also parsed below for its vocabulary
parser.add_argument('--theirs', required = True)  # reference model, loaded only through kenlm
args = parser.parse_args()

ours = kenlm.LanguageModel(args.ours)
theirs = kenlm.LanguageModel(args.theirs)

# Collect the unigram vocabulary directly from the ARPA text: every line that
# starts with '-' (presumably a negative log-probability field) contributes its
# second whitespace-separated token. Scanning stops at the first line that
# mentions '2-grams', so only the unigram section is read.
# NOTE(review): an entry whose first field does not start with '-' (e.g. a
# log-probability of exactly 0) is skipped -- confirm this is acceptable.
vocab = []
with open(args.ours) as arpa_file:  # context manager closes the handle even on early break
  for l in arpa_file:
    if l.startswith('-'):
      vocab.append(l.split()[1])
    if '2-grams' in l:
      break

# Score every word under both models. score() is called with its defaults, so
# each word is scored as a complete one-word input (kenlm is understood to add
# sentence-boundary context by default -- TODO confirm against the kenlm docs).
scores = []
for w in vocab:
  log_prob_ours = ours.score(w)
  log_prob_theirs = theirs.score(w)
  scores.append((w, log_prob_ours, log_prob_theirs, log_prob_ours - log_prob_theirs))

# Largest difference first: words our model likes far more than the reference does.
for w, log_prob_ours, log_prob_theirs, log_prob_ratio in sorted(scores, key = lambda s: s[-1], reverse = True):
  print(w, log_prob_ratio)