@dustalov · Last active January 11, 2018 12:50
Extracting and cross-validating version 1.0 of the WCL dataset
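
The gist comprises two AWK scripts (extract.awk and filter.awk), two Python scripts (kfold.py and measure.py), and a Makefile that ties them together; the file names are inferred from the Makefile recipes. As the scripts suggest, the WCL files are expected to alternate between a comment line and the annotated sentence it describes, with sentences marked by a leading '!' treated as negative examples.
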
#!/usr/bin/awk -f
# extract.awk: strips the WCL annotations from a fold file; with -vDEFINIENDUM=1
# the definiendum is prepended as a tab-separated column.
BEGIN {
    FS = ":";
}

# Comment lines start with '#'; skip them.
/^#/ {
    next;
}

{
    # Drop the leading '!'/'$' markers and the inline annotation tags.
    sub(/^[!$]+/, "");
    gsub(/<(|\/)(VERB|GENUS|HYPER|LST|PRT|RGET|REST)>/, "");

    # Optionally prepend the definiendum (the text before the first ':') as a column.
    if (length(DEFINIENDUM)) printf "%s\t", $1;

    for (i = 2; i <= NF; i++) {
        if (i > 2) printf " : ";

        # Tokens within a field are tab-separated: keep only the text after the
        # last '_' and substitute the TARGET placeholder with the definiendum.
        for (j = 1; j <= split($i, words, "\t"); j++) {
            sub(/^.+_/, "", words[j]);
            if (words[j] == "") continue;
            if (words[j] == "TARGET") words[j] = $1;
            if (j > 1) printf " ";
            printf "%s", words[j];
        }
    }

    print "";
}
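
In the Makefile below, this script is invoked as ./extract.awk -vDEFINIENDUM=1 wcl-kfold-$i.test > wcl-kfold-$i-test.tsv, so each test fold is turned into a TSV file with the definiendum in the first column and the de-annotated sentence in the second, with the #-comments dropped.
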
#!/usr/bin/awk -f
# filter.awk: re-emits comment/sentence pairs from the concatenated WCL file.
# Run with -vGOOD=1 to keep the unmarked sentences or -vBAD=1 to keep the ones
# marked with a leading '!'.

# Odd lines carry the comment that precedes each sentence; remember it.
NR % 2 == 1 {
    comment = $0;
    next;
}

# Even lines are sentences: print the unmarked ones when GOOD is set...
/^[^!]/ && length(GOOD) > 0 {
    print comment;
    print $0;
}

# ...and the '!'-marked ones when BAD is set.
/^!/ && length(BAD) > 0 {
    print comment;
    print $0;
}
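
The Makefile calls ./filter.awk -vGOOD=1 wiki_all.txt > wiki_really_good.txt and ./filter.awk -vBAD=1 wiki_all.txt > wiki_really_bad.txt, rebuilding clean comment/sentence pairs for the good and bad sentences before they are concatenated into wiki_really_all.txt.
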
#!/usr/bin/env python
# kfold.py: splits the WCL sentences into k cross-validation folds.

from collections import namedtuple

Sentence = namedtuple('Sentence', 'comment tokens good')


def read_sentences(f):
    """Read alternating comment/sentence lines; '!'-prefixed sentences are negative."""
    sentences, comment = [], None

    for i, line in enumerate(f):
        if i % 2 == 0:
            comment = line
        else:
            sentences.append(Sentence(comment, line, not line.startswith('!')))

    return sentences


def write_fold(filename, sentences, indices):
    """Write the selected sentences, preserving their comment/sentence line pairs."""
    with open(filename, 'w', encoding='UTF-8', newline='') as f:
        for i in indices:
            f.write(sentences[i].comment)
            f.write(sentences[i].tokens)


if __name__ == '__main__':
    import argparse

    from sklearn.model_selection import KFold

    parser = argparse.ArgumentParser()
    parser.add_argument('-k', type=int, default=10)
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--prefix', default='wcl-kfold-')
    parser.add_argument('wcl', type=argparse.FileType('r', encoding='UTF-8'))
    args = parser.parse_args()

    kf = KFold(n_splits=args.k, shuffle=True, random_state=args.seed)

    sentences = read_sentences(args.wcl)

    # Write one training file and one test file per fold.
    for i, (train_index, test_index) in enumerate(kf.split(sentences)):
        write_fold('%s%d-train.pos' % (args.prefix, i + 1), sentences, train_index)
        write_fold('%s%d.test' % (args.prefix, i + 1), sentences, test_index)
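
The kfold target runs ./kfold.py --seed=$(SEED) wiki_really_all.txt, producing wcl-kfold-N-train.pos and wcl-kfold-N.test for each of the ten folds; the eval target passes only the wcl-kfold-N-train prefix via -t, so presumably wcl-wrapper resolves the .pos extension itself.
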
# Drives the WCL extraction, cross-validation, and evaluation pipeline.
LC_COLLATE = C
SEED = 1337
WCL_WRAPPER = /srv/definitions/wcl-extract

# Brace expansion in the wiki_all.txt recipe requires bash rather than plain sh.
SHELL = /bin/bash

measure:
	./measure.py

kfold: wiki_really_all.txt
	./kfold.py --seed=$(SEED) $<

eval:
	for i in $$(seq 10); do (cd $(WCL_WRAPPER) && java -jar target/wcl-wrapper.jar -l en -t $(CURDIR)/wcl-kfold-$$i-train) < $(CURDIR)/wcl-kfold-$$i-test.tsv > $(CURDIR)/wcl-kfold-$$i-test-eval.tsv 2>/dev/null; done

extract:
	for i in $$(seq 10); do ./extract.awk -vDEFINIENDUM=1 wcl-kfold-$$i.test > wcl-kfold-$$i-test.tsv; done

wiki_really_all.txt: wiki_really_good.txt wiki_really_bad.txt
	cat wiki_really_good.txt wiki_really_bad.txt > $@

wiki_really_good.txt: wiki_all.txt
	./filter.awk -vGOOD=1 wiki_all.txt > $@

wiki_really_bad.txt: wiki_all.txt
	./filter.awk -vBAD=1 wiki_all.txt > $@

wiki_all.txt:
	cat wiki_{good,bad}.txt > $@

clean:
	rm -fv wiki_really*.txt *.train *.test *.tsv *.pos
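
Judging by the data flow, the intended order is make kfold, then make extract, then make eval, and finally make measure: eval consumes the wcl-kfold-N-test.tsv files written by extract, and measure reads the wcl-kfold-N-test-eval.tsv files written by eval.
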
#!/usr/bin/env python
# measure.py: aggregates the per-fold evaluation results produced by wcl-wrapper.

import argparse

import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

from kfold import read_sentences

parser = argparse.ArgumentParser()
parser.add_argument('-k', type=int, default=10)
parser.add_argument('--prefix', default='wcl-kfold-')
args = parser.parse_args()

# Gold labels are taken from the held-out test folds.
sentences = {}

for i in range(args.k):
    filename = '%s%d.test' % (args.prefix, i + 1)

    with open(filename, encoding='UTF-8') as f:
        sentences[i] = read_sentences(f)

trues = {i: [sentence.good for sentence in sentences[i]] for i in range(args.k)}
preds = {i: [None] * len(sentences[i]) for i in range(args.k)}

# Predictions are read from the *-test-eval.tsv files: everything after the
# first tab is lowercased and compared against 'true'.
for i in range(args.k):
    filename = '%s%d-test-eval.tsv' % (args.prefix, i + 1)

    with open(filename, encoding='UTF-8') as f:
        for j, line in enumerate(f):
            _, good = line.rstrip().lower().split('\t', 1)
            preds[i][j] = good == 'true'

# Report each metric as its mean and standard deviation across the folds.
precision_scores = [precision_score(trues[i], preds[i]) for i in range(args.k)]
print('Pr=%.4f±%.4f' % (np.mean(precision_scores), np.std(precision_scores)))

recall_scores = [recall_score(trues[i], preds[i]) for i in range(args.k)]
print('Re=%.4f±%.4f' % (np.mean(recall_scores), np.std(recall_scores)))

f1_scores = [f1_score(trues[i], preds[i]) for i in range(args.k)]
print('F1=%.4f±%.4f' % (np.mean(f1_scores), np.std(f1_scores)))

accuracy_scores = [accuracy_score(trues[i], preds[i]) for i in range(args.k)]
print('Ac=%.4f±%.4f' % (np.mean(accuracy_scores), np.std(accuracy_scores)))
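
measure.py prints precision, recall, F1, and accuracy for the positive (definition) class as mean ± standard deviation over the k folds.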