Created
March 6, 2019 17:01
-
-
Save bennokr/325016e5d5bb53a2c102930b714f8364 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# anchor_counts.py
#
# Count how often each URI, each label, and each (URI, label) pair occurs in
# one or more N-Triples label files, after mapping URIs through a redirects
# table, and print the counts as tab-separated lines.
import os  # not used below; kept from the original import line
import sys
from collections import Counter

USAGE = 'Usage: anchor_counts.py [redirects_file.nt] *[label_files.nt]\nOutput: N(uri & label) N(uri) N(label) uri label'


def _iter_triples(path):
    """Yield ``(subject, object)`` for every statement line in *path*.

    Blank lines and lines starting with ``#`` are skipped. Each remaining
    line is split on single spaces into at most three fields, so the object
    may itself contain spaces. A line with fewer than three fields raises
    ``ValueError``.
    """
    with open(path, encoding='utf-8') as handle:
        for raw in handle:
            line = raw.strip()
            if not line or line.startswith('#'):
                continue
            # Drop the last two characters -- presumably the " ." statement
            # terminator of N-Triples; verify against the input files.
            subject, _, obj = line[:-2].split(' ', 2)
            yield subject, obj


def _load_redirects(path):
    """Return a dict mapping each redirecting URI in *path* to its target.

    When a URI appears more than once, the last occurrence wins.
    """
    return dict(_iter_triples(path))


def _count_anchors(label_files, redirects):
    """Return ``(count_uri, count_label, count_uri_label)`` Counters.

    Every URI is replaced by ``redirects[uri]`` when present (a single hop;
    redirect chains are not followed) before counting.
    """
    count_uri = Counter()
    count_label = Counter()
    count_uri_label = Counter()
    for label_file in label_files:
        for uri, label in _iter_triples(label_file):
            uri = redirects.get(uri, uri)
            count_uri[uri] += 1
            count_label[label] += 1
            count_uri_label[(uri, label)] += 1
    return count_uri, count_label, count_uri_label


def main(argv=None):
    """Run the script: ``argv[1]`` is the redirects file, the rest label files.

    Prints one line per (URI, label) pair, most frequent first, with the
    tab-separated fields: pair count, URI count, label count, URI, label.

    When the redirects file argument is missing, the usage text is written to
    stderr and the process exits with a non-zero status, so the message never
    ends up in redirected output.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.exit(USAGE)

    redirects = _load_redirects(argv[1])
    count_uri, count_label, count_uri_label = _count_anchors(argv[2:], redirects)
    for (uri, label), c in count_uri_label.most_common():
        print(c, count_uri[uri], count_label[label], uri, label, sep='\t')


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# es_index_anchors.py
#
# Command-line arguments: host, document type, index name, and the path of a
# tab-separated counts file.
import sys

try:
    _, host, typ, index, counts_file = sys.argv
except ValueError:
    # Wrong number of arguments: report on stderr and exit with a non-zero
    # status instead of printing to stdout and signalling success.
    sys.exit('Usage: python es_index_anchors.py [host] [type] [index] [counts_file]')

# Fields shared by every bulk action; copied into each document that is
# streamed to the bulk helper.
action = {
    '_type': typ,
    '_index': index,
}

# Kept after the argument check, as in the original, so the usage message
# does not depend on the client library being importable.
from elasticsearch import Elasticsearch, helpers
def stream():
    """Yield one bulk-action dict per non-blank line of ``counts_file``.

    Each line is expected to hold five tab-separated fields: the count of
    the (entity, label) pair, the entity count, the label count, the entity
    and the label. Every yielded dict starts as a copy of the module-level
    ``action`` and adds the three counts, the two ratios
    ``c_both / c_label`` and ``c_both / c_entity``, the entity and the
    trimmed label.

    Raises ``ValueError`` for a line with fewer than five fields or with
    non-integer counts.
    """
    # The context manager closes the file once the generator is exhausted
    # or discarded.
    with open(counts_file) as counts:
        for raw in counts:
            line = raw.strip()
            if not line:
                continue
            c_both, c_entity, c_label, entity, label = line.split('\t', 4)
            # Drops the first character and the last four -- presumably the
            # opening quote and a closing `"@en` language tag; labels in any
            # other shape would be cut incorrectly. TODO confirm.
            label = label[1:-4]
            c_both, c_entity, c_label = int(c_both), int(c_entity), int(c_label)
            yield dict(
                action,
                c_both=c_both,
                c_entity=c_entity,
                c_label=c_label,
                p_entity_label=c_both / c_label,
                p_label_entity=c_both / c_entity,
                entity=entity,
                label=label,
            )
# Send every document produced by stream() to the cluster at `host` using the
# client's threaded bulk helper, reporting failures and periodic progress.
es = Elasticsearch([host], timeout=30)
results = helpers.parallel_bulk(es, stream(), thread_count=16)
for i, (status, r) in enumerate(results):
    if not status:
        # Python 3 form; the former `print >> sys.stderr, r` raised TypeError
        # as soon as a failed action was reported.
        print(r, file=sys.stderr)
    if 0 == i % 100000:
        # '\r' keeps the progress counter on a single terminal line.
        print('indexed %d ' % i, end='\r')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment