Skip to content

Instantly share code, notes, and snippets.

@bennokr
Created March 6, 2019 17:01
Show Gist options
  • Save bennokr/325016e5d5bb53a2c102930b714f8364 to your computer and use it in GitHub Desktop.
Save bennokr/325016e5d5bb53a2c102930b714f8364 to your computer and use it in GitHub Desktop.
import sys, os
from collections import Counter

# Shown (on stderr) when the redirects file argument is missing.
_USAGE = (
    'Usage: anchor_counts.py [redirects_file.nt] *[label_files.nt]\n'
    'Output: N(uri & label) N(uri) N(label) uri label'
)


def _iter_triples(path):
    """Yield ``(subject, object)`` for every statement line in *path*.

    Each data line is split on its first two spaces into three fields
    after the last two characters have been dropped (presumably the
    `` .`` statement terminator -- confirm against the input files), so
    the object field may itself contain spaces. Blank lines and lines
    starting with ``#`` are skipped. A line with fewer than three fields
    raises ``ValueError``.
    """
    with open(path) as handle:
        for raw in handle:
            line = raw.strip()
            if not line or line.startswith('#'):
                continue
            subject, _, obj = line[:-2].split(' ', 2)
            yield subject, obj


def _load_redirects(redirects_file):
    """Return a dict mapping each redirecting URI to its target URI."""
    # Later statements for the same source URI overwrite earlier ones.
    return dict(_iter_triples(redirects_file))


def _count_anchors(label_files, redirects):
    """Count URIs, labels and (URI, label) pairs over all *label_files*.

    Every URI is passed through *redirects* once (a single hop, not a
    chain) before counting. Returns the three ``Counter`` objects
    ``(count_uri, count_label, count_uri_label)``.
    """
    count_uri = Counter()
    count_label = Counter()
    count_uri_label = Counter()
    for label_file in label_files:
        for uri, label in _iter_triples(label_file):
            uri = redirects.get(uri, uri)
            count_uri[uri] += 1
            count_label[label] += 1
            count_uri_label[(uri, label)] += 1
    return count_uri, count_label, count_uri_label


def main(argv=None):
    """Print tab-separated co-occurrence counts for URIs and labels.

    *argv* defaults to ``sys.argv``; ``argv[1]`` is the redirects file and
    the remaining items are label files. One line is printed per
    (URI, label) pair, most frequent first, with the columns
    ``N(uri & label)``, ``N(uri)``, ``N(label)``, ``uri``, ``label``.

    Exits with the usage text on stderr and a non-zero status when the
    redirects file argument is missing, so the message cannot end up in
    redirected output and the failure is visible to the calling shell.
    """
    if argv is None:
        argv = sys.argv
    try:
        redirects_file = argv[1]
    except IndexError:
        sys.exit(_USAGE)
    label_files = argv[2:]

    redirects = _load_redirects(redirects_file)
    count_uri, count_label, count_uri_label = _count_anchors(
        label_files, redirects)
    for (uri, label), c in count_uri_label.most_common():
        print(c, count_uri[uri], count_label[label], uri, label, sep='\t')


if __name__ == '__main__':
    main()
import sys
# Command-line arguments: exactly four are required after the script name.
try:
    _, host, typ, index, counts_file = sys.argv
except ValueError:
    # Unpacking raises ValueError when the argument count is wrong. Passing
    # the message to sys.exit writes it to stderr and exits non-zero, so a
    # calling shell can tell the run did not happen.
    sys.exit('Usage: python es_index_anchors.py [host] [type] [index] [counts_file]')

# Metadata copied into every document handed to the bulk helper.
action = {
    '_type': typ,
    '_index': index,
}
from elasticsearch import Elasticsearch, helpers
def stream(path=None, base_action=None):
    """Yield one bulk-index document per line of a counts file.

    Args:
        path: Counts file to read; defaults to the module-level
            ``counts_file``.
        base_action: Mapping copied into every document; defaults to the
            module-level ``action``.

    Each line must hold five tab-separated fields: the joint count, the
    entity count, the label count, the entity and the label. Blank lines
    are skipped. Every yielded dict contains the keys of *base_action*
    plus the three integer counts, the two ratios
    ``p_entity_label = c_both / c_label`` and
    ``p_label_entity = c_both / c_entity``, the entity and the label.

    Raises:
        ValueError: if a line has fewer than five fields or a count is
            not an integer.
    """
    if path is None:
        path = counts_file
    if base_action is None:
        base_action = action
    # The context manager closes the file even if the consumer stops early.
    with open(path) as handle:
        for line in handle:
            record = line.strip()
            if not record:
                continue
            c_both, c_entity, c_label, entity, label = record.split('\t', 4)
            # Drops the first character and the last four; presumably the
            # opening quote and a closing '"@en' language tag -- confirm
            # against the label files in use.
            label = label[1:-4]
            c_both, c_entity, c_label = int(c_both), int(c_entity), int(c_label)
            doc = dict(base_action)
            doc.update({
                'c_both': c_both,
                'c_entity': c_entity,
                'c_label': c_label,
                'p_entity_label': c_both / c_label,
                'p_label_entity': c_both / c_entity,
                'entity': entity,
                'label': label,
            })
            yield doc
# Index every document produced by stream() through the threaded bulk helper.
es = Elasticsearch([host], timeout=30)
results = helpers.parallel_bulk(es, stream(), thread_count=16)
for i, (status, r) in enumerate(results):
    if not status:
        # Report the failed item on stderr. The previous Python 2 form
        # `print >> sys.stderr, r` raises TypeError under Python 3.
        # NOTE(review): with the helper's default error handling a failure
        # may raise instead of being yielded here -- confirm whether
        # raise_on_error=False is wanted.
        print(r, file=sys.stderr)
    if 0 == i % 100000:
        # '\r' rewrites the same terminal line as progress advances.
        print('indexed %d ' % i, end='\r')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment