# bennokr/anchor_counts.py

## anchor_counts.py
import sys, os
# Command line: the first argument names the redirects file, every further
# argument names a label file (there may be none).
try:
    redirects_file = sys.argv[1]
except IndexError:
    # Called without arguments. The usage text goes to stderr because stdout
    # carries the tab-separated counts, and the exit status is non-zero so a
    # calling shell script can detect the misuse.
    print('Usage: anchor_counts.py [redirects_file.nt] *[label_files.nt]\nOutput: N(uri & label) N(uri) N(label) uri label', file=sys.stderr)
    sys.exit(1)
# Slicing never raises, so it does not need to live inside the try block.
label_files = sys.argv[2:]

# Redirect table: maps the first space-separated field of every data line to
# its third field (the middle field is ignored).
redirects = {}
with open(redirects_file) as redirects_in:
    for line in redirects_in:
        line = line.strip()
        # Skip blank lines and '#' comment lines instead of failing on them.
        if not line or line.startswith('#'):
            continue
        # [:-2] drops the last two characters of the line -- presumably the
        # " ." terminator of an N-Triples statement; confirm against the data.
        uri_from, _, uri_to = line[:-2].split(' ', 2)
        redirects[uri_from] = uri_to

from collections import Counter

# Occurrence counts gathered over all label files: per uri, per label, and per
# (uri, label) pair.
count_uri = Counter()
count_label = Counter()
count_uri_label = Counter()
for label_file in label_files:
    with open(label_file) as labels_in:
        for line in labels_in:
            line = line.strip()
            # Skip blank lines and '#' comment lines instead of failing on them.
            if not line or line.startswith('#'):
                continue
            # [:-2] drops the last two characters of the line -- presumably
            # the " ." terminator of an N-Triples statement. The label keeps
            # everything after the second space, so it may contain spaces.
            uri, _, label = line[:-2].split(' ', 2)
            # Count a redirected uri under its target (one lookup, no chains).
            uri = redirects.get(uri, uri)
            count_uri[uri] += 1
            count_label[label] += 1
            count_uri_label[(uri, label)] += 1

# One tab-separated row per (uri, label) pair, most frequent pair first:
# pair count, uri count, label count, uri, label.
for (uri, label), c in count_uri_label.most_common():
    print(c, count_uri[uri], count_label[label], uri, label, sep='\t')

## es_index_anchors.py
import sys
# Command line: exactly four arguments are expected after the script name.
try:
    _, host, typ, index, counts_file = sys.argv
except ValueError:
    # Unpacking fails with ValueError when the argument count is wrong. Report
    # the usage on stderr and exit with a non-zero status so callers notice.
    print('Usage: python es_index_anchors.py [host] [type] [index] [counts_file]', file=sys.stderr)
    sys.exit(1)

# Fields shared by every bulk action; each generated document starts as a copy
# of this dict.
action = {
    '_type': typ,
    '_index': index
}

from elasticsearch import Elasticsearch, helpers


def stream(path=None, base=None):
    """Yield one bulk-action document per row of a counts file.

    Every non-blank row must hold five tab-separated fields: the pair count,
    the entity count, the label count, the entity and the label. The three
    counts are converted to ``int`` and two ratios are derived from them.

    Args:
        path: File to read. Defaults to the module-level ``counts_file``.
        base: Mapping copied into every document before the row fields are
            added. Defaults to the module-level ``action``.

    Yields:
        dict: ``base`` plus the keys ``c_both``, ``c_entity``, ``c_label``,
        ``p_entity_label`` (``c_both / c_label``), ``p_label_entity``
        (``c_both / c_entity``), ``entity`` and ``label``.

    Raises:
        ValueError: If a non-blank row does not have five fields or a count is
            not an integer.
    """
    if path is None:
        path = counts_file
    if base is None:
        base = action

    # The context manager closes the file even if the consumer stops early or
    # a row fails to parse.
    with open(path) as rows:
        for line in rows:
            line = line.strip()
            if not line:
                # Blank rows carry no data; skip them instead of crashing.
                continue
            c_both, c_entity, c_label, entity, label = line.split('\t', 4)
            # Drops the first character and the last four.
            # NOTE(review): presumably the opening quote and a closing
            # '"@xx' two-letter language tag -- confirm; any other suffix
            # would be cut incorrectly.
            label = label[1:-4]
            c_both, c_entity, c_label = int(c_both), int(c_entity), int(c_label)
            p_entity_label = c_both / c_label
            p_label_entity = c_both / c_entity

            doc = dict(base)
            doc.update({
                'c_both': c_both,
                'c_entity': c_entity,
                'c_label': c_label,
                'p_entity_label': p_entity_label,
                'p_label_entity': p_label_entity,
                'entity': entity,
                'label': label,
            })
            yield doc


# Send the documents produced by stream() to the cluster at `host` using the
# bulk helper with 16 worker threads.
es = Elasticsearch([host], timeout=30)
results = helpers.parallel_bulk(es, stream(), thread_count=16)
for i, (status, r) in enumerate(results):
    if not status:
        # Report a failed action on stderr. The original Python 2 form
        # `print >> sys.stderr, r` raises TypeError under Python 3.
        print(r, file=sys.stderr)
    if 0 == i % 100000:
        # Progress indicator, overwritten in place via the carriage return.
        print('indexed %d    ' % i, end='\r')
	# anchor_counts script: counts uri / label / (uri, label) occurrences in the
	# given label files and prints them as tab-separated rows.
	import sys, os

	# Command line: the first argument names the redirects file, every further
	# argument names a label file (there may be none).
	try:
	    redirects_file = sys.argv[1]
	except IndexError:
	    # Called without arguments. The usage text goes to stderr because stdout
	    # carries the tab-separated counts; the exit status is non-zero so a
	    # calling shell script can detect the misuse.
	    print('Usage: anchor_counts.py [redirects_file.nt] *[label_files.nt]\nOutput: N(uri & label) N(uri) N(label) uri label', file=sys.stderr)
	    sys.exit(1)
	# Slicing never raises, so it does not need to live inside the try block.
	label_files = sys.argv[2:]

	# Redirect table: maps the first space-separated field of every data line
	# to its third field (the middle field is ignored).
	redirects = {}
	with open(redirects_file) as redirects_in:
	    for line in redirects_in:
	        line = line.strip()
	        # Skip blank lines and '#' comment lines instead of failing on them.
	        if not line or line.startswith('#'):
	            continue
	        # [:-2] drops the last two characters of the line -- presumably the
	        # " ." terminator of an N-Triples statement; confirm against the data.
	        uri_from, _, uri_to = line[:-2].split(' ', 2)
	        redirects[uri_from] = uri_to

	from collections import Counter

	# Occurrence counts gathered over all label files.
	count_uri = Counter()
	count_label = Counter()
	count_uri_label = Counter()
	for label_file in label_files:
	    with open(label_file) as labels_in:
	        for line in labels_in:
	            line = line.strip()
	            if not line or line.startswith('#'):
	                continue
	            # The label keeps everything after the second space, so it may
	            # itself contain spaces.
	            uri, _, label = line[:-2].split(' ', 2)
	            # Count a redirected uri under its target (one lookup, no chains).
	            uri = redirects.get(uri, uri)
	            count_uri[uri] += 1
	            count_label[label] += 1
	            count_uri_label[(uri, label)] += 1

	# One tab-separated row per (uri, label) pair, most frequent pair first:
	# pair count, uri count, label count, uri, label.
	for (uri, label), c in count_uri_label.most_common():
	    print(c, count_uri[uri], count_label[label], uri, label, sep='\t')
	# es_index_anchors script: reads a tab-separated counts file and bulk-indexes
	# one document per row into Elasticsearch.
	import sys

	# Command line: exactly four arguments are expected after the script name.
	try:
	    _, host, typ, index, counts_file = sys.argv
	except ValueError:
	    # Unpacking fails with ValueError when the argument count is wrong.
	    # Report the usage on stderr and exit with a non-zero status.
	    print('Usage: python es_index_anchors.py [host] [type] [index] [counts_file]', file=sys.stderr)
	    sys.exit(1)

	# Fields shared by every bulk action; each document starts as a copy of it.
	action = {
	    '_type': typ,
	    '_index': index
	}

	from elasticsearch import Elasticsearch, helpers


	def stream(path=None, base=None):
	    """Yield one bulk-action document per row of a counts file.

	    Every non-blank row must hold five tab-separated fields: the pair
	    count, the entity count, the label count, the entity and the label.

	    Args:
	        path: File to read. Defaults to the module-level ``counts_file``.
	        base: Mapping copied into every document. Defaults to the
	            module-level ``action``.

	    Yields:
	        dict: ``base`` plus ``c_both``, ``c_entity``, ``c_label``,
	        ``p_entity_label`` (``c_both / c_label``), ``p_label_entity``
	        (``c_both / c_entity``), ``entity`` and ``label``.
	    """
	    if path is None:
	        path = counts_file
	    if base is None:
	        base = action

	    # The context manager closes the file even if iteration stops early.
	    with open(path) as rows:
	        for line in rows:
	            line = line.strip()
	            if not line:
	                # Blank rows carry no data; skip them instead of crashing.
	                continue
	            c_both, c_entity, c_label, entity, label = line.split('\t', 4)
	            # Drops the first character and the last four.
	            # NOTE(review): presumably the opening quote and a closing
	            # '"@xx' two-letter language tag -- confirm.
	            label = label[1:-4]
	            c_both, c_entity, c_label = int(c_both), int(c_entity), int(c_label)
	            p_entity_label = c_both / c_label
	            p_label_entity = c_both / c_entity

	            doc = dict(base)
	            doc.update({
	                'c_both': c_both,
	                'c_entity': c_entity,
	                'c_label': c_label,
	                'p_entity_label': p_entity_label,
	                'p_label_entity': p_label_entity,
	                'entity': entity,
	                'label': label,
	            })
	            yield doc


	# Send the documents produced by stream() to the cluster at `host` using the
	# bulk helper with 16 worker threads.
	es = Elasticsearch([host], timeout=30)
	results = helpers.parallel_bulk(es, stream(), thread_count=16)
	for i, (status, r) in enumerate(results):
	    if not status:
	        # Report a failed action on stderr. The original Python 2 form
	        # `print >> sys.stderr, r` raises TypeError under Python 3.
	        print(r, file=sys.stderr)
	    if 0 == i % 100000:
	        # Progress indicator, overwritten in place via the carriage return.
	        print('indexed %d ' % i, end='\r')