Skip to content

Instantly share code, notes, and snippets.

@glouppe
Last active August 29, 2015 14:13
Show Gist options
  • Save glouppe/1b90984e2d8168ca9de4 to your computer and use it in GitHub Desktop.
Save glouppe/1b90984e2d8168ca9de4 to your computer and use it in GitHub Desktop.
import sys
sys.path.append("/usr/lib/python2.7/dist-packages/")
sys.path.append("/usr/local/lib/python2.7/dist-packages/")
import string
import re
from joblib import Parallel, delayed
from invenio.dbquery import run_sql
from invenio.bibauthorid_dbinterface import get_title_of_paper
from invenio.bibauthorid_dbinterface import get_authors_of_paper
from invenio.bibauthorid_dbinterface import get_keywords_for_paper
from invenio.bibauthorid_dbinterface import get_collaborations_for_paper
from invenio.bibrank_citation_searcher import get_refers_to
from invenio.bibrank_citation_searcher import get_cited_by
from invenio.search_engine_utils import get_fieldvalues
def get_affiliation(table, bibref_value, bibrec):
    """Look up the affiliation value and field number of one signature.

    The signature is identified by ``table``, ``bibref_value`` and
    ``bibrec``. The name of the ``bib..x`` tables to query is built from
    the first two characters of ``str(table)`` followed by ``'x'``.

    Returns the first row of the main query (``f2.value``,
    ``r.field_number``) when it yields anything. Otherwise a fallback query
    is tried and ``(None, field_number)`` is returned from its first row.
    When both queries are empty the result is ``(None, None)``.
    """
    bibxxx = str(table)[:2] + 'x'

    # Main lookup: the subfield tagged '<table>__u' that shares the
    # field number of the signature's own entry in the record.
    affiliation_sql = """SELECT f2.value, r.field_number
FROM bibrec AS b
INNER JOIN bibrec_bib%s AS r ON (r.id_bibrec = b.id)
INNER JOIN bib%s AS f ON (r.id_bibxxx = f.id)
INNER JOIN bibrec_bib%s AS r2 ON (r2.id_bibrec = b.id AND
r.field_number = r2.field_number)
INNER JOIN bib%s AS f2 ON (r2.id_bibxxx = f2.id)
WHERE b.id = %d AND
f.id = %d AND
f2.tag = '%s__u'
""" % ((bibxxx,) * 4 + (bibrec, bibref_value, table))
    rows = run_sql(affiliation_sql)
    if rows:
        return rows[0]

    # Fallback: no '__u' subfield was found, so only the field number of
    # the signature can be reported.
    position_sql = """SELECT field_number
FROM bib%s, bibrec_bib%s
WHERE bib%s.id = bibrec_bib%s.id_bibxxx AND
bib%s.id = %s AND bibrec_bib%s.id_bibrec = %s
""" % ((bibxxx,) * 5 + (bibref_value, bibxxx, bibrec))
    rows = run_sql(position_sql)
    if rows:
        return None, rows[0][0]

    return None, None
def _getter_sig(i, signature):
    """Assemble the data dictionary for the signature numbered ``i``.

    ``signature`` is read positionally: items 1, 2 and 3 are handed to
    ``get_affiliation`` (as table, bibref value and record id), item 3 is
    also stored as the publication id, and item 4 is the author name.
    """
    bibref_table = signature[1]
    bibref_value = signature[2]
    publication_id = signature[3]

    affiliation, position = get_affiliation(bibref_table,
                                            bibref_value,
                                            publication_id)

    return dict(signature_id=i,
                author_name=signature[4],
                publication_id=publication_id,
                author_affiliation=affiliation,
                signature_position=position)
def extract_signature_data(signatures, n_jobs=1):
    """Run ``_getter_sig`` over every ``(id, signature)`` pair with joblib.

    ``signatures`` is an iterable of two-item entries; each one is unpacked
    into the signature id and the signature itself. ``n_jobs`` is forwarded
    to ``Parallel``. Returns whatever the ``Parallel`` call returns.
    """
    runner = Parallel(n_jobs=n_jobs, verbose=3)
    tasks = (delayed(_getter_sig)(sig_id, sig) for sig_id, sig in signatures)
    return runner(tasks)
def get_year(recid):
    """Return the first four-digit number found in the record's tags.

    The tags are tried in the order listed below. A tag is considered only
    when ``get_fieldvalues`` returns exactly one value for it; the first
    run of four digits in that value is returned as an ``int``.

    Returns ``None`` when no tag yields a match.
    """
    # Raw string: "\d" in a plain literal is an invalid string escape that
    # newer Python versions warn about. Compiled once, outside the loop.
    four_digits = re.compile(r"\d\d\d\d")

    for tag in ("773__y", "260__c", "269__c", "909C4y", "925__a"):
        values = get_fieldvalues([recid], tag)
        if len(values) != 1:
            # Missing or ambiguous (several values): try the next tag.
            continue
        match_obj = four_digits.search(values[0])
        if match_obj is not None:
            return int(match_obj.group())

    return None
if __name__ == "__main__":
    # Command line: LIMIT OUTPUT_PATH
    #   LIMIT       -- row limit applied to each per-letter SQL query
    #   OUTPUT_PATH -- file that receives the pickled dataset
    #
    # The script samples pairs of rows from aidPERSONIDPAPERS, labels each
    # pair (0.0 for the "same author" queries, 1.0 for the others), gathers
    # per-signature and per-record data, and pickles the tuple
    # (X, all_y, signature_data, record_data).
    if len(sys.argv) < 3:
        sys.exit("usage: %s LIMIT OUTPUT_PATH" % sys.argv[0])

    LIMIT = int(sys.argv[1])

    all_pairs = []  # 12-column rows: six columns per signature of the pair
    all_y = []      # one label per entry of all_pairs

    # Same author, same names
    for letter in string.ascii_lowercase:
        query = """SELECT a1.personid, a1.bibref_table, a1.bibref_value,
a1.bibrec, a1.name, a1.flag,
a2.personid, a2.bibref_table, a2.bibref_value,
a2.bibrec, a2.name, a2.flag
FROM aidPERSONIDPAPERS as a1
INNER JOIN aidPERSONIDPAPERS as a2 ON a1.personid = a2.personid
WHERE a1.name = a2.name AND
a1.bibrec <> a2.bibrec AND
a1.flag = 2 AND a2.flag = 2
AND a1.name LIKE '%s%%'
ORDER BY RAND()
LIMIT %d""" % (letter, LIMIT)
        print(query)
        pairs = run_sql(query)
        all_pairs.extend(pairs)
        all_y.extend([0.0] * len(pairs))

    # Same author, different names
    for letter in string.ascii_lowercase:
        query = """SELECT a1.personid, a1.bibref_table, a1.bibref_value,
a1.bibrec, a1.name, a1.flag,
a2.personid, a2.bibref_table, a2.bibref_value,
a2.bibrec, a2.name, a2.flag
FROM aidPERSONIDPAPERS as a1
INNER JOIN aidPERSONIDPAPERS as a2 ON a1.personid = a2.personid
WHERE a1.name <> a2.name AND
a1.bibrec <> a2.bibrec AND
a1.flag = 2 AND a2.flag = 2
AND a1.name LIKE '%s%%'
ORDER BY RAND()
LIMIT %d""" % (letter, LIMIT)
        print(query)
        pairs = run_sql(query)
        all_pairs.extend(pairs)
        all_y.extend([0.0] * len(pairs))

    # Different authors, same name
    for letter in string.ascii_lowercase:
        query = """SELECT a1.personid, a1.bibref_table, a1.bibref_value,
a1.bibrec, a1.name, a1.flag,
a2.personid, a2.bibref_table, a2.bibref_value,
a2.bibrec, a2.name, a2.flag
FROM aidPERSONIDPAPERS as a1
INNER JOIN aidPERSONIDPAPERS as a2 ON a1.name = a2.name
WHERE a1.personid <> a2.personid AND
a1.bibrec <> a2.bibrec AND
a1.flag = 2 AND a2.flag = 2
AND a1.name LIKE '%s%%'
ORDER BY RAND()
LIMIT %d""" % (letter, LIMIT)
        print(query)
        pairs = run_sql(query)
        all_pairs.extend(pairs)
        all_y.extend([1.0] * len(pairs))

    # Different authors, different names
    # Two samples of about sqrt(LIMIT) rows each are crossed. int() makes
    # the truncation explicit; "%d" applied to the float did the same.
    sample_size = int(LIMIT ** 0.5)
    query2 = """SELECT a1.personid, a1.bibref_table, a1.bibref_value,
a1.bibrec, a1.name, a1.flag
FROM aidPERSONIDPAPERS as a1
WHERE a1.flag = 2
ORDER BY RAND()
LIMIT %d""" % (sample_size, )
    for letter in string.ascii_lowercase:
        query1 = """SELECT a1.personid, a1.bibref_table, a1.bibref_value,
a1.bibrec, a1.name, a1.flag
FROM aidPERSONIDPAPERS as a1
WHERE a1.flag = 2 AND a1.name LIKE '%s%%'
ORDER BY RAND()
LIMIT %d""" % (letter, sample_size)
        print(query1)
        print(query2)
        for p1 in run_sql(query1):
            # NOTE(review): query2 is executed again for every p1, so each
            # p1 is paired with a fresh random sample. Kept as in the
            # original; confirm whether a single shared sample was meant.
            for p2 in run_sql(query2):
                if p1[0] != p2[0] and p1[4] != p2[4]:
                    all_pairs.append(p1 + p2)
                    all_y.append(1.0)

    # Rejected
    # Pairs joining a flag = 2 row with a flag = -2 row of the same
    # personid; labelled 1.0 like the different-author pairs.
    for letter in string.ascii_lowercase:
        query = """SELECT a1.personid, a1.bibref_table, a1.bibref_value,
a1.bibrec, a1.name, a1.flag,
a2.personid, a2.bibref_table, a2.bibref_value,
a2.bibrec, a2.name, a2.flag
FROM aidPERSONIDPAPERS as a1
INNER JOIN aidPERSONIDPAPERS as a2 ON a1.personid = a2.personid
WHERE a1.flag = 2 AND a2.flag = -2
AND a1.name LIKE '%s%%'
ORDER BY RAND()
LIMIT %d""" % (letter, LIMIT)
        print(query)
        pairs = run_sql(query)
        all_pairs.extend(pairs)
        all_y.extend([1.0] * len(pairs))

    # Assign IDs to signatures
    # A signature is keyed by (bibref_table, bibref_value, bibrec); IDs are
    # handed out in order of first appearance.
    signature_id_mapping = {}
    X = []
    for pair in all_pairs:
        pair_ids = []
        for signature in (pair[0:6], pair[6:12]):
            key = signature[1:4]
            if key not in signature_id_mapping:
                signature_id_mapping[key] = (len(signature_id_mapping),
                                             signature)
            pair_ids.append(signature_id_mapping[key][0])
        X.append(tuple(pair_ids))

    # Extract signature data
    signature_data = extract_signature_data(signature_id_mapping.values(),
                                            n_jobs=-1)
    signature_data = sorted(signature_data, key=lambda x: x["signature_id"])

    # Extract record data (one entry per distinct bibrec)
    records = {}
    for i, signature in enumerate(signature_id_mapping):
        if i % 1000 == 0:
            print(i)  # progress indicator
        recid = signature[2]
        if recid not in records:
            records[recid] = {
                'publication_id': recid,
                'title': get_title_of_paper(recid),
                'authors': get_authors_of_paper(recid),
                'references': get_refers_to(recid),
                'citations': get_cited_by(recid),
                'year': get_year(recid),
                'keywords': get_keywords_for_paper(recid),
                'collaborations': get_collaborations_for_paper(recid),
            }
    record_data = sorted(records.values(), key=lambda x: x["publication_id"])

    # Dump all
    import cPickle
    # HIGHEST_PROTOCOL is a binary format, so the file must be opened in
    # binary mode; the with-block also guarantees it is flushed and closed.
    with open(sys.argv[2], "wb") as output:
        cPickle.dump((X, all_y, signature_data, record_data),
                     output,
                     protocol=cPickle.HIGHEST_PROTOCOL)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment