Last active
June 18, 2018 14:47
-
-
Save arianpasquali/d81b66d068503084f69bd81b48595091 to your computer and use it in GitHub Desktop.
Indexing the Semantic Scholar corpus into Solr (http://labs.semanticscholar.org/corpus/)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# requirements:
#   pip install pysolr
#
# usage:
#   python index_into_solr.py /home/ubuntu/workspace/semantic_scholar_dataset/s2-corpus-00.gz
import json | |
import pysolr | |
import os | |
import gzip | |
import sys | |
# Solr core endpoint used by every index operation in this script.
SOLR_ADDRESS = 'http://localhost:8983/solr/semantic_scholar'
# Number of documents buffered before each batch add/commit to Solr.
PAGE_SIZE = 10000
# Cap used by the (currently commented-out) sampling loop in parse_files.
SAMPLE_SIZE = 10000
# Module-level client, created at import time.
# NOTE(review): pysolr's timeout is in seconds — confirm 100 s is intended.
solr = pysolr.Solr(SOLR_ADDRESS, timeout=100)
def reset_index():
    """Delete every document from the configured Solr index."""
    print("reset solr index")
    # "*:*" matches all documents in the core.
    solr.delete("*:*")
def read_files(directory_in_str, extension):
    """Return full paths of files in *directory_in_str* ending with *extension*.

    Args:
        directory_in_str: directory to scan, as a string path.
        extension: filename suffix to match, e.g. ".gz".

    Returns:
        List of paths (directory joined with matching filename).
    """
    result = []
    # Bug fix: the original compared against a hard-coded ".gz" and ignored
    # the *extension* parameter entirely; honor the parameter instead.
    # Listing a str path yields str filenames directly, so the original
    # os.fsencode / os.fsdecode / str(file, "utf8") round-trip is unnecessary.
    for filename in os.listdir(directory_in_str):
        if filename.endswith(extension):
            result.append(os.path.join(directory_in_str, filename))
    return result
def parse_files(filepath):
    """Parse a gzipped JSON-lines Semantic Scholar corpus file and index it into Solr.

    Each line of *filepath* is one JSON record. Records are flattened
    (nested authors replaced with parallel name/id lists, citation count
    precomputed) and sent to Solr in batches of PAGE_SIZE documents.

    Args:
        filepath: path to one s2-corpus-*.gz file.
    """
    json_docs = []
    count = 0
    # Stream line by line instead of readlines(): corpus files are large and
    # the original approach held the whole decompressed file in memory.
    with gzip.open(filepath, "rb") as corpus:
        dataset_filename = os.path.basename(corpus.name)
        for raw_line in corpus:
            count += 1
            json_doc = json.loads(str(raw_line, "utf8"))
            # journalVolume is dropped before indexing (as in the original);
            # pop() tolerates records where the field is absent.
            json_doc.pop("journalVolume", None)
            authors_names = []
            authors_ids = []
            for author_item in json_doc.get("authors", []):
                authors_names.append(author_item["name"])
                # For each author keep only the first id; -1 when missing.
                ids = author_item.get("ids") or []
                authors_ids.append(ids[0] if ids else -1)
            json_doc["authors_names"] = authors_names
            json_doc["authors_ids"] = authors_ids
            # Ignore the original nested authors structure.
            json_doc.pop("authors", None)
            json_doc["dataset_filename"] = dataset_filename
            # Persist total citation count for easy sorting in Solr.
            json_doc["total_citations"] = len(json_doc.get("inCitations", []))
            json_docs.append(json_doc)
            # Bug fix: the original if/else skipped parsing of every
            # PAGE_SIZE-th line entirely; now every line is parsed and the
            # batch is flushed after it is appended.
            if count % PAGE_SIZE == 0:
                print(corpus, "persisting into solr", count)
                solr.add(json_docs, commit=True)
                json_docs = []
    # Bug fix: flush the trailing partial batch, which the original dropped.
    if json_docs:
        solr.add(json_docs, commit=True)
    print("loaded ", count, "docs")
if __name__ == "__main__":
    print("called main")
    # Expect exactly one positional argument: path to a gzipped corpus file.
    args = sys.argv
    if len(args) < 2:
        # No input given — show usage and exit without indexing anything.
        print("Please inform input file.")
        print(" Example:")
        print(" ".join(["python", args[0], "[input_file]"]))
    else:
        papers_file = args[1]
        print("parsing ", papers_file)
        parse_files(papers_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment