Indexing the Semantic Scholar corpus into Solr (http://labs.semanticscholar.org/corpus/)
# requirements:
# pip install pysolr
#
# usage:
# python index_into_solr.py /home/ubuntu/workspace/semantic_scholar_dataset/s2-corpus-00.gz
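#
# note (assumption, not stated in the original gist): a Solr core named
# "semantic_scholar" must already exist at SOLR_ADDRESS; with a default local
# Solr install it can be created with, for example:
#   bin/solr create -c semantic_scholar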
import json
import pysolr
import os
import gzip
import sys
SOLR_ADDRESS = 'http://localhost:8983/solr/semantic_scholar'
PAGE_SIZE = 10000
SAMPLE_SIZE = 10000
solr = pysolr.Solr(SOLR_ADDRESS, timeout=100)
def reset_index():
    print("reset solr index")
    # delete by query: pysolr expects the query as the `q` keyword argument
    solr.delete(q="*:*")
def read_files(directory_in_str, extension):
    """Return paths of the files in a directory that end with the given extension (e.g. ".gz")."""
    result = []
    for file in os.listdir(directory_in_str):
        filename = os.fsdecode(file)
        if filename.endswith(extension):
            result.append(os.path.join(directory_in_str, filename))
    return result
def parse_files(filepath):
    """Parse one gzipped corpus file and index its documents into Solr in batches."""
    corpus_file = gzip.open(filepath, "rb")
    print("loading json file")
    lines = corpus_file.readlines()
    print("loaded", len(lines), "docs")
    dataset_filename = os.path.basename(corpus_file.name)
    json_docs = []
    i = 0
    #for l in lines[:SAMPLE_SIZE]:
    for l in lines:
        i = i + 1
        json_str = str(l, "utf8")
        json_doc = json.loads(json_str)
        # drop journalVolume; it is not indexed
        json_doc.pop("journalVolume", None)
        authors_names = []
        authors_ids = []
        for author_item in json_doc["authors"]:
            author_name = author_item["name"]
            # for each author consider only the first author id
            author_id = -1
            if "ids" in author_item and len(author_item["ids"]) > 0:
                author_id = author_item["ids"][0]
            authors_ids.append(author_id)
            authors_names.append(author_name)
        json_doc["authors_names"] = authors_names
        json_doc["authors_ids"] = authors_ids
        # ignore the original nested authors structure
        del json_doc["authors"]
        json_doc["dataset_filename"] = dataset_filename
        # persist the total citation count for easy sorting
        if "inCitations" in json_doc:
            json_doc["total_citations"] = len(json_doc["inCitations"])
        else:
            json_doc["total_citations"] = 0
        json_docs.append(json_doc)
        # flush a full batch to Solr
        if i % PAGE_SIZE == 0:
            print(corpus_file.name, "persisting into solr", i)
            solr.add(json_docs, commit=True)
            json_docs = []
    # flush the last (partial) batch
    if json_docs:
        print(corpus_file.name, "persisting into solr", i)
        solr.add(json_docs, commit=True)
    corpus_file.close()
if __name__ == "__main__":
    if len(sys.argv) > 1:
        papers_file = sys.argv[1]
        print("parsing", papers_file)
        parse_files(papers_file)
    else:
        print("Please provide an input file.")
        print(" Example:")
        print(" ".join(["python", sys.argv[0], "[input_file]"]))