Last active
June 18, 2018 14:47
-
-
Save arianpasquali/d81b66d068503084f69bd81b48595091 to your computer and use it in GitHub Desktop.
Indexing the Semantic Scholar corpus into Solr (http://labs.semanticscholar.org/corpus/)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# requirements:
#   pip install pysolr
#
# usage:
#   python index_into_solr.py /home/ubuntu/workspace/semantic_scholar_dataset/s2-corpus-00.gz
import json | |
import pysolr | |
import os | |
import gzip | |
import sys | |
# Solr core endpoint used by every index operation in this script.
SOLR_ADDRESS = 'http://localhost:8983/solr/semantic_scholar'
# Number of documents buffered before each batch add/commit to Solr.
PAGE_SIZE = 10000
# Cap used by the (currently commented-out) sampling loop in parse_files.
SAMPLE_SIZE = 10000
# Module-level client, created at import time.
# NOTE(review): pysolr's timeout is in seconds — confirm 100 s is intended.
solr = pysolr.Solr(SOLR_ADDRESS, timeout=100)
def reset_index():
    """Delete every document from the configured Solr index."""
    print("reset solr index")
    # "*:*" matches all documents in the core.
    solr.delete("*:*")
def read_files(directory_in_str, extension):
    """Return full paths of files in *directory_in_str* ending with *extension*.

    Args:
        directory_in_str: directory to scan, as a string path.
        extension: filename suffix to match, e.g. ".gz".

    Returns:
        List of paths (directory joined with matching filename).
    """
    result = []
    # Bug fix: the original compared against a hard-coded ".gz" and ignored
    # the *extension* parameter entirely; honor the parameter instead.
    # Listing a str path yields str filenames directly, so the original
    # os.fsencode / os.fsdecode / str(file, "utf8") round-trip is unnecessary.
    for filename in os.listdir(directory_in_str):
        if filename.endswith(extension):
            result.append(os.path.join(directory_in_str, filename))
    return result
def parse_files(filepath):
    """Parse a gzipped JSON-lines Semantic Scholar corpus file and index it into Solr.

    Each line of *filepath* is one JSON record. Records are flattened
    (nested authors replaced with parallel name/id lists, citation count
    precomputed) and sent to Solr in batches of PAGE_SIZE documents.

    Args:
        filepath: path to one s2-corpus-*.gz file.
    """
    json_docs = []
    count = 0
    # Stream line by line instead of readlines(): corpus files are large and
    # the original approach held the whole decompressed file in memory.
    with gzip.open(filepath, "rb") as corpus:
        dataset_filename = os.path.basename(corpus.name)
        for raw_line in corpus:
            count += 1
            json_doc = json.loads(str(raw_line, "utf8"))
            # journalVolume is dropped before indexing (as in the original);
            # pop() tolerates records where the field is absent.
            json_doc.pop("journalVolume", None)
            authors_names = []
            authors_ids = []
            for author_item in json_doc.get("authors", []):
                authors_names.append(author_item["name"])
                # For each author keep only the first id; -1 when missing.
                ids = author_item.get("ids") or []
                authors_ids.append(ids[0] if ids else -1)
            json_doc["authors_names"] = authors_names
            json_doc["authors_ids"] = authors_ids
            # Ignore the original nested authors structure.
            json_doc.pop("authors", None)
            json_doc["dataset_filename"] = dataset_filename
            # Persist total citation count for easy sorting in Solr.
            json_doc["total_citations"] = len(json_doc.get("inCitations", []))
            json_docs.append(json_doc)
            # Bug fix: the original if/else skipped parsing of every
            # PAGE_SIZE-th line entirely; now every line is parsed and the
            # batch is flushed after it is appended.
            if count % PAGE_SIZE == 0:
                print(corpus, "persisting into solr", count)
                solr.add(json_docs, commit=True)
                json_docs = []
    # Bug fix: flush the trailing partial batch, which the original dropped.
    if json_docs:
        solr.add(json_docs, commit=True)
    print("loaded ", count, "docs")
if __name__ == "__main__":
    print("called main")
    # Expect exactly one positional argument: path to a gzipped corpus file.
    args = sys.argv
    if len(args) < 2:
        # No input given — show usage and exit without indexing anything.
        print("Please inform input file.")
        print(" Example:")
        print(" ".join(["python", args[0], "[input_file]"]))
    else:
        papers_file = args[1]
        print("parsing ", papers_file)
        parse_files(papers_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment