# almugabo/reference_extraction.py

## reference_extraction.py
"""
FROM: https://gist.github.com/sobolevnrm/412763ebae5424a92d3239898b615e2a

Process RIS format following the standard at
http://referencemanager.com/sites/rm/files/m/direct_export_ris.pdf """

import re

# RIS field tags mapped to a short human-readable description.
# NOTE(review): the parser in this file does not appear to validate incoming
# tags against this table -- confirm whether it is meant to be exhaustive.
ALLOWED_TAGS = {
    # Record delimiters
    "TY": "Record start",
    "ER": "Record end",
    # Authors, abstract and identifiers
    "A2": "Secondary author",
    "A3": "Tertiary Author",
    "A4": "Subsidiary Author",
    "AB": "Abstract",
    "AD": "Author Address",
    "AN": "Accession Number",
    "AU": "Author",
    # Free-form custom fields
    "C1": "Custom 1",
    "C2": "Custom 2",
    "C3": "Custom 3",
    "C4": "Custom 4",
    "C5": "Custom 5",
    "C6": "Custom 6",
    "C7": "Custom 7",
    "C8": "Custom 8",
    # Everything else, in the order of the original table
    "CA": "Caption",
    "CN": "Call Number",
    "CY": "Place Published",
    "DA": "Date",
    "DB": "Name of Database",
    "DO": "DOI",
    "DP": "Database Provider",
    "ET": "Edition",
    "J2": "Alternate Title",
    "KW": "Keywords",
    "L1": "File Attachments",
    "L4": "Figure",
    "LA": "Language",
    "LB": "Label",
    "IS": "Number",
    "M3": "Type of Work",
    "N1": "Notes",
    "NV": "Number of Volumes",
    "OP": "Original Publication",
    "PB": "Publisher",
    "PY": "Year",
}
# RIS reference-type codes mapped to a short human-readable description.
# Every key is upper case; the "UNBILL" key was previously misspelled with a
# trailing lowercase "l", which made lookups of the real code fail.
REFERENCE_TYPES = {
    "ABST": "Abstract",
    "ADVS": "Audiovisual material",
    "ART": "Art Work",
    "BILL": "Bill/Resolution",
    "BOOK": "Book, Whole",
    "CASE": "Case",
    "CHAP": "Book chapter",
    "COMP": "Computer program",
    "CONF": "Conference proceeding",
    "CTLG": "Catalog",
    "DATA": "Data file",
    "ELEC": "Electronic Citation",
    "GEN": "Generic",
    "HEAR": "Hearing",
    "ICOMM": "Internet Communication",
    "INPR": "In Press",
    "JFULL": "Journal (full)",
    "JOUR": "Journal",
    "MAP": "Map",
    "MGZN": "Magazine article",
    "MPCT": "Motion picture",
    "MUSIC": "Music score",
    "NEWS": "Newspaper",
    "PAMP": "Pamphlet",
    "PAT": "Patent",
    "PCOMM": "Personal communication",
    "RPRT": "Report",
    "SER": "Serial (Book, Monograph)",
    "SLIDE": "Slide",
    "SOUND": "Sound recording",
    "STAT": "Statute",
    "THES": "Thesis/Dissertation",
    "UNBILL": "Unenacted bill/resolution",
    "UNPB": "Unpublished work",
    "VIDEO": "Video recording",
}
class RIS:
    """Parsed contents of an RIS bibliography file.

    Attributes:
        records: list of dicts, one per completed ``TY`` ... ``ER`` record.
            Tags listed in ``_REPEATABLE_TAGS`` map to a list of strings
            (one entry per occurrence); every other tag maps to one string.
        current_record: the record being assembled, or ``None`` when the
            parser is between records.
        current_tag: always ``None``; kept only because earlier versions
            exposed the attribute.
    """

    # "XX - value" lines; any number of spaces is accepted around the hyphen.
    _FIELD_RE = re.compile(r"^([A-Z][A-Z0-9]) *- *(.*)")

    # Tags that may repeat inside a record; their values are accumulated
    # into a list instead of raising a duplicate-tag error.
    _REPEATABLE_TAGS = frozenset(["AU", "AD", "KW", "N1"])

    def __init__(self, in_file=None):
        """Create an empty container and, if given, parse ``in_file``.

        Args:
            in_file: optional iterable of text lines (e.g. an open text
                file). Falsy values are ignored.
        """
        self.records = []
        self.current_tag = None
        self.current_record = None
        if in_file:
            self.parse(in_file)

    def parse(self, in_file):
        """Parse RIS lines from ``in_file`` and append to ``self.records``.

        Blank lines are skipped. A record that is still open when the input
        ends (no ``ER`` line) is not added to ``self.records``.

        Args:
            in_file: iterable of text lines.

        Raises:
            ValueError: if a non-blank line is not a ``TAG - value`` field,
                or if ``process_field`` rejects the field.
        """
        self.current_tag = None
        self.current_record = None
        for line_number, raw_line in enumerate(in_file, start=1):
            if line_number == 1:
                # A UTF-8 byte-order mark is not whitespace, so str.strip()
                # leaves it in place and the first tag would not match.
                raw_line = raw_line.lstrip("\ufeff")
            line = raw_line.strip()
            if not line:
                continue
            match = self._FIELD_RE.match(line)
            if match is None:
                raise ValueError(
                    "Line %d is not a valid RIS field: %r" % (line_number, line)
                )
            tag, field = match.groups()
            self.process_field(tag, field)

    def process_field(self, tag, field):
        """Apply one ``tag``/``field`` pair to the record being built.

        ``TY`` opens a new record (replacing any record still open), ``ER``
        closes the open record and appends it to ``self.records``.

        Args:
            tag: two-character RIS tag.
            field: text following the tag separator.

        Raises:
            ValueError: if the tag appears outside a ``TY`` ... ``ER``
                record, or a non-repeatable tag occurs twice in one record.
        """
        if tag == "TY":
            self.current_record = {tag: field}
            return

        record = self.current_record
        if record is None:
            # Previously this surfaced as a TypeError (or, for ER, silently
            # appended None to the records list).
            raise ValueError("Tag %s found outside of a TY ... ER record" % tag)

        if tag == "ER":
            self.records.append(record)
            self.current_record = None
        elif tag in self._REPEATABLE_TAGS:
            record.setdefault(tag, []).append(field)
        elif tag in record:
            raise ValueError("Duplicate tag: %s" % tag)
        else:
            record[tag] = field

import pandas as pd
# Script section: parse the RIS file at ``xFile`` and load its records into a
# pandas DataFrame.
# NOTE(review): ``xFile`` is not defined anywhere in the visible source --
# presumably it is set beforehand (e.g. in an earlier notebook cell); confirm.
with open(xFile, 'r', encoding="utf8") as ff:
    # RIS.parse iterates over the open handle line by line, so the file does
    # not need to be read into a string first.
    ris = RIS(ff)


# Build the frame from the list of per-record dicts collected by the parser.
d1 = pd.DataFrame(ris.records)

print(len(d1))  # number of rows, i.e. parsed records
# NOTE(review): a bare expression only displays output in an interactive
# session or notebook; it has no visible effect when run as a script.
d1.head()
	"""
	FROM: https://gist.github.com/sobolevnrm/412763ebae5424a92d3239898b615e2a

	Process RIS format following the standard at
	http://referencemanager.com/sites/rm/files/m/direct_export_ris.pdf """

	import re

	# RIS field tags mapped to a short human-readable description.
	ALLOWED_TAGS = {
	    # Record delimiters
	    "TY": "Record start",
	    "ER": "Record end",
	    # Authors, abstract and identifiers
	    "A2": "Secondary author",
	    "A3": "Tertiary Author",
	    "A4": "Subsidiary Author",
	    "AB": "Abstract",
	    "AD": "Author Address",
	    "AN": "Accession Number",
	    "AU": "Author",
	    # Free-form custom fields
	    "C1": "Custom 1",
	    "C2": "Custom 2",
	    "C3": "Custom 3",
	    "C4": "Custom 4",
	    "C5": "Custom 5",
	    "C6": "Custom 6",
	    "C7": "Custom 7",
	    "C8": "Custom 8",
	    # Everything else, in the order of the original table
	    "CA": "Caption",
	    "CN": "Call Number",
	    "CY": "Place Published",
	    "DA": "Date",
	    "DB": "Name of Database",
	    "DO": "DOI",
	    "DP": "Database Provider",
	    "ET": "Edition",
	    "J2": "Alternate Title",
	    "KW": "Keywords",
	    "L1": "File Attachments",
	    "L4": "Figure",
	    "LA": "Language",
	    "LB": "Label",
	    "IS": "Number",
	    "M3": "Type of Work",
	    "N1": "Notes",
	    "NV": "Number of Volumes",
	    "OP": "Original Publication",
	    "PB": "Publisher",
	    "PY": "Year",
	}
	# RIS reference-type codes mapped to a short human-readable description.
	# Every key is upper case; the "UNBILL" key was previously misspelled with
	# a trailing lowercase "l", which made lookups of the real code fail.
	REFERENCE_TYPES = {
	    "ABST": "Abstract",
	    "ADVS": "Audiovisual material",
	    "ART": "Art Work",
	    "BILL": "Bill/Resolution",
	    "BOOK": "Book, Whole",
	    "CASE": "Case",
	    "CHAP": "Book chapter",
	    "COMP": "Computer program",
	    "CONF": "Conference proceeding",
	    "CTLG": "Catalog",
	    "DATA": "Data file",
	    "ELEC": "Electronic Citation",
	    "GEN": "Generic",
	    "HEAR": "Hearing",
	    "ICOMM": "Internet Communication",
	    "INPR": "In Press",
	    "JFULL": "Journal (full)",
	    "JOUR": "Journal",
	    "MAP": "Map",
	    "MGZN": "Magazine article",
	    "MPCT": "Motion picture",
	    "MUSIC": "Music score",
	    "NEWS": "Newspaper",
	    "PAMP": "Pamphlet",
	    "PAT": "Patent",
	    "PCOMM": "Personal communication",
	    "RPRT": "Report",
	    "SER": "Serial (Book, Monograph)",
	    "SLIDE": "Slide",
	    "SOUND": "Sound recording",
	    "STAT": "Statute",
	    "THES": "Thesis/Dissertation",
	    "UNBILL": "Unenacted bill/resolution",
	    "UNPB": "Unpublished work",
	    "VIDEO": "Video recording",
	}
	class RIS:
	    """Parsed contents of an RIS bibliography file.

	    Attributes:
	        records: list of dicts, one per completed ``TY`` ... ``ER`` record.
	            Tags listed in ``_REPEATABLE_TAGS`` map to a list of strings
	            (one entry per occurrence); every other tag maps to one string.
	        current_record: the record being assembled, or ``None`` when the
	            parser is between records.
	        current_tag: always ``None``; kept only because earlier versions
	            exposed the attribute.
	    """

	    # "XX - value" lines. Any number of spaces is accepted around the
	    # hyphen: the former pattern required exactly one on each side and so
	    # rejected the usual "TY  - JOUR" layout and a stripped "ER  -" line.
	    _FIELD_RE = re.compile(r"^([A-Z][A-Z0-9]) *- *(.*)")

	    # Tags that may repeat inside a record; their values are accumulated
	    # into a list instead of raising a duplicate-tag error.
	    _REPEATABLE_TAGS = frozenset(["AU", "AD", "KW", "N1"])

	    def __init__(self, in_file=None):
	        """Create an empty container and, if given, parse ``in_file``.

	        Args:
	            in_file: optional iterable of text lines (e.g. an open text
	                file). Falsy values are ignored.
	        """
	        self.records = []
	        self.current_tag = None
	        self.current_record = None
	        if in_file:
	            self.parse(in_file)

	    def parse(self, in_file):
	        """Parse RIS lines from ``in_file`` and append to ``self.records``.

	        Blank lines are skipped. A record that is still open when the input
	        ends (no ``ER`` line) is not added to ``self.records``.

	        Args:
	            in_file: iterable of text lines.

	        Raises:
	            ValueError: if a non-blank line is not a ``TAG - value`` field,
	                or if ``process_field`` rejects the field.
	        """
	        self.current_tag = None
	        self.current_record = None
	        for line_number, raw_line in enumerate(in_file, start=1):
	            if line_number == 1:
	                # A UTF-8 byte-order mark is not whitespace, so str.strip()
	                # leaves it in place and the first tag would not match.
	                raw_line = raw_line.lstrip("\ufeff")
	            line = raw_line.strip()
	            if not line:
	                continue
	            match = self._FIELD_RE.match(line)
	            if match is None:
	                raise ValueError(
	                    "Line %d is not a valid RIS field: %r" % (line_number, line)
	                )
	            tag, field = match.groups()
	            self.process_field(tag, field)

	    def process_field(self, tag, field):
	        """Apply one ``tag``/``field`` pair to the record being built.

	        ``TY`` opens a new record (replacing any record still open), ``ER``
	        closes the open record and appends it to ``self.records``.

	        Args:
	            tag: two-character RIS tag.
	            field: text following the tag separator.

	        Raises:
	            ValueError: if the tag appears outside a ``TY`` ... ``ER``
	                record, or a non-repeatable tag occurs twice in one record.
	        """
	        if tag == "TY":
	            self.current_record = {tag: field}
	            return

	        record = self.current_record
	        if record is None:
	            # Previously this surfaced as a TypeError (or, for ER, silently
	            # appended None to the records list).
	            raise ValueError("Tag %s found outside of a TY ... ER record" % tag)

	        if tag == "ER":
	            self.records.append(record)
	            self.current_record = None
	        elif tag in self._REPEATABLE_TAGS:
	            record.setdefault(tag, []).append(field)
	        elif tag in record:
	            raise ValueError("Duplicate tag: %s" % tag)
	        else:
	            record[tag] = field

	import pandas as pd
	# Script section: parse the RIS file at ``xFile`` and load its records
	# into a pandas DataFrame.
	# NOTE(review): ``xFile`` is not defined anywhere in the visible source --
	# presumably it is set beforehand (e.g. in an earlier notebook cell).
	with open(xFile, 'r', encoding="utf8") as ff:
	    # RIS.parse iterates over the open handle line by line, so the file
	    # does not need to be read into a string first.
	    ris = RIS(ff)

	# Build the frame from the list of per-record dicts collected by the parser.
	d1 = pd.DataFrame(ris.records)

	print(len(d1))  # number of rows, i.e. parsed records
	# NOTE(review): a bare expression only displays output in an interactive
	# session or notebook; it has no visible effect when run as a script.
	d1.head()