Created
June 3, 2021 04:29
-
-
Save almugabo/ee4eff46dc1165374c85f24dbef4abb9 to your computer and use it in GitHub Desktop.
extraction of references in structured form
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
FROM: https://gist.github.com/sobolevnrm/412763ebae5424a92d3239898b615e2a | |
Process RIS format following the standard at", | |
http://referencemanager.com/sites/rm/files/m/direct_export_ris.pdf """ | |
import re | |
ALLOWED_TAGS = {"TY" : "Record start", | |
"ER" : "Record end", | |
"A2" : "Secondary author", | |
"A3" : "Tertiary Author", | |
"A4" : "Subsidiary Author", | |
"AB" : "Abstract", | |
"AD" : "Author Address", | |
"AN" : "Accession Number", | |
"AU" : "Author", | |
"C1" : "Custom 1", | |
"C2" : "Custom 2", | |
"C3" : "Custom 3", | |
"C4" : "Custom 4", | |
"C5" : "Custom 5", | |
"C6" : "Custom 6", | |
"C7" : "Custom 7", | |
"C8" : "Custom 8", | |
"CA" : "Caption", | |
"CN" : "Call Number", | |
"CY" : "Place Published", | |
"DA" : "Date", | |
"DB" : "Name of Database", | |
"DO" : "DOI", | |
"DP" : "Database Provider", | |
"ET" : "Edition", | |
"J2" : "Alternate Title", | |
"KW" : "Keywords", | |
"L1" : "File Attachments", | |
"L4" : "Figure", | |
"LA" : "Language", | |
"LB" : "Label", | |
"IS" : "Number", | |
"M3" : "Type of Work", | |
"N1" : "Notes", | |
"NV" : "Number of Volumes", | |
"OP" : "Original Publication", | |
"PB" : "Publisher", | |
"PY" : "Year"} | |
REFERENCE_TYPES = {"ABST" : "Abstract", | |
"ADVS" : "Audiovisual material", | |
"ART" : "Art Work", | |
"BILL" : "Bill/Resolution", | |
"BOOK" : "Book, Whole", | |
"CASE" : "Case", | |
"CHAP" : "Book chapter", | |
"COMP" : "Computer program", | |
"CONF" : "Conference proceeding", | |
"CTLG" : "Catalog", | |
"DATA" : "Data file", | |
"ELEC" : "Electronic Citation", | |
"GEN" : "Generic", | |
"HEAR" : "Hearing", | |
"ICOMM" : "Internet Communication", | |
"INPR" : "In Press", | |
"JFULL" : "Journal (full)", | |
"JOUR" : "Journal", | |
"MAP" : "Map", | |
"MGZN" : "Magazine article", | |
"MPCT" : "Motion picture", | |
"MUSIC" : "Music score", | |
"NEWS" : "Newspaper", | |
"PAMP" : "Pamphlet", | |
"PAT" : "Patent", | |
"PCOMM" : "Personal communication", | |
"RPRT" : "Report", | |
"SER" : "Serial (Book, Monograph)", | |
"SLIDE" : "Slide", | |
"SOUND" : "Sound recording", | |
"STAT" : "Statute", | |
"THES" : "Thesis/Dissertation", | |
"UNBILl" : "Unenacted bill/resolution", | |
"UNPB" : "Unpublished work", | |
"VIDEO" : "Video recording"} | |
class RIS:
    """Parser for RIS-formatted bibliographic records.

    Completed records are collected in ``self.records`` as dictionaries keyed
    by two-character RIS tag.  Tags in ``_REPEATABLE_TAGS`` map to a list of
    values (one per occurrence); every other tag maps to a single string.
    A record is only added to ``self.records`` when its ``ER`` line is seen.
    """

    # Tags that may occur several times within one record.
    _REPEATABLE_TAGS = frozenset(["AU", "AD", "KW", "N1"])

    # "<TAG> - <value>": a two-character tag, a dash surrounded by optional
    # spaces, then the (possibly empty) field value.
    _FIELD_PATTERN = re.compile(r"^([A-Z][A-Z0-9]) *- *(.*)")

    def __init__(self, in_file=None):
        """Create the parser and, if ``in_file`` is given, parse it.

        Args:
            in_file: optional iterable of text lines (e.g. an open text file).
        """
        self.records = []
        # Set here as well so process_field() is usable without parse().
        self.current_tag = None
        self.current_record = None
        if in_file:
            self.parse(in_file)

    def parse(self, in_file):
        """Parse an iterable of RIS lines, appending to ``self.records``.

        Blank lines are skipped.  A leading UTF-8 byte-order mark is removed
        so that files saved "with BOM" are not rejected on their first line.

        Args:
            in_file: iterable of text lines.

        Raises:
            ValueError: if a non-blank line is not a ``TAG - value`` field, a
                field appears outside a ``TY`` ... ``ER`` record, or a
                non-repeatable tag occurs twice in one record.
        """
        self.current_tag = None
        self.current_record = None
        for raw_line in in_file:
            # U+FEFF is not whitespace, so str.strip() alone leaves it behind.
            line = raw_line.lstrip("\ufeff").strip()
            if not line:
                continue
            match = self._FIELD_PATTERN.match(line)
            if match is None:
                raise ValueError(line)
            tag, field = match.groups()
            self.process_field(tag, field)

    def process_field(self, tag, field):
        """Store one parsed field in the record currently being built.

        Args:
            tag: two-character RIS tag.
            field: the field's text value.

        Raises:
            ValueError: if the field is not inside a record opened by ``TY``,
                or if a non-repeatable tag is seen twice in the same record.
        """
        if tag == "TY":
            # Start of a new record.
            self.current_record = {tag: field}
            return

        record = self.current_record
        if record is None:
            # Previously this surfaced as an opaque TypeError (or, for a
            # stray "ER", silently appended None to the records).
            raise ValueError("Tag %s found outside of a record" % tag)

        if tag == "ER":
            # End of record: publish it and wait for the next "TY".
            self.records.append(record)
            self.current_record = None
        elif tag in self._REPEATABLE_TAGS:
            record.setdefault(tag, []).append(field)
        elif tag in record:
            raise ValueError("Duplicate tag: %s" % tag)
        else:
            record[tag] = field
import pandas as pd | |
# Parse the RIS export at ``xFile`` and load its records into a DataFrame.
# NOTE(review): ``xFile`` is not defined in this file - it is presumably set
# by the surrounding notebook/session; confirm before running as a script.
with open(xFile, encoding="utf8") as ff:
    ris = RIS(ff)

# One row per parsed record, one column per RIS tag encountered.
d1 = pd.DataFrame(ris.records)
print(len(d1))
d1.head()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment