Created
May 17, 2017 21:16
-
-
Save fabiobatalha/7e2ead7cc1e640cb3236b6bf79413842 to your computer and use it in GitHub Desktop.
Retorna documentos SciELO a partir de uma URL de pesquisa da ferramenta search.scielo.org
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from io import BytesIO | |
import argparse | |
from urllib.parse import urlparse, urlencode, parse_qs, urlunparse | |
from articlemeta.client import RestfulClient | |
import requests | |
from lxml import etree | |
# Page size for search.scielo.org requests: sent as the `count` parameter and
# used as the step by which the `from` offset advances between pages.
LIMIT = 100
def parse_xml(xml):
    """Parse an XML document given as text and return the lxml element tree.

    The text is encoded as UTF-8 and handed to ``etree.parse`` through an
    in-memory byte buffer.
    """
    return etree.parse(BytesIO(xml.encode('utf-8')))
def fetch_documents_metadata_from_query(url, max_retries=3):
    """Yield SciELO document metadata for every hit of a search.scielo.org query.

    The Article Meta API is meant for harvesting, not for querying, so it
    cannot be asked directly for the documents matching some search criteria.
    As a workaround, this generator replays the search on search.scielo.org
    (XML output, ``LIMIT`` records per page), extracts the ID of each hit and
    fetches the corresponding document through ``RestfulClient.document``.

    Each search ID is split into the document identifier (its first 23
    characters, e.g. ``S0102-311X2016000600601``) and the collection
    identifier (its last 3 characters, e.g. ``scl``).

    Args:
        url: Full URL copied from the search.scielo.org website. Only query
            parameters whose name contains ``q`` or ``filter`` are reused.
        max_retries: How many times a page is requested again after a
            non-200 response before giving up.

    Yields:
        Whatever ``RestfulClient.document(pid, collection)`` returns for each
        search hit, in search-result order.

    Raises:
        requests.HTTPError: If a page still answers with a non-200 status
            after ``max_retries`` retries.
    """
    rc = RestfulClient()

    uparsed = urlparse(url)
    query = {
        k: v for k, v in parse_qs(uparsed.query).items()
        if 'q' in k or 'filter' in k
    }

    offset = 1  # the search engine numbers records starting at 1
    query['output'] = ['xml']
    query['from'] = [str(offset)]
    query['count'] = [str(LIMIT)]

    failures = 0
    while True:
        query['from'] = [str(offset)]
        page_url = urlunparse(
            ['http', 'search.scielo.org', '/', '', urlencode(query, doseq=True), '']
        )
        result = requests.get(page_url, timeout=10)

        if result.status_code != 200:
            # Retry the SAME page: advancing the offset here would silently
            # drop a page of results, and retrying without a bound would spin
            # forever while the server keeps failing.
            failures += 1
            if failures > max_retries:
                raise requests.HTTPError(
                    'search.scielo.org returned HTTP %s for %s'
                    % (result.status_code, page_url),
                    response=result,
                )
            continue
        failures = 0

        xml = parse_xml(result.text)
        ids = xml.xpath("//result/doc/str[@name='id']/text()")
        if not ids:
            # An empty page means the result set is exhausted.
            break

        for item in ids:
            col = item[-3:]
            pid = item[:23]
            yield rc.document(pid, col)

        offset += LIMIT
def output(documents):
    """Print a CSV report (header plus one row per document) to stdout.

    Make your mess here! All the SciELO metadata is available for each
    document through the Article Meta API.

    The header is printed unquoted; every data cell is wrapped in double
    quotes with embedded double quotes doubled. Missing (``None``) values are
    written as empty cells, and multi-valued cells (countries, ISO country
    codes, languages) are de-duplicated, sorted and joined with ``;`` so the
    report is stable between runs.

    Args:
        documents: Iterable of document objects exposing ``publisher_id``,
            ``doi``, ``publication_date``, ``creation_date``,
            ``original_title()``, ``journal.scielo_issn``, ``journal.title``,
            ``authors``, ``mixed_affiliations`` and ``languages()``.
    """
    header = [
        'SciELO ID',
        'DOI',
        'publication date',
        'published at scielo date',
        'original title',
        'ISSN',
        'journal title',
        'total authors',
        'not normalized country',
        'ISO 3166 Affiliation',
        'ISO Language'
    ]

    print(','.join(header))

    for document in documents:
        # Guard every optional collection the same way so a document without
        # affiliations, authors or languages still produces a row.
        affiliations = document.mixed_affiliations or []
        countries = sorted({
            aff['country']
            for aff in affiliations
            if 'country' in aff and aff['country']
        })
        iso_countries = sorted({
            aff['country_iso_3166'].upper()
            for aff in affiliations
            if 'country_iso_3166' in aff and aff['country_iso_3166']
        })
        languages = sorted({lang.upper() for lang in document.languages() or []})

        data = [
            document.publisher_id,
            document.doi,
            document.publication_date,
            document.creation_date,
            document.original_title(),
            document.journal.scielo_issn,
            document.journal.title,
            len(document.authors or []),
            ';'.join(countries),
            ';'.join(iso_countries),
            ';'.join(languages),
        ]

        # Falsy values (None, '') become empty cells; everything else is
        # stringified before the CSV quote escaping is applied.
        cells = [str(value) if value or value == 0 else '' for value in data]
        joined_data = u','.join(
            [u'"%s"' % cell.replace(u'"', u'""') for cell in cells]
        )

        print(joined_data)
def main():
    """Command-line entry point.

    Reads the mandatory ``--search_query_url`` / ``-s`` option, streams the
    documents matching that search and prints them through ``output``.
    """
    cli = argparse.ArgumentParser(
        description='Load SciELO IDs from SciELO Search engine'
    )
    cli.add_argument(
        '--search_query_url',
        '-s',
        required=True,
        help='Full URL from the search.scielo.org website containing you filters and query',
    )
    options = cli.parse_args()

    output(fetch_documents_metadata_from_query(options.search_query_url))
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment