Last active
December 16, 2015 04:09
-
-
Save fabiobatalha/5374995 to your computer and use it in GitHub Desktop.
Script para carregar texto completo de pressreleases em arquivos html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
import os | |
import shutil | |
import argparse | |
import json | |
def dict_articles_press_releases(cisis_path, scielo_path):
    """Map each press-release PID to the PID(s) of the article it covers.

    Runs the CISIS ``mx`` tool over the ``artigo`` database selecting
    records indexed under AHPR, and parses its pipe-separated output:
    ``article_pid|ahead_pid|press_release_pid``.

    Returns a dict ``{pr_pid: [article_pid]}``, with the ahead-of-print
    PID appended when present.  A press release listed more than once
    keeps only the last article seen (last write wins).
    """
    command = '''{0}/mx {1}/bases/artigo/artigo btell=0 lw=0 AHPR=$ pft="v880,'|',v881,'|',v241^i/" -all now'''.format(cisis_path, scielo_path)
    mapping = {}
    for record in os.popen(command):
        fields = [part.strip() for part in record.split('|')]
        pids = [fields[0]]
        if fields[1]:
            pids.append(fields[1])
        mapping[fields[2]] = pids
    return mapping
def list_all_press_releases(cisis_path, scielo_path):
    """Collect metadata for every press-release record (PHR index) in the
    ``artigo`` database.

    Returns ``{pid: meta}`` where ``meta`` holds label, translation_path,
    related_to, original_language, acronym, trans_file, issue and a
    ``titles`` dict mapping language code -> title text.
    """
    command = '''{0}/mx {1}/bases/artigo/artigo btell=0 lw=0 PHR=$ pft="(v880[1],'|',v4[1],'|',v702[1],'|',v882[1],'|',v40[1],'|',v12^*,'|',v12^l,/)'" -all now'''.format(cisis_path, scielo_path)
    results = {}
    for raw in os.popen(command):
        # Re-code from the database charset to UTF-8 (Python 2 str idiom).
        raw = raw.decode('iso-8859-1').encode('utf-8')
        fields = raw.split('|')
        # Windows-style path; assumes acronym/issue/file sit at fixed
        # positions 3/4/6 of the backslash split — TODO confirm on data.
        path_parts = fields[2].strip().split('\\')
        record = results.setdefault(fields[0], {})
        record['label'] = fields[1].strip()
        record['translation_path'] = fields[2].strip()
        record['related_to'] = fields[3].strip()
        record['original_language'] = fields[4].strip()
        record['acronym'] = path_parts[3].strip()
        record['trans_file'] = path_parts[6].strip()
        record['issue'] = path_parts[4].strip()
        record.setdefault('titles', {})[fields[6].strip()] = fields[5].strip()
    return results
def load_original_document(cisis_path, scielo_path, output_dir, pid, original_language):
    """Dump the full text (v704 fields) of article *pid* into
    ``<output_dir>/<pid>/<original_language>.html``.

    The target directory ``<output_dir>/<pid>`` must already exist.
    Fix: the output file handle was previously opened and never closed
    (leak); ``with`` now guarantees it is flushed and closed.
    """
    command = '''{0}/mx {1}/bases/artigo/artigo btell=0 lw=0 ART={2} "pft=v704^*,/" -all now'''.format(cisis_path, scielo_path, pid)
    query = os.popen(command)
    target = '{0}/{1}/{2}.html'.format(output_dir, pid, original_language)
    with open(target, 'w') as html_file:
        html_file.write(query.read())
def main(*args, **xargs):
    """Export press-release HTML files and metadata from a SciELO site.

    Required keyword args: ``scielo_path``, ``cisis_path``,
    ``output_dir``.  For every press release found, creates
    ``<output_dir>/<pid>/`` containing the original-language HTML, a
    ``meta.json`` metadata record, a ``related_to.txt`` pointer and one
    ``<lang>.html`` per translation available on disk.

    Fixes over the original:
    - honors ``output_dir`` everywhere (was hard-coded to ``htmls``
      for every write except ``load_original_document``);
    - closes ``meta.json`` / ``related_to.txt`` via ``with``;
    - narrows the bare ``except:`` clauses;
    - removes the unused ``trans_path`` local.
    """
    scielo_path = xargs['scielo_path']
    cisis_path = xargs['cisis_path']
    output_dir = xargs['output_dir']

    # NOTE(review): these checks warn but do not abort, matching the
    # original behavior — the mx queries below simply return nothing.
    if not os.path.isdir(scielo_path):
        print('Invalid path for SciELO website ({0})'.format(scielo_path))
    if not os.path.isdir(cisis_path):
        print('Invalid path for CISIS tools ({0})'.format(cisis_path))

    # Start from a clean output tree; missing tree is not an error.
    shutil.rmtree(output_dir, ignore_errors=True)

    articles_press_releases = dict_articles_press_releases(cisis_path, scielo_path)

    for pid, meta in list_all_press_releases(cisis_path, scielo_path).items():
        acronym = meta['acronym']
        issue = meta['issue']
        trans_file = meta['trans_file']
        related_issue = meta['related_to']
        original_language = meta['original_language']

        pr_dir = '{0}/{1}'.format(output_dir, pid)
        try:
            os.makedirs(pr_dir)
        except OSError:
            pass  # directory already exists

        load_original_document(cisis_path, scielo_path, output_dir, pid, original_language)

        with open('{0}/meta.json'.format(pr_dir), 'w') as meta_json:
            meta_json.write(json.dumps(meta))

        with open('{0}/related_to.txt'.format(pr_dir), 'a') as related_to_file:
            if pid in articles_press_releases:
                print("Press Release de artigo: {0}".format(pid))
                related_to_file.write('{0}\n'.format(articles_press_releases[pid]))
            else:
                print("Press Release de fasciculo: {0}".format(pid))
                # pid[10:14] is presumably the year fragment of the PID,
                # prefixed to the related issue label — TODO confirm.
                related_to_file.write('{0}{1}\n'.format(pid[10:14], related_issue))

        # Defining languages available for each press-release document:
        # each translation file name starts with its 2-letter language code.
        directory = '{0}/bases/translation/{1}/{2}'.format(scielo_path, acronym, issue)
        try:
            entries = os.listdir(directory)
        except OSError:
            print("Directory not found: {0}".format(directory))
            continue

        languages = set(entry[0:2] for entry in entries)
        for lang in languages:
            ffrom = '{0}/{1}'.format(directory, '_'.join([lang, trans_file]))
            fto = '{0}/{1}.html'.format(pr_dir, lang)
            try:
                shutil.copy2(ffrom, fto)
            except (IOError, OSError):
                pass  # no translation file for this language; skip it
# Command-line interface for the importer.
parser = argparse.ArgumentParser(description="Import HTML Press Releases data")
parser.add_argument('--scielo_path', default='/var/www/scielosp_org', help='SciELO website path')
parser.add_argument('--cisis_path', default='/usr/local/bireme/cisis/5.5.pre02/linux/lindG4', help='CISIS path')
parser.add_argument('--output_dir', default='htmls', help='Output directory for html files')

if __name__ == "__main__":
    # Fix: parse arguments only when run as a script.  The original
    # called parse_args() at import time, so merely importing this
    # module could exit with an argparse error.
    args = parser.parse_args()
    main(scielo_path=args.scielo_path,
         cisis_path=args.cisis_path,
         output_dir=args.output_dir)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment