Skip to content

Instantly share code, notes, and snippets.

@fabiobatalha
Last active December 16, 2015 04:09
Show Gist options
  • Save fabiobatalha/5374995 to your computer and use it in GitHub Desktop.
Save fabiobatalha/5374995 to your computer and use it in GitHub Desktop.
Script para carregar o texto completo de press releases em arquivos HTML
#!/usr/bin/env python
# coding: utf-8
import os
import shutil
import argparse
import json
def dict_articles_press_releases(cisis_path, scielo_path):
    """Map press release PIDs to the article PID(s) they are attached to.

    Runs the CISIS ``mx`` tool against ``<scielo_path>/bases/artigo/artigo``
    with the ``AHPR=$`` search expression and parses its output, which is
    expected to hold one ``v880|v881|v241^i`` record per line.

    Args:
        cisis_path: Directory that contains the ``mx`` executable.
        scielo_path: Root directory of the SciELO site (parent of ``bases/``).

    Returns:
        dict: ``{pr_pid: [article_pid]}``, or
        ``{pr_pid: [article_pid, article_ahead_pid]}`` when the second field
        of the line is not empty. When the same press release PID appears on
        several lines, the last line wins.
    """
    command = '''{0}/mx {1}/bases/artigo/artigo btell=0 lw=0 AHPR=$ pft="v880,'|',v881,'|',v241^i/" -all now'''.format(cisis_path, scielo_path)
    article_prs = {}
    # Close the pipe deterministically instead of leaving it to the
    # garbage collector.
    with os.popen(command) as query:
        for line in query:
            fields = line.split('|')
            if len(fields) < 3:
                # Blank or truncated line: nothing to index, and reading
                # fields[1] / fields[2] would raise IndexError.
                continue
            article_pid = fields[0].strip()
            article_ahead_pid = fields[1].strip()
            pr_pid = fields[2].strip()
            article_prs[pr_pid] = [article_pid]
            if article_ahead_pid:
                article_prs[pr_pid].append(article_ahead_pid)
    return article_prs
def list_all_press_releases(cisis_path, scielo_path):
    """Collect the metadata of every press release found in the article base.

    Runs the CISIS ``mx`` tool against ``<scielo_path>/bases/artigo/artigo``
    with the ``PHR=$`` search expression. Each output line is expected to
    carry seven ``|``-separated fields:
    ``v880[1]|v4[1]|v702[1]|v882[1]|v40[1]|v12^*|v12^l``.
    Several lines may share the same first field; their title/language pairs
    are merged into a single ``titles`` mapping.

    Args:
        cisis_path: Directory that contains the ``mx`` executable.
        scielo_path: Root directory of the SciELO site (parent of ``bases/``).

    Returns:
        dict: keyed by the first field (the press release PID), each value
        being a dict with the keys ``label``, ``translation_path``,
        ``related_to``, ``original_language``, ``acronym``, ``issue``,
        ``trans_file`` and ``titles`` (``{language: title}``).
        ``acronym``, ``issue`` and ``trans_file`` are the 4th, 5th and 7th
        backslash-separated components of ``translation_path``.
    """
    # NOTE(review): the pft expression ends with an unbalanced single quote
    # (``/)'``). It is kept byte-for-byte because this is what the script has
    # always sent to mx -- confirm against the CISIS format language.
    command = '''{0}/mx {1}/bases/artigo/artigo btell=0 lw=0 PHR=$ pft="(v880[1],'|',v4[1],'|',v702[1],'|',v882[1],'|',v40[1],'|',v12^*,'|',v12^l,/)'" -all now'''.format(cisis_path, scielo_path)
    article_prs = {}
    # Close the pipe deterministically instead of leaving it to the
    # garbage collector.
    with os.popen(command) as query:
        for line in query:
            if isinstance(line, bytes):
                # Byte strings (what os.popen yields on Python 2) are
                # transcoded from ISO-8859-1 to UTF-8. Text strings are
                # already decoded and have no .decode() method.
                line = line.decode('iso-8859-1').encode('utf-8')
            fields = [field.strip() for field in line.split('|')]
            if len(fields) < 7:
                # Blank or truncated line: skip it rather than raise
                # IndexError.
                continue
            path_parts = fields[2].split('\\')
            if len(path_parts) < 7:
                # The path is too short to hold acronym/issue/file name.
                continue
            # The key is stripped like every other field so it matches the
            # stripped PIDs produced by dict_articles_press_releases().
            prs = article_prs.setdefault(fields[0], {})
            prs['label'] = fields[1]
            prs['translation_path'] = fields[2]
            prs['related_to'] = fields[3]
            prs['original_language'] = fields[4]
            prs['acronym'] = path_parts[3].strip()
            prs['trans_file'] = path_parts[6].strip()
            prs['issue'] = path_parts[4].strip()
            titles = prs.setdefault('titles', {})
            titles[fields[6]] = fields[5]
    return article_prs
def load_original_document(cisis_path, scielo_path, output_dir, pid, original_language):
    """Dump the original-language text of a press release to an HTML file.

    Runs the CISIS ``mx`` tool with the ``ART=<pid>`` search expression and
    the ``v704^*`` format, then writes everything it prints to
    ``<output_dir>/<pid>/<original_language>.html``.

    Args:
        cisis_path: Directory that contains the ``mx`` executable.
        scielo_path: Root directory of the SciELO site (parent of ``bases/``).
        output_dir: Base directory of the generated files.
        pid: Press release PID, used in the query and as directory name.
        original_language: Language code used as the output file name.

    Returns:
        None.
    """
    command = '''{0}/mx {1}/bases/artigo/artigo btell=0 lw=0 ART={2} "pft=v704^*,/" -all now'''.format(cisis_path, scielo_path, pid)
    target_dir = '{0}/{1}'.format(output_dir, pid)
    # Do not depend on the caller having created the directory already.
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    # Both the pipe and the output file are closed (and so flushed) as soon
    # as they are no longer needed.
    with os.popen(command) as query:
        content = query.read()
    with open('{0}/{1}.html'.format(target_dir, original_language), 'w') as f:
        f.write(content)
def _copy_translations(scielo_path, acronym, issue, trans_file, target_dir):
    """Copy the available translations of one press release to target_dir."""
    directory = '{0}/bases/translation/{1}/{2}'.format(scielo_path, acronym, issue)
    try:
        entries = os.listdir(directory)
    except OSError:
        print("Directory not found: {0}".format(directory))
        return
    # The first two characters of every file name in the directory are taken
    # as a candidate language code.
    languages = set(entry[0:2] for entry in entries)
    for lang in languages:
        ffrom = '{0}/{1}'.format(directory, '_'.join([lang, trans_file]))
        fto = '{0}/{1}.html'.format(target_dir, lang)
        try:
            shutil.copy2(ffrom, fto)
        except (IOError, OSError, shutil.Error):
            # Best effort: the candidate codes come from every file in the
            # directory, so this press release may have no file for some.
            continue


def main(*args, **xargs):
    """Export every press release as HTML files plus metadata.

    For each press release returned by ``list_all_press_releases`` a
    directory ``<output_dir>/<pid>`` is filled with:

    * ``<original_language>.html`` -- the original text;
    * ``meta.json`` -- the press release metadata as JSON;
    * ``related_to.txt`` -- the related article PIDs, or for an issue-level
      press release ``pid[10:14]`` followed by the ``related_to`` value;
    * ``<lang>.html`` -- one file per translation that could be copied.

    Any existing ``output_dir`` is removed first.

    Keyword Args:
        scielo_path: Root directory of the SciELO site.
        cisis_path: Directory that contains the CISIS ``mx`` executable.
        output_dir: Directory that receives the generated files.

    Returns:
        None. Nothing is deleted or generated when ``scielo_path`` or
        ``cisis_path`` is not an existing directory.
    """
    scielo_path = xargs['scielo_path']
    cisis_path = xargs['cisis_path']
    output_dir = xargs['output_dir']

    # Report every invalid path, then stop before output_dir is wiped:
    # running with a broken configuration would only destroy earlier output.
    paths_ok = True
    if not os.path.isdir(scielo_path):
        print('Invalid path for SciELO website ({0})'.format(scielo_path))
        paths_ok = False
    if not os.path.isdir(cisis_path):
        print('Invalid path for CISIS tools ({0})'.format(cisis_path))
        paths_ok = False
    if not paths_ok:
        return

    try:
        shutil.rmtree(output_dir)
    except OSError:
        # Typically the directory does not exist yet; nothing to clean up.
        pass

    articles_press_releases = dict_articles_press_releases(cisis_path, scielo_path)

    for pid, meta in list_all_press_releases(cisis_path, scielo_path).items():
        # Everything goes under output_dir (previously hard-coded 'htmls').
        pr_dir = '{0}/{1}'.format(output_dir, pid)
        if not os.path.isdir(pr_dir):
            os.makedirs(pr_dir)

        load_original_document(cisis_path, scielo_path, output_dir, pid, meta['original_language'])

        with open('{0}/meta.json'.format(pr_dir), 'w') as meta_json:
            meta_json.write(json.dumps(meta))

        with open('{0}/related_to.txt'.format(pr_dir), 'a') as related_to_file:
            if pid in articles_press_releases:
                print("Press Release de artigo: {0}".format(pid))
                related_to_file.write('{0}\n'.format(articles_press_releases[pid]))
            else:
                print("Press Release de fasciculo: {0}".format(pid))
                related_to_file.write('{0}{1}\n'.format(pid[10:14], meta['related_to']))

        # defining languages available for each pr doc
        _copy_translations(scielo_path, meta['acronym'], meta['issue'], meta['trans_file'], pr_dir)
# Command-line interface. The parser is built at module level, but the
# command line is parsed only when the file is executed as a script, so that
# importing the module never consumes (or rejects) the importer's sys.argv.
parser = argparse.ArgumentParser(description="Import HTML Press Releases data")
parser.add_argument('--scielo_path', default='/var/www/scielosp_org', help='SciELO website path')
parser.add_argument('--cisis_path', default='/usr/local/bireme/cisis/5.5.pre02/linux/lindG4', help='CISIS path')
parser.add_argument('--output_dir', default='htmls', help='Output directory for html files')

if __name__ == "__main__":
    args = parser.parse_args()
    main(scielo_path=args.scielo_path,
         cisis_path=args.cisis_path,
         output_dir=args.output_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment