Skip to content

Instantly share code, notes, and snippets.

Last active August 29, 2015 14:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pudo/6c652413605dedb5392a to your computer and use it in GitHub Desktop.
Save pudo/6c652413605dedb5392a to your computer and use it in GitHub Desktop.
Scraper for the Mozambique Gazette, Pt. III
# coding: utf-8
import os
import shutil
import requests
import tempfile
from urllib import urlretrieve
from urlparse import urljoin
from lxml import html
# Credentials, host and project for the document API, all read from the
# environment so that no secrets live in the script.
# NOTE(review): the second element of AUTH was missing from this copy of the
# file (the tuple was left unclosed). `requests` needs a (user, password)
# pair for basic auth; DOCCLOUD_PASS is the presumed variable name -- confirm.
AUTH = (os.environ.get('DOCCLOUD_USER'),
        os.environ.get('DOCCLOUD_PASS'))
# NOTE(review): the default host is an empty string here, so DOCCLOUD_HOST
# must be set for the API URLs built from HOST to be absolute.
HOST = os.environ.get('DOCCLOUD_HOST', '')
PROJECT_ID = os.environ.get('DOCCLOUD_PROJECTID', '230')
def documentcloudify(file_name, data):
    """Upload a downloaded file to the API at HOST unless it already exists.

    The document title is built as "<issue> - <file name without .pdf>".
    The project is first searched for that title; if a match is found its
    canonical URL is returned and nothing is uploaded.

    :param file_name: path of the local file to upload.
    :param data: dict with the keys ``file`` and ``issue`` (used for the
        title) and ``url`` (sent as ``published_url``).
    :returns: the ``canonical_url`` reported by the API, or ``None`` if the
        response does not carry one.
    """
    title = data.get('file').replace('.pdf', '')
    title = '%s - %s' % (data['issue'], title)

    search_url = urljoin(HOST, '/api/search.json')
    params = {'q': 'projectid:"%s" title:"%s"' % (PROJECT_ID, title)}
    # NOTE(review): the tail of this call was missing from this copy of the
    # file; verify=False mirrors the upload request below -- confirm.
    # Certificate verification is disabled for both requests, which exposes
    # the credentials to man-in-the-middle attacks; enable it if the host
    # presents a valid certificate.
    res = requests.get(search_url, params=params, auth=AUTH,
                       verify=False)
    found = res.json()
    # `or 0` keeps a missing/null total from being compared against an int.
    if (found.get('total') or 0) > 0:
        return found.get('documents')[0].get('canonical_url')

    req_data = {
        'title': title,
        'source': u'Boletins da República',
        'published_url': data.get('url'),
        'access': 'public',
        'language': 'por',
        'project': PROJECT_ID,
    }
    upload_url = urljoin(HOST, '/api/upload.json')
    # The context manager closes the file handle even if the upload fails.
    with open(file_name, 'rb') as fh:
        files = {'file': fh}
        res = requests.post(upload_url, files=files,
                            verify=False, auth=AUTH, data=req_data)
    return res.json().get('canonical_url')
def content_links(url):
    """Yield ``(absolute_url, element)`` for links found below *url*.

    The page at *url* is fetched and every ``<a>`` inside the element with
    ``id="content"`` is inspected. Each ``href`` is resolved against *url*;
    a link is yielded only if the resolved URL starts with *url* and is not
    *url* itself.

    :param url: address of the page to fetch.
    :returns: generator of ``(resolved_url, anchor_element)`` tuples.
    """
    res = requests.get(url)
    doc = html.fromstring(res.text)
    for a in doc.findall('.//div[@id="content"]//a'):
        urlref = urljoin(url, a.get('href', ''))
        # NOTE(review): both `if` statements had lost their bodies in this
        # copy of the file; `continue` is the reconstruction that makes the
        # trailing `yield` meaningful -- confirm.
        if urlref == url:
            # link back to the page itself
            continue
        if not urlref.startswith(url):
            # link that points outside the current section
            continue
        yield urlref, a
def get_files(data):
    """Download each PDF linked from an issue page and hand it to the uploader.

    For every link returned by ``content_links(data['issue_url'])`` a copy
    of *data* is extended with ``file`` (the link text) and ``url`` (the
    link target with a trailing ``/view`` removed). Links whose target does
    not end in ``.pdf`` are skipped. The file is fetched into a fresh
    temporary directory, passed to ``documentcloudify`` and the directory
    is removed afterwards.

    :param data: dict carrying at least ``issue_url``; it is copied, never
        modified.
    """
    url = data.get('issue_url')
    for href, a in content_links(url):
        # Single-argument print() behaves the same as the print statement.
        print([href])
        d = data.copy()
        d['file'] = a.text_content()
        if href.endswith('/view'):
            href, _ = href.rsplit('/view', 1)
        # NOTE(review): the body of this check was missing from this copy
        # of the file; skipping non-PDF links is the presumed intent.
        if not href.endswith('.pdf'):
            continue
        d['url'] = href

        # `tmp_dir` rather than `dir`, which shadows the builtin.
        tmp_dir = tempfile.mkdtemp()
        try:
            file_name = os.path.join(tmp_dir, d['file'])
            # Drop non-ASCII characters from the local path only; the
            # original link text stays in d['file'] for the title.
            file_name = file_name.encode('ascii', 'ignore')
            print("FILE %s %s" % (file_name, d['file']))
            urlretrieve(d['url'], file_name)
            documentcloudify(file_name, d)
        finally:
            # The temporary directory was never removed before, so every
            # downloaded PDF stayed on disk.
            shutil.rmtree(tmp_dir, ignore_errors=True)
def get_issues(data):
    """Walk the issue links of one year page and process each issue.

    For every link returned by ``content_links(data['year_url'])`` a copy
    of *data* is extended with ``issue`` (the link text) and ``issue_url``
    (the resolved link target) and passed to ``get_files``.

    :param data: dict carrying at least ``year_url``; it is copied, never
        modified.
    """
    url = data.get('year_url')
    for href, a in content_links(url):
        d = data.copy()
        d['issue'] = a.text_content()
        d['issue_url'] = href
        # NOTE(review): this call was missing from this copy of the file,
        # leaving `d` unused; get_files() is the only function that reads
        # 'issue_url' -- confirm.
        get_files(d)
def get_years(url=''):
    """Walk the year links of the index page and process each year.

    For every link returned by ``content_links(url)`` a dict with ``year``
    (the link text) and ``year_url`` (the resolved link target) is built
    and passed to ``get_issues``.

    :param url: address of the index page listing the years.
        NOTE(review): the hard-coded start URL is an empty string in this
        copy of the file, so the real address has to be passed in (or
        restored as the default) before this can fetch anything.
    """
    for href, a in content_links(url):
        # Single-argument print() behaves the same as the print statement.
        print([a.text_content()])
        data = {
            'year': a.text_content(),
            # Use the URL already resolved by content_links(), as
            # get_issues() does, instead of the raw and possibly relative
            # href attribute.
            'year_url': href,
        }
        # NOTE(review): this call was missing from this copy of the file,
        # leaving `data` unused; get_issues() is the only function that
        # reads 'year_url' -- confirm.
        get_issues(data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment