Skip to content

Instantly share code, notes, and snippets.

@nevenjovanovic
Created April 30, 2016 16:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nevenjovanovic/5fd105825cdc3120997c086b72b223e3 to your computer and use it in GitHub Desktop.
Save nevenjovanovic/5fd105825cdc3120997c086b72b223e3 to your computer and use it in GitHub Desktop.
A Python script to download pages from the CAMENA project, parse them and then follow only the links to XML documents
"""getcamena.py: Parse a list of CAMENA htmls, download links ending with .xml."""
__author__ = 'Neven Jovanovic'
__copyright__ = "Neven Jovanovic, Zagreb, Hrvatska"
__credits__ = ["Neven Jovanovic"]
__license__ = "CC-BY"
__version__ = "0.0.2"
__maintainer__ = "Neven Jovanovic"
__email__ = "neven.jovanovic@ffzg.hr"
__status__ = "Prototype"
import urllib2
from bs4 import BeautifulSoup
import urllib
# Only links whose href ends with this suffix are downloaded.
suffix = ".xml"
# Absolute links must start with this site root to be downloaded.
prefix = "http://www.uni-mannheim.de/mateo"
# Relative links must start with this parent-directory marker.
dots = "../"

# A list of files to be scraped:
popis = [
    "http://www.uni-mannheim.de/mateo/cera/autoren/baierfj_cera.html",
    "http://www.uni-mannheim.de/mateo/cera/autoren/baier_cera.html",
    "http://www.uni-mannheim.de/mateo/cera/autoren/bartholin_cera.html",
]


def _xml_download_url(link):
    """Return the absolute URL to download for *link*, or None to skip it.

    A link qualifies only if it ends with ``suffix`` and either starts with
    ``prefix`` (already absolute, returned unchanged) or starts with ``dots``
    (relative, rebuilt against the site root).
    """
    if link is None or not link.endswith(suffix):
        return None
    if link.startswith(prefix):
        return link
    if link.startswith(dots):
        # For relative links starting with ../ : drop the leading marker and
        # append the remainder to the site root.
        # NOTE(review): this is not standard relative-URL resolution against
        # the page's own address; it is kept exactly as the script always
        # did it -- confirm it matches the site's directory layout.
        return prefix + "/" + link[len(dots):]
    return None


for address in popis:
    # Use urllib2 to open page
    response = urllib2.urlopen(urllib2.Request(address))
    try:
        html = response.read()
    finally:
        # urllib2 responses are not context managers in Python 2, so close
        # the connection explicitly even if reading fails.
        response.close()

    # Use Beautiful Soup to parse HTML
    soup = BeautifulSoup(html, 'lxml')
    for tag in soup.find_all('a'):
        link = tag.get('href', None)
        url = _xml_download_url(link)
        if url is None:
            # Anchor without href, or not one of the wanted XML links.
            continue
        # Save under the last path component of the href; the relative
        # filename means the file lands in the current working directory.
        filename = link.split('/')[-1]
        urllib.urlretrieve(url, filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment