Skip to content

Instantly share code, notes, and snippets.

Created April 30, 2016 16:32
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save nevenjovanovic/5fd105825cdc3120997c086b72b223e3 to your computer and use it in GitHub Desktop.
A Python script to download pages from the CAMENA project, parse them, and then follow only the links to XML documents.
""" Parse a list of CAMENA htmls, download links ending with .xml."""
__author__ = 'Neven Jovanovic'
__copyright__ = "Neven Jovanovic, Zagreb, Hrvatska"
__credits__ = ["Neven Jovanovic"]
__license__ = "CC-BY"
__version__ = "0.0.2"
__maintainer__ = "Neven Jovanovic"
__email__ = ""
__status__ = "Prototype"
import urllib2
from bs4 import BeautifulSoup
import urllib
suffix = ".xml";
prefix = "";
dots = "../"
# A list of files to be scraped:
popis = ["",
for address in popis:
# Use urllib2 to open page
req = urllib2.Request(address)
response = urllib2.urlopen(req)
html =
# Use Beautiful Soup to parse HTML
soup = BeautifulSoup(html,'lxml')
links = soup.find_all('a')
for tag in links:
link = tag.get('href',None)
if link is not None:
# Select only links ending in .xml
if link.endswith(suffix) and link.startswith(prefix):
urllib.urlretrieve(link, filename)
# For relative links staring with ../
elif link.endswith(suffix) and link.startswith(dots):
link2="" + link[3:]
urllib.urlretrieve(link2, filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment