Skip to content

Instantly share code, notes, and snippets.

@AntonOsika
Last active March 17, 2018 13:25
Show Gist options
  • Save AntonOsika/10561110 to your computer and use it in GitHub Desktop.
Save AntonOsika/10561110 to your computer and use it in GitHub Desktop.
Download all HREFs from URL.
import os
import re
import urllib
from contextlib import closing

try:  # Python 3
    from urllib.parse import urljoin
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen
    from urlparse import urljoin
##############################
# Downloads files for every link it finds.
# The URLs can be handpicked with the regex HREF_PATTERN.
##############################

# Page that is scanned for links.
PAGE_URL = "http://www.math.kth.se/matstat/gru/sf2943/matlabfunctions.html"

# Base that relative links are resolved against. Absolute links are used
# unchanged, so this may also be left as '' when every link is absolute.
URLstart = 'http://www.math.kth.se/matstat/gru/sf2943/'

# Only CAPITAL HREF is interesting in this case; pass a pattern such as
# r'href=[\'"]?([^\'">]+)' to find_links() to pick up lower-case ones.
HREF_PATTERN = r'HREF=[\'"]?([^\'">]+)'


def fetch(url):
    """Return the raw body of *url*, always closing the connection."""
    # closing() is used because the Python 2 response object is not a
    # context manager on its own.
    with closing(urlopen(url)) as response:
        return response.read()


def find_links(source, pattern=HREF_PATTERN):
    """Return all link targets in *source* captured by *pattern*, in order."""
    return re.findall(pattern, source)


def local_filename(url):
    """Return the last path component of *url* ('' if it ends with a slash)."""
    return os.path.basename(url)


def download(url, filename):
    """Fetch *url* and store its body as *filename* in the working directory."""
    data = fetch(url)
    # Binary mode: the payload must be written byte-for-byte. Text mode
    # would translate line endings on Windows and reject bytes on Python 3.
    with open(filename, 'wb') as out:
        out.write(data)


def main():
    """List the links found on PAGE_URL, ask for confirmation, download them.

    Any answer starting with 'n' (case-insensitive) aborts; everything
    else, including just pressing Enter, starts the downloads.
    """
    try:
        ask = raw_input  # Python 2
    except NameError:
        ask = input  # Python 3

    source = fetch(PAGE_URL)
    if not isinstance(source, str):
        # Python 3 returns bytes; the regex below works on text.
        source = source.decode('utf-8', 'replace')
    fileURLs = find_links(source)

    print('Getting URLs that are concatenation with URLstart: \n' + URLstart + '\n')
    for x in fileURLs:
        print(x)

    answer = ask("\nAre these urls OK?\n")
    if answer.strip().lower().startswith('n'):
        print('Aborted, nothing downloaded.')
        return

    for URL in fileURLs:
        filename = local_filename(URL)
        if not filename:
            # A link such as "dir/" has no file name to save under.
            print('Skipping ' + URL + ' (no file name)')
            continue
        # urljoin resolves relative links against URLstart and leaves
        # absolute links untouched.
        download(urljoin(URLstart, URL), filename)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment