# AntonOsika/downloadURLs.py

## downloadURLs.py

import urllib
import os
import re

##############################
# Downloads the file behind every link found on the page.
# Which URLs are picked can be tuned by editing the regex that fills fileURLs.
##############################


# Script flow: fetch the index page, collect its HREF targets, show them to
# the user for confirmation, then save each target into the current
# working directory under the last path component of its link.

# Resolve the few names that moved between Python 2 and Python 3 so the
# script runs unchanged on either interpreter.
try:  # Python 2
    from urllib import urlopen
    from urlparse import urljoin
    read_answer = raw_input
except ImportError:  # Python 3
    from urllib.request import urlopen
    from urllib.parse import urljoin
    read_answer = input

f = urlopen("http://www.math.kth.se/matstat/gru/sf2943/matlabfunctions.html")
try:
    source = f.read()
finally:
    # Release the connection even if reading fails.
    f.close()
if not isinstance(source, str):
    # Python 3 returns bytes; latin-1 maps every byte to one character, so
    # decoding cannot fail and ASCII link targets come through unchanged.
    source = source.decode('latin-1')

fileURLs = []
# fileURLs += re.findall(r'href=[\'"]?([^\'">]+)', source)  # only CAPITAL HREF is interesting in this case
fileURLs += re.findall(r'HREF=[\'"]?([^\'">]+)', source)

# Base that relative links are resolved against. Set it to '' when the page
# only contains absolute URLs.
URLstart = 'http://www.math.kth.se/matstat/gru/sf2943/'

print('Getting URLs that are concatenation with URLstart: \n' + URLstart + '\n')
for x in fileURLs:
    print(x)

# Give the user a chance to back out: an answer starting with "n" aborts,
# anything else (including just pressing Enter) proceeds.
a = read_answer("\nAre these urls OK?\n")
if a.strip().lower().startswith('n'):
    raise SystemExit('Aborted: no files were downloaded.')

for URL in fileURLs:
    filename = os.path.split(URL)[1]
    if not filename:
        # A link ending in "/" has no file name to save under.
        print('Skipping ' + URL + ' (no file name in link)')
        continue
    # urljoin equals plain concatenation for relative links and leaves
    # absolute links untouched instead of prefixing them with URLstart.
    f = urlopen(urljoin(URLstart, URL))
    try:
        # Binary mode: the payload must be written byte-for-byte.
        with open(filename, 'wb') as g:
            g.write(f.read())
    finally:
        f.close()