# lsfalimis/my-1st-crawler-alt.py

## my-1st-crawler-alt.py
import pprint, re, urllib, urllib2
from bs4 import BeautifulSoup


# Page to scrape and the CSS class of the tags that carry the images.
PAGE_URL = 'http://SOMEWEBSITE'
TARGET_CLASS = "WHATEVER"

# The matched tags are pretty-printed here so they can be inspected by hand.
DUMP_PATH = '/Users/henry/Desktop/output.txt'

# First "http...jpg" run in a tag's markup: the image to download.
_IMAGE_URL = re.compile(r'http.*?jpg')
# First double-quoted value in a tag's markup: used as the local file name.
_QUOTED = re.compile(r'".*?"')


def image_target(markup):
    """Work out what to download for one tag.

    markup -- the tag's markup as a string.

    Returns a ``(url, filename)`` tuple, where ``url`` is the first
    ``http...jpg`` run in ``markup`` and ``filename`` is the first
    double-quoted value with its quotes removed and '.jpg' appended.
    Returns None when either piece is missing, so the caller can skip the
    tag instead of crashing on a failed match.
    """
    url = _IMAGE_URL.search(markup)
    label = _QUOTED.search(markup)
    if url is None or label is None:
        return None
    return url.group(), label.group().strip('"') + '.jpg'


def main():
    """Fetch PAGE_URL and download the image referenced by each matching tag."""
    html = urllib2.urlopen(PAGE_URL).read()
    soup = BeautifulSoup(html)
    stuff = soup(class_=TARGET_CLASS)

    # Keep the inspection dump, but do not insert the list into itself
    # first: that only added a recursion placeholder as a dummy first line.
    with open(DUMP_PATH, 'w') as f:
        pprint.pprint(stuff, f)

    # Walk the tags directly rather than re-reading the dump line by line;
    # repr(tag) is the same text pprint writes for each element.
    for tag in stuff:
        target = image_target(repr(tag))
        if target is None:
            continue
        url, filename = target
        # NOTE(review): filename comes straight from the page markup and is
        # saved relative to the current directory -- confirm that is intended.
        urllib.urlretrieve(url, filename)


if __name__ == '__main__':
    main()