Skip to content

Instantly share code, notes, and snippets.

@yszheda
Created February 8, 2014 06:59
Show Gist options
  • Save yszheda/8877771 to your computer and use it in GitHub Desktop.
Save yszheda/8877771 to your computer and use it in GitHub Desktop.
a naive Python crawler for the jxsj magazine.
import os
import re
import subprocess
import urllib.parse
import urllib.request
def getHtml(url):
    """Fetch *url* and return the response body as text.

    Parameters:
        url: any URL scheme urllib supports (http, file, data, ...).

    Returns:
        The decoded page content as a str, so callers can run string
        regexes over it (getImg does exactly that).
    """
    # Context manager guarantees the connection is closed; the original
    # Python 2 version leaked the response object.
    with urllib.request.urlopen(url) as page:
        raw = page.read()
    # assumes the page decodes as UTF-8; errors="replace" keeps the
    # crawler best-effort instead of crashing on odd bytes -- TODO confirm
    # the actual encoding of the NLC pages
    return raw.decode("utf-8", errors="replace")
def _extract_img_paths(html):
    """Return the src paths of all ``.jpg`` <img> tags found in *html*."""
    return re.findall(r'src="(.+?\.jpg)" /', html)


def getImg(html):
    """Download every .jpg referenced by *html* into ``./jxsj-mahler/``.

    Files are saved as 0.jpg, 1.jpg, ... in page order so that
    makePDF() can stitch them together in reading sequence.

    Parameters:
        html: the index page markup, as returned by getHtml().
    """
    # Relative img paths on the page are resolved against the index
    # page's own URL.
    base_url = urllib.parse.urlunparse(
        ("http", "www.nlc.gov.cn",
         "dsb_zt/xzzt/ddyyj/dbzxs/jxsj/index_jxsj.htm", "", "", ""))
    # Create the target directory once, up front -- the original shelled
    # out to `mkdir` on every loop iteration.
    os.makedirs("jxsj-mahler", exist_ok=True)
    for index, img_path in enumerate(_extract_img_paths(html)):
        full_url = urllib.parse.urljoin(base_url, img_path)
        print("Retrieving %s:" % full_url)
        urllib.request.urlretrieve(full_url, "jxsj-mahler/%s.jpg" % index)
def makePDF():
    """Stitch the downloaded page images into a single PDF.

    Runs ImageMagick's ``convert`` over the numbered .jpg files in
    ``jxsj-mahler/`` in numeric order (0.jpg, 1.jpg, ...), writing
    ``jxsj-mahler/magzine.pdf``.  Requires ImageMagick on PATH; like the
    original shell pipeline, failures are best-effort and do not raise.
    """
    folder = "jxsj-mahler"
    if not os.path.isdir(folder):
        return  # nothing was downloaded; the shell version also just no-opped
    # Only the files getImg() wrote (digit-stem .jpg); this also keeps a
    # stale magzine.pdf from a previous run out of the input list, which
    # the original `ls` pipeline would have fed back into convert.
    pages = sorted(
        (name for name in os.listdir(folder)
         if name.endswith(".jpg") and os.path.splitext(name)[0].isdigit()),
        key=lambda name: int(os.path.splitext(name)[0]))
    if not pages:
        return  # avoid invoking convert with no inputs
    try:
        # Argument-list invocation instead of a fragile shell pipeline;
        # output filename kept identical to the original ("magzine.pdf").
        subprocess.run(["convert", *pages, "magzine.pdf"],
                       cwd=folder, check=False)
    except FileNotFoundError:
        # convert not installed -- mirror os.system's silent failure mode
        print("makePDF: ImageMagick 'convert' not found; skipping PDF build")
def main():
    """Crawl the magazine index page, download its images, build the PDF."""
    index_url = "http://www.nlc.gov.cn/dsb_zt/xzzt/ddyyj/dbzxs/jxsj/index_jxsj.htm"
    html = getHtml(index_url)
    getImg(html)
    makePDF()


# Guard so importing this module no longer triggers network I/O as an
# import-time side effect; running it as a script behaves as before.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment