Skip to content

Instantly share code, notes, and snippets.

@jayrambhia
Created March 6, 2012 06:54
Show Gist options
  • Save jayrambhia/1984424 to your computer and use it in GitHub Desktop.
Save jayrambhia/1984424 to your computer and use it in GitHub Desktop.
A Python script to download/update xkcd comic strips
import urllib2
import os
from BeautifulSoup import BeautifulSoup
# Root of the xkcd site; the relative hrefs of the next/previous links are
# appended to this to form absolute page URLs.
BASE_URL = "http://xkcd.com"
# Proxy URLs keyed by URL scheme, handed to urllib2.ProxyHandler below.
# NOTE(review): "user", "pass", "proxy" and "port" look like placeholders
# that must be replaced with real values before running -- confirm.
proxy = {"http":"http://user:pass@proxy:port/",
"https":"https://user:pass@proxy:port/"}
# Build an opener that uses the proxy handler and install it as the default
# for urllib2.urlopen(); `opener` is also called directly by save_img.
Proxy = urllib2.ProxyHandler(proxy)
opener = urllib2.build_opener(Proxy)
urllib2.install_opener(opener)
def get_soup(URL):
    """Fetch *URL* and return its contents parsed as a BeautifulSoup tree.

    The page is requested with ``urllib2.urlopen``, read completely, and the
    response is always closed afterwards so that crawling many pages does not
    leave connections open.

    Args:
        URL: address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built from the response body.

    Any error raised by ``urllib2.urlopen`` or by reading the response
    propagates to the caller.
    """
    page = urllib2.urlopen(URL)
    try:
        # Read everything up front so the response can be released before
        # the (comparatively slow) parsing step.
        html = page.read()
    finally:
        page.close()
    return BeautifulSoup(html)
def get_next_URL(soup):
    """Return the absolute URL of the next comic, or None if there is none.

    The "next" link is the anchor carrying ``accesskey="n"``.  Its href is
    appended to ``BASE_URL``.

    Args:
        soup: parsed comic page; must provide a BeautifulSoup-style ``find``.

    Returns:
        The next page's URL as a string, or None when the link is missing
        from the page or its href is "#" (no further comic).
    """
    link = soup.find("a", {"href": True, "accesskey": "n"})
    if link is None:
        # find() returns None when no anchor matches; treat that as
        # "no next page" instead of failing on the subscript below.
        return None
    href = link["href"]
    if href == "#":
        return None
    return BASE_URL + href
def get_previous_URL(soup):
    """Return the absolute URL of the previous comic, or None if there is none.

    The "previous" link is the anchor carrying ``accesskey="p"``.  Its href
    is appended to ``BASE_URL``.

    Args:
        soup: parsed comic page; must provide a BeautifulSoup-style ``find``.

    Returns:
        The previous page's URL as a string, or None when the link is
        missing from the page or its href is "#" (no earlier comic).
    """
    link = soup.find("a", {"href": True, "accesskey": "p"})
    if link is None:
        # find() returns None when no anchor matches; treat that as
        # "no previous page" instead of failing on the subscript below.
        return None
    href = link["href"]
    if href == "#":
        return None
    return BASE_URL + href
def get_image_URL(soup):
    """Return the URL of the comic image found on a parsed page.

    The image is taken to be the first ``<img>`` that has ``src``, ``alt``
    and ``title`` attributes all present.

    Args:
        soup: parsed comic page; must provide a BeautifulSoup-style ``find``.

    Returns:
        The image's ``src`` value.  A scheme-relative value
        ("//host/path") is given an explicit "http:" scheme, because the
        URL opener cannot fetch a URL without one.

    Raises:
        TypeError: if the page has no matching ``<img>`` (``find`` returned
            None).
    """
    img = soup.find("img", {"src": True, "alt": True, "title": True})
    img_URL = img["src"]
    if img_URL.startswith("//"):
        # Network-path reference: inherits the scheme of the page it came
        # from, and the pages are fetched over plain http.
        img_URL = "http:" + img_URL
    return img_URL
def save_img(img_URL, URL):
    """Download the image at *img_URL* and store it under ``down_xkcd``.

    The file is named "<comic>-<image>", where <comic> is the last path
    segment of the comic page *URL* (a trailing slash is tolerated) and
    <image> is the last path segment of *img_URL*.  A "<name> saved" line
    is printed once the file has been written.

    Args:
        img_URL: address of the image to download.
        URL: address of the comic page the image belongs to.

    Errors from opening/reading the image URL or from writing the file
    propagate to the caller.  The ``down_xkcd`` directory must already exist.
    """
    if not URL.endswith("/"):
        URL = URL + "/"
    # With the trailing slash guaranteed, [-2] is the comic's path segment.
    filename = "-".join([URL.split("/")[-2], img_URL.split("/")[-1]])

    page = opener.open(img_URL)
    try:
        # Download fully before touching the disk so a failed transfer does
        # not leave an empty file behind.
        data = page.read()
    finally:
        page.close()

    with open(os.path.join("down_xkcd", filename), "wb") as f:
        f.write(data)
    # Single pre-formatted argument: prints the same text as
    # ``print filename, "saved"`` under Python 2.
    print("%s saved" % filename)
def crawl(URL):
    """Download every comic from *URL* onwards by following "next" links.

    For each page the comic image is located and saved, then the page's
    "next" link becomes the new URL.  The walk stops when there is no next
    link.  This is a loop rather than recursion so that a long run of comics
    cannot exceed the interpreter's recursion limit.

    Args:
        URL: address of the first comic page to fetch, or None to do nothing.

    Returns:
        None.
    """
    while URL is not None:
        soup = get_soup(URL)
        save_img(get_image_URL(soup), URL)
        URL = get_next_URL(soup)
def main():
    """Download xkcd comics into ``down_xkcd``, resuming where possible.

    Creates the ``down_xkcd`` directory if it is missing.  Saved files are
    named "<number>-<image>"; the highest <number> already present is used
    as the starting comic (so that comic is fetched again), otherwise the
    crawl starts at comic 1.  The starting URL is printed before crawling.
    """
    if not os.path.isdir("down_xkcd"):
        os.mkdir("down_xkcd")

    numbers = []
    for filename in os.listdir("down_xkcd"):
        try:
            numbers.append(int(filename.split("-")[0]))
        except ValueError:
            # Not one of our "<number>-<image>" files (e.g. a stray hidden
            # file); ignore it rather than abort the whole run.
            continue

    start = max(numbers) if numbers else 1
    URL = "/".join([BASE_URL, str(start)])
    print(URL)
    crawl(URL)
# Start the download only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()
@jayrambhia
Copy link
Author

There may be some problem with spamming the website. :D

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment