Skip to content

Instantly share code, notes, and snippets.

@rghose
Created November 2, 2014 09:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rghose/14c690ab1049a5d1cb01 to your computer and use it in GitHub Desktop.
Save rghose/14c690ab1049a5d1cb01 to your computer and use it in GitHub Desktop.
Get wiki image
#!python
import sys
import urllib2
from HTMLParser import HTMLParser
def download_file(url, fName):
    """Download ``url`` into the ``./images`` directory as ``fName``.

    url   -- address to fetch; it is opened with ``urllib2.urlopen``.
    fName -- name of the file to create inside ``./images``. Any ``/`` or
             ``\\`` in it is replaced by ``_`` so the data is always written
             directly inside ``./images`` and never in another directory.

    The ``./images`` directory is created when it does not exist yet. The
    body is copied in 16 KiB chunks so it is never held in memory as a
    whole. Errors from opening the URL or the file propagate to the caller.
    """
    import os  # local import: the file's import block does not provide os

    CHUNK = 16 * 1024
    target_dir = "./images"

    # The name may come from untrusted page text; strip path separators so
    # it cannot point outside target_dir (or into a missing sub-directory).
    safe_name = fName.replace("/", "_").replace("\\", "_")

    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    req = urllib2.urlopen(url)
    try:
        with open(target_dir + "/" + safe_name, 'wb') as fp:
            while True:
                chunk = req.read(CHUNK)
                if not chunk:
                    break
                fp.write(chunk)
    finally:
        # Always release the connection, even when writing fails.
        req.close()
# create a subclass and override the handler methods
class MyHTMLParser(HTMLParser):
    """Download the images found inside a page's biography infobox table.

    ``myFlag`` is 1 while the parser is inside a ``<table>`` that carries an
    attribute whose value is exactly ``"infobox biography vcard"`` and 0
    otherwise. Every ``<img>`` start tag seen while the flag is set is
    passed on to ``download_file``.
    """

    # Attribute value that identifies the table of interest.
    INFOBOX_MARKER = "infobox biography vcard"

    def __init__(self):
        # Explicit base-class call (as in the original): super() does not
        # work with Python 2's old-style HTMLParser class.
        HTMLParser.__init__(self)
        self.myFlag = 0
        # Open <table> tags counted from the infobox table itself, so the
        # end tag of a nested table does not clear myFlag too early.
        self._table_depth = 0

    def handle_endtag(self, tag):
        """Clear ``myFlag`` when the infobox table itself is closed."""
        if tag != "table" or self.myFlag != 1:
            return
        self._table_depth -= 1
        if self._table_depth <= 0:
            self._table_depth = 0
            self.myFlag = 0

    def handle_starttag(self, tag, attrs):
        """Track entry into the infobox table and fetch images inside it.

        tag   -- lower-cased tag name supplied by HTMLParser.
        attrs -- list of ``(name, value)`` pairs for the tag.
        """
        if tag == "table":
            if self.myFlag == 1:
                # Table nested inside the infobox: remember it so its end
                # tag is not mistaken for the end of the infobox.
                self._table_depth += 1
            elif any(value == self.INFOBOX_MARKER for _, value in attrs):
                self.myFlag = 1
                self._table_depth = 1
        elif tag == "img" and self.myFlag == 1:
            self._download_image(dict(attrs))

    def _download_image(self, attributes):
        """Download the image described by one ``<img>`` tag's attributes."""
        src = attributes.get("src")
        if not src:
            # Nothing to fetch; the tag has no (or an empty) src attribute.
            return
        # Only add a scheme when the URL does not already carry one, e.g.
        # for protocol-relative "//host/path" addresses.
        if not src.startswith(("http://", "https://")):
            src = "http:" + src
        # Fall back to the last URL segment when alt is missing or empty,
        # since an empty name cannot be used as a file name.
        name = attributes.get("alt") or src.rsplit("/", 1)[-1]
        download_file(src, name)
def process(wikiurl):
    """Fetch ``wikiurl`` and run ``MyHTMLParser`` over the returned HTML.

    wikiurl -- address of the page to scan; opened with ``urllib2.urlopen``.

    Errors raised while opening or reading the URL propagate to the caller.
    """
    response = urllib2.urlopen(wikiurl)
    try:
        htmldata = response.read()
    finally:
        # Python 2's urllib2 responses are not context managers, so the
        # connection is released explicitly.
        response.close()

    parser = MyHTMLParser()
    parser.feed(htmldata)
# Script entry point: expects the page URL as the first command-line
# argument. Guarded so that importing this module does not start a download.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # sys.exit with a string writes it to stderr and exits with status 1;
        # unlike the interactive ``exit`` helper it is always available.
        sys.exit("Wrong number of args")
    process(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment