rghose/python_parse.py

## python_parse.py
#!python
import sys
import urllib2
from HTMLParser import HTMLParser

def download_file(url,fName):
  """Download `url` and store it as ./images/<fName>.

  The response body is copied to disk in fixed-size chunks, so the whole
  resource is never held in memory at once.

  Args:
    url: URL of the resource to fetch (passed to urllib2.urlopen).
    fName: file name to create inside the ./images/ directory.

  Raises:
    urllib2.URLError: if the URL cannot be opened.
    IOError / OSError: if the directory or the target file cannot be written.
  """
  import os  # local import: only this function needs it

  target_dir = "./images/"
  CHUNK = 16 * 1024
  req = urllib2.urlopen(url)
  try:
    # open() does not create missing directories, so make sure it exists.
    if not os.path.isdir(target_dir):
      os.makedirs(target_dir)
    with open(target_dir + fName, 'wb') as fp:
      while True:
        chunk = req.read(CHUNK)
        if not chunk:
          break
        fp.write(chunk)
  finally:
    # Release the connection even when writing to disk fails.
    req.close()

# create a subclass and override the handler methods
class MyHTMLParser(HTMLParser):
  """HTML parser that downloads the images found inside a biography infobox.

  myFlag is 1 from the start tag of a <table> that has an attribute whose
  value is exactly "infobox biography vcard" until the next </table> end tag,
  and 0 otherwise. Every <img> start tag seen while the flag is 1 is passed
  to download_file().
  """

  def __init__ (self):
    HTMLParser.__init__(self)
    self.myFlag = 0

  def handle_endtag(self, tag):
    """Clear the infobox flag when a table is closed.

    NOTE: any </table> clears the flag, including one that closes a table
    nested inside the infobox.
    """
    if tag == "table":
      self.myFlag = 0

  def handle_starttag(self, tag, attrs):
    """Track entry into the infobox table and download images inside it.

    Args:
      tag: tag name as reported by HTMLParser.
      attrs: list of (name, value) pairs for the tag's attributes.
    """
    if tag == "table":
      for a in attrs:
        if a[1] == "infobox biography vcard":
          self.myFlag = 1
    elif tag == "img" and self.myFlag == 1:
      url = None
      fileName = None
      for a in attrs:
        if a[0] == "alt":
          fileName = a[1]
        elif a[0] == "src":
          url = a[1]
      if not url:
        # Nothing to fetch for an <img> without a src attribute.
        return
      if not fileName:
        # No usable alt text: fall back to the last path segment of the URL.
        fileName = url.rsplit("/", 1)[-1]
      if not fileName:
        return
      # The scheme is only added when the src does not already carry one
      # (e.g. protocol-relative "//host/path" values).
      if not url.startswith(("http://", "https://")):
        url = "http:" + url
      download_file(url, fileName)


def process(wikiurl):
  """Fetch the page at `wikiurl` and feed its HTML to a MyHTMLParser.

  Args:
    wikiurl: URL of the page to fetch (passed to urllib2.urlopen).

  Raises:
    urllib2.URLError: if the page cannot be opened.
  """
  response = urllib2.urlopen(wikiurl)
  try:
    htmldata = response.read()
  finally:
    # Release the connection before parsing starts, even if read() fails.
    response.close()
  parser = MyHTMLParser()
  parser.feed(htmldata)

# Script entry point: expects the page URL as the first command-line argument.
# Guarded so that importing this module does not start a download or exit.
if __name__ == "__main__":
  if len(sys.argv) < 2:
    sys.stdout.write("Wrong number of args\n")
    # sys.exit is used because the bare exit() helper is injected by the
    # site module and is not guaranteed to exist.
    sys.exit(1)

  myurl = sys.argv[1]
  process(myurl)
	#!python
	import sys
	import urllib2
	from HTMLParser import HTMLParser

	def download_file(url,fName):
		"""Download `url` and store it as ./images/<fName>.

		The response body is copied to disk in fixed-size chunks, so the whole
		resource is never held in memory at once.

		Args:
			url: URL of the resource to fetch (passed to urllib2.urlopen).
			fName: file name to create inside the ./images/ directory.

		Raises:
			urllib2.URLError: if the URL cannot be opened.
			IOError / OSError: if the directory or the target file cannot be written.
		"""
		import os  # local import: only this function needs it

		target_dir = "./images/"
		CHUNK = 16 * 1024
		req = urllib2.urlopen(url)
		try:
			# open() does not create missing directories, so make sure it exists.
			if not os.path.isdir(target_dir):
				os.makedirs(target_dir)
			with open(target_dir + fName, 'wb') as fp:
				while True:
					chunk = req.read(CHUNK)
					if not chunk:
						break
					fp.write(chunk)
		finally:
			# Release the connection even when writing to disk fails.
			req.close()

	# create a subclass and override the handler methods
	class MyHTMLParser(HTMLParser):
		"""HTML parser that downloads the images found inside a biography infobox.

		myFlag is 1 from the start tag of a <table> that has an attribute whose
		value is exactly "infobox biography vcard" until the next </table> end
		tag, and 0 otherwise. Every <img> start tag seen while the flag is 1 is
		passed to download_file().
		"""

		def __init__ (self):
			HTMLParser.__init__(self)
			self.myFlag = 0

		def handle_endtag(self, tag):
			"""Clear the infobox flag when a table is closed.

			NOTE: any </table> clears the flag, including one that closes a
			table nested inside the infobox.
			"""
			if tag == "table":
				self.myFlag = 0

		def handle_starttag(self, tag, attrs):
			"""Track entry into the infobox table and download images inside it.

			Args:
				tag: tag name as reported by HTMLParser.
				attrs: list of (name, value) pairs for the tag's attributes.
			"""
			if tag == "table":
				for a in attrs:
					if a[1] == "infobox biography vcard":
						self.myFlag = 1
			elif tag == "img" and self.myFlag == 1:
				url = None
				fileName = None
				for a in attrs:
					if a[0] == "alt":
						fileName = a[1]
					elif a[0] == "src":
						url = a[1]
				if not url:
					# Nothing to fetch for an <img> without a src attribute.
					return
				if not fileName:
					# No usable alt text: fall back to the last URL path segment.
					fileName = url.rsplit("/", 1)[-1]
				if not fileName:
					return
				# The scheme is only added when the src does not already carry
				# one (e.g. protocol-relative "//host/path" values).
				if not url.startswith(("http://", "https://")):
					url = "http:" + url
				download_file(url, fileName)


	def process(wikiurl):
		"""Fetch the page at `wikiurl` and feed its HTML to a MyHTMLParser.

		Args:
			wikiurl: URL of the page to fetch (passed to urllib2.urlopen).

		Raises:
			urllib2.URLError: if the page cannot be opened.
		"""
		response = urllib2.urlopen(wikiurl)
		try:
			htmldata = response.read()
		finally:
			# Release the connection before parsing starts, even if read() fails.
			response.close()
		parser = MyHTMLParser()
		parser.feed(htmldata)

	# Script entry point: expects the page URL as the first command-line argument.
	# Guarded so that importing this module does not start a download or exit.
	if __name__ == "__main__":
		if len(sys.argv) < 2:
			sys.stdout.write("Wrong number of args\n")
			# sys.exit is used because the bare exit() helper is injected by
			# the site module and is not guaranteed to exist.
			sys.exit(1)

		myurl = sys.argv[1]
		process(myurl)