# conxuga/goScrap.py

## goScrap.py
import urllib2
import urllib
import json
import sys
import re
import random

from HTMLParser import HTMLParser
from xml.etree import cElementTree as etree

class LinksParser(HTMLParser):
    """Turn HTML fed through ``HTMLParser`` into an ElementTree.

    Each start tag, end tag and text run reported by ``HTMLParser`` is
    forwarded to an ``etree.TreeBuilder``.  Call ``feed()`` with the markup
    and then ``close()`` to obtain the root element of the tree.
    """

    # Elements that never carry content or an end tag in HTML.  The parser
    # reports e.g. ``<img ...>`` with a start event only, so they have to be
    # closed here; otherwise every following sibling would be nested under
    # the still-open element.
    _VOID_ELEMENTS = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ])

    def __init__(self):
        HTMLParser.__init__(self)
        # Receives the parse events and assembles the element tree.
        self.tb = etree.TreeBuilder()

    def handle_starttag(self, tag, attributes):
        """Open element *tag* with *attributes* (a list of name/value pairs)."""
        self.tb.start(tag, dict(attributes))
        if tag in self._VOID_ELEMENTS:
            # Void elements are complete as soon as they are opened.
            self.tb.end(tag)

    def handle_endtag(self, tag):
        """Close element *tag*; void elements were already closed on start."""
        if tag in self._VOID_ELEMENTS:
            # Covers both ``<br/>`` (start + end events) and a stray ``</br>``.
            return
        self.tb.end(tag)

    def handle_data(self, data):
        """Append the text run *data* to the element currently being built."""
        self.tb.data(data)

    def close(self):
        """Finish parsing and return the root element of the built tree."""
        HTMLParser.close(self)
        return self.tb.close()

def getDomain(url):
    """Return the registrable-looking host label of *url*, or ``False``.

    The pattern accepts an optional ``http://`` / ``https://`` prefix, any
    number of leading ``label.`` parts, then captures the label that sits
    directly before the final ``.label`` of the host, e.g. ``'linkedin'``
    for ``'http://www.linkedin.com/in/someone'``.

    Host labels are matched with ``\\w+``, so host names containing hyphens
    are not handled reliably.

    Returns the captured label as a string, or ``False`` when *url* does
    not match the pattern at all.
    """
    patron = r'((https?):\/\/)?(\w+\.)*(?P<domain>\w+)\.(\w+)(\/.*)?'
    m = re.match(patron, url)
    if m:
        domain = m.group('domain')
        return domain
    else:
        return False

# Ask the user for a search string and send it to the Google AJAX web-search
# endpoint; the hits are read from responseData -> results of the JSON reply.
url = "http://ajax.googleapis.com/ajax/services/search/web?v=1.0&"
query = raw_input("Name AND company OR location >> ")
query = urllib.urlencode({'q': query})
response = urllib2.urlopen(url + query).read()
data = json.loads(response)
results = data['responseData']['results']

# Imported once here rather than on every loop iteration.
from urlparse import urlparse

# For every hit that points at a LinkedIn profile page: print a few profile
# fields and save the profile photo into the current directory.
for result in results:
	url = result['url']
	host = urlparse(url)
	if getDomain(url) == "linkedin":
		# str.find() yields -1 (truthy) on a miss and 0 (falsy) on a hit at
		# the start of the path, so it cannot be used as a boolean; test for
		# membership instead.
		if "/in/" in host.path or "/pub/" in host.path:
			title = result['title']
			try:
				usock = urllib2.urlopen(url)
				try:
					data = usock.read()
				finally:
					# Release the connection even when reading fails.
					usock.close()
				print(title + '; ' + url)
				parser = LinksParser()
				parser.feed(data)
				root = parser.close()
				span = root.find(".//p[@class='headline-title title']")
				img = root.find(".//img[@class='photo']")
				spanjob = root.find(".//span[@class='org summary']")

				# strip() removes surrounding newlines and tabs in one go.
				spantitle = span.text.strip()
				imgalt = img.get("alt")
				imgurl = img.get("src")
				download = urllib2.urlopen(imgurl)
				try:
					photo = download.read()
				finally:
					download.close()
				print(imgalt)
				print(spantitle)
				print(spanjob.text)
				# The alt text comes from the fetched page: keep path
				# separators out of the file name so the photo always lands
				# in the current directory.
				safealt = re.sub(r'[\\/]', '_', imgalt)
				filename = str(random.randint(0, 9)) + "_" + safealt + '.jpg'
				with open(filename, 'wb') as localFile:
					localFile.write(photo)
			except Exception as err:
				# Best effort: report the problem and go on with the next hit.
				print(err)
	import urllib2
	import urllib
	import json
	import sys
	import re
	import random

	from HTMLParser import HTMLParser
	from xml.etree import cElementTree as etree

	class LinksParser(HTMLParser):
		"""Turn HTML fed through ``HTMLParser`` into an ElementTree.

		Each start tag, end tag and text run reported by ``HTMLParser`` is
		forwarded to an ``etree.TreeBuilder``.  Call ``feed()`` with the
		markup and then ``close()`` to obtain the root element of the tree.
		"""

		# Elements that never carry content or an end tag in HTML.  The
		# parser reports e.g. ``<img ...>`` with a start event only, so they
		# have to be closed here; otherwise every following sibling would be
		# nested under the still-open element.
		_VOID_ELEMENTS = frozenset([
			'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
			'link', 'meta', 'param', 'source', 'track', 'wbr',
		])

		def __init__(self):
			HTMLParser.__init__(self)
			# Receives the parse events and assembles the element tree.
			self.tb = etree.TreeBuilder()

		def handle_starttag(self, tag, attributes):
			"""Open element *tag* with *attributes* (name/value pairs)."""
			self.tb.start(tag, dict(attributes))
			if tag in self._VOID_ELEMENTS:
				# Void elements are complete as soon as they are opened.
				self.tb.end(tag)

		def handle_endtag(self, tag):
			"""Close element *tag*; void elements were closed on start."""
			if tag in self._VOID_ELEMENTS:
				# Covers both ``<br/>`` (start + end events) and ``</br>``.
				return
			self.tb.end(tag)

		def handle_data(self, data):
			"""Append the text run *data* to the element being built."""
			self.tb.data(data)

		def close(self):
			"""Finish parsing and return the root element of the tree."""
			HTMLParser.close(self)
			return self.tb.close()

	def getDomain(url):
		"""Return the registrable-looking host label of *url*, or ``False``.

		The pattern accepts an optional ``http://`` / ``https://`` prefix,
		any number of leading ``label.`` parts, then captures the label that
		sits directly before the final ``.label`` of the host, e.g.
		``'linkedin'`` for ``'http://www.linkedin.com/in/someone'``.

		Host labels are matched with ``\\w+``, so host names containing
		hyphens are not handled reliably.

		Returns the captured label as a string, or ``False`` when *url* does
		not match the pattern at all.
		"""
		# The ``*`` quantifiers are required: without them a host with no
		# sub-domain (``linkedin.com``) can never match.
		patron = r'((https?):\/\/)?(\w+\.)*(?P<domain>\w+)\.(\w+)(\/.*)?'
		m = re.match(patron, url)
		if m:
			domain = m.group('domain')
			return domain
		else:
			return False

	# Ask the user for a search string and send it to the Google AJAX
	# web-search endpoint; the hits are read from responseData -> results of
	# the JSON reply.
	url = "http://ajax.googleapis.com/ajax/services/search/web?v=1.0&"
	query = raw_input("Name AND company OR location >> ")
	query = urllib.urlencode({'q': query})
	response = urllib2.urlopen(url + query).read()
	data = json.loads(response)
	results = data['responseData']['results']

	# Imported once here rather than on every loop iteration.
	from urlparse import urlparse

	# For every hit that points at a LinkedIn profile page: print a few
	# profile fields and save the profile photo into the current directory.
	for result in results:
		url = result['url']
		host = urlparse(url)
		if getDomain(url) == "linkedin":
			# str.find() yields -1 (truthy) on a miss and 0 (falsy) on a hit
			# at the start of the path, so it cannot be used as a boolean;
			# test for membership instead.
			if "/in/" in host.path or "/pub/" in host.path:
				title = result['title']
				try:
					usock = urllib2.urlopen(url)
					try:
						data = usock.read()
					finally:
						# Release the connection even when reading fails.
						usock.close()
					print(title + '; ' + url)
					parser = LinksParser()
					parser.feed(data)
					root = parser.close()
					span = root.find(".//p[@class='headline-title title']")
					img = root.find(".//img[@class='photo']")
					spanjob = root.find(".//span[@class='org summary']")

					# strip() removes surrounding newlines and tabs in one go.
					spantitle = span.text.strip()
					imgalt = img.get("alt")
					imgurl = img.get("src")
					download = urllib2.urlopen(imgurl)
					try:
						photo = download.read()
					finally:
						download.close()
					print(imgalt)
					print(spantitle)
					print(spanjob.text)
					# The alt text comes from the fetched page: keep path
					# separators out of the file name so the photo always
					# lands in the current directory.
					safealt = re.sub(r'[\\/]', '_', imgalt)
					filename = str(random.randint(0, 9)) + "_" + safealt + '.jpg'
					with open(filename, 'wb') as localFile:
						localFile.write(photo)
				except Exception as err:
					# Best effort: report the problem and go on with the
					# next hit.
					print(err)