Instantly share code, notes, and snippets.

Embed
What would you like to do?
import urllib2
import urllib
import json
import sys
import re
import random
from HTMLParser import HTMLParser
from xml.etree import cElementTree as etree
class LinksParser(HTMLParser):
    """Parse HTML into an ElementTree by forwarding events to a TreeBuilder.

    Feed markup with ``feed()`` and call ``close()`` to obtain the root
    element of the tree that was built.

    HTML void elements (``<img>``, ``<br>``, ...) are closed as soon as they
    are opened, because real-world HTML does not supply an end tag for them
    and the TreeBuilder would otherwise nest every following element inside
    the void element.
    """

    # Elements that never have content or a closing tag in HTML.
    _VOID_TAGS = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ])

    def __init__(self):
        # HTMLParser is called explicitly (not via super()) to match the
        # file's existing style.
        HTMLParser.__init__(self)
        self.tb = etree.TreeBuilder()

    def handle_starttag(self, tag, attributes):
        """Open *tag* with its attributes; void elements are closed at once."""
        self.tb.start(tag, dict(attributes))
        if tag in self._VOID_TAGS:
            self.tb.end(tag)

    def handle_endtag(self, tag):
        """Close *tag*, ignoring end tags of void elements.

        Void elements were already closed in ``handle_starttag``; an explicit
        ``</br>`` or the synthetic end event of ``<br/>`` must not close the
        parent element by mistake.
        """
        if tag in self._VOID_TAGS:
            return
        self.tb.end(tag)

    def handle_data(self, data):
        """Append text content to the element currently being built."""
        self.tb.data(data)

    def close(self):
        """Flush the parser and return the root element of the built tree."""
        HTMLParser.close(self)
        return self.tb.close()
def getDomain(url):
    """Return the second-level domain label of *url*, or False if none is found.

    The label returned is the one directly before the last dot-separated
    label of the host, e.g. ``"linkedin"`` for
    ``http://www.linkedin.com/in/someone``. The scheme is optional.

    Host labels may contain hyphens, so ``www.my-site.com`` yields
    ``"my-site"``.

    Limitation: multi-part suffixes are not recognised, so
    ``linkedin.co.uk`` yields ``"co"``.
    """
    # [\w-]+ rather than \w+: hyphens are legal inside host labels, and
    # without them a hyphenated host is either rejected or mis-split.
    pattern = r'((https?):\/\/)?([\w-]+\.)*(?P<domain>[\w-]+)\.(\w+)(\/.*)?'
    m = re.match(pattern, url)
    if m:
        return m.group('domain')
    return False
# Search Google for a person, then for every LinkedIn profile among the hits:
# print the result title and URL, scrape the headline, employer and photo
# from the profile page, and save the photo to the current directory.

# Imported once here instead of on every loop iteration.
from urlparse import urlparse

# NOTE(review): the Google AJAX Search endpoint appears to have been retired
# upstream -- confirm it still answers before relying on this script.
url = "http://ajax.googleapis.com/ajax/services/search/web?v=1.0&"
query = raw_input("Name AND company OR location >> ")
query = urllib.urlencode({'q': query})
response = urllib2.urlopen(url + query).read()
data = json.loads(response)
# Tolerate a missing or null 'responseData' / 'results' by treating it as
# "no hits" rather than failing with a TypeError.
results = (data.get('responseData') or {}).get('results') or []

for result in results:
    profile_url = result['url']
    if getDomain(profile_url) != "linkedin":
        continue

    # Only profile pages are of interest. A membership test is used because
    # str.find() returns -1 (truthy) on a miss and 0 (falsy) on a hit at the
    # start, so it cannot serve as a boolean.
    path = urlparse(profile_url).path
    if "/in/" not in path and "/pub/" not in path:
        continue

    title = result['title']
    try:
        usock = urllib2.urlopen(profile_url)
        try:
            page = usock.read()
        finally:
            usock.close()
        print(title + '; ' + profile_url)

        parser = LinksParser()
        parser.feed(page)
        root = parser.close()

        span = root.find(".//p[@class='headline-title title']")
        img = root.find(".//img[@class='photo']")
        spanjob = root.find(".//span[@class='org summary']")
        if span is None or img is None or spanjob is None:
            # Say what went wrong instead of surfacing an opaque
            # "'NoneType' object has no attribute ..." message.
            print("Profile markup not recognised, skipping: " + profile_url)
            continue

        # strip() already removes surrounding newlines and tabs.
        spantitle = (span.text or '').strip()
        imgalt = img.get("alt") or ''
        imgurl = img.get("src")
        if not imgurl:
            print("Profile photo has no src, skipping: " + profile_url)
            continue

        download = urllib2.urlopen(imgurl)
        try:
            print(imgalt)
            print(spantitle)
            print(spanjob.text)
            # The alt text comes from a remote page: drop path separators and
            # NULs so it cannot steer the write outside the current directory.
            safe_alt = re.sub(r'[/\\\x00]+', '_', imgalt)
            filename = str(random.randint(0, 9)) + "_" + safe_alt + '.jpg'
            with open(filename, 'wb') as localFile:
                localFile.write(download.read())
        finally:
            download.close()
    except Exception as err:
        # Best effort: report the failure and carry on with the next result.
        print(err)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment