Skip to content

Instantly share code, notes, and snippets.

@conxuga
Created November 11, 2013 18:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save conxuga/7418009 to your computer and use it in GitHub Desktop.
Save conxuga/7418009 to your computer and use it in GitHub Desktop.
import urllib2
import urllib
import json
import sys
import re
import random
from HTMLParser import HTMLParser
from xml.etree import cElementTree as etree
class LinksParser(HTMLParser):
    """HTML parser that assembles an ElementTree from the markup it is fed.

    Usage: call ``feed(markup)`` one or more times, then ``close()`` to get
    the root element, which supports ``find()`` / ``findall()`` queries.
    """

    # Void elements have no closing tag in HTML.  When written without a
    # trailing slash (``<img src=...>``) the parser only reports a start tag,
    # so they must be closed here or every following sibling would be nested
    # inside them and the tree would end up unbalanced.
    VOID_ELEMENTS = frozenset((
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    ))

    def __init__(self):
        HTMLParser.__init__(self)
        # Incremental builder that receives start/data/end events.
        self.tb = etree.TreeBuilder()

    def handle_starttag(self, tag, attributes):
        """Open *tag* with its attributes; void elements are closed at once."""
        self.tb.start(tag, dict(attributes))
        if tag in self.VOID_ELEMENTS:
            self.tb.end(tag)

    def handle_endtag(self, tag):
        """Close *tag*, ignoring end tags for void elements.

        Void elements are already closed in ``handle_starttag``; a
        self-closing ``<br/>`` (or a stray ``</br>``) would otherwise pop the
        parent element off the builder's stack.
        """
        if tag in self.VOID_ELEMENTS:
            return
        self.tb.end(tag)

    def handle_data(self, data):
        """Append character data to the element currently being built."""
        self.tb.data(data)

    def close(self):
        """Flush the parser and return the root element of the built tree."""
        HTMLParser.close(self)
        return self.tb.close()
# Optional http/https scheme, any number of leading host labels, the label
# captured as "domain", one final label, and an optional path.  Host labels
# accept "-" as well as word characters so that hosts such as "my-site.com"
# are recognised.
_DOMAIN_PATTERN = re.compile(
    r'((https?):\/\/)?([\w-]+\.)*(?P<domain>[\w-]+)\.(\w+)(\/.*)?'
)


def getDomain(url):
    """Return the host label that precedes the final label of *url*.

    For ``http://www.linkedin.com/in/x`` this is ``"linkedin"``.  The label
    is chosen purely by position, so multi-part suffixes are not understood
    (``linkedin.co.uk`` yields ``"co"``).

    Returns ``False`` when *url* does not start with something that looks
    like a dotted host name.
    """
    found = _DOMAIN_PATTERN.match(url)
    if found:
        return found.group('domain')
    return False
# NOTE: this script is written for Python 2 (urllib2, urlparse, raw_input).
from urlparse import urlparse

SEARCH_ENDPOINT = "http://ajax.googleapis.com/ajax/services/search/web?v=1.0&"


def _is_profile_path(path):
    """Return True when *path* contains a "/in/" or "/pub/" segment."""
    # str.find() must not be used as a boolean: it returns -1 (truthy) on a
    # miss and 0 (falsy) on a hit at the start, which made the original
    # check pass for every path.
    return "/in/" in path or "/pub/" in path


def _fetch(address):
    """Download *address* and return its body, always closing the handle."""
    handle = urllib2.urlopen(address)
    try:
        return handle.read()
    finally:
        handle.close()


def _scrape_profile(title, profile_url):
    """Print the headline, photo caption and employer found on a profile
    page and save the profile photo in the current directory.

    Raises ValueError when the page lacks any of the expected elements;
    network and file errors propagate from urllib2 / open().
    """
    page = _fetch(profile_url)
    print(title + '; ' + profile_url)

    parser = LinksParser()
    parser.feed(page)
    root = parser.close()
    if root is None:
        raise ValueError("no markup found at " + profile_url)

    headline = root.find(".//p[@class='headline-title title']")
    photo = root.find(".//img[@class='photo']")
    employer = root.find(".//span[@class='org summary']")
    if headline is None or photo is None or employer is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError("profile markup not recognised at " + profile_url)

    photo_url = photo.get("src")
    if not photo_url:
        raise ValueError("profile photo has no src at " + profile_url)

    # strip() already removes surrounding newlines and tabs.
    headline_text = (headline.text or "").strip()
    caption = photo.get("alt") or ""
    image_bytes = _fetch(photo_url)

    print(caption)
    print(headline_text)
    print(employer.text)

    # The caption comes from a remote page: replace path separators so it
    # cannot direct the write outside the current directory.
    safe_caption = re.sub(r'[\\/]', '_', caption)
    file_name = str(random.randint(0, 9)) + "_" + safe_caption + '.jpg'
    with open(file_name, 'wb') as local_file:
        local_file.write(image_bytes)


def main():
    """Prompt for a search query, run it against the web search endpoint and
    scrape every result whose URL is a linkedin "/in/" or "/pub/" page."""
    terms = raw_input("Name AND company OR location >> ")
    params = urllib.urlencode({'q': terms})
    payload = json.loads(_fetch(SEARCH_ENDPOINT + params))

    for result in payload['responseData']['results']:
        link = result['url']
        if getDomain(link) != "linkedin":
            continue
        if not _is_profile_path(urlparse(link).path):
            continue
        title = result['title']
        try:
            _scrape_profile(title, link)
        except Exception as err:
            # Best effort: report the failure and carry on with the
            # remaining results.
            print(err)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment