@obsesh
Created February 4, 2012 15:32
2012 updated SXSW scraper
import mechanize
import time
import simplejson
import os
import sys
import pickle
from multiprocessing import Pool
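
# Hypothetical helper, not part of the original gist: open a single profile
# page so you can bracket the valid uid range by hand (see the start_uid /
# end_uid notes under __main__ below).
def uid_exists(br, base_url, uid):
    """Return True if base_url + '/users/<uid>' opens without raising."""
    try:
        br.open(base_url + '/users/' + str(uid))
        return True
    except Exception:
        return False
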
def crawl_sxsw(cookies, user_id):
    # data_path and url_root are set under __main__ and, on Unix, are
    # inherited by the pool workers when they fork
    already_crawled = os.path.exists(data_path + str(user_id) + ".json")
    be_nice = True
    if not already_crawled:
        skip = False
        url = url_root + '/users/' + str(user_id)
        print "\tmaking request for user " + str(user_id)
        # each worker builds its own Browser, seeded with the pickled
        # session cookies from the login in __main__
        br = mechanize.Browser()
        br.set_handle_redirect(False)
        br._ua_handlers['_cookies'].cookiejar = pickle.loads(cookies)
        try:
            r = br.open(url)
        except Exception:
            print "\t404 for " + str(user_id)
            if be_nice:
                time.sleep(1)
            skip = True
        if not skip:
            s = r.read()
            print "\tparsing user: " + str(user_id)
            # scrape each field by string-splitting the profile HTML;
            # every field falls back to '' (or []) if its markup is absent
            try:
                name = s.split("<p class='name'>")[1].split("</p>")[0]
            except Exception:
                name = ''
            try:
                company = s.split("<p class='company'>")[1]\
                           .split("<p class='location'>")[0]\
                           .replace('</p>', '')
                # the company block looks like <a href="URL">NAME</a>
                company_url, company = ' '.join(company.split()).split('">')
                company = company.replace('</a>', '')
                company_url = company_url.replace('<a href="', '')
            except Exception:
                company = ''
                company_url = ''
            try:
                hometown = s.split("<p class='location'>")[1].split("</p>")[0]
            except Exception:
                hometown = ''
            try:
                # the photo src is site-relative, so prepend the host
                photo = s.split('id="badge_photo"')[1]\
                         .split('" />')[0]\
                         .replace(' src="', 'http://sxsocial.sxsw.com')
            except Exception:
                photo = ''
            try:
                bio = s.split("<div class='bio'>")[1]\
                       .split("<p>")[1]\
                       .split("</p>")[0]
            except Exception:
                bio = ''
            try:
                links = s.split("<ul class='social'>")[1].split("</ul>")[0]
                links_str = ' '.join(links.split()).split('<li>')
                links = []
                for l in links_str:
                    try:
                        links.append(l.split('<a href="')[1].split('"')[0])
                    except Exception:
                        pass
            except Exception:
                links = []
            try:
                # 'registation' (sic) matches the class name this scraper targets
                registrant_type = s.split("<p class='registation'>")[1]\
                                   .split("</p>")[0]
                registrant_type = registrant_type.strip()
            except Exception:
                registrant_type = ''
            sxsw_user = {
                'name': name,
                'user_id': user_id,
                'user_url': url,
                'registrant_type': registrant_type,
                'company': company,
                'company_url': company_url,
                'hometown': hometown,
                'photo': photo,
                'bio': bio,
                'links': links,
            }
            print "\twriting dump for " + name
            f = open(data_path + str(user_id) + ".json", "w")
            simplejson.dump(sxsw_user, f)
            f.close()
            # record the last uid written so a rerun can resume from it
            f = open(data_path + "placeholder", "w")
            f.write(str(user_id))
            f.close()
            if be_nice:
                time.sleep(4)
    else:
        print "\talready crawled " + str(user_id)
    return "success"
if __name__ == '__main__':
    url_root = 'http://sxsocial.sxsw.com'
    data_path = './data/'
    be_nice = True
    username = os.getenv("USERNAME")
    password = os.getenv("PASSWORD")
    if not username or not password:
        print "please pass USERNAME and PASSWORD in as env variables"
        sys.exit()
    # have to guess and fudge these
    # try finding the lowest uid here:
    #   http://sxsocial.sxsw.com/users/290 (play around)
    # then the highest:
    #   http://sxsocial.sxsw.com/users/28000 (play around)
    # (the uid_exists() sketch above can help with the probing)
    start_uid = 11999  # 500
    end_uid = 12001  # 28000
    # resume from the last uid recorded in the placeholder file, if any
    try:
        placeholder = int(open(data_path + "placeholder", 'r').read())
    except Exception:
        placeholder = 0
    print "resuming from user id " + str(placeholder)
    if placeholder > start_uid:
        start_uid = placeholder
    # log in once in the parent process
    print "Logging in"
    br = mechanize.Browser()
    br.open(url_root + '/user_session/new')
    br.select_form(nr=0)
    br["user_session[username]"] = username
    br["user_session[password]"] = password
    r = br.submit()
    #assert username in r.get_data()
    # pickle the session cookies so they can be handed to the pool workers
    cookies = pickle.dumps(br._ua_handlers['_cookies'].cookiejar)
print "crawling range " + str(start_uid) + " to " + str(end_uid)
pool = Pool(processes=10)
for user_id in range(start_uid, end_uid):
result = pool.apply_async(crawl_sxsw, [cookies,user_id])
print result.get()
#pool.join()
#for user_id in range(start_uid, end_uid):
# crawl_sxsw(cookies, user_id)
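
# Usage sketch (assumes a writable ./data directory and valid
# sxsocial.sxsw.com credentials; the filename sxsw_scraper.py is illustrative):
#   mkdir -p data
#   USERNAME=you PASSWORD=secret python sxsw_scraper.py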