Skip to content

Instantly share code, notes, and snippets.

@kylemanna
Created September 6, 2015 17:19
Show Gist options
  • Save kylemanna/b48d3dce447927377531 to your computer and use it in GitHub Desktop.
Save kylemanna/b48d3dce447927377531 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
'''
Angellist Web Search Scraper
Author: Kyle Manna
The goal of this tool is to scrape the Angellist search results because the
Search API [1] only returns 20 results with no pagination at the time of
this writing (2015.09.06).
It's super hacky but works. Pass in query arguments on the command line. The
script will return a massive json array with all the results (de-paginated).
It's recommended to test the search query strings ahead of time using a web
browser @ https://angel.co/search?q=query1
Currently it's hardcoded to only search for people matching the queries.
Usage: ./al-search.py query1 query2
[1] https://angel.co/api/spec/search
'''
import argparse
import json
import urllib.parse
import urllib.request

from pyquery import PyQuery as pq
def search(query):
    '''
    Scrape every page of Angellist people-search results for one query.

    Pages are requested from https://angel.co/search starting at page 1,
    asking for 40 results per page, until a page contains no `.result`
    elements.

    Args:
        query: Search string, sent as the `q` request parameter.

    Returns:
        A list of dicts, one per result, in the order they were returned.
        Each dict has the keys `href`, `name`, `slug`, `type` and `pic`,
        plus `bio` when the result contains an `.excerpt` element.

    Raises:
        Nothing is caught here: errors from opening the URL, decoding the
        JSON body, or a missing 'html' key propagate to the caller.
    '''
    result = []
    url = 'https://angel.co/search'
    values = {
        'page': 0,
        'per_page': 40,
        'skip_loading': 'true',
        'include_ids': '',
        'q': query,
        'type': 'people',
    }
    headers = {'Accept': '*/*'}

    while True:
        # Pages are 1-based: bump the counter before building each request.
        values['page'] += 1
        full_url = url + '?' + urllib.parse.urlencode(values)
        req = urllib.request.Request(full_url, headers=headers, method='GET')
        with urllib.request.urlopen(req) as response:
            # The response body is JSON; the result markup is in its 'html' member.
            html_doc = json.loads(response.read().decode('utf-8'))['html']

        # No markup at all means there is nothing to parse, so stop paging.
        if not html_doc:
            break

        entries = pq(html_doc)('.result')
        if not entries:
            # A page without results marks the end of the listing.
            break

        result.extend(_parse_result(pq(entry)) for entry in entries)

    return result


def _parse_result(entry):
    '''Convert one `.result` element (wrapped in PyQuery) into a plain dict.'''
    link = entry('.title')('a')
    href = link.attr['href']
    obj = {
        'href': href,
        'name': link.text(),
        # The slug is the last path segment of the link. A result without a
        # link has href None, which would otherwise crash on .rsplit().
        'slug': href.rsplit('/', 1)[-1] if href is not None else None,
        'type': entry('.type').text().strip(),
        'pic': entry('img').attr['src'],
    }

    bio = entry('.excerpt')
    if bio:
        # Drops the first character and the last two of the excerpt text;
        # presumably decoration around the bio -- TODO confirm against live markup.
        obj['bio'] = bio.text().strip()[1:-2]

    return obj
if __name__ == '__main__':
    # Command line: one or more query strings, each searched independently.
    parser = argparse.ArgumentParser()
    parser.add_argument('query', nargs='+')
    cli = parser.parse_args()

    # Flatten the per-query result lists, in command-line order, into a
    # single list and print it as one JSON array on stdout.
    combined = [hit for term in cli.query for hit in search(term)]
    print(json.dumps(combined))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment