@ben-gy · Created September 21, 2017 09:40
Crunchbase scraper

A Python 2 script that walks the Crunchbase company index (a-z plus "other"), collects every company slug with requests and BeautifulSoup, and then downloads each company's record from the Crunchbase v1 API as JSON using a pool of 10 worker processes.
# NOTE: this is a Python 2 script (string.lowercase, print statement).
import string
import requests
from bs4 import BeautifulSoup
import json
import glob
import os
from multiprocessing import Pool
import sys

API_KEY = None  # your api key here
CRUNCHBASE_COMPANY_API = "http://api.crunchbase.com/v/1/company/%s.js?api_key=%s"

company_urls = []
# iterate through all crunchbase companies by going a-z and "other"
for letter in list(string.lowercase) + ["other"]:
    r = requests.get("http://www.crunchbase.com/companies?c=%s" % (letter))
    soup = BeautifulSoup(r.text)
    # get all the companies that are listed by checking for "/company/"
    # at the beginning of the url, but only save the name of the company,
    # i.e. the substring after "/company/" (9 characters long);
    # print an error message and skip if the company name is too long
    for link in soup.select("li > a"):
        if "/company/" == link['href'][:9]:
            company_name = link['href'][9:]
            # max filename len is 255, ".json" = 5
            if len(company_name) < 255 - 5:
                company_urls.append(company_name)
            else:
                print "%s too long, skipping." % (company_name)
# write this list of companies to file
open("list_of_companies.txt", 'w').write("\n".join(company_urls))

def download(company_url):
    r = requests.get(CRUNCHBASE_COMPANY_API % (company_url, API_KEY))
    open("crunchbase_companies/%s.json" % (company_url), 'w').write(r.text.encode("utf-8"))
# get all the already scraped companies so we don't rescrape them;
# if the force flag is used, rescrape all companies
if "--force" in sys.argv or "-f" in sys.argv:
    scraped_companies = []
else:
    # strip the directory prefix and the ".json" extension so the
    # entries match the bare company names in company_urls
    scraped_companies = [os.path.basename(x)[:-5] for x in glob.glob("crunchbase_companies/*.json")]

p = Pool(10)
r = p.map_async(download, set(company_urls) - set(scraped_companies))
r.get()
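
Once the JSON files are on disk, a small post-processing pass can check what was actually scraped. The sketch below is not part of the original gist; it assumes only that each file in crunchbase_companies/ holds the raw body returned by the v1 API (valid JSON on success), and it skips files that fail to parse, such as error or rate-limit responses.

import glob
import json
import os

parsed, failed = {}, []
for path in glob.glob("crunchbase_companies/*.json"):
    name = os.path.basename(path)[:-5]  # bare company name, as in company_urls
    try:
        with open(path) as f:
            parsed[name] = json.load(f)
    except ValueError:
        # not valid JSON, likely an API error body; candidate for re-scraping
        failed.append(name)

print("parsed %d companies, %d failed" % (len(parsed), len(failed)))

Anything collected in failed can be deleted from crunchbase_companies/ and fetched again by re-running the scraper, or the whole directory can be re-fetched with --force.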