
@sugavaneshb
Created July 26, 2013 16:26
A simple Python script to scrape questions from careercup.com and save them locally for later viewing. Dependencies: beautifulsoup4, lxml (Python 2 standard library: urllib, urllib2)
from bs4 import BeautifulSoup
from urllib2 import urlopen
from urllib import urlretrieve
import os

base_url = "http://www.careercup.com"
directory = '/tmp/careercup/'

def down_them_all(down_url, directory='/tmp/careercup/', count=1):
    # CareerCup paginates its listings via the 'n' query parameter; fetch pages 1-22.
    links = [down_url + '&n=' + str(i) for i in range(1, 23)]
    print "Starting download of all links..."
    for url in links:
        print "Fetching " + url
        count = down_content(url, count, directory)

def down_content(url, count, directory):
    html = urlopen(url).read()
    soup = BeautifulSoup(html, 'lxml')
    print 'Finding ques_links in ' + url
    # Question links sit inside the <ul id="question_preview"> listing.
    section = soup.find('ul', {'id': 'question_preview'})
    pos_questions = section.findAll('a')
    questions = []
    for q in pos_questions:
        if str(q['href']).startswith('/question?id'):
            questions.append(q['href'])
    questions = list(set(questions))  # drop duplicate links
    for q in questions:
        durl = base_url + q
        name = 'q' + str(count) + '.html'
        count = count + 1
        qpath = os.path.join(directory, name)
        urlretrieve(durl, qpath)
        print "Done with downloading " + durl
    print "Check at " + directory
    return count

if __name__ == '__main__':
    # directory = raw_input('Where to store?')
    if not os.path.isdir(directory):
        os.makedirs(directory)
    options = ['google', 'microsoft', 'amazon']
    down_url = 'http://www.careercup.com/page?pid=' + options[2] + '-interview-questions'
    print "Let the game begin!"
    down_them_all(down_url, directory)
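
Note that urllib2 and the print-statement syntax above are Python 2 only. A minimal Python 3 sketch of the same listing-page download loop, assuming beautifulsoup4 and lxml are installed and that the page still uses the ul#question_preview listing, could look like this:

# Python 3 sketch: fetch one CareerCup listing page and save each question locally.
# Error handling and pagination are omitted for brevity.
import os
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup

base_url = 'http://www.careercup.com'
listing = base_url + '/page?pid=amazon-interview-questions&n=1'
out_dir = '/tmp/careercup/'
os.makedirs(out_dir, exist_ok=True)

soup = BeautifulSoup(urlopen(listing).read(), 'lxml')
links = sorted({a['href'] for a in soup.find('ul', id='question_preview').find_all('a')
                if a['href'].startswith('/question?id')})
for i, href in enumerate(links, start=1):
    urlretrieve(base_url + href, os.path.join(out_dir, 'q%d.html' % i))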