
@sugavaneshb
Created July 26, 2013 16:26
A simple Python script to scrape questions from careercup.com and save them locally for later viewing. Dependencies: beautifulsoup4, lxml (Python 2 standard library: urllib, urllib2)
from bs4 import BeautifulSoup
from urllib2 import urlopen
from urllib import urlretrieve
import os

base_url = "http://www.careercup.com"
directory = '/tmp/careercup/'

def down_them_all(down_url, directory='/tmp/careercup/', count=1):
    # CareerCup paginates its listings via the 'n' query parameter; fetch pages 1-22.
    links = [down_url + '&n=' + str(i) for i in range(1, 23)]
    print "Starting download of all links..."
    for url in links:
        print "Fetching " + url
        count = down_content(url, count, directory)

def down_content(url, count, directory):
    html = urlopen(url).read()
    soup = BeautifulSoup(html, 'lxml')
    print 'Finding ques_links in ' + url
    # Question links sit inside the <ul id="question_preview"> listing.
    section = soup.find('ul', {'id': 'question_preview'})
    pos_questions = section.findAll('a')
    questions = []
    for q in pos_questions:
        if str(q['href']).startswith('/question?id'):
            questions.append(q['href'])
    questions = list(set(questions))  # drop duplicate links
    for q in questions:
        durl = base_url + q
        name = 'q' + str(count) + '.html'
        count = count + 1
        qpath = os.path.join(directory, name)
        urlretrieve(durl, qpath)
        print "Done with downloading " + durl
    print "Check at " + directory
    return count

if __name__ == '__main__':
    # directory = raw_input('Where to store?')
    if not os.path.isdir(directory):
        os.makedirs(directory)
    options = ['google', 'microsoft', 'amazon']
    down_url = 'http://www.careercup.com/page?pid=' + options[2] + '-interview-questions'
    print "Let the game begin!"
    down_them_all(down_url, directory)
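
Note that urllib2 and the print-statement syntax above are Python 2 only. A minimal Python 3 sketch of the same listing-page download loop, assuming beautifulsoup4 and lxml are installed and that the page still uses the ul#question_preview listing, could look like this:

# Python 3 sketch: fetch one CareerCup listing page and save each question locally.
# Error handling and pagination are omitted for brevity.
import os
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup

base_url = 'http://www.careercup.com'
listing = base_url + '/page?pid=amazon-interview-questions&n=1'
out_dir = '/tmp/careercup/'
os.makedirs(out_dir, exist_ok=True)

soup = BeautifulSoup(urlopen(listing).read(), 'lxml')
links = sorted({a['href'] for a in soup.find('ul', id='question_preview').find_all('a')
                if a['href'].startswith('/question?id')})
for i, href in enumerate(links, start=1):
    urlretrieve(base_url + href, os.path.join(out_dir, 'q%d.html' % i))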