
@sugavaneshb
Created February 8, 2015 10:10
# Python 2 script: scrape CareerCup's Google interview question listing pages
# and download each question page as a local HTML file.
from bs4 import BeautifulSoup
from urllib2 import urlopen
from urllib import urlretrieve
import os

base_url = "http://www.careercup.com"
down_url = 'http://www.careercup.com/page?pid=google-interview-questions'
directory = '/home/local/sugavaneshb/careercup/'


def down_them_all(directory='/home/local/sugavaneshb/careercup/', count=1):
    # The listing is paginated via the 'n' query parameter.
    links = [down_url + '&n=' + str(i) for i in range(1, 23)]
    print "Starting download of all links..."
    for url in links:
        print "Fetching " + url
        count = down_content(url, count)


def down_content(url, count):
    html = urlopen(url).read()
    soup = BeautifulSoup(html)
    print 'Finding ques_links in ' + url
    # Question links live inside the <ul id="question_preview"> element.
    section = soup.find('ul', {'id': 'question_preview'})
    pos_questions = section.findAll('a')
    questions = []
    for q in pos_questions:
        if str(q['href']).startswith('/question?id'):
            questions.append(q['href'])
    # Deduplicate: a question can appear more than once on a listing page.
    questions = list(set(questions))
    for q in questions:
        durl = base_url + q
        name = 'q' + str(count) + '.html'
        count = count + 1
        qpath = os.path.join(directory, name)
        urlretrieve(durl, qpath)
        print "Done with downloading " + durl
        print "Check at " + directory
    return count


if __name__ == '__main__':
    # directory = raw_input('Where to store?')
    print "Let the game begin!"
    down_them_all()