Skip to content

Instantly share code, notes, and snippets.

@dvcrn
Created September 25, 2012 03:05
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dvcrn/3779752 to your computer and use it in GitHub Desktop.
Bing download url crawler
from pybing import Bing
from urllib2 import HTTPError
from BeautifulSoup import BeautifulSoup
from pybing import constants
import re
import mechanize
import simplejson
# SINA: search Bing for a query and print any result-page links that point
# at a whitelisted file hoster.

# --- configuration -------------------------------------------------------
searchstring = "cows"  # your search string here
searchcount = 40       # number of Bing web results to request
bing = Bing("xxxx")    # Bing API application ID goes here

browser = mechanize.Browser()
# Result pages often disallow robots; ignore robots.txt so mechanize fetches.
browser.set_handle_robots(False)

# Run the web search and unwrap the result list from the response envelope.
query = bing.search(searchstring, source_type=constants.WEB_SOURCE_TYPE,
                    extra_params={'web.count': searchcount})
results = query['SearchResponse']['Web']['Results']

# File hosters whose download links we want to collect.
whitelist = ['mediafire.com', 'rapidshare.com', 'fileserve.com',
             'filesonic.com', 'zippyshare.com']

# Pre-compile one pattern per hoster, hoisted out of the page/anchor loops.
# re.escape keeps the domain dots literal -- the original pattern
# "http://www\.%s/.+" let each '.' in the domain match any character.
patterns = [re.compile(r"http://www\.%s/.+" % re.escape(word))
            for word in whitelist]

print("Welcome to SINA v0.1")
print("SINA will search for '%s'\n" % searchstring)

for r in results:
    page_url = r['Url']
    try:
        browser.open(page_url)
        html = browser.response().read()
        soup = BeautifulSoup(html)
        # Scan every anchor on the page for whitelisted hoster links.
        for link in soup.findAll('a', href=True):
            href = link['href']
            for pattern in patterns:
                match = pattern.search(href)
                if match is not None:
                    print("%s" % match.group())
    except Exception:
        # Best-effort crawl: skip pages that fail to load or parse.
        continue

print("\nFinished :D")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment