Skip to content

Instantly share code, notes, and snippets.

@mutsuda
Last active January 2, 2016 01:29
Show Gist options
  • Save mutsuda/8230446 to your computer and use it in GitHub Desktop.
Save mutsuda/8230446 to your computer and use it in GitHub Desktop.
Search for keywords read from a CSV file and write the resulting links that match certain criteria to another CSV file
import csv
import sys
import mechanize
import cookielib
import re
import time
def createBrowser():
    """Build a mechanize Browser with a cookie jar and custom headers.

    Returns:
        The configured ``mechanize.Browser`` instance.
    """
    # Header value sent as User-agent in place of mechanize's own default.
    user_agent = 'Mozilla/5.0 (X11; U; Linux i686; es-ES; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'

    browser = mechanize.Browser()
    # Give the browser a fresh cookie jar.
    browser.set_cookiejar(cookielib.LWPCookieJar())

    # Handlers that are switched on, in the same order as before.
    for enable in (
        browser.set_handle_equiv,
        browser.set_handle_gzip,
        browser.set_handle_redirect,
        browser.set_handle_referer,
    ):
        enable(True)
    # robots.txt handling is switched off.
    browser.set_handle_robots(False)

    # Refresh handling capped with max_time=1; per the original note this
    # follows an immediate refresh without hanging on longer ones.
    refresh_processor = mechanize._http.HTTPRefreshProcessor()
    browser.set_handle_refresh(refresh_processor, max_time=1)

    browser.addheaders = [('User-agent', user_agent)]
    return browser
def search(br, keyword, ids):
    """Search Google for *keyword* and collect the Facebook links found.

    Args:
        br: Browser-like object exposing ``open``, ``select_form``, ``form``,
            ``submit`` and ``links`` (``createBrowser`` returns one).
        keyword: Query text written into the ``q`` field of the first form.
        ids: Identifier copied unchanged into the first slot of the result.

    Returns:
        A list ``[ids, keyword, url, ...]`` holding one cleaned ``https://``
        URL per matching result link, or just ``[ids, keyword]`` when no
        link matched.

    Raises:
        Any error raised by the browser while opening or submitting the
        page is propagated; nothing is swallowed here.
    """
    # First column is the ids, second column is the keyword.
    result = [ids, keyword]

    # Open google.com, fill the first (index zero) form and submit it.
    br.open('http://google.com')
    br.select_form(nr=0)
    br.form['q'] = keyword
    br.submit()

    # Only look at links whose URL mentions a facebook host.
    for link in br.links(url_regex=r"(?P<url>https?://(.*)\.facebook[^\s]+)"):
        # Pull the embedded https URL out of the link. A link without one
        # is skipped explicitly instead of relying on a bare ``except``.
        match = re.search(r"(?P<url>https://[^\s]+)", link.url)
        if match is not None:
            result.append(match.group("url"))

    return result
def main(argv):
    """Read keywords from one CSV file and write search results to another.

    Every input row is expected to have at least three ``;``-separated
    columns: ``row[0]`` is passed through as the ids, while ``row[1]`` and
    ``row[2]`` are joined with a trailing " facebook" to form the query.
    Each result list returned by ``search`` is printed and written as one
    ``;``-separated output row.

    Args:
        argv: Command-line arguments without the program name; ``argv[0]``
            is the input CSV path and ``argv[1]`` the output CSV path.

    Raises:
        IndexError: If fewer than two paths are supplied in ``argv``.
    """
    # Create a browser
    br = createBrowser()

    # The output file is opened first (as before) and both files are now
    # closed by the ``with`` block, so buffered rows are flushed even when
    # the run is interrupted by an error. The binary modes are kept as the
    # script had them.
    with open(argv[1], "wb") as ofile, open(argv[0], 'rb') as csvfile:
        writer = csv.writer(ofile, delimiter=';')
        reader = csv.reader(csvfile, delimiter=';', quotechar='|')
        for row in reader:
            # Blank or truncated rows cannot form a query; skip them rather
            # than aborting the whole run with an IndexError.
            if len(row) < 3:
                print("Skipping row with fewer than 3 columns: %r" % (row,))
                continue

            # Loop until we get some result (which can be empty)
            while True:
                try:
                    # Search for the keyword with the browser and the ids
                    line = search(br, row[1] + ' ' + row[2] + " facebook", row[0])
                except mechanize.URLError as e:
                    # We might have got an error while trying
                    print(e.reason)
                    print("Error retrieving information from Google")
                    print("Sleeping 30 seconds and trying again")
                    # Give it a rest
                    time.sleep(30)
                else:
                    # If we got here we have the result
                    break

            print(line)
            # Write the resulting row into the file
            writer.writerow(line)
            # Pretend to be human
            time.sleep(1)
# Script entry point: hand everything after the program name to main().
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment