Skip to content

Instantly share code, notes, and snippets.

@benichmt1
Last active February 14, 2019 16:25
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save benichmt1/fbf57ea372ec691f087c91330596990f to your computer and use it in GitHub Desktop.
Save benichmt1/fbf57ea372ec691f087c91330596990f to your computer and use it in GitHub Desktop.
Google Scraper to replace FOCA
# Requirements: Python 2.7 with the selenium and wget Python packages installed.
import time
import sys
import wget
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
def googlescrape(str):
    """Load a Google results page in Firefox and download each result's link.

    Args:
        str: Full search URL to open. The name shadows the builtin ``str``;
            it is kept unchanged so existing callers are not affected.

    Returns:
        None. Every link found is printed and then passed to
        ``wget.download``.
    """
    # Use the argument itself rather than relying on a module-level ``url``.
    search_url = str
    browser = webdriver.Firefox()
    try:
        browser.get(search_url)
        time.sleep(3)  # pause for 3 seconds before reading the results
        # find_elements(By...) is available on both old and current Selenium.
        results = browser.find_elements(By.CSS_SELECTOR, 'div.g')
        if not results:
            print("No results found")
            return
        for result in results:
            # find_elements returns an empty list instead of raising when a
            # result block has no anchor, so such blocks are simply skipped.
            anchors = result.find_elements(By.TAG_NAME, "a")
            if not anchors:
                continue
            href = anchors[0].get_attribute("href")
            if not href:
                continue
            print(href)
            try:
                wget.download(href)
            except IOError as err:
                # One unreachable file should not abort the remaining
                # downloads (IOError is an alias of OSError on Python 3).
                print("Download failed for %s: %s" % (href, err))
    finally:
        # Always close the browser, even if Selenium or a download raised.
        browser.quit()
def build_search_url(domain, ftype):
    """Return the Google search URL for files of type *ftype* on *domain*.

    Args:
        domain: Value placed after the ``site:`` operator.
        ftype: Value placed after the ``filetype:`` operator.

    Returns:
        The complete search URL as a string.
    """
    # NOTE(review): ``hl=em`` looks like a typo for ``hl=en`` -- confirm
    # before changing; the query string is kept exactly as it was.
    return (
        "https://www.google.com/search?num=100&start=0&hl=em&meta=&q=site:"
        + domain
        + "+filetype:"
        + ftype
        + "&filter=0"
    )


# (label used in the progress message, file type searched) pairs that are
# tried in order when only a domain is given on the command line.
DEFAULT_FILETYPES = (("pdfs", "pdf"), ("docs", "doc"), ("xls", "xls"))

# Usage: python search.py domain.com [filetype]
if __name__ == "__main__":
    if len(sys.argv) == 3:
        domain = sys.argv[1]
        ftype = sys.argv[2]
        # ``url`` stays a module-level name, as in the original script.
        url = build_search_url(domain, ftype)
        googlescrape(url)
    elif len(sys.argv) == 2:
        domain = sys.argv[1]
        for label, ftype in DEFAULT_FILETYPES:
            print("Checking for " + label + "...")
            url = build_search_url(domain, ftype)
            googlescrape(url)
    else:
        print("Error: Improper number of arguments. Usage: python search.py domain.com pdf")
        # Exit with a non-zero status so callers can detect the usage error.
        sys.exit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment