Skip to content

Instantly share code, notes, and snippets.

@antonioherraizs
Created August 30, 2015 14:14
Show Gist options
  • Select an option

  • Save antonioherraizs/34658c3768479fe73963 to your computer and use it in GitHub Desktop.

Select an option

Save antonioherraizs/34658c3768479fe73963 to your computer and use it in GitHub Desktop.
Script to generate a wordlist from a Google search
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Build a wordlist from the result descriptions of a Google search.
Only the first page of results is scraped.
Requires the requests and BeautifulSoup (bs4) modules.
Antonio Herraiz - 2015
"""
import requests
import argparse
import sys
from bs4 import BeautifulSoup
def parse_args(argv=None):
    """Parse the command-line options of the wordlist generator.

    Args:
        argv: Optional list of argument strings to parse. When omitted
            (the default) argparse reads ``sys.argv[1:]``, which is what
            the script entry point relies on.

    Returns:
        argparse.Namespace with ``query`` (str) and ``num`` (str, or None
        when ``-n`` is not given).

    Raises:
        SystemExit: raised by argparse when the query is missing or an
            unknown option is supplied.
    """
    parser = argparse.ArgumentParser()
    # A search without a query cannot produce a wordlist, so fail early
    # with a usage message instead of sending a pointless request.
    parser.add_argument("-q", "--query", required=True,
                        help="Query to search in Google")
    parser.add_argument("-n", "--num",
                        help="Number of Google results to query")
    return parser.parse_args(argv)
def filter_do_cleanup(element):
    """Tell whether *element* is a word worth keeping in the wordlist.

    A word is kept when it is pure ASCII and its length is between
    3 and 19 characters inclusive.

    Args:
        element: Candidate word (text or bytes). ``None`` and values that
            are not strings are rejected.

    Returns:
        True if the word should be kept, False otherwise.
    """
    if element is None:
        return False
    try:
        # Text has to be *encoded* and bytes *decoded* to prove they are
        # ASCII; calling decode() on a Python 3 str raises AttributeError,
        # which previously caused every single word to be rejected.
        if isinstance(element, bytes):
            element.decode('ascii')
        else:
            element.encode('ascii')
    except (UnicodeError, AttributeError):
        # Non-ASCII content, or not a string-like object at all.
        return False
    return 2 < len(element) < 20
def make_request(query, num, timeout=10):
    """Fetch the Google search results page and return its HTML.

    Args:
        query: Search terms, sent as the ``q`` parameter.
        num: Number of results to ask for, sent as the ``num`` parameter.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The body of the response as text.

    Raises:
        requests.exceptions.RequestException: on connection problems or
            when the timeout expires.
    """
    # The fixed options live in the URL; requests appends the variable
    # ones from the params dictionary.
    url = "https://www.google.com/search?"
    url += "hl=en&complete=0&safe=off&filter=0&btnG=Search"
    # Use the function's own arguments rather than the module-level
    # ``args`` so the function also works when imported or called directly.
    query_string = {'q': query, 'num': num}
    response = requests.get(url, params=query_string, timeout=timeout)
    return response.text
def parse_result(html_page):
    """Extract the unique candidate words from a Google results page.

    The text of every ``<span class="st">`` element (the result
    descriptions) is split on whitespace, surrounding punctuation is
    stripped, and the words accepted by ``filter_do_cleanup`` are kept.

    Args:
        html_page: HTML of the results page as a string.

    Returns:
        Sorted list of the distinct words found; empty when the page
        contains no matching elements.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html_page, "html.parser")
    unique_words = set()
    for snippet in soup.find_all('span', attrs={'class': 'st'}):
        # Split the description into words and trim common punctuation.
        tokens = [token.strip('.,:;()<>') for token in snippet.text.split()]
        unique_words.update(
            token for token in tokens if filter_do_cleanup(token)
        )
    # The set removes duplicates; sorting makes the output reproducible.
    return sorted(unique_words)
def display_words(words):
    """Write the words to standard output, one per line.

    Every word, including the last one, is terminated by a newline so the
    output is a well-formed text stream that can be redirected to a file
    or piped to line-oriented tools. Nothing is written for an empty list.

    Args:
        words: List of strings to print.
    """
    if not words:
        return
    sys.stdout.write('\n'.join(words) + '\n')
if __name__ == "__main__":
    # Script entry point: read the options, fetch the results page,
    # extract the words and print them.
    args = parse_args()
    html = make_request(query=args.query, num=args.num)
    words = parse_result(html)
    display_words(words)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment