Skip to content

Instantly share code, notes, and snippets.

@sdcampbell
Created November 8, 2018 22:32
Show Gist options
  • Save sdcampbell/1234b3c4b4eebe2d3a62f45abb8c7261 to your computer and use it in GitHub Desktop.
Save sdcampbell/1234b3c4b4eebe2d3a62f45abb8c7261 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""search.py - Searches Google for a domain name and downloads PDF and Office files to search for document metadata."""
import sys, os, wget
from pprint import pprint
from googleapiclient.discovery import build
# Credentials for the Google Custom Search API; replace both placeholders
# with real values before running.
API_KEY = "redacted"
CSE_ID = "redacted"
# Custom Search client shared by every query issued from search().
service = build("customsearch", "v1", developerKey=API_KEY)
# URLs collected by search() and later fetched by download().
links = []
# Target domain from the command line. Fall back to None when the argument is
# missing so that main() can print its usage message instead of the module
# failing here with an IndexError.
domain = sys.argv[1] if len(sys.argv) > 1 else None
def search(ext):
    """Collect links to files of one type hosted on the target domain.

    Runs the query ``filetype:<ext> site:<domain>`` through the Custom Search
    client and appends the ``link`` of each returned item to the module-level
    ``links`` list. At most 10 results are requested per call.

    Args:
        ext: File extension to search for, e.g. ``'pdf'``.
    """
    query = "filetype:%s site:%s" % (ext, domain)
    results = service.cse().list(
        q=query,
        num=10,
        cx=CSE_ID,
    ).execute()
    # A query with no hits returns a response without an 'items' key, so
    # default to an empty list rather than raising KeyError.
    links.extend(item['link'] for item in results.get('items', []))
def download():
    """Fetch every URL gathered in ``links`` into the ``files/`` folder."""
    target_dir = "files/"
    for url in links:
        wget.download(url, target_dir)
def print_metadata():
    """Run ``exiftool`` over the ``files/`` folder and print its report.

    Raises:
        OSError: If the ``exiftool`` executable cannot be started.
        subprocess.CalledProcessError: If ``exiftool`` exits with a non-zero
            status.
    """
    # Imported here because the module never imports subprocess at the top;
    # without this the call below fails with NameError.
    import subprocess

    results = subprocess.check_output(['exiftool', 'files/'])
    # On Python 3 check_output returns bytes; decode so the report prints as
    # readable text instead of a b'...' literal. On Python 2 the value is
    # already a str and is printed unchanged.
    if not isinstance(results, str):
        results = results.decode('utf-8', 'replace')
    print(results)
def main():
    """Search the target domain for documents, download them, print metadata.

    Expects exactly one command-line argument (the domain). With any other
    argument count it prints a usage message and exits with status 1.
    Otherwise it searches for each supported file type, downloads the
    collected links, prints the metadata report and exits with status 0.
    """
    if len(sys.argv) != 2:
        # Parenthesised single-argument print works both as a Python 2
        # statement and as a Python 3 function call.
        print('Usage : %s domain.com' % sys.argv[0])
        sys.exit(1)
    exts = ['pdf', 'doc', 'docx', 'xls', 'xlsx']
    # Make a folder to store our downloaded files. Reuse one left over from an
    # earlier run, since os.mkdir raises if it already exists. TODO: Cleanup
    if not os.path.isdir('files'):
        os.mkdir('files')
    for ext in exts:
        search(ext)
    download()
    print_metadata()
    sys.exit(0)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment