Created
November 8, 2018 22:32
-
-
Save sdcampbell/1234b3c4b4eebe2d3a62f45abb8c7261 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""search.py - Searches Google for a domain name and downloads PDF and Office files to search for document metadata.""" | |
import os
import subprocess
import sys
from pprint import pprint

import wget
from googleapiclient.discovery import build
# Google Custom Search credentials; replace the placeholders before running.
API_KEY = "redacted"
CSE_ID = "redacted"

# Client for the Custom Search JSON API, shared by search().
service = build("customsearch", "v1", developerKey=API_KEY)

# URLs collected by search() and consumed by download().
links = []

# Target domain from the command line. Guarded so that a missing argument
# reaches the usage message in main() instead of raising IndexError here,
# at import time, before main() ever runs.
domain = sys.argv[1] if len(sys.argv) > 1 else None
def search(ext):
    """Query Google Custom Search for files of type *ext* on the target domain.

    Builds a ``filetype:<ext> site:<domain>`` query, requests up to 10
    results, and appends each result's URL to the module-level ``links`` list.

    Args:
        ext: File extension to search for, e.g. ``'pdf'``.
    """
    query = "filetype:%s site:%s" % (ext, domain)
    results = service.cse().list(
        q=query,
        num=10,
        cx=CSE_ID,
    ).execute()
    # The response carries no 'items' key when the query has no hits, so
    # default to an empty list rather than raising KeyError.
    links.extend(result['link'] for result in results.get('items', []))
def download():
    """Download every URL in the module-level ``links`` list into ``files/``.

    A link that cannot be fetched is reported on stderr and skipped, so one
    stale search result does not abort the run before metadata is printed.
    """
    for link in links:
        try:
            wget.download(link, "files/")
        except (IOError, OSError) as err:
            # urllib's network/HTTP errors derive from IOError/OSError on
            # both Python 2 and 3; anything else still propagates.
            sys.stderr.write("Skipping %s: %s\n" % (link, err))
def print_metadata():
    """Run ``exiftool`` over the ``files/`` directory and print its output.

    Requires the ``exiftool`` executable on PATH.

    Raises:
        subprocess.CalledProcessError: if exiftool exits with a non-zero status.
    """
    # universal_newlines=True makes check_output return text rather than
    # bytes, so Python 3 prints readable lines instead of a b'...' repr.
    results = subprocess.check_output(
        ['exiftool', 'files/'],
        universal_newlines=True,
    )
    print(results)
def main():
    """Search the target domain for documents, download them, print metadata.

    Expects exactly one command-line argument (the domain). Prints a usage
    message and exits with status 1 otherwise; exits with status 0 once all
    steps have completed.
    """
    if len(sys.argv) != 2:
        # Call form of print is valid on both Python 2 and 3; the bare
        # print statement is a SyntaxError on Python 3.
        print('Usage : %s domain.com' % sys.argv[0])
        sys.exit(1)

    exts = ['pdf', 'doc', 'docx', 'xls', 'xlsx']

    # Folder for the downloaded files. os.mkdir raises if the path already
    # exists, so reuse a folder left over from a previous run.
    # ToDo: Cleanup
    if not os.path.isdir('files'):
        os.mkdir('files')

    for ext in exts:
        search(ext)
    download()
    print_metadata()
    sys.exit(0)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment