Skip to content

Instantly share code, notes, and snippets.

@niwaringo
Last active August 29, 2015 14:19
Show Gist options
  • Save niwaringo/5d9a8af0c7c63aff6e5b to your computer and use it in GitHub Desktop.
Save niwaringo/5d9a8af0c7c63aff6e5b to your computer and use it in GitHub Desktop.
import json
import gdata.webmastertools.service
import datetime
class Downloader(object):
    """Download data files from Google Webmaster Tools for one target day.

    The client logs in through ``GWebmasterToolsService.ClientLogin``, fetches
    the JSON list of downloadable tables for a site, appends a date filter to
    each selected download URL and saves every response to the current
    working directory under the file name announced by the server.
    """

    HOST = 'www.google.com'
    APP_NAME = 'Google-WMTdownloadscript-0.1'
    # Format arguments: (language code, site URL).
    LIST_PATH = '/webmasters/tools/downloads-list?hl=%s&siteUrl=%s'
    FILE_NAME_HEADER = 'content-disposition'
    TEXT_BEFORE_NAME = 'attachment; filename='
    # NOTE: evaluated once, when the class body runs (i.e. at import time),
    # so a long-lived process keeps using the same day.
    TARGET_DAY = datetime.datetime.now() + datetime.timedelta(days=-3)
    TARGET_DAY_STR = TARGET_DAY.strftime('%Y%m%d')

    def __init__(self):
        """Create a logged-out client with language 'en' and no downloads."""
        self._client = gdata.webmastertools.service.GWebmasterToolsService()
        self._logged_in = False
        self._language = 'en'
        self._downloaded = []

    def IsLoggedIn(self):
        """Return True once LogIn() has completed without raising."""
        return self._logged_in

    def LogIn(self, email, password, captcha_answer=None):
        """Authenticate the underlying service client.

        Args:
            email: Account e-mail passed to ``ClientLogin``.
            password: Account password passed to ``ClientLogin``.
            captcha_answer: Optional captcha response. It is only sent when
                the client already holds a captcha token.
        """
        if self._client.captcha_token and captcha_answer:
            self._client.ClientLogin(email, password, source=self.APP_NAME,
                                     captcha_token=self._client.captcha_token,
                                     captcha_response=captcha_answer)
        else:
            self._client.ClientLogin(email, password, source=self.APP_NAME)
        # Only reached when ClientLogin did not raise.
        self._logged_in = True

    def DoDownload(self, site, tables_to_download):
        """Download every requested table that the site offers.

        Args:
            site: Site URL substituted into LIST_PATH.
            tables_to_download: Iterable of keys looked up in the JSON
                download list; keys that are missing or map to an empty
                value are skipped.

        Raises:
            ValueError: If LogIn() has not succeeded yet.
        """
        if not self.IsLoggedIn():
            raise ValueError('Client not logged in.')
        available = self._GetDownloadList(site)
        sites_json = json.loads(available)
        for key in tables_to_download:
            url = sites_json.get(key)
            if url:
                url = self._getDateFilterdUrl(url)
                self._DownloadFile(url)

    def GetDownloadedFiles(self):
        """Return the list of file names written so far."""
        return self._downloaded

    def GetCaptchaUrl(self):
        """Return the captcha URL currently held by the service client."""
        return self._client.captcha_url

    def SetLanguage(self, language_code):
        """Set the language code sent as the ``hl`` query parameter."""
        self._language = language_code

    def _getDateFilterdUrl(self, url):
        """Return `url` with the target day appended as ``de`` and ``db``.

        NOTE(review): ``de``/``db`` presumably mean end/begin date; both are
        set to TARGET_DAY_STR, so the request covers a single day -- confirm.
        """
        dayparam = "&de=%s&db=%s" % (self.TARGET_DAY_STR, self.TARGET_DAY_STR)
        return url + dayparam

    def _GetDownloadList(self, site):
        """Fetch and return the raw body of the download list for `site`."""
        url = self._GetFullUrl(self.LIST_PATH % (self._language, site))
        res_stream = self._client.request('GET', url)
        try:
            return res_stream.read()
        finally:
            # Close the response even when read() raises.
            res_stream.close()

    def _GetFullUrl(self, path):
        """Return the absolute https URL for `path` on HOST."""
        return 'https://' + self.HOST + path

    def _ExtractFileName(self, header_value):
        """Return the file name carried by a content-disposition value.

        TEXT_BEFORE_NAME is removed as a prefix. ``str.lstrip`` must not be
        used for this: it strips any leading characters contained in its
        argument, which also eats the start of names such as 'example.csv'.

        Raises:
            ValueError: If the header is missing or empty.
        """
        if not header_value:
            raise ValueError(
                'Response has no %s header.' % self.FILE_NAME_HEADER)
        if header_value.startswith(self.TEXT_BEFORE_NAME):
            return header_value[len(self.TEXT_BEFORE_NAME):]
        return header_value

    def _DownloadFile(self, path):
        """Download `path` and save it under the server-supplied file name.

        The file is written to the current working directory and its name is
        recorded in the downloaded-files list once the write has succeeded.

        Raises:
            ValueError: If the response carries no file name header.
        """
        url = self._GetFullUrl(path)
        in_stream = self._client.request('GET', url)
        try:
            file_name = self._ExtractFileName(
                in_stream.getheader(self.FILE_NAME_HEADER))
            content = in_stream.read()
        finally:
            # Close the response even when header parsing or read() raises.
            in_stream.close()
        # Binary mode keeps the response bytes untouched (no newline
        # translation) and accepts the bytes object read() yields.
        with open(file_name, 'wb') as out_file:
            out_file.write(content)
        self._downloaded.append(file_name)
#!/usr/bin/python
from downloader import Downloader
# Placeholder settings -- replace with real values before running.
email = 'email'
password = 'password'
website = 'yourwebsite'
# Keys looked up in the site's download list.
selected_downloads = ['TOP_QUERIES', 'TOP_PAGES']


def main():
    """Log in with the module-level settings and download the selected tables.

    Returns:
        The Downloader instance used for the run.
    """
    downloader = Downloader()
    downloader.LogIn(email, password)
    downloader.DoDownload(website, selected_downloads)
    return downloader


# Guard the entry point so importing this module performs no network I/O.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment