Last active
August 29, 2015 14:19
-
-
Save niwaringo/5d9a8af0c7c63aff6e5b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import gdata.webmastertools.service | |
import datetime | |
class Downloader(object):
    """Download Webmaster Tools data tables for a site, filtered to one day.

    Typical use: create an instance, call LogIn(), then DoDownload() with the
    site URL and the keys of the tables wanted.  Every file that was written
    is reported by GetDownloadedFiles().
    """

    HOST = 'www.google.com'
    APP_NAME = 'Google-WMTdownloadscript-0.1'
    LIST_PATH = '/webmasters/tools/downloads-list?hl=%s&siteUrl=%s'
    FILE_NAME_HEADER = 'content-disposition'
    TEXT_BEFORE_NAME = 'attachment; filename='
    # NOTE: evaluated once, when the class body executes (i.e. at import
    # time), so a long-running process keeps using the same target day.
    TARGET_DAY = datetime.datetime.now() + datetime.timedelta(days=-3)
    TARGET_DAY_STR = TARGET_DAY.strftime('%Y%m%d')

    def __init__(self):
        """Create a client that is not logged in yet; language is 'en'."""
        self._client = gdata.webmastertools.service.GWebmasterToolsService()
        self._logged_in = False
        self._language = 'en'
        self._downloaded = []

    def IsLoggedIn(self):
        """Return True once LogIn() has completed without raising."""
        return self._logged_in

    def LogIn(self, email, password, captcha_answer=None):
        """Authenticate the underlying client via ClientLogin.

        Args:
            email: Account e-mail address.
            password: Account password.
            captcha_answer: Optional answer to a pending captcha challenge.
                It is only sent when the client already holds a captcha
                token.

        Any exception raised by ClientLogin propagates, in which case the
        instance stays marked as not logged in.
        """
        if self._client.captcha_token and captcha_answer:
            self._client.ClientLogin(
                email, password, source=self.APP_NAME,
                captcha_token=self._client.captcha_token,
                captcha_response=captcha_answer)
        else:
            self._client.ClientLogin(email, password, source=self.APP_NAME)
        self._logged_in = True

    def DoDownload(self, site, tables_to_download):
        """Download every requested table that the site's list offers.

        Args:
            site: Site URL, inserted as-is into the download-list query.
            tables_to_download: Iterable of keys to look up in the JSON
                download list; keys that are missing or map to an empty
                value are skipped silently.

        Raises:
            ValueError: If LogIn() has not succeeded yet.
        """
        if not self.IsLoggedIn():
            raise ValueError('Client not logged in.')
        available = self._GetDownloadList(site)
        sites_json = json.loads(available)
        for key in tables_to_download:
            url = sites_json.get(key)
            if url:
                self._DownloadFile(self._getDateFilterdUrl(url))

    def GetDownloadedFiles(self):
        """Return the list of file names written so far."""
        return self._downloaded

    def GetCaptchaUrl(self):
        """Return the captcha URL currently held by the client."""
        return self._client.captcha_url

    def SetLanguage(self, language_code):
        """Set the language code sent as the `hl` query parameter."""
        self._language = language_code

    def _getDateFilterdUrl(self, url):
        """Return `url` with `de` and `db` both set to TARGET_DAY_STR."""
        dayparam = "&de=%s&db=%s" % (self.TARGET_DAY_STR, self.TARGET_DAY_STR)
        return url + dayparam

    def _GetDownloadList(self, site):
        """Fetch and return the raw download-list response body for `site`."""
        url = self._GetFullUrl(self.LIST_PATH % (self._language, site))
        res_stream = self._client.request('GET', url)
        try:
            return res_stream.read()
        finally:
            # Release the response even when read() fails.
            res_stream.close()

    def _GetFullUrl(self, path):
        """Prefix `path` with the https scheme and HOST."""
        return 'https://' + self.HOST + path

    @classmethod
    def _ExtractFileName(cls, header_value):
        """Return the file name carried by a content-disposition value.

        TEXT_BEFORE_NAME is removed only when it is a true prefix.  Using
        str.lstrip() here would be wrong: it strips a *set of characters*
        and would also eat leading letters of the real name (for example
        'table.csv' would become 'ble.csv').

        Raises:
            ValueError: If the header is missing or empty.
        """
        if not header_value:
            raise ValueError(
                'Response has no %s header; cannot determine file name.'
                % cls.FILE_NAME_HEADER)
        if header_value.startswith(cls.TEXT_BEFORE_NAME):
            return header_value[len(cls.TEXT_BEFORE_NAME):]
        return header_value

    def _DownloadFile(self, path):
        """Download `path` into a local file named by the response header.

        The name is recorded in the downloaded list only after the file has
        been written completely.
        NOTE(review): the file name is server-supplied and used as-is.
        """
        url = self._GetFullUrl(path)
        in_stream = self._client.request('GET', url)
        try:
            file_name = self._ExtractFileName(
                in_stream.getheader(self.FILE_NAME_HEADER))
            # Binary mode: the response body is written through unmodified.
            with open(file_name, 'wb') as out_file:
                out_file.write(in_stream.read())
        finally:
            in_stream.close()
        self._downloaded.append(file_name)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
"""Example script: log in and download the selected tables for one site."""
from downloader import Downloader

# Placeholder settings; replace them with real values before running.
email = 'email'
password = 'password'
website = 'yourwebsite'
# Keys looked up in the site's download list.
selected_downloads = ['TOP_QUERIES', 'TOP_PAGES']


def main():
    """Log in with the configured account and fetch the selected tables."""
    downloader = Downloader()
    downloader.LogIn(email, password)
    downloader.DoDownload(website, selected_downloads)


# Guarded so that importing this module does not trigger a login/download.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment