Created
January 16, 2018 13:52
-
-
Save coline-carle/f07e32eabfe170ad695bdaadf73162c6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import re | |
import sys | |
import requests | |
import getopt | |
import glob | |
import os | |
import datetime | |
from db import Page, Session | |
# Module-level HTTP session created at import time.
# NOTE(review): nothing visible in this file references `sess`;
# FileMirror.download calls requests.get directly -- confirm whether this
# object is still needed.
sess = requests.session()
class FileMirror(object):
    """Mirror pages of one type into ``<local_path>/<type>/<gameID>.html``.

    The database session is used to look up ``Page`` rows and to record, in
    ``Page.localLastMod``, when the local copy of each page was last written.
    """

    # Extracts the numeric page key from a mirrored file path. Either path
    # separator is accepted so that paths produced by os.path.join/glob on
    # Windows match as well as POSIX ones.
    key_regexp = r'[\\/](\d+)\.html$'

    def __init__(self, session, type="quest", local_path="mirror"):
        """Bind the mirror to a database session and a local directory.

        Args:
            session: database session used for ``Page`` queries and commits.
            type: page type; also the sub-directory name under ``local_path``.
                (The name shadows the builtin but is kept for callers that
                pass it by keyword.)
            local_path: root directory of the mirror.

        Raises:
            Exception: if ``<local_path>/<type>`` is not an existing directory.
        """
        # Compile per instance so a subclass may override key_regexp with
        # another pattern string.
        self.key_regexp = re.compile(self.key_regexp)
        self.__local_path = local_path
        self.__type = type
        self.__fullpath = os.path.join(local_path, type)
        self.__session = session
        if not os.path.isdir(self.__fullpath):
            raise Exception(
                "{} is not a valid directory".format(self.__fullpath))

    def scan_local(self):
        """Rebuild every ``Page.localLastMod`` from the files on disk.

        All rows are first reset to ``None``; then each file found in the
        mirror directory has its modification time stored on the matching
        page. A commit is issued after every file.

        Raises:
            Exception: if a file name does not match ``key_regexp``.
        """
        print("syncing database with already downloaded page (slow operation)")
        self.__session.query(Page).update({'localLastMod': None})
        self.__session.commit()
        for path in glob.glob(os.path.join(self.__fullpath, '*')):
            db_key = self.get_key_from_filename(path)
            # NOTE(review): .one() presumably raises when no row (or more
            # than one) matches the key -- confirm against the db module.
            page = self.__session.query(Page).filter(
                Page.gameID == db_key).one()
            page.localLastMod = self.last_mod_datetime(path)
            self.__session.commit()

    def last_mod_datetime(self, filename):
        """Return the modification time of ``filename`` as a naive local datetime."""
        timestamp = os.path.getmtime(filename)
        return datetime.datetime.fromtimestamp(timestamp)

    def get_key_from_filename(self, filename):
        """Return the integer page key encoded in a mirrored file path.

        Raises:
            Exception: if ``filename`` does not end in ``<sep><digits>.html``.
        """
        match = self.key_regexp.search(filename)
        if match:
            return int(match.group(1))
        raise Exception(
            "filename {} does not match standard filename pattern".format(
                filename))

    def stats(self):
        """Print statistics for this mirror's page type via ``Page.print_stats``."""
        Page.print_stats(self.__session, self.__type)

    def download_outdated(self):
        """Download every page returned by ``Page.outdated`` for this type."""
        for page in Page.outdated(self.__session, self.__type):
            self.download(page)

    def download_missing(self):
        """Download every page returned by ``Page.missing`` for this type."""
        for page in Page.missing(self.__session, self.__type):
            self.download(page)

    def download(self, page):
        """Fetch ``page.loc`` and store the response body locally.

        HTTP and transport errors are printed and swallowed so that one
        failing page does not abort a batch run.
        """
        try:
            response = requests.get(page.loc)
            response.raise_for_status()
        except (requests.exceptions.HTTPError,
                requests.exceptions.TooManyRedirects,
                requests.exceptions.Timeout,
                # ConnectionError is the base class of ProxyError and
                # SSLError, so those need no handlers of their own.
                requests.exceptions.ConnectionError) as err:
            print(err)
        else:
            self.save_page(page, response)
            print("Downloaded: %s" % (page.loc))

    def get_filename(self, page):
        """Return the local file path for ``page``: ``<fullpath>/<gameID>.html``."""
        return os.path.join(self.__fullpath, "%d.html" % (page.gameID))

    def save_page(self, page, response):
        """Write the response body to disk and record the download time.

        The raw bytes (``response.content``) are written unchanged, then
        ``page.localLastMod`` is set to the current local time and committed.
        """
        filename = self.get_filename(page)
        with open(filename, 'wb') as f:
            f.write(response.content)
        page.localLastMod = datetime.datetime.now()
        self.__session.commit()
def main(argv):
    """Run the mirror from the command line.

    With ``-s`` / ``--sync`` the database is re-synchronised with the files
    already on disk and the process exits with status 0. Otherwise statistics
    are printed and the missing pages, then the outdated pages, are
    downloaded. Positional arguments are ignored.

    Args:
        argv: command-line arguments without the program name.

    Raises:
        SystemExit: status 1 on an unrecognised option; status 0 after a
            sync run.
    """
    sync = False
    try:
        # The original spec ":s" carried a stray leading colon that getopt
        # ignores; "s" declares the same single flag.
        opts, _positional = getopt.getopt(argv, "s", ["sync"])
    except getopt.GetoptError:
        print("invalid argument")
        sys.exit(1)
    for opt, _value in opts:
        if opt in ("-s", "--sync"):
            sync = True

    dbsession = Session()
    # try/finally guarantees the session is closed on every exit path,
    # including the SystemExit raised by the sync branch and any error
    # raised while constructing the mirror or downloading.
    try:
        fileMirror = FileMirror(dbsession)
        if sync:
            fileMirror.scan_local()
            sys.exit(0)
        fileMirror.stats()
        print("downloading missing pages")
        fileMirror.download_missing()
        print("downloading outdated pages")
        fileMirror.download_outdated()
    finally:
        dbsession.close()
# Script entry point: pass the command-line arguments (program name
# stripped) to main() when the file is executed directly.
if __name__ == "__main__":
    main(sys.argv[1:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment