Last active
February 24, 2019 16:35
-
-
Save y3nr1ng/dcd99d0becb5dbe077cd7e0e167bac20 to your computer and use it in GitHub Desktop.
Google Drive CLI parallel downloader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io | |
import logging | |
from multiprocessing import Pool | |
import os | |
from functools import partial | |
import random | |
import re | |
import sched | |
import signal | |
import time | |
from tqdm import tqdm | |
import coloredlogs | |
import httplib2 | |
from pydrive.auth import AuthenticationError, GoogleAuth, RefreshError | |
from pydrive.auth import ServiceAccountCredentials as SAC | |
from pydrive.drive import GoogleDrive | |
# Module-wide logger; every function in this script logs through it.
logger = logging.getLogger(__name__)

# Console sink: coloredlogs is installed on this logger at INFO.
coloredlogs.install(
    level='INFO',
    logger=logger,
    datefmt='%H:%M:%S',
    fmt='%(asctime)s %(levelname)s %(message)s',
)

# File sink: records at ERROR and above are also written to error.log.
err_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
err_handler = logging.FileHandler('error.log')
err_handler.setFormatter(err_formatter)
err_handler.setLevel(logging.ERROR)
logger.addHandler(err_handler)
class TqdmToLogger(io.StringIO):
    """
    Output stream for TQDM which will output to logger module instead of the sys.stdout

    Only the most recent write is remembered; ``flush()`` forwards it to the
    logger as a single record. Writes that contain nothing but whitespace
    (for example a bare newline) produce no log record.
    """
    # Class-level defaults; each instance overrides logger/level in __init__.
    logger = None
    level = None
    # Last text written, with surrounding whitespace/control characters removed.
    buf = ''

    def __init__(self, logger, level=logging.INFO):
        """Bind the stream to ``logger``; records are emitted at ``level``."""
        super().__init__()
        self.logger = logger
        self.level = level

    def write(self, buf):
        """Remember ``buf`` (stripped) and return the number of characters accepted."""
        self.buf = buf.strip('\r\n\t ')
        # Text streams report how many characters were written.
        return len(buf)

    def flush(self):
        """Log the remembered text, skipping blank buffers to avoid empty records."""
        if self.buf:
            self.logger.log(self.level, self.buf)
# Stream handed to tqdm so progress lines go through the module logger at INFO.
tqdm_out = TqdmToLogger(logger, level=logging.INFO)

gauth = GoogleAuth()

# method 1 - service account
#scope = ['https://www.googleapis.com/auth/drive.file']
#gauth.credentials = SAC.from_json_keyfile_name('client_secret.json', scope)

# method 2 - oauth
# Interactive flow: show the authorization URL, read the code the user pastes
# back, and hand it to the GoogleAuth object.
auth_url = gauth.GetAuthUrl()
print(
    "Please visit\n\n{}\n\nto retrieve OAuth authorization code.".format(auth_url)
)
code = input(".. CODE: ")
gauth.Auth(code)

# Drive client used by the functions below.
drive = GoogleDrive(gauth)
def download_file(fo, dst_dir='', pause_range=(10, 30)):
    """
    Download one Drive file object into ``dst_dir``, then pause.

    Args:
        fo: file object indexed by ``'title'`` and providing
            ``GetContentFile(path)`` (presumably a PyDrive ``GoogleDriveFile``
            -- confirm against caller).
        dst_dir: directory the file is written into.
        pause_range: inclusive ``(low, high)`` bounds, in whole seconds, of
            the random pause taken after the attempt.

    Raises:
        Whatever ``fo.GetContentFile`` raises; the pause still happens first.
    """
    fn = fo['title']
    # Use the module logger: the root-level logging.debug() helper implicitly
    # configures the root logger, which would duplicate later records.
    logger.debug("downloading \"{}\"".format(fn))

    # A title may contain path separators; keep the download inside dst_dir.
    safe_name = fn.replace('/', '_')
    if os.sep != '/':
        safe_name = safe_name.replace(os.sep, '_')
    fp = os.path.join(dst_dir, safe_name)

    try:
        fo.GetContentFile(fp)
    finally:
        # Pause after every attempt, successful or not.
        low, high = pause_range
        time.sleep(random.randint(low, high))
def download_file_id(file_id, dst_dir='', n_retries=5):
    """
    Download the Drive file with id ``file_id`` into ``dst_dir``, retrying on
    known transient errors.

    Args:
        file_id: Drive id of the file to fetch.
        dst_dir: directory the file is written into.
        n_retries: maximum number of download attempts.

    Returns:
        None. Failures are logged rather than raised, so one bad file does
        not propagate an exception to the caller.
    """
    try:
        fo = drive.CreateFile({'id': file_id})
        for i in range(n_retries):
            try:
                download_file(fo, dst_dir=dst_dir)
                return
            except ConnectionAbortedError:
                logger.warning("disconnected, retry ({})".format(i))
            except httplib2.ServerNotFoundError:
                logger.warning("throttled, retry ({})".format(i))
            except (AuthenticationError, RefreshError):
                logger.warning("token expired, retry ({})".format(i))
        # Every attempt hit one of the retryable errors above.
        logger.error("\"{}\", give up after retries".format(file_id))
    except Exception:
        # Catch Exception (not a bare except) so interpreter-exit signals are
        # not swallowed, and record the traceback so the failure is diagnosable.
        logger.exception("\"{}\", unknown exception".format(file_id))
def scan_for_files(drive, file_id, max_return=32):
    """
    Lazily yield the non-folder entries whose parent is ``file_id``.

    Args:
        drive: object providing ``ListFile(query)`` (presumably a PyDrive
            ``GoogleDrive`` -- confirm against caller).
        file_id: id of the parent folder to list.
        max_return: value sent as the ``maxResults`` query field.

    Yields:
        Each listed entry whose ``'mimeType'`` is not the Drive folder type;
        nested folders are skipped, not descended into.
    """
    folder_mime = 'application/vnd.google-apps.folder'

    logger.info("scanning \"{}\"...".format(file_id))
    query = {
        'q': "'{}' in parents and trashed=false".format(file_id),
        'maxResults': max_return
    }
    for page in drive.ListFile(query):
        for entry in page:
            if entry['mimeType'] != folder_mime:
                yield entry
def get_folder_name(drive, file_id):
    """Return the ``'title'`` of the item that ``drive.CreateFile`` builds for ``file_id``."""
    return drive.CreateFile({'id': file_id})['title']
def find_file_id(url, pattern=r'https://drive.google.com/drive/folders/(.*)$'):
    """
    Extract the folder id from a Google Drive folder URL.

    Args:
        url: folder URL, e.g.
            ``https://drive.google.com/drive/folders/<id>?usp=sharing``.
        pattern: regular expression whose first group captures the id.

    Returns:
        The captured id with any trailing path segment, query string or
        fragment removed.

    Raises:
        ValueError: if ``pattern`` does not match or the captured id is empty.
    """
    token = re.search(pattern, url)
    if token is None:
        raise ValueError("unable to determine file id")

    # The default pattern is greedy to end-of-string, so drop anything that
    # follows the id itself ("/", "?usp=sharing", "#fragment").
    captured = token.group(1) or ''
    file_id = re.split(r'[/?#]', captured, maxsplit=1)[0].strip()
    if not file_id:
        raise ValueError("unable to determine file id")
    return file_id
def init_worker():
    """Pool initializer: set the SIGINT disposition of the calling process to ignore."""
    ignore = signal.SIG_IGN
    signal.signal(signal.SIGINT, ignore)
if __name__ == '__main__':
    # Script entry point: resolve the folder, build (or reload) the list of
    # file ids, then download them with a pool of worker processes.
    url = '###TARGET FOLDER URL###'
    dst_dir = '###DESTINATION###'
    n_workers = 4
    id_list = 'id.txt'      # cache of file ids, one per line
    report_every = 5        # seconds between "found N files" progress lines

    # target
    file_id = find_file_id(url)

    # destination: <dst_dir>/<folder title>
    dn = get_folder_name(drive, file_id)
    dst_dir = os.path.join(os.path.expanduser(dst_dir), dn)
    if not os.path.isdir(dst_dir):
        # Real failures (e.g. permissions) surface here instead of being
        # silently swallowed and breaking every download later.
        os.makedirs(dst_dir, exist_ok=True)
        logger.info("destination \"{}\" created".format(dst_dir))

    if not os.path.exists(id_list):
        # First run: scan the folder, keeping the ids in memory for the
        # download step below and caching them on disk for later runs.
        file_ids = []
        tmp_list = id_list + '.part'
        last_report = time.monotonic()
        with open(tmp_list, 'w') as fd:
            for fo in scan_for_files(drive, file_id):
                file_ids.append(fo['id'])
                fd.write("{}\n".format(fo['id']))
                now = time.monotonic()
                if now - last_report >= report_every:
                    fd.flush()
                    logger.info("found {} files".format(len(file_ids)))
                    last_report = now
        # Publish the cache only once the scan finished, so an interrupted
        # scan never leaves a truncated list that later runs would trust.
        os.replace(tmp_list, id_list)
        logger.info("found {} files".format(len(file_ids)))
    else:
        # Later runs: reuse the cached id list, ignoring blank lines.
        with open(id_list, 'r') as fd:
            file_ids = [line.strip() for line in fd if line.strip()]

    # download
    pool = Pool(n_workers, init_worker)
    try:
        func = partial(download_file_id, dst_dir=dst_dir)
        with tqdm(
            total=len(file_ids), mininterval=5,
            file=tqdm_out
        ) as pbar:
            # One progress bar only; it advances as each worker task finishes.
            for _ in pool.imap_unordered(func, file_ids):
                pbar.update()
    except KeyboardInterrupt:
        # Workers ignore SIGINT (see init_worker); the parent stops them below.
        logger.warning("interrupted, terminating workers")
    finally:
        pool.terminate()
        pool.join()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
cachetools==3.1.0
certifi==2018.11.29
coloredlogs==10.0
google-api-python-client==1.7.8
google-auth==1.6.3
google-auth-httplib2==0.0.3
httplib2==0.12.1
humanfriendly==4.18
oauth2client==4.1.3
pyasn1==0.4.5
pyasn1-modules==0.2.4
PyDrive==1.3.1
PyYAML==3.13
rsa==4.0
six==1.12.0
tqdm==4.31.1
uritemplate==3.0.0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
client_config_backend: settings
client_config:
  client_id: ###CLIENT ID###
  client_secret: ###CLIENT SECRET###
save_credentials: True
save_credentials_backend: file
save_credentials_file: credentials.json
get_refresh_token: True
oauth_scope:
  - https://www.googleapis.com/auth/drive.readonly
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment