get whole item tree of Google Drive
from enum import Enum
import time
import os
import httplib2
from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage
import argparse
# Command-line flags handed to oauth2client's tools.run_flow() in get_credentials().
# NOTE: parsed at import time, so loading this module reads sys.argv.
flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()

# OAuth scope requested by get_credentials(). The script only lists item
# metadata, so the read-only metadata scope is used.
# NOTE(review): the original definition was lost from this file; confirm the scope.
SCOPES = 'https://www.googleapis.com/auth/drive.metadata.readonly'
CLIENT_SECRET_FILE = 'client_secret.json'
APPLICATION_NAME = 'Drive API Python Quickstart'
# Page size passed to files().list() by Node.search(); 1000 is the largest
# value the Drive v3 API accepts for pageSize.
MAX_PAGE_SIZE_PER_REQUEST = 1000
def get_credentials():
    """Return OAuth2 user credentials for the Drive API.

    Credentials are looked up in ``~/.credentials/drive-python-quickstart.json``.
    When none are stored, or the stored ones are invalid, the OAuth2 flow
    built from CLIENT_SECRET_FILE and SCOPES is run with that store.

    Returns:
        The credentials object read from the store or produced by the flow.
    """
    credential_dir = os.path.join(os.path.expanduser('~'), '.credentials')
    # Create the directory on first use; exist_ok avoids a check-then-create race.
    os.makedirs(credential_dir, exist_ok=True)
    credential_path = os.path.join(credential_dir, 'drive-python-quickstart.json')
    store = Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        credentials = tools.run_flow(flow, store, flags)
        print('Storing credentials to ' + credential_path)
    return credentials
#class for GoogleDrive item such as file/folder
class Node():
    """A Google Drive item (file or folder) together with its child items.

    Attributes:
        path: slash-separated path of the item, starting at 'root'.
        depth: nesting level of the item (the start node is created with 0).
        file_type: a Node.FileType member.
        file_id: Drive ID of the item.
        children: list of child Node objects, filled in by search().
    """

    class FileType(Enum):
        """Kind of Drive item."""
        FILE = 0
        FOLDER = 1

    # MIME type the Drive API reports for folders.
    FOLDER_MIME_TYPE = 'application/vnd.google-apps.folder'

    def __init__(self, path, depth, file_type, file_id):
        self.path = path
        self.depth = depth
        self.file_type = file_type
        self.file_id = file_id
        self.children = []

    def print_children(self):
        """Print this item and, recursively, everything below it."""
        print('{0}{1} (ID: {2} file_type: {3})'.format(' '*self.depth, os.path.basename(self.path), self.file_id, self.file_type))
        for child in self.children:
            child.print_children()

    def count_children(self):
        """Return (num_files, num_folders) for all items below this one."""
        num_files = 0
        num_folders = 0
        for child in self.children:
            if child.file_type == Node.FileType.FILE:
                num_files += 1
            elif child.file_type == Node.FileType.FOLDER:
                num_folders += 1
                # Only folders have items below them.
                sub_files, sub_folders = child.count_children()
                num_files += sub_files
                num_folders += sub_folders
        return (num_files, num_folders)

    #search child items recursively
    def search(self, drive_service):
        """Populate self.children, depth first, using one list request per page.

        Args:
            drive_service: Drive v3 service object providing files().list().
        """
        if self.file_type == Node.FileType.FILE:
            return  # a plain file has no children to list

        page_token = None
        while True:
            # One response may not hold all children when there are many
            # items, so keep requesting pages until no nextPageToken is returned.
            response = drive_service.files().list(
                q="'%s' in parents and trashed=false" % self.file_id,
                corpus='user', includeTeamDriveItems=False, orderBy='name', pageSize=MAX_PAGE_SIZE_PER_REQUEST, spaces='drive',
                fields='nextPageToken, files(id, name, mimeType)', supportsTeamDrives=False,
                pageToken=page_token).execute()
            for file in response.get('files', []):
                print('Found file: %s (%s), mimeType: %s' % (file['name'], file['id'], file['mimeType']))
                if file['mimeType'] == Node.FOLDER_MIME_TYPE:
                    file_type = Node.FileType.FOLDER
                else:
                    file_type = Node.FileType.FILE
                #new child
                self.children.append(Node(path=self.path+'/'+file['name'], depth=self.depth+1, file_type=file_type, file_id=file['id']))
            page_token = response.get('nextPageToken', None)
            if page_token is None:
                break

        #do the same for children (recursive!)
        for child in self.children:
            child.search(drive_service)
def main():
    """Build the Drive item tree depth-first and report counts and elapsed time."""
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)
    start_time = time.time()
    root = Node(path='root', depth=0, file_type=Node.FileType.FOLDER, file_id='root')
    # Without this call the tree stays empty and the counts below are always zero.
    root.search(service)
    print('\nThere are {0} files and {1} folders.'.format(*root.count_children()))
    print('Search took {0} [sec].'.format(time.time()-start_time))


if __name__ == '__main__':
    main()
from enum import Enum
from queue import Queue
import os
import time
import httplib2
from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage
import argparse
# Command-line flags handed to oauth2client's tools.run_flow() in get_credentials().
# NOTE: parsed at import time, so loading this module reads sys.argv.
flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()

# OAuth scope requested by get_credentials(). The script only lists item
# metadata, so the read-only metadata scope is used.
# NOTE(review): the original definition was lost from this file; confirm the scope.
SCOPES = 'https://www.googleapis.com/auth/drive.metadata.readonly'
CLIENT_SECRET_FILE = 'client_secret.json'
APPLICATION_NAME = 'Drive API Python Quickstart'
def get_credentials():
    """Return OAuth2 user credentials for the Drive API.

    Credentials are looked up in ``~/.credentials/drive-python-quickstart.json``.
    When none are stored, or the stored ones are invalid, the OAuth2 flow
    built from CLIENT_SECRET_FILE and SCOPES is run with that store.

    Returns:
        The credentials object read from the store or produced by the flow.
    """
    credential_dir = os.path.join(os.path.expanduser('~'), '.credentials')
    # Create the directory on first use; exist_ok avoids a check-then-create race.
    os.makedirs(credential_dir, exist_ok=True)
    credential_path = os.path.join(credential_dir, 'drive-python-quickstart.json')
    store = Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        credentials = tools.run_flow(flow, store, flags)
        print('Storing credentials to ' + credential_path)
    return credentials
#class for GoogleDrive item such as file/folder
class Node():
    """A Google Drive item (file or folder) together with its child items.

    Attributes:
        path: slash-separated path of the item, starting at 'root'.
        depth: nesting level of the item (the start node is created with 0).
        file_type: a Node.FileType member.
        file_id: Drive ID of the item.
        children: list of child Node objects, filled in by search().
    """

    class FileType(Enum):
        """Kind of Drive item."""
        FILE = 0
        FOLDER = 1

    # MIME type the Drive API reports for folders.
    FOLDER_MIME_TYPE = 'application/vnd.google-apps.folder'

    def __init__(self, path, depth, file_type, file_id):
        self.path = path
        self.depth = depth
        self.file_type = file_type
        self.file_id = file_id
        self.children = []

    def print_children(self):
        """Print this item and, recursively, everything below it."""
        print('{0}{1} (ID: {2} file_type: {3})'.format(' '*self.depth, os.path.basename(self.path), self.file_id, self.file_type))
        for child in self.children:
            child.print_children()

    def count_children(self):
        """Return (num_files, num_folders) for all items below this one."""
        num_files = 0
        num_folders = 0
        for child in self.children:
            if child.file_type == Node.FileType.FILE:
                num_files += 1
            elif child.file_type == Node.FileType.FOLDER:
                num_folders += 1
                # Only folders have items below them.
                sub_files, sub_folders = child.count_children()
                num_files += sub_files
                num_folders += sub_folders
        return (num_files, num_folders)

    #search child items using combined method of BFS-search and batch-processing
    @classmethod
    def search(cls, start_node, drive_service):
        """Populate the tree below start_node breadth-first using batched list requests.

        Pending list requests are kept in a FIFO queue and packed, up to
        MAX_API_CALLS_PER_BATCH at a time, into batch HTTP requests. Requests
        rejected with a rate-limit error are put back on the queue.

        Args:
            start_node: Node whose descendants are to be discovered.
            drive_service: Drive v3 service object providing files().list()
                and new_batch_http_request().

        Raises:
            Exception: when a request fails for a reason other than a rate limit.
        """
        MAX_API_CALLS_PER_BATCH = 100 #Google says this is the largest value we can use
        # NOTE(review): the original wait duration was lost from this file; 1 second is an assumption.
        RATE_LIMIT_WAIT_SEC = 1

        #class for single request. We pack these requests, as many as possible (up to 100), into every batch.
        class SingleRequest():
            def __init__(self, node, page_token):
                self.node = node
                self.page_token = page_token

        #callback function for batch processing.
        #In every batch process, the argument 'request_id' starts at 1 and increases by 1 for each callback.
        #When a batch is completed and another batch starts, 'request_id' returns to 1.
        def callback(request_id, response, exception):
            nonlocal reached_rate_limit
            parent_node, page_token = req_id_vs_single_request_correspondence_table[request_id] #specify parent node associated with this request, using correspondence table
            print('{0}in callback, request_id: {1}'.format(' '*parent_node.depth, request_id))
            print('{0}parent node: {1}'.format(' '*parent_node.depth, os.path.basename(parent_node.path)))

            if exception:
                if exception._get_reason() == 'User Rate Limit Exceeded' or exception._get_reason() == 'Rate Limit Exceeded':
                    print('Rate Limit Exceeded !!!')
                    queue.put(SingleRequest(node=parent_node, page_token=page_token)) #re-queue this failed request
                    print('{0}re-queueing: {1}'.format(' '*parent_node.depth, os.path.basename(parent_node.path)))
                    print('{0}-> queue size: {1}'.format(' '*parent_node.depth, queue.qsize()))
                    reached_rate_limit = True #tell event to batch maker
                    return  # there is no response to process for a failed request
                raise Exception('Unknown Google Drive REST API Error !!!') from exception

            for file in response.get('files', []):
                print('{0}Found file: {1} ({2}), mimeType: {3}'.format(' '*(parent_node.depth+1), file['name'], file['id'], file['mimeType']))
                if file['mimeType'] == Node.FOLDER_MIME_TYPE:
                    file_type = Node.FileType.FOLDER
                else:
                    file_type = Node.FileType.FILE
                child = Node(path=parent_node.path+'/'+file['name'], depth=parent_node.depth+1, file_type=file_type, file_id=file['id'])
                # Attach the child to the tree; otherwise count_children()/print_children() see nothing.
                parent_node.children.append(child)
                queue.put(SingleRequest(node=child, page_token=None)) #add child node to search queue
                print('{0}queueing: {1}'.format(' '*child.depth, file['name']))

            next_page_token = response.get('nextPageToken', None)
            if next_page_token is not None: #When this response has a next chunk, queue a request for it.
                print('{0}next page: {1}'.format(' '*parent_node.depth, next_page_token))
                queue.put(SingleRequest(node=parent_node, page_token=next_page_token))
                print('{0}queueing: {1}'.format(' '*parent_node.depth, os.path.basename(parent_node.path)))
            print('{0}-> queue size: {1}'.format(' '*(parent_node.depth+1), queue.qsize()))

        #initialize request queue
        queue = Queue()
        queue.put(SingleRequest(node=start_node, page_token=None))
        reached_rate_limit = False
        print('{0}queueing: {1}'.format(' '*start_node.depth, os.path.basename(start_node.path)))

        while not queue.empty():
            #---------- make batch
            if reached_rate_limit: #After '(User) Rate Limit Exceeded' we have to wait, otherwise the next batch will fail.
                time.sleep(RATE_LIMIT_WAIT_SEC)
                reached_rate_limit = False
            batch = drive_service.new_batch_http_request(callback=callback)
            batch_size = 0
            #Correspondence table {request_id (str): (node, page_token)} for the current batch.
            #The callback only receives 'request_id', so this table is how it finds the
            #node being listed and the page token needed to re-queue a rate-limited request.
            req_id_vs_single_request_correspondence_table = {}
            while (not queue.empty()) and (batch_size < MAX_API_CALLS_PER_BATCH): #pack requests as many as possible
                single_request = queue.get()
                node = single_request.node
                print('{0}pop {1} from queue and adding to batch'.format(' '*node.depth, os.path.basename(node.path)))
                if node.file_type == Node.FileType.FILE:
                    continue  # a plain file has no children to list
                batch.add(drive_service.files().list(
                    q="'%s' in parents and trashed=false" % node.file_id,
                    corpus='user', includeTeamDriveItems=False, orderBy='name', pageSize=100, spaces='drive',
                    fields='nextPageToken, files(id, name, mimeType)', supportsTeamDrives=False,
                    pageToken=single_request.page_token))
                batch_size += 1
                req_id_vs_single_request_correspondence_table[str(batch_size)] = (node, single_request.page_token)
                print('{0}-> queue size: {1}'.format(' '*node.depth, queue.qsize()))

            #---------- execute batch (callbacks run here and may queue further requests)
            print('executing batch')
            batch.execute()
def main():
    """Build the Drive item tree with BFS+batch search, print it, and report counts and time."""
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)
    start_time = time.time()
    root = Node(path='root', depth=0, file_type=Node.FileType.FOLDER, file_id='root')
    Node.search(root, service)
    print('tree structure:\n')
    root.print_children()
    print('\nThere are {0} files and {1} folders.'.format(*root.count_children()))
    print('Search took {0} [sec].'.format(time.time()-start_time))


if __name__ == '__main__':
    main()
test of search-parent method
from enum import Enum
import argparse
import time
import logging
import os
import httplib2
from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage
#logging settings
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG) #output DEBUG or higher level messages
fmt = logging.Formatter('%(asctime)s - %(name)s - %(threadName)s - %(levelname)s: %(message)s')
log_sh = logging.StreamHandler()
log_fh = logging.FileHandler('debug.log')
log_efh = logging.FileHandler('error.log')
# Keep error.log limited to ERROR and above so it is not a copy of debug.log.
# NOTE(review): level chosen from the file name; confirm the intended threshold.
log_efh.setLevel(logging.ERROR)
for _handler in (log_sh, log_fh, log_efh):
    # The formatter was created but never attached to any handler before.
    _handler.setFormatter(fmt)
    logger.addHandler(_handler)
def get_credentials():
    """Return OAuth2 user credentials for the Drive API.

    Credentials are looked up in ``~/.credentials/drive-python-quickstart.json``.
    When none are stored, or the stored ones are invalid, the OAuth2 flow
    built from CLIENT_SECRET_FILE and SCOPES is run with that store.

    Returns:
        The credentials object read from the store or produced by the flow.
    """
    credential_folder = os.path.join(os.path.expanduser('~'), '.credentials')
    # Create the directory on first use; exist_ok avoids a check-then-create race.
    os.makedirs(credential_folder, exist_ok=True)
    credential_path = os.path.join(credential_folder, 'drive-python-quickstart.json')
    store = Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        # Command-line flags are parsed only when the interactive flow is needed.
        flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()
        credentials = tools.run_flow(flow, store, flags)
        print('Storing credentials to ' + credential_path)
    return credentials
class FileType(Enum):
    """file type"""
    FILE = 0
    FOLDER = 1  # referenced by Node.count_children() and get_whole_tree()
class Node():
    """class for Google Drive item

    Attributes:
        path: slash-separated path of the item; may be None until
            complement_children_path_depth() is run on an ancestor.
        basename: name of the item itself.
        depth: nesting level of the item; may be None like path.
        file_type: a FileType member.
        file_id: Drive ID of the item.
        children: list of child Node objects.
    """

    def __init__(self, path, basename, depth, file_type, file_id):
        self.path = path
        self.basename = basename
        self.depth = depth
        self.file_type = file_type
        self.file_id = file_id
        self.children = []

    def print_children(self):
        """print all children"""
        print('{0}{1} (ID: {2} file_type: {3})'.format(' '*self.depth, os.path.basename(self.path), self.file_id, self.file_type))
        for child in self.children:
            child.print_children()

    def count_children(self):
        """count all children

        Returns:
            Tuple (num_files, num_folders) covering every item below this one.
        """
        num_files = 0
        num_folders = 0
        for child in self.children:
            if child.file_type == FileType.FILE:
                num_files += 1
            elif child.file_type == FileType.FOLDER:
                num_folders += 1
                # Only folders have items below them.
                sub_files, sub_folders = child.count_children()
                num_files += sub_files
                num_folders += sub_folders
        return (num_files, num_folders)

    def complement_children_path_depth(self):
        """generate children's path and depth information from basename

        Descends through the whole subtree, so every descendant gets its
        path and depth derived from this node's own path and depth.
        """
        for child in self.children:
            child.path = '{0}/{1}'.format(self.path, child.basename)
            child.depth = self.depth+1
            child.complement_children_path_depth()
def get_whole_tree(drive_service):
    """get whole Google Drive item tree

    Lists every non-trashed item owned by the authenticated user in one
    paginated sweep, then links each item to its first listed parent.

    Args:
        drive_service: Drive v3 service object providing about().get(),
            files().get() and files().list().

    Returns:
        The root Node, with children, path and depth filled in for every
        item that could be attached to the tree.
    """
    email_address = drive_service.about().get(fields='user(emailAddress)').execute().get('user')['emailAddress']
    root_id = drive_service.files().get(fileId='root', supportsTeamDrives=False, fields='id', ).execute().get('id')
    root = Node(path='root', basename='root', depth=0, file_type=FileType.FOLDER, file_id=root_id)

    #get all items
    nodes = {root_id: (root, None)}  # {file_id: (node, parent_id)}
    page_token = None
    while True:
        response = drive_service.files().list(corpus='user', includeTeamDriveItems=False, orderBy='name', pageSize=MAX_PAGE_SIZE_PER_REQUEST, pageToken=page_token, q="trashed=false and '{0}' in owners".format(email_address), spaces='drive', supportsTeamDrives=False, fields="nextPageToken, files(id, name, mimeType, parents)").execute()
        for item in response.get('files', []):
            file_name = item['name']
            file_id = item['id']
            parents = item.get('parents')
            if not parents:
                # An item listed without a parents entry cannot be placed in the tree.
                logger.warning('skipping item without parent: %s (%s)', file_name, file_id)
                continue
            parent_id = parents[0]
            if item['mimeType'] == 'application/vnd.google-apps.folder':
                file_type = FileType.FOLDER
            else:
                file_type = FileType.FILE
            node = Node(path=None, basename=file_name, depth=None, file_type=file_type, file_id=file_id)
            nodes[file_id] = (node, parent_id)
            #logger.debug('file_name: {0}, file_id: {1}, parent_id: {2}'.format(file_name, file_id, parent_id))
        page_token = response.get('nextPageToken', None)
        if page_token is None:
            break

    #connect to parent
    for file_id, (node, parent_id) in nodes.items():
        if parent_id is None: #root node
            continue
        parent_entry = nodes.get(parent_id)
        if parent_entry is None:
            # The parent was not part of the listing (the query only returns
            # items owned by this user), so there is nothing to attach to.
            logger.warning('skipping item whose parent was not listed: %s (%s)', node.basename, file_id)
            continue
        parent_entry[0].children.append(node)

    # Fill in path/depth top-down. Done iteratively so a very deep folder
    # hierarchy cannot hit the recursion limit.
    pending = [root]
    while pending:
        parent = pending.pop()
        for child in parent.children:
            child.path = '{0}/{1}'.format(parent.path, child.basename)
            child.depth = parent.depth+1
            pending.append(child)
    return root
if __name__ == '__main__':
    # Constants read as module globals by get_credentials() and get_whole_tree().
    # NOTE(review): SCOPES and MAX_PAGE_SIZE_PER_REQUEST were lost from this file;
    # the read-only metadata scope and the Drive v3 maximum page size are assumed.
    SCOPES = 'https://www.googleapis.com/auth/drive.metadata.readonly'
    CLIENT_SECRET_FILE = 'client_secret.json'
    APPLICATION_NAME = 'Drive API Python Quickstart'
    MAX_PAGE_SIZE_PER_REQUEST = 1000

    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)
    start_time = time.time()
    root = get_whole_tree(drive_service=service)
    print('\nThere are {0} files and {1} folders.'.format(*root.count_children()))
    print('Search took {0} [sec].'.format(time.time()-start_time))
motchy869 commented Nov 1, 2017

Recently, I needed to get the whole item tree of my Google Drive automatically. I found that Python 3 can use the Google Drive REST v3 API. I tried DFS search first. Of course it worked, but it was very slow because each 'list' request makes its own HTTP connection. Then I tried BFS search and extended it to use batch processing. It worked much faster than without batch processing.

I tested DFS search and BFS+batch, and measured the time each took. BFS search without batching gave a similar result to DFS search. I ran these two scripts on my Google Drive, which has 1754 files and 1028 folders. The DFS method took 409 sec. The BFS+batch method took only 64 sec. The BFS+batch method saves much time.

motchy869 commented Nov 4, 2017

The next day I came up with a more efficient way.

1. Get all nodes (including the information: name, id, mimeType, parents) using the files().list() method.
2. Refer to each item's parents attribute, then connect the item to its parent.
3. Now you have the spanning tree you wanted.

Applying this approach, it took only 17.3 sec to build the whole tree of my Google Drive, which has 1754 files and 1028 folders (the same conditions as in my previous comment).

helpse commented Dec 28, 2019

Hey motchy. What would you think about starting the frontend interface using React?

Thank you for your comment. I'm very busy these days, but I'll try it later.

