Recursively migrate all media assets from Google Drive to Kaltura using a Bulk Upload CSV
""" | |
Google Drive Media Extractor for Kaltura Upload: | |
This script processes a specified Google Drive folder to identify media files | |
(audio, video, and images) and produces a CSV file suitable for bulk upload | |
to the Kaltura platform. It recursively traverses through all subfolders, | |
captures metadata about the media files, and appends them to the CSV. Media | |
files are determined based on their MIME type. | |
Author Metadata: | |
- Name: Zohar Babin | |
- Date: October 11, 2023 | |
- Contact: @zoharbabin on GitHub / @zohar on X.com | |
- License: MIT | |
Setup: | |
1. Obtain a service account key from the Google Cloud Console: | |
a. Go to the Google Cloud Console (https://console.cloud.google.com/). | |
b. Navigate to IAM & Admin > Service accounts. | |
c. Create a new service account or select an existing one. | |
d. Under "Keys", add a new JSON key. | |
e. Save the downloaded JSON file as 'credentials.json' in the script's directory. | |
f. Ensure the service account has permissions for Drive API access and can | |
read metadata from the desired Google Drive folder. | |
2. Install required packages via pip: | |
- pandas | |
- google-auth, google-auth-oauthlib, google-auth-httplib2 | |
- google-api-python-client | |
- halo | |
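        Example (one possible pip invocation, installing the packages listed above):
            pip install pandas google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client halo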

Usage:
    Run the script from the command line and provide the Google Drive folder ID
    and optionally a root category name for the Kaltura CSV:
        python script_name.py [folder_id] [root_category_name]
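    Example (hypothetical folder ID and category name):
        python script_name.py 1AbCdEfGhIjKlMnOpQrStUvWx "Drive Archive"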

Note:
    Ensure 'credentials.json' is present in the working directory and that the
    service account it belongs to has the necessary permissions.
"""

import argparse

import pandas as pd
from google.oauth2 import service_account
from googleapiclient.discovery import build
from halo import Halo


# Function to recursively list files and folders from a Google Drive folder and collect media files.
def list_files(folder_id, folder_name, spinner, folder_path=''):
    """
    Recursively list files and folders from a Google Drive folder.

    Parameters:
    - folder_id (str): The ID of the Google Drive folder.
    - folder_name (str): Name of the current folder.
    - spinner (halo.Halo): Spinner instance for console output.
    - folder_path (str): Path of the current folder, default is empty.

    Returns:
    None. The function directly appends to the global rows_list.
    """
    # Query to fetch files and folders inside the current folder, following
    # nextPageToken so folders with more items than one API page are fully listed.
    query = f"'{folder_id}' in parents"
    fields = "nextPageToken, files(id, name, mimeType, owners, description, kind, fileExtension)"
    items, page_token = [], None
    while True:
        results = drive_service.files().list(q=query, fields=fields, pageToken=page_token).execute()
        items.extend(results.get('files', []))
        page_token = results.get('nextPageToken')
        if not page_token:
            break
    for item in items:
        mime_type = item['mimeType']
        file_id = item['id']
        file_name = item['name']
        # If the item is a folder, recursively call the function
        if mime_type == 'application/vnd.google-apps.folder':
            separator = '>' if folder_path else ''  # Avoid adding the separator before the first folder
            new_folder_path = f'{folder_path}{separator}{file_name}'
            list_files(file_id, file_name, spinner, new_folder_path)
        # Check if the item is an image, video, or audio file
        elif mime_type.startswith(('image/', 'video/', 'audio/')):
            spinner.text = f"Processing {file_name} ({mime_type}) in folder {folder_path or '/'}"
            media_type = mime_type.split('/')[0]  # Extracts "audio", "video", or "image" from mime_type
            user_name = item['owners'][0]['displayName']
            user_email = item['owners'][0]['emailAddress']
            file_extension = item.get('fileExtension', '')
            description = item.get('description', '')
            download_url = f'https://drive.google.com/uc?export=download&id={file_id}'
            row_dict = {
                'title': file_name,
                'description': f'By {user_name} in {folder_name}. \n{description}',
                'tags': '',
                'url': download_url,
                'contentType': media_type.capitalize(),
                'category': folder_path,
                'scheduleStartDate': '',
                'scheduleEndDate': '',
                'thumbnailUrl': '',
                'partnerData': '',
                'creatorId': user_email,
                'entitledUsersEdit': '',
                'entitledUsersPublish': '',
                'ownerId': user_email
            }
            rows_list.append(row_dict)
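
# Illustrative only: a single row produced above might look like the following
# (all values are hypothetical, not taken from a real Drive folder):
#   title:       'team_intro.mp4'
#   description: 'By Jane Doe in Marketing. \nShort team introduction clip'
#   url:         'https://drive.google.com/uc?export=download&id=<file_id>'
#   contentType: 'Video'
#   category:    'MyRootCategory>Marketing'
#   creatorId:   'jane.doe@example.com'
#   ownerId:     'jane.doe@example.com'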

def main(folder_id, root_category_name):
    """
    Main function to process files from a Google Drive folder and save them in a CSV file.

    Parameters:
    - folder_id (str): The ID of the Google Drive folder.
    - root_category_name (str): Root category name prepended to every category path in the CSV.

    Returns:
    None. Writes results to 'kaltura_upload.csv'.
    """
    with Halo(text='Processing files', spinner='dots') as spinner:
        list_files(folder_id, 'Root', spinner, root_category_name)
        spinner.succeed('Processing completed')
    df = pd.DataFrame(rows_list)
    df.to_csv('kaltura_upload.csv', index=False, encoding='utf-8-sig')


if __name__ == "__main__":
    # Argument parsing to get folder_id and root_category_name from the command line
    parser = argparse.ArgumentParser(description='Process a Google Drive folder to create a Kaltura bulk upload CSV.')
    parser.add_argument('folder_id', help='The ID of the Google Drive folder to process.')
    parser.add_argument('root_category_name', nargs='?', default='',
                        help='If provided, used as the root category prepended to all categories in the Kaltura CSV.')
    args = parser.parse_args()

    # Load the service account credentials for the Google Drive API
    creds = service_account.Credentials.from_service_account_file(
        './credentials.json', scopes=['https://www.googleapis.com/auth/drive.metadata.readonly'])

    # Initialize the Google Drive API client
    drive_service = build('drive', 'v3', credentials=creds)

    # Global list to store rows for the CSV
    rows_list = []

    main(args.folder_id, args.root_category_name)
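
Once kaltura_upload.csv has been generated, it can be submitted through Kaltura's bulk upload (for example, via the KMC upload menu or the bulk upload API). Before doing so, a quick sanity check helps catch rows with missing values. The sketch below is illustrative only: the column names match the row_dict keys written by the script, but which fields your Kaltura account treats as mandatory is an assumption you should verify against your own bulk upload CSV template.

import pandas as pd

# Load the CSV written by the script above.
df = pd.read_csv('kaltura_upload.csv')

# Columns assumed to be required for a Kaltura bulk upload row -- verify
# against your account's bulk upload template before relying on this list.
required = ['title', 'url', 'contentType', 'category', 'creatorId', 'ownerId']

print(f'{len(df)} rows in kaltura_upload.csv')
for col in required:
    empty = df[col].isna() | (df[col].astype(str).str.strip() == '')
    if empty.any():
        print(f'WARNING: {empty.sum()} row(s) have an empty "{col}" value')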