Skip to content

Instantly share code, notes, and snippets.

@edsu
Last active July 18, 2024 14:48
Show Gist options
  • Save edsu/95dcb1498e61f36938236f411098afbb to your computer and use it in GitHub Desktop.
Save edsu/95dcb1498e61f36938236f411098afbb to your computer and use it in GitHub Desktop.
A sloppy prototype for moving browsertrix WACZs to AWS S3.

This is a sloppy script that downloads some crawls from Browsertrix and uploads them, one by one, to an S3 bucket.

TODO

  • rewrite this with command line options for:

    • s3 bucket to write to
    • what btrix org to limit to
    • what btrix collection to limit to
    • what btrix crawl names to limit to (regex)
  • maybe use rclone to be able to write to more than just s3?

Configure

Install the dependencies:

pip install -r requirements.txt

Then you will need to set the following in your .env file:

BROWSERTRIX_USERNAME=""
BROWSERTRIX_PASSWORD=""
AWS_ACCESS_KEY_ID=""
AWS_SECRET_ACCESS_KEY=""
# and optionally
# AWS_ROLE_ARN=""
# AWS_DEFAULT_REGION=""
#!/usr/bin/env python3
import os
import functools
import subprocess
import boto3
import dotenv
import requests
# Load settings from a local .env file (see the README) before any of the
# os.environ lookups and boto3 calls below run.
dotenv.load_dotenv()
# It might make sense to rewrite this to use rclone, and to let the
# command-line version take arguments that limit it to particular
# organizations, collections, crawls, etc.
class BrowsertrixClient:
    """Small wrapper around the Browsertrix HTTP API.

    All requests go through one ``requests.Session``. After ``login()`` has
    succeeded, every request carries an ``Authorization`` header built from
    the token type and access token returned by the login call.

    NOTE(review): the listing methods return only the ``items`` of a single
    response. If the API paginates, later pages are not fetched -- confirm
    against the Browsertrix API documentation.
    """

    def __init__(self, base_url, timeout=60):
        """Create a client.

        Args:
            base_url: Root URL of the Browsertrix deployment, with or without
                a trailing ``/api`` path and with or without a trailing slash.
            timeout: Seconds passed to ``requests`` as the timeout of every
                call, so a stalled server cannot hang the script forever.
        """
        self.base_url = self._normalize_base_url(base_url)
        self.timeout = timeout
        self.access_token = None
        self.token_type = None
        self.http = requests.Session()

    @staticmethod
    def _normalize_base_url(base_url):
        """Return ``base_url`` ending in exactly one ``/api/``."""
        # Strip trailing slashes first so that "https://host/" does not become
        # "https://host//api/" and "https://host/api" does not become
        # "https://host/api/api/".
        base_url = base_url.rstrip('/')
        if not base_url.endswith('/api'):
            base_url += '/api'
        return base_url + '/'

    def login(self, username, password):
        """Log in with a username and password and remember the token.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        data = self._post(
            'auth/jwt/login',
            {
                "grant_type": "password",
                "username": username,
                "password": password,
            }
        )
        self.access_token = data['access_token']
        self.token_type = data['token_type']

    def organizations(self):
        """Return the ``items`` list of the ``orgs`` endpoint."""
        return self._get('orgs')['items']

    def collections(self, org_id):
        """Return the ``items`` list of an organization's collections."""
        return self._get(f'orgs/{org_id}/collections')['items']

    def crawls(self, org_id):
        """Return the ``items`` list of an organization's crawls."""
        return self._get(f'orgs/{org_id}/crawls')['items']

    def me(self):
        """Return the JSON document of the ``users/me`` endpoint."""
        return self._get('users/me')

    def waczs(self, org_id, crawl_id):
        """Yield each entry of ``resources`` from a crawl's ``replay.json``."""
        results = self._get(f'orgs/{org_id}/crawls/{crawl_id}/replay.json')
        yield from results['resources']

    def _get(self, path, params=None):
        """GET ``path`` relative to the API root and return the decoded JSON."""
        url = self.base_url + path
        resp = self.http.get(
            url,
            params=params,
            headers=self._headers(),
            timeout=self.timeout,
        )
        resp.raise_for_status()
        return resp.json()

    def _post(self, path, data):
        """POST form ``data`` to ``path`` and return the decoded JSON."""
        url = self.base_url + path
        resp = self.http.post(
            url,
            data=data,
            headers=self._headers(),
            timeout=self.timeout,
        )
        resp.raise_for_status()
        return resp.json()

    def _headers(self):
        """Return the request headers; empty until a token has been stored."""
        headers = {}
        if self.access_token:
            headers['Authorization'] = f"{self.token_type} {self.access_token}"
        return headers
def main():
    """Copy the WACZ files of the 'La retaguardia' crawls to S3.

    Logs in to app.browsertrix.com with BROWSERTRIX_USERNAME and
    BROWSERTRIX_PASSWORD, takes the first organization returned, and for each
    crawl named 'La retaguardia' downloads every WACZ with curl and uploads it
    to the 'sul-webarchives' bucket under the 'retaguardia/' prefix. Files
    that already_uploaded() reports as present are skipped.

    Raises:
        SystemExit: if the Browsertrix credentials are missing or no
            organization is returned.
        subprocess.CalledProcessError: if curl fails to download a file.
    """
    username = os.environ.get('BROWSERTRIX_USERNAME')
    password = os.environ.get('BROWSERTRIX_PASSWORD')
    if not username or not password:
        # Fail with a clear message instead of an opaque HTTP error at login.
        raise SystemExit(
            'BROWSERTRIX_USERNAME and BROWSERTRIX_PASSWORD must be set '
            '(for example in your .env file)'
        )

    btrix = BrowsertrixClient('https://app.browsertrix.com')
    btrix.login(username, password)

    orgs = btrix.organizations()
    if not orgs:
        raise SystemExit('no Browsertrix organizations found for this account')
    org_id = orgs[0]['id']

    for crawl in btrix.crawls(org_id):
        if crawl['name'] != 'La retaguardia':
            continue
        for wacz in btrix.waczs(org_id, crawl['id']):
            if already_uploaded(wacz):
                print(f"already uploaded {wacz['name']}")
                continue

            print(f"downloading {wacz['name']}")
            try:
                # --fail makes curl exit non-zero on an HTTP error status;
                # without it the error page would be written to
                # download.wacz and then uploaded as if it were the archive.
                subprocess.run(
                    [
                        'curl', '--silent', '--fail',
                        '--output', 'download.wacz',
                        wacz['path'],
                    ],
                    check=True,
                )

                print(f"uploading {wacz['name']}")
                s3 = get_s3()
                bucket = s3.Bucket('sul-webarchives')
                bucket.upload_file('download.wacz', f"retaguardia/{wacz['name']}")
            finally:
                # Always remove the temporary file, even when the download or
                # the upload failed part-way through.
                try:
                    os.remove('download.wacz')
                except FileNotFoundError:
                    pass
@functools.cache
def s3_files():
    """Return every object summary in the 'sul-webarchives' bucket as a list.

    The result is memoized by functools.cache, so the bucket is listed at
    most once per process; objects uploaded afterwards do not appear in it.
    """
    return list(get_s3().Bucket('sul-webarchives').objects.all())
def get_s3():
    """Return a boto3 S3 service resource.

    When AWS_ROLE_ARN is set, the role is assumed through get_credentials()
    and the resource uses the resulting temporary credentials. When it is
    unset or empty (the README lists it as optional), the resource is built
    from boto3's default credential chain, which picks up AWS_ACCESS_KEY_ID
    and AWS_SECRET_ACCESS_KEY from the environment.
    """
    if not os.environ.get('AWS_ROLE_ARN'):
        # No role configured: calling assume_role without a RoleArn would fail
        # parameter validation, so use the plain environment credentials.
        return boto3.session.Session().resource('s3')

    cred = get_credentials()
    session = boto3.session.Session(
        aws_access_key_id=cred['AccessKeyId'],
        aws_secret_access_key=cred['SecretAccessKey'],
        aws_session_token=cred['SessionToken'],
    )
    return session.resource('s3')
def get_credentials():
    """Assume the role named by AWS_ROLE_ARN and return its credentials.

    Returns the 'Credentials' entry of the STS assume_role response.

    NOTE(review): DurationSeconds is 10000; presumably the role's configured
    maximum session duration has to allow that -- confirm.
    """
    sts_client = boto3.client('sts')
    role_request = {
        'RoleArn': os.environ.get('AWS_ROLE_ARN'),
        'RoleSessionName': 'browsertrix-s3',
        'DurationSeconds': 10000,
    }
    response = sts_client.assume_role(**role_request)
    return response['Credentials']
def already_uploaded(wacz):
    """Tell whether this WACZ is already in the bucket listing.

    A WACZ counts as uploaded when s3_files() contains an object whose key is
    'retaguardia/<name>' and whose size equals the WACZ's 'size'.
    """
    return any(
        entry.key == f"retaguardia/{wacz['name']}" and entry.size == wacz['size']
        for entry in s3_files()
    )
# Run the transfer only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
boto3
requests
python-dotenv
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment