#!/usr/bin/env python3
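"""
Download the WACZ files for a Browsertrix crawl and copy them into the
sul-webarchives S3 bucket, skipping files that have already been uploaded.
"""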

import functools
import os
import subprocess

import boto3
import dotenv
import requests

dotenv.load_dotenv()

# It might make sense to rewrite this to use rclone, and to let the command
# line version take arguments that limit it to particular organizations,
# collections, crawls, etc.


class BrowsertrixClient:
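    """Minimal client for the parts of the Browsertrix API this script uses."""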

    def __init__(self, base_url):
        if not base_url.endswith('/api/'):
            base_url += '/api/'

        self.base_url = base_url
        self.access_token = None
        self.token_type = None
        self.http = requests.Session()

    def login(self, username, password):
        data = self._post(
            'auth/jwt/login',
            {
                "grant_type": "password",
                "username": username,
                "password": password,
            }
        )
        self.access_token = data['access_token']
        self.token_type = data['token_type']

    def organizations(self):
        return self._get('orgs')['items']

    def collections(self, org_id):
        return self._get(f'orgs/{org_id}/collections')['items']

    def crawls(self, org_id):
        return self._get(f'orgs/{org_id}/crawls')['items']

    def me(self):
        return self._get('users/me')

    def waczs(self, org_id, crawl_id):
        results = self._get(f'orgs/{org_id}/crawls/{crawl_id}/replay.json')
        yield from results['resources']

    def _get(self, path, params=None):
        url = self.base_url + path
        resp = self.http.get(url, params=params, headers=self._headers())
        resp.raise_for_status()

        return resp.json()

    def _post(self, path, data):
        url = self.base_url + path
        resp = self.http.post(url, data=data, headers=self._headers())
        resp.raise_for_status()

        return resp.json()

    def _headers(self):
        headers = {}
        if self.access_token:
            headers['Authorization'] = f"{self.token_type} {self.access_token}"

        return headers


def main():
    username = os.environ.get('BROWSERTRIX_USERNAME')
    password = os.environ.get('BROWSERTRIX_PASSWORD')

    btrix = BrowsertrixClient('https://app.browsertrix.com')
    btrix.login(username, password)

    orgs = btrix.organizations()
    org_id = orgs[0]['id']

    # the crawl name, bucket, and key prefix are hardcoded for now (see the
    # note at the top of the file about making these configurable)
    for crawl in btrix.crawls(org_id):
        if crawl['name'] == 'La retaguardia':
            for wacz in btrix.waczs(org_id, crawl['id']):
                if already_uploaded(wacz):
                    print(f"already uploaded {wacz['name']}")
                    continue

                print(f"downloading {wacz['name']}")
                subprocess.run(
                    ['curl', '--silent', '--output', 'download.wacz', wacz['path']],
                    check=True
                )

                print(f"uploading {wacz['name']}")
                s3 = get_s3()
                bucket = s3.Bucket('sul-webarchives')
                bucket.upload_file('download.wacz', f"retaguardia/{wacz['name']}")

                os.remove('download.wacz')


@functools.cache
def s3_files():
    # list the bucket once and cache it, so already_uploaded() doesn't have
    # to re-list the bucket for every WACZ
    s3 = get_s3()
    bucket = s3.Bucket('sul-webarchives')
    return list(bucket.objects.all())


def get_s3():
    cred = get_credentials()
    session = boto3.session.Session(
        aws_access_key_id=cred['AccessKeyId'],
        aws_secret_access_key=cred['SecretAccessKey'],
        aws_session_token=cred['SessionToken']
    )
    return session.resource('s3')


def get_credentials():
    sts = boto3.client('sts')
    cred = sts.assume_role(
        RoleArn=os.environ.get('AWS_ROLE_ARN'),
        RoleSessionName='browsertrix-s3',
        DurationSeconds=10000
    )
    return cred['Credentials']


def already_uploaded(wacz):
    # treat a WACZ as already uploaded if an object with the same key and
    # size is in the bucket
    for obj in s3_files():
        if obj.key == f"retaguardia/{wacz['name']}" and obj.size == wacz['size']:
            return True
    return False


if __name__ == "__main__":
    main()