Skip to content

Instantly share code, notes, and snippets.

@hellais
Created May 11, 2020 11:20
Show Gist options
  • Save hellais/d60c3ca9d4d456c01ecf793a5763d510 to your computer and use it in GitHub Desktop.
Save hellais/d60c3ca9d4d456c01ecf793a5763d510 to your computer and use it in GitHub Desktop.
import os
import io
import yaml
import json
from datetime import datetime, timedelta
import pandas as pd
import psycopg2
import boto3
import requests
from tqdm import tqdm
from dateutil.parser import parse as parse_date
from urllib.parse import urlencode, quote, urlparse
import requests
import lz4framed
from urllib.parse import urljoin
import gzip
import subprocess
import tarfile
# Base URL that the index path and the tarball filenames below are joined
# onto with urljoin(); the trailing slash matters, because without it
# urljoin() would replace the last path segment instead of appending to it.
CANNED_BASE_URL = 'http://s3.amazonaws.com/ooni-data/canned/'
def load_canned_index(bucket_date, timeout=60):
    """Download and decompress the canned index of one bucket.

    Fetches ``<bucket_date>/index.json.gz`` relative to ``CANNED_BASE_URL``,
    gunzips the response body and splits it on newline bytes.

    Args:
        bucket_date: Directory component of the index URL (the script in
            this file passes ``'2020-04-01'``).
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection raises instead of blocking forever.

    Returns:
        A list of ``bytes`` objects, one per newline-separated chunk of the
        decompressed index. When the payload ends with a newline the last
        element is ``b''``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
        OSError: If the response body is not valid gzip data.
    """
    index_url = urljoin(
        CANNED_BASE_URL, '{}/index.json.gz'.format(bucket_date)
    )
    r = requests.get(index_url, timeout=timeout)
    # Fail here on HTTP errors; otherwise the error page would be handed to
    # gzip and surface as a confusing "not a gzipped file" exception.
    r.raise_for_status()
    return gzip.decompress(r.content).split(b'\n')
def decompress_tar(filename, timeout=60):
    """Download an lz4-compressed tarball and open it as a ``TarFile``.

    The file is fetched relative to ``CANNED_BASE_URL``, piped through the
    external ``lz4 -d`` command, and the decompressed bytes are opened as an
    uncompressed tar archive held entirely in memory.

    Args:
        filename: Path of the tarball relative to ``CANNED_BASE_URL``.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection raises instead of blocking forever.

    Returns:
        A ``tarfile.TarFile`` opened in ``"r:"`` mode over an in-memory
        buffer.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
        FileNotFoundError: If the ``lz4`` executable is not installed.
        subprocess.CalledProcessError: If ``lz4 -d`` exits non-zero.
        tarfile.ReadError: If the decompressed data is not a tar archive.
    """
    # Download before spawning lz4 so a failed request cannot leave a child
    # process behind with open pipes.
    r = requests.get(urljoin(CANNED_BASE_URL, filename), timeout=timeout)
    r.raise_for_status()
    # run() feeds stdin, drains stdout and reaps the child; check=True turns
    # a failed decompression into an error here rather than a tarfile error
    # on truncated output later.
    proc = subprocess.run(
        ["lz4", "-d"],
        input=r.content,
        stdout=subprocess.PIPE,
        check=True,
    )
    return tarfile.open(mode="r:", fileobj=io.BytesIO(proc.stdout))
# Example run: fetch the index of the 2020-04-01 bucket as a list of raw
# byte lines.
canned_index = load_canned_index('2020-04-01')
# XXX: index entries still need to be filtered by test_name; until then the
# entry at list position 3 is picked by hand.
can = json.loads(canned_index[3])
# Download and unpack the tarball named by that entry's 'filename' key, then
# list the members it contains.
tarfd = decompress_tar(can['filename'])
print(tarfd.getmembers())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment