Created
May 11, 2020 11:20
-
-
Save hellais/d60c3ca9d4d456c01ecf793a5763d510 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import io | |
import yaml | |
import json | |
from datetime import datetime, timedelta | |
import pandas as pd | |
import psycopg2 | |
import boto3 | |
import requests | |
from tqdm import tqdm | |
from dateutil.parser import parse as parse_date | |
from urllib.parse import urlencode, quote, urlparse | |
import requests | |
import lz4framed | |
from urllib.parse import urljoin | |
import gzip | |
import subprocess | |
import tarfile | |
# Base URL that every canned-data path in this script is resolved against
# with urljoin(). The trailing slash is significant: without it urljoin()
# would replace the last path segment ("canned") instead of appending to it.
CANNED_BASE_URL = 'http://s3.amazonaws.com/ooni-data/canned/'
def load_canned_index(bucket_date):
    """Download and decompress the index of one canned-data bucket.

    Fetches ``<bucket_date>/index.json.gz`` relative to ``CANNED_BASE_URL``,
    gunzips the response body and splits it on newlines.

    Args:
        bucket_date: Name of the bucket directory as it appears in the URL
            path (the script below passes ``'2020-04-01'``).

    Returns:
        A list of ``bytes`` objects, one per newline-separated line of the
        decompressed index. If the index ends with a newline the last
        element is an empty ``bytes`` object.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        OSError: If the response body is not valid gzip data.
    """
    index_url = urljoin(
        CANNED_BASE_URL, '{}/index.json.gz'.format(bucket_date)
    )
    r = requests.get(index_url)
    # Fail loudly on HTTP errors; otherwise an error page would be handed to
    # gzip and surface as a confusing "not a gzipped file" failure.
    r.raise_for_status()
    return gzip.decompress(r.content).split(b'\n')
def decompress_tar(filename):
    """Download an lz4-compressed tar archive and open it from memory.

    The file is fetched relative to ``CANNED_BASE_URL``, piped through the
    external ``lz4 -d`` command, and the decompressed bytes are wrapped in
    an in-memory buffer that ``tarfile`` reads from.

    Args:
        filename: Path of the archive relative to ``CANNED_BASE_URL``.

    Returns:
        A ``tarfile.TarFile`` opened in ``"r:"`` (uncompressed) mode over the
        decompressed data. The caller is responsible for closing it.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        FileNotFoundError: If the ``lz4`` executable cannot be found.
        subprocess.CalledProcessError: If ``lz4 -d`` exits with a non-zero
            status.
        tarfile.TarError: If the decompressed data is not a valid tar file.
    """
    # Download first, so a failed request does not leave an lz4 child
    # process running with its pipes open.
    r = requests.get(urljoin(CANNED_BASE_URL, filename))
    r.raise_for_status()

    # check=True turns a failed decompression into an explicit error instead
    # of passing truncated or empty output on to tarfile.
    completed = subprocess.run(
        ["lz4", "-d"],
        input=r.content,
        stdout=subprocess.PIPE,
        check=True,
    )
    return tarfile.open(mode="r:", fileobj=io.BytesIO(completed.stdout))
# Script body: fetch the index of one bucket, pick a single entry, download
# the archive it names and list the archive members.
canned_index = load_canned_index('2020-04-01')
## XXX need to filter by test_name
# Each index line is parsed as a JSON object; the entry at position 3 is
# hard-coded until the filtering mentioned above is implemented.
can = json.loads(canned_index[3])
# Use the archive as a context manager so it is closed once the listing has
# been printed.
with decompress_tar(can['filename']) as tarfd:
    print(tarfd.getmembers())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment