load_and_run
# coding: utf-8

# In[1]:

# This comes from https://github.com/harterrt/cookiecutter-python-etl/
# Thanks Harter!

# In[2]:

import os

import boto3
import botocore
from boto3.s3.transfer import S3Transfer
# In[3]:

repo_dir = "probe-scraper"
output_dir = "/home/hadoop/analyses/probe_data"
cache_dir = "/home/hadoop/analyses/probe_cache"
repo_https_url = "https://github.com/georgf/probe-scraper"
S3_PUBLIC_BUCKET = "telemetry-public-analysis-2"
S3_DATA_PATH = "probe-scraper/data/"
OUTPUT_FILES = ["general.json", "probes.json", "revisions.json"]
# In[4]:

# Start from a clean slate: remove any previous checkout, output and cache.
get_ipython().system(u'rm -rf $repo_dir')

# In[5]:

get_ipython().system(u'rm -rf $output_dir')

# In[6]:

get_ipython().system(u'rm -rf $cache_dir')

# In[7]:

# Configure the git identity for this machine.
get_ipython().system(u'git config --global user.email "gfritzsche@mozilla.com" && git config --global user.name "Georg Fritzsche"')

# In[8]:

get_ipython().system(u'git clone $repo_https_url $repo_dir')

# In[9]:

# Update the checkout and build the egg.
get_ipython().system(u'cd $repo_dir && git pull origin master && python setup.py bdist_egg')

# In[10]:

get_ipython().system(u'mkdir $output_dir && mkdir $cache_dir')

# In[11]:

# Install the scraper's dependencies and run it, writing the JSON output
# to $output_dir and using $cache_dir for temporary files.
get_ipython().system(u'cd $repo_dir && pip install -r requirements.txt && python probe_scraper/runner.py --outdir $output_dir --tempdir $cache_dir')
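
# In[ ]:

# Quick sanity check (a sketch, not in the original notebook): make sure
# the scraper actually produced the expected files before uploading them.
for file_name in OUTPUT_FILES:
    expected = os.path.join(output_dir, file_name)
    assert os.path.exists(expected), "missing scraper output: " + expected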
# ## Upload the output to S3.

# In[12]:

# Get access to the S3 client API and its managed transfer helper.
client = boto3.client('s3', 'us-west-2')
transfer = S3Transfer(client)
# In[13]:

# Copy the files to S3.
for file_name in OUTPUT_FILES:
    source_path = os.path.join(output_dir, file_name)
    key_path = S3_DATA_PATH + file_name
    print("uploading " + file_name + " to s3: " + key_path)
    transfer.upload_file(source_path, S3_PUBLIC_BUCKET, key_path)
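
# In[ ]:

# S3Transfer.upload_file also accepts an extra_args dict that is passed
# through to the underlying PutObject call. A minimal sketch of the same
# upload with an explicit ACL; upload_public is a hypothetical helper, and
# the public-read ACL is an assumption (the original notebook does not set
# one here), chosen only to match the public bucket name.
def upload_public(file_name):
    transfer.upload_file(
        os.path.join(output_dir, file_name),
        S3_PUBLIC_BUCKET,
        S3_DATA_PATH + file_name,
        extra_args={"ACL": "public-read"},  # assumed, not from the original
    )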
# ## GZIP?
#
# OK, let's try out the gzip content-encoding approach from [this gist](https://gist.github.com/veselosky/9427faa38cee75cd8e27#file-s3gzip-py-L32).
#
# Or should we let CloudFront do this?
# In[14]:

from io import BytesIO
from gzip import GzipFile

# In[15]:

path = "/home/hadoop/analyses/probe_data/probes.json"
with open(path) as fi:
    text_body = fi.read().decode("utf-8")

# In[16]:

# Peek at the first character to check that the file decoded as expected.
text_body[0]

# In[17]:

# Compress the JSON into an in-memory buffer at maximum compression level.
gz_body = BytesIO()
gz = GzipFile(None, 'wb', 9, gz_body)
gz.write(text_body.encode('utf-8'))  # convert unicode strings to bytes!
gz.close()
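
# In[ ]:

# Compare raw vs. compressed sizes (a sanity-check cell, not in the
# original notebook) to see what the gzip pass actually buys us.
print("raw: %d bytes, gzipped: %d bytes" % (
    len(text_body.encode('utf-8')), len(gz_body.getvalue())))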
# In[20]:

client.put_object(
    ACL='public-read',
    Bucket=S3_PUBLIC_BUCKET,
    Key=S3_DATA_PATH + 'probes2.json',  # Note: NO .gz extension!
    ContentType='text/plain',  # the original type
    ContentEncoding='gzip',  # MUST have or browsers will error
    Body=gz_body.getvalue()
)
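
# In[ ]:

# A quick round-trip check (a sketch, not from the original notebook):
# fetch the object back, confirm the encoding header is set, and make sure
# the gzipped body decompresses back to the original text. get_object
# returns the raw gzipped bytes; the ContentEncoding header is what tells
# browsers to decompress transparently.
obj = client.get_object(Bucket=S3_PUBLIC_BUCKET, Key=S3_DATA_PATH + 'probes2.json')
assert obj['ContentEncoding'] == 'gzip'
roundtrip = GzipFile(None, 'rb', fileobj=BytesIO(obj['Body'].read())).read()
assert roundtrip.decode('utf-8') == text_body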

# In[ ]: