# load_and_run: notebook export from gist georgf/0e6043c998c3bca754570a86d875d397 (created July 14, 2017)
# coding: utf-8
# In[1]:
# This comes from https://github.com/harterrt/cookiecutter-python-etl/
# Thanks Harter!
# In[2]:
import boto3
import boto3.s3.transfer  # imported explicitly so boto3.s3.transfer.S3Transfer resolves below
import os
# In[3]:
repo_dir = "probe-scraper"
output_dir = "/home/hadoop/analyses/probe_data"
cache_dir = "/home/hadoop/analyses/probe_cache"
repo_https_url = "https://github.com/georgf/probe-scraper"
S3_PUBLIC_BUCKET = "telemetry-public-analysis-2"
S3_DATA_PATH = "probe-scraper/data/"
OUTPUT_FILES = ["general.json", "probes.json", "revisions.json"]
# In[4]:
get_ipython().system(u'rm -rf $repo_dir')
# In[5]:
get_ipython().system(u'rm -rf $output_dir')
# In[6]:
get_ipython().system(u'rm -rf $cache_dir')
# In[7]:
get_ipython().system(u'git config --global user.email "gfritzsche@mozilla.com" && git config --global user.name "Georg Fritzsche"')
# In[8]:
get_ipython().system(u'git clone $repo_https_url $repo_dir')
# In[9]:
get_ipython().system(u'cd $repo_dir && git pull origin master && python setup.py bdist_egg')
# In[10]:
get_ipython().system(u'mkdir $output_dir && mkdir $cache_dir')
# In[11]:
get_ipython().system(u'cd $repo_dir && pip install -r requirements.txt && python probe_scraper/runner.py --outdir $output_dir --tempdir $cache_dir')
# ## Upload the output to S3.
# In[12]:
# Get a boto3 client for the S3 API; S3Transfer wraps it with managed
# (multipart-capable) file transfers.
client = boto3.client('s3', 'us-west-2')
transfer = boto3.s3.transfer.S3Transfer(client)
# In[13]:
# Copy the files to S3.
for file_name in OUTPUT_FILES:
    source_path = os.path.join(output_dir, file_name)
    key_path = S3_DATA_PATH + file_name
    print "uploading " + file_name + " to s3: " + key_path
    transfer.upload_file(source_path, S3_PUBLIC_BUCKET, key_path)
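# Sanity check (an addition, not in the original notebook): `head_object` is a
# standard boto3 call, so we can confirm each file actually landed and report
# its size.
# In[ ]:
for file_name in OUTPUT_FILES:
    head = client.head_object(Bucket=S3_PUBLIC_BUCKET, Key=S3_DATA_PATH + file_name)
    print file_name + ": " + str(head['ContentLength']) + " bytes on s3"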
# ## GZIP?
#
# OK, let's try out the gzip content-encoding approach from [this gist](https://gist.github.com/veselosky/9427faa38cee75cd8e27#file-s3gzip-py-L32).
#
# Or should we let CloudFront do this?
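#
# (A hedged aside, not tested here: CloudFront can gzip responses on the fly
# when "Compress Objects Automatically" is enabled on the distribution, which
# would avoid storing pre-compressed bodies in S3 at all.)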
# In[14]:
from io import BytesIO
from gzip import GzipFile
# In[15]:
path = "/home/hadoop/analyses/probe_data/probes.json"
with open(path) as fi:
    text_body = fi.read().decode("utf-8")
# In[16]:
# Peek at the first character to sanity-check the decoded text.
text_body[0]
# In[17]:
gz_body = BytesIO()
gz = GzipFile(None, 'wb', 9, gz_body)  # write gzip output into the in-memory buffer
gz.write(text_body.encode('utf-8'))  # convert unicode strings to bytes!
gz.close()
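# Before deciding whether the extra step is worth it, compare the raw and
# compressed sizes. This check is an addition, not part of the original run.
# In[ ]:
raw_size = len(text_body.encode('utf-8'))
gz_size = len(gz_body.getvalue())
print "gzip: %d -> %d bytes (%.1f%% of original)" % (raw_size, gz_size, 100.0 * gz_size / raw_size)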
# In[20]:
client.put_object(
    ACL='public-read',
    Bucket=S3_PUBLIC_BUCKET,
    Key=S3_DATA_PATH + 'probes2.json',  # Note: NO .gz extension!
    ContentType='text/plain',  # the original type
    ContentEncoding='gzip',  # MUST have or browsers will error
    Body=gz_body.getvalue()
)
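# A round-trip check (an addition, not in the original notebook): get_object
# returns the stored bytes as-is, so decompress manually and compare against
# the source text to confirm the upload is intact.
# In[ ]:
resp = client.get_object(Bucket=S3_PUBLIC_BUCKET, Key=S3_DATA_PATH + 'probes2.json')
assert resp['ContentEncoding'] == 'gzip'
round_trip = GzipFile(None, 'rb', fileobj=BytesIO(resp['Body'].read())).read()
assert round_trip.decode('utf-8') == text_body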
# In[ ]: