# load_and_run: notebook export from gist georgf/0e6043c998c3bca754570a86d875d397 (created July 14, 2017)
# coding: utf-8
# In[1]:
# This comes from https://github.com/harterrt/cookiecutter-python-etl/
# Thanks Harter!
# In[2]:
import boto3
import boto3.s3.transfer  # imported explicitly so boto3.s3.transfer.S3Transfer resolves below
import os
# In[3]:
repo_dir = "probe-scraper"
output_dir = "/home/hadoop/analyses/probe_data"
cache_dir = "/home/hadoop/analyses/probe_cache"
repo_https_url = "https://github.com/georgf/probe-scraper"
S3_PUBLIC_BUCKET = "telemetry-public-analysis-2"
S3_DATA_PATH = "probe-scraper/data/"
OUTPUT_FILES = ["general.json", "probes.json", "revisions.json"]
# In[4]:
get_ipython().system(u'rm -rf $repo_dir')
# In[5]:
get_ipython().system(u'rm -rf $output_dir')
# In[6]:
get_ipython().system(u'rm -rf $cache_dir')
# In[7]:
get_ipython().system(u'git config --global user.email "gfritzsche@mozilla.com" && git config --global user.name "Georg Fritzsche"')
# In[8]:
get_ipython().system(u'git clone $repo_https_url $repo_dir')
# In[9]:
get_ipython().system(u'cd $repo_dir && git pull origin master && python setup.py bdist_egg')
# In[10]:
get_ipython().system(u'mkdir $output_dir && mkdir $cache_dir')
# In[11]:
get_ipython().system(u'cd $repo_dir && pip install -r requirements.txt && python probe_scraper/runner.py --outdir $output_dir --tempdir $cache_dir')
# ## Upload the output to S3.
# In[12]:
# Get a boto3 client for the S3 API; S3Transfer wraps it with managed
# (multipart-capable) file transfers.
client = boto3.client('s3', 'us-west-2')
transfer = boto3.s3.transfer.S3Transfer(client)
# In[13]:
# Copy the files to S3.
for file_name in OUTPUT_FILES:
    source_path = os.path.join(output_dir, file_name)
    key_path = S3_DATA_PATH + file_name
    print "uploading " + file_name + " to s3: " + key_path
    transfer.upload_file(source_path, S3_PUBLIC_BUCKET, key_path)
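# Sanity check (an addition, not in the original notebook): `head_object` is a
# standard boto3 call, so we can confirm each file actually landed and report
# its size.
# In[ ]:
for file_name in OUTPUT_FILES:
    head = client.head_object(Bucket=S3_PUBLIC_BUCKET, Key=S3_DATA_PATH + file_name)
    print file_name + ": " + str(head['ContentLength']) + " bytes on s3"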
# ## GZIP?
#
# OK, let's try out the gzip content-encoding approach from [this gist](https://gist.github.com/veselosky/9427faa38cee75cd8e27#file-s3gzip-py-L32).
#
# Or should we let CloudFront do this?
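#
# (A hedged aside, not tested here: CloudFront can gzip responses on the fly
# when "Compress Objects Automatically" is enabled on the distribution, which
# would avoid storing pre-compressed bodies in S3 at all.)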
# In[14]:
from io import BytesIO
from gzip import GzipFile
# In[15]:
path = "/home/hadoop/analyses/probe_data/probes.json"
with open(path) as fi:
    text_body = fi.read().decode("utf-8")
# In[16]:
# Peek at the first character to sanity-check the decoded text.
text_body[0]
# In[17]:
gz_body = BytesIO()
gz = GzipFile(None, 'wb', 9, gz_body)  # write gzip output into the in-memory buffer
gz.write(text_body.encode('utf-8'))  # convert unicode strings to bytes!
gz.close()
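# Before deciding whether the extra step is worth it, compare the raw and
# compressed sizes. This check is an addition, not part of the original run.
# In[ ]:
raw_size = len(text_body.encode('utf-8'))
gz_size = len(gz_body.getvalue())
print "gzip: %d -> %d bytes (%.1f%% of original)" % (raw_size, gz_size, 100.0 * gz_size / raw_size)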
# In[20]:
client.put_object(
    ACL='public-read',
    Bucket=S3_PUBLIC_BUCKET,
    Key=S3_DATA_PATH + 'probes2.json',  # Note: NO .gz extension!
    ContentType='text/plain',  # the original type
    ContentEncoding='gzip',  # MUST have or browsers will error
    Body=gz_body.getvalue()
)
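# A round-trip check (an addition, not in the original notebook): get_object
# returns the stored bytes as-is, so decompress manually and compare against
# the source text to confirm the upload is intact.
# In[ ]:
resp = client.get_object(Bucket=S3_PUBLIC_BUCKET, Key=S3_DATA_PATH + 'probes2.json')
assert resp['ContentEncoding'] == 'gzip'
round_trip = GzipFile(None, 'rb', fileobj=BytesIO(resp['Body'].read())).read()
assert round_trip.decode('utf-8') == text_body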
# In[ ]: