Skip to content

Instantly share code, notes, and snippets.

@yuriybash
Last active January 24, 2019 22:53
Show Gist options
  • Save yuriybash/4fec8305135f361fd48043256daf50f2 to your computer and use it in GitHub Desktop.
Save yuriybash/4fec8305135f361fd48043256daf50f2 to your computer and use it in GitHub Desktop.
lambda code
# -*- coding: utf-8 -*-
import numpy as np
import boto3
import pickle
from flask import Flask, request, json
# Name of the S3 bucket that holds the pickled model and vectorizers.
BUCKET_NAME = 'non-hacker-news-models'
# Object keys inside BUCKET_NAME.
MODEL_FILE_NAME = 'model.pkl'
T_VECTORIZER_FILE_NAME = 't_vectorizer.pkl'  # vectorizer applied to titles
U_VECTORIZER_FILE_NAME = 'u_vectorizer.pkl'  # vectorizer applied to URLs
# Flask application object; the route handler below is registered on it.
app = Flask(__name__)
# Shared S3 client (us-east-1) used by the loader functions.
S3 = boto3.client('s3', region_name='us-east-1')
@app.route('/', methods=['POST'])
def index():
    """Predict a label for each (title, url) pair in the POSTed JSON body.

    Expects a JSON object shaped like ``{"data": [[title, url], ...]}`` and
    responds with ``{"prediction": [...]}``, one entry per input pair, in
    the same order.

    Returns:
        A JSON string on success, or a ``(json_string, 400)`` tuple when the
        request body is missing, is not valid JSON, or is not in the
        expected shape.
    """
    # silent=True makes get_json() return None instead of raising when the
    # body is absent or not valid JSON, so that case must be checked here.
    body_dict = request.get_json(silent=True)
    if not isinstance(body_dict, dict) or 'data' not in body_dict:
        return json.dumps(
            {'error': "request body must be a JSON object with a 'data' key"}
        ), 400

    data = body_dict['data']
    if not isinstance(data, list) or any(
        not isinstance(pair, (list, tuple)) or len(pair) < 2 for pair in data
    ):
        return json.dumps(
            {'error': "'data' must be a list of [title, url] pairs"}
        ), 400

    if not data:
        # Nothing to classify; skip the S3 downloads entirely.
        return json.dumps({'prediction': []})

    # Load model and vectorizers.
    # NOTE(review): these are re-downloaded from S3 and unpickled on every
    # request; consider caching them if latency matters.
    model = load_model(MODEL_FILE_NAME)
    t_vectorizer = load_vectorizer(T_VECTORIZER_FILE_NAME)
    u_vectorizer = load_vectorizer(U_VECTORIZER_FILE_NAME)

    titles = [pair[0] for pair in data]
    urls = [pair[1] for pair in data]

    # Vectorize each column in one batched call. Titles go through the title
    # vectorizer and URLs through the URL vectorizer: the earlier batched
    # attempt used the title vectorizer for both, which is why it predicted
    # incorrectly.
    # Assumes transform() treats each document independently (as
    # scikit-learn's text vectorizers do), so batching matches the old
    # per-row loop -- confirm for the pickled vectorizer types.
    title_matrix = t_vectorizer.transform(titles).toarray()
    url_matrix = u_vectorizer.transform(urls).toarray()

    # One row per pair: title features first, then URL features.
    features = np.concatenate([title_matrix, url_matrix], axis=1)

    # Make prediction
    prediction = model.predict(features).tolist()

    # Respond with prediction result
    result = {'prediction': prediction}
    return json.dumps(result)
def load_vectorizer(v_filename):
    """Download a pickled vectorizer from the S3 bucket and unpickle it.

    Args:
        v_filename: Object key of the pickled vectorizer inside BUCKET_NAME.

    Returns:
        The unpickled vectorizer object.

    Note:
        pickle.loads can execute arbitrary code embedded in the payload, so
        the bucket contents must be trusted.
    """
    s3_object = S3.get_object(Bucket=BUCKET_NAME, Key=v_filename)
    raw_bytes = s3_object['Body'].read()
    return pickle.loads(raw_bytes)
def load_model(key):
    """Download a pickled model from the S3 bucket and unpickle it.

    Args:
        key: Object key of the pickled model inside BUCKET_NAME.

    Returns:
        The unpickled model object.

    Note:
        pickle.loads can execute arbitrary code embedded in the payload, so
        the bucket contents must be trusted.
    """
    # Fetch the object and read its full body as bytes.
    payload = S3.get_object(Bucket=BUCKET_NAME, Key=key)['Body'].read()
    # Deserialize the bytes back into the model.
    return pickle.loads(payload)
if __name__ == '__main__':
    # When executed directly, start Flask's built-in server bound to every
    # network interface rather than only localhost.
    listen_address = '0.0.0.0'
    app.run(host=listen_address)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment