Last active
January 24, 2019 22:53
-
-
Save yuriybash/4fec8305135f361fd48043256daf50f2 to your computer and use it in GitHub Desktop.
lambda code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import numpy as np | |
import boto3 | |
import pickle | |
from flask import Flask, request, json | |
# S3 bucket that holds the pickled model and vectorizer artifacts.
BUCKET_NAME = 'non-hacker-news-models'
# Pickled trained classifier (presumably an sklearn-style estimator -- TODO confirm).
MODEL_FILE_NAME = 'model.pkl'
# Pickled vectorizer for story titles.
T_VECTORIZER_FILE_NAME = 't_vectorizer.pkl'
# Pickled vectorizer for story URLs.
U_VECTORIZER_FILE_NAME = 'u_vectorizer.pkl'
app = Flask(__name__)
# Module-level S3 client, created once and reused across requests.
S3 = boto3.client('s3', region_name='us-east-1')
@app.route('/', methods=['POST'])
def index():
    """Predict labels for a batch of (title, url) pairs.

    Expects a JSON body of the form {"data": [[title, url], ...]} and
    responds with a JSON string {"prediction": [0, 1, ...]}.
    """
    # Parse request body for model input
    body_dict = request.get_json(silent=True)
    data = body_dict['data']

    # Load model and vectorizers from S3 on every request.
    # NOTE(review): this is slow per-request work -- consider caching at
    # module level once the artifacts are stable.
    model = load_model(MODEL_FILE_NAME)
    t_vectorizer = load_vectorizer(T_VECTORIZER_FILE_NAME)
    u_vectorizer = load_vectorizer(U_VECTORIZER_FILE_NAME)

    titles = [pair[0] for pair in data]  # e.g. [u'python hacker c++', u'trump wall politics']
    urls = [pair[1] for pair in data]    # e.g. [u'github', u'nytimes']

    # BUG FIX: the original vectorized path transformed the URLs with
    # t_vectorizer (the *title* vectorizer) instead of u_vectorizer, which
    # is why it predicted [0, 0] while the per-row fallback loop (which
    # correctly used u_vectorizer) predicted [0, 1]. With the right
    # vectorizer the batched path matches the loop, so the slow fallback
    # and the leftover ipdb breakpoint are removed.
    title_matrix = t_vectorizer.transform(titles).toarray()  # shape (n, 500)
    url_matrix = u_vectorizer.transform(urls).toarray()      # shape (n, 500)
    X = np.concatenate([title_matrix, url_matrix], axis=1)   # shape (n, 1000)

    # Make prediction and respond with the result.
    prediction = model.predict(X).tolist()
    result = {'prediction': prediction}
    return json.dumps(result)
def load_vectorizer(v_filename):
    """Fetch a pickled vectorizer from the model bucket and deserialize it."""
    obj = S3.get_object(Bucket=BUCKET_NAME, Key=v_filename)
    # NOTE(review): pickle.loads on remote bytes is only acceptable because
    # the bucket is project-owned; never point this at untrusted data.
    return pickle.loads(obj['Body'].read())
def load_model(key):
    """Download the trained model stored under *key* in S3 and unpickle it."""
    body = S3.get_object(Bucket=BUCKET_NAME, Key=key)['Body']
    # NOTE(review): unpickling is trusted here solely because the bucket is
    # controlled by this project.
    return pickle.loads(body.read())
if __name__ == '__main__':
    # listen on all IPs so the host/container network can reach the dev server
    app.run(host='0.0.0.0')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment