Skip to content

Instantly share code, notes, and snippets.

@microft
Created November 29, 2016 09:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save microft/288c06cf0e89706b2787aefb493aa290 to your computer and use it in GitHub Desktop.
Save microft/288c06cf0e89706b2787aefb493aa290 to your computer and use it in GitHub Desktop.
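A small Flask service that reports, for each configured industry, how many indexed documents are similar to a posted article.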
from __future__ import division
import logging
from flask import Flask, request, jsonify
from gensim import corpora, models, similarities
# Default minimum similarity score used when a request does not supply one.
SIMILARITY_THRESHOLD = 0.5

# Per-industry artifacts keyed by industry name.  Each entry starts out as
# an empty dict and is replaced by load_industry_data().
INDUSTRIES = {industry: {} for industry in ('funding', 'txt50')}

# Tokens removed from an article by clean_article().
stoplist = set('for a of the and to in \' "'.split())

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
def load_industry_data(name):
    """Load the saved gensim artifacts for ``name`` into ``INDUSTRIES``.

    Three files are read, in this order: ``<name>.dict`` (dictionary),
    ``<name>.lsi`` (LSI model) and ``<name>.index`` (similarity index).
    The loaded objects replace ``INDUSTRIES[name]`` under the keys
    ``'dictionary'``, ``'lsimodel'`` and ``'index'``.  The corpus itself is
    not loaded.

    Args:
        name: Industry name; also the common prefix of the three file paths.
    """
    # The dict display evaluates its values top to bottom, so the files are
    # loaded in the order listed and INDUSTRIES is only touched once all
    # three loads have succeeded.
    INDUSTRIES[name] = {
        'dictionary': corpora.Dictionary.load(
            '{name}.dict'.format(name=name)),
        'lsimodel': models.LsiModel.load(
            '{name}.lsi'.format(name=name)),
        'index': similarities.MatrixSimilarity.load(
            '{name}.index'.format(name=name)),
    }
def similars(article, industry, threshold=SIMILARITY_THRESHOLD):
    """Count the indexed documents of ``industry`` that resemble ``article``.

    The article is turned into a bag-of-words vector with the industry's
    dictionary, projected through its LSI model and queried against its
    similarity index; every score strictly above ``threshold`` counts.

    Args:
        article: Iterable of tokens, e.g. the output of ``clean_article``.
        industry: Key of ``INDUSTRIES`` whose artifacts have been loaded.
        threshold: Exclusive lower bound on the similarity score.  Anything
            ``float()`` accepts is allowed, including numeric strings taken
            straight from form data.

    Returns:
        int: Number of index entries scoring above ``threshold``.

    Raises:
        KeyError: If ``industry`` is unknown or its artifacts are missing.
        ValueError: If ``threshold`` cannot be converted to ``float``.
    """
    data = INDUSTRIES[industry]
    # Convert once up front instead of once per compared score.
    limit = float(threshold)
    vec_bow = data['dictionary'].doc2bow(article)
    vec_lsi = data['lsimodel'][vec_bow]
    sims = data['index'][vec_lsi]
    # Only the count is needed, so the scores are not sorted.  Summing over a
    # generator also works on Python 3, where len(filter(...)) raises
    # TypeError because filter() returns an iterator.
    return sum(1 for score in sims if score > limit)
def clean_article(body):
    """Tokenise ``body`` and drop stop words.

    The text is lower-cased and split on whitespace; tokens present in the
    module-level ``stoplist`` are discarded.

    Args:
        body: Article text.

    Returns:
        list: Remaining tokens, in their original order.
    """
    lowered = body.lower()
    return [token for token in lowered.split() if token not in stoplist]
# Populate INDUSTRIES at import time.  The keys are snapshotted first because
# load_industry_data() assigns into INDUSTRIES while this loop is running.
for name in tuple(INDUSTRIES):
    load_industry_data(name)

# Flask application object; the route below is registered on it.
application = Flask(__name__)
@application.route("/", methods=['POST'])
def classify():
    """Report, per industry, how many indexed documents resemble an article.

    Form fields:
        article: Required article text.  Flask answers with a 400 response
            when the field is missing.
        threshold: Optional similarity cut-off; defaults to
            ``SIMILARITY_THRESHOLD``.

    Returns:
        A JSON object mapping each industry name to ``similars`` (matching
        documents), ``total`` (the dictionary's ``num_docs``) and
        ``percentage`` (``similars`` relative to ``total``).  If
        ``threshold`` is not numeric, a JSON error body with status 400 is
        returned instead.
    """
    article = clean_article(request.form['article'])

    # Form values arrive as strings; reject garbage here so the client gets
    # a 400 rather than an unhandled ValueError from deep inside similars().
    try:
        threshold = float(request.form.get('threshold', SIMILARITY_THRESHOLD))
    except ValueError:
        return jsonify({'error': 'threshold must be a number'}), 400

    result = {}
    for ind in INDUSTRIES:
        nsimilars = similars(article, ind, threshold=threshold)
        ntotal = INDUSTRIES[ind]['dictionary'].num_docs
        result[ind] = {
            'similars': nsimilars,
            'total': ntotal,
            # A dictionary that has seen no documents would divide by zero.
            'percentage': (nsimilars / ntotal) * 100 if ntotal else 0.0
        }
    return jsonify(result)
if __name__ == "__main__":
    # Run directly as a script: start Flask's built-in server with its
    # default settings.
    application.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment