# microft/gideon.py

## gideon.py
from __future__ import division
import logging
from flask import Flask, request, jsonify
from gensim import corpora, models, similarities

# Default cutoff: an indexed document counts as similar to an article only
# when its similarity score is greater than this value.
SIMILARITY_THRESHOLD = 0.5

# Industry name -> loaded data. Every entry starts out empty and is
# replaced by load_industry_data().
INDUSTRIES = {'funding': {}, 'txt50': {}}

# Tokens removed from an article by clean_article().
stoplist = set('for a of the and to in \' "'.split())

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s')


def load_industry_data(name):
    """Load the saved gensim objects for one industry into INDUSTRIES.

    Reads ``<name>.dict``, ``<name>.lsi`` and ``<name>.index`` (relative
    paths) and stores the loaded objects under ``INDUSTRIES[name]``,
    replacing whatever was stored there before.

    :param name: industry key; also the base name of the three files.
    """
    # The corpus itself is not loaded; the code for it is disabled:
    # corpus = corpora.MmCorpus('attentive.mm')
    INDUSTRIES[name] = {
        'dictionary': corpora.Dictionary.load(
            '{name}.dict'.format(name=name)),
        # 'corpus': corpus,
        'lsimodel': models.LsiModel.load(
            '{name}.lsi'.format(name=name)),
        'index': similarities.MatrixSimilarity.load(
            '{name}.index'.format(name=name)),
    }


def similars(article, industry, threshold=SIMILARITY_THRESHOLD):
    """Count indexed documents whose similarity to *article* exceeds a cutoff.

    :param article: list of tokens, as returned by clean_article().
    :param industry: key into INDUSTRIES selecting the data to query.
    :param threshold: number, or a string accepted by float(); only
        scores strictly greater than it are counted.
    :returns: the number of matching documents.
    :raises ValueError: if *threshold* cannot be converted to a float.
    """
    data = INDUSTRIES[industry]
    # Convert once up front instead of once per compared document.
    cutoff = float(threshold)

    vec_bow = data['dictionary'].doc2bow(article)
    vec_lsi = data['lsimodel'][vec_bow]
    scores = data['index'][vec_lsi]

    # Only a count is returned, so the scores are not sorted. Summing over
    # a generator also works on Python 3, where filter() returns an
    # iterator that len() does not accept.
    return sum(1 for score in scores if score > cutoff)


def clean_article(body):
    """Lower-case *body*, split it on whitespace and drop stoplist tokens.

    :param body: article text.
    :returns: list of the remaining tokens, in their original order.
    """
    kept = []
    for token in body.lower().split():
        if token in stoplist:
            continue
        kept.append(token)
    return kept


# Load the data for every key of INDUSTRIES when the module is imported.
# load_industry_data() only reassigns existing keys, so the dict does not
# change size while it is being iterated.
for name in INDUSTRIES:
    load_industry_data(name)

# Flask application object; the route below is registered on it.
application = Flask(__name__)


@application.route("/", methods=['POST'])
def classify():
    """Report, per industry, how many indexed documents resemble an article.

    Form fields:
        article: the article text (required).
        threshold: optional similarity cutoff; SIMILARITY_THRESHOLD is
            used when the field is missing or is not a valid float.

    :returns: JSON object mapping each industry name to ``similars``
        (count of matching documents), ``total`` (``num_docs`` of that
        industry's dictionary) and ``percentage``.
    """
    article = clean_article(request.form['article'])
    # type=float makes the lookup fall back to the default when the value
    # cannot be parsed, instead of float() failing later inside similars().
    threshold = request.form.get(
        'threshold', SIMILARITY_THRESHOLD, type=float)

    result = {}
    for ind in INDUSTRIES:
        nsimilars = similars(article, ind, threshold=threshold)
        ntotal = INDUSTRIES[ind]['dictionary'].num_docs
        # A dictionary with no documents would raise ZeroDivisionError.
        percentage = (nsimilars / ntotal) * 100 if ntotal else 0.0
        result[ind] = {
            'similars': nsimilars,
            'total': ntotal,
            'percentage': percentage
        }
    return jsonify(result)


# Start the server through application.run() only when this file is
# executed as a script, not when it is imported.
if __name__ == "__main__":
    application.run()
	from __future__ import division
	import logging
	from flask import Flask, request, jsonify
	from gensim import corpora, models, similarities

	# Default cutoff: an indexed document counts as similar to an article
	# only when its similarity score is greater than this value.
	SIMILARITY_THRESHOLD = 0.5

	# Industry name -> loaded data. Every entry starts out empty and is
	# replaced by load_industry_data().
	INDUSTRIES = {'funding': {}, 'txt50': {}}

	# Tokens removed from an article by clean_article().
	stoplist = set('for a of the and to in \' "'.split())

	logging.basicConfig(
	    level=logging.INFO,
	    format='%(asctime)s : %(levelname)s : %(message)s')


	def load_industry_data(name):
	    """Load the saved gensim objects for one industry into INDUSTRIES.

	    Reads ``<name>.dict``, ``<name>.lsi`` and ``<name>.index`` (relative
	    paths) and stores the loaded objects under ``INDUSTRIES[name]``,
	    replacing whatever was stored there before.

	    :param name: industry key; also the base name of the three files.
	    """
	    dict_path = '{name}.dict'.format(name=name)
	    dictionary = corpora.Dictionary.load(dict_path)

	    # The corpus itself is not loaded; the code for it is disabled:
	    # corpus_path = 'attentive.mm'
	    # corpus = corpora.MmCorpus(corpus_path)

	    lsi_path = '{name}.lsi'.format(name=name)
	    lsimodel = models.LsiModel.load(lsi_path)

	    index_path = '{name}.index'.format(name=name)
	    index = similarities.MatrixSimilarity.load(index_path)

	    INDUSTRIES[name] = {
	        'dictionary': dictionary,
	        # 'corpus': corpus,
	        'lsimodel': lsimodel,
	        'index': index
	    }


	def similars(article, industry, threshold=SIMILARITY_THRESHOLD):
	    """Count indexed documents whose similarity to *article* exceeds a cutoff.

	    :param article: list of tokens, as returned by clean_article().
	    :param industry: key into INDUSTRIES selecting the data to query.
	    :param threshold: number, or a string accepted by float(); only
	        scores strictly greater than it are counted.
	    :returns: the number of matching documents.
	    :raises ValueError: if *threshold* cannot be converted to a float.
	    """
	    data = INDUSTRIES[industry]
	    # Convert once up front instead of once per compared document.
	    cutoff = float(threshold)

	    vec_bow = data['dictionary'].doc2bow(article)
	    vec_lsi = data['lsimodel'][vec_bow]
	    scores = data['index'][vec_lsi]

	    # Only a count is returned, so the scores are not sorted. Summing
	    # over a generator also works on Python 3, where filter() returns an
	    # iterator that len() does not accept.
	    return sum(1 for score in scores if score > cutoff)


	def clean_article(body):
	    """Lower-case *body*, split it on whitespace and drop stoplist tokens.

	    :param body: article text.
	    :returns: list of the remaining tokens, in their original order.
	    """
	    words = body.lower().split()
	    return [word for word in words if word not in stoplist]


	# Load the data for every key of INDUSTRIES. load_industry_data() only
	# reassigns existing keys, so the dict does not change size while it is
	# being iterated.
	for name in INDUSTRIES:
	    load_industry_data(name)

	# Flask application object; the route below is registered on it.
	application = Flask(__name__)


	@application.route("/", methods=['POST'])
	def classify():
	    """Report, per industry, how many indexed documents resemble an article.

	    Form fields:
	        article: the article text (required).
	        threshold: optional similarity cutoff; SIMILARITY_THRESHOLD is
	            used when the field is missing or is not a valid float.

	    :returns: JSON object mapping each industry name to ``similars``
	        (count of matching documents), ``total`` (``num_docs`` of that
	        industry's dictionary) and ``percentage``.
	    """
	    article = clean_article(request.form['article'])
	    # type=float makes the lookup fall back to the default when the
	    # value cannot be parsed, instead of float() failing later inside
	    # similars().
	    threshold = request.form.get(
	        'threshold', SIMILARITY_THRESHOLD, type=float)

	    result = {}
	    for ind in INDUSTRIES:
	        nsimilars = similars(article, ind, threshold=threshold)
	        ntotal = INDUSTRIES[ind]['dictionary'].num_docs
	        # A dictionary with no documents would raise ZeroDivisionError.
	        percentage = (nsimilars / ntotal) * 100 if ntotal else 0.0
	        result[ind] = {
	            'similars': nsimilars,
	            'total': ntotal,
	            'percentage': percentage
	        }
	    return jsonify(result)


	# Start the server through application.run() only when this file is
	# executed as a script, not when it is imported.
	if __name__ == "__main__":
	    application.run()