aloncarmel/gist:6404647

## gistfile1.py
#!/usr/bin/env python
#
# Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import webapp2
from datetime import datetime
from google.appengine.api import search
from google.appengine.api import urlfetch
import simplejson as json
import urllib
from urlparse import urlparse


def levenshtein(a, b):
    """Return the Levenshtein (edit) distance between sequences a and b.

    Only two rows of the dynamic-programming table are kept at a time, and
    the shorter of the two inputs determines the row width.
    """
    # Use the shorter sequence for the row so a row never holds more than
    # min(len(a), len(b)) + 1 entries.
    if len(a) <= len(b):
        shorter, longer = a, b
    else:
        shorter, longer = b, a
    width = len(shorter)

    row = list(range(width + 1))
    for row_no, long_item in enumerate(longer, 1):
        above = row
        row = [row_no]
        for col_no, short_item in enumerate(shorter, 1):
            insertion = above[col_no] + 1
            deletion = row[col_no - 1] + 1
            substitution = above[col_no - 1] + (1 if short_item != long_item else 0)
            row.append(min(insertion, deletion, substitution))

    return row[width]


# Fetch the keywords for a URL from the TextWise (semantichacker) API and return them.

def getKeywords(url):
    """Fetch the concept labels extracted for the page at ``url``.

    Calls the semantichacker concept API over HTTP and collects the
    ``label`` of every concept listed in the JSON response.

    Args:
        url: Address of the page to analyse. It is percent-encoded before
            being placed in the ``uri`` query parameter.

    Returns:
        A list with one label per concept, in response order.

    Raises:
        KeyError: If the decoded response lacks the
            ``conceptExtractor``/``conceptExtractorResponse``/``concepts``
            structure or a concept has no ``label``.
    """
    # Percent-encode the target so its own '?', '&' and '#' characters are
    # not parsed as part of the API request. Text that is not a byte string
    # is UTF-8 encoded first because Python 2's quote() rejects non-ASCII
    # unicode input.
    if not isinstance(url, str):
        url = url.encode('utf-8')
    apiurl = ('http://api.semantichacker.com/KEY/concept?format=json&uri='
              + urllib.quote(url, safe=''))

    result = urlfetch.fetch(apiurl)
    decoded_json = json.loads(result.content)

    concepts = decoded_json['conceptExtractor']['conceptExtractorResponse']['concepts']
    return [concept['label'] for concept in concepts]

# Fetch the keywords for a URL and store them, together with their hash, for future searches.

class WriteDocHandler(webapp2.RequestHandler):
    """Store a page's keywords in the search index of the page's site."""

    def get(self):
        """Index the keywords of the page named by the ``url`` parameter.

        The document is written to the index named after the URL's scheme
        and host, with three text fields: ``keywords`` (comma-joined labels),
        ``url`` and ``hash`` (the hash of the keyword set, as a string).
        The id of the stored document is written to the response. A missing
        ``url`` parameter yields HTTP 400.
        """
        url = self.request.get("url")
        if not url:
            # Nothing to analyse and no site to derive the index name from.
            self.response.set_status(400)
            self.response.write('Missing "url" parameter')
            return

        parsed_uri = urlparse(url)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

        keywords = getKeywords(url)

        # Format through a unicode template instead of str(): str() raises
        # UnicodeEncodeError on non-ASCII labels under Python 2.
        keywordss = u','.join(u'{0}'.format(keyword) for keyword in keywords)

        # Field order (keywords, url, hash) is kept stable for readers that
        # access the fields by position.
        document = search.Document(fields=[
            search.TextField(name='keywords', value=keywordss),
            search.TextField(name='url', value=url),
            search.TextField(name='hash', value=str(hash(frozenset(keywords)))),
        ])

        index = search.Index(name=domain)
        results = index.put(document)
        doc_id = results[0].id

        self.response.write(doc_id)

# Search the index for documents matching the URL's keywords, score each result by the Levenshtein distance between the keyword hashes, and return the scores.

class SearchHandler(webapp2.RequestHandler):
    """Find indexed pages on the same site that share keywords with a URL."""

    def get(self):
        """Write a JSON list describing the requested URL and its matches.

        The first element holds ``requested_url`` and the ``hash`` of its
        keyword set. Each following element describes one indexed document
        other than the requested URL: ``url``, ``keywords``, ``hash`` and
        ``score`` (Levenshtein distance between the two hash strings,
        divided by 100). A missing ``url`` parameter yields HTTP 400.
        """
        url = self.request.get("url")
        if not url:
            # Nothing to analyse and no site to derive the index name from.
            self.response.set_status(400)
            self.response.write('Missing "url" parameter')
            return

        parsed_uri = urlparse(url)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

        keywords = getKeywords(url)
        currenthashurl = str(hash(frozenset(keywords)))

        jsonobj = [{'requested_url': url, 'hash': currenthashurl}]

        # An empty keyword list would produce the invalid query "keywords = ",
        # so the index is only queried when there is something to look for.
        if keywords:
            # Quote every keyword as a phrase and group the alternatives so
            # the field restriction applies to all of them, not just to the
            # first token. Embedded double quotes would end the phrase early
            # and are replaced by spaces.
            phrases = u' OR '.join(
                u'"{0}"'.format(u'{0}'.format(keyword).replace(u'"', u' '))
                for keyword in keywords)
            query = search.Query(u'keywords = ({0})'.format(phrases))

            index = search.Index(name=domain)
            for scored_document in index.search(query):
                # Look fields up by name rather than by position.
                doc_url = scored_document.field('url').value
                if doc_url == url:
                    continue
                doc_hash = str(scored_document.field('hash').value)
                # NOTE(review): the score compares the decimal hash strings,
                # not the keyword sets themselves - confirm this is intended.
                jsonobj.append({
                    'url': doc_url,
                    'keywords': scored_document.field('keywords').value,
                    'hash': doc_hash,
                    'score': float(levenshtein(currenthashurl, doc_hash)) / 100,
                })

        self.response.headers['Content-Type'] = 'application/json'
        self.response.write(json.dumps(jsonobj))


# WSGI entry point: maps each request path to the handler that serves it.
app = webapp2.WSGIApplication(
    routes=[
        ('/writedoc', WriteDocHandler),
        ('/search', SearchHandler),
    ],
    debug=True,
)
	#!/usr/bin/env python
	#
	# Copyright 2007 Google Inc.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#
	import webapp2
	from datetime import datetime
	from google.appengine.api import search
	from google.appengine.api import urlfetch
	import simplejson as json
	import urllib
	from urlparse import urlparse


	def levenshtein(a, b):
	    """Calculate the Levenshtein distance between a and b.

	    Args:
	        a: First sequence (anything supporting len() and indexing).
	        b: Second sequence.

	    Returns:
	        The minimum number of insertions, deletions and substitutions
	        that turn one sequence into the other.
	    """
	    n, m = len(a), len(b)
	    if n > m:
	        # Make sure n <= m, to use O(min(n,m)) space
	        a, b = b, a
	        n, m = m, n

	    current = list(range(n + 1))
	    for i in range(1, m + 1):
	        previous, current = current, [i] + [0] * n
	        for j in range(1, n + 1):
	            add, delete = previous[j] + 1, current[j - 1] + 1
	            change = previous[j - 1]
	            if a[j - 1] != b[i - 1]:
	                change = change + 1
	            current[j] = min(add, delete, change)

	    return current[n]


	# Fetch the keywords for a URL from the TextWise (semantichacker) API and return them.

	def getKeywords(url):
	    """Fetch the concept labels extracted for the page at ``url``.

	    Calls the semantichacker concept API over HTTP and collects the
	    ``label`` of every concept listed in the JSON response.

	    Args:
	        url: Address of the page to analyse. It is percent-encoded before
	            being placed in the ``uri`` query parameter.

	    Returns:
	        A list with one label per concept, in response order.

	    Raises:
	        KeyError: If the decoded response lacks the
	            ``conceptExtractor``/``conceptExtractorResponse``/``concepts``
	            structure or a concept has no ``label``.
	    """
	    # Percent-encode the target so its own '?', '&' and '#' characters
	    # are not parsed as part of the API request. Text that is not a byte
	    # string is UTF-8 encoded first because Python 2's quote() rejects
	    # non-ASCII unicode input.
	    if not isinstance(url, str):
	        url = url.encode('utf-8')
	    apiurl = ('http://api.semantichacker.com/KEY/concept?format=json&uri='
	              + urllib.quote(url, safe=''))

	    result = urlfetch.fetch(apiurl)
	    decoded_json = json.loads(result.content)

	    concepts = decoded_json['conceptExtractor']['conceptExtractorResponse']['concepts']
	    return [concept['label'] for concept in concepts]

	# Fetch the keywords for a URL and store them, together with their hash, for future searches.

	class WriteDocHandler(webapp2.RequestHandler):
	    """Store a page's keywords in the search index of the page's site."""

	    def get(self):
	        """Index the keywords of the page named by the ``url`` parameter.

	        The document is written to the index named after the URL's
	        scheme and host, with three text fields: ``keywords``
	        (comma-joined labels), ``url`` and ``hash`` (the hash of the
	        keyword set, as a string). The id of the stored document is
	        written to the response. A missing ``url`` parameter yields
	        HTTP 400.
	        """
	        url = self.request.get("url")
	        if not url:
	            # Nothing to analyse and no site to derive the index name from.
	            self.response.set_status(400)
	            self.response.write('Missing "url" parameter')
	            return

	        parsed_uri = urlparse(url)
	        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

	        keywords = getKeywords(url)

	        # Format through a unicode template instead of str(): str() raises
	        # UnicodeEncodeError on non-ASCII labels under Python 2.
	        keywordss = u','.join(u'{0}'.format(keyword) for keyword in keywords)

	        # Field order (keywords, url, hash) is kept stable for readers
	        # that access the fields by position.
	        document = search.Document(fields=[
	            search.TextField(name='keywords', value=keywordss),
	            search.TextField(name='url', value=url),
	            search.TextField(name='hash', value=str(hash(frozenset(keywords)))),
	        ])

	        index = search.Index(name=domain)
	        results = index.put(document)
	        doc_id = results[0].id

	        self.response.write(doc_id)

	# Search the index for documents matching the URL's keywords, score each result by the Levenshtein distance between the keyword hashes, and return the scores.

	class SearchHandler(webapp2.RequestHandler):
	    """Find indexed pages on the same site that share keywords with a URL."""

	    def get(self):
	        """Write a JSON list describing the requested URL and its matches.

	        The first element holds ``requested_url`` and the ``hash`` of its
	        keyword set. Each following element describes one indexed
	        document other than the requested URL: ``url``, ``keywords``,
	        ``hash`` and ``score`` (Levenshtein distance between the two hash
	        strings, divided by 100). A missing ``url`` parameter yields
	        HTTP 400.
	        """
	        url = self.request.get("url")
	        if not url:
	            # Nothing to analyse and no site to derive the index name from.
	            self.response.set_status(400)
	            self.response.write('Missing "url" parameter')
	            return

	        parsed_uri = urlparse(url)
	        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

	        keywords = getKeywords(url)
	        currenthashurl = str(hash(frozenset(keywords)))

	        jsonobj = [{'requested_url': url, 'hash': currenthashurl}]

	        # An empty keyword list would produce the invalid query
	        # "keywords = ", so the index is only queried when there is
	        # something to look for.
	        if keywords:
	            # Quote every keyword as a phrase and group the alternatives
	            # so the field restriction applies to all of them, not just
	            # to the first token. Embedded double quotes would end the
	            # phrase early and are replaced by spaces.
	            phrases = u' OR '.join(
	                u'"{0}"'.format(u'{0}'.format(keyword).replace(u'"', u' '))
	                for keyword in keywords)
	            query = search.Query(u'keywords = ({0})'.format(phrases))

	            index = search.Index(name=domain)
	            for scored_document in index.search(query):
	                # Look fields up by name rather than by position.
	                doc_url = scored_document.field('url').value
	                if doc_url == url:
	                    continue
	                doc_hash = str(scored_document.field('hash').value)
	                # NOTE(review): the score compares the decimal hash
	                # strings, not the keyword sets themselves - confirm this
	                # is intended.
	                jsonobj.append({
	                    'url': doc_url,
	                    'keywords': scored_document.field('keywords').value,
	                    'hash': doc_hash,
	                    'score': float(levenshtein(currenthashurl, doc_hash)) / 100,
	                })

	        self.response.headers['Content-Type'] = 'application/json'
	        self.response.write(json.dumps(jsonobj))


	# WSGI entry point: maps each request path to the handler that serves it.
	app = webapp2.WSGIApplication(
	    routes=[
	        ('/writedoc', WriteDocHandler),
	        ('/search', SearchHandler),
	    ],
	    debug=True,
	)