Skip to content

Instantly share code, notes, and snippets.

@aloncarmel
Created September 1, 2013 14:02
Show Gist options
  • Save aloncarmel/6404647 to your computer and use it in GitHub Desktop.
Save aloncarmel/6404647 to your computer and use it in GitHub Desktop.
Small experiment to start writing my own related-content engine on App Engine using the Search API and some basic Levenshtein distance. * Grabs keywords per URL from textwise.com. * Writes them to App Engine full-text search and creates a hash. * Grabs a URL and its keywords, then compares hashes after a keyword search. It's a start. Never been tested.
#!/usr/bin/env python
#
# Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import webapp2
from datetime import datetime
from google.appengine.api import search
from google.appengine.api import urlfetch
import simplejson as json
import urllib
from urlparse import urlparse
def levenshtein(a, b):
    """Return the Levenshtein (edit) distance between a and b.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.

    Args:
        a: First sequence (anything supporting len() and indexing, e.g. str).
        b: Second sequence of the same kind.

    Returns:
        The edit distance as a non-negative int; 0 when the inputs are equal.
    """
    n, m = len(a), len(b)
    if n > m:
        # Keep `a` as the shorter sequence so each row has min(n, m) + 1 cells.
        a, b = b, a
        n, m = m, n

    # Row 0: turning an empty prefix of b into a[:j] costs j insertions.
    # list() keeps this a real list on interpreters where range() is lazy.
    current = list(range(n + 1))
    for i in range(1, m + 1):
        # Only the previous row is needed to compute the current one.
        previous, current = current, [i] + [0] * n
        for j in range(1, n + 1):
            add = previous[j] + 1
            delete = current[j - 1] + 1
            change = previous[j - 1]
            if a[j - 1] != b[i - 1]:
                change += 1
            current[j] = min(add, delete, change)
    return current[n]
# Grab keywords from textwise and return them
def getKeywords(url):
    """Fetch the concept labels the semantichacker API extracts for a page.

    Args:
        url: Address of the page to analyse. It is percent-encoded before
            being appended to the API request.

    Returns:
        A list holding the 'label' value of every concept in the JSON reply.

    Raises:
        KeyError: If the reply does not contain the expected
            conceptExtractor / conceptExtractorResponse / concepts nesting.
    """
    # urllib.quote needs a byte string for non-ASCII input on Python 2.
    if not isinstance(url, str):
        url = url.encode('utf-8')
    # Percent-encode the target so characters such as '&', '?' or '#' in it
    # are not read as part of the API request's own query string.
    apiurl = ('http://api.semantichacker.com/KEY/concept?format=json&uri='
              + urllib.quote(url, safe=''))
    result = urlfetch.fetch(apiurl)
    decoded_json = json.loads(result.content)
    concepts = (
        decoded_json['conceptExtractor']['conceptExtractorResponse']['concepts'])
    return [concept['label'] for concept in concepts]
# Grab url and save keywords and hash for future search
class WriteDocHandler(webapp2.RequestHandler):
    """Stores a page's keywords and keyword hash in the Search API."""

    def get(self):
        """Index the page named by the "url" query parameter.

        Fetches the page's keywords with getKeywords(), stores them with the
        URL and a hash of the keyword set in the search index named after the
        URL's scheme and host, and writes the new document id to the response.
        Responds with HTTP 400 when "url" is missing or empty.
        """
        url = self.request.get("url")
        if not url:
            # Without a URL there is nothing to fetch keywords for, and the
            # index name below would degenerate to ':///'.
            self.abort(400, detail='Missing required "url" query parameter.')

        parsed_uri = urlparse(url)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        keywords = getKeywords(url)
        # Format each label as text instead of calling str(), which raises
        # UnicodeEncodeError on non-ASCII labels under Python 2.
        keyword_text = u','.join(u'%s' % keyword for keyword in keywords)

        # Field order matters: SearchHandler reads these fields by position.
        document = search.Document(fields=[
            search.TextField(name='keywords', value=keyword_text),
            search.TextField(name='url', value=url),
            search.TextField(name='hash', value=str(hash(frozenset(keywords)))),
        ])
        index = search.Index(name=domain)
        results = index.put(document)
        self.response.write(results[0].id)
# Search the database for url and compare results scoring with hash using levenshtein basic method and return scores.
class SearchHandler(webapp2.RequestHandler):
    """Finds indexed pages that share keywords with a requested page."""

    @staticmethod
    def _build_query_string(keywords):
        """Return a query matching any keyword as a phrase in 'keywords'.

        Every term carries its own field restriction and is quoted, so
        multi-word labels are searched as phrases and no term falls back to a
        global search. Returns an empty string when there is nothing to
        search for.
        """
        terms = []
        for keyword in keywords:
            # Drop double quotes so a label cannot end the quoted phrase early.
            phrase = (u'%s' % keyword).replace(u'"', u' ').strip()
            if phrase:
                terms.append(u'keywords:"%s"' % phrase)
        return u' OR '.join(terms)

    def get(self):
        """Write a JSON list of pages related to the "url" query parameter.

        The first list entry describes the requested URL and its keyword
        hash. Each following entry is an indexed document other than the
        requested URL itself, with its url, keywords, hash and a score equal
        to the Levenshtein distance between the two hash strings divided by
        100. Responds with HTTP 400 when "url" is missing or empty.
        """
        url = self.request.get("url")
        if not url:
            self.abort(400, detail='Missing required "url" query parameter.')

        parsed_uri = urlparse(url)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        keywords = getKeywords(url)
        current_hash = str(hash(frozenset(keywords)))
        payload = [{'requested_url': url, 'hash': current_hash}]

        query_string = self._build_query_string(keywords)
        # An empty query would be malformed, so skip the search entirely.
        if query_string:
            index = search.Index(name=domain)
            for doc in index.search(search.Query(query_string)):
                # Look fields up by name rather than by position so the
                # result does not depend on the order they were stored in.
                values = dict((field.name, field.value) for field in doc.fields)
                if values['url'] == url:
                    continue
                doc_hash = str(values['hash'])
                payload.append({
                    'url': values['url'],
                    # Left as text: str() would fail on non-ASCII keywords.
                    'keywords': values['keywords'],
                    'hash': doc_hash,
                    'score': float(levenshtein(current_hash, doc_hash)) / 100,
                })

        self.response.headers['Content-Type'] = 'application/json'
        self.response.write(json.dumps(payload))
# WSGI entry point: maps each request path to the handler that serves it.
# NOTE(review): debug=True is kept as in the original; confirm it is wanted
# outside local development.
app = webapp2.WSGIApplication(
    routes=[
        ('/writedoc', WriteDocHandler),
        ('/search', SearchHandler),
    ],
    debug=True,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment