Created
February 20, 2012 16:34
-
-
Save gfrison/1870022 to your computer and use it in GitHub Desktop.
Rate affinity score between 2 web sites
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.gfrison | |
import groovy.json.* | |
import static org.apache.commons.lang3.StringUtils.* | |
/** | |
* @author Giancarlo Frison <giancarlo@gfrison.com> | |
* | |
* rate the affinity between 2 different web sites | |
* | |
* groovy Affinity.groovy http://gfrison.com http://www.codingthearchitecture.com | |
* | |
* remember to set calaisKey, which is needed to query the Calais web service | |
* | |
*/ | |
@Grab('org.apache.commons:commons-lang3:3.0') | |
@Grab('org.slf4j:slf4j-api:1.6.2') | |
@Grab('org.slf4j:slf4j-log4j12:1.6.2') | |
class Affinity {

    /**
     * Lower-case stop words (prepositions, auxiliaries, pronouns, ...) that are
     * removed from entity names before two sites are compared.
     */
    def notallow = ['aboard', 'about', 'above', 'absent', 'across', 'after', 'against', 'along', 'alongside', 'amid', 'amidst', 'among', 'amongst', 'around', 'aside', 'astride', 'athwart', 'atop', 'barring', 'before', 'behind', 'below', 'beneath', 'beside', 'besides', 'between', 'betwixt', 'beyond', 'but', 'circa', 'concerning', 'despite', 'down', 'during', 'except', 'excluding', 'failing', 'following', 'for', 'from', 'given', 'including', 'inside', 'into', 'like', 'mid', 'minus', 'near', 'next', 'off', 'onto', 'opposite', 'out', 'outside', 'over', 'pace', 'past', 'per', 'plus', 'pro', 'qua', 'regarding', 'round', 'save', 'since', 'than', 'through', 'throughout', 'till', 'times', 'toward', 'towards', 'under', 'underneath', 'unlike', 'until', 'up', 'upon', 'versus', 'via', 'vice', 'with', 'within', 'without', 'worth', 'come', 'get', 'give', 'go', 'keep', 'let', 'make', 'put', 'seem', 'take', 'be', 'do', 'have', 'say', 'see', 'send', 'may', 'will', 'about', 'across', 'after', 'against', 'among', 'at', 'before', 'between', 'by', 'down', 'from', 'in', 'off', 'on', 'over', 'through', 'to', 'under', 'up', 'with', 'as', 'for', 'of', 'till', 'than', 'a', 'the', 'all', 'any', 'every', 'no', 'other', 'some', 'such', 'that', 'this', 'i', 'he', 'you', 'who', 'and', 'because', 'but', 'or', 'if', 'though', 'while', 'how', 'when', 'where', 'why', 'again', 'ever', 'far', 'forward', 'here', 'near', 'now', 'out', 'still', 'then', 'there', 'together', 'well', 'almost', 'enough', 'even', 'little', 'much', 'not', 'only', 'quite', 'so', 'very', 'tomorrow', 'yesterday', 'north', 'south', 'east', 'west', 'please', 'yes']

    /** Key inserted into the web-service request URL; '...' is a placeholder. */
    def calaisKey = '...' // create the key on semanticproxy.com

    /**
     * Unaccented replacement letters. The character at index i is the replacement
     * for the character at index i of UNICODE, so both strings must stay aligned.
     */
    static String PLAIN_ASCII = "AaEeIiOoUu" + "AaEeIiOoUuYy" + "AaEeIiOoUuYy" + "AaOoNn" + "AaEeIiOoUuYy" + "Aa" + "Cc" + "OoUu"

    /** Accented letters recognised by convertNonAscii, grouped by diacritic. */
    static String UNICODE =
            '\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9' +                     // grave
            '\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD' +         // acute
            '\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177' +         // circumflex
            '\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1' +                                             // tilde
            '\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF' +         // diaeresis
            '\u00C5\u00E5' +                                                                     // ring
            '\u00C7\u00E7' +                                                                     // cedilla
            '\u0150\u0151\u0170\u0171'                                                           // double acute

    /**
     * Replaces every accented letter listed in UNICODE with its unaccented
     * counterpart from PLAIN_ASCII; all other characters are copied unchanged.
     *
     * @param s text to convert, may be null
     * @return the converted text, or null when s is null
     */
    public static String convertNonAscii(String s) {
        if (s == null) return null
        StringBuilder sb = new StringBuilder()
        int n = s.length()
        for (int i = 0; i < n; i++) {
            def c = s.substring(i, i + 1)
            int pos = UNICODE.indexOf(c)
            if (pos > -1) {
                sb.append(PLAIN_ASCII.charAt(pos))
            } else {
                sb.append(c)
            }
        }
        return sb.toString()
    }

    /**
     * Turns a raw entity name into the key used to match entities across sites.
     *
     * Words are normalised (accents removed, lower-cased, non-alphanumerics
     * stripped) BEFORE the length and stop-word filter runs, so that "The" or
     * "the," are recognised as the stop word "the" and tokens that end up empty
     * are not concatenated. Lower-casing uses Locale.ROOT so the result does not
     * depend on the JVM default locale.
     *
     * @param rawName entity name as reported by the web service, may be null
     * @return space-separated key shorter than 50 characters, or '' when nothing remains
     */
    String entityKey(String rawName) {
        if (rawName == null) return ''
        def normalized = convertNonAscii(rawName).split(' ').collect { token ->
            token.toLowerCase(Locale.ROOT).replaceAll("[^a-zA-Z0-9]", "")
        }
        def words = normalized.findAll { it.size() > 2 && !notallow.contains(it) }
        def key = ''
        words.each { word ->
            // A word that would push the key to 50+ characters is skipped; later, shorter words may still fit.
            if ((key + ' ' + word).length() < 50) {
                key += ' ' + word
            }
        }
        return key.trim()
    }

    /**
     * Sums, over the keys present in both maps, the product of the two values.
     * The running total is a long, so each addition is truncated to a whole number.
     *
     * @param first  entity key -> weight for the first site
     * @param second entity key -> weight for the second site
     * @return the accumulated score; 0 when the maps share no key
     */
    static long affinityScore(Map first, Map second) {
        long total = 0
        first.each { name, weight ->
            if (second.containsKey(name)) {
                total += second[name] * weight
            }
        }
        return total
    }

    /**
     * Fetches the semantic analysis of a URL and returns a map of
     * entity key -> relevance * 1000. Only entries that carry both a name and a
     * relevance, and whose name is not reduced to nothing by entityKey, are kept.
     * If the request fails, the error is printed and an empty map is returned.
     */
    def analyze = { url ->
        def entities = [:]
        String json
        try {
            json = ('http://service.semanticproxy.com/processurl/' + calaisKey + '/json/' + url).toURL().text
        } catch (e) {
            // Best effort: report the failure and let the caller continue with no entities.
            println e.message + ', url:' + url
            return entities
        }
        def doc = new JsonSlurper().parseText(json)
        doc.each { entry ->
            def details = entry.value
            if (details.name && details.relevance) {
                def key = entityKey(details.name)
                if (key) {
                    entities.put(key, details.relevance.toDouble() * 1000)
                }
            }
        }
        return entities
    }

    /**
     * Command-line entry point: analyzes two sites and prints their affinity score.
     *
     * @param args exactly two site URLs
     */
    public static void main(String[] args) {
        if (args.length != 2) {
            println 'usage: groovy Affinity.groovy <site1> <site2>'
            return
        }
        Affinity aff = new Affinity()
        def tags1 = aff.analyze(args[0])
        def tags2 = aff.analyze(args[1])
        long score = affinityScore(tags1, tags2)
        println "score:$score"
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment