public
Last active

Rate affinity score between 2 web sites

  • Download Gist
Affinity.groovy
Groovy
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
package com.gfrison
 
import groovy.json.*
import static org.apache.commons.lang3.StringUtils.*
 
/**
* @author Giancarlo Frison <giancarlo@gfrison.com>
*
* rate the affinity between 2 different web sites
*
* groovy Affinity.groovy http://gfrison.com http://www.codingthearchitecture.com
*
* remember to set calaisKey, which is required to query the Calais web service
*
*/
@Grab('org.apache.commons:commons-lang3:3.0')
@Grab('org.slf4j:slf4j-api:1.6.2')
@Grab('org.slf4j:slf4j-log4j12:1.6.2')
/**
 * Rates how similar two web sites are by comparing the named entities that the
 * Calais / SemanticProxy service extracts from each of them.
 *
 * For every site the service response is reduced to a map of
 * "normalized entity name" -> "relevance * 1000". The affinity score is the
 * sum, over the entity names present in both maps, of the product of the two
 * relevance values.
 */
class Affinity {
    /**
     * Stop words (prepositions, auxiliary verbs, pronouns, ...) that are dropped
     * from entity names. All entries are lower case; duplicates are harmless.
     */
    def notallow = ['aboard', 'about', 'above', 'absent', 'across', 'after', 'against', 'along', 'alongside', 'amid', 'amidst', 'among', 'amongst', 'around', 'aside', 'astride', 'athwart', 'atop', 'barring', 'before', 'behind', 'below', 'beneath', 'beside', 'besides', 'between', 'betwixt', 'beyond', 'but', 'circa', 'concerning', 'despite', 'down', 'during', 'except', 'excluding', 'failing', 'following', 'for', 'from', 'given', 'including', 'inside', 'into', 'like', 'mid', 'minus', 'near', 'next', 'off', 'onto', 'opposite', 'out', 'outside', 'over', 'pace', 'past', 'per', 'plus', 'pro', 'qua', 'regarding', 'round', 'save', 'since', 'than', 'through', 'throughout', 'till', 'times', 'toward', 'towards', 'under', 'underneath', 'unlike', 'until', 'up', 'upon', 'versus', 'via', 'vice', 'with', 'within', 'without', 'worth', 'come', 'get', 'give', 'go', 'keep', 'let', 'make', 'put', 'seem', 'take', 'be', 'do', 'have', 'say', 'see', 'send', 'may', 'will', 'about', 'across', 'after', 'against', 'among', 'at', 'before', 'between', 'by', 'down', 'from', 'in', 'off', 'on', 'over', 'through', 'to', 'under', 'up', 'with', 'as', 'for', 'of', 'till', 'than', 'a', 'the', 'all', 'any', 'every', 'no', 'other', 'some', 'such', 'that', 'this', 'i', 'he', 'you', 'who', 'and', 'because', 'but', 'or', 'if', 'though', 'while', 'how', 'when', 'where', 'why', 'again', 'ever', 'far', 'forward', 'here', 'near', 'now', 'out', 'still', 'then', 'there', 'together', 'well', 'almost', 'enough', 'even', 'little', 'much', 'not', 'only', 'quite', 'so', 'very', 'tomorrow', 'yesterday', 'north', 'south', 'east', 'west', 'please', 'yes']

    /** API key inserted into the service URL. */
    def calaisKey = '...' // create the key on semanticproxy.com

    /**
     * Upper bound used while joining words into an entity key: a word is only
     * appended while (key so far + ' ' + word) stays shorter than this.
     */
    private static final int MAX_KEY_LENGTH = 50

    /** Replacement characters; position i is the plain form of UNICODE[i]. */
    static String PLAIN_ASCII = "AaEeIiOoUu" + "AaEeIiOoUuYy" + "AaEeIiOoUuYy" + "AaOoNn" + "AaEeIiOoUuYy" + "Aa" + "Cc" + "OoUu"

    /** Accented characters, grouped by diacritic, index-aligned with PLAIN_ASCII. */
    static String UNICODE =
            '\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9' +
            '\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD' +
            '\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177' +
            '\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1' +
            '\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF' +
            '\u00C5\u00E5' +
            '\u00C7\u00E7' +
            '\u0150\u0151\u0170\u0171'

    /**
     * Replaces every character listed in UNICODE by its counterpart in
     * PLAIN_ASCII; all other characters are copied unchanged.
     *
     * @param s text to convert, may be null
     * @return the converted text, or null when s is null
     */
    public static String convertNonAscii(String s) {
        if (s == null) {
            return null
        }
        int length = s.length()
        StringBuilder plain = new StringBuilder(length)
        for (int i = 0; i < length; i++) {
            char current = s.charAt(i)
            int pos = UNICODE.indexOf((int) current)
            if (pos > -1) {
                plain.append(PLAIN_ASCII.charAt(pos))
            } else {
                plain.append(current)
            }
        }
        return plain.toString()
    }

    /**
     * Turns a raw entity name into the key used for comparing two sites.
     *
     * Each space-separated word is converted with convertNonAscii, lower-cased
     * and stripped of everything except letters and digits. Only after that
     * normalization are short words (fewer than 3 characters) and stop words
     * removed, so "The" and "and," are filtered just like "the" and "and".
     * The surviving words are joined with single spaces; a word that would make
     * the key too long is skipped.
     *
     * @param rawName entity name as delivered by the service, may be null
     * @return the normalized key, or an empty string when nothing is left
     */
    String entityKey(String rawName) {
        if (rawName == null) {
            return ''
        }
        def words = convertNonAscii(rawName).split(' ')
                .collect { word -> word.toLowerCase(Locale.ROOT).replaceAll('[^a-z0-9]', '') }
                .findAll { word -> word.size() > 2 && !notallow.contains(word) }

        String joined = ''
        words.each { word ->
            String candidate = joined + ' ' + word
            if (candidate.length() < MAX_KEY_LENGTH) {
                joined = candidate
            }
        }
        return joined.trim()
    }

    /**
     * Asks the service to process the given url and collects the entities of
     * the response.
     *
     * Takes the url of the site to analyze and returns a map of entity key to
     * relevance * 1000. The map is empty when the response cannot be fetched
     * or parsed; in that case the error message is printed.
     */
    def analyze = { url ->
        def entities = [:]
        def doc
        try {
            String json = ('http://service.semanticproxy.com/processurl/' + calaisKey + '/json/' + url).toURL().text
            // Parsing is inside the try so that a non-JSON answer is handled
            // like a failed download instead of aborting the script.
            doc = new JsonSlurper().parseText(json)
        } catch (e) {
            println e.message + ', url:' + url
            return entities
        }
        if (!(doc instanceof Map)) {
            return entities
        }
        doc.each { entry ->
            def node = entry.value
            // Only map-valued entries that carry a name can describe an entity.
            if (!(node instanceof Map) || !node.name) {
                return
            }
            String key = entityKey(node.name.toString())
            if (key && node.relevance) {
                entities.put(key, node.relevance.toDouble() * 1000)
            }
        }
        return entities
    }

    /**
     * Command line entry point: expects exactly two site urls and prints the
     * affinity score of the two sites as "score:<n>".
     */
    public static void main(String[] args) {
        if (args.length != 2) {
            println 'usage: groovy Affinity.groovy <site1> <site2>'
            return
        }
        Affinity aff = new Affinity()
        def tags1 = aff.analyze(args[0])
        def tags2 = aff.analyze(args[1])
        // Accumulate as double and round once at the end; adding each product
        // to an integer would truncate on every step.
        double score = 0
        tags1.each { k, v ->
            if (tags2.containsKey(k)) {
                score += tags2[k] * v
            }
        }
        println "score:${Math.round(score)}"
    }
}

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.