Created
June 14, 2012 15:40
-
-
Save bdelacretaz/2931050 to your computer and use it in GitHub Desktop.
Stanbol/Groovy/Camel experiment: dynamic enhancement chains
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// WHAT'S THIS? | |
// | |
// This is a Stanbol/Groovy/Camel proof of concept: the goal is | |
// to define Stanbol content enhancement chains as flowchart-like | |
// structures from a script language. Intermediate results (such as
// the detected language in this example) can be used to route
// the content being analyzed to different paths in the flowchart. | |
// | |
// Apache Camel is used as the flowchart execution engine. | |
// | |
// This script sets up and runs an Apache Camel context | |
// that includes custom components written in | |
// Groovy, directly in this script. | |
// | |
// The final goal is to allow such scripts to define | |
// dynamic Stanbol enhancer chains, where intermediate | |
// results can influence the sequence of operations | |
// applied to the content that's being enhanced. | |
// | |
// Example use cases (once this is integrated in Stanbol): | |
// -Run a different part of speech tagger for German | |
// | |
// -Run an additional domain-specific enhancement engine | |
// based on entities that have been identified by a generic | |
// engine at the beginning of the flowchart. | |
// | |
// -Add user-supplied components (written in groovy as | |
// in this script) to the enhancement chain. | |
// | |
// This is just a proof of concept, that runs on Sling | |
// as I knew how to run groovy scripts out of the box with Sling. | |
// We'll probably need to add a few bundles and glue | |
// code to Stanbol to do the same thing there. | |
// | |
// This is the first time I've used Camel, so I probably
// overlooked a few things, but it works. | |
// | |
// HOW TO RUN THIS IN SLING: | |
// | |
// 1. Start the Sling Launchpad | |
// (found at http://svn.apache.org/repos/asf/sling/trunk/launchpad/builder) | |
// | |
// 2. Install and start these two bundles, via /system/console/bundles: | |
// | |
// <groupId>org.apache.camel</groupId> | |
// <artifactId>camel-core-osgi</artifactId> | |
// <version>2.8.1</version> | |
// | |
// <groupId>org.fusesource.commonman</groupId> | |
// <artifactId>commons-management</artifactId> | |
// <version>1.0</version> | |
// | |
// 3. Create a test node in the Sling repository: | |
// $ curl -u admin:admin -Fsling:resourceType=camel http://localhost:8080/camel | |
// | |
// 4. Upload this script so that Sling uses it for a POST with the .camel extension: | |
// $ curl -u admin:admin -X MKCOL http://localhost:8080/apps/camel | |
// $ curl -u admin:admin -T camel.POST.groovy http://localhost:8080/apps/camel/camel.POST.groovy | |
// | |
// 5. Enjoy, and note that our French text is processed using a slightly different chain than | |
// the one used to process German - that's what I meant to demonstrate, along with the ability | |
// to inject simple scripted components like WordCount in the pipeline. | |
// | |
// $ curl -s -X POST http://localhost:8080/camel.camel?text=Salut,+Le+monsieur+dans+le+train | |
// *** Messages received from mock://result *** | |
// content='Salut, Le monsieur dans le train', metadata=[words:6, length:32, language:fr] | |
// content='path=/camel', metadata=[words:1, length:11, something:about english, language:en] | |
// content='sling:resourceType=camel', metadata=[words:1, length:24, something:about english, language:en] | |
// content='jcr:primaryType=nt:unstructured', metadata=[words:1, length:31, something:about english, language:en] | |
// | |
// $ curl -s -X POST http://localhost:8080/camel.camel?text=Der+Mann+im+Zug | |
// *** Messages received from mock://result *** | |
// content='Der Mann im Zug', metadata=[words:4, length:15, etwas:ueber Deutsch, language:de] | |
// content='path=/camel', metadata=[words:1, length:11, something:about english, language:en] | |
// content='sling:resourceType=camel', metadata=[words:1, length:24, something:about english, language:en] | |
// content='jcr:primaryType=nt:unstructured', metadata=[words:1, length:31, something:about english, language:en] | |
// | |
// NEXT STEPS: | |
// -Set up Stanbol so that it can run this
// -Hook up the actual Stanbol enhancement engines (or parts of them to make it easier to mix | |
// and match), and combine with custom Groovy components like this script does. | |
import org.apache.camel.Exchange | |
import org.apache.camel.Predicate | |
// This is the data that passes through our enhancement | |
// pipeline components: some content with metadata that's | |
// added by the pipeline components | |
class EnhancedString {
    // The text travelling through the pipeline
    String content

    // Metadata attached by pipeline components, keyed by attribute name
    Map attributes

    // Wraps the given text; the metadata map starts out empty
    EnhancedString(String str) {
        this.attributes = new HashMap()
        this.content = str
    }

    // Renders the text followed by the metadata map, e.g.
    // content='Der Mann im Zug', metadata=[language:de]
    String toString() {
        String prefix = "content='" + content + "', metadata="
        return prefix + attributes
    }
}
// Convert a String to an EnhancedString (there's probably | |
// a more integrated way to do this with Camel) | |
class EnhancedStringConverter {
    // Pipeline entry step: wraps the raw text in a new EnhancedString
    // that has no metadata yet
    public EnhancedString pipe(String str) {
        EnhancedString wrapped = new EnhancedString(str)
        return wrapped
    }
}
// Example pipeline component: count the number of words in | |
// our content and add the result as metadata. I didn't want | |
// to go into deep semantic analysis for this example...but the | |
// principle is the same as Stanbol's enhancement engines | |
class WordCount {
    // Stores the number of whitespace-separated words of es.content
    // under the "words" attribute and returns the same es instance.
    //
    // Words are separated by any run of whitespace (spaces, tabs, line
    // breaks), so repeated, leading or trailing whitespace does not
    // produce phantom empty "words". Empty or null content counts as
    // zero words.
    public EnhancedString pipe(EnhancedString es) {
        // Elvis guards against null content so we record 0 instead of failing
        String text = es.content ?: ""
        // tokenize() drops empty tokens, unlike split(" ")
        es.attributes.put("words", text.tokenize().size())
        return es
    }
}
// Another "enhancement engine" that adds the string length | |
class StringLength {
    // Records the character count of es.content under the "length"
    // attribute and hands back the same es instance
    public EnhancedString pipe(EnhancedString es) {
        int characters = es.content.length()
        es.attributes.put("length", characters)
        return es
    }
}
// A very stupid language detector | |
class LanguageDetector {
    // Keyword-based guess, case-insensitive: content containing "mann"
    // is tagged "de", otherwise content containing "salut" is tagged
    // "fr", and everything else "en". The result is stored under the
    // "language" attribute and the same es instance is returned.
    public EnhancedString pipe(EnhancedString es) {
        String lowered = es.content.toLowerCase()
        // German is checked first, so it wins if both keywords are present
        String detected = lowered.contains("mann") ? "de" :
            (lowered.contains("salut") ? "fr" : "en")
        es.attributes.put("language", detected)
        return es
    }
}
// Add an arbitrary attribute to our metadata, to simulate | |
// using different engines after the language detection | |
class AddAttribute {
    // Name of the attribute this component adds
    String n

    // Value stored under that name
    String v

    // Remembers the name/value pair to add to every EnhancedString
    // that passes through pipe()
    AddAttribute(name, value) {
        this.n = name
        this.v = value
    }

    // Puts the configured pair into es.attributes and returns the
    // same es instance
    public EnhancedString pipe(EnhancedString es) {
        Map metadata = es.attributes
        metadata.put(n, v)
        return es
    }
}
// Camel Predicate which returns true if the detected | |
// language matches what we expect | |
class MatchLanguage implements org.apache.camel.Predicate {
    // The language code this predicate accepts
    String language

    // Builds a predicate that accepts only the given language code
    MatchLanguage(expectedLanguage) {
        this.language = expectedLanguage
    }

    // True when the "language" attribute of the exchange's inbound
    // body equals the expected language
    boolean matches(Exchange e) {
        def body = e.getIn().getBody()
        def detected = body.attributes.get("language")
        return language.equals(detected)
    }
}
// Build the Camel "route" (I'd rather call it a flowchart) which | |
// analyzes our content | |
class MyRouteBuilder extends org.apache.camel.builder.RouteBuilder {
    // Defines the single route of this example: direct://input ->
    // conversion -> language detection -> language-specific branch ->
    // common engines -> mock://result
    void configure() {
        // Every component exposes its work through a method named "pipe"
        final String method = "pipe"

        // Language-specific "engines" used inside the choice() below
        def germanEngine = new AddAttribute("etwas", "ueber Deutsch")
        def fallbackEngine = new AddAttribute("something", "about english")

        // Take messages from our input, wrap them as EnhancedString
        // and detect their language
        from("direct://input")
            .bean(new EnhancedStringConverter(), method)
            .bean(new LanguageDetector(), method)
            // Branch on the detected language; the French branch is
            // deliberately empty, as an example
            .choice()
                .when(new MatchLanguage("de")).bean(germanEngine, method)
                .when(new MatchLanguage("fr"))
                .otherwise().bean(fallbackEngine, method)
            .end()
            // Engines that run for every language
            .bean(new StringLength(), method)
            .bean(new WordCount(), method)
            // Collect the enhanced result
            .to("mock://result")
    }
}
// Camel setup boilerplate.
// A new context is created and started for every request, so it is
// stopped again in the finally block below once the response has been
// written; otherwise each POST would leave a running context behind.
mrb = new MyRouteBuilder()
ctx = new org.apache.camel.impl.DefaultCamelContext()
p = null
try {
    ctx.addRoutes mrb
    ctx.start()
    p = ctx.createProducerTemplate()

    // Send some data to our input based on request parameters. A request
    // without a "text" parameter contributes no message instead of
    // pushing a null body through the route.
    text = request.getParameter("text")
    if (text != null) {
        p.sendBody "direct:input", text
    }

    // Send some Sling-supplied data...won't happen with Stanbol
    // but useful for my tests
    p.sendBody "direct:input", "path=" + currentNode.getPath()
    props = currentNode.getProperties()
    while (props.hasNext()) {
        prop = props.nextProperty()
        // getValue() is only valid for single-valued properties; read
        // multi-valued ones through getValues() and join them with commas
        values = prop.isMultiple() ? prop.getValues() : [prop.getValue()]
        p.sendBody "direct:input", prop.getName() + "=" + values.collect { it.getString() }.join(",")
    }

    // Output the results to the response as plain text. The content type
    // is set before the writer is obtained so that it fully takes effect.
    pt = "mock://result"
    response.setContentType("text/plain")
    w = response.getWriter()
    w.write("*** Messages received from " + pt + " ***\n")
    // Results must be read before the context is stopped
    ctx.getEndpoint(pt).exchanges.each { ex ->
        w.write("${ex.getIn().getBody()}\n")
    }
} finally {
    // Release the producer template and the context even if sending or
    // writing failed; the nested finally makes sure the context is
    // stopped even when stopping the template throws
    try {
        if (p != null) {
            p.stop()
        }
    } finally {
        ctx.stop()
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment