Skip to content

Instantly share code, notes, and snippets.

@kcleereman
Created October 10, 2014 20:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kcleereman/dd7c13e1f0f55e9959d8 to your computer and use it in GitHub Desktop.
Save kcleereman/dd7c13e1f0f55e9959d8 to your computer and use it in GitHub Desktop.
/**
 * Builds a `;`-separated list of annotations found in `text`.
 *
 * Line separators (the platform's `line.separator`) are first replaced by a
 * single space; every offset reported below refers to that preprocessed text.
 *
 * Two kinds of entries are emitted, each a `|`-separated record:
 *  - one per match of `pattern`: the symbol code looked up in `symbolCode` by
 *    the token's first character, `field`, start and end offset of group 1,
 *    the token, and the token without its first character. A match whose
 *    token is null or has no symbol code contributes an empty entry.
 *  - one per match of `urls` that is not already part of markup:
 *    `"l"`, `field`, start and end offset of group 1, the URL as matched,
 *    and the URL with an `http://` scheme prepended when it has none.
 *
 * A URL is treated as already part of markup when `href` occurs within the
 * seven characters preceding it, or when it is directly enclosed by `>` and
 * `<`.
 *
 * @param text  the raw text to scan
 * @param field value copied verbatim into the second column of every record
 * @return all records joined with `;` (token records first, then URL records)
 */
def annotate(text: String, field: String): String = {
  def prependHttp(s: String): String =
    if (s.startsWith("http://") || s.startsWith("https://")) s else "http://" + s

  val preprocessed = text.replaceAll(System.getProperty("line.separator"), " ")

  val tokenAnnotations = pattern.findAllMatchIn(preprocessed).toList.map { m =>
    Option(m.subgroups(0))
      .flatMap { token =>
        symbolCode.get(token.slice(0, 1)).map { sym =>
          Seq(sym, field, m.start(1).toString, m.end(1).toString, token, token.substring(1))
        }
      }
      .getOrElse(Seq.empty[String])
      .mkString("|")
  }

  val urlAnnotations = urls.findAllMatchIn(preprocessed).toList
    .filterNot { m =>
      // Offsets come from matching on `preprocessed`, so all lookups must use
      // `preprocessed` as well; `text` can differ in length (e.g. "\r\n").
      val start: Int = m.start(1)
      // Exclusive end offset: it is the index of the first character after the URL.
      val end: Int = m.end(1)
      // start is -1 when group 1 did not participate in the match.
      val isHref: Boolean =
        start >= 0 && preprocessed.substring(math.max(0, start - 7), start).contains("href")
      val isAnchor: Boolean =
        start >= 1 && end < preprocessed.length &&
          preprocessed.charAt(start - 1) == '>' && preprocessed.charAt(end) == '<'
      isHref || isAnchor
    }
    .map { m =>
      Option(m.group(1))
        .map(token => Seq("l", field, m.start(1).toString, m.end(1).toString, token, prependHttp(token)))
        .getOrElse(Seq.empty[String])
        .mkString("|")
    }

  (tokenAnnotations ++ urlAnnotations).mkString(";")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment