Skip to content

Instantly share code, notes, and snippets.

@k8si
Created June 17, 2015 21:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save k8si/ae0409929544f032d498 to your computer and use it in GitHub Desktop.
Save k8si/ae0409929544f032d498 to your computer and use it in GitHub Desktop.
createParagraph
/**
 * Tokenizes the direct children of `paragraphNode` and appends the resulting tokens to `doc`.
 *
 * Only two kinds of child are handled; every other child node is ignored:
 *  - text nodes: their text is tokenized and the tokens are appended to `doc`;
 *  - `a` nodes: the first child's text is tokenized, the tokens are appended to `doc`, and a
 *    [[TokenSpan]] over them carrying a `Link(href)` attribute is added to the document's
 *    `TokenSpanBuffer[TokenSpan]`.
 *
 * @param paragraphNode node whose direct children are scanned
 * @param paragraphStart not read by this method; kept so existing callers keep compiling
 * @param doc document that receives the new tokens and link spans
 */
def createParagraph2(paragraphNode: Node, paragraphStart: Int, doc: Document): Unit = {
  // Tokenizes `text` in a scratch document, then re-creates every token on `doc`.
  def appendTokens(text: String): Seq[Token] = {
    val scratch = new Document(text)
    cc.factorie.app.nlp.segment.DeterministicNormalizingTokenizer.process(scratch)
    scratch.tokens.map { token => new Token(doc, token.string) }.toSeq
  }

  for (child <- paragraphNode.childNodes) {
    child match {
      case textNode: TextNode =>
        appendTokens(textNode.text)

      // An anchor without children has no link text; childNode(0) would throw on it.
      case anchor if anchor.nodeName == "a" && !anchor.childNodes.isEmpty =>
        val linkTarget: String = anchor.attr("href")
        // Use the decoded text of a text child (as the text-node branch does) instead of its
        // HTML source, so entities such as `&amp;` are not tokenized in escaped form.
        val linkText: String = anchor.childNode(0) match {
          case inner: TextNode => inner.text
          case other => other.toString()
        }
        val linkTokens = appendTokens(linkText)
        // NOTE(review): presumably TokenSpan cannot be built from zero tokens — confirm;
        // an anchor whose text yields no tokens is skipped rather than given an empty span.
        if (linkTokens.nonEmpty) {
          val span = new TokenSpan(linkTokens)
          span.attr += new Link(linkTarget)
          // NOTE(review): assumes the caller already attached a TokenSpanBuffer[TokenSpan]
          // to doc.attr — confirm.
          doc.attr[TokenSpanBuffer[TokenSpan]] += span
        }

      case _ => // other node kinds (and empty anchors) contribute nothing
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment