Skip to content

Instantly share code, notes, and snippets.

@ianmilligan1
Last active March 28, 2016 21:04
Show Gist options
  • Save ianmilligan1/9bc7e05859f512f05359 to your computer and use it in GitHub Desktop.
Save ianmilligan1/9bc7e05859f512f05359 to your computer and use it in GitHub Desktop.
// Builds one (crawl date, clickable link, text preview) triple for every record
// that passes keepValidPages(), then collects the triples.
val r =
  RecordLoader.loadArc(arc, sc)
    .keepValidPages()
    .map { record =>
      val maxChars = 100
      val text = ExtractRawText(record.getBodyContent)
      val link = createClickableLink(record.getUrl, record.getCrawldate)
      // The preview is capped at maxChars characters; shorter text is kept whole.
      val preview = if (text.length <= maxChars) text else text.substring(0, maxChars)
      (record.getCrawldate, link, preview)
    }
    .collect()
// For every text/html record remaining after discardDate(null), classifies the
// extracted text with NER3Classifier and collects
// (crawl date, MIME type, entities, URL, text preview) tuples.
val r =
  RecordLoader.loadArc(arc, sc)
    .keepMimeTypes(Set("text/html"))
    .discardDate(null)
    .map { record =>
      val maxChars = 100
      val text = ExtractRawText(record.getBodyContent)
      // NOTE(review): presumably registers the serialized model that classify()
      // uses; kept inside the per-record function exactly as before - confirm.
      NER3Classifier("/Users/ianmilligan1/dropbox/ner/stanford-ner-2015-04-20/classifiers/english.all.3class.distsim.crf.ser.gz")
      val entities = NER3Classifier.classify(text)
      // The preview is capped at maxChars characters; shorter text is kept whole.
      val preview = if (text.length <= maxChars) text else text.substring(0, maxChars)
      (record.getCrawldate, record.getMimeType, entities, record.getUrl, preview)
    }
    .collect()
/**
 * Builds an HTML anchor that points at the web.archive.org capture of `url`
 * taken at `date`, using the URL itself as the link text.
 *
 * Both values are HTML-escaped before being embedded, so a URL containing
 * `'`, `"`, `<`, `>` or `&` can neither terminate the single-quoted `href`
 * attribute nor inject markup into the link text.
 *
 * @param url  the original URL of the captured page
 * @param date the capture timestamp placed in the archive path
 * @return an `<a href='...'>...</a>` fragment
 */
def createClickableLink(url: String, date: String): String = {
  // Escapes the characters that are significant in HTML text and in a
  // single-quoted attribute value.
  def escapeHtml(raw: String): String =
    // A null renders as "null", matching what string concatenation produced
    // before, instead of throwing.
    Option(raw).getOrElse("null").map {
      case '&'  => "&amp;"
      case '<'  => "&lt;"
      case '>'  => "&gt;"
      case '"'  => "&quot;"
      case '\'' => "&#39;"
      case c    => c.toString
    }.mkString

  val safeUrl = escapeHtml(url)
  val safeDate = escapeHtml(date)
  "<a href='http://web.archive.org/web/" + safeDate + "/" + safeUrl + "'>" + safeUrl + "</a>"
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment