Skip to content

Instantly share code, notes, and snippets.

@ruebot
Last active April 9, 2019 16:16
Show Gist options
  • Save ruebot/60ad53f25cbeae6b9e26334addcac379 to your computer and use it in GitHub Desktop.
Save ruebot/60ad53f25cbeae6b9e26334addcac379 to your computer and use it in GitHub Desktop.
import io.archivesunleashed._
import io.archivesunleashed.app._
import io.archivesunleashed.matchbox._
import org.apache.spark.storage.StorageLevel.MEMORY_AND_DISK
// Raise Spark's log verbosity to DEBUG for this run.
sc.setLogLevel("DEBUG")
// Load every *.gz archive of the collection, keep only the records accepted by
// keepValidPages(), and persist the result (memory first, spilling to disk)
// because the later jobs in this script each start from this same RDD.
// Each line ends with a dot so the chain is read as one expression.
val validPages = RecordLoader.
  loadArchives("/tuna1/scratch/nruest/auk_collection_testing/10689/warcs/*.gz", sc).
  keepValidPages().
  persist(MEMORY_AND_DISK)
// Derivative "all-domains": map each page to the domain of its URL, tally the
// occurrences with countItems(), and write the tallies out as text.
validPages.
  map(page => ExtractDomain(page.getUrl)).
  countItems().
  saveAsTextFile("/tuna1/scratch/nruest/auk_collection_testing/10689/133/derivatives/all-domains/output")
// Derivative "all-text": one (crawl date, domain, URL, text) tuple per page,
// where the text is the page content passed through RemoveHttpHeader and then
// RemoveHTML. The tuples are written out as text.
validPages.
  map { page =>
    (
      page.getCrawlDate,
      page.getDomain,
      page.getUrl,
      RemoveHTML(RemoveHttpHeader(page.getContentString))
    )
  }.
  saveAsTextFile("/tuna1/scratch/nruest/auk_collection_testing/10689/133/derivatives/all-text/output")
// Derivative "gephi": build a domain-to-domain link list for GraphML export.
//   1. Pair each page's crawl date with the links ExtractLinks finds in it.
//   2. Emit one (crawl date, domain, domain) triple per link, taking the domain
//      of the link's first and second elements and stripping a leading "www."
//      (optionally preceded by whitespace).
//      NOTE(review): the first/second elements are presumably the source and
//      target URLs; confirm against ExtractLinks.
//   3. Drop triples where either domain is empty.
//   4. Tally identical triples with countItems() and keep only those whose
//      count (the second element of each result) is greater than 5.
val links = validPages.
  map(page => (page.getCrawlDate, ExtractLinks(page.getUrl, page.getContentString))).
  flatMap { case (crawlDate, pageLinks) =>
    pageLinks.map { link =>
      (
        crawlDate,
        ExtractDomain(link._1).replaceAll("^\\s*www\\.", ""),
        ExtractDomain(link._2).replaceAll("^\\s*www\\.", "")
      )
    }
  }.
  filter { case (_, srcDomain, dstDomain) => srcDomain != "" && dstDomain != "" }.
  countItems().
  filter(_._2 > 5)
// Hand the filtered tallies to WriteGraphML together with the output path.
WriteGraphML(links, "/tuna1/scratch/nruest/auk_collection_testing/10689/133/derivatives/gephi/10689-gephi.graphml")
// Terminate the shell once every derivative has been produced.
sys.exit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment