Last active November 30, 2018 19:49
$ ./spark-shell --master local\[10\] --driver-memory 30G --conf spark.network.timeout='10000000' --conf spark.executor.heartbeatInterval='600s' --conf spark.driver.maxResultSize='4G' --jars ~/git/aut/target/aut-0.17.1-SNAPSHOT-fatjar.jar
2018-11-30 09:08:03 WARN Utils:66 - Your hostname, wombat resolves to a loopback address:; using instead (on interface enp0s31f6)
2018-11-30 09:08:03 WARN Utils:66 - Set SPARK_LOCAL_IP if you need to bind to another address
2018-11-30 09:08:04 WARN NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Spark context Web UI available at
Spark context available as 'sc' (master = local[10], app id = local-1543586887449).
Spark session available as 'spark'.
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 2.4.0
Using Scala version 2.11.12 (OpenJDK 64-Bit Server VM, Java 1.8.0_191)
Type in expressions to have them evaluated.
Type :help for more information.
scala> :paste
// Entering paste mode (ctrl-D to finish)
import io.archivesunleashed._
import io.archivesunleashed.matchbox._
val r = RecordLoader.loadArchives("/home/nruest/tmp/test-warcs/231/*gz", sc)
.map(r => ExtractDomain(r.getUrl))
// Exiting paste mode, now interpreting.
import io.archivesunleashed._
import io.archivesunleashed.matchbox._
r: Array[(String, Int)] = Array((,86336), (,16348), (,9250), (,7178), (,4779), (,3870), (,3848), (,3268), (,2632), (,2070))
scala> :paste
// Entering paste mode (ctrl-D to finish)
import io.archivesunleashed._
import io.archivesunleashed.matchbox._
val r = RecordLoader.loadArchives("/home/nruest/tmp/test-warcs/231/*gz", sc)
.map(r => r.getUrl)
// Exiting paste mode, now interpreting.
import io.archivesunleashed._
import io.archivesunleashed.matchbox._
r: Array[String] = Array(,,,,,,,,,
scala> :paste
// Entering paste mode (ctrl-D to finish)
import io.archivesunleashed._
import io.archivesunleashed.matchbox._
val r = RecordLoader.loadArchives("/home/nruest/tmp/test-warcs/231/*gz", sc)
.map(r => r.getUrl)
// Exiting paste mode, now interpreting.
import io.archivesunleashed._
import io.archivesunleashed.matchbox._
r: Unit = ()
scala> :paste
// Entering paste mode (ctrl-D to finish)
import io.archivesunleashed._
import io.archivesunleashed.matchbox._
val r =
RecordLoader.loadArchives("/home/nruest/tmp/test-warcs/231/*gz", sc)
.map(r => ExtractDomain(r.getUrl))
// Exiting paste mode, now interpreting.
import io.archivesunleashed._
import io.archivesunleashed.matchbox._
r: Array[(String, Int)] = Array((,86336), (,16348), (,9250), (,7178), (,4779), (,3870), (,3848), (,3268), (,2632), (,2070))
scala> :paste
// Entering paste mode (ctrl-D to finish)
import io.archivesunleashed._
import io.archivesunleashed.matchbox._
RecordLoader.loadArchives("/home/nruest/tmp/test-warcs/231/*gz", sc)
.map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(r.getContentString)))
// Exiting paste mode, now interpreting.
import io.archivesunleashed._
import io.archivesunleashed.matchbox._
