Created
February 29, 2016 19:22
-
-
Save nuria/73394ff6ad9a070ccd72 to your computer and use it in GitHub Desktop.
Encoding and Spark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// spark-shell --num-executors 16 --executor-cores 1 --executor-memory 256M --master yarn \
//   --jars /mnt/hdfs/wmf/refinery/2016-02-23T18.55.34Z--7dadb6b/artifacts/org/wikimedia/analytics/refinery/refinery-hive-0.0.26.jar \
//   --conf 'spark.executor.extraJavaOptions=-Dfile.encoding=UTF-8'
import sys.process._ | |
import org.wikimedia.analytics.refinery.core.PageviewDefinition; | |
/**
 * Extracts the page title from a URI path and tags it with the name of the
 * host on which the extraction ran.
 *
 * The title is obtained from `PageviewDefinition.getPageTitleFromUri(path, "")`;
 * the second argument is passed as an empty string (NOTE(review): presumably
 * the query-string part of the URI -- confirm against refinery-core).
 * The host name is the standard output of the external command `hostname -f`,
 * with its trailing line terminator removed.
 *
 * @param path URI path to extract the title from, e.g. "/wiki/Lasse_%C3%85berg"
 * @return a string of the form "<hostname>:\t<title>"
 * @throws RuntimeException if `hostname -f` exits with a non-zero status
 *                          (behaviour of `scala.sys.process`'s `!!`)
 */
def pt(path: String): String = {
  val pageviewDefinition = PageviewDefinition.getInstance()
  val title = pageviewDefinition.getPageTitleFromUri(path, "")
  // Dotted call rather than the postfix form `"hostname -f" !!`: postfix
  // operators need scala.language.postfixOps and can mis-parse with the next line.
  val hostname: String = "hostname -f".!!
  "%s:\t%s".format(hostname.stripLineEnd, title)
}
// Distribute 16 copies of the same percent-encoded path as an RDD.
val t = sc.parallelize(Seq.fill(16)("/wiki/Lasse_%C3%85berg"))
// Run pt on every element and bring the resulting strings back to the driver.
val titles = t.map(path => pt(path)).collect()
// Print one "<hostname>:\t<title>" line per element.
titles.foreach(println)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment