@santanu-dey
Created June 16, 2017 05:04
{
"metadata": {
"name": "SparkJob1"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": "import os\nfor a in os.environ:\n print('Var: ', a, 'Value: ', os.getenv(a))\nprint(\"all done\")",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "('Var: ', 'SPARK_HOME', 'Value: ', '/opt/cloudera/parcels/CDH/lib/spark')\n('Var: ', 'LESSOPEN', 'Value: ', '||/usr/bin/lesspipe.sh %s')\n('Var: ', 'SSH_CLIENT', 'Value: ', '172.16.1.237 57798 22')\n('Var: ', 'CVS_RSH', 'Value: ', 'ssh')\n('Var: ', 'LOGNAME', 'Value: ', 'hdfs')\n('Var: ', 'USER', 'Value: ', 'hdfs')\n('Var: ', 'PYSPARK_SUBMIT_ARGS', 'Value: ', '--master yarn --deploy-mode client pyspark-shell')\n('Var: ', 'HOME', 'Value: ', '/var/lib/hadoop-hdfs')\n('Var: ', 'PATH', 'Value: ', '/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/root/bin:/usr/java/jdk1.7.0_79/bin:/usr/java/jdk1.7.0_79/jre/bin')\n('Var: ', 'LANG', 'Value: ', 'en_US.UTF-8')\n('Var: ', 'TERM', 'Value: ', 'xterm-color')\n('Var: ', 'SHELL', 'Value: ', '/bin/bash')\n('Var: ', 'SHLVL', 'Value: ', '2')\n('Var: ', 'G_BROKEN_FILENAMES', 'Value: ', '1')\n('Var: ', 'HISTSIZE', 'Value: ', '1000')\n('Var: ', 'JAVA_HOME', 'Value: ', '/usr/java/jdk1.7.0_79')\n('Var: ', 'PYTHONPATH', 'Value: ', '/opt/cloudera/parcels/CDH/lib/spark/python:/opt/cloudera/parcels/CDH/lib/spark/python/build:/opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.9-src.zip:')\n('Var: ', 'GIT_PAGER', 'Value: ', 'cat')\n('Var: ', 'MAIL', 'Value: ', '/var/spool/mail/root')\n('Var: ', '_', 'Value: ', '/usr/bin/nohup')\n('Var: ', 'LS_COLORS', 'Value: ', 'rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=01;05;37;41:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arj=01;31:*.taz=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.zip=01;31:*.z=01;31:*.Z=01;31:*.dz=01;31:*.gz=01;31:*.lz=01;31:*.xz=01;31:*.bz2=01;31:*.tbz=01;31:*.tbz2=01;31:*.bz=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.rar=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.jpg=01;35:*.jpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.axv=01;35:*.anx=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=01;36:*.au=01;36:*.flac=01;36:*.mid=01;36:*.midi=01;36:*.mka=01;36:*.mp3=01;36:*.mpc=01;36:*.ogg=01;36:*.ra=01;36:*.wav=01;36:*.axa=01;36:*.oga=01;36:*.spx=01;36:*.xspf=01;36:')\n('Var: ', 'SSH_TTY', 'Value: ', '/dev/pts/0')\n('Var: ', 'OLDPWD', 'Value: ', '/root')\n('Var: ', 'HOSTNAME', 'Value: ', 'CDHN01.fapliotsg.com')\n('Var: ', 'CLICOLOR', 'Value: ', '1')\n('Var: ', 'HISTCONTROL', 'Value: ', 'ignoredups')\n('Var: ', 'PWD', 'Value: ', '/var/lib/hadoop-hdfs')\n('Var: ', 'JRE_HOME', 'Value: ', '/usr/java/jdk1.7.0_79/jre')\n('Var: ', 'SSH_CONNECTION', 'Value: ', '172.16.1.237 57798 10.6.2.162 22')\n('Var: ', 'PAGER', 'Value: ', 'cat')\nall done\n"
}
],
"prompt_number": 1
},
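{
"cell_type": "code",
"collapsed": false,
"input": "# A minimal sketch, not part of the original run: the cell above prints each\n# environment variable as a Python 2 tuple (hence the parenthesised output).\n# This variant prints only the Spark-related variables, one per line, which\n# is easier to scan.\nimport os\nfor name in sorted(os.environ):\n    if 'SPARK' in name or name in ('PYTHONPATH', 'JAVA_HOME'):\n        print '%s=%s' % (name, os.environ[name])",
"language": "python",
"metadata": {},
"outputs": []
},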
{
"cell_type": "code",
"collapsed": false,
"input": "from pyspark import SparkContext, SparkConf",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": "conf = (SparkConf().setAppName(\"simple\")\n .set(\"spark.shuffle.service.enabled\", \"false\")\n .set(\"spark.dynamicAllocation.enabled\", \"false\")\n .set(\"spark.io.compression.codec\", \"snappy\")\n .set(\"spark.cores.max\", \"1\")\n .set(\"spark.rdd.compress\", \"true\")\n .set(\"spark.executor.instances\",\"2\")\n .set(\"spark.executor.memory\",\"200m\")\n .set(\"spark.executor.cores\",\"1\"))\n\nsc = SparkContext(conf = conf)",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
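{
"cell_type": "code",
"collapsed": false,
"input": "# A hedged sketch: PySpark raises an error if a SparkContext is already\n# running in this kernel, so re-running the cell above fails until the old\n# context is stopped. This guard assumes the `sc` and `conf` names defined\n# in the cell above.\ntry:\n    sc.stop()  # tear down any existing context\nexcept NameError:\n    pass  # no context yet on a fresh kernel\nsc = SparkContext(conf = conf)",
"language": "python",
"metadata": {},
"outputs": []
},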
{
"cell_type": "code",
"collapsed": false,
"input": "sc._conf.getAll()",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 4,
"text": "[(u'spark.dynamicAllocation.enabled', u'false'),\n (u'spark.eventLog.enabled', u'true'),\n (u'spark.yarn.appMasterEnv.PYSPARK_DRIVER_PYTHON',\n u'/opt/cloudera/parcels/Anaconda-4.1.1/bin/python'),\n (u'spark.shuffle.service.enabled', u'false'),\n (u'spark.dynamicAllocation.executorIdleTimeout', u'60'),\n (u'spark.executor.extraLibraryPath',\n u'/opt/cloudera/parcels/CDH-5.8.0-1.cdh5.8.0.p0.42/lib/hadoop/lib/native'),\n (u'spark.serializer', u'org.apache.spark.serializer.KryoSerializer'),\n (u'spark.cores.max', u'1'),\n (u'spark.authenticate', u'false'),\n (u'spark.yarn.historyServer.address', u'http://CDHN01.fapliotsg.com:18088'),\n (u'spark.executor.instances', u'2'),\n (u'spark.driver.extraLibraryPath',\n u'/opt/cloudera/parcels/CDH-5.8.0-1.cdh5.8.0.p0.42/lib/hadoop/lib/native'),\n (u'spark.serializer.objectStreamReset', u'100'),\n (u'spark.submit.deployMode', u'client'),\n (u'spark.yarn.appMasterEnv.PYSPARK_PYTHON',\n u'/opt/cloudera/parcels/Anaconda-4.1.1/bin/python'),\n (u'spark.executor.memory', u'200m'),\n (u'spark.io.compression.codec', u'snappy'),\n (u'spark.master', u'yarn-client'),\n (u'spark.dynamicAllocation.schedulerBacklogTimeout', u'1'),\n (u'spark.shuffle.service.port', u'7337'),\n (u'spark.rdd.compress', u'true'),\n (u'spark.yarn.config.gatewayPath', u'/opt/cloudera/parcels'),\n (u'spark.yarn.jar',\n u'local:/opt/cloudera/parcels/CDH-5.8.0-1.cdh5.8.0.p0.42/lib/spark/lib/spark-assembly.jar'),\n (u'spark.eventLog.dir',\n u'hdfs://CDHN01.fapliotsg.com:8020/user/spark/applicationHistory'),\n (u'spark.app.name', u'simple'),\n (u'spark.yarn.config.replacementPath', u'{{HADOOP_COMMON_HOME}}/../../..'),\n (u'spark.yarn.isPython', u'true'),\n (u'spark.dynamicAllocation.minExecutors', u'0'),\n (u'spark.executor.cores', u'1'),\n (u'spark.yarn.am.extraLibraryPath',\n u'/opt/cloudera/parcels/CDH-5.8.0-1.cdh5.8.0.p0.42/lib/hadoop/lib/native')]"
}
],
"prompt_number": 4
},
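{
"cell_type": "code",
"collapsed": false,
"input": "# A minimal sketch: instead of scanning the full getAll() dump above, a\n# single setting can be read back with SparkConf.get(key, defaultValue).\n# Note that sc._conf is an internal attribute (leading underscore), not a\n# stable public API; this simply mirrors what the cell above already uses.\nprint sc._conf.get('spark.executor.memory', 'not set')\nprint sc._conf.get('spark.executor.instances', 'not set')\nprint sc._conf.get('spark.master', 'not set')",
"language": "python",
"metadata": {},
"outputs": []
},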
{
"cell_type": "code",
"collapsed": false,
"input": "print sc",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "<pyspark.context.SparkContext object at 0x291df10>\n"
}
],
"prompt_number": 5
},
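{
"cell_type": "code",
"collapsed": false,
"input": "# A small sketch: the default repr above only shows an object address. These\n# public SparkContext properties report the session details directly.\nprint 'version:', sc.version\nprint 'master:', sc.master\nprint 'app name:', sc.appName\nprint 'default parallelism:', sc.defaultParallelism",
"language": "python",
"metadata": {},
"outputs": []
},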
{
"cell_type": "code",
"collapsed": false,
"input": "raw_events = sc.textFile('/user/hdfs/web_logs_1.csv').map(lambda x: x.split(','))",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
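{
"cell_type": "code",
"collapsed": false,
"input": "# A hedged alternative: split(',') above breaks on any field that contains a\n# quoted comma (user-agent strings often do). This sketch parses each\n# partition with the csv module instead; it assumes the file has no header row.\nimport csv\n\ndef parse_partition(lines):\n    return csv.reader(lines)\n\nparsed_events = sc.textFile('/user/hdfs/web_logs_1.csv').mapPartitions(parse_partition)",
"language": "python",
"metadata": {},
"outputs": []
},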
{
"cell_type": "code",
"collapsed": false,
"input": "print raw_events.count()",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "250\n"
}
],
"prompt_number": 7
},
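{
"cell_type": "code",
"collapsed": false,
"input": "# A small sketch: count() above and first() in the next cell each re-read the\n# file from HDFS. Caching keeps the parsed rows in executor memory across\n# actions; 250 rows fit easily even in these 200m executors.\nraw_events.cache()\nprint raw_events.count()  # the first action after cache() materialises it",
"language": "python",
"metadata": {},
"outputs": []
},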
{
"cell_type": "code",
"collapsed": false,
"input": "print raw_events.first()",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "[u'1480895575515725824', u'metastore', u'1041', u'Singapore', u'128.199.234.236', u'200', u'SG', u'SGP', u'Singapore', u'Other', u'', u'1.2931', u'103.8558', u'GET', u'Other', u'', u'HTTP/1.1', u'', u'-', u'00', u'GET /metastore/table/default/sample_07 HTTP/1.1', u'table', u'2014-05-04T06:35:49Z', u'/metastore/table/default/sample_07', u'Mozilla/5.0 (compatible; phpservermon/3.0.1; +http://www.phpservermonitor.org)', u'Other', u'', u'8836e6ce-9a21-449f-a372-9e57641389b3']\n"
}
],
"prompt_number": 8
},
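{
"cell_type": "code",
"collapsed": false,
"input": "# A hedged sketch: judging from the first() output above, field 5 looks like\n# the HTTP status code. That index is an assumption about this particular\n# file, not something stated in the original notebook.\nstatus_counts = (raw_events\n                 .map(lambda fields: (fields[5], 1))\n                 .reduceByKey(lambda a, b: a + b)\n                 .collect())\nprint status_counts",
"language": "python",
"metadata": {},
"outputs": []
},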
{
"cell_type": "code",
"collapsed": false,
"input": "",
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}