Last active
September 7, 2015 03:47
-
-
Save kenttw/4fa02c06b1bf0e1c19c9 to your computer and use it in GitHub Desktop.
spark - let file name as key
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Initial Spark Config" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Welcome to\n", | |
" ____ __\n", | |
" / __/__ ___ _____/ /__\n", | |
" _\\ \\/ _ \\/ _ `/ __/ '_/\n", | |
" /__ / .__/\\_,_/_/ /_/\\_\\ version 1.4.0\n", | |
" /_/\n", | |
"\n", | |
"Using Python version 2.7.6 (default, Sep 9 2014 15:04:36)\n", | |
"SparkContext available as sc, HiveContext available as sqlContext.\n" | |
] | |
} | |
], | |
"source": [ | |
"import glob\n", | |
"import os\n", | |
"import sys\n", | |
"\n", | |
"# SPARK_HOME must point at the root of a Spark installation. Fail early with a\n", | |
"# readable message instead of a TypeError from concatenating None with a string.\n", | |
"spark_home = os.environ.get('SPARK_HOME')\n", | |
"if not spark_home:\n", | |
"    raise EnvironmentError('SPARK_HOME is not set; export it before starting the notebook kernel')\n", | |
"\n", | |
"# Make the pyspark package importable.\n", | |
"sys.path.insert(0, os.path.join(spark_home, 'python'))\n", | |
"\n", | |
"# Add the py4j to the path.\n", | |
"# The version in the zip name depends on the Spark install, so look it up\n", | |
"# instead of hard-coding it; fall back to the 0.8.2.1 name this notebook\n", | |
"# originally used when nothing matches.\n", | |
"py4j_zips = sorted(glob.glob(os.path.join(spark_home, 'python/lib/py4j-*-src.zip')))\n", | |
"if py4j_zips:\n", | |
"    py4j_zip = py4j_zips[-1]\n", | |
"else:\n", | |
"    py4j_zip = os.path.join(spark_home, 'python/lib/py4j-0.8.2.1-src.zip')\n", | |
"sys.path.insert(0, py4j_zip)\n", | |
"\n", | |
"# Initialize PySpark to predefine the SparkContext variable 'sc'\n", | |
"# (execfile is Python 2 only, which matches this notebook's python2 kernel).\n", | |
"execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Parse Every File" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from pyspark.sql import SQLContext, Row\n", | |
"sqlContext = SQLContext(sc)\n", | |
"\n", | |
"# get file list in a folder\n", | |
"import glob\n", | |
"flist = glob.glob(\"./sequence/*\")\n", | |
"\n", | |
"\n", | |
"def tag_rows_with_filename(path):\n", | |
"    \"\"\"Load one '|'-delimited file as an RDD of [path, field, field, ...] rows.\n", | |
"\n", | |
"    Taking the file name as an argument gives every RDD its own copy of it.\n", | |
"    A lambda written directly in the loop body reads the loop variable only\n", | |
"    when Spark evaluates the RDD, i.e. after the loop has finished, so every\n", | |
"    row ends up labelled with the last file in the list.\n", | |
"    \"\"\"\n", | |
"    fields = sc.textFile(path).map(lambda line: line.split(\"|\"))\n", | |
"    return fields.map(lambda row: [path] + row)\n", | |
"\n", | |
"\n", | |
"# parse every file, keeping the file name as the first element of each row\n", | |
"rf = [tag_rows_with_filename(f) for f in flist]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Union all files into one RDD" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['./sequence/Sida yellow vein virus.fasta',\n", | |
" u'>gi',\n", | |
" u'167006425',\n", | |
" u'ref',\n", | |
" u'NC_010314.1',\n", | |
" u' Abaca bunchy top virus DNA-N, complete genome\\nAGCAGGGGGGCTTATTATTACCCCCCCTGCTCGGGGCGGGACATTCTGTGATGGGCTGGGCTTTATGCGG\\nCCAAATAAGCCCATAAAGCCAGATCTGGGCCCATTTAAGGGCCCGTGGTTTGAAAATGTCGCGTTCCCGC\\nCTAAATTGTTTGCTTGCCCTGCAAGGAAACGAAAACTCTATAAATAGGGTTGTTCTCTGCTTGTTTAATA\\nCATCAGGCGCAAATCTTTTGCAACGATGGATTGGATGGAATCACAATTCAAGACATGTACGCATGGCTGC\\nGACTGGAAGGCGATAGCTCCAGAAGCACAAGATAATATACAGGTAATTACATGTTCCGATTCAGGTTACG\\nGAAGAAAGAACCCTCGTAAGGTTCTTCTGAGGAGTATTCAGATAGGGTTCAATGGAAGCTTCAGAGGAAG\\nTAATAGAAATGTTCGAGGCTTCATATACGTGTCTGTAAGACAGGATGATGGCCAAATGAGACCAATTATG\\nGTCGTTCCATTCGGAGGGTATGGATATCATAACGACTACTATTATTTTGAAGGACAATCCAGTACGAATT\\nGTGAGATAGTGTCGGACTATATTCCGGCCGGTCAAGACTGGAGCAGAGATATGGAGATAAGTATAAGTAA\\nCAGCAACAATTGTAATCAAGAGTGCGATATCAAGTGTTATGTAGTATGTAATTTAAGAATTAAGGAATAA\\nWATTGTTGCCGAAGGTCTGTTATTTGAATGTTGAGATAAGGAAAGGGGCGGCGAAGCATGTGTGTATAAT\\nAACATATAACACACTATTATATATTTTGTAAAGAATAAAATTATGACCTGTCAGATTAAGTTTAGAATGA\\nACTGAGGCCGAAGGCCTCACCGAGGCCGAAGGCCGTCAGGATGGTTTTACAAAATAATTATAAGCACCTG\\nTACTAAGTACGAAGAGCGGTATAATATCTGAAAGGAAAAAATAATAATATAATAAAAATATTATGATGTC\\nCCAAAATAGCAGAATGCTAAAGGAACAAAAGGATGCTCTAAGTACAGGGTTGCGTGCTCTGGACGCCACT\\nTTAGTGGTGGGCCAGATGTCCCGAGTTAGTGCGCCACGTC\\n']" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Merge the per-file RDDs into a single RDD. It is named all_records rather\n", | |
"# than all so the Python builtin all() stays usable in later cells.\n", | |
"all_records = sc.union(rf)\n", | |
"\n", | |
"# Peek at one row to check the [file name, field, ...] layout.\n", | |
"all_records.first()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment