Last active
May 1, 2019 11:31
-
-
Save joshuarobinson/6cf86ab22e86a074aa15a056fa5d4ac3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import subprocess\n", | |
"import os" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"SRC=\"/datahub/onewiki/split_230k/\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def readContents(f):\n", | |
" with open(f, \"rb\") as fd:\n", | |
" return fd.read().splitlines()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def pureReadText(path):\n", | |
" result = subprocess.check_output([\"pls\", path])\n", | |
" listing = sc.parallelize(result.decode('utf-8').splitlines(), 2400)\n", | |
" listing = listing.filter(lambda x: os.path.isfile(x))\n", | |
" return listing.flatMap(readContents)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 892 ms, sys: 1.13 s, total: 2.02 s\n", | |
"Wall time: 1min 20s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"2756675251" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%time pureReadText(SRC).count()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 16 ms, sys: 0 ns, total: 16 ms\n", | |
"Wall time: 2min 37s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"2756678137" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%time spark.read.text(SRC).count()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment