Skip to content

Instantly share code, notes, and snippets.

@duyet
Created December 11, 2016 07:31
Show Gist options
  • Save duyet/e1f8122a015b300456ece1b4f92c69f1 to your computer and use it in GitHub Desktop.
Save duyet/e1f8122a015b300456ece1b4f92c69f1 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from pyspark import SparkContext\n",
"from py4j.java_gateway import java_import\n",
"from pyspark.mllib.common import _to_java_object_rdd"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Bind `sc` explicitly so this notebook also runs on a kernel that does not\n",
"# pre-define it; getOrCreate() returns the already-running context if there is one.\n",
"sc = SparkContext.getOrCreate()\n",
"\n",
"# Import vnTokenizer from Java: register the class with the py4j JVM view,\n",
"# then keep a Python-side handle to it under the name `Tokenizer`.\n",
"java_import(sc._gateway.jvm, \"vn.vitk.tok.Tokenizer\")\n",
"Tokenizer = sc._jvm.vn.vitk.tok.Tokenizer"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## Load dataset"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Read the test corpus into a (Python-side) RDD of text\n",
"data = sc.textFile('./data/data_test.txt')\n",
"\n",
"# Convert RDD to JavaRDD so it can be handed to JVM-side code\n",
"data_rdd_java = _to_java_object_rdd(data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Tokenize"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Directory that holds the two resource files passed to the tokenizer\n",
"dataFolder = '/export/dat/tok'\n",
"lexicon_path = dataFolder + \"/lexicon.xml\"\n",
"regexp_path = dataFolder + \"/regexp.txt\"\n",
"\n",
"# Construct the Java Tokenizer with the Java Spark context and both resource\n",
"# paths, then run it over the converted JavaRDD.\n",
"token = Tokenizer(sc._jsc, lexicon_path, regexp_path)\n",
"result = token.tokenize(data_rdd_java)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Write the tokenizer output as text files under the output directory.\n",
"# NOTE(review): by default Spark is expected to reject an output directory that\n",
"# already exists - confirm, and clear it before re-running this cell.\n",
"output_dir = './output/tokenize'\n",
"result.saveAsTextFile(output_dir)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [Root]",
"language": "python",
"name": "Python [Root]"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment