Skip to content

Instantly share code, notes, and snippets.

@bgbg
Last active February 2, 2016 09:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bgbg/d8d46f974cf871e58c78 to your computer and use it in GitHub Desktop.
Save bgbg/d8d46f974cf871e58c78 to your computer and use it in GitHub Desktop.
Another triple_apply peculiarity
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[INFO] Using MetricMock instead of real metrics, mode is: QA\n"
]
}
],
"source": [
"import time\n",
"import pandas as pd\n",
"import numpy as np\n",
"import sframe as gl"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The data below is an edge table. It contains three types of vertices: A, B, and C. Edge type is determined by vertix types: \"A_A\", \"C_B\", etc. There are 150 edges."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Overwriting test_edges.csv\n"
]
}
],
"source": [
"%%writefile test_edges.csv\n",
"__src_id,__dst_id,edge_type,weight\n",
"\"C_56\",\"C_36\",\"C_C\",1\n",
"\"C_53\",\"B_20\",\"C_B\",1\n",
"\"B_12\",\"A_02\",\"B_A\",1\n",
"\"B_00\",\"A_00\",\"B_A\",1\n",
"\"C_02\",\"B_04\",\"C_B\",1\n",
"\"C_80\",\"B_31\",\"C_B\",1\n",
"\"C_80\",\"C_28\",\"C_C\",1\n",
"\"B_34\",\"A_06\",\"B_A\",1\n",
"\"B_33\",\"A_06\",\"B_A\",1\n",
"\"B_35\",\"A_06\",\"B_A\",1\n",
"\"C_02\",\"B_05\",\"C_B\",1\n",
"\"B_37\",\"A_06\",\"B_A\",1\n",
"\"C_13\",\"B_57\",\"C_B\",1\n",
"\"C_53\",\"C_16\",\"C_C\",1\n",
"\"C_53\",\"B_21\",\"C_B\",1\n",
"\"C_02\",\"B_03\",\"C_B\",1\n",
"\"C_47\",\"C_41\",\"C_C\",1\n",
"\"C_15\",\"C_36\",\"C_C\",1\n",
"\"C_81\",\"C_36\",\"C_C\",1\n",
"\"C_78\",\"C_36\",\"C_C\",1\n",
"\"C_99\",\"C_36\",\"C_C\",1\n",
"\"C_38\",\"B_46\",\"C_B\",1\n",
"\"C_81\",\"B_40\",\"C_B\",1\n",
"\"C_15\",\"B_40\",\"C_B\",1\n",
"\"C_78\",\"B_40\",\"C_B\",1\n",
"\"C_96\",\"B_55\",\"C_B\",1\n",
"\"C_31\",\"B_55\",\"C_B\",1\n",
"\"C_73\",\"B_25\",\"C_B\",1\n",
"\"C_81\",\"B_55\",\"C_B\",1\n",
"\"C_20\",\"B_55\",\"C_B\",1\n",
"\"C_25\",\"B_55\",\"C_B\",1\n",
"\"C_86\",\"C_16\",\"C_C\",1\n",
"\"C_50\",\"B_14\",\"C_B\",1\n",
"\"C_01\",\"C_16\",\"C_C\",1\n",
"\"C_81\",\"B_39\",\"C_B\",1\n",
"\"C_81\",\"B_38\",\"C_B\",1\n",
"\"C_78\",\"B_38\",\"C_B\",1\n",
"\"C_12\",\"C_41\",\"C_C\",1\n",
"\"C_73\",\"B_24\",\"C_B\",1\n",
"\"C_61\",\"C_41\",\"C_C\",1\n",
"\"C_19\",\"A_05\",\"C_A\",1\n",
"\"C_92\",\"B_55\",\"C_B\",1\n",
"\"C_74\",\"B_55\",\"C_B\",1\n",
"\"C_35\",\"C_16\",\"C_C\",1\n",
"\"C_87\",\"C_16\",\"C_C\",1\n",
"\"C_71\",\"C_16\",\"C_C\",1\n",
"\"C_65\",\"B_56\",\"C_B\",1\n",
"\"C_65\",\"C_54\",\"C_C\",1\n",
"\"C_87\",\"C_54\",\"C_C\",1\n",
"\"C_65\",\"C_11\",\"C_C\",1\n",
"\"C_100\",\"C_41\",\"C_C\",1\n",
"\"C_91\",\"C_36\",\"C_C\",1\n",
"\"C_27\",\"B_28\",\"C_B\",1\n",
"\"C_05\",\"C_50\",\"C_C\",1\n",
"\"C_05\",\"B_15\",\"C_B\",1\n",
"\"C_34\",\"B_55\",\"C_B\",1\n",
"\"B_36\",\"A_06\",\"B_A\",1\n",
"\"C_79\",\"C_29\",\"C_C\",1\n",
"\"C_66\",\"C_16\",\"C_C\",1\n",
"\"C_82\",\"C_16\",\"C_C\",1\n",
"\"C_05\",\"B_14\",\"C_B\",1\n",
"\"C_51\",\"C_16\",\"C_C\",1\n",
"\"C_70\",\"C_16\",\"C_C\",1\n",
"\"C_21\",\"C_16\",\"C_C\",1\n",
"\"C_58\",\"B_48\",\"C_B\",1\n",
"\"C_69\",\"B_30\",\"C_B\",1\n",
"\"B_27\",\"A_04\",\"B_A\",1\n",
"\"C_26\",\"C_54\",\"C_C\",1\n",
"\"C_06\",\"B_06\",\"C_B\",1\n",
"\"C_39\",\"B_29\",\"C_B\",1\n",
"\"C_06\",\"B_07\",\"C_B\",1\n",
"\"C_58\",\"C_41\",\"C_C\",1\n",
"\"C_17\",\"B_12\",\"C_B\",1\n",
"\"C_72\",\"C_36\",\"C_C\",1\n",
"\"C_22\",\"B_20\",\"C_B\",1\n",
"\"C_23\",\"C_36\",\"C_C\",1\n",
"\"C_16\",\"B_20\",\"C_B\",1\n",
"\"C_16\",\"B_22\",\"C_B\",1\n",
"\"C_22\",\"B_22\",\"C_B\",1\n",
"\"C_55\",\"B_28\",\"C_B\",1\n",
"\"B_08\",\"A_01\",\"B_A\",1\n",
"\"B_09\",\"A_01\",\"B_A\",1\n",
"\"C_29\",\"C_50\",\"C_C\",1\n",
"\"C_16\",\"B_19\",\"C_B\",1\n",
"\"C_62\",\"B_55\",\"C_B\",1\n",
"\"B_32\",\"A_06\",\"B_A\",1\n",
"\"C_60\",\"B_55\",\"C_B\",1\n",
"\"C_57\",\"B_54\",\"C_B\",1\n",
"\"C_22\",\"C_16\",\"C_C\",1\n",
"\"C_52\",\"C_16\",\"C_C\",1\n",
"\"C_29\",\"B_14\",\"C_B\",1\n",
"\"C_16\",\"B_21\",\"C_B\",1\n",
"\"C_98\",\"C_16\",\"C_C\",1\n",
"\"C_18\",\"B_23\",\"C_B\",1\n",
"\"C_16\",\"B_18\",\"C_B\",1\n",
"\"C_63\",\"B_17\",\"C_B\",1\n",
"\"C_48\",\"C_36\",\"C_C\",1\n",
"\"C_48\",\"B_42\",\"C_B\",1\n",
"\"C_04\",\"C_08\",\"C_C\",1\n",
"\"C_44\",\"C_29\",\"C_C\",1\n",
"\"C_07\",\"C_16\",\"C_C\",1\n",
"\"C_85\",\"C_16\",\"C_C\",1\n",
"\"C_45\",\"B_14\",\"C_B\",1\n",
"\"C_37\",\"B_26\",\"C_B\",1\n",
"\"C_89\",\"B_41\",\"C_B\",1\n",
"\"C_48\",\"B_38\",\"C_B\",1\n",
"\"C_40\",\"B_38\",\"C_B\",1\n",
"\"C_93\",\"B_02\",\"C_B\",1\n",
"\"C_43\",\"B_10\",\"C_B\",1\n",
"\"C_84\",\"C_36\",\"C_C\",1\n",
"\"C_03\",\"C_50\",\"C_C\",1\n",
"\"C_84\",\"B_28\",\"C_B\",1\n",
"\"B_13\",\"A_03\",\"B_A\",1\n",
"\"C_09\",\"B_11\",\"C_B\",1\n",
"\"C_88\",\"B_43\",\"C_B\",1\n",
"\"C_10\",\"B_55\",\"C_B\",1\n",
"\"C_03\",\"C_42\",\"C_C\",1\n",
"\"C_101\",\"C_16\",\"C_C\",1\n",
"\"C_46\",\"C_16\",\"C_C\",1\n",
"\"C_03\",\"B_16\",\"C_B\",1\n",
"\"C_03\",\"C_24\",\"C_C\",1\n",
"\"C_76\",\"C_16\",\"C_C\",1\n",
"\"C_97\",\"C_16\",\"C_C\",1\n",
"\"C_88\",\"B_45\",\"C_B\",1\n",
"\"C_77\",\"B_45\",\"C_B\",1\n",
"\"B_51\",\"A_07\",\"B_A\",1\n",
"\"C_88\",\"B_44\",\"C_B\",1\n",
"\"C_41\",\"B_49\",\"C_B\",1\n",
"\"C_64\",\"C_36\",\"C_C\",1\n",
"\"C_75\",\"B_28\",\"C_B\",1\n",
"\"B_02\",\"A_00\",\"B_A\",1\n",
"\"C_68\",\"B_11\",\"C_B\",1\n",
"\"C_94\",\"B_55\",\"C_B\",1\n",
"\"C_32\",\"B_55\",\"C_B\",1\n",
"\"C_14\",\"B_55\",\"C_B\",1\n",
"\"C_95\",\"C_16\",\"C_C\",1\n",
"\"C_41\",\"B_47\",\"C_B\",1\n",
"\"C_30\",\"C_16\",\"C_C\",1\n",
"\"C_90\",\"C_16\",\"C_C\",1\n",
"\"C_49\",\"C_16\",\"C_C\",1\n",
"\"C_67\",\"C_00\",\"C_C\",1\n",
"\"C_41\",\"B_48\",\"C_B\",1\n",
"\"C_41\",\"B_50\",\"C_B\",1\n",
"\"C_59\",\"B_53\",\"C_B\",1\n",
"\"C_83\",\"B_01\",\"C_B\",1\n",
"\"C_33\",\"C_54\",\"C_C\",1\n",
"\"C_64\",\"C_54\",\"C_C\",1\n",
"\"C_41\",\"B_52\",\"C_B\",1\n",
"\"C_41\",\"B_51\",\"C_B\",1\n",
"\"C_67\",\"B_02\",\"C_B\",1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We may test the validity of edge types using the following function. "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def validate_edge_types(gr):\n",
" '''Print True or False, depending on edge type validity\n",
" Also, return a binary array of valid edge types\n",
" '''\n",
" sel = gl.SArray([\"%s_%s\" % (r['__src_id'][0], r['__dst_id'][0]) == r['edge_type'] for r in gr.edges])\n",
" print(np.all(sel))\n",
" return sel"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The following function simulates some activity"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def do_stuff(src, edge, dst):\n",
" edge['weight'] /= 1.0\n",
" return src, edge, dst"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's load the data, create a graph and test edge type validity:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[INFO] Start server at: ipc:///tmp/graphlab_server-54687 - Server binary: /Users/boris/anaconda/lib/python2.7/site-packages/sframe/unity_server - Server log: /tmp/sframe_server_1454405531.log\n",
"[INFO] GraphLab Server Version: 1.6\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"PROGRESS: Finished parsing file /Users/boris/temp/test_edges.csv\n",
"PROGRESS: Parsing completed. Parsed 100 lines in 0.022644 secs.\n",
"------------------------------------------------------\n",
"Inferred types from first line of file as \n",
"column_type_hints=[str,str,str,int]\n",
"If parsing fails due to incorrect types, you can correct\n",
"the inferred type list above and pass it to read_csv in\n",
"the column_type_hints argument\n",
"------------------------------------------------------\n",
"PROGRESS: Finished parsing file /Users/boris/temp/test_edges.csv\n",
"PROGRESS: Parsing completed. Parsed 150 lines in 0.009042 secs.\n",
"True\n"
]
}
],
"source": [
"gg = gl.SGraph(edges=gl.load_sframe('test_edges.csv'))\n",
"validate_edge_types(gg);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Good, now let's `triple_apply`"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"gg_after = gg.triple_apply(do_stuff, ['weight'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The input graph is still valid, but not the resulting one:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n",
"False\n",
"+----------+----------+-----------+--------+\n",
"| __src_id | __dst_id | edge_type | weight |\n",
"+----------+----------+-----------+--------+\n",
"| C_73 | B_24 | C_C | 1 |\n",
"| C_12 | C_41 | C_B | 1 |\n",
"| C_05 | B_14 | C_C | 1 |\n",
"| C_82 | C_16 | C_B | 1 |\n",
"| B_32 | A_06 | C_B | 1 |\n",
"| C_57 | B_54 | B_A | 1 |\n",
"| C_03 | B_16 | C_C | 1 |\n",
"| C_101 | C_16 | C_B | 1 |\n",
"| B_02 | A_00 | C_B | 1 |\n",
"| C_75 | B_28 | B_A | 1 |\n",
"+----------+----------+-----------+--------+\n",
"[? rows x 4 columns]\n",
"Note: Only the head of the SFrame is printed. This SFrame is lazily evaluated.\n",
"You can use len(sf) to force materialization.\n"
]
}
],
"source": [
"validate_edge_types(gg)\n",
"sel = validate_edge_types(gg_after)\n",
"print(gg_after.edges[1-sel])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Testing a workaround\n",
"\n",
"It [has been suggested](https://github.com/dato-code/SFrame/issues/157#issue-130537095) that including all the columns in the `mutated_fields` argument prevents the problem, at the expense of efficiency. Let's test it"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"gg_workaround = gg.triple_apply(do_stuff, ['edge_type', 'weight'])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n"
]
}
],
"source": [
"_ = validate_edge_types(gg_workaround)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment