{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Node Embedding"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 利用ライブラリ \n", | |
"**OpenNE : https://github.com/thunlp/OpenNE ** \n", | |
"\n", | |
"Installation \n", | |
"```\n", | |
"git clone https://github.com/thunlp/OpenNE.git\n", | |
"pip install -r OpenNE/requirements.txt\n", | |
"python OpenNE/src/setup.py install\n", | |
"```\n", | |
"\n", | |
"\n", | |
"**StellarGraph : https://github.com/stellargraph/stellargraph** \n", | |
"\n", | |
"Installation\n", | |
"```\n", | |
"pip install stellargraph\n", | |
"```\n", | |
"\n", | |
"\n", | |
"**Karate Club : https://github.com/benedekrozemberczki/karateclub** \n", | |
"\n", | |
"Installation\n", | |
"```\n", | |
"pip install karateclub\n", | |
"```\n", | |
"\n", | |
"**ProNE : https://github.com/THUDM/ProNE** \n", | |
"\n", | |
"Installation\n", | |
"```\n", | |
"git clone https://github.com/lykeven/ProNE\n", | |
"pip install -r ProNE/requirements.txt\n", | |
"```" | |
] | |
}, | |
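{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick, optional sanity check after installation (a minimal sketch; ProNE is used as a cloned script rather than an installed package, so only the other three libraries are imported):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Confirm the installed packages import cleanly before running the notebook\n",
"import openne\n",
"import stellargraph\n",
"import karateclub\n",
"print(\"imports OK\")"
]
},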
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"STORAGE_PATH = \"embeddings\"\n",
"INPUT_PATH = \"OpenNE/data/blogCatalog/bc_edgelist.txt\"\n",
"DIMENSION = 128"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!mkdir $STORAGE_PATH" | |
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Embedding by OpenNE"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"embeddings_map = {}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# DeepWalk\n", | |
"for window_size in [1, 5, 10, 20]:\n", | |
" deepwalk_output_path = os.path.join(\n", | |
" STORAGE_PATH, f\"deepwalk_{ window_size }.txt\"\n", | |
" )\n", | |
" !python -m openne --method deepWalk \\\n", | |
" --input $INPUT_PATH --output $deepwalk_output_path \\\n", | |
" --graph-format edgelist --window-size $window_size\n", | |
" embeddings_map[f\"deepwalk_{ window_size }\"] = deepwalk_output_path" | |
]
},
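{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional: peek at one output file. OpenNE saves embeddings in the word2vec text format, whose first line should read `<num_nodes> <dimension>` (a minimal sketch; assumes the DeepWalk runs above completed):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The header line should show the node count and DIMENSION (128)\n",
"with open(embeddings_map[\"deepwalk_10\"]) as fin:\n",
"    print(fin.readline().strip())"
]
},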
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# node2vec\n", | |
"for p, q in [(1, 1), (0.5, 2), (2, 0.5)]:\n", | |
" node2vec_output_path = os.path.join(\n", | |
" STORAGE_PATH, \n", | |
" f\"node2vec_{ str(p).replace('.', '') }_{ str(q).replace('.', '') }.txt\"\n", | |
" )\n", | |
" !python -m openne --method node2vec \\\n", | |
" --input $INPUT_PATH --output $node2vec_output_path \\\n", | |
" --graph-format edgelist --p $p --q $q\n", | |
" embeddings_map[\n", | |
" f\"node2vec_{ str(p).replace('.', '') }_{ str(q).replace('.', '') }\"\n", | |
" ] = node2vec_output_path" | |
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Grarep\n", | |
"for k in [1, 4, 8]:\n", | |
" grarep_output_path = os.path.join(\n", | |
" STORAGE_PATH, f\"grarep_{ k }.txt\"\n", | |
" )\n", | |
" !python -m openne --method grarep \\\n", | |
" --input $INPUT_PATH --output $grarep_output_path \\\n", | |
" --graph-format edgelist --kstep $k\n", | |
" embeddings_map[f\"grarep_{ k }\"] = grarep_output_path" | |
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Graph Factorization\n", | |
"gf_output_path = os.path.join(STORAGE_PATH, \"gf.txt\")\n", | |
"!python -m openne --method gf --input $INPUT_PATH \\\n", | |
"--output $gf_output_path --graph-format edgelist\n", | |
"embeddings_map[\"gf\"] = gf_output_path" | |
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# SDNE\n", | |
"sdne_output_path = os.path.join(STORAGE_PATH, \"sdne.txt\")\n", | |
"!python -m openne --method sdne --input $INPUT_PATH \\\n", | |
"--output $sdne_output_path --graph-format edgelist\n", | |
"embeddings_map[\"sdne\"] = sdne_output_path" | |
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Embedding by StellarGraph"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import networkx as nx\n",
"import tensorflow as tf\n",
"from stellargraph import StellarGraph\n",
"from stellargraph.layer import WatchYourStep\n",
"from stellargraph.losses import graph_log_likelihood\n",
"from stellargraph.mapper import AdjacencyPowerGenerator"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"G = nx.read_edgelist(INPUT_PATH, nodetype=int)\n",
"SG = StellarGraph.from_networkx(G)"
]
},
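{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional: summarize the loaded graph (`StellarGraph.info()` reports node and edge counts):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(SG.info())"
]
},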
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Watch Your Step\n", | |
"wys_output_path = os.path.join(STORAGE_PATH, \"watch_your_step.txt\")\n", | |
"generator = AdjacencyPowerGenerator(SG, num_powers=10)\n", | |
"wys = WatchYourStep(\n", | |
" generator,\n", | |
" num_walks=80,\n", | |
" embedding_dimension=DIMENSION,\n", | |
" attention_regularizer=tf.keras.regularizers.l2(0.5),\n", | |
")\n", | |
"x_in, x_out = wys.in_out_tensors()\n", | |
"\n", | |
"batch_size = 10\n", | |
"\n", | |
"model = tf.keras.Model(inputs=x_in, outputs=x_out)\n", | |
"model.compile(loss=graph_log_likelihood, optimizer=tf.keras.optimizers.Adam(1e-3))\n", | |
"train_gen = generator.flow(batch_size=batch_size, num_parallel_calls=10)\n", | |
"history = model.fit(\n", | |
" train_gen, epochs=epochs, verbose=1, \\\n", | |
" steps_per_epoch=int(len(SG.nodes()) // batch_size)\n", | |
")" | |
]
},
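{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, inspect convergence of the Watch Your Step training (a minimal sketch using the Keras `history` object returned by `model.fit` above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# Plot the graph log-likelihood loss per epoch recorded by model.fit\n",
"plt.plot(history.history[\"loss\"])\n",
"plt.xlabel(\"epoch\")\n",
"plt.ylabel(\"loss\")\n",
"plt.show()"
]
},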
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"wys_embeddings = wys.embeddings()\n", | |
"node_list = list(SG.nodes())\n", | |
"\n", | |
"with open(wys_output_path, 'wt') as fout:\n", | |
" fout.write(f\"{ len(node_list) } { DIMENSION }\\n\")\n", | |
" for node, embedding in zip(node_list, embeddings):\n", | |
" row = node\n", | |
" for fac in embedding:\n", | |
" row += f\" { fac }\"\n", | |
" row += \"\\n\"\n", | |
" fout.write(row)\n", | |
"embeddings_map[\"wys\"] = wys_output_path" | |
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Embedding by Karate Club"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from karateclub import NetMF"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"node_map = {v: int(v) for v in G.nodes()}\n", | |
"H = nx.relabel_nodes(G, node_map)" | |
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# NetMF\n",
"netmf_output_path = os.path.join(STORAGE_PATH, \"netmf.txt\")\n",
"netmf = NetMF(dimensions=DIMENSION, order=10, negative_samples=5)\n",
"netmf.fit(H)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"netmf_embeddings = netmf.get_embedding()\n", | |
"with open(netmf_output_path, 'wt') as fout:\n", | |
" fout.write(f\"{ len(node_list) } { DIMENSION }\\n\")\n", | |
" for node in node_list:\n", | |
" row = node\n", | |
" for fac in netmf_embeddings[int(node)]:\n", | |
" row += f\" { fac }\"\n", | |
" row += \"\\n\"\n", | |
" fout.write(row)\n", | |
"embeddings_map[\"netmf\"] = netmf_output_path" | |
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Emnbedding by ProNE" | |
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ProNE\n", | |
"prone_output_1_path = os.path.join(STORAGE_PATH, \"prone_1.txt\")\n", | |
"prone_output_2_path = os.path.join(STORAGE_PATH, \"prone_2.txt\")\n", | |
"\n", | |
"!python ProNE/proNE.py -graph $INPUT_PATH \\\n", | |
"-emb1 $prone_output_1_path -emb2 $prone_output_2_path \\\n", | |
"-dimension $DIMENSION -step 10 -theta 0.5 -mu 0.2\n", | |
"\n", | |
"embeddings_map[\"prone\"] = prone_output_2_path" | |
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Calculation RPD" | |
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.manifold import MDS"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def RPD(emb_1, emb_2):\n", | |
" normed_emb_1 = emb_1 / np.std(emb_1, axis=0)\n", | |
" normed_emb_2 = emb_2 / np.std(emb_2, axis=0)\n", | |
" sim_matrix_1 = np.dot(normed_emb_1, normed_emb_1.T)\n", | |
" sim_matrix_2 = np.dot(normed_emb_2, normed_emb_2.T)\n", | |
" return (\n", | |
" np.linalg.norm(sim_matrix_1 - sim_matrix_2)\n", | |
" )**2 / (\n", | |
" 2 * np.linalg.norm(sim_matrix_1) * np.linalg.norm(sim_matrix_2)\n", | |
" )" | |
]
},
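{
"cell_type": "markdown",
"metadata": {},
"source": [
"In matrix form, with $\\hat{E}_i$ the per-dimension standardized embedding and $S_i = \\hat{E}_i \\hat{E}_i^{\\top}$ its node-similarity matrix, the function above computes\n",
"\n",
"$$\\mathrm{RPD}(E_1, E_2) = \\frac{\\lVert S_1 - S_2 \\rVert_F^2}{2\\,\\lVert S_1 \\rVert_F\\,\\lVert S_2 \\rVert_F}$$"
]
},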
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def load_embedding(file_path):\n", | |
" dict_embeddings = {}\n", | |
" with open(file_path, 'rt') as fin:\n", | |
" text = fin.read()\n", | |
" for line in text.split(\"\\n\")[1:-1]:\n", | |
" fact = line.split(\" \")\n", | |
" dict_embeddings[fact[0]] = [float(x) for x in fact[1:]]\n", | |
" return np.array([dict_embeddings[node] for node in node_list])" | |
]
},
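{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional usage check for `load_embedding` (a minimal sketch; any key from `embeddings_map` works):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Expect a (num_nodes, DIMENSION) array aligned with node_list\n",
"emb = load_embedding(embeddings_map[\"deepwalk_10\"])\n",
"print(emb.shape)"
]
},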
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rpd_dict = {emb_name: [] for emb_name in embeddings_map.keys()}\n", | |
"rpd_dict[\"idx\"] = []\n", | |
"for emb_name_1, emb_path_1 in embeddings_map.keys():\n", | |
" rpd_dict[\"idx\"].append(emb_name_1)\n", | |
" for emb_name_2, emb_path_2 in embeddings_map.keys():\n", | |
" rpd = RPD(emb_path_1, emb_path_2)\n", | |
" rap_dict[emb_2].append(round(rpd, 3))" | |
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_rpd = pd.DataFrame.from_dict(rpd_dict)\n",
"df_rpd.set_index(\"idx\", drop=True, inplace=True)"
]
},
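{
"cell_type": "markdown",
"metadata": {},
"source": [
"A heatmap gives a quick overview of the RPD matrix before projecting it with MDS (a minimal sketch using the matplotlib import above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(8, 6), dpi=150)\n",
"plt.imshow(df_rpd.values, cmap=\"viridis\")\n",
"plt.xticks(range(len(df_rpd.columns)), df_rpd.columns, rotation=90)\n",
"plt.yticks(range(len(df_rpd.index)), df_rpd.index)\n",
"plt.colorbar(label=\"RPD\")\n",
"plt.show()"
]
},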
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Visualization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mds = MDS(n_components=2, dissimilarity=\"precomputed\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"X_2d = mds.fit_transform(df_rpd.values)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"plt.figure(figsize=(9,6), dpi=200)\n", | |
" \n", | |
"for (x, y), emb_name in zip(X_2d, df_rpd.index):\n", | |
" plt.text(x, y, emb_name)\n", | |
"\n", | |
"plt.scatter(X_2d[:, 0], X_2d[:, 1], c=\"g\", alpha=0.6, s=150)\n", | |
"\n", | |
"plt.show()" | |
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Downstream Task : Node Classification"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from openne.classify import read_node_label, Classifier\n", | |
"from sklearn.linear_model import LogisticRegression" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"LABEL_PATH = \"OpenNE/data/blogCatalog/bc_labels.txt\"\n", | |
"X, Y = read_node_label(LABEL_PATH)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"MAX_ITER = 10000\n", | |
"def eval_embedding(file_path):\n", | |
" embedding = load_embedding(file_path)\n", | |
" clf = Classifier(vectors=embedding, clf=LogisticRegression(max_iter=MAX_ITER))\n", | |
" result = clf.split_train_evaluate(X, Y, 0.8, seed=33)\n", | |
" return result['micro'], result['macro']" | |
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"eval_dict = {\n",
"    'idx': list(embeddings_map.keys()),\n",
"    'micro_f1': [],\n",
"    'macro_f1': []\n",
"}\n",
"for emb_path in embeddings_map.values():\n",
"    micro, macro = eval_embedding(emb_path)\n",
"    eval_dict['micro_f1'].append(micro)\n",
"    eval_dict['macro_f1'].append(macro)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_eval = pd.DataFrame.from_dict(eval_dict)" | |
]
}
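,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, a quick comparison view (a minimal sketch): sort the methods by micro-F1 on the downstream task."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Higher micro/macro F1 indicates a more useful embedding for classification\n",
"df_eval.sort_values(\"micro_f1\", ascending=False)"
]
}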
],
"metadata": {
"kernelspec": {
"display_name": "n2v69",
"language": "python",
"name": "n2v69"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}