Skip to content

Instantly share code, notes, and snippets.

@taijest
Created January 18, 2021 16:02
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save taijest/4d6254952a88aeb6fb81f861c5f1ede9 to your computer and use it in GitHub Desktop.
Save taijest/4d6254952a88aeb6fb81f861c5f1ede9 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Node Embedding"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 利用ライブラリ \n",
"**OpenNE : https://github.com/thunlp/OpenNE** \n",
"\n",
"Installation \n",
"```\n",
"git clone https://github.com/thunlp/OpenNE.git\n",
"pip install -r OpenNE/requirements.txt\n",
"python OpenNE/src/setup.py install\n",
"```\n",
"\n",
"\n",
"**StellarGraph : https://github.com/stellargraph/stellargraph** \n",
"\n",
"Installation\n",
"```\n",
"pip install stellargraph\n",
"```\n",
"\n",
"\n",
"**Karate Club : https://github.com/benedekrozemberczki/karateclub** \n",
"\n",
"Installation\n",
"```\n",
"pip install karateclub\n",
"```\n",
"\n",
"**ProNE : https://github.com/THUDM/ProNE** \n",
"\n",
"Installation\n",
"```\n",
"git clone https://github.com/lykeven/ProNE\n",
"pip install -r ProNE/requirements.txt\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Settings shared by the cells below.\n",
"INPUT_PATH = \"OpenNE/data/blogCatalog/bc_edgelist.txt\"  # edge list fed to the embedding commands\n",
"STORAGE_PATH = \"embeddings\"  # directory that receives the embedding files\n",
"DIMENSION = 128  # embedding dimensionality"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# -p keeps this cell re-runnable: plain mkdir errors once the directory exists.\n",
"!mkdir -p $STORAGE_PATH"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Embedding by OpenNE"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Embedding name -> path of the file holding that embedding; filled in by the cells below.\n",
"embeddings_map = dict()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# DeepWalk: one OpenNE run per window size.\n",
"for window_size in [1, 5, 10, 20]:\n",
"    # Build the name once and reuse it for the file and the embeddings_map key.\n",
"    deepwalk_name = f\"deepwalk_{ window_size }\"\n",
"    deepwalk_output_path = os.path.join(STORAGE_PATH, f\"{ deepwalk_name }.txt\")\n",
"    # Pass DIMENSION explicitly so the size matches the other libraries\n",
"    # instead of depending on OpenNE's default.\n",
"    !python -m openne --method deepWalk \\\n",
"    --input $INPUT_PATH --output $deepwalk_output_path \\\n",
"    --graph-format edgelist --window-size $window_size \\\n",
"    --representation-size $DIMENSION\n",
"    embeddings_map[deepwalk_name] = deepwalk_output_path"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# node2vec: one OpenNE run per (p, q) pair.\n",
"for p, q in [(1, 1), (0.5, 2), (2, 0.5)]:\n",
"    # Strip the decimal point so 0.5 becomes \"05\" in the name; build the name\n",
"    # once and reuse it for the file and the embeddings_map key.\n",
"    p_tag = str(p).replace('.', '')\n",
"    q_tag = str(q).replace('.', '')\n",
"    node2vec_name = f\"node2vec_{ p_tag }_{ q_tag }\"\n",
"    node2vec_output_path = os.path.join(STORAGE_PATH, f\"{ node2vec_name }.txt\")\n",
"    # Pass DIMENSION explicitly so the size matches the other libraries\n",
"    # instead of depending on OpenNE's default.\n",
"    !python -m openne --method node2vec \\\n",
"    --input $INPUT_PATH --output $node2vec_output_path \\\n",
"    --graph-format edgelist --p $p --q $q \\\n",
"    --representation-size $DIMENSION\n",
"    embeddings_map[node2vec_name] = node2vec_output_path"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# GraRep: one OpenNE run per k-step value.\n",
"# NOTE(review): OpenNE's GraRep presumably needs DIMENSION to be divisible by k - confirm.\n",
"for k in [1, 4, 8]:\n",
"    # Build the name once and reuse it for the file and the embeddings_map key.\n",
"    grarep_name = f\"grarep_{ k }\"\n",
"    grarep_output_path = os.path.join(STORAGE_PATH, f\"{ grarep_name }.txt\")\n",
"    # Pass DIMENSION explicitly so the size matches the other libraries\n",
"    # instead of depending on OpenNE's default.\n",
"    !python -m openne --method grarep \\\n",
"    --input $INPUT_PATH --output $grarep_output_path \\\n",
"    --graph-format edgelist --kstep $k \\\n",
"    --representation-size $DIMENSION\n",
"    embeddings_map[grarep_name] = grarep_output_path"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Graph Factorization\n",
"gf_output_path = os.path.join(STORAGE_PATH, \"gf.txt\")\n",
"# Pass DIMENSION explicitly so the size matches the other libraries\n",
"# instead of depending on OpenNE's default.\n",
"!python -m openne --method gf --input $INPUT_PATH \\\n",
"--output $gf_output_path --graph-format edgelist \\\n",
"--representation-size $DIMENSION\n",
"embeddings_map[\"gf\"] = gf_output_path"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# SDNE: run OpenNE once and register the output file.\n",
"sdne_output_path = os.path.join(STORAGE_PATH, \"sdne.txt\")\n",
"!python -m openne --method sdne --input {INPUT_PATH} --output {sdne_output_path} --graph-format edgelist\n",
"embeddings_map[\"sdne\"] = sdne_output_path"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Embedding by StellarGraph"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import networkx as nx\n",
"import tensorflow as tf\n",
"from stellargraph import StellarGraph\n",
"from stellargraph.layer import WatchYourStep\n",
"from stellargraph.losses import graph_log_likelihood\n",
"from stellargraph.mapper import AdjacencyPowerGenerator"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Read the edge list with integer node ids, then wrap the graph for StellarGraph.\n",
"G = nx.read_edgelist(path=INPUT_PATH, nodetype=int)\n",
"SG = StellarGraph.from_networkx(G)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Watch Your Step\n",
"wys_output_path = os.path.join(STORAGE_PATH, \"watch_your_step.txt\")\n",
"generator = AdjacencyPowerGenerator(SG, num_powers=10)\n",
"wys = WatchYourStep(\n",
"    generator,\n",
"    num_walks=80,\n",
"    embedding_dimension=DIMENSION,\n",
"    attention_regularizer=tf.keras.regularizers.l2(0.5),\n",
")\n",
"x_in, x_out = wys.in_out_tensors()\n",
"\n",
"batch_size = 10\n",
"# `epochs` must be assigned before model.fit uses it, otherwise the cell raises\n",
"# NameError on a fresh kernel. 100 is only a starting point - tune as needed.\n",
"epochs = 100\n",
"\n",
"model = tf.keras.Model(inputs=x_in, outputs=x_out)\n",
"model.compile(loss=graph_log_likelihood, optimizer=tf.keras.optimizers.Adam(1e-3))\n",
"train_gen = generator.flow(batch_size=batch_size, num_parallel_calls=10)\n",
"history = model.fit(\n",
"    train_gen,\n",
"    epochs=epochs,\n",
"    verbose=1,\n",
"    # One pass over the nodes per epoch, in batches of batch_size.\n",
"    steps_per_epoch=len(SG.nodes()) // batch_size,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"wys_embeddings = wys.embeddings()\n",
"node_list = list(SG.nodes())\n",
"\n",
"# Text format: a \"<num_nodes> <dim>\" header, then one \"<node> <v1> ... <vd>\" row per node.\n",
"with open(wys_output_path, 'wt') as fout:\n",
"    fout.write(f\"{ len(node_list) } { DIMENSION }\\n\")\n",
"    # Pair each node with its row of wys_embeddings.\n",
"    for node, embedding in zip(node_list, wys_embeddings):\n",
"        # str(node): the graph was read with nodetype=int, so node ids are not\n",
"        # strings and cannot be concatenated with text directly.\n",
"        fields = [str(node)] + [f\"{ fac }\" for fac in embedding]\n",
"        fout.write(\" \".join(fields) + \"\\n\")\n",
"embeddings_map[\"wys\"] = wys_output_path"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Embedding by Karate Club"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from karateclub import NetMF"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Map every node to its int() value and relabel the graph accordingly.\n",
"node_map = dict((v, int(v)) for v in G.nodes())\n",
"H = nx.relabel_nodes(G, node_map)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# NetMF: fit on the relabelled graph H.\n",
"netmf_output_path = os.path.join(STORAGE_PATH, \"netmf.txt\")\n",
"netmf = NetMF(\n",
"    dimensions=DIMENSION,\n",
"    order=10,\n",
"    negative_samples=5,\n",
")\n",
"netmf.fit(H)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"netmf_embeddings = netmf.get_embedding()\n",
"# Text format: a \"<num_nodes> <dim>\" header, then one \"<node> <v1> ... <vd>\" row per node.\n",
"with open(netmf_output_path, 'wt') as fout:\n",
"    fout.write(f\"{ len(node_list) } { DIMENSION }\\n\")\n",
"    for node in node_list:\n",
"        # Row int(node) of the embedding matrix is written for this node.\n",
"        # NOTE(review): assumes node ids are 0..n-1 so they index rows directly - confirm.\n",
"        # str(node): node ids are ints here and cannot be concatenated with text directly.\n",
"        fields = [str(node)] + [f\"{ fac }\" for fac in netmf_embeddings[int(node)]]\n",
"        fout.write(\" \".join(fields) + \"\\n\")\n",
"embeddings_map[\"netmf\"] = netmf_output_path"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Embedding by ProNE"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ProNE: the script writes two embedding files; the second one is registered.\n",
"prone_output_1_path = os.path.join(STORAGE_PATH, \"prone_1.txt\")\n",
"prone_output_2_path = os.path.join(STORAGE_PATH, \"prone_2.txt\")\n",
"\n",
"!python ProNE/proNE.py -graph {INPUT_PATH} \\\n",
"-emb1 {prone_output_1_path} -emb2 {prone_output_2_path} \\\n",
"-dimension {DIMENSION} -step 10 -theta 0.5 -mu 0.2\n",
"\n",
"embeddings_map[\"prone\"] = prone_output_2_path"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## RPD Calculation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.manifold import MDS"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def RPD(emb_1, emb_2):\n",
"    \"\"\"Return the relative pairwise inner-product distance of two embeddings.\n",
"\n",
"    Each input is divided by its standard deviation along axis 0, its Gram\n",
"    matrix ``E @ E.T`` is formed, and the result is\n",
"    ``norm(G1 - G2) ** 2 / (2 * norm(G1) * norm(G2))`` with ``np.linalg.norm``\n",
"    at its default settings.\n",
"\n",
"    Args:\n",
"        emb_1: first embedding array.\n",
"        emb_2: second embedding array; ``G1 - G2`` requires both Gram matrices\n",
"            to have the same shape.\n",
"\n",
"    Returns:\n",
"        The distance as a scalar.\n",
"    \"\"\"\n",
"    def _gram(emb):\n",
"        # Scale along axis 0, then take all pairwise inner products.\n",
"        scaled = emb / np.std(emb, axis=0)\n",
"        return np.dot(scaled, scaled.T)\n",
"\n",
"    gram_1 = _gram(emb_1)\n",
"    gram_2 = _gram(emb_2)\n",
"    numerator = np.linalg.norm(gram_1 - gram_2) ** 2\n",
"    denominator = 2 * np.linalg.norm(gram_1) * np.linalg.norm(gram_2)\n",
"    return numerator / denominator"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def load_embedding(file_path):\n",
"    \"\"\"Load a saved embedding file as an array ordered like ``node_list``.\n",
"\n",
"    The first line of the file is treated as a header and skipped. Every other\n",
"    non-blank line is read as ``<node> <v1> <v2> ...`` separated by whitespace.\n",
"\n",
"    Args:\n",
"        file_path: path of the embedding text file.\n",
"\n",
"    Returns:\n",
"        ``np.ndarray`` with one row per entry of the global ``node_list``,\n",
"        in that order.\n",
"\n",
"    Raises:\n",
"        KeyError: if a node of ``node_list`` has no row in the file.\n",
"    \"\"\"\n",
"    dict_embeddings = {}\n",
"    with open(file_path, 'rt') as fin:\n",
"        next(fin, None)  # skip the header line\n",
"        for line in fin:\n",
"            fact = line.split()\n",
"            if not fact:\n",
"                # Blank line (e.g. trailing newline at end of file).\n",
"                continue\n",
"            dict_embeddings[fact[0]] = [float(x) for x in fact[1:]]\n",
"    # Keys read from the file are strings while node_list may hold ints,\n",
"    # so look each node up by its string form.\n",
"    return np.array([dict_embeddings[str(node)] for node in node_list])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load every embedding once: RPD works on the arrays, not on the file paths.\n",
"loaded_embeddings = {\n",
"    emb_name: load_embedding(emb_path)\n",
"    for emb_name, emb_path in embeddings_map.items()\n",
"}\n",
"\n",
"# One column per embedding plus an \"idx\" column that names each row.\n",
"rpd_dict = {emb_name: [] for emb_name in loaded_embeddings}\n",
"rpd_dict[\"idx\"] = []\n",
"for emb_name_1, emb_1 in loaded_embeddings.items():\n",
"    rpd_dict[\"idx\"].append(emb_name_1)\n",
"    for emb_name_2, emb_2 in loaded_embeddings.items():\n",
"        rpd = RPD(emb_1, emb_2)\n",
"        rpd_dict[emb_name_2].append(round(rpd, 3))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Square table of pairwise RPD values, indexed by embedding name.\n",
"df_rpd = pd.DataFrame.from_dict(rpd_dict).set_index(\"idx\", drop=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Visualization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fix random_state so the 2-D layout is the same on every run.\n",
"mds = MDS(n_components=2, dissimilarity=\"precomputed\", random_state=33)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Project the RPD table to two dimensions.\n",
"X_2d = mds.fit_transform(df_rpd.to_numpy())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig, ax = plt.subplots(figsize=(9, 6), dpi=200)\n",
"\n",
"# Label each point with its embedding name, then draw the points.\n",
"for emb_name, (x, y) in zip(df_rpd.index, X_2d):\n",
"    ax.text(x, y, emb_name)\n",
"\n",
"ax.scatter(X_2d[:, 0], X_2d[:, 1], c=\"g\", alpha=0.6, s=150)\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Downstream Task : Node Classification"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from openne.classify import read_node_label, Classifier\n",
"from sklearn.linear_model import LogisticRegression"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Node labels used by the classification cells below.\n",
"LABEL_PATH = \"OpenNE/data/blogCatalog/bc_labels.txt\"\n",
"node_label_data = read_node_label(LABEL_PATH)\n",
"X, Y = node_label_data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"MAX_ITER = 10000\n",
"def eval_embedding(file_path):\n",
"    \"\"\"Score one saved embedding on node classification.\n",
"\n",
"    Loads the embedding, trains a logistic-regression classifier through\n",
"    OpenNE's ``Classifier`` on the global ``X`` / ``Y`` labels (0.8 passed as\n",
"    the split argument, seed 33) and returns two of the reported scores.\n",
"\n",
"    Args:\n",
"        file_path: path of the embedding text file.\n",
"\n",
"    Returns:\n",
"        Tuple ``(micro, macro)`` taken from the result's 'micro' and 'macro' keys.\n",
"    \"\"\"\n",
"    embedding = load_embedding(file_path)\n",
"    # Hand Classifier a mapping keyed by string node id instead of a bare array.\n",
"    # NOTE(review): OpenNE's Classifier is expected to index `vectors` with the\n",
"    # string node ids returned by read_node_label - confirm against openne/classify.py.\n",
"    vectors = {str(node): vector for node, vector in zip(node_list, embedding)}\n",
"    clf = Classifier(vectors=vectors, clf=LogisticRegression(max_iter=MAX_ITER))\n",
"    result = clf.split_train_evaluate(X, Y, 0.8, seed=33)\n",
"    return result['micro'], result['macro']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Collect micro / macro F1 for every registered embedding.\n",
"eval_dict = {'idx': [], 'micro_f1': [], 'macro_f1': []}\n",
"for emb_name, emb_path in embeddings_map.items():\n",
"    scores = eval_embedding(emb_path)\n",
"    eval_dict['idx'].append(emb_name)\n",
"    eval_dict['micro_f1'].append(scores[0])\n",
"    eval_dict['macro_f1'].append(scores[1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One row per embedding with its micro / macro F1.\n",
"df_eval = pd.DataFrame(eval_dict)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "n2v69",
"language": "python",
"name": "n2v69"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment