taijest/RPD_node_embedding.ipynb

## RPD_node_embedding.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Node Embedding"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 利用ライブラリ  \n",
    "**OpenNE : https://github.com/thunlp/OpenNE  **  \n",
    "\n",
    "Installation  \n",
    "```\n",
    "git clone https://github.com/thunlp/OpenNE.git\n",
    "pip install -r OpenNE/requirements.txt\n",
    "python OpenNE/src/setup.py install\n",
    "```\n",
    "\n",
    "\n",
    "**StellarGraph : https://github.com/stellargraph/stellargraph**  \n",
    "\n",
    "Installation\n",
    "```\n",
    "pip install stellargraph\n",
    "```\n",
    "\n",
    "\n",
    "**Karate Club : https://github.com/benedekrozemberczki/karateclub**  \n",
    "\n",
    "Installation\n",
    "```\n",
    "pip install karateclub\n",
    "```\n",
    "\n",
    "**ProNE : https://github.com/THUDM/ProNE**  \n",
    "\n",
    "Installation\n",
    "```\n",
    "git clone https://github.com/lykeven/ProNE\n",
    "pip install -r ProNE/requirements.txt\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "STORAGE_PATH = \"embeddings\"\n",
    "INPUT_PATH = \"OpenNE/data/blogCatalog/bc_edgelist.txt\"\n",
    "DIMENSION = 128"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!mkdir $STORAGE_PATH"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Embedding by OpenNE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "embeddings_map = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# DeepWalk\n",
    "for window_size in [1, 5, 10, 20]:\n",
    "    deepwalk_output_path = os.path.join(\n",
    "        STORAGE_PATH, f\"deepwalk_{ window_size }.txt\"\n",
    "    )\n",
    "    !python -m openne --method deepWalk \\\n",
    "    --input $INPUT_PATH --output $deepwalk_output_path \\\n",
    "    --graph-format edgelist --window-size $window_size\n",
    "    embeddings_map[f\"deepwalk_{ window_size }\"] = deepwalk_output_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# node2vec\n",
    "for p, q in [(1, 1), (0.5, 2), (2, 0.5)]:\n",
    "    node2vec_output_path = os.path.join(\n",
    "        STORAGE_PATH, \n",
    "        f\"node2vec_{ str(p).replace('.', '') }_{ str(q).replace('.', '') }.txt\"\n",
    "    )\n",
    "    !python -m openne --method node2vec \\\n",
    "    --input $INPUT_PATH --output $node2vec_output_path \\\n",
    "    --graph-format edgelist --p $p --q $q\n",
    "    embeddings_map[\n",
    "        f\"node2vec_{ str(p).replace('.', '') }_{ str(q).replace('.', '') }\"\n",
    "    ] = node2vec_output_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Grarep\n",
    "for k in [1, 4, 8]:\n",
    "    grarep_output_path = os.path.join(\n",
    "        STORAGE_PATH, f\"grarep_{ k }.txt\"\n",
    "    )\n",
    "    !python -m openne --method grarep \\\n",
    "    --input $INPUT_PATH --output $grarep_output_path \\\n",
    "    --graph-format edgelist --kstep $k\n",
    "    embeddings_map[f\"grarep_{ k }\"] = grarep_output_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Graph Factorization\n",
    "gf_output_path = os.path.join(STORAGE_PATH, \"gf.txt\")\n",
    "!python -m openne --method gf --input $INPUT_PATH \\\n",
    "--output $gf_output_path --graph-format edgelist\n",
    "embeddings_map[\"gf\"] = gf_output_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SDNE\n",
    "sdne_output_path = os.path.join(STORAGE_PATH, \"sdne.txt\")\n",
    "!python -m openne --method sdne --input $INPUT_PATH \\\n",
    "--output $sdne_output_path --graph-format edgelist\n",
    "embeddings_map[\"sdne\"] = sdne_output_path"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Embedding by StellarGraph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import networkx as nx\n",
    "import tensorflow as tf\n",
    "from stellargraph import StellarGraph\n",
    "from stellargraph.layer import WatchYourStep\n",
    "from stellargraph.losses import graph_log_likelihood\n",
    "from stellargraph.mapper import AdjacencyPowerGenerator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "G = nx.read_edgelist(INPUT_PATH, nodetype=int)\n",
    "SG = StellarGraph.from_networkx(G)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Watch Your Step\n",
    "wys_output_path = os.path.join(STORAGE_PATH, \"watch_your_step.txt\")\n",
    "generator = AdjacencyPowerGenerator(SG, num_powers=10)\n",
    "wys = WatchYourStep(\n",
    "    generator,\n",
    "    num_walks=80,\n",
    "    embedding_dimension=DIMENSION,\n",
    "    attention_regularizer=tf.keras.regularizers.l2(0.5),\n",
    ")\n",
    "x_in, x_out = wys.in_out_tensors()\n",
    "\n",
    "batch_size = 10\n",
    "\n",
    "model = tf.keras.Model(inputs=x_in, outputs=x_out)\n",
    "model.compile(loss=graph_log_likelihood, optimizer=tf.keras.optimizers.Adam(1e-3))\n",
    "train_gen = generator.flow(batch_size=batch_size, num_parallel_calls=10)\n",
    "history = model.fit(\n",
    "    train_gen, epochs=epochs, verbose=1, \\\n",
    "    steps_per_epoch=int(len(SG.nodes()) // batch_size)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "wys_embeddings = wys.embeddings()\n",
    "node_list = list(SG.nodes())\n",
    "\n",
    "with open(wys_output_path, 'wt') as fout:\n",
    "    fout.write(f\"{ len(node_list) } { DIMENSION }\\n\")\n",
    "    for node, embedding in zip(node_list, embeddings):\n",
    "        row = node\n",
    "        for fac in embedding:\n",
    "            row += f\" { fac }\"\n",
    "        row += \"\\n\"\n",
    "        fout.write(row)\n",
    "embeddings_map[\"wys\"] = wys_output_path"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Embedding by Karate Club"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from karateclub import NetMF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "node_map = {v: int(v)  for v in G.nodes()}\n",
    "H = nx.relabel_nodes(G, node_map)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NetMF\n",
    "netmf_output_path = os.path.join(STORAGE_PATH, \"netmf.txt\")\n",
    "netmf = NetMF(dimensions=DIMENSION, order=10, negative_samples=5)\n",
    "netmf.fit(H)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "netmf_embeddings = netmf.get_embedding()\n",
    "with open(netmf_output_path, 'wt') as fout:\n",
    "    fout.write(f\"{ len(node_list) } { DIMENSION }\\n\")\n",
    "    for node in node_list:\n",
    "        row = node\n",
    "        for fac in netmf_embeddings[int(node)]:\n",
    "            row += f\" { fac }\"\n",
    "        row += \"\\n\"\n",
    "        fout.write(row)\n",
    "embeddings_map[\"netmf\"] = netmf_output_path"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Emnbedding by ProNE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ProNE\n",
    "prone_output_1_path = os.path.join(STORAGE_PATH, \"prone_1.txt\")\n",
    "prone_output_2_path = os.path.join(STORAGE_PATH, \"prone_2.txt\")\n",
    "\n",
    "!python ProNE/proNE.py -graph $INPUT_PATH \\\n",
    "-emb1 $prone_output_1_path -emb2 $prone_output_2_path \\\n",
    "-dimension $DIMENSION -step 10 -theta 0.5 -mu 0.2\n",
    "\n",
    "embeddings_map[\"prone\"] = prone_output_2_path"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculation RPD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.manifold import MDS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def RPD(emb_1, emb_2):\n",
    "    normed_emb_1 = emb_1 / np.std(emb_1, axis=0)\n",
    "    normed_emb_2 = emb_2 / np.std(emb_2, axis=0)\n",
    "    sim_matrix_1 = np.dot(normed_emb_1, normed_emb_1.T)\n",
    "    sim_matrix_2 = np.dot(normed_emb_2, normed_emb_2.T)\n",
    "    return (\n",
    "        np.linalg.norm(sim_matrix_1 - sim_matrix_2)\n",
    "    )**2 / (\n",
    "        2 * np.linalg.norm(sim_matrix_1) * np.linalg.norm(sim_matrix_2)\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_embedding(file_path):\n",
    "    dict_embeddings = {}\n",
    "    with open(file_path, 'rt') as fin:\n",
    "        text = fin.read()\n",
    "    for line in text.split(\"\\n\")[1:-1]:\n",
    "        fact = line.split(\" \")\n",
    "        dict_embeddings[fact[0]] = [float(x) for x in fact[1:]]\n",
    "    return np.array([dict_embeddings[node] for node in node_list])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rpd_dict = {emb_name: [] for emb_name in embeddings_map.keys()}\n",
    "rpd_dict[\"idx\"] = []\n",
    "for emb_name_1, emb_path_1 in embeddings_map.keys():\n",
    "    rpd_dict[\"idx\"].append(emb_name_1)\n",
    "    for emb_name_2, emb_path_2 in embeddings_map.keys():\n",
    "        rpd = RPD(emb_path_1, emb_path_2)\n",
    "        rap_dict[emb_2].append(round(rpd, 3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_rpd = pd.DataFrame.from_dict(rpd_dict)\n",
    "df_rpd.set_index(\"idx\", drop=True, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mds = MDS(n_components=2, dissimilarity=\"precomputed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_2d = mds.fit_transform(df_rpd.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(9,6), dpi=200)\n",
    "    \n",
    "for (x, y), emb_name in zip(X_2d, df_rpd.index):\n",
    "    plt.text(x, y, emb_name)\n",
    "\n",
    "plt.scatter(X_2d[:, 0], X_2d[:, 1], c=\"g\", alpha=0.6, s=150)\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Downstream Task : Node Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from openne.classify import read_node_label, Classifier\n",
    "from sklearn.linear_model import LogisticRegression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "LABEL_PATH = \"OpenNE/data/blogCatalog/bc_labels.txt\"\n",
    "X, Y = read_node_label(LABEL_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "MAX_ITER = 10000\n",
    "def eval_embedding(file_path):\n",
    "    embedding = load_embedding(file_path)\n",
    "    clf = Classifier(vectors=embedding, clf=LogisticRegression(max_iter=MAX_ITER))\n",
    "    result = clf.split_train_evaluate(X, Y, 0.8, seed=33)\n",
    "    return result['micro'], result['macro']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "eval_dict = {\n",
    "    'idx': list(embeddings_map.keys()),\n",
    "    'micro_f1': [],\n",
    "    'macro_f1': []\n",
    "}\n",
    "for emb_path in embeddings_map.values():\n",
    "    micro, macro = eval_embedding(emb_path)\n",
    "    eval_dict['micro_f1'].append(micro)\n",
    "    eval_dict['macro_f1'].append(macro)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_eval = pd.DataFrame.from_dict(eval_dict)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "n2v69",
   "language": "python",
   "name": "n2v69"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Node Embedding"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### 利用ライブラリ \n",
	"OpenNE : https://github.com/thunlp/OpenNE \n",
	"\n",
	"Installation \n",
	"```\n",
	"git clone https://github.com/thunlp/OpenNE.git\n",
	"pip install -r OpenNE/requirements.txt\n",
	"python OpenNE/src/setup.py install\n",
	"```\n",
	"\n",
	"\n",
	"StellarGraph : https://github.com/stellargraph/stellargraph \n",
	"\n",
	"Installation\n",
	"```\n",
	"pip install stellargraph\n",
	"```\n",
	"\n",
	"\n",
	"Karate Club : https://github.com/benedekrozemberczki/karateclub \n",
	"\n",
	"Installation\n",
	"```\n",
	"pip install karateclub\n",
	"```\n",
	"\n",
	"ProNE : https://github.com/THUDM/ProNE \n",
	"\n",
	"Installation\n",
	"```\n",
	"git clone https://github.com/lykeven/ProNE\n",
	"pip install -r ProNE/requirements.txt\n",
	"```"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"STORAGE_PATH = \"embeddings\"\n",
	"INPUT_PATH = \"OpenNE/data/blogCatalog/bc_edgelist.txt\"\n",
	"DIMENSION = 128"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"!mkdir $STORAGE_PATH"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### Embedding by OpenNE"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import os"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"embeddings_map = {}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# DeepWalk\n",
	"for window_size in [1, 5, 10, 20]:\n",
	" deepwalk_output_path = os.path.join(\n",
	" STORAGE_PATH, f\"deepwalk_{ window_size }.txt\"\n",
	" )\n",
	" !python -m openne --method deepWalk \\\n",
	" --input $INPUT_PATH --output $deepwalk_output_path \\\n",
	" --graph-format edgelist --window-size $window_size\n",
	" embeddings_map[f\"deepwalk_{ window_size }\"] = deepwalk_output_path"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# node2vec\n",
	"for p, q in [(1, 1), (0.5, 2), (2, 0.5)]:\n",
	" node2vec_output_path = os.path.join(\n",
	" STORAGE_PATH, \n",
	" f\"node2vec_{ str(p).replace('.', '') }_{ str(q).replace('.', '') }.txt\"\n",
	" )\n",
	" !python -m openne --method node2vec \\\n",
	" --input $INPUT_PATH --output $node2vec_output_path \\\n",
	" --graph-format edgelist --p $p --q $q\n",
	" embeddings_map[\n",
	" f\"node2vec_{ str(p).replace('.', '') }_{ str(q).replace('.', '') }\"\n",
	" ] = node2vec_output_path"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Grarep\n",
	"for k in [1, 4, 8]:\n",
	" grarep_output_path = os.path.join(\n",
	" STORAGE_PATH, f\"grarep_{ k }.txt\"\n",
	" )\n",
	" !python -m openne --method grarep \\\n",
	" --input $INPUT_PATH --output $grarep_output_path \\\n",
	" --graph-format edgelist --kstep $k\n",
	" embeddings_map[f\"grarep_{ k }\"] = grarep_output_path"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Graph Factorization\n",
	"gf_output_path = os.path.join(STORAGE_PATH, \"gf.txt\")\n",
	"!python -m openne --method gf --input $INPUT_PATH \\\n",
	"--output $gf_output_path --graph-format edgelist\n",
	"embeddings_map[\"gf\"] = gf_output_path"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# SDNE\n",
	"sdne_output_path = os.path.join(STORAGE_PATH, \"sdne.txt\")\n",
	"!python -m openne --method sdne --input $INPUT_PATH \\\n",
	"--output $sdne_output_path --graph-format edgelist\n",
	"embeddings_map[\"sdne\"] = sdne_output_path"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### Embedding by StellarGraph"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import networkx as nx\n",
	"import tensorflow as tf\n",
	"from stellargraph import StellarGraph\n",
	"from stellargraph.layer import WatchYourStep\n",
	"from stellargraph.losses import graph_log_likelihood\n",
	"from stellargraph.mapper import AdjacencyPowerGenerator"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"G = nx.read_edgelist(INPUT_PATH, nodetype=int)\n",
	"SG = StellarGraph.from_networkx(G)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"## Watch Your Step\n",
	"wys_output_path = os.path.join(STORAGE_PATH, \"watch_your_step.txt\")\n",
	"generator = AdjacencyPowerGenerator(SG, num_powers=10)\n",
	"wys = WatchYourStep(\n",
	" generator,\n",
	" num_walks=80,\n",
	" embedding_dimension=DIMENSION,\n",
	" attention_regularizer=tf.keras.regularizers.l2(0.5),\n",
	")\n",
	"x_in, x_out = wys.in_out_tensors()\n",
	"\n",
	"batch_size = 10\n",
	"\n",
	"model = tf.keras.Model(inputs=x_in, outputs=x_out)\n",
	"model.compile(loss=graph_log_likelihood, optimizer=tf.keras.optimizers.Adam(1e-3))\n",
	"train_gen = generator.flow(batch_size=batch_size, num_parallel_calls=10)\n",
	"history = model.fit(\n",
	" train_gen, epochs=epochs, verbose=1, \\\n",
	" steps_per_epoch=int(len(SG.nodes()) // batch_size)\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"wys_embeddings = wys.embeddings()\n",
	"node_list = list(SG.nodes())\n",
	"\n",
	"with open(wys_output_path, 'wt') as fout:\n",
	" fout.write(f\"{ len(node_list) } { DIMENSION }\\n\")\n",
	" for node, embedding in zip(node_list, embeddings):\n",
	" row = node\n",
	" for fac in embedding:\n",
	" row += f\" { fac }\"\n",
	" row += \"\\n\"\n",
	" fout.write(row)\n",
	"embeddings_map[\"wys\"] = wys_output_path"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### Embedding by Karate Club"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"from karateclub import NetMF"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"node_map = {v: int(v) for v in G.nodes()}\n",
	"H = nx.relabel_nodes(G, node_map)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# NetMF\n",
	"netmf_output_path = os.path.join(STORAGE_PATH, \"netmf.txt\")\n",
	"netmf = NetMF(dimensions=DIMENSION, order=10, negative_samples=5)\n",
	"netmf.fit(H)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"netmf_embeddings = netmf.get_embedding()\n",
	"with open(netmf_output_path, 'wt') as fout:\n",
	" fout.write(f\"{ len(node_list) } { DIMENSION }\\n\")\n",
	" for node in node_list:\n",
	" row = node\n",
	" for fac in netmf_embeddings[int(node)]:\n",
	" row += f\" { fac }\"\n",
	" row += \"\\n\"\n",
	" fout.write(row)\n",
	"embeddings_map[\"netmf\"] = netmf_output_path"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### Emnbedding by ProNE"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# ProNE\n",
	"prone_output_1_path = os.path.join(STORAGE_PATH, \"prone_1.txt\")\n",
	"prone_output_2_path = os.path.join(STORAGE_PATH, \"prone_2.txt\")\n",
	"\n",
	"!python ProNE/proNE.py -graph $INPUT_PATH \\\n",
	"-emb1 $prone_output_1_path -emb2 $prone_output_2_path \\\n",
	"-dimension $DIMENSION -step 10 -theta 0.5 -mu 0.2\n",
	"\n",
	"embeddings_map[\"prone\"] = prone_output_2_path"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Calculation RPD"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import pandas as pd\n",
	"import matplotlib.pyplot as plt\n",
	"from sklearn.manifold import MDS"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def RPD(emb_1, emb_2):\n",
	" normed_emb_1 = emb_1 / np.std(emb_1, axis=0)\n",
	" normed_emb_2 = emb_2 / np.std(emb_2, axis=0)\n",
	" sim_matrix_1 = np.dot(normed_emb_1, normed_emb_1.T)\n",
	" sim_matrix_2 = np.dot(normed_emb_2, normed_emb_2.T)\n",
	" return (\n",
	" np.linalg.norm(sim_matrix_1 - sim_matrix_2)\n",
	" )**2 / (\n",
	" 2 * np.linalg.norm(sim_matrix_1) * np.linalg.norm(sim_matrix_2)\n",
	" )"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def load_embedding(file_path):\n",
	" dict_embeddings = {}\n",
	" with open(file_path, 'rt') as fin:\n",
	" text = fin.read()\n",
	" for line in text.split(\"\\n\")[1:-1]:\n",
	" fact = line.split(\" \")\n",
	" dict_embeddings[fact[0]] = [float(x) for x in fact[1:]]\n",
	" return np.array([dict_embeddings[node] for node in node_list])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"rpd_dict = {emb_name: [] for emb_name in embeddings_map.keys()}\n",
	"rpd_dict[\"idx\"] = []\n",
	"for emb_name_1, emb_path_1 in embeddings_map.keys():\n",
	" rpd_dict[\"idx\"].append(emb_name_1)\n",
	" for emb_name_2, emb_path_2 in embeddings_map.keys():\n",
	" rpd = RPD(emb_path_1, emb_path_2)\n",
	" rap_dict[emb_2].append(round(rpd, 3))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"df_rpd = pd.DataFrame.from_dict(rpd_dict)\n",
	"df_rpd.set_index(\"idx\", drop=True, inplace=True)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### Visualization"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"mds = MDS(n_components=2, dissimilarity=\"precomputed\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"X_2d = mds.fit_transform(df_rpd.values)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"plt.figure(figsize=(9,6), dpi=200)\n",
	" \n",
	"for (x, y), emb_name in zip(X_2d, df_rpd.index):\n",
	" plt.text(x, y, emb_name)\n",
	"\n",
	"plt.scatter(X_2d[:, 0], X_2d[:, 1], c=\"g\", alpha=0.6, s=150)\n",
	"\n",
	"plt.show()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### Downstream Task : Node Classification"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"from openne.classify import read_node_label, Classifier\n",
	"from sklearn.linear_model import LogisticRegression"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"LABEL_PATH = \"OpenNE/data/blogCatalog/bc_labels.txt\"\n",
	"X, Y = read_node_label(LABEL_PATH)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"MAX_ITER = 10000\n",
	"def eval_embedding(file_path):\n",
	" embedding = load_embedding(file_path)\n",
	" clf = Classifier(vectors=embedding, clf=LogisticRegression(max_iter=MAX_ITER))\n",
	" result = clf.split_train_evaluate(X, Y, 0.8, seed=33)\n",
	" return result['micro'], result['macro']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"eval_dict = {\n",
	" 'idx': list(embeddings_map.keys()),\n",
	" 'micro_f1': [],\n",
	" 'macro_f1': []\n",
	"}\n",
	"for emb_path in embeddings_map.values():\n",
	" micro, macro = eval_embedding(emb_path)\n",
	" eval_dict['micro_f1'].append(micro)\n",
	" eval_dict['macro_f1'].append(macro)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"df_eval = pd.DataFrame.from_dict(eval_dict)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "n2v69",
	"language": "python",
	"name": "n2v69"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.9"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}