Last active
November 23, 2022 20:09
-
-
Save dsignr/c3f7a67fcfb1fb93698a507f4cce8eef to your computer and use it in GitHub Desktop.
A Python script that extracts data from CSV files and converts it into Gephi-compatible GML.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import sys, time\n", | |
"import pandas as pd\n", | |
"import datetime as dt\n", | |
"from IPython.display import display\n", | |
"\n", | |
"import plotly.plotly as py # interactive graphing\n", | |
"from plotly.graph_objs import Bar, Scatter, Marker, Layout " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"FILE_NAME = \"output.gml\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"authors = pd.read_csv('authors.csv', sep=' ')\n", | |
"occurrence = pd.read_csv('occurrence.csv', sep=' ')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#Utility functions\n", | |
"def progress(v):\n", | |
" v = str(v)\n", | |
" sys.stdout.flush()\n", | |
" sys.stdout.write('\\r')\n", | |
" sys.stdout.flush()\n", | |
" sys.stdout.write(v)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>AUTHOR_ID</th>\n", | |
" <th>CO-AUTHOR_ID</th>\n", | |
" <th>NO_OF_BOOKS</th>\n", | |
" <th>AUTHOR</th>\n", | |
" <th>CO-AUTHOR</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>190</td>\n", | |
" <td>7</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>TONER, J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>66</th>\n", | |
" <td>1</td>\n", | |
" <td>2281</td>\n", | |
" <td>1</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>FREY, E</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>93</th>\n", | |
" <td>1</td>\n", | |
" <td>3896</td>\n", | |
" <td>1</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>GINZBURG, VV</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>101</th>\n", | |
" <td>1</td>\n", | |
" <td>3897</td>\n", | |
" <td>2</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>CLARK, NA</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>110</th>\n", | |
" <td>1</td>\n", | |
" <td>12347</td>\n", | |
" <td>1</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>JACOBSEN, B</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>113</th>\n", | |
" <td>1</td>\n", | |
" <td>12348</td>\n", | |
" <td>1</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>SAUNDERS, K</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>116</th>\n", | |
" <td>1</td>\n", | |
" <td>12700</td>\n", | |
" <td>1</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>LINK, DR</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>123</th>\n", | |
" <td>1</td>\n", | |
" <td>12701</td>\n", | |
" <td>1</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>NATALE, G</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>130</th>\n", | |
" <td>1</td>\n", | |
" <td>12702</td>\n", | |
" <td>1</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>MACLENNAN, JE</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>137</th>\n", | |
" <td>1</td>\n", | |
" <td>12703</td>\n", | |
" <td>1</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>WALSH, M</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>144</th>\n", | |
" <td>1</td>\n", | |
" <td>12704</td>\n", | |
" <td>1</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>KEAST, SS</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>151</th>\n", | |
" <td>1</td>\n", | |
" <td>12705</td>\n", | |
" <td>1</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>NEUBERT, ME</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>55</th>\n", | |
" <td>1</td>\n", | |
" <td>1075</td>\n", | |
" <td>3</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>MARCHETTI, MC</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>35</th>\n", | |
" <td>1</td>\n", | |
" <td>562</td>\n", | |
" <td>2</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>NELSON, DR</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>1</td>\n", | |
" <td>201</td>\n", | |
" <td>3</td>\n", | |
" <td>RADZIHOVSKY, L</td>\n", | |
" <td>BALENTS, L</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>159</th>\n", | |
" <td>2</td>\n", | |
" <td>1237</td>\n", | |
" <td>2</td>\n", | |
" <td>FRISCHAT, SD</td>\n", | |
" <td>DORON, E</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>158</th>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>FRISCHAT, SD</td>\n", | |
" <td>KUHN, R</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>160</th>\n", | |
" <td>3</td>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" <td>KUHN, R</td>\n", | |
" <td>FRISCHAT, SD</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>311</th>\n", | |
" <td>4</td>\n", | |
" <td>10757</td>\n", | |
" <td>9</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>PATRA, M</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>215</th>\n", | |
" <td>4</td>\n", | |
" <td>891</td>\n", | |
" <td>2</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>LEYRONAS, X</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>219</th>\n", | |
" <td>4</td>\n", | |
" <td>1785</td>\n", | |
" <td>1</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>BUTTIKER, M</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>243</th>\n", | |
" <td>4</td>\n", | |
" <td>2212</td>\n", | |
" <td>5</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>LANGEN, SAV</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>200</th>\n", | |
" <td>4</td>\n", | |
" <td>722</td>\n", | |
" <td>1</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>FRAHM, K</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>255</th>\n", | |
" <td>4</td>\n", | |
" <td>2264</td>\n", | |
" <td>1</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>BLANTER, YM</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>196</th>\n", | |
" <td>4</td>\n", | |
" <td>7</td>\n", | |
" <td>4</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>JONG, MJMD</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>268</th>\n", | |
" <td>4</td>\n", | |
" <td>2876</td>\n", | |
" <td>9</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>SCHOMERUS, H</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>290</th>\n", | |
" <td>4</td>\n", | |
" <td>3232</td>\n", | |
" <td>5</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>MISIRPASHAEV, TS</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>306</th>\n", | |
" <td>4</td>\n", | |
" <td>8994</td>\n", | |
" <td>2</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>TWORZYDLO, J</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>293</th>\n", | |
" <td>4</td>\n", | |
" <td>3478</td>\n", | |
" <td>7</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>FRAHM, KM</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>162</th>\n", | |
" <td>4</td>\n", | |
" <td>5</td>\n", | |
" <td>5</td>\n", | |
" <td>BEENAKKER, CWJ</td>\n", | |
" <td>MELSEN, JA</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>67404</th>\n", | |
" <td>16718</td>\n", | |
" <td>8621</td>\n", | |
" <td>1</td>\n", | |
" <td>RENZ, F</td>\n", | |
" <td>JAKOB, G</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>87838</th>\n", | |
" <td>16718</td>\n", | |
" <td>16716</td>\n", | |
" <td>1</td>\n", | |
" <td>RENZ, F</td>\n", | |
" <td>TREMEL, W</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>87566</th>\n", | |
" <td>16719</td>\n", | |
" <td>13396</td>\n", | |
" <td>1</td>\n", | |
" <td>GUETLICH, P</td>\n", | |
" <td>RITTER, C</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>83218</th>\n", | |
" <td>16719</td>\n", | |
" <td>13170</td>\n", | |
" <td>1</td>\n", | |
" <td>GUETLICH, P</td>\n", | |
" <td>WESTERBURG, W</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>87855</th>\n", | |
" <td>16719</td>\n", | |
" <td>16718</td>\n", | |
" <td>1</td>\n", | |
" <td>GUETLICH, P</td>\n", | |
" <td>RENZ, F</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>87847</th>\n", | |
" <td>16719</td>\n", | |
" <td>16717</td>\n", | |
" <td>1</td>\n", | |
" <td>GUETLICH, P</td>\n", | |
" <td>WALDECK, M</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>67405</th>\n", | |
" <td>16719</td>\n", | |
" <td>8621</td>\n", | |
" <td>1</td>\n", | |
" <td>GUETLICH, P</td>\n", | |
" <td>JAKOB, G</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>87839</th>\n", | |
" <td>16719</td>\n", | |
" <td>16716</td>\n", | |
" <td>1</td>\n", | |
" <td>GUETLICH, P</td>\n", | |
" <td>TREMEL, W</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>87831</th>\n", | |
" <td>16719</td>\n", | |
" <td>16715</td>\n", | |
" <td>1</td>\n", | |
" <td>GUETLICH, P</td>\n", | |
" <td>FELSER, C</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>87823</th>\n", | |
" <td>16719</td>\n", | |
" <td>16714</td>\n", | |
" <td>1</td>\n", | |
" <td>GUETLICH, P</td>\n", | |
" <td>LANG, O</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>80930</th>\n", | |
" <td>16720</td>\n", | |
" <td>11553</td>\n", | |
" <td>1</td>\n", | |
" <td>HAEUSSLER, R</td>\n", | |
" <td>LOEHNEYSEN, HV</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>51427</th>\n", | |
" <td>16720</td>\n", | |
" <td>15834</td>\n", | |
" <td>1</td>\n", | |
" <td>HAEUSSLER, R</td>\n", | |
" <td>SCHEER, E</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>91849</th>\n", | |
" <td>16720</td>\n", | |
" <td>16721</td>\n", | |
" <td>1</td>\n", | |
" <td>HAEUSSLER, R</td>\n", | |
" <td>WEBER, HB</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>51428</th>\n", | |
" <td>16721</td>\n", | |
" <td>15834</td>\n", | |
" <td>1</td>\n", | |
" <td>WEBER, HB</td>\n", | |
" <td>SCHEER, E</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>80931</th>\n", | |
" <td>16721</td>\n", | |
" <td>11553</td>\n", | |
" <td>1</td>\n", | |
" <td>WEBER, HB</td>\n", | |
" <td>LOEHNEYSEN, HV</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>91846</th>\n", | |
" <td>16721</td>\n", | |
" <td>16720</td>\n", | |
" <td>1</td>\n", | |
" <td>WEBER, HB</td>\n", | |
" <td>HAEUSSLER, R</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>74577</th>\n", | |
" <td>16723</td>\n", | |
" <td>14983</td>\n", | |
" <td>1</td>\n", | |
" <td>LEUNG, MA</td>\n", | |
" <td>CARR, LD</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>68050</th>\n", | |
" <td>16723</td>\n", | |
" <td>4256</td>\n", | |
" <td>1</td>\n", | |
" <td>LEUNG, MA</td>\n", | |
" <td>REINHARDT, WP</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41242</th>\n", | |
" <td>16724</td>\n", | |
" <td>8897</td>\n", | |
" <td>1</td>\n", | |
" <td>CORNISH, SL</td>\n", | |
" <td>WIEMAN, CE</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>80425</th>\n", | |
" <td>16724</td>\n", | |
" <td>16725</td>\n", | |
" <td>1</td>\n", | |
" <td>CORNISH, SL</td>\n", | |
" <td>CLAUSSEN, NR</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>80429</th>\n", | |
" <td>16724</td>\n", | |
" <td>16726</td>\n", | |
" <td>1</td>\n", | |
" <td>CORNISH, SL</td>\n", | |
" <td>ROBERTS, JL</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41176</th>\n", | |
" <td>16724</td>\n", | |
" <td>5350</td>\n", | |
" <td>1</td>\n", | |
" <td>CORNISH, SL</td>\n", | |
" <td>CORNELL, EA</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41177</th>\n", | |
" <td>16725</td>\n", | |
" <td>5350</td>\n", | |
" <td>1</td>\n", | |
" <td>CLAUSSEN, NR</td>\n", | |
" <td>CORNELL, EA</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>80430</th>\n", | |
" <td>16725</td>\n", | |
" <td>16726</td>\n", | |
" <td>1</td>\n", | |
" <td>CLAUSSEN, NR</td>\n", | |
" <td>ROBERTS, JL</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41243</th>\n", | |
" <td>16725</td>\n", | |
" <td>8897</td>\n", | |
" <td>1</td>\n", | |
" <td>CLAUSSEN, NR</td>\n", | |
" <td>WIEMAN, CE</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>80421</th>\n", | |
" <td>16725</td>\n", | |
" <td>16724</td>\n", | |
" <td>1</td>\n", | |
" <td>CLAUSSEN, NR</td>\n", | |
" <td>CORNISH, SL</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41244</th>\n", | |
" <td>16726</td>\n", | |
" <td>8897</td>\n", | |
" <td>1</td>\n", | |
" <td>ROBERTS, JL</td>\n", | |
" <td>WIEMAN, CE</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>80426</th>\n", | |
" <td>16726</td>\n", | |
" <td>16725</td>\n", | |
" <td>1</td>\n", | |
" <td>ROBERTS, JL</td>\n", | |
" <td>CLAUSSEN, NR</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>80422</th>\n", | |
" <td>16726</td>\n", | |
" <td>16724</td>\n", | |
" <td>1</td>\n", | |
" <td>ROBERTS, JL</td>\n", | |
" <td>CORNISH, SL</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41178</th>\n", | |
" <td>16726</td>\n", | |
" <td>5350</td>\n", | |
" <td>1</td>\n", | |
" <td>ROBERTS, JL</td>\n", | |
" <td>CORNELL, EA</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>95188 rows × 5 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" AUTHOR_ID CO-AUTHOR_ID NO_OF_BOOKS AUTHOR CO-AUTHOR\n", | |
"0 1 190 7 RADZIHOVSKY, L TONER, J\n", | |
"66 1 2281 1 RADZIHOVSKY, L FREY, E\n", | |
"93 1 3896 1 RADZIHOVSKY, L GINZBURG, VV\n", | |
"101 1 3897 2 RADZIHOVSKY, L CLARK, NA\n", | |
"110 1 12347 1 RADZIHOVSKY, L JACOBSEN, B\n", | |
"113 1 12348 1 RADZIHOVSKY, L SAUNDERS, K\n", | |
"116 1 12700 1 RADZIHOVSKY, L LINK, DR\n", | |
"123 1 12701 1 RADZIHOVSKY, L NATALE, G\n", | |
"130 1 12702 1 RADZIHOVSKY, L MACLENNAN, JE\n", | |
"137 1 12703 1 RADZIHOVSKY, L WALSH, M\n", | |
"144 1 12704 1 RADZIHOVSKY, L KEAST, SS\n", | |
"151 1 12705 1 RADZIHOVSKY, L NEUBERT, ME\n", | |
"55 1 1075 3 RADZIHOVSKY, L MARCHETTI, MC\n", | |
"35 1 562 2 RADZIHOVSKY, L NELSON, DR\n", | |
"11 1 201 3 RADZIHOVSKY, L BALENTS, L\n", | |
"159 2 1237 2 FRISCHAT, SD DORON, E\n", | |
"158 2 3 1 FRISCHAT, SD KUHN, R\n", | |
"160 3 2 1 KUHN, R FRISCHAT, SD\n", | |
"311 4 10757 9 BEENAKKER, CWJ PATRA, M\n", | |
"215 4 891 2 BEENAKKER, CWJ LEYRONAS, X\n", | |
"219 4 1785 1 BEENAKKER, CWJ BUTTIKER, M\n", | |
"243 4 2212 5 BEENAKKER, CWJ LANGEN, SAV\n", | |
"200 4 722 1 BEENAKKER, CWJ FRAHM, K\n", | |
"255 4 2264 1 BEENAKKER, CWJ BLANTER, YM\n", | |
"196 4 7 4 BEENAKKER, CWJ JONG, MJMD\n", | |
"268 4 2876 9 BEENAKKER, CWJ SCHOMERUS, H\n", | |
"290 4 3232 5 BEENAKKER, CWJ MISIRPASHAEV, TS\n", | |
"306 4 8994 2 BEENAKKER, CWJ TWORZYDLO, J\n", | |
"293 4 3478 7 BEENAKKER, CWJ FRAHM, KM\n", | |
"162 4 5 5 BEENAKKER, CWJ MELSEN, JA\n", | |
"... ... ... ... ... ...\n", | |
"67404 16718 8621 1 RENZ, F JAKOB, G\n", | |
"87838 16718 16716 1 RENZ, F TREMEL, W\n", | |
"87566 16719 13396 1 GUETLICH, P RITTER, C\n", | |
"83218 16719 13170 1 GUETLICH, P WESTERBURG, W\n", | |
"87855 16719 16718 1 GUETLICH, P RENZ, F\n", | |
"87847 16719 16717 1 GUETLICH, P WALDECK, M\n", | |
"67405 16719 8621 1 GUETLICH, P JAKOB, G\n", | |
"87839 16719 16716 1 GUETLICH, P TREMEL, W\n", | |
"87831 16719 16715 1 GUETLICH, P FELSER, C\n", | |
"87823 16719 16714 1 GUETLICH, P LANG, O\n", | |
"80930 16720 11553 1 HAEUSSLER, R LOEHNEYSEN, HV\n", | |
"51427 16720 15834 1 HAEUSSLER, R SCHEER, E\n", | |
"91849 16720 16721 1 HAEUSSLER, R WEBER, HB\n", | |
"51428 16721 15834 1 WEBER, HB SCHEER, E\n", | |
"80931 16721 11553 1 WEBER, HB LOEHNEYSEN, HV\n", | |
"91846 16721 16720 1 WEBER, HB HAEUSSLER, R\n", | |
"74577 16723 14983 1 LEUNG, MA CARR, LD\n", | |
"68050 16723 4256 1 LEUNG, MA REINHARDT, WP\n", | |
"41242 16724 8897 1 CORNISH, SL WIEMAN, CE\n", | |
"80425 16724 16725 1 CORNISH, SL CLAUSSEN, NR\n", | |
"80429 16724 16726 1 CORNISH, SL ROBERTS, JL\n", | |
"41176 16724 5350 1 CORNISH, SL CORNELL, EA\n", | |
"41177 16725 5350 1 CLAUSSEN, NR CORNELL, EA\n", | |
"80430 16725 16726 1 CLAUSSEN, NR ROBERTS, JL\n", | |
"41243 16725 8897 1 CLAUSSEN, NR WIEMAN, CE\n", | |
"80421 16725 16724 1 CLAUSSEN, NR CORNISH, SL\n", | |
"41244 16726 8897 1 ROBERTS, JL WIEMAN, CE\n", | |
"80426 16726 16725 1 ROBERTS, JL CLAUSSEN, NR\n", | |
"80422 16726 16724 1 ROBERTS, JL CORNISH, SL\n", | |
"41178 16726 5350 1 ROBERTS, JL CORNELL, EA\n", | |
"\n", | |
"[95188 rows x 5 columns]" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"a = authors.assign(CO_AUTHOR_ID=authors['AUTHOR_ID']).assign(CO_AUTHOR_NAME=authors['AUTHOR_NAME'])\n", | |
"b = occurrence.merge(authors, how='inner', on='AUTHOR_ID')\n", | |
"#.merge(a, how='inner', on='AUTHOR_ID')\n", | |
"df = b.merge(a, how='inner', on='CO_AUTHOR_ID') \\\n", | |
" .sort_values(by='AUTHOR_ID_x') \\\n", | |
" .drop('AUTHOR_ID_y',1) \\\n", | |
" .drop('AUTHOR_NAME_y',1)\n", | |
"df.columns = ['AUTHOR_ID', 'CO-AUTHOR_ID', 'NO_OF_BOOKS', 'AUTHOR', 'CO-AUTHOR']\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"95188\n", | |
"Printing nodes over\n", | |
"95188\n", | |
"Printing nodes and edges over\n" | |
] | |
} | |
], | |
"source": [ | |
"f = open(FILE_NAME, \"w\")\n", | |
"#helpers\n", | |
"s = \" \"\n", | |
"ss = s+s\n", | |
"sss = s+s+s\n", | |
"ssss = s+s+s+s\n", | |
"nl = \"\\n\"\n", | |
"\n", | |
"#loop helpers\n", | |
"added = []\n", | |
"ind = 0\n", | |
"\n", | |
"#Root node\n", | |
"f.write(\"graph\"+nl)\n", | |
"f.write(\"[\"+nl)\n", | |
"\n", | |
"#Write an edge\n", | |
"def write_edge(r):\n", | |
" f.write( ss + \"edge\" + nl)\n", | |
" f.write( ss + \"[\" + nl)\n", | |
" f.write( ssss + \"source\" + s + '\"' + str(r['AUTHOR_ID']) + '\"' + nl)\n", | |
" f.write( ssss + \"target\" + s + '\"' + str(r['CO-AUTHOR_ID']) + '\"' + nl)\n", | |
" f.write( ssss + \"value\" + s + str(r['NO_OF_BOOKS']) + nl)\n", | |
" f.write( ss + \"]\"+ nl)\n", | |
"\n", | |
"#Write a node\n", | |
"def write_node(r):\n", | |
" f.write( ss + \"node\" + nl)\n", | |
" f.write( ss + \"[\" + nl)\n", | |
" f.write( ssss + \"id\" + s + '\"' + str(r['AUTHOR_ID']) + '\"' + nl)\n", | |
" f.write( ssss + \"label\" + s + '\"' + str(r['AUTHOR']) + '\"' + nl)\n", | |
" f.write( ss + \"]\"+ nl)\n", | |
"\n", | |
"#Generate nodes\n", | |
"for i, r in df.iterrows():\n", | |
" #increment, as index not reliable\n", | |
" ind += 1\n", | |
" #Check for duplicates\n", | |
" if (r['AUTHOR_ID'] not in added):\n", | |
" #Add to list\n", | |
" added.append(r['AUTHOR_ID'])\n", | |
" write_node(r)\n", | |
" #print the progress \n", | |
" progress(ind)\n", | |
"\n", | |
"print(nl+\"Printing nodes over\")\n", | |
"\n", | |
"#flush index\n", | |
"ind = 0 \n", | |
"#Generate edges \n", | |
"for i, r in df.iterrows():\n", | |
" #increment, as index not reliable\n", | |
" ind += 1\n", | |
" if(r['AUTHOR_ID'] < r['CO-AUTHOR_ID']):\n", | |
" write_edge(r)\n", | |
" #print the progress \n", | |
" progress(ind)\n", | |
"\n", | |
"print(nl+\"Printing nodes and edges over\")\n", | |
"\n", | |
"#closing node\n", | |
"f.write(\"]\"+nl)\n", | |
"f.close()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
xml2csv
import sys, time
import pandas as pd
import datetime as dt
from IPython.display import display
import csv
import xml.etree.ElementTree as ET
def xml_to_csv(file_path, csv_name) -> None:
    """Parse the XML document at ``file_path`` and look up its root element.

    NOTE(review): ``csv_name`` is accepted but never used by the code visible
    here, and nothing is written out -- the CSV-writing step appears to be
    missing from this snippet; confirm against the complete script.

    Args:
        file_path: Path (or file object) handed to ``ET.parse``.
        csv_name: Presumably the destination CSV path; currently unused.

    Raises:
        OSError: If ``file_path`` cannot be opened.
        xml.etree.ElementTree.ParseError: If the document is not well-formed.
    """
    tree = ET.parse(file_path)
    # Kept as the starting point for the conversion; not consumed yet.
    root = tree.getroot()
if __name__ == '__main__':