Skip to content

Instantly share code, notes, and snippets.

@dsignr
Last active November 23, 2022 20:09
Show Gist options
  • Save dsignr/c3f7a67fcfb1fb93698a507f4cce8eef to your computer and use it in GitHub Desktop.
Save dsignr/c3f7a67fcfb1fb93698a507f4cce8eef to your computer and use it in GitHub Desktop.
A python script to extract data from CSV and convert it into Gephi compatible GML.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import sys, time\n",
"import pandas as pd\n",
"import datetime as dt\n",
"from IPython.display import display\n",
"\n",
"import plotly.plotly as py # interactive graphing\n",
"from plotly.graph_objs import Bar, Scatter, Marker, Layout "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"FILE_NAME = \"output.gml\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"authors = pd.read_csv('authors.csv', sep=' ')\n",
"occurrence = pd.read_csv('occurrence.csv', sep=' ')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Utility functions\n",
"def progress(v):\n",
" v = str(v)\n",
" sys.stdout.flush()\n",
" sys.stdout.write('\\r')\n",
" sys.stdout.flush()\n",
" sys.stdout.write(v)\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>AUTHOR_ID</th>\n",
" <th>CO-AUTHOR_ID</th>\n",
" <th>NO_OF_BOOKS</th>\n",
" <th>AUTHOR</th>\n",
" <th>CO-AUTHOR</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>190</td>\n",
" <td>7</td>\n",
" <td>RADZIHOVSKY, L</td>\n",
" <td>TONER, J</td>\n",
" </tr>\n",
" <tr>\n",
" <th>66</th>\n",
" <td>1</td>\n",
" <td>2281</td>\n",
" <td>1</td>\n",
" <td>RADZIHOVSKY, L</td>\n",
" <td>FREY, E</td>\n",
" </tr>\n",
" <tr>\n",
" <th>93</th>\n",
" <td>1</td>\n",
" <td>3896</td>\n",
" <td>1</td>\n",
" <td>RADZIHOVSKY, L</td>\n",
" <td>GINZBURG, VV</td>\n",
" </tr>\n",
" <tr>\n",
" <th>101</th>\n",
" <td>1</td>\n",
" <td>3897</td>\n",
" <td>2</td>\n",
" <td>RADZIHOVSKY, L</td>\n",
" <td>CLARK, NA</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110</th>\n",
" <td>1</td>\n",
" <td>12347</td>\n",
" <td>1</td>\n",
" <td>RADZIHOVSKY, L</td>\n",
" <td>JACOBSEN, B</td>\n",
" </tr>\n",
" <tr>\n",
" <th>113</th>\n",
" <td>1</td>\n",
" <td>12348</td>\n",
" <td>1</td>\n",
" <td>RADZIHOVSKY, L</td>\n",
" <td>SAUNDERS, K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>116</th>\n",
" <td>1</td>\n",
" <td>12700</td>\n",
" <td>1</td>\n",
" <td>RADZIHOVSKY, L</td>\n",
" <td>LINK, DR</td>\n",
" </tr>\n",
" <tr>\n",
" <th>123</th>\n",
" <td>1</td>\n",
" <td>12701</td>\n",
" <td>1</td>\n",
" <td>RADZIHOVSKY, L</td>\n",
" <td>NATALE, G</td>\n",
" </tr>\n",
" <tr>\n",
" <th>130</th>\n",
" <td>1</td>\n",
" <td>12702</td>\n",
" <td>1</td>\n",
" <td>RADZIHOVSKY, L</td>\n",
" <td>MACLENNAN, JE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>137</th>\n",
" <td>1</td>\n",
" <td>12703</td>\n",
" <td>1</td>\n",
" <td>RADZIHOVSKY, L</td>\n",
" <td>WALSH, M</td>\n",
" </tr>\n",
" <tr>\n",
" <th>144</th>\n",
" <td>1</td>\n",
" <td>12704</td>\n",
" <td>1</td>\n",
" <td>RADZIHOVSKY, L</td>\n",
" <td>KEAST, SS</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151</th>\n",
" <td>1</td>\n",
" <td>12705</td>\n",
" <td>1</td>\n",
" <td>RADZIHOVSKY, L</td>\n",
" <td>NEUBERT, ME</td>\n",
" </tr>\n",
" <tr>\n",
" <th>55</th>\n",
" <td>1</td>\n",
" <td>1075</td>\n",
" <td>3</td>\n",
" <td>RADZIHOVSKY, L</td>\n",
" <td>MARCHETTI, MC</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35</th>\n",
" <td>1</td>\n",
" <td>562</td>\n",
" <td>2</td>\n",
" <td>RADZIHOVSKY, L</td>\n",
" <td>NELSON, DR</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>1</td>\n",
" <td>201</td>\n",
" <td>3</td>\n",
" <td>RADZIHOVSKY, L</td>\n",
" <td>BALENTS, L</td>\n",
" </tr>\n",
" <tr>\n",
" <th>159</th>\n",
" <td>2</td>\n",
" <td>1237</td>\n",
" <td>2</td>\n",
" <td>FRISCHAT, SD</td>\n",
" <td>DORON, E</td>\n",
" </tr>\n",
" <tr>\n",
" <th>158</th>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>FRISCHAT, SD</td>\n",
" <td>KUHN, R</td>\n",
" </tr>\n",
" <tr>\n",
" <th>160</th>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>KUHN, R</td>\n",
" <td>FRISCHAT, SD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>311</th>\n",
" <td>4</td>\n",
" <td>10757</td>\n",
" <td>9</td>\n",
" <td>BEENAKKER, CWJ</td>\n",
" <td>PATRA, M</td>\n",
" </tr>\n",
" <tr>\n",
" <th>215</th>\n",
" <td>4</td>\n",
" <td>891</td>\n",
" <td>2</td>\n",
" <td>BEENAKKER, CWJ</td>\n",
" <td>LEYRONAS, X</td>\n",
" </tr>\n",
" <tr>\n",
" <th>219</th>\n",
" <td>4</td>\n",
" <td>1785</td>\n",
" <td>1</td>\n",
" <td>BEENAKKER, CWJ</td>\n",
" <td>BUTTIKER, M</td>\n",
" </tr>\n",
" <tr>\n",
" <th>243</th>\n",
" <td>4</td>\n",
" <td>2212</td>\n",
" <td>5</td>\n",
" <td>BEENAKKER, CWJ</td>\n",
" <td>LANGEN, SAV</td>\n",
" </tr>\n",
" <tr>\n",
" <th>200</th>\n",
" <td>4</td>\n",
" <td>722</td>\n",
" <td>1</td>\n",
" <td>BEENAKKER, CWJ</td>\n",
" <td>FRAHM, K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>255</th>\n",
" <td>4</td>\n",
" <td>2264</td>\n",
" <td>1</td>\n",
" <td>BEENAKKER, CWJ</td>\n",
" <td>BLANTER, YM</td>\n",
" </tr>\n",
" <tr>\n",
" <th>196</th>\n",
" <td>4</td>\n",
" <td>7</td>\n",
" <td>4</td>\n",
" <td>BEENAKKER, CWJ</td>\n",
" <td>JONG, MJMD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>268</th>\n",
" <td>4</td>\n",
" <td>2876</td>\n",
" <td>9</td>\n",
" <td>BEENAKKER, CWJ</td>\n",
" <td>SCHOMERUS, H</td>\n",
" </tr>\n",
" <tr>\n",
" <th>290</th>\n",
" <td>4</td>\n",
" <td>3232</td>\n",
" <td>5</td>\n",
" <td>BEENAKKER, CWJ</td>\n",
" <td>MISIRPASHAEV, TS</td>\n",
" </tr>\n",
" <tr>\n",
" <th>306</th>\n",
" <td>4</td>\n",
" <td>8994</td>\n",
" <td>2</td>\n",
" <td>BEENAKKER, CWJ</td>\n",
" <td>TWORZYDLO, J</td>\n",
" </tr>\n",
" <tr>\n",
" <th>293</th>\n",
" <td>4</td>\n",
" <td>3478</td>\n",
" <td>7</td>\n",
" <td>BEENAKKER, CWJ</td>\n",
" <td>FRAHM, KM</td>\n",
" </tr>\n",
" <tr>\n",
" <th>162</th>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>BEENAKKER, CWJ</td>\n",
" <td>MELSEN, JA</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>67404</th>\n",
" <td>16718</td>\n",
" <td>8621</td>\n",
" <td>1</td>\n",
" <td>RENZ, F</td>\n",
" <td>JAKOB, G</td>\n",
" </tr>\n",
" <tr>\n",
" <th>87838</th>\n",
" <td>16718</td>\n",
" <td>16716</td>\n",
" <td>1</td>\n",
" <td>RENZ, F</td>\n",
" <td>TREMEL, W</td>\n",
" </tr>\n",
" <tr>\n",
" <th>87566</th>\n",
" <td>16719</td>\n",
" <td>13396</td>\n",
" <td>1</td>\n",
" <td>GUETLICH, P</td>\n",
" <td>RITTER, C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>83218</th>\n",
" <td>16719</td>\n",
" <td>13170</td>\n",
" <td>1</td>\n",
" <td>GUETLICH, P</td>\n",
" <td>WESTERBURG, W</td>\n",
" </tr>\n",
" <tr>\n",
" <th>87855</th>\n",
" <td>16719</td>\n",
" <td>16718</td>\n",
" <td>1</td>\n",
" <td>GUETLICH, P</td>\n",
" <td>RENZ, F</td>\n",
" </tr>\n",
" <tr>\n",
" <th>87847</th>\n",
" <td>16719</td>\n",
" <td>16717</td>\n",
" <td>1</td>\n",
" <td>GUETLICH, P</td>\n",
" <td>WALDECK, M</td>\n",
" </tr>\n",
" <tr>\n",
" <th>67405</th>\n",
" <td>16719</td>\n",
" <td>8621</td>\n",
" <td>1</td>\n",
" <td>GUETLICH, P</td>\n",
" <td>JAKOB, G</td>\n",
" </tr>\n",
" <tr>\n",
" <th>87839</th>\n",
" <td>16719</td>\n",
" <td>16716</td>\n",
" <td>1</td>\n",
" <td>GUETLICH, P</td>\n",
" <td>TREMEL, W</td>\n",
" </tr>\n",
" <tr>\n",
" <th>87831</th>\n",
" <td>16719</td>\n",
" <td>16715</td>\n",
" <td>1</td>\n",
" <td>GUETLICH, P</td>\n",
" <td>FELSER, C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>87823</th>\n",
" <td>16719</td>\n",
" <td>16714</td>\n",
" <td>1</td>\n",
" <td>GUETLICH, P</td>\n",
" <td>LANG, O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>80930</th>\n",
" <td>16720</td>\n",
" <td>11553</td>\n",
" <td>1</td>\n",
" <td>HAEUSSLER, R</td>\n",
" <td>LOEHNEYSEN, HV</td>\n",
" </tr>\n",
" <tr>\n",
" <th>51427</th>\n",
" <td>16720</td>\n",
" <td>15834</td>\n",
" <td>1</td>\n",
" <td>HAEUSSLER, R</td>\n",
" <td>SCHEER, E</td>\n",
" </tr>\n",
" <tr>\n",
" <th>91849</th>\n",
" <td>16720</td>\n",
" <td>16721</td>\n",
" <td>1</td>\n",
" <td>HAEUSSLER, R</td>\n",
" <td>WEBER, HB</td>\n",
" </tr>\n",
" <tr>\n",
" <th>51428</th>\n",
" <td>16721</td>\n",
" <td>15834</td>\n",
" <td>1</td>\n",
" <td>WEBER, HB</td>\n",
" <td>SCHEER, E</td>\n",
" </tr>\n",
" <tr>\n",
" <th>80931</th>\n",
" <td>16721</td>\n",
" <td>11553</td>\n",
" <td>1</td>\n",
" <td>WEBER, HB</td>\n",
" <td>LOEHNEYSEN, HV</td>\n",
" </tr>\n",
" <tr>\n",
" <th>91846</th>\n",
" <td>16721</td>\n",
" <td>16720</td>\n",
" <td>1</td>\n",
" <td>WEBER, HB</td>\n",
" <td>HAEUSSLER, R</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74577</th>\n",
" <td>16723</td>\n",
" <td>14983</td>\n",
" <td>1</td>\n",
" <td>LEUNG, MA</td>\n",
" <td>CARR, LD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>68050</th>\n",
" <td>16723</td>\n",
" <td>4256</td>\n",
" <td>1</td>\n",
" <td>LEUNG, MA</td>\n",
" <td>REINHARDT, WP</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41242</th>\n",
" <td>16724</td>\n",
" <td>8897</td>\n",
" <td>1</td>\n",
" <td>CORNISH, SL</td>\n",
" <td>WIEMAN, CE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>80425</th>\n",
" <td>16724</td>\n",
" <td>16725</td>\n",
" <td>1</td>\n",
" <td>CORNISH, SL</td>\n",
" <td>CLAUSSEN, NR</td>\n",
" </tr>\n",
" <tr>\n",
" <th>80429</th>\n",
" <td>16724</td>\n",
" <td>16726</td>\n",
" <td>1</td>\n",
" <td>CORNISH, SL</td>\n",
" <td>ROBERTS, JL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41176</th>\n",
" <td>16724</td>\n",
" <td>5350</td>\n",
" <td>1</td>\n",
" <td>CORNISH, SL</td>\n",
" <td>CORNELL, EA</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41177</th>\n",
" <td>16725</td>\n",
" <td>5350</td>\n",
" <td>1</td>\n",
" <td>CLAUSSEN, NR</td>\n",
" <td>CORNELL, EA</td>\n",
" </tr>\n",
" <tr>\n",
" <th>80430</th>\n",
" <td>16725</td>\n",
" <td>16726</td>\n",
" <td>1</td>\n",
" <td>CLAUSSEN, NR</td>\n",
" <td>ROBERTS, JL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41243</th>\n",
" <td>16725</td>\n",
" <td>8897</td>\n",
" <td>1</td>\n",
" <td>CLAUSSEN, NR</td>\n",
" <td>WIEMAN, CE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>80421</th>\n",
" <td>16725</td>\n",
" <td>16724</td>\n",
" <td>1</td>\n",
" <td>CLAUSSEN, NR</td>\n",
" <td>CORNISH, SL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41244</th>\n",
" <td>16726</td>\n",
" <td>8897</td>\n",
" <td>1</td>\n",
" <td>ROBERTS, JL</td>\n",
" <td>WIEMAN, CE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>80426</th>\n",
" <td>16726</td>\n",
" <td>16725</td>\n",
" <td>1</td>\n",
" <td>ROBERTS, JL</td>\n",
" <td>CLAUSSEN, NR</td>\n",
" </tr>\n",
" <tr>\n",
" <th>80422</th>\n",
" <td>16726</td>\n",
" <td>16724</td>\n",
" <td>1</td>\n",
" <td>ROBERTS, JL</td>\n",
" <td>CORNISH, SL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41178</th>\n",
" <td>16726</td>\n",
" <td>5350</td>\n",
" <td>1</td>\n",
" <td>ROBERTS, JL</td>\n",
" <td>CORNELL, EA</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>95188 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" AUTHOR_ID CO-AUTHOR_ID NO_OF_BOOKS AUTHOR CO-AUTHOR\n",
"0 1 190 7 RADZIHOVSKY, L TONER, J\n",
"66 1 2281 1 RADZIHOVSKY, L FREY, E\n",
"93 1 3896 1 RADZIHOVSKY, L GINZBURG, VV\n",
"101 1 3897 2 RADZIHOVSKY, L CLARK, NA\n",
"110 1 12347 1 RADZIHOVSKY, L JACOBSEN, B\n",
"113 1 12348 1 RADZIHOVSKY, L SAUNDERS, K\n",
"116 1 12700 1 RADZIHOVSKY, L LINK, DR\n",
"123 1 12701 1 RADZIHOVSKY, L NATALE, G\n",
"130 1 12702 1 RADZIHOVSKY, L MACLENNAN, JE\n",
"137 1 12703 1 RADZIHOVSKY, L WALSH, M\n",
"144 1 12704 1 RADZIHOVSKY, L KEAST, SS\n",
"151 1 12705 1 RADZIHOVSKY, L NEUBERT, ME\n",
"55 1 1075 3 RADZIHOVSKY, L MARCHETTI, MC\n",
"35 1 562 2 RADZIHOVSKY, L NELSON, DR\n",
"11 1 201 3 RADZIHOVSKY, L BALENTS, L\n",
"159 2 1237 2 FRISCHAT, SD DORON, E\n",
"158 2 3 1 FRISCHAT, SD KUHN, R\n",
"160 3 2 1 KUHN, R FRISCHAT, SD\n",
"311 4 10757 9 BEENAKKER, CWJ PATRA, M\n",
"215 4 891 2 BEENAKKER, CWJ LEYRONAS, X\n",
"219 4 1785 1 BEENAKKER, CWJ BUTTIKER, M\n",
"243 4 2212 5 BEENAKKER, CWJ LANGEN, SAV\n",
"200 4 722 1 BEENAKKER, CWJ FRAHM, K\n",
"255 4 2264 1 BEENAKKER, CWJ BLANTER, YM\n",
"196 4 7 4 BEENAKKER, CWJ JONG, MJMD\n",
"268 4 2876 9 BEENAKKER, CWJ SCHOMERUS, H\n",
"290 4 3232 5 BEENAKKER, CWJ MISIRPASHAEV, TS\n",
"306 4 8994 2 BEENAKKER, CWJ TWORZYDLO, J\n",
"293 4 3478 7 BEENAKKER, CWJ FRAHM, KM\n",
"162 4 5 5 BEENAKKER, CWJ MELSEN, JA\n",
"... ... ... ... ... ...\n",
"67404 16718 8621 1 RENZ, F JAKOB, G\n",
"87838 16718 16716 1 RENZ, F TREMEL, W\n",
"87566 16719 13396 1 GUETLICH, P RITTER, C\n",
"83218 16719 13170 1 GUETLICH, P WESTERBURG, W\n",
"87855 16719 16718 1 GUETLICH, P RENZ, F\n",
"87847 16719 16717 1 GUETLICH, P WALDECK, M\n",
"67405 16719 8621 1 GUETLICH, P JAKOB, G\n",
"87839 16719 16716 1 GUETLICH, P TREMEL, W\n",
"87831 16719 16715 1 GUETLICH, P FELSER, C\n",
"87823 16719 16714 1 GUETLICH, P LANG, O\n",
"80930 16720 11553 1 HAEUSSLER, R LOEHNEYSEN, HV\n",
"51427 16720 15834 1 HAEUSSLER, R SCHEER, E\n",
"91849 16720 16721 1 HAEUSSLER, R WEBER, HB\n",
"51428 16721 15834 1 WEBER, HB SCHEER, E\n",
"80931 16721 11553 1 WEBER, HB LOEHNEYSEN, HV\n",
"91846 16721 16720 1 WEBER, HB HAEUSSLER, R\n",
"74577 16723 14983 1 LEUNG, MA CARR, LD\n",
"68050 16723 4256 1 LEUNG, MA REINHARDT, WP\n",
"41242 16724 8897 1 CORNISH, SL WIEMAN, CE\n",
"80425 16724 16725 1 CORNISH, SL CLAUSSEN, NR\n",
"80429 16724 16726 1 CORNISH, SL ROBERTS, JL\n",
"41176 16724 5350 1 CORNISH, SL CORNELL, EA\n",
"41177 16725 5350 1 CLAUSSEN, NR CORNELL, EA\n",
"80430 16725 16726 1 CLAUSSEN, NR ROBERTS, JL\n",
"41243 16725 8897 1 CLAUSSEN, NR WIEMAN, CE\n",
"80421 16725 16724 1 CLAUSSEN, NR CORNISH, SL\n",
"41244 16726 8897 1 ROBERTS, JL WIEMAN, CE\n",
"80426 16726 16725 1 ROBERTS, JL CLAUSSEN, NR\n",
"80422 16726 16724 1 ROBERTS, JL CORNISH, SL\n",
"41178 16726 5350 1 ROBERTS, JL CORNELL, EA\n",
"\n",
"[95188 rows x 5 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a = authors.assign(CO_AUTHOR_ID=authors['AUTHOR_ID']).assign(CO_AUTHOR_NAME=authors['AUTHOR_NAME'])\n",
"b = occurrence.merge(authors, how='inner', on='AUTHOR_ID')\n",
"#.merge(a, how='inner', on='AUTHOR_ID')\n",
"df = b.merge(a, how='inner', on='CO_AUTHOR_ID') \\\n",
" .sort_values(by='AUTHOR_ID_x') \\\n",
" .drop('AUTHOR_ID_y', axis=1) \\\n",
" .drop('AUTHOR_NAME_y', axis=1)\n",
"df.columns = ['AUTHOR_ID', 'CO-AUTHOR_ID', 'NO_OF_BOOKS', 'AUTHOR', 'CO-AUTHOR']\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"95188\n",
"Printing nodes over\n",
"95188\n",
"Printing nodes and edges over\n"
]
}
],
"source": [
"f = open(FILE_NAME, \"w\")\n",
"#helpers\n",
"s = \" \"\n",
"ss = s+s\n",
"sss = s+s+s\n",
"ssss = s+s+s+s\n",
"nl = \"\\n\"\n",
"\n",
"#loop helpers\n",
"added = []\n",
"ind = 0\n",
"\n",
"#Root node\n",
"f.write(\"graph\"+nl)\n",
"f.write(\"[\"+nl)\n",
"\n",
"#Write an edge\n",
"def write_edge(r):\n",
" f.write( ss + \"edge\" + nl)\n",
" f.write( ss + \"[\" + nl)\n",
" f.write( ssss + \"source\" + s + '\"' + str(r['AUTHOR_ID']) + '\"' + nl)\n",
" f.write( ssss + \"target\" + s + '\"' + str(r['CO-AUTHOR_ID']) + '\"' + nl)\n",
" f.write( ssss + \"value\" + s + str(r['NO_OF_BOOKS']) + nl)\n",
" f.write( ss + \"]\"+ nl)\n",
"\n",
"#Write a node\n",
"def write_node(r):\n",
" f.write( ss + \"node\" + nl)\n",
" f.write( ss + \"[\" + nl)\n",
" f.write( ssss + \"id\" + s + '\"' + str(r['AUTHOR_ID']) + '\"' + nl)\n",
" f.write( ssss + \"label\" + s + '\"' + str(r['AUTHOR']) + '\"' + nl)\n",
" f.write( ss + \"]\"+ nl)\n",
"\n",
"#Generate nodes\n",
"for i, r in df.iterrows():\n",
" #increment, as index not reliable\n",
" ind += 1\n",
" #Check for duplicates\n",
" if (r['AUTHOR_ID'] not in added):\n",
" #Add to list\n",
" added.append(r['AUTHOR_ID'])\n",
" write_node(r)\n",
" #print the progress \n",
" progress(ind)\n",
"\n",
"print(nl+\"Printing nodes over\")\n",
"\n",
"#flush index\n",
"ind = 0 \n",
"#Generate edges \n",
"for i, r in df.iterrows():\n",
" #increment, as index not reliable\n",
" ind += 1\n",
" if(r['AUTHOR_ID'] < r['CO-AUTHOR_ID']):\n",
" write_edge(r)\n",
" #print the progress \n",
" progress(ind)\n",
"\n",
"print(nl+\"Printing nodes and edges over\")\n",
"\n",
"#closing node\n",
"f.write(\"]\"+nl)\n",
"f.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
@TarekHammad337
Copy link

xml2csv

import sys, time
import pandas as pd
import datetime as dt
from IPython.display import display

import csv
import xml.etree.ElementTree as ET

def xml_to_csv(file_path, csv_name) -> None:
    """Convert a flat XML file of records into a CSV file.

    The XML root's direct children are treated as records. The tags of the
    first record's children become the header row, and the text of every
    record's children becomes one data row.

    Args:
        file_path: Path of the XML file to parse.
        csv_name: Path of the CSV file to write (overwritten if it exists).

    Raises:
        xml.etree.ElementTree.ParseError: If the XML cannot be parsed.
        OSError: If either file cannot be opened.
    """
    tree = ET.parse(file_path)
    root = tree.getroot()

    # newline='' hands line-ending control to the csv module; without it the
    # writer's "\r\n" terminator is translated again on Windows ("\r\r\n").
    with open(csv_name, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)

        # A root without records has no header to derive; leave the CSV
        # empty instead of failing with IndexError on root[0].
        if len(root) == 0:
            return

        writer.writerow([child.tag for child in root[0]])

        for record in root:
            writer.writerow([child.text for child in record])

# Command-line entry point: python <script> <xml_path> <csv_name>
if __name__ == '__main__':
    import sys
    import pathlib

    # Both positional arguments are mandatory: the input XML path and the
    # name of the CSV file to create.
    try:
        file_path = sys.argv[1]
        csv_name = sys.argv[2]
    except IndexError:
        sys.exit('Two arguments required. One xml path and one save file name.')

    # Path objects are not context managers (support was removed in
    # Python 3.13), so test the path directly instead of using `with`.
    if pathlib.Path(file_path).is_file():
        xml_to_csv(file_path, csv_name)
    else:
        sys.exit(f'Did not find {file_path}')

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment