-
-
Save clemsos/f15cc2e74ba7605717e758d0ed5866df to your computer and use it in GitHub Desktop.
Xinchejian wiki analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Analysis of data extracted from Xinchejian's wiki\n", | |
"\n", | |
"\n", | |
"We downloaded the data using the [export](https://wiki.xinchejian.com/wiki/Special:Export) Wikimedia feature. We used a crawled list of pages." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Imports and configuration for the wiki-dump analysis.\n", | |
"from lxml import etree\n", | |
"import codecs\n", | |
"import csv\n", | |
"import time\n", | |
"import os\n", | |
"\n", | |
"# Paths: the MediaWiki XML export lives under ./data; articles.csv is the output.\n", | |
"DATA_PATH_WIKI = os.path.join(os.getcwd(),'data')\n", | |
"FILENAME_WIKI = 'XinCheJian-20170710160715.xml'\n", | |
"FILENAME_ARTICLES = 'articles.csv'\n", | |
"ENCODING = \"utf-8\"\n", | |
"\n", | |
"dump_path = os.path.join(DATA_PATH_WIKI, FILENAME_WIKI)\n", | |
"\n", | |
"# Helper: format an elapsed time in seconds as H:MM:SS.ss (e.g. 3661.5 -> '1:01:01.50')\n", | |
"def hms_string(sec_elapsed):\n", | |
" h = int(sec_elapsed / (60 * 60))\n", | |
" m = int((sec_elapsed % (60 * 60)) / 60)\n", | |
" s = sec_elapsed % 60\n", | |
" return \"{}:{:>02}:{:>05.2f}\".format(h, m, s)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Parse the MediaWiki XML export, collect all page titles and count the pages.\n", | |
"totalCount = 0\n", | |
"start_time = time.time()\n", | |
"\n", | |
"titles = []\n", | |
"\n", | |
"def strip_tag_name(t):\n", | |
" \"\"\"Strip the '{namespace}' prefix from a qualified XML tag name.\"\"\"\n", | |
" idx = t.rfind(\"}\")\n", | |
" if idx != -1:\n", | |
" t = t[idx + 1:]\n", | |
" return t\n", | |
"\n", | |
"# parse the XML tree of the dump\n", | |
"tree = etree.parse(dump_path)\n", | |
"root = tree.getroot()\n", | |
"ns = 'http://www.mediawiki.org/xml/export-0.10/'\n", | |
"\n", | |
"# walk every <page> element, keeping its <title> text\n", | |
"for page in root.findall('{%s}page' % ns):\n", | |
" titles.append(page.find('{%s}title' % ns).text)\n", | |
" totalCount += 1\n", | |
"\n", | |
"elapsed_time = time.time() - start_time\n", | |
"print(\"Total pages: {:,}\".format(totalCount))\n", | |
"print(\"Elapsed time: {}\".format(hms_string(elapsed_time)))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Extract all content\n", | |
"\n", | |
"Another solution is to use the [Wikipedia Extractor](http://medialab.di.unipi.it/wiki/Wikipedia_Extractor)\n", | |
"\n", | |
"Command used (for reference)\n", | |
"\n", | |
"```\n", | |
"╭─clemsos@miner ~/Dev/EPFL/xcj-wiki ‹ruby-2.2.1› \n", | |
"╰─$ python wiki_extrator.py data/XinCheJian-20170710160715.xml -o extracts 1 ↵\n", | |
"INFO: Loaded 0 templates in 0.0s\n", | |
"INFO: Starting page extraction from data/XinCheJian-20170710160715.xml.\n", | |
"INFO: Using 3 extract processes.\n", | |
"INFO: Finished 3-process extraction of 625 articles in 6.8s (92.1 art/s)\n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Processing the information\n", | |
"\n", | |
"Finally ended up using [StatMediaWiki](https://github.com/emijrp/statmediawiki) which worked well to extract most common info and stats (time, edits, users, etc.).\n", | |
"\n", | |
"\n", | |
"Below is a co-editing graph (users linked by the pages they both edited) produced from the wiki data.\n", | |
"\n", | |
"Let's try to visualize it with Topogram.\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Creating topogram 'Xinchejian edits'\n", | |
"A topogram with the same name already exists\n", | |
"22 nodes, 52 edges\n", | |
"22 nodes created.\n", | |
"52 edges created.\n", | |
"done. Topogram has been updated. Check it at https://app.topogram.io/topograms/BWqQQzAwvyYSSsxzF/view\n" | |
] | |
} | |
], | |
"source": [ | |
"# Load the co-editing graph produced by StatMediaWiki (.dot file) and\n", | |
"# publish its nodes/edges to a Topogram instance for visualization.\n", | |
"import pygraphviz as pgv\n", | |
"import networkx as nx\n", | |
"from topogram_client import TopogramAPIClient\n", | |
"\n", | |
"# NOTE(review): absolute local path -- adjust to your statmediawiki checkout\n", | |
"DOT_PATH = '/home/clemsos/Dev/EPFL/xcj-wiki/statmediawiki/branches/interactive/output/usereditsnetwork-2.dot'\n", | |
"Gtmp = pgv.AGraph(DOT_PATH)\n", | |
"\n", | |
"# convert the pygraphviz graph into a networkx graph\n", | |
"G = nx.Graph(Gtmp)\n", | |
"\n", | |
"# credentials -- read from the environment, never hardcode them in a notebook\n", | |
"TOPOGRAM_URL = \"https://app.topogram.io\" # http://localhost:3000\n", | |
"USER = os.environ[\"TOPOGRAM_USER\"]\n", | |
"PASSWORD = os.environ[\"TOPOGRAM_PASSWORD\"]\n", | |
"\n", | |
"title = \"Xinchejian edits\"\n", | |
"\n", | |
"# connect to the topogram instance\n", | |
"topogram = TopogramAPIClient(TOPOGRAM_URL)\n", | |
"\n", | |
"# log in (the account must already exist)\n", | |
"topogram.user_login(USER, PASSWORD)\n", | |
"\n", | |
"print \"Creating topogram '%s'\"%title\n", | |
"\n", | |
"# create (or fetch, if it already exists) the topogram and keep its ID\n", | |
"r = topogram.create_topogram(title)\n", | |
"print r[\"message\"]\n", | |
"topogram_ID = r[\"data\"][0][\"_id\"]\n", | |
"\n", | |
"# build node and edge records; edge weight comes from the .dot 'label' attribute\n", | |
"nodes = [ { \"id\" : n, \"name\" : n } for n in G.nodes()]\n", | |
"edges = [\n", | |
" { \n", | |
" \"source\" : e[0],\n", | |
" \"target\" : e[1],\n", | |
" \"weight\" : int(e[2][\"label\"])\n", | |
" }\n", | |
" for e in G.edges(data=True)\n", | |
"]\n", | |
"\n", | |
"print \"%s nodes, %s edges\"%(len(nodes), len(edges))\n", | |
"\n", | |
"r = topogram.create_nodes(topogram_ID, nodes)\n", | |
"print \"%s nodes created.\"%len(r[\"data\"])\n", | |
"r = topogram.create_edges(topogram_ID, edges)\n", | |
"print \"%s edges created.\"%len(r[\"data\"])\n", | |
"\n", | |
"print \"done. Topogram has been updated. Check it at %s/topograms/%s/view\"%(TOPOGRAM_URL, topogram_ID)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.9" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment