Created
October 1, 2010 20:27
-
-
Save psychemedia/606803 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A quick and dirty script for converting a lablog XML dump into JSON format and
# from there on to GUESS (.gdf) format for importing into Gephi
from xml.etree import ElementTree as ET | |
import re | |
import simplejson | |
# Input XML dump and the two output files produced by this script.
infile = 'blogdump.xml'
jsonfile = 'blogdump.json'
guessfile = 'blogdump.gdf'

# Matches internal cross-references of the form [blog]123[/blog]
# (case-insensitive); group 2 is the numeric ID of the linked post.
bloglinks = re.compile(r'(\[blog\])(\d+)(\[\/blog\])',
                       re.IGNORECASE | re.DOTALL)

# Child elements of <post> copied verbatim (as text) into each post record.
POST_FIELDS = ('title', 'id', 'section', 'author', 'datestamp', 'timestamp')


def _element_text(post, tag):
    """Return the text of the child element *tag*, or None.

    None is returned both when the child is missing and when it is present
    but empty, so an incomplete post does not abort the whole conversion.
    """
    element = post.find(tag)
    return None if element is None else element.text


def parse_posts(root):
    """Convert every <post> element below *root* into a plain dict.

    Each dict holds the fields named in POST_FIELDS, a 'content' dict with
    the 'bbcode' and 'html' bodies, and 'internal-links': the list of post
    IDs (as strings, in order of appearance) referenced from the bbcode
    body via [blog]ID[/blog] markup.

    Returns the list of post dicts in document order.
    """
    posts = []
    # Element.iter() replaces getiterator(), which was removed in Python 3.9.
    for post in root.iter('post'):
        postdict = dict((tag, _element_text(post, tag)) for tag in POST_FIELDS)
        bbcode = _element_text(post, 'content')
        postdict['content'] = {'bbcode': bbcode,
                               'html': _element_text(post, 'html')}
        # An empty <content/> has text None; treat it as having no links
        # rather than letting the regex raise TypeError.
        postdict['internal-links'] = [
            link.group(2) for link in bloglinks.finditer(bbcode or '')
        ]
        # postdict['permalink'] = post.find('permalink').text
        posts.append(postdict)
    return posts


def write_json(posts, path):
    """Serialise *posts* as JSON to *path*, closing the file afterwards."""
    # The context manager guarantees the file is flushed and closed; the
    # original `f.close` (no parentheses) never actually closed it.
    with open(path, 'w') as f:
        f.write(simplejson.dumps(posts))


if __name__ == '__main__':
    tree = ET.parse(infile)
    root = tree.getroot()
    dumplist = parse_posts(root)
    write_json(dumplist, jsonfile)
#label is a reserved word used to display node labels
GDF_NODE_HEADER = 'nodedef> name, label STRING, section VARCHAR\n'
GDF_EDGE_HEADER = 'edgedef>node1,node2\n'


def build_gdf(posts):
    """Return the GUESS (.gdf) text describing *posts* as a link graph.

    *posts* is a list of dicts with at least the keys 'id', 'title',
    'section' and 'internal-links' (a list of linked post IDs).

    Every distinct post ID becomes a node named 'id<ID>' labelled with the
    post title. Every internal link becomes an edge from the linking post
    to the linked post. A link target that is not one of the posts in the
    dump is emitted exactly once as a node labelled 'external' in section
    'unknown', however many times it is referenced.
    """
    node_lines = []
    edge_lines = []
    known_ids = set()  # O(1) membership instead of scanning a list

    for post in posts:
        # Guard against the same post appearing twice in the dump.
        if post['id'] in known_ids:
            continue
        known_ids.add(post['id'])
        # A post with an empty title/section has None there; write it as an
        # empty field instead of failing on string concatenation.
        node_lines.append('id%s,"%s",%s\n' % (post['id'],
                                              post['title'] or '',
                                              post['section'] or ''))

    for post in posts:
        for link in post['internal-links']:
            edge_lines.append('id%s,id%s\n' % (post['id'], link))
            # Targets living on other blogs are not in the dump; define
            # each of them once so the edge has a node to attach to.
            if link not in known_ids:
                known_ids.add(link)
                node_lines.append('id%s,external,unknown\n' % link)

    return ''.join([GDF_NODE_HEADER] + node_lines
                   + [GDF_EDGE_HEADER] + edge_lines)


if __name__ == '__main__':
    # `guessfile` and `dumplist` are set up by the first half of this script.
    # The context manager closes the file; the original `f2.close` (no
    # parentheses) never did.
    with open(guessfile, 'w') as f2:
        f2.write(build_gdf(dumplist))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
There are several blogs running on our server, and the XML is just the download from my own blog, so there are links from my blog to others on the server. Those nodes aren't explicitly in the XML dump, but I know that the ID codes are unique. So this code first writes out a list of the nodes it discovers from the ID tag in the XML; then, in the second loop (the code from line 66 onwards in the original gist), we check whether the target of each link is in that list. If not, we add that node as an external link.
I'll set up a proper Git repo, invite you, and then clean up the XML. I need to talk to the DB manager about how to remove the SQL injection vulnerability on the back end.