Skip to content

Instantly share code, notes, and snippets.

@bkmeneguello
Created January 3, 2016 12:26
Show Gist options
  • Save bkmeneguello/4eddcde3f95e60e16c5f to your computer and use it in GitHub Desktop.
Save bkmeneguello/4eddcde3f95e60e16c5f to your computer and use it in GitHub Desktop.
#!/usr/bin/python
import json
from HTMLParser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Print one JSON object per bookmark found in ``<dl>``/``<dt>``/``<dd>`` markup.

    A bookmark is an ``<a>`` element: its ``href`` attribute becomes a
    ``url:<href>`` tag, its comma-separated ``tags`` attribute supplies the
    remaining tags, and its text becomes the name. Text following a ``<dd>``
    is attached to the current bookmark as ``info``.

    Each bookmark is written to stdout as a single line of the form
    ``{"name": ..., "tags": [...], "info": ...}``; ``info`` is omitted when
    there is no description.
    """

    def __init__(self):
        # Explicit base-class call (not super()) so this also works where
        # HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        self._reset_entry()

    def _reset_entry(self):
        """Forget the bookmark being collected and clear the position flags."""
        self.on_dt = False   # inside a <dt> that has not reached its <dd> yet
        self.on_dd = False   # a <dd> was seen for the current bookmark
        self.on_a = False    # between <a> and </a>
        self.node = None     # bookmark title (text of the <a>)
        self.tags = None     # raw comma-separated "tags" attribute
        self.url = None      # "href" attribute
        self.info = None     # description text collected after <dd>

    def handle_starttag(self, tag, attrs):
        """Track ``<dt>``, ``<a>`` and ``<dd>``; other tags are ignored."""
        attrs = dict(attrs)
        if tag == 'dt':
            # A new <dt> always ends the previous bookmark. This must not
            # depend on on_dt, because a <dd> clears that flag and the
            # previous bookmark would otherwise never be emitted.
            self.flush_node()
            self.on_dt = True
        elif tag == 'a':
            # .get(): anchors without href/tags must not abort the whole run.
            self.url = attrs.get('href')
            self.tags = attrs.get('tags')
            self.on_a = True
        elif tag == 'dd':
            self.on_dt = False
            self.on_dd = True

    def handle_endtag(self, tag):
        """Stop collecting the title at ``</a>``; emit the last bookmark at ``</dl>``."""
        if tag == 'a':
            self.on_a = False
        elif tag == 'dl':
            self.flush_node()

    def handle_data(self, data):
        """Collect text for the title and/or the description.

        The parser may deliver one run of text in several calls, so the
        pieces are appended rather than overwriting each other.
        """
        if self.on_a:
            self.node = (self.node or '') + data
        if self.on_dd:
            self.info = (self.info or '') + data

    def close(self):
        """Finish parsing and emit a bookmark that was still being collected."""
        HTMLParser.close(self)
        self.flush_node()

    def flush_node(self):
        """Print the collected bookmark, if any, then reset the parser state.

        Does nothing but reset when no ``<a>`` title or URL has been seen
        since the last flush, so it is safe to call at any time.
        """
        if self.node is not None or self.url is not None:
            tags = []
            if self.url is not None:
                tags.append('url:%s' % self.url)
            if self.tags:
                # An empty attribute (tags="") must not produce a "" tag.
                tags.extend(t for t in self.tags.split(',') if t)
            info = self.info.strip() if self.info else ''
            info_part = (', "info": %s' % json.dumps(info)) if info else ''
            # Each value goes through json.dumps so quotes, backslashes and
            # control characters are escaped; the key order is fixed by the
            # template rather than by dict ordering.
            print('{"name": %s, "tags": %s%s}' % (
                json.dumps(self.node or ''),
                json.dumps(tags),
                info_part,
            ))
        self._reset_entry()
# Script section: feed a bookmarks HTML file to MyHTMLParser, which prints
# its results to stdout as it parses.
import sys  # needed only here, for the optional path argument

# The input file may be given as the first command-line argument; without
# one, the original hard-coded location is used so existing runs still work.
bookmarks_path = sys.argv[1] if len(sys.argv) > 1 else '/home/bruno/Desktop/delicious.html'

parser = MyHTMLParser()
with open(bookmarks_path, 'r') as f:
    parser.feed(f.read())
# close() makes the parser process any input it is still buffering.
parser.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment