bookmark_file_parser

@turtlemonvh · Created April 7, 2022 22:49

Scripts and functions for parsing bookmark files into friendlier formats.

From some work I was doing around 2016 on parsing bookmarks, for deduplicating and finding similar entries in a collection of 30K+ bookmarks I have accumulated over 15+ years.

The goal was to keep memory usage approximately constant so parsing can happen quickly in very resource-constrained environments. Hence the custom streaming XML and JSON parsing.
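
For context, this is the event model ijson provides, which is what keeps the JSON side at roughly constant memory; a minimal sketch, assuming a hypothetical bookmarks.json:

import ijson

# ijson emits (prefix, event, value) tuples one at a time, so only the
# current event is held in memory rather than the whole parsed document
with open("bookmarks.json") as f:
    for prefix, event, value in ijson.parse(f):
        if event == "map_key" and value == "uri":
            pass  # react to each event as it streams past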

Now that tools like browser-history exist, I'll likely revisit this project in the next couple of years, since I'll be able to make a lot of progress in just a few hours. At that point I'll likely convert all this into a more public repository.

Import

From HTML

# Output bookmark and folder objects, 1 json string per line
python bookmark_file_parser.py yield -f ../../test/fixtures/bookmarks_small.html

# Can be piped into a file
python bookmark_file_parser.py yield -f bookmarks_9_5_15.html > perline.json

# Include summary counts
python bookmark_file_parser.py yield -f bookmarks_9_5_15.html -v

# Output cleaned bookmarks, 1 JSON object per line, with a "//"-separated folder path included
# Include status information along the way
python bookmark_file_parser.py cleaned -f bookmarks_9_5_15.html -o cleaned.bookmarks -v

# Leave temporary files for debugging
python bookmark_file_parser.py cleaned -f bookmarks_9_5_15.html -o cleaned.bookmarks -v --leave-temp
  • process file to a tmp file
  • start parsing
  • yield bookmarks/folders one at a time (see the consumption sketch below)
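
For reference, the same pipeline can be consumed directly from Python instead of the CLI; a minimal sketch using yield_from_file from the script below (the filename is just an example):

from bookmark_file_parser import yield_from_file

# Iterate over bookmark/folder dicts as they are yielded
for item in yield_from_file("bookmarks_9_5_15.html"):
    if item['type'] == 'bookmark':
        print(item['uri'])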

From JSON

# Yielding 1 per line
python bookmark_file_parser.py yield -f bookmarks_9_5_15.small.json

# Just bookmarks in a file
python bookmark_file_parser.py cleaned -f bookmarks_9_5_15.small.json -o cleaned.bookmarks -v
  • use ijson to handle the mozilla bookmark archive
  • may require 2 passes (see the sketch after this list)
    • first pass puts in a placeholder for the path (a uuid)
      • uuids are mapped to actual paths as the parent name is encountered
      • records are written to a temporary file
    • second pass replaces the uuid in each item with the actual parent path
      • records are written to the final file
  • in the HTML format, H3s at the top of a section denote a directory name
    • in the JSON we don't get the directory name until after the child record is produced
    • folders are marked with "type": "text/x-moz-place-container"
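
A simplified sketch of that two-pass idea (first_pass_records is a hypothetical generator, and the actual script below uses integer folder ids rather than uuids):

import json

id_to_name = {}
with open(".tmp.records", "w") as tmp:
    # first pass: paths still contain placeholder ids; map each id to its name as it is seen
    for record in first_pass_records():  # hypothetical generator of bookmark/folder dicts
        if record['type'] == 'folder':
            id_to_name[record['id']] = record.get('name') or ""
        print(json.dumps(record), file=tmp)

with open(".tmp.records") as tmp, open("final.bookmarks", "w") as out:
    # second pass: resolve placeholder ids to the "//"-joined name path
    for line in tmp:
        record = json.loads(line)
        record['path'] = "//".join(id_to_name[i] for i in record['path'])
        print(json.dumps(record), file=out)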

Formats

Yielded

The format of the dictionaries yielded from each generator is the same.

// Bookmarks
{
    'type': 'bookmark',
    'uri': <string>,
    'path': <array of ints>,
    'name': <string>,
    'date_added': <int>,
    'icon': <string>
}

// Folders
{
    'type': 'folder',
    'id': <int>,
    'path': <array of ints>,
    'name': <string>,
    'last_modified': <int>,
    'date_added': <int>
}
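
In the HTML parser a folder dict is always yielded before its children, so the integer paths can be resolved against folders seen so far in a single pass; the JSON parser yields a folder only at its closing end_map event, after its children, which is why it needs the two-pass approach above. A sketch of single-pass resolution for HTML input (hypothetical filename):

from bookmark_file_parser import yield_from_file

folders_by_id = {}
for item in yield_from_file("bookmarks.html"):
    if item['type'] == 'folder':
        folders_by_id[item['id']] = item['name'] or ""
    else:
        # every id in 'path' has already been seen for HTML input
        name_path = "//".join(folders_by_id[i] for i in item['path'])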

Cleaned

The format of bookmarks on each line of the file produced by the cleaned function.

// Bookmarks
{
    'type': 'bookmark',
    'uri': <string>,
    'path': <'//' separated string>,
    'name': <string>,
    'date_added': <int>,
    'icon': <string>
}
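
An illustrative (made-up) example of one line in a cleaned file:

{"type": "bookmark", "uri": "https://example.com/", "path": "tech//python", "name": "Example", "date_added": 1441500000, "icon": null}

bookmark_file_parser.py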
from __future__ import print_function
from __future__ import division

__author__ = 'timothy'

from collections import defaultdict
import codecs
import json
import os
import logging
from logging.config import dictConfig
import ijson
from lxml.html import parse
from lxml import etree
import sys

logging_config = dict(
    version=1,
    formatters={
        'basic': {
            'format': '%(asctime)s %(levelname)-6s %(module)s.%(funcName)s,%(lineno)d :: %(message)s',
            'datefmt': '%Y-%m-%d %H:%M:%S'
        }
    },
    handlers={
        'stdout': {
            'level': 'DEBUG',
            'formatter': 'basic',
            'class': 'logging.StreamHandler',
            'stream': sys.stdout
        },
        'stderr': {
            'level': 'WARN',
            'formatter': 'basic',
            'class': 'logging.StreamHandler',
            'stream': sys.stderr
        },
    },
    root={
        'handlers': ['stdout'],
        'level': 'DEBUG'
    }
)
dictConfig(logging_config)
logger = logging.getLogger(__file__)
###########################
# Preprocess html
###########################

def writefile(lines, outfilename):
    """Write file one line at a time
    """
    with codecs.open(outfilename, 'w+', 'utf-8') as outfile:
        for line in lines:
            outfile.write(line)


def preprocess_bookmark_file(filename, outfile=None):
    """Takes a filename and creates a new version of the file cleaned up for easier processing
    File format:
    http://msdn.microsoft.com/en-us/library/aa753582(v=vs.85).aspx
    """
    if outfile:
        newfilename = outfile
    else:
        newfilename = "%s.cleaned" % filename
    with codecs.open(filename, 'rb', 'utf-8') as f:
        cleaned_lines = cleanlines(f)
        writefile(cleaned_lines, newfilename)
    return newfilename
def cleanlines(fileobj):
    """Takes a file object and returns cleaned lines one at a time
    """
    finished_head_section = False
    for line in fileobj:
        # Getting out of the 'ignore' state
        if line.startswith("<DL>") and not finished_head_section:
            finished_head_section = True
        # Ignore the head lines
        if not finished_head_section:
            continue
        # Clean up unclosed "<DT>" tags
        if line.lstrip().startswith("<DT>"):
            line = line.replace("<DT>", "", 1)  # Just remove these
        # Remove "<p>" tags; they are always at the end of lines
        line = line.rstrip()
        if line.endswith("<p>"):
            line = line[:-3]
        # Add the unix newline back since it was removed with rstrip
        yield line + "\n"
###########################
# Yield bookmark objects
###########################

def yield_from_cleaned_archive(filename, return_counts=False):
    """Accepts the name of a bookmarks file that has been cleaned
    Yields bookmark and folder dicts
    """
    # Force everyone to play nice with utf-8
    with codecs.open(filename, 'rb', 'utf-8') as f:
        p = etree.HTMLParser(encoding='utf-8')
        tree = parse(f, parser=p)
        root = tree.getroot()
        root_dl = root.xpath("/html/body/dl")[0]
        folder_stats = {'id': 0, 'path': [], 'nbookmarks': 0}
        for e in root_dl.iterchildren():
            # Main processing
            for bm in yield_bookmarks(e, folder_stats):
                yield bm
        if return_counts:
            yield {
                'bookmark': folder_stats['nbookmarks'],
                'folder': folder_stats['id']
            }
def yield_bookmarks(e, folder_stats):
    """Recursive function that takes a DOM node and the current folder path
    Yields bookmark dictionaries
    """
    if e.tag == 'h3':
        # Add folder to the path
        # After an H3, a DL will be encountered soon that contains its children
        # This H3 and the DL are on the same level (siblings)
        folder_stats['id'] += 1
        # yield folder
        # int() raises ValueError for non-numeric strings, TypeError for None
        try:
            date_added = int(e.get('add_date', 0))
        except (TypeError, ValueError):
            date_added = 0
        try:
            last_modified = int(e.get('last_modified', 0))
        except (TypeError, ValueError):
            last_modified = 0
        yield {
            'type': 'folder',
            'date_added': date_added,
            'last_modified': last_modified,
            'id': folder_stats['id'],
            'path': folder_stats['path'][:],  # slice so later mutation doesn't change the yielded dict
            'name': e.text
        }
        folder_stats['path'].append(folder_stats['id'])
    elif e.tag == 'dl':
        # Process its items recursively
        for child in e.iterchildren():
            for bm in yield_bookmarks(child, folder_stats):
                yield bm
        # After a DL is finished, the last title should be popped
        if len(folder_stats['path']):  # Handle top level node
            folder_stats['path'].pop()
    elif e.tag == 'a':
        href = e.get('href', None)
        title = e.text
        icon = e.get('icon', None)
        # Convert to int for consistency with the documented format
        try:
            add_date = int(e.get('add_date', 0))
        except (TypeError, ValueError):
            add_date = 0
        # Pass in a slice of the folder path so it doesn't update every time the folder path changes
        yield {
            'type': 'bookmark',
            'uri': href,
            'date_added': add_date,
            'name': title,
            'icon': icon,
            'path': folder_stats['path'][:]  # can have multiple paths
        }
        folder_stats['nbookmarks'] += 1
    else:
        raise TypeError("Encountered unexpected tag '%s'" % e.tag)
def yield_from_json(json_file, return_counts=False):
    """Accepts the name of a json bookmarks file
    Yields bookmark and folder dicts
    """
    folder_number = 0  # increments every time we go into a new folder
    folder_path = []  # list of folder numbers representing the current state
    # A stack of dictionaries, each collecting information about an item being scanned over
    # Every item except the last one is a folder
    item_detail = []
    # The type of information we are expecting based off of the last 'map_key' event
    await_type = ""
    # Rename fields for consistency with the html version
    # Some fields are ignored:
    # - type is 'text/x-moz-place-container' or 'text/x-moz-place'
    # - guid is a unique base64 id
    # - index is the order of an item within a folder (which we don't maintain)
    name_transform = {
        'lastModified': 'last_modified',
        'dateAdded': 'date_added',
        'title': 'name',
        'iconuri': 'icon',
        'uri': 'uri'
    }

    def edit_last(key, value):
        # Edit a value for the last item in the stack
        item_detail[-1][key] = value

    counts = defaultdict(int)
    with open(json_file) as f:
        parser = ijson.parse(f)
        for prefix, event, value in parser:
            if event == "start_map":
                # We always assume it is a bookmark and not a folder to start
                item_detail.append({"type": "bookmark"})
                continue
            if event == "end_map":
                # Ending either a folder or a bookmark
                ii = item_detail.pop()
                ii["path"] = folder_path[:]  # slice so later mutation doesn't change the yielded dict
                counts[ii['type']] += 1
                # Clean up dates to be consistent with the html version
                date_added = ii.get("date_added")
                if date_added:
                    try:
                        ii['date_added'] = int(date_added // 1e6)
                    except TypeError:
                        del ii['date_added']
                last_modified = ii.get("last_modified")
                if last_modified:
                    try:
                        ii['last_modified'] = int(last_modified // 1e6)
                    except TypeError:
                        del ii['last_modified']
                yield ii
                continue
            if event == "start_array":
                if await_type != "children":
                    # A little format checking
                    raise TypeError("Expected await_type='children' when starting an array but found await_type='%s'" % await_type)
                await_type = ""
                # Now we know we were in a folder
                folder_number += 1
                folder_path.append(folder_number)
                edit_last("type", "folder")
                edit_last("id", folder_number)
                continue
            if event == "end_array":
                folder_path.pop()
                continue
            # Set the state machine to expect a value
            if event == "map_key":
                await_type = value
                continue
            # Add keys to the last item
            if await_type and event in ["string", "number"]:
                key_name = name_transform.get(await_type)
                if key_name:
                    edit_last(key_name, value)
                await_type = ""
                continue
    if return_counts:
        counts['type'] = 'counts'
        yield counts
def yield_from_file(filename, verbose=False, return_counts=False, leave_temp=False):
    if filename.endswith("json"):
        for bm in yield_from_json(filename, return_counts):
            yield bm
    elif filename.endswith("html"):
        newfilename = preprocess_bookmark_file(filename)
        for bm in yield_from_cleaned_archive(newfilename, return_counts):
            yield bm
        if not leave_temp:
            if verbose:
                logger.info("Removing temporary file: %s" % newfilename)
            os.remove(newfilename)
    else:
        # Always report this error, not just in verbose mode
        logger.error("Unknown file type; accepts '.json', '.html'")
        exit(1)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description='Script for processing and transforming bookmark archive files')
parser.add_argument('operation',
action='store',
default='yield',
choices=['clean', 'yield', 'cleaned'],
help='type of operation.')
parser.add_argument('-f', '--filename',
dest="filename",
action="store",
help="input filename"
)
parser.add_argument('-o', '--outfile',
dest="outfile",
action="store",
default=None,
help="output filename"
)
parser.add_argument('-v', '--verbose',
dest="verbose",
action="store_true",
default=False,
help="more verbose output."
)
parser.add_argument('--leave-temp',
dest="leave_temp",
action="store_true",
default=False,
help="don't clean up temporary files."
)
args = parser.parse_args()
if args.operation == 'clean':
newfilename = preprocess_bookmark_file(args.filename, args.outfile)
if args.verbose:
logger.info("Created cleaned file: '%s'" % newfilename)
if args.operation == 'yield':
for bm in yield_from_file(args.filename, return_counts=args.verbose):
print(json.dumps(bm))
if args.operation == "cleaned":
folders_tmp = ".tmp.folders.txt"
bookmarks_tmp = ".tmp.bookmarks.txt"
bookmarks_final = args.outfile if args.outfile else "%s.final" % args.filename
if args.verbose:
logger.info("Processing bookmarks file into 2 temporary files")
# Split file into 2 streams
counts = defaultdict(int)
with open(bookmarks_tmp, "w+") as bookmarks_file, open(folders_tmp, "w+") as folders_file:
for bm in yield_from_file(args.filename, verbose=args.verbose, leave_temp=args.leave_temp):
outfile = None
counts[bm['type']] += 1
if bm['type'] == 'bookmark':
outfile = bookmarks_file
else:
outfile = folders_file
print(json.dumps(bm), file=outfile)
if args.verbose:
logger.info("Processed %s bookmarks in %s folders" % (counts['bookmark'], counts['folder']))
logger.info("Reading in folders to sort")
# Read in folders and sort by id
folders = []
with open(folders_tmp) as folders_file:
for line in folders_file:
folders.append(json.loads(line))
# Sort the whole thing in memory
if args.verbose:
logger.info("Sorting %s folders in memory" % len(folders))
folders.sort(key=lambda folder: folder['id'])
with open(folders_tmp, "w+") as folders_file:
for folder in folders:
print(json.dumps(folder), file=folders_file)
if args.verbose:
logger.info("Completed sorting of folders into file: %s" % folders_tmp)
logger.info("Printing name path bookmarks into file: %s" % bookmarks_final)
# Combine into a single file where folders are separated by "//"
# Efficient way is to read items from folders into a list, popping off last item when it is no longer seen
# For now just use the sorted list of bookmarks we already have in memory
with open(bookmarks_tmp) as bookmarks_file, open(folders_tmp) as folders_file, open(bookmarks_final, 'w+') as bookmarks_final_file:
for line in bookmarks_file:
bookmark = json.loads(line)
path = bookmark['path']
name_path = "//".join([folders[f-1].get('name') for f in path])
# The first 2 folders for json are "" and "Bookmarks Menu"
if name_path.startswith("//Bookmarks Menu//"):
name_path = name_path[18:]
bookmark['path'] = name_path
print(json.dumps(bookmark), file=bookmarks_final_file)
if not args.leave_temp:
if args.verbose:
logger.info("Removing temporary files %s, %s" % (bookmarks_tmp, folders_tmp))
os.remove(bookmarks_tmp)
os.remove(folders_tmp)
if args.verbose:
logger.info("Completed writing bookmarks into file: %s" % bookmarks_final)