|
from __future__ import print_function |
|
from __future__ import division |
|
|
|
__author__ = 'timothy' |
|
|
|
from collections import defaultdict |
|
import codecs |
|
import json |
|
import os |
|
import logging |
|
from logging.config import dictConfig |
|
import ijson |
|
from lxml.html import parse |
|
from lxml import etree |
|
import sys |
|
|
|
|
|
# Logging setup: one formatter, stdout/stderr stream handlers, root at DEBUG.
# Only the 'stdout' handler is attached to the root logger; 'stderr' is
# defined but unused (kept for parity with the original configuration).
logging_config = {
    'version': 1,
    'formatters': {
        'basic': {
            'format': '%(asctime)s %(levelname)-6s %(module)s.%(funcName)s,%(lineno)d :: %(message)s',
            'datefmt': '%Y-%m-%d %H:%M:%S',
        },
    },
    'handlers': {
        'stdout': {
            'level': 'DEBUG',
            'formatter': 'basic',
            'class': 'logging.StreamHandler',
            'stream': sys.stdout,
        },
        'stderr': {
            'level': 'WARN',
            'formatter': 'basic',
            'class': 'logging.StreamHandler',
            'stream': sys.stderr,
        },
    },
    'root': {
        'handlers': ['stdout'],
        'level': 'DEBUG',
    },
}

dictConfig(logging_config)

logger = logging.getLogger(__file__)
|
|
|
########################### |
|
# Preprocess html |
|
########################### |
|
|
|
def writefile(lines, outfilename):
    """Write an iterable of lines to *outfilename* encoded as UTF-8.

    :param lines: iterable of strings (expected to carry their own newlines)
    :param outfilename: path of the file to create/overwrite
    """
    with codecs.open(outfilename, 'w+', 'utf-8') as outfile:
        # writelines batches the writes instead of one Python-level call
        # per line; it adds no separators, matching the original loop.
        outfile.writelines(lines)
|
|
|
def preprocess_bookmark_file(filename, outfile=None):
    """Create a cleaned-up copy of a bookmark archive for easier processing.

    File format:
    http://msdn.microsoft.com/en-us/library/aa753582(v=vs.85).aspx

    :param filename: path of the raw bookmarks file
    :param outfile: optional explicit output path; defaults to "<filename>.cleaned"
    :return: the path of the cleaned file that was written
    """
    newfilename = outfile if outfile else "%s.cleaned" % filename
    with codecs.open(filename, 'rb', 'utf-8') as infile:
        writefile(cleanlines(infile), newfilename)
    return newfilename
|
|
|
def cleanlines(fileobj):
    """Yield cleaned lines from a bookmarks file object, one at a time.

    Everything before the first line starting with "<DL>" (the head section)
    is dropped; unclosed "<DT>" tags and trailing "<p>" tags are stripped.
    """
    in_body = False

    for raw in fileobj:
        if not in_body:
            # Skip head lines until the opening <DL> flips us into the body;
            # the <DL> line itself is kept and cleaned below.
            if not raw.startswith("<DL>"):
                continue
            in_body = True

        # Drop the first unclosed "<DT>" tag, if present
        text = raw
        if text.lstrip().startswith("<DT>"):
            text = text.replace("<DT>", "", 1)

        # "<p>" tags always appear at line end; strip trailing whitespace
        # first, then the tag itself
        text = text.rstrip()
        if text.endswith("<p>"):
            text = text[:-3]

        # Re-append a unix newline (removed by rstrip above)
        yield text + "\n"
|
|
|
########################### |
|
# Yield bookmark objects |
|
########################### |
|
|
|
def yield_from_cleaned_archive(filename, return_counts=False):
    """Accepts the name of a bookmarks file that has been cleaned

    Yields bookmark and folder dicts.

    :param filename: path to a file produced by preprocess_bookmark_file
    :param return_counts: when True, yield a final summary dict with totals
    """
    # Force everyone to play nice with utf-8
    with codecs.open(filename, 'rb', 'utf-8') as f:
        p = etree.HTMLParser(encoding='utf-8')
        tree = parse(f, parser=p)

    root = tree.getroot()
    root_dl = root.xpath("/html/body/dl")[0]

    # Shared mutable state threaded through the recursion:
    # 'id' is a running folder counter, 'path' the current folder stack,
    # 'nbookmarks' the number of bookmarks seen so far
    folder_stats = {'id': 0, 'path': [], 'nbookmarks': 0}

    for e in root_dl.iterchildren():
        # Main processing
        for bm in yield_bookmarks(e, folder_stats):
            yield bm

    if return_counts:
        # Tag the summary with 'type' so consumers can tell it apart from
        # bookmark/folder records (consistent with yield_from_json)
        yield {
            'type': 'counts',
            'bookmark': folder_stats['nbookmarks'],
            'folder': folder_stats['id']
        }
|
|
|
def yield_bookmarks(e, folder_stats):
    """Recursive function that takes a DOM node and the current folder path

    Handles H3 (folder title), DL (folder contents, siblings of their H3)
    and A (bookmark) elements. ``folder_stats`` is shared mutable state:
    'id' is a running folder counter, 'path' the stack of folder ids
    leading to the current node, 'nbookmarks' a bookmark counter.

    Yields bookmark and folder dictionaries.

    :raises TypeError: when an unexpected tag is encountered
    """
    if e.tag == 'h3':
        # Add folder to the path
        # After an H3, a DL will be encountered soon that contains its children
        # This H3 and the DL are on the same level (siblings)
        folder_stats['id'] += 1

        # Dates are attribute strings; fall back to 0 when missing or
        # malformed. int() raises ValueError on a bad string (and TypeError
        # on a non-string), so both must be caught.
        try:
            date_added = int(e.get('add_date', 0))
        except (TypeError, ValueError):
            date_added = 0
        try:
            last_modified = int(e.get('last_modified', 0))
        except (TypeError, ValueError):
            last_modified = 0

        # yield folder -- copy the path so later push/pop operations cannot
        # mutate the dict after it has been handed out
        yield {
            'type': 'folder',
            'date_added': date_added,
            'last_modified': last_modified,
            'id': folder_stats['id'],
            'path': folder_stats['path'][:],
            'name': e.text
        }
        folder_stats['path'].append(folder_stats['id'])

    elif e.tag == 'dl':
        # Process its items recursively
        for child in e.iterchildren():
            for bm in yield_bookmarks(child, folder_stats):
                yield bm

        # After a DL is finished, the last title should be popped
        if len(folder_stats['path']):  # Handle top level node
            folder_stats['path'].pop()

    elif e.tag == 'a':
        href = e.get('href', None)
        add_date = e.get('add_date', None)
        title = e.text
        icon = e.get('icon', None)

        # Pass out a slice (copy) of the folder path so it doesn't update
        # every time the folder path changes
        yield {
            'type': 'bookmark',
            'uri': href,
            'date_added': add_date,
            'name': title,
            'icon': icon,
            'path': folder_stats['path'][:]  # can have multiple paths
        }
        folder_stats['nbookmarks'] += 1

    else:
        raise TypeError("Encountered unexpected tag '%s'" % e.tag)
|
|
|
|
|
def yield_from_json(json_file, return_counts=False):
    """Accepts the name of a json bookmarks file

    Streams the file with ijson (event-based parsing), so large archives
    are processed without loading the whole document into memory.

    Yields bookmark and folder dicts.

    :param json_file: path to a Firefox-style JSON bookmarks backup
    :param return_counts: when True, yield a final summary dict with totals
    """
    folder_number = 0  # increments every time we go into a new folder
    folder_path = []   # list of folder numbers representing current state

    # A list of dictionaries all collecting information about the item being scanned over
    # Every item except the last one is a folder
    item_detail = []

    # The type of information we are expecting based off of the last 'map_key' event
    await_type = ""

    # Rename fields for consistency with html version
    # Some fields are ignored:
    # - type is 'text/x-moz-place-container' or 'text/x-moz-place'
    # - guid is a unique base64 id
    # - index is the order of an item within a folder (which we don't maintain)
    name_transform = {
        'lastModified': 'last_modified',
        'dateAdded': 'date_added',
        'title': 'name',
        'iconuri': 'icon',
        'uri': 'uri'
    }

    def edit_last(key, value):
        # Edit a value for the last (innermost) item on the stack
        item_detail[-1][key] = value

    counts = defaultdict(int)
    with open(json_file) as f:
        parser = ijson.parse(f)
        for prefix, event, value in parser:

            if event == "start_map":
                # We always assume it is a bookmark and not a folder to start;
                # it is promoted to a folder when a 'children' array appears
                item_detail.append({"type": "bookmark"})
                continue

            if event == "end_map":
                # Ending either a folder or a bookmark
                ii = item_detail.pop()
                # Copy the path so later push/pop operations on folder_path
                # cannot mutate the dict after it has been yielded
                ii["path"] = list(folder_path)
                counts[ii['type']] += 1

                # Clean up dates to be consistent with html version
                # (json stores microseconds since epoch)
                date_added = ii.get("date_added")
                if date_added:
                    try:
                        ii['date_added'] = int(date_added // 1e6)
                    except TypeError:
                        del ii['date_added']

                last_modified = ii.get("last_modified")
                if last_modified:
                    try:
                        ii['last_modified'] = int(last_modified // 1e6)
                    except TypeError:
                        del ii['last_modified']

                yield ii
                continue

            if event == "start_array":
                if await_type != "children":
                    # A little format checking
                    raise TypeError("Expected await_type='children' when starting an array but found await_type='%s'" % await_type)

                await_type = ""

                # Now we know we were in a folder
                folder_number += 1
                folder_path.append(folder_number)
                edit_last("type", "folder")
                edit_last("id", folder_number)
                continue

            if event == "end_array":
                # Leaving a 'children' array; unwind the folder stack
                folder_path.pop()
                continue

            # Set state machine to expect value
            if event == "map_key":
                await_type = value
                continue

            # Add keys to last item (keys outside name_transform are ignored)
            if await_type and event in ["string", "number"]:
                key_name = name_transform.get(await_type)
                if key_name:
                    edit_last(key_name, value)
                await_type = ""
                continue

    if return_counts:
        counts['type'] = 'counts'
        yield counts
|
|
|
def yield_from_file(filename, verbose=False, return_counts=False, leave_temp=False):
    """Dispatch to the json or html bookmark reader based on file extension.

    For html input, a cleaned temporary file is created first and removed
    afterwards unless ``leave_temp`` is set. Exits the process on an
    unrecognized extension.
    """
    if filename.endswith("json"):
        for bookmark in yield_from_json(filename, return_counts):
            yield bookmark
    elif filename.endswith("html"):
        cleaned = preprocess_bookmark_file(filename)
        for bookmark in yield_from_cleaned_archive(cleaned, return_counts):
            yield bookmark
        if not leave_temp:
            if verbose:
                logger.info("Removing temporary file: %s" % (cleaned))
            os.remove(cleaned)
    else:
        if verbose:
            logger.error("Unknown file type; accepts '.json', '.html'")
        exit(1)
|
|
|
|
|
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Script for processing and transforming bookmark archive files')

    # 'clean'   -> write the preprocessed html file only
    # 'yield'   -> stream bookmark/folder dicts to stdout as json lines
    # 'cleaned' -> produce a final file where each bookmark's path is a
    #              "//"-joined string of folder names
    parser.add_argument('operation',
                        action='store',
                        default='yield',
                        choices=['clean', 'yield', 'cleaned'],
                        help='type of operation.')
    parser.add_argument('-f', '--filename',
                        dest="filename",
                        action="store",
                        help="input filename"
                        )
    parser.add_argument('-o', '--outfile',
                        dest="outfile",
                        action="store",
                        default=None,
                        help="output filename"
                        )
    parser.add_argument('-v', '--verbose',
                        dest="verbose",
                        action="store_true",
                        default=False,
                        help="more verbose output."
                        )
    parser.add_argument('--leave-temp',
                        dest="leave_temp",
                        action="store_true",
                        default=False,
                        help="don't clean up temporary files."
                        )

    args = parser.parse_args()

    if args.operation == 'clean':
        newfilename = preprocess_bookmark_file(args.filename, args.outfile)
        if args.verbose:
            logger.info("Created cleaned file: '%s'" % newfilename)

    if args.operation == 'yield':
        # NOTE(review): verbose doubles as return_counts here, so -v also
        # appends a summary record to the output stream
        for bm in yield_from_file(args.filename, return_counts=args.verbose):
            print(json.dumps(bm))

    if args.operation == "cleaned":
        folders_tmp = ".tmp.folders.txt"
        bookmarks_tmp = ".tmp.bookmarks.txt"
        bookmarks_final = args.outfile if args.outfile else "%s.final" % args.filename

        if args.verbose:
            logger.info("Processing bookmarks file into 2 temporary files")

        # Split file into 2 streams
        counts = defaultdict(int)
        with open(bookmarks_tmp, "w+") as bookmarks_file, open(folders_tmp, "w+") as folders_file:
            for bm in yield_from_file(args.filename, verbose=args.verbose, leave_temp=args.leave_temp):
                outfile = None
                counts[bm['type']] += 1
                if bm['type'] == 'bookmark':
                    outfile = bookmarks_file
                else:
                    outfile = folders_file
                print(json.dumps(bm), file=outfile)

        if args.verbose:
            logger.info("Processed %s bookmarks in %s folders" % (counts['bookmark'], counts['folder']))
            logger.info("Reading in folders to sort")

        # Read in folders and sort by id
        folders = []
        with open(folders_tmp) as folders_file:
            for line in folders_file:
                folders.append(json.loads(line))

        # Sort the whole thing in memory
        if args.verbose:
            logger.info("Sorting %s folders in memory" % len(folders))
        folders.sort(key=lambda folder: folder['id'])

        # Rewrite the folder temp file in sorted order
        with open(folders_tmp, "w+") as folders_file:
            for folder in folders:
                print(json.dumps(folder), file=folders_file)

        if args.verbose:
            logger.info("Completed sorting of folders into file: %s" % folders_tmp)
            logger.info("Printing name path bookmarks into file: %s" % bookmarks_final)

        # Combine into a single file where folders are separated by "//"
        # Efficient way is to read items from folders into a list, popping off last item when it is no longer seen
        # For now just use the sorted list of bookmarks we already have in memory
        with open(bookmarks_tmp) as bookmarks_file, open(folders_tmp) as folders_file, open(bookmarks_final, 'w+') as bookmarks_final_file:
            for line in bookmarks_file:
                bookmark = json.loads(line)
                path = bookmark['path']
                # assumes folder ids are consecutive and 1-based, so that
                # folders[f-1] is the folder whose 'id' is f -- TODO confirm
                name_path = "//".join([folders[f-1].get('name') for f in path])

                # The first 2 folders for json are "" and "Bookmarks Menu"
                if name_path.startswith("//Bookmarks Menu//"):
                    name_path = name_path[18:]

                bookmark['path'] = name_path
                print(json.dumps(bookmark), file=bookmarks_final_file)

        if not args.leave_temp:
            if args.verbose:
                logger.info("Removing temporary files %s, %s" % (bookmarks_tmp, folders_tmp))
            os.remove(bookmarks_tmp)
            os.remove(folders_tmp)

        if args.verbose:
            logger.info("Completed writing bookmarks into file: %s" % bookmarks_final)