-
-
Save hghwng/324cc28b007a8f650ce3aac5df099ef8 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python | |
import bs4 | |
def convert_element(lines, level=1):
    """Render OPML ``outline`` elements as Org mode headlines.

    Args:
        lines: Iterable of BeautifulSoup nodes. Anything that is not an
            ``outline`` tag (for example a text node) is skipped.
        level: Headline depth of the items in ``lines``; it is the number
            of leading ``*`` characters and grows by one per nesting level.

    Returns:
        The Org text for ``lines`` and all nested outlines. Every headline
        and every note is terminated by a newline.
    """
    chunks = []
    for line in lines:
        if not isinstance(line, bs4.element.Tag) or line.name != 'outline':
            continue
        chunks.append('*' * level + ' ' + line.attrs.get('text', '') + '\n')
        # The note attribute is named '_note' in current exports; the older
        # 'note' spelling is kept as a fallback.
        note = line.attrs.get('_note', line.attrs.get('note'))
        if note is not None:
            chunks.append(note.replace('\r', '\n') + '\n')
        chunks.append(convert_element(line.children, level + 1))
    # Join once instead of growing a string with += on every item.
    return ''.join(chunks)
def convert_file(path):
    """Parse the OPML file at ``path`` and return its Org mode rendering.

    Args:
        path: Path of the OPML file to read.

    Returns:
        The Org text produced by ``convert_element`` for the first element
        matched by the ``html body opml`` selector.

    Raises:
        IndexError: If the selector matches no element.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(path) as source:
        root = bs4.BeautifulSoup(source, "lxml")
    return convert_element(root.select('html body opml')[0])
def main():
    """Convert the OPML file named on the command line to an ``.org`` file.

    The output is written next to the input, with the input's extension
    replaced by ``.org``. Exits with a usage message when no input path is
    given.
    """
    import os
    import sys

    if len(sys.argv) < 2:
        sys.exit('usage: ' + sys.argv[0] + ' FILE.opml')
    input_path = sys.argv[1]
    # Replace whatever extension the input has; cutting a fixed four
    # characters only produced a sensible name for files ending in "opml".
    output_path = os.path.splitext(input_path)[0] + '.org'
    result = convert_file(input_path)
    # Use a context manager so the output is flushed and closed reliably.
    with open(output_path, 'w') as target:
        target.write(result)
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
I have extended this script to convert inline Markdown formatting (links, bold text, and so on) to Org Mode format.
The conversion is done by pandoc
via pypandoc
(both of which you have to install; see here).
Warning: This script is extremely slow and takes ages to finish if you have a decent amount of content in your Dynalist files.
The reason is that pypandoc
spawns a new process for every invocation (i.e. it's not a binding via a C-API or so), and we have to call pypandoc
for every headline and for every note (I tried several approaches to avoid this, but none of them was easy to implement).
I have used the following command with GNU parallel
to run several instances of the script in parallel:
find . -name "*.opml" | parallel --bar --eta python convert.py {}
I'm using Linux, so I don't know if these dependencies are available on MacOS or Windows.
Here is the altered script:
#!/usr/bin/env python
import bs4
import pypandoc
def convert_element(lines, level=1):
    """Render OPML ``outline`` elements as Org mode headlines.

    The ``text`` attribute and the note of every outline are passed through
    ``md_to_org`` so that inline Markdown is converted to Org markup.

    Args:
        lines: Iterable of BeautifulSoup nodes. Anything that is not an
            ``outline`` tag (for example a text node) is skipped.
        level: Headline depth of the items in ``lines``; it is the number
            of leading ``*`` characters and grows by one per nesting level.

    Returns:
        The Org text for ``lines`` and all nested outlines.
    """
    chunks = []
    for line in lines:
        if not isinstance(line, bs4.element.Tag) or line.name != 'outline':
            continue
        headline = md_to_org(line.attrs.get('text', ''))
        chunks.append('*' * level + ' ' + headline + '\n')
        # '_note' is the current attribute name; fall back to the older
        # 'note' spelling so files using it keep their notes.
        note = line.attrs.get('_note', line.attrs.get('note'))
        if note is not None:
            chunks.append(md_to_org(note.replace('\r', '\n')) + '\n')
        chunks.append(convert_element(line.children, level + 1))
    # Join once instead of growing a string with += on every item.
    return ''.join(chunks)
def md_to_org(md):
    """Convert a Markdown snippet to Org mode markup using pandoc.

    Args:
        md: Markdown source text.

    Returns:
        The Org markup returned by ``pypandoc.convert_text``.
    """
    # pandoc re-wraps long lines by default; a wrapped headline would spill
    # onto the following lines and no longer be part of the Org headline.
    return pypandoc.convert_text(
        md, "org", format="md", extra_args=["--wrap=none"])
def convert_file(path):
    """Parse the OPML file at ``path`` and return its Org mode rendering.

    Args:
        path: Path of the OPML file to read.

    Returns:
        The Org text produced by ``convert_element`` for the first element
        matched by the ``html body opml`` selector.

    Raises:
        IndexError: If the selector matches no element.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(path, "r") as source:
        content = source.read()
    root = bs4.BeautifulSoup(content, "lxml")
    return convert_element(root.select('html body opml')[0])
def main():
    """Convert the OPML file named on the command line to an ``.org`` file.

    The output is written next to the input, with the input's extension
    replaced by ``.org``. Exits with a usage message when no input path is
    given.
    """
    import os
    import sys

    if len(sys.argv) < 2:
        sys.exit('usage: ' + sys.argv[0] + ' FILE.opml')
    input_path = sys.argv[1]
    print("Converting " + input_path)
    # Replace whatever extension the input has; cutting a fixed four
    # characters only produced a sensible name for files ending in "opml".
    output_path = os.path.splitext(input_path)[0] + '.org'
    result = convert_file(input_path)
    # Drop the blank line that would otherwise precede a headline.
    result = result.replace("\n\n*", "\n*")
    # Use a context manager so the output is flushed and closed reliably.
    with open(output_path, 'w') as target:
        target.write(result)
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
I know this is a bit old but the 'note' key is actually '_note' now. Thanks for this!