Skip to content

Instantly share code, notes, and snippets.

@takaxp
Forked from yono/evnt2org.py
Created December 17, 2011 07:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save takaxp/1489536 to your computer and use it in GitHub Desktop.
Save takaxp/1489536 to your computer and use it in GitHub Desktop.
Evernote から Export した HTML を Org-mode 形式に変換するスクリプト
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
evnt2org
convert html text that is exported from Evernote to org-mode
"""
import urllib
import re
import sys
# Regular expressions, one per kind of HTML markup the converter understands.
# All of them use re.DOTALL so that a tag or element may span several lines.

# Markup that is simply dropped.  Each alternative is the original one except
# the entry for <p>: its tag name is now terminated with \b, because the old
# "</?p.*?>" also matched <pre> and </pre> and thereby removed them before the
# `pre` rule below could ever see them.
_IGNORED_MARKUP = (
    r"</?body.*?>",
    r"</?html.*?>",
    r"<\?xml.*?>",
    r"</?meta.*?>",
    r"</?head.*?>",
    r"</?input.*?>",
    r"</?tbody.*?>",
    r"</?dt.*?>",
    r"</?ul.*?>",
    r"<!DOCTYPE.*?>",
    r"</?dl.*?>",
    r"</div>",
    r"</?font.*?>",
    r"</?p\b.*?>",
    r"</?table.*?>",
    r"</?span.*?>",
    r"</?ol.*?>",
    r"</?col.*?>",
    r"</?tt.*?>",
    r"</?dd.*?>",
)
ignores = re.compile("(" + "|".join(_IGNORED_MARKUP) + ")", re.DOTALL)
# Document title
title_tag = re.compile(r"<title.*?>(?P<title>.*?)</title>", re.DOTALL)
# Headings h1..h5 (only attribute-less opening tags are recognised)
h1_tag = re.compile(r"<h1>(?P<h1>.*?)</h1>", re.DOTALL)
h2_tag = re.compile(r"<h2>(?P<h2>.*?)</h2>", re.DOTALL)
h3_tag = re.compile(r"<h3>(?P<h3>.*?)</h3>", re.DOTALL)
h4_tag = re.compile(r"<h4>(?P<h4>.*?)</h4>", re.DOTALL)
h5_tag = re.compile(r"<h5>(?P<h5>.*?)</h5>", re.DOTALL)
# Hyperlinks: captures the target as "href" and the link text as "title"
a_tag = re.compile(r"<a.*?href=(\"|')(?P<href>.*?)(\"|').*?>(?P<title>.*?)</a>", re.DOTALL)
# Markup that becomes a line break.  The <br> alternative accepts <br>, <br/>,
# <br /> and <br ...attributes.../>; the original only accepted the exact
# spelling "<br/>".
newline = re.compile(r"(<div.*?>|<br\b[^>]*>|</?tr.*?>)", re.DOTALL)
# Emphasis rendered as bold (<em> is deliberately treated like <strong>/<b>)
bold = re.compile(r"<strong>(?P<bold>.*?)</strong>", re.DOTALL)
bold2 = re.compile(r"<b>(?P<bold>.*?)</b>", re.DOTALL)
bold3 = re.compile(r"<em>(?P<em>.*?)</em>", re.DOTALL)
# Italic
italic = re.compile(r"<i>(?P<italic>.*?)</i>", re.DOTALL)
# Underline
under = re.compile(r"<u>(?P<under>.*?)</u>", re.DOTALL)
# Strike-through
strike = re.compile(r"<strike>(?P<strike>.*?)</strike>", re.DOTALL)
strike2 = re.compile(r"<s>(?P<strike>.*?)</s>", re.DOTALL)
# Code / preformatted text
code = re.compile(r"<code>(?P<code>.*?)</code>", re.DOTALL)
pre = re.compile(r"<pre>(?P<pre>.*?)</pre>", re.DOTALL)
# List items
li = re.compile(r"<li>(?P<li>.*?)</li>", re.DOTALL)
# Block quotes
quote = re.compile(r"<blockquote>(?P<quote>.*?)</blockquote>", re.DOTALL)
# Centered text
center = re.compile(r"<center>(?P<center>.*?)</center>", re.DOTALL)
# Tables: rowh1/rowd1 match the boundary between two adjacent cells, while
# rowh2/rowd2 match any remaining single <th>/<td> tag.
rowh1 = re.compile(r"</th.*?><th.*?>", re.DOTALL)
rowh2 = re.compile(r"</?th.*?>", re.DOTALL)
rowd1 = re.compile(r"</td.*?><td.*?>", re.DOTALL)
rowd2 = re.compile(r"</?td.*?>", re.DOTALL)
# Images: captures the src attribute of a self-closing <img ... /> tag
img = re.compile(r"<img .*?src=\"(?P<img>.*?)\".*?/>", re.DOTALL)
# urllib.unquote only exists on Python 2; on Python 3 the same function lives
# in urllib.parse.  Resolve it once so convert() works on either interpreter.
try:
    _unquote = urllib.unquote
except AttributeError:
    from urllib.parse import unquote as _unquote


def convert(evnt):
    """Convert Evernote-exported HTML text to Org-mode markup.

    The substitutions are applied strictly in the order listed below, because
    earlier rules remove or rewrite markup that later rules would otherwise
    see.  After all substitutions the whole text is percent-decoded.

    Parameters:
        evnt: the HTML document as a string.

    Returns:
        The converted Org-mode text.
    """
    rules = (
        (ignores, ""),
        (title_tag, "* \\g<title>\n"),
        (h1_tag, "** \\g<h1>\n"),
        (h2_tag, "*** \\g<h2>\n"),
        (h3_tag, "**** \\g<h3>\n"),
        (h4_tag, "***** \\g<h4>\n"),
        (h5_tag, "****** \\g<h5>\n"),
        # An Org link is [[target][description]]; the original replacement
        # omitted the final "]" and therefore emitted unterminated links.
        (a_tag, "[[\\g<href>][\\g<title>]]"),
        (newline, "\n"),
        (bold, "*\\g<bold>*"),
        (bold2, "*\\g<bold>*"),
        (bold3, "*\\g<em>*"),
        (italic, "/\\g<italic>/"),
        (under, "_\\g<under>_"),
        (strike, "+\\g<strike>+"),
        (strike2, "+\\g<strike>+"),
        (code, "#+BEGIN_SRC text\n\\g<code>\n#+END_SRC\n"),
        (pre, "#+BEGIN_SRC text\n\\g<pre>\n#+END_SRC\n"),
        (li, "- \\g<li>\n"),
        (quote, "#+BEGIN_QUOTE \n\\g<quote>\n#+END_QUOTE\n"),
        (center, "#+BEGIN_CENTER \n\\g<center>\n#+END_CENTER\n"),
        # Cell boundaries first, then whatever single cell tags remain.
        (rowh1, "|"),
        (rowh2, "|"),
        (rowd1, "|"),
        (rowd2, "|"),
        (img, "[img:\\g<img>]"),
    )
    for pattern, replacement in rules:
        evnt = pattern.sub(replacement, evnt)
    # NOTE(review): this percent-decodes the entire document, not only URLs,
    # so literal "%xx" sequences in body text are decoded too.  Kept as in the
    # original behaviour.
    return _unquote(evnt)


def main(argv=None):
    """Read the HTML file named on the command line and print it as Org text.

    Parameters:
        argv: argument vector; defaults to sys.argv.  argv[1] is the path of
            the exported HTML file.

    Returns:
        0 on success, 2 when the file argument is missing.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.stderr.write("usage: %s EXPORTED.html\n" % argv[0])
        return 2
    # Use a context manager so the file handle is closed deterministically.
    with open(argv[1]) as source:
        html = source.read()
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(convert(html))
    return 0


if __name__ == "__main__":
    sys.exit(main())
#!/usr/bin/env python
# -*- coding:utf-8 -*-
#!c:\python27\python.exe
# coding: utf-8
u"""========================================
Extract HTML tags using regular expressions
========================================"""
import re
# Sample HTML document passed to tag_start() by main() as demo input.  The
# ${title} and ${body} placeholders are plain text here; no code visible in
# this file substitutes them.
text=u"""\
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html lang="ja">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>${title}</title>
</head>
<body>
${body}
</body>
</html>\
"""
def tag_all(str):
    """Return every tag found in *str*, in order of appearance.

    A "tag" is any shortest run of the form ``<...>`` that does not span a
    line break (the pattern is compiled without re.DOTALL).  Opening tags,
    closing tags, comments and declarations are all included, and duplicates
    are kept.

    Parameters:
        str: the HTML text to scan.  The name shadows the builtin but is kept
            because it is part of the function's public signature.

    Returns:
        A list of the matched tag strings, angle brackets included.
    """
    # A plain raw string replaces the original ur"..." literal: the pattern is
    # identical, but the "ur" prefix is a SyntaxError on Python 3.
    reg = re.compile(r"""<(.*?)>""")
    return [match.group(0) for match in reg.finditer(str)]
def tag_end(str):
    """Return every closing tag found in *str*, in order of appearance.

    A closing tag is any shortest run of the form ``</...>`` that does not
    span a line break.  Duplicates are kept.

    Parameters:
        str: the HTML text to scan.  The name shadows the builtin but is kept
            because it is part of the function's public signature.

    Returns:
        A list of the matched closing-tag strings, angle brackets included.
    """
    # Pattern for closing tags only.  Written as a plain raw string because
    # the original ur"..." prefix is a SyntaxError on Python 3.
    reg = re.compile(r"""</(.*?)>""")
    return [match.group(0) for match in reg.finditer(str)]
def tag_start(str):
    """Return the distinct non-closing tags of *str* in document order.

    Every tag reported by tag_all() that is not also reported by tag_end() is
    returned exactly once.  The original built the result from a set
    difference, so its order was arbitrary; the same set of tags is now
    returned in order of first appearance, which makes the output
    reproducible.

    Parameters:
        str: the HTML text to scan.  The name shadows the builtin but is kept
            because it is part of the function's public signature.

    Returns:
        A list of unique tag strings, angle brackets included.
    """
    closing = set(tag_end(str))
    seen = set()
    tag_start_list = []
    for tag in tag_all(str):
        if tag in closing or tag in seen:
            continue
        seen.add(tag)
        tag_start_list.append(tag)
    return tag_start_list
def main():
    """Print the sample document followed by a numbered list of its start tags."""
    res = tag_start(text)
    # Single-argument print(...) calls produce the same output as the original
    # print statements on Python 2 and are also valid syntax on Python 3.
    print(u"[元のテキスト]")
    print(text)
    print("-" * 40)
    print(u"[開始タグ一覧]")
    # Number the tags starting from 1.
    for number, elem in enumerate(res, 1):
        print(u"%s : %s" % (number, elem))


if __name__ == '__main__':
    main()
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import re
import extract_htmltag
# Directory whose files are scanned for HTML tags.
dirname = "mynotes"
# Matches everything from the first space to the end of the tag text, i.e. the
# attribute part; replacing it with ">" reduces '<a href="x">' to '<a>'.
gomi = re.compile(r" .*")


def collect_tag_names(directory):
    """Return the set of attribute-stripped start tags used in *directory*.

    Every regular file directly inside *directory* is read and passed to
    extract_htmltag.tag_start(); sub-directories are skipped.

    Parameters:
        directory: path of the directory to scan.

    Returns:
        A set of tag strings such as "<a>" or "<div>".
    """
    names = set()
    for entry in os.listdir(directory):
        path = os.path.join(directory, entry)
        if not os.path.isfile(path):
            continue
        # Close each file as soon as it has been read instead of leaking the
        # handle until garbage collection.
        with open(path) as handle:
            htmltext = handle.read()
        for tag in extract_htmltag.tag_start(htmltext):
            names.add(gomi.sub(">", tag))
    return names


def main():
    """Print every collected tag that contains no slash, one per line."""
    # Sorted so the listing is reproducible; the original iterated a dict
    # whose order was arbitrary.
    for tag in sorted(collect_tag_names(dirname)):
        if "/" not in tag:
            print(tag)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment