Created
July 12, 2019 21:12
-
-
Save hengstchon/4b4db4a6ac85d71c51907124d549d343 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from bs4 import BeautifulSoup, Comment, Tag, NavigableString | |
import requests | |
import unicodedata | |
# Scratch output paths (not referenced in the visible code — presumably
# used interactively during development; verify before removing).
result = './result.txt'
result_body = './body.txt'
# Destination directory for the generated markdown blog posts.
md_dir = '/Users/yh/Documents/myblog/content/blog/'
# Input list: one blog-post URL per line.
url_file = 'url.txt'
def write(content, filename):
    """Write *content* to *filename*, overwriting any existing file.

    Fix: encode explicitly as UTF-8 — the rest of this script handles
    Chinese text, which would fail under a non-UTF-8 platform default
    encoding.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(content)
def load(filename):
    """Return the entire contents of *filename* decoded as UTF-8.

    Fix: decode explicitly as UTF-8 instead of the platform default,
    matching the Chinese text this script processes.
    """
    with open(filename, encoding='utf-8') as f:
        return f.read()
def get_raw(url):
    """Fetch *url* over HTTP and return the lxml-parsed BeautifulSoup tree."""
    response = requests.get(url)
    return BeautifulSoup(response.text, 'lxml')
def check_tag(soup):
    """Return the distinct child tag names of *soup* in first-seen order.

    Fix: the original built ``tag_list`` but never returned it, so
    calling the function had no observable effect; the list is now
    returned.  Non-Tag children (text nodes, comments) are skipped.
    """
    tag_list = []
    for child in soup.contents:
        if isinstance(child, Tag) and child.name not in tag_list:
            tag_list.append(child.name)
    return tag_list
def deal_a(e):
    """Render an <a> element as a markdown link, or '' when it has no
    href attribute or no content."""
    if 'href' not in e.attrs or not e.contents:
        # Anchor with no target or no text — nothing sensible to emit.
        return ''
    label = e.contents[0].strip()
    return f'[{label}]({e["href"]})'
def deal_br(e):
    """A <br> maps to a single newline; the element itself is ignored."""
    return '\n'
def deal_h2(e):
    """Render an <h2> as a markdown '##' heading from its first child."""
    heading = e.contents[0].strip()
    return f'## {heading}\n'
def deal_h3(e):
    """Render an <h3> as a markdown '###' heading, converting every child
    and dropping empty or newline-only pieces."""
    pieces = [convert(child) for child in e.contents]
    text = ''.join(p for p in pieces if p not in ('\n', ''))
    return f'### {text.strip()}\n'
def deal_b(e):
    """Bold element -> markdown '**...**' around its first child's text."""
    inner = e.contents[0].strip()
    return f'**{inner}**'
def deal_i(e):
    """Italic element -> markdown '*...*' around its first child's text."""
    inner = e.contents[0].strip()
    return f'*{inner}*'
def deal_blockquote(e):
    """Render a <blockquote>: each non-empty converted child becomes one
    markdown '>' line (trailing two-space hard break preserved)."""
    lines = []
    for child in e.contents:
        piece = convert(child)
        if piece not in ('\n', ''):
            lines.append(f'>{piece} \n')
    return ''.join(lines)
def deal_div(e):
    """Convert a <div> by converting each child onto its own new line.

    Two fixes versus the original:
    * the filter was ``if sub != '\\n' or ''`` — since ``or ''`` is always
      falsy the empty-string check never applied; it now mirrors the
      ``not in`` filter used by the sibling handlers;
    * the accumulated ``text`` was discarded (``return ''``); it is now
      returned, so div content actually appears in the output.
    """
    text = ''
    for child in e.contents:
        sub = convert(child)
        if sub not in ('\n', ''):
            text += f'\n{sub}'
    return text
# Dispatch table: HTML tag name -> markdown converter function.
# Tags not listed here are dropped by process_tag.
tag_func = {
    'a': deal_a,
    'br': deal_br,
    'h2': deal_h2,
    'h3': deal_h3,
    'b': deal_b,
    'blockquote': deal_blockquote,
    'div': deal_div,
    'i': deal_i,
}
def process_tag(e):
    """Dispatch tag *e* to its handler in tag_func; unknown tags yield ''."""
    handler = tag_func.get(e.name)
    return handler(e) if handler is not None else ''
def process_text(e):
    """Plain text node: return it with surrounding whitespace removed."""
    return e.strip()
def convert(e):
    """Convert one parse-tree node to markdown text ('' when unhandled).

    The Comment check must precede the NavigableString check — the order
    in the original implies Comment would otherwise match as a string
    node and leak comment text into the output.
    """
    if isinstance(e, Comment):
        return ''
    if isinstance(e, Tag):
        return process_tag(e)
    if isinstance(e, NavigableString):
        return process_text(e)
    return ''
def clear_return(text):
    """Collapse runs of newlines: repeatedly squeeze '\\n\\n' to '\\n'
    until no double newline remains (iterative form of the original
    recursion — identical fixed point)."""
    while '\n\n' in text:
        text = text.replace('\n\n', '\n')
    return text
def add_return(text):
    """Double every newline so each line becomes its own markdown block."""
    return '\n\n'.join(text.split('\n'))
def connect_blockquote(text):
    """Re-join consecutive blockquote lines that paragraph spacing split
    apart, so multi-line quotes stay one markdown quote block."""
    split_quote = ' \n\n>'
    joined_quote = ' \n>'
    return text.replace(split_quote, joined_quote)
def fix_format(text):
    """Normalise whitespace for markdown: squeeze blank-line runs, add
    paragraph spacing, then mend blockquotes the spacing broke apart."""
    return connect_blockquote(add_return(clear_return(text)))
def markdownify(url):
    """Download the blog post at *url* and write it out as a markdown file.

    The output file name is the last path component of the URL with its
    extension replaced by ``.md``; the front matter carries the post's
    title and timestamp, and a copyright footer is appended.
    """
    soup = get_raw(url)
    # e.g. .../2019/07/some-post.html -> some-post.md
    md_file = url.split('/')[-1].split('.')[0] + '.md'
    md_path = os.path.join(md_dir, md_file)
    title = soup.select('.post-title')[0].text.strip()
    # NOTE(review): reads the 'data' attribute of .post-timestamp —
    # presumably the publication date; confirm against the blog's markup
    # (a 'datetime' or 'data-*' attribute would be more typical).
    date = soup.select('.post-timestamp')[0]['data']
    body = soup.select('.post-body')[0]
    post_head = f'---\ntitle: {title}\ndate: {date}\n---\n\n'
    post_footer = f'\n\n**版权声明** \n本博客所有的原创文章,作者皆保留版权。转载必须包含本声明,保持本文完整,并以超链接形式注明作者[编程随想](mailto:program.think@gmail.com) 和本文原始地址: \n[{url}]({url})'
    post_body = ''.join(convert(e) for e in body.contents)
    post_body = fix_format(post_body)
    # Fix: explicit UTF-8 — the footer and post content are Chinese text,
    # which would raise UnicodeEncodeError under a non-UTF-8 platform
    # default encoding.
    with open(md_path, 'w', encoding='utf-8') as f:
        f.write(post_head)
        f.write(post_body)
        f.write(post_footer)
def main():
    """Convert every URL listed in url_file (one per line) to markdown.

    Fixes versus the original: ``url[:-1]`` chopped the last character of
    the final URL whenever the file lacked a trailing newline, and blank
    lines were passed through to markdownify; ``strip()`` plus a truthy
    check handles both.
    """
    with open(url_file, encoding='utf-8') as f:
        for line in f:
            url = line.strip()
            if url:
                markdownify(url)


# Guard so importing this module does not kick off the whole scrape.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment