Skip to content

Instantly share code, notes, and snippets.

@hengstchon
Created July 12, 2019 21:12
Show Gist options
  • Save hengstchon/4b4db4a6ac85d71c51907124d549d343 to your computer and use it in GitHub Desktop.
import os
from bs4 import BeautifulSoup, Comment, Tag, NavigableString
import requests
import unicodedata
result = './result.txt'
result_body = './body.txt'
md_dir = '/Users/yh/Documents/myblog/content/blog/'
url_file = 'url.txt'
def write(content, filename):
with open(filename, 'w') as f:
f.write(content)
def load(filename):
with open(filename) as f:
return f.read()
def get_raw(url):
html = requests.get(url).text
soup = BeautifulSoup(html, 'lxml')
return soup
def check_tag(soup):
tag_list = []
for e in soup.contents:
if isinstance(e, Tag) and e.name not in tag_list:
tag_list.append(e.name)
def deal_a(e):
if 'href' in e.attrs and e.contents:
href = e['href']
text = e.contents[0].strip()
return f'[{text}]({href})'
else:
# print('\nnot recognize: ', f'({e})')
return ''
def deal_br(e):
return '\n'
def deal_h2(e):
return f'## {e.contents[0].strip()}\n'
def deal_h3(e):
text = ''
for i in e.contents:
sub = convert(i)
if sub not in ['\n', '']:
text += sub
return f'### {text.strip()}\n'
def deal_b(e):
return f'**{e.contents[0].strip()}**'
def deal_i(e):
return f'*{e.contents[0].strip()}*'
def deal_blockquote(e):
text = ''
for i in e.contents:
sub = convert(i)
if sub not in [ '\n', '']:
text += f'>{sub} \n'
return text
def deal_div(e):
text = ''
for i in e.contents:
sub = convert(i)
if sub != '\n' or '':
text += f'\n{sub}'
return ''
tag_func = {
'a': deal_a,
'br': deal_br,
'h2': deal_h2,
'h3': deal_h3,
'b': deal_b,
'blockquote': deal_blockquote,
'div': deal_div,
'i': deal_i,
}
def process_tag(e):
if e.name in tag_func:
func = tag_func[e.name]
return func(e)
else:
return ''
def process_text(e):
return e.strip()
def convert(e):
if isinstance(e, Comment):
return ''
elif isinstance(e, Tag):
return process_tag(e)
elif isinstance(e, NavigableString):
return process_text(e)
else:
return ''
def clear_return(text):
if '\n\n' in text:
text = text.replace('\n\n', '\n')
return clear_return(text)
else:
return text
def add_return(text):
return text.replace('\n', '\n\n')
def connect_blockquote(text):
return text.replace(' \n\n>', ' \n>')
def fix_format(text):
text = clear_return(text)
text = add_return(text)
text = connect_blockquote(text)
return text
def markdownify(url):
soup = get_raw(url)
md_file = url.split('/')[-1].split('.')[0] + '.md'
md_path = os.path.join(md_dir, md_file)
title = soup.select('.post-title')[0].text.strip()
date = soup.select('.post-timestamp')[0]['data']
body = soup.select('.post-body')[0]
post_head = f'---\ntitle: {title}\ndate: {date}\n---\n\n'
post_footer = f'\n\n**版权声明** \n本博客所有的原创文章,作者皆保留版权。转载必须包含本声明,保持本文完整,并以超链接形式注明作者[编程随想](mailto:program.think@gmail.com) 和本文原始地址: \n[{url}]({url})'
post_body = ''
for e in body.contents:
s = convert(e)
post_body += s
post_body = fix_format(post_body)
with open(md_path, 'w') as f:
f.write(post_head)
f.write(post_body)
f.write(post_footer)
def main():
with open(url_file) as f:
url_list = f.readlines()
for url in url_list:
url = url[:-1]
markdownify(url)
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment