Skip to content

Instantly share code, notes, and snippets.

@hengstchon
Created July 12, 2019 21:12
Show Gist options
  • Save hengstchon/4b4db4a6ac85d71c51907124d549d343 to your computer and use it in GitHub Desktop.
import os
from bs4 import BeautifulSoup, Comment, Tag, NavigableString
import requests
import unicodedata
result = './result.txt'
result_body = './body.txt'
md_dir = '/Users/yh/Documents/myblog/content/blog/'
url_file = 'url.txt'
def write(content, filename):
with open(filename, 'w') as f:
f.write(content)
def load(filename):
with open(filename) as f:
return f.read()
def get_raw(url):
html = requests.get(url).text
soup = BeautifulSoup(html, 'lxml')
return soup
def check_tag(soup):
tag_list = []
for e in soup.contents:
if isinstance(e, Tag) and e.name not in tag_list:
tag_list.append(e.name)
def deal_a(e):
if 'href' in e.attrs and e.contents:
href = e['href']
text = e.contents[0].strip()
return f'[{text}]({href})'
else:
# print('\nnot recognize: ', f'({e})')
return ''
def deal_br(e):
return '\n'
def deal_h2(e):
return f'## {e.contents[0].strip()}\n'
def deal_h3(e):
text = ''
for i in e.contents:
sub = convert(i)
if sub not in ['\n', '']:
text += sub
return f'### {text.strip()}\n'
def deal_b(e):
return f'**{e.contents[0].strip()}**'
def deal_i(e):
return f'*{e.contents[0].strip()}*'
def deal_blockquote(e):
text = ''
for i in e.contents:
sub = convert(i)
if sub not in [ '\n', '']:
text += f'>{sub} \n'
return text
def deal_div(e):
text = ''
for i in e.contents:
sub = convert(i)
if sub != '\n' or '':
text += f'\n{sub}'
return ''
tag_func = {
'a': deal_a,
'br': deal_br,
'h2': deal_h2,
'h3': deal_h3,
'b': deal_b,
'blockquote': deal_blockquote,
'div': deal_div,
'i': deal_i,
}
def process_tag(e):
if e.name in tag_func:
func = tag_func[e.name]
return func(e)
else:
return ''
def process_text(e):
return e.strip()
def convert(e):
if isinstance(e, Comment):
return ''
elif isinstance(e, Tag):
return process_tag(e)
elif isinstance(e, NavigableString):
return process_text(e)
else:
return ''
def clear_return(text):
if '\n\n' in text:
text = text.replace('\n\n', '\n')
return clear_return(text)
else:
return text
def add_return(text):
return text.replace('\n', '\n\n')
def connect_blockquote(text):
return text.replace(' \n\n>', ' \n>')
def fix_format(text):
text = clear_return(text)
text = add_return(text)
text = connect_blockquote(text)
return text
def markdownify(url):
soup = get_raw(url)
md_file = url.split('/')[-1].split('.')[0] + '.md'
md_path = os.path.join(md_dir, md_file)
title = soup.select('.post-title')[0].text.strip()
date = soup.select('.post-timestamp')[0]['data']
body = soup.select('.post-body')[0]
post_head = f'---\ntitle: {title}\ndate: {date}\n---\n\n'
post_footer = f'\n\n**版权声明** \n本博客所有的原创文章,作者皆保留版权。转载必须包含本声明,保持本文完整,并以超链接形式注明作者[编程随想](mailto:program.think@gmail.com) 和本文原始地址: \n[{url}]({url})'
post_body = ''
for e in body.contents:
s = convert(e)
post_body += s
post_body = fix_format(post_body)
with open(md_path, 'w') as f:
f.write(post_head)
f.write(post_body)
f.write(post_footer)
def main():
with open(url_file) as f:
url_list = f.readlines()
for url in url_list:
url = url[:-1]
markdownify(url)
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment