prnake/csdn2markdown.py

## csdn2markdown.py
from lxml import etree
import requests
import html2text
from bs4 import BeautifulSoup
import codecs

def request_get(url):
    """Fetch *url* with a desktop-browser User-Agent and return the response.

    Args:
        url: Absolute URL to request.

    Returns:
        The ``requests`` response object for the GET request. The status
        code is not checked here; callers inspect ``status_code`` themselves.

    Raises:
        requests.RequestException: On connection errors or when the server
            does not answer within the 3-second timeout.
    """
    # The header value must not repeat the "User-Agent: " field name. The
    # original literal did, which put "User-Agent: User-Agent: Mozilla/..."
    # on the wire.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/41.0.2272.118 Safari/537.36',
    }
    # No Session object is created: the request is a one-off and the
    # previously created session was never used or closed.
    return requests.get(url, headers=headers, timeout=3)


def _build_front_matter(soup, title):
    """Assemble a Hexo front-matter header (title, category, tags) from *soup*."""
    lines = ['title: ' + title + '\n']

    # Category: text of the first "tag-link" element inside the
    # "tags-box space" container. An article may have no category at all.
    category_box = soup.find(attrs={'class': 'tags-box space'})
    category_link = None
    if category_box is not None:
        category_link = category_box.find(attrs={'class': 'tag-link'})
    if category_link is not None:
        category = category_link.get_text().replace('\t', '').strip()
        if category:
            lines.append('categories:\n- ' + category + '\n')

    # Tags: the third text line of the "tags-box artic-tag-box" container
    # holds the tag names separated by tabs/spaces.
    # NOTE(review): this depends on the page layout at the time of writing.
    tag_box = soup.find(attrs={'class': 'tags-box artic-tag-box'})
    rows = tag_box.get_text().split('\n') if tag_box is not None else []
    if len(rows) > 2:
        names = [name for name in rows[2].replace('\t', ' ').split(' ') if name]
        if names:
            lines.append('tags:\n')
            lines.extend('- ' + name + '\n' for name in names)

    return '---\n' + ''.join(lines) + '---\n\n'


def CrawlingItemBlog(base_url, id, with_front_matter=False):
    """Download one blog article and save it as Markdown under ``./mds/``.

    The article page is fetched from ``base_url + 'article/details/' + id``.
    Its title (element with class ``title-article``) becomes the file name
    and the element with id ``content_views`` is converted to Markdown.

    Args:
        base_url: Blog base URL; ``'article/details/'`` and *id* are appended
            to it verbatim, so it is expected to end with ``/``.
        id: Article identifier as a string. (The name shadows the builtin but
            is kept so existing keyword callers continue to work.)
        with_front_matter: When true, prepend a Hexo front-matter header
            (title, category, tags) to the Markdown. Defaults to ``False``,
            which writes only the converted article body as before.

    Returns:
        ``True`` when the page was fetched (HTTP 200) and parsed; ``False``
        when the status code is not 200 or the page lacks the title or
        content element. A failure to write the file does not change the
        result: the title is printed and ``True`` is still returned.

    Note:
        The publish date is not extracted.
    """
    url = base_url + 'article/details/' + id
    item_html = request_get(url)
    if item_html.status_code != 200:
        return False

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning. lxml is already a
    # dependency of this file.
    soup = BeautifulSoup(item_html.text, 'lxml')
    content = soup.find(id="content_views")
    title_article = soup.find(attrs={'class': 'title-article'})
    if content is None or title_article is None:
        # Not an article page (or the layout changed): report failure
        # instead of crashing with AttributeError on None.
        return False

    # The title doubles as the output file name.
    file_name = title_article.get_text()

    # Convert the article HTML to Markdown.
    text_maker = html2text.HTML2Text()
    text_maker.bypass_tables = False
    text = text_maker.handle(content.prettify())

    if with_front_matter:
        text = _build_front_matter(soup, file_name) + text

    # Some titles contain characters that are not valid in a file name, so
    # creating the file can fail; report the title and carry on.
    try:
        with codecs.open('./mds/' + file_name + '.md', 'w', encoding='utf-8') as f:
            f.write(text)
    except (OSError, ValueError):
        print(file_name)

    return True
	from lxml import etree
	import requests
	import html2text
	from bs4 import BeautifulSoup
	import codecs

	def request_get(url):
	    """Fetch *url* with a desktop-browser User-Agent and return the response.

	    Args:
	        url: Absolute URL to request.

	    Returns:
	        The ``requests`` response object for the GET request. The status
	        code is not checked here; callers inspect ``status_code`` themselves.

	    Raises:
	        requests.RequestException: On connection errors or when the server
	            does not answer within the 3-second timeout.
	    """
	    # The header value must not repeat the "User-Agent: " field name. The
	    # original literal did, which put "User-Agent: User-Agent: Mozilla/..."
	    # on the wire.
	    headers = {
	        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) '
	                      'AppleWebKit/537.36 (KHTML, like Gecko) '
	                      'Chrome/41.0.2272.118 Safari/537.36',
	    }
	    # No Session object is created: the request is a one-off and the
	    # previously created session was never used or closed.
	    return requests.get(url, headers=headers, timeout=3)


	def _build_front_matter(soup, title):
	    """Assemble a Hexo front-matter header (title, category, tags) from *soup*."""
	    lines = ['title: ' + title + '\n']

	    # Category: text of the first "tag-link" element inside the
	    # "tags-box space" container. An article may have no category at all.
	    category_box = soup.find(attrs={'class': 'tags-box space'})
	    category_link = None
	    if category_box is not None:
	        category_link = category_box.find(attrs={'class': 'tag-link'})
	    if category_link is not None:
	        category = category_link.get_text().replace('\t', '').strip()
	        if category:
	            lines.append('categories:\n- ' + category + '\n')

	    # Tags: the third text line of the "tags-box artic-tag-box" container
	    # holds the tag names separated by tabs/spaces.
	    # NOTE(review): this depends on the page layout at the time of writing.
	    tag_box = soup.find(attrs={'class': 'tags-box artic-tag-box'})
	    rows = tag_box.get_text().split('\n') if tag_box is not None else []
	    if len(rows) > 2:
	        names = [name for name in rows[2].replace('\t', ' ').split(' ') if name]
	        if names:
	            lines.append('tags:\n')
	            lines.extend('- ' + name + '\n' for name in names)

	    return '---\n' + ''.join(lines) + '---\n\n'


	def CrawlingItemBlog(base_url, id, with_front_matter=False):
	    """Download one blog article and save it as Markdown under ``./mds/``.

	    The article page is fetched from ``base_url + 'article/details/' + id``.
	    Its title (element with class ``title-article``) becomes the file name
	    and the element with id ``content_views`` is converted to Markdown.

	    Args:
	        base_url: Blog base URL; ``'article/details/'`` and *id* are appended
	            to it verbatim, so it is expected to end with ``/``.
	        id: Article identifier as a string. (The name shadows the builtin but
	            is kept so existing keyword callers continue to work.)
	        with_front_matter: When true, prepend a Hexo front-matter header
	            (title, category, tags) to the Markdown. Defaults to ``False``,
	            which writes only the converted article body as before.

	    Returns:
	        ``True`` when the page was fetched (HTTP 200) and parsed; ``False``
	        when the status code is not 200 or the page lacks the title or
	        content element. A failure to write the file does not change the
	        result: the title is printed and ``True`` is still returned.

	    Note:
	        The publish date is not extracted.
	    """
	    url = base_url + 'article/details/' + id
	    item_html = request_get(url)
	    if item_html.status_code != 200:
	        return False

	    # Name the parser explicitly: without it BeautifulSoup picks whichever
	    # parser happens to be installed and emits a warning. lxml is already a
	    # dependency of this file.
	    soup = BeautifulSoup(item_html.text, 'lxml')
	    content = soup.find(id="content_views")
	    title_article = soup.find(attrs={'class': 'title-article'})
	    if content is None or title_article is None:
	        # Not an article page (or the layout changed): report failure
	        # instead of crashing with AttributeError on None.
	        return False

	    # The title doubles as the output file name.
	    file_name = title_article.get_text()

	    # Convert the article HTML to Markdown.
	    text_maker = html2text.HTML2Text()
	    text_maker.bypass_tables = False
	    text = text_maker.handle(content.prettify())

	    if with_front_matter:
	        text = _build_front_matter(soup, file_name) + text

	    # Some titles contain characters that are not valid in a file name, so
	    # creating the file can fail; report the title and carry on.
	    try:
	        with codecs.open('./mds/' + file_name + '.md', 'w', encoding='utf-8') as f:
	            f.write(text)
	    except (OSError, ValueError):
	        print(file_name)

	    return True