wenLiangcan/drain.py

## drain.py
#!/usr/bin/env python3

import os
import re
import sys

import mechanicalsoup

# Request headers passed to every ``browser.get`` call made by topic_pages().
# The User-Agent value is a desktop Chrome 37 (Linux x86_64) identification
# string -- presumably so the site answers as it would to a regular browser;
# TODO confirm this is still required.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36'
}


def topic_pages(url):
    """Yield each page of a topic, restricted to the author's own posts.

    The topic address is reduced to ``<topic>/?author=1`` and fetched; every
    fetched page is yielded, and the crawl continues with the ``href`` of the
    ``<link>`` found inside the page's first ``.next`` element.  It stops as
    soon as a page offers no such link.

    Args:
        url: Address of a www.douban.com group topic, ``http`` or ``https``.
            Anything after the numeric topic id is ignored.

    Yields:
        The object returned by ``browser.get`` for each page.

    Raises:
        IndexError: If ``url`` is not a douban group topic address (raised
            when the generator is first advanced).
    """
    def genlink(url):
        """Build the address of the first author-only page."""
        # Dots are escaped so they only match literally, and the scheme may
        # be either http or https.
        ptn = r'^(https?://www\.douban\.com/group/topic/\d+)'
        return re.findall(ptn, url)[0] + '/?author=1'

    url = genlink(url)
    browser = mechanicalsoup.Browser()
    while True:
        page = browser.get(url, headers=headers)
        yield page
        try:
            url = page.soup.select('.next')[0].link['href']
        except (IndexError, TypeError):
            # IndexError: select() found no ``.next`` element on the page.
            # TypeError: ``.next`` exists but contains no <link> tag, so
            # ``.link`` is None.  Either way there is no further page.
            break


def topic_content(page):
    """Yield the plain-text pieces found on one fetched topic page.

    When the page URL ends with ``author=1`` the topic title and the text of
    ``#link-report`` are yielded first.  After that, one string is yielded
    per ``<li>`` under ``#comments``; an item containing a ``.reply-quote``
    element gets a ``<< quoted text | date >>`` header line in front of it.
    """
    def plain(node):
        """Return the stripped text of a node with OS-native line breaks."""
        raw = node.text
        # Bare carriage returns in the markup become the platform line break.
        return raw.replace('\r', os.linesep).strip()

    def quote_header(node):
        """Format the quoted reply inside *node* as a one-line header."""
        quoted = plain(node.select('.all')[0])
        stamp = plain(node.select('.pubdate')[0])
        return '<< {} | {} >>\n'.format(quoted, stamp)

    soup = page.soup

    if page.url.endswith('author=1'):
        infobox = soup.select('.infobox')  # holds the long form of the title
        # The first three characters of the infobox text are dropped
        # (presumably a fixed label in front of the title -- confirm).
        yield plain(infobox[0])[3:] if infobox else plain(soup.title)
        yield plain(soup.select('#link-report')[0])

    for item in soup.select('#comments')[0].findAll('li'):
        quotes = item.select('.reply-quote')
        prefix = quote_header(quotes[0]) if quotes else ''
        yield prefix + plain(item.p)


def crawler(url):
    """Yield every text piece of the topic at *url*, one page after another.

    Pages come from ``topic_pages`` and each is expanded with
    ``topic_content``, so pieces appear in page order.
    """
    for page in topic_pages(url):
        yield from topic_content(page)


def main(url):
    """Save the text of a topic to a file named after its first line.

    The pieces produced by ``crawler`` are collected in ``<id>.txt``, where
    ``<id>`` is the first run of digits in ``url``; each piece is followed
    by a blank line.  The file is then renamed to ``<first line>.txt``
    (the first piece is expected to be the topic title).  If that first
    line is empty the numeric file name is kept.

    Args:
        url: Address of the topic to download.

    Raises:
        IndexError: If ``url`` contains no digits.
    """
    filename = re.findall(r'\d+', url)[0] + '.txt'

    # Open once for the whole crawl rather than once per piece, start from
    # an empty file so a leftover from an aborted run is not duplicated, and
    # pin the encoding so non-ASCII text does not depend on the locale.
    with open(filename, 'w', encoding='utf-8') as f:
        for text in crawler(url):
            # Text mode already translates '\n' to the platform line break;
            # writing os.linesep here would double the '\r' on Windows.
            f.write(text + '\n' * 2)

    with open(filename, 'r', encoding='utf-8') as f:
        title = f.readline().strip()

    # A path separator in the title would point the rename at another
    # directory; replace it so the result stays next to the original file.
    for sep in filter(None, (os.sep, os.altsep)):
        title = title.replace(sep, '_')

    if title:
        os.rename(filename, title + '.txt')


if __name__ == '__main__':
    # The topic address is taken from the first command-line argument; exit
    # with a usage line instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: {} <topic url>'.format(sys.argv[0]))
    main(sys.argv[1])
	#!/usr/bin/env python3

	import os
	import re
	import sys

	import mechanicalsoup

	# Request headers passed to every ``browser.get`` call made by topic_pages().
	# The User-Agent value is a desktop Chrome 37 (Linux x86_64) identification
	# string -- presumably so the site answers as it would to a regular browser;
	# TODO confirm this is still required.
	headers = {
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36'
	}


	def topic_pages(url):
		"""Yield each page of a topic, restricted to the author's own posts.

		The topic address is reduced to ``<topic>/?author=1`` and fetched;
		every fetched page is yielded, and the crawl continues with the
		``href`` of the ``<link>`` found inside the page's first ``.next``
		element.  It stops as soon as a page offers no such link.

		Args:
			url: Address of a www.douban.com group topic, ``http`` or
				``https``.  Anything after the numeric topic id is ignored.

		Yields:
			The object returned by ``browser.get`` for each page.

		Raises:
			IndexError: If ``url`` is not a douban group topic address
				(raised when the generator is first advanced).
		"""
		def genlink(url):
			"""Build the address of the first author-only page."""
			# Dots are escaped so they only match literally, and the scheme
			# may be either http or https.
			ptn = r'^(https?://www\.douban\.com/group/topic/\d+)'
			return re.findall(ptn, url)[0] + '/?author=1'

		url = genlink(url)
		browser = mechanicalsoup.Browser()
		while True:
			page = browser.get(url, headers=headers)
			yield page
			try:
				url = page.soup.select('.next')[0].link['href']
			except (IndexError, TypeError):
				# IndexError: select() found no ``.next`` element on the page.
				# TypeError: ``.next`` exists but contains no <link> tag, so
				# ``.link`` is None.  Either way there is no further page.
				break


	def topic_content(page):
		"""Yield the plain-text pieces found on one fetched topic page.

		When the page URL ends with ``author=1`` the topic title and the text
		of ``#link-report`` are yielded first.  After that, one string is
		yielded per ``<li>`` under ``#comments``; an item containing a
		``.reply-quote`` element gets a ``<< quoted text | date >>`` header
		line in front of it.
		"""
		def get_text(html):
			"""Return the stripped text of a node with OS-native line breaks."""
			return html.text.replace('\r', os.linesep).strip()

		def reply_quote(html):
			"""Format the quoted reply inside *html* as a one-line header."""
			# Plain '|' separator: a backslash in front of it is not a valid
			# escape and would end up verbatim in the output.
			template = '<< {} | {} >>\n'
			return template.format(
				get_text(html.select('.all')[0]),
				get_text(html.select('.pubdate')[0])
			)

		if page.url.endswith('author=1'):
			title = page.soup.select('.infobox')  # long title
			if title:
				# The first three characters of the infobox text are dropped
				# (presumably a fixed label before the title -- confirm).
				yield get_text(title[0])[3:]
			else:
				yield get_text(page.soup.title)
			yield get_text(page.soup.select('#link-report')[0])

		for i in page.soup.select('#comments')[0].findAll('li'):
			quote = i.select('.reply-quote')
			reply = ''
			if quote:
				reply = reply_quote(quote[0])
			yield reply + get_text(i.p)


	def crawler(url):
		"""Yield every text piece of the topic at *url*, one page after another.

		Pages come from ``topic_pages`` and each is expanded with
		``topic_content``, so pieces appear in page order.
		"""
		for page in topic_pages(url):
			yield from topic_content(page)


	def main(url):
		"""Save the text of a topic to a file named after its first line.

		The pieces produced by ``crawler`` are collected in ``<id>.txt``,
		where ``<id>`` is the first run of digits in ``url``; each piece is
		followed by a blank line.  The file is then renamed to
		``<first line>.txt`` (the first piece is expected to be the topic
		title).  If that first line is empty the numeric file name is kept.

		Args:
			url: Address of the topic to download.

		Raises:
			IndexError: If ``url`` contains no digits.
		"""
		filename = re.findall(r'\d+', url)[0] + '.txt'

		# Open once for the whole crawl rather than once per piece, start
		# from an empty file so a leftover from an aborted run is not
		# duplicated, and pin the encoding so non-ASCII text does not depend
		# on the locale.
		with open(filename, 'w', encoding='utf-8') as f:
			for text in crawler(url):
				# Text mode already translates '\n' to the platform line
				# break; writing os.linesep would double the '\r' on Windows.
				f.write(text + '\n' * 2)

		with open(filename, 'r', encoding='utf-8') as f:
			title = f.readline().strip()

		# A path separator in the title would point the rename at another
		# directory; replace it so the result stays next to the original.
		for sep in filter(None, (os.sep, os.altsep)):
			title = title.replace(sep, '_')

		if title:
			os.rename(filename, title + '.txt')


	if __name__ == '__main__':
		# The topic address is taken from the first command-line argument;
		# exit with a usage line instead of an IndexError traceback when it
		# is missing.
		if len(sys.argv) < 2:
			sys.exit('usage: {} <topic url>'.format(sys.argv[0]))
		main(sys.argv[1])