Skip to content

Instantly share code, notes, and snippets.

@wenLiangcan
Created August 31, 2014 11:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wenLiangcan/42cc02f93136984aa7c7 to your computer and use it in GitHub Desktop.
Save wenLiangcan/42cc02f93136984aa7c7 to your computer and use it in GitHub Desktop.
脱水豆瓣小组话题
#!/usr/bin/env python3
import os
import re
import sys
import mechanicalsoup
# HTTP headers passed to every ``browser.get`` call; the User-Agent is a
# desktop Chrome string (presumably so requests look like a regular browser).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/37.0.2062.94 Safari/537.36'
    ),
}
def topic_pages(url):
    """Yield every page of a Douban group topic, author's posts only.

    The first request goes to the topic's first page with the ``author=1``
    filter applied; each following request uses the ``href`` of the
    ``<link>`` found inside the page's ``.next`` element, until a page has
    no such link.

    Args:
        url: A Douban group topic URL of the form
            ``http(s)://www.douban.com/group/topic/<id>...``.

    Yields:
        The response object returned by ``browser.get`` for each page; its
        parsed document is read through the ``soup`` attribute.

    Raises:
        IndexError: If ``url`` is not a Douban group topic URL. Because this
            is a generator, the error surfaces on the first iteration.
    """
    def genlink(url):
        """Build the link of the topic's first, author-only page."""
        # Dots are escaped so they match literal dots only, and both the
        # http and https schemes are accepted.
        ptn = r'^(https?://www\.douban\.com/group/topic/\d+)'
        return re.findall(ptn, url)[0] + '/?author=1'

    url = genlink(url)
    browser = mechanicalsoup.Browser()
    while True:
        p = browser.get(url, headers=headers)
        yield p
        try:
            url = p.soup.select('.next')[0].link['href']
        except (IndexError, TypeError, KeyError):
            # IndexError: the page has no ``.next`` element at all.
            # TypeError:  ``.next`` exists but has no <link> child, so
            #             ``.link`` is None.
            # KeyError:   the <link> carries no ``href`` attribute.
            # In every case there is no further page to fetch.
            break
def topic_content(page):
    """Yield the readable text pieces of one topic page.

    When the page URL ends with ``author=1`` (the first page requested for a
    topic), the title and the main post (``#link-report``) are yielded
    first. Then, for every page, one string is yielded per ``<li>`` found
    under ``#comments``; a comment that quotes another reply is prefixed
    with a ``<< quoted text | date >>`` line.

    Args:
        page: An object exposing ``url`` (str) and ``soup`` (a parsed
            document supporting ``select``, ``find_all`` and tag-name
            attribute access).

    Yields:
        str: The title, the main post, then each comment, in page order.

    Raises:
        IndexError: If a first page has no ``#link-report`` element, or a
            quoted reply lacks its ``.all`` / ``.pubdate`` element.
    """
    def get_text(html):
        """Return a tag's stripped text with carriage returns as line breaks."""
        return html.text.replace('\r', os.linesep).strip()

    def reply_quote(html):
        """Format a quoted reply as a ``<< text | date >>`` header line."""
        template = '<< {} | {} >>\n'
        return template.format(
            get_text(html.select('.all')[0]),
            get_text(html.select('.pubdate')[0]),
        )

    soup = page.soup
    if page.url.endswith('author=1'):
        title = soup.select('.infobox')  # long title
        if title:
            # The first three characters are dropped -- presumably a label
            # prefix in front of the full title; verify against the markup.
            yield get_text(title[0])[3:]
        else:
            yield get_text(soup.title)
        yield get_text(soup.select('#link-report')[0])

    comments = soup.select('#comments')
    if not comments:
        # No comment list on this page: nothing more to extract.
        return
    for item in comments[0].find_all('li'):
        body = item.p
        if body is None:
            # An <li> without a <p> carries no comment text; skip it rather
            # than fail on ``None.text``.
            continue
        quote = item.select('.reply-quote')
        reply = reply_quote(quote[0]) if quote else ''
        yield reply + get_text(body)
def crawler(url):
    """Yield every text piece of a topic, page after page.

    Each page produced by ``topic_pages(url)`` is handed to
    ``topic_content`` and its strings are passed through in order.
    """
    for page in topic_pages(url):
        yield from topic_content(page)
def main(url):
    """Save the crawled text of a topic to a ``.txt`` file named by its title.

    The text is first written to ``<first number in url>.txt`` in the
    current directory, each piece followed by a blank line. The file is then
    renamed to ``<first line of the first piece>.txt``; if that line is
    empty (or nothing was crawled) the numeric name is kept.

    Args:
        url: URL of the topic; it must contain at least one run of digits.

    Raises:
        IndexError: If ``url`` contains no digits.
    """
    filename = re.findall(r'\d+', url)[0] + '.txt'
    first_text = None
    # Open once, in write mode: a leftover file from an earlier run must not
    # be appended to, and reopening the file for every piece is wasteful.
    # UTF-8 is explicit so the output does not depend on the locale.
    with open(filename, 'w', encoding='utf-8') as f:
        for text in crawler(url):
            if first_text is None:
                first_text = text
            f.write(text + os.linesep * 2)

    # Take the new name from the first piece held in memory instead of
    # re-reading the file that was just written.
    title = ''
    if first_text:
        title = first_text.strip().split('\n', 1)[0].strip()
    # Path separators inside a title would make the rename target a path.
    for sep in (os.sep, os.altsep):
        if sep:
            title = title.replace(sep, '_')
    if title:
        # os.replace overwrites an existing target on every platform.
        os.replace(filename, title + '.txt')
if __name__ == '__main__':
    # The topic URL is the first command-line argument; exit with a usage
    # message instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: {} <douban-group-topic-url>'.format(sys.argv[0]))
    main(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment