# Source: gist by @wenLiangcan, created Aug 31, 2014 (douban group-topic crawler).
#!/usr/bin/env python3
import os
import re
import sys
import mechanicalsoup
# Fake a desktop Chrome user agent so douban serves the normal HTML layout.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36',
}
def topic_pages(url):
    """Get all pages of a topic, filtered to the author's comments only.

    Yields one fetched page per result page, following the '.next'
    pagination link until no further page exists.
    """
    def genlink(url):
        """Generate the link of the first page (author-only view)."""
        # NOTE(review): reconstructed pattern — keep everything up to the
        # trailing topic id, then append the author-only query string.
        # The truncated original showed r'(^\d+).*?'; confirm against a
        # real topic URL.
        ptn = r'(^.*\d+).*?'
        return re.findall(ptn, url)[0] + r'/?author=1'

    url = genlink(url)
    browser = mechanicalsoup.Browser()
    while True:
        p = browser.get(url, headers=headers)
        yield p
        try:
            # Follow the 'next page' link; on the last page the selector
            # yields nothing (IndexError) or '.link' is None (TypeError).
            url = p.soup.select('.next')[0].link['href']
        except (TypeError, IndexError):
            break
def topic_content(page):
    """Extract content from a page.

    Yields plain-text chunks: on the first page the topic title and body,
    then one chunk per reply (prefixed with its quote, if any).
    """
    def get_text(html):
        """Extract readable plain text from an HTML node."""
        return html.text.replace('\r', os.linesep).strip()

    def reply_quote(html):
        """Format a quoted reply as '<< quoted text | quoter >>'."""
        template = '<< {} | {} >>\n'
        # NOTE(review): the format arguments were lost in the source;
        # reconstructed from douban's markup, where '.all' holds the
        # quoted text and '.pubdate' the quoted author — confirm selectors.
        return template.format(
            get_text(html.select('.all')[0]),
            get_text(html.select('.pubdate')[0]))

    # The author-only first page (url ends with 'author=1') carries the
    # topic title and the original post; later pages have comments only.
    if page.url.endswith('author=1'):
        title = page.soup.select('.infobox')  # long title
        if title:
            # Strip the 3-character "标题:" style prefix of the infobox.
            yield get_text(title[0])[3:]
        else:
            yield get_text(page.soup.title)
        yield get_text(page.soup.select('#link-report')[0])
    for i in page.soup.select('#comments')[0].findAll('li'):
        quote = i.select('.reply-quote')
        reply = ''
        if quote:
            reply = reply_quote(quote[0])
        yield reply + get_text(i.p)
def crawler(url):
    """Yield every text chunk of the topic at *url*, page by page."""
    for page in topic_pages(url):
        yield from topic_content(page)
def main(url):
    """Crawl the topic at *url* into a text file named after its title.

    Writes chunks incrementally to '<topic-id>.txt', then renames the
    file to '<first line>.txt' (the first chunk yielded is the title).
    """
    filename = re.findall(r'\d+', url)[0] + '.txt'
    for text in crawler(url):
        # Append mode so a chunk is on disk as soon as it is crawled.
        with open(filename, 'a') as f:
            f.write(text + os.linesep)
    with open(filename, 'r') as f:
        newname = f.readline().strip() + '.txt'
    os.rename(filename, newname)


if __name__ == '__main__':
    # sys is imported at the top of the file for exactly this use.
    main(sys.argv[1])
# End of gist.