Skip to content

Instantly share code, notes, and snippets.

@denten
Last active December 20, 2015 18:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save denten/d898a48bcf2a496c7c07 to your computer and use it in GitHub Desktop.
Save denten/d898a48bcf2a496c7c07 to your computer and use it in GitHub Desktop.
from ipdb import set_trace as _BREAK
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from mobileread.items import PageItem, ThreadItem
class ForumSpider(CrawlSpider):
    """Spider for www.mobileread.com forum pages.

    Starts from a forum listing page, extracts the links found inside
    ``div`` elements with class ``threadlink condensed`` and passes every
    fetched page to :meth:`parse_threads`, which scrapes one ``ThreadItem``
    per post.
    """

    name = 'mobileread'
    allowed_domains = ['www.mobileread.com']
    # testing a page with 9 threads
    start_urls = ['http://www.mobileread.com/forums/forumdisplay.php?f=230&order=desc&page=2']

    # Only links located inside the 'threadlink condensed' divs are extracted;
    # each response is handed to parse_threads.
    # %timeit was used to time the XPath selection -- this was the fastest.
    rules = (
        Rule(SgmlLinkExtractor(restrict_xpaths="//div[@class='threadlink condensed']"), callback='parse_threads'),
    )

    def parse_threads(self, response):
        """Scrape the posts of one thread page.

        Args:
            response: the Scrapy response for a thread page.

        Returns:
            A list of ``ThreadItem`` objects with ``post_id``, ``user_name``
            and ``thread_id`` filled in. Posts whose id or user name cannot
            be determined, and posts whose user name contains
            ``'Advertisement'``, are left out.
        """
        thread = HtmlXPathSelector(response)

        # The thread id comes from a single page-wide <input>, so it is the
        # same for every post: look it up once instead of once per post.
        # xpath returns a list, so take the first element and strip it.
        thread_ids = thread.select("//input[@name='searchthreadid']/@value").extract()
        thread_id = thread_ids[0].strip() if thread_ids else None

        # Select the post tables themselves (not their children) so that each
        # post yields exactly one item.
        posts = thread.select("//div[@id='posts']//table[contains(@id,'post')]")

        plist = []
        for p in posts:
            # The post id is whatever follows "post" in the table's id
            # attribute. It must be known *before* the user-name lookup,
            # which is keyed on it.
            table_ids = p.select("@id").extract()
            if not table_ids:
                continue
            id_parts = table_ids[0].split("post")
            if len(id_parts) < 2:
                continue
            post_id = id_parts[1]
            # An empty id would match every div below; a quote would break
            # the XPath string literal it is interpolated into.
            if not post_id or "'" in post_id:
                continue

            # The user name sits in a div whose id contains the post id.
            user_names = thread.select(
                "//div[contains(@id, '%s')]/a[@class='bigusername']/text()" % post_id
            ).extract()
            if not user_names:
                continue
            user_name = user_names[0].strip()

            # skip adverts
            if 'Advertisement' in user_name:
                continue

            table = ThreadItem()
            table['post_id'] = post_id
            table['user_name'] = user_name
            table['thread_id'] = thread_id
            plist.append(table)

        # A list is returned (rather than yielding items) so callers that
        # expect a list keep working.
        return plist
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment