Scraping the Partisan Divide: My Scrapy Spider
# forum/spiders/forum_spider.py
# Defined number of threads -> multiple pages per thread -> multiple posts per page
from scrapy import Spider, Request
from forum.items import ForumItem
import re


class ForumSpider(Spider):
    name = "forum_spider"
    allowed_domains = ['liberalforum.net']
    start_urls = ['https://liberalforum.net/viewtopic.php?t=37200']

    # Load the start page of each thread:
    def parse(self, response):
        # Desired start = when Trump announced his presidential run: thread 14493
        threadUrls = ['https://liberalforum.net/viewtopic.php?t={}'.format(x) for x in range(14493, 37211)]
        for thread in threadUrls:
            yield Request(url=thread, callback=self.parse_within_thread)

    # Load each page of each thread:
    def parse_within_thread(self, response):
        # The page counter only exists when a thread spans multiple pages...
        maxPage = response.xpath('//*[@id="page-body"]/div[1]/div[1]/div[3]/div[2]/a/strong[2]/text()').extract_first()
        threadMaxPage = 1 if maxPage is None else int(maxPage)
        # Build page URLs for *this* thread only (the id comes from the response
        # URL); posts are paginated 20 to a page, so start = 0, 20, 40, ...
        threadNum = re.search(r't=(\d+)', response.url).group(1)
        pageUrls = ['https://liberalforum.net/viewtopic.php?t={}&start={}'.format(threadNum, y)
                    for y in range(0, 20 * threadMaxPage, 20)]
        for page in pageUrls:
            yield Request(url=page, callback=self.parse_posts_within_thread_page)
    # Parse the components of each post on each page of each thread
    def parse_posts_within_thread_page(self, response):
        # Thread-level fields are the same for every post on the page,
        # so pull them once outside the loop.
        threadReplies = response.xpath('//*[@id="page-body"]/div[1]/div[1]/div[6]/div/div/span[1]/text()').extract_first()
        threadTitle = response.xpath('//*[@id="page-body"]/div[1]/h2/a/span/text()').extract_first()
        posts = response.xpath('//div[@class="vtinner"]')
        for post in posts:
            userName = post.xpath('div[3]/div/span/a/text()').extract_first()
            userPosts = post.xpath('div[3]/div[2]/a/text()').extract_first()
            userPolitics = post.xpath('div[3]/div[2]/a[2]/text()').extract_first()  # <- issue with spacing/newline characters
            userMoney = post.xpath('div[3]/div[2]/a[3]/span/text()').extract_first()
            postDateTime = post.xpath('./div[2]/div[2]/span/a/text()').extract_first()
            postText = post.xpath('div[4]/div[@class="content"]/text()').extract()  # <- issue with spacing/newline characters
            item = ForumItem()
            item['threadReplies'] = threadReplies
            item['threadTitle'] = threadTitle
            item['userName'] = userName
            item['userPosts'] = userPosts
            item['userPolitics'] = userPolitics
            item['userMoney'] = userMoney
            item['postDateTime'] = postDateTime
            item['postText'] = postText
            yield item
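
The two fields flagged above come back littered with newlines and padding. A minimal cleanup sketch, assuming the fragments are plain strings as extract() returns them; clean_text is a hypothetical helper of mine, not part of the project:

import re

def clean_text(fragments):
    # Join extracted fragments, collapse runs of whitespace/newlines,
    # and trim the ends; return None when nothing was extracted.
    if not fragments:
        return None
    if isinstance(fragments, str):
        fragments = [fragments]
    return re.sub(r'\s+', ' ', ' '.join(fragments)).strip()

Inside parse_posts_within_thread_page this would read, for example,
item['postText'] = clean_text(post.xpath('div[4]/div[@class="content"]/text()').extract()).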
# forum/items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class ForumItem(scrapy.Item):
    # One record per forum post, plus the thread-level context it came from.
    threadReplies = scrapy.Field()
    threadTitle = scrapy.Field()
    userName = scrapy.Field()
    userPosts = scrapy.Field()
    userPolitics = scrapy.Field()
    userMoney = scrapy.Field()
    postDateTime = scrapy.Field()
    postText = scrapy.Field()
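
A more declarative route for the same whitespace problem is Scrapy's ItemLoader with field processors. This is a sketch, not the project's actual code; the field names match ForumItem, but the loader class is my own:

from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join
from forum.items import ForumItem

class ForumLoader(ItemLoader):
    # Strip whitespace on the way in, take the first match on the way out.
    default_item_class = ForumItem
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()
    # postText keeps all of its fragments, joined into a single string.
    postText_out = Join(' ')

In the spider, the manual item assembly would then collapse to loader = ForumLoader(selector=post), a series of loader.add_xpath(...) calls, and yield loader.load_item().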
# forum/pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import CsvItemExporter


class ForumPipeline(object):
    def __init__(self):
        self.filename = 'forum.csv'

    def open_spider(self, spider):
        # Open the CSV once per crawl and stream items into it.
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
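
With ITEM_PIPELINES wired up in settings.py (below), scrapy crawl forum_spider is all it takes to produce forum.csv. For completeness, a hedged sketch of driving the same crawl from a standalone script; the spider's module path is an assumption based on the standard project layout:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from forum.spiders.forum_spider import ForumSpider  # assumed module path

if __name__ == '__main__':
    # CrawlerProcess loads settings.py, so ForumPipeline still writes forum.csv.
    process = CrawlerProcess(get_project_settings())
    process.crawl(ForumSpider)
    process.start()  # blocks until the crawl finishes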
# forum/settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for forum project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'forum'
SPIDER_MODULES = ['forum.spiders']
NEWSPIDER_MODULE = 'forum.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'forum (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'forum.middlewares.ForumSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'forum.middlewares.ForumDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'forum.pipelines.ForumPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'