Skip to content

Instantly share code, notes, and snippets.

@zdxerr
Created May 7, 2021 15:45
Show Gist options
  • Save zdxerr/3571118a33eb18c3a4802182ca83bb66 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
"""
Created on Sun May 2 12:01:41 2021
@author: Haya Halimeh
"""
import json
import scrapy
from scrapy.selector import Selector
NUMBER_OF_DEBATES = 1
class CrawlerDebatesSpider(scrapy.Spider):
    """Crawl the most popular opinion debates on debate.org.

    Flow: start at the popular-opinions listing, follow the first
    NUMBER_OF_DEBATES debate links, and for each debate page scrape the
    topic, category and the pro/con argument lists. A follow-up POST to
    the site's JSON service requests additional argument pages.
    """

    name = 'debate_crawler'

    def start_requests(self):
        """Yield the initial request for the popular-opinions listing page."""
        urls = ['https://www.debate.org/opinions/?sort=popular/']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_url)

    def parse_url(self, response):
        """Extract the first NUMBER_OF_DEBATES debate links and follow each.

        Slicing (rather than indexing by range) avoids an IndexError when
        the listing contains fewer debates than requested.
        """
        debates = response.css('#opinions-list .a-image-contain')
        for debate in debates[:NUMBER_OF_DEBATES]:
            relative_url = debate.css('a::attr(href)').get()
            debate_url = f'https://www.debate.org{relative_url}'
            # Follow each debate URL; full page scraping happens in parse().
            yield response.follow(debate_url, callback=self.parse)

    def parse_more(self, response):
        """Handle the JSON reply from the GetDebateArgumentPage service.

        The service wraps an HTML fragment in a JSON envelope under key
        "d"; build a Selector over it for further parsing.
        """
        s = Selector(text=json.loads(response.text)["d"])
        print(s)

    @staticmethod
    def _extract_arguments(arguments):
        """Turn a list of argument <li> selectors into dicts.

        Returns a list of {'title': [...], 'body': [...]} dicts.
        NOTE(review): the original code deliberately skipped the last <li>
        (presumably a load-more placeholder — confirm against the page
        markup); that behavior is preserved here via the [:-1] slice.
        """
        items = []
        for argument in arguments[:-1]:
            # Titles appear either as plain <h2> text or as <h2><a> text.
            title = (argument.css('.hasData h2::text').getall()
                     or argument.css('.hasData h2 a::text').getall())
            body = argument.css('p::text').getall()
            items.append({'title': title, 'body': body})
        return items

    def parse(self, response):
        """Scrape one debate page: topic, category, pro/con arguments."""
        topic = response.css('div.r-contain h1.qh-debate span.q-title ::text').get()
        categorie = response.css('div#breadcrumb a::text')[2].get()

        # The debate id is embedded in the load-more button's onclick
        # handler, quoted with single quotes — split("'")[1] pulls it out.
        load_more_id = response.css('.debate-more-btn').attrib["onclick"].strip().split("'")[1]
        load_more_url = 'https://www.debate.org/opinions/~services/opinions.asmx/GetDebateArgumentPage'
        params = {
            "debateId": load_more_id,
            "pageNumber": 1,
            "itemsPerPage": 50,
            "ysort": 5,
            "nsort": 5,
        }
        # Request additional argument pages through the site's JSON service.
        yield response.follow(
            url=load_more_url,
            callback=self.parse_more,
            method="POST",
            body=json.dumps(params),
            headers={'Content-Type': 'application/json'},
        )

        # BUG FIX: the original had a bare `return` here, which made the
        # entire pro/con extraction below unreachable — the item dict was
        # never yielded. Removing it lets the extraction actually run.
        pro_list = self._extract_arguments(response.css('div#yes-arguments li'))
        con_list = self._extract_arguments(response.css('div#no-arguments li'))
        yield {
            "topic": topic,
            "categorie": categorie,
            "pro_arguments": pro_list,
            "con_arguments": con_list,
        }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment