gouneken/file.py
import datetime

import scrapy


class AskScrapingSpider(scrapy.Spider):
    name = 'ask_scraping'

    # write scraped items to a JSON feed and stop the crawl after 15 seconds
    # (newer Scrapy releases prefer the FEEDS setting over the deprecated FEED_URI)
    custom_settings = {'FEED_URI': 'scrap/outputfile.json', 'CLOSESPIDER_TIMEOUT': 15}

    def __init__(self, category='', **kwargs):
        # let Spider.__init__ create self.start_urls before appending to it
        super().__init__(**kwargs)
        # `category` is the full search URL, passed on the command line with -a category=...
        self.myBaseUrl = category
        self.start_urls.append(self.myBaseUrl)

    def start_requests(self):
        for url in self.start_urls:
            # `pos` carries the running result position across paginated pages
            yield scrapy.Request(url, callback=self.parse, meta={'pos': 0})

    def parse(self, response):
        print('url:', response.url)
        start_pos = response.meta['pos']
        print('start pos:', start_pos)
        dt = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        items = response.css('div.PartialSearchResults-item')
        pos = start_pos  # fallback so pagination still works if no items matched
        for pos, result in enumerate(items, start_pos + 1):
            yield {
                'title': result.css('a.PartialSearchResults-item-title-link.result-link::text').get(default='').strip(),
                'snippet': result.css('p.PartialSearchResults-item-abstract::text').get(default='').strip(),
                'link': result.css('a.PartialSearchResults-item-title-link.result-link').attrib.get('href'),
                'position': pos,
                'date': dt,
            }

        # --- after loop ---
        next_page = response.css('.PartialWebPagination-next a')
        if next_page:
            url = next_page.attrib.get('href')
            print('next_page:', url)  # relative URL
            # use response.follow() to resolve the relative href against
            # https://www.ask.com/ and continue numbering from the last position
            yield response.follow(url, callback=self.parse, meta={'pos': pos + 1})
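
The spider expects the full search URL as its category argument, so it can be run from the command line with something like: scrapy runspider file.py -a category="https://www.ask.com/web?q=web+scraping" (the query URL is an illustrative assumption, not part of the gist). Below is a minimal sketch of running it programmatically with Scrapy's CrawlerProcess instead:

from scrapy.crawler import CrawlerProcess

if __name__ == '__main__':
    process = CrawlerProcess()
    # the search URL below is an assumed example query, not taken from the gist
    process.crawl(AskScrapingSpider, category='https://www.ask.com/web?q=web+scraping')
    process.start()  # blocks until the crawl finishes (or CLOSESPIDER_TIMEOUT stops it)

Either way, items are written to scrap/outputfile.json as configured in custom_settings.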