Skip to content

Instantly share code, notes, and snippets.

@eupendra
Created September 26, 2020 09:55
Show Gist options
  • Save eupendra/8ab724ced046e93d8134b376ef40479a to your computer and use it in GitHub Desktop.
Save eupendra/8ab724ced046e93d8134b376ef40479a to your computer and use it in GitHub Desktop.
from urllib.parse import quote

import pandas as pd
import scrapy
# URL template for a tag's question listing; "{}" is filled with the tag via str.format().
base_url = 'https://stackoverflow.com/questions/tagged/{}'
def read_csv(path='so_tags.csv'):
    """Return the tags listed in the ``Tags`` column of a CSV file.

    Args:
        path: Location of the CSV file. Defaults to ``so_tags.csv`` (relative
            to the current working directory), the file this function has
            always read.

    Returns:
        A list with one entry per non-empty cell of the ``Tags`` column, in
        file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        KeyError: If the file has no ``Tags`` column.
    """
    df = pd.read_csv(path)
    # Blank cells are parsed as NaN; drop them so callers never receive a
    # float NaN where they expect a tag (it would format as the tag "nan").
    return df['Tags'].dropna().tolist()
def read_excel(path='so_tags.xlsx'):
    """Return the tags listed in the ``Tags`` column of an Excel workbook.

    Args:
        path: Location of the workbook. Defaults to ``so_tags.xlsx``
            (relative to the current working directory), the file this
            function has always read.

    Returns:
        A list with one entry per non-empty cell of the ``Tags`` column, in
        sheet order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        KeyError: If the sheet has no ``Tags`` column.
    """
    df = pd.read_excel(path)
    # Empty cells are parsed as NaN; drop them so callers never receive a
    # float NaN where they expect a tag (it would format as the tag "nan").
    return df['Tags'].dropna().tolist()
class SoSpider(scrapy.Spider):
    """Spider that reports the question count shown on each tag's page.

    Tags come from ``read_excel()``; each one is substituted into
    ``base_url`` and fetched, and ``parse`` yields one item per response.
    """

    name = 'so'

    def start_requests(self):
        """Yield one request per usable tag returned by ``read_excel()``."""
        for tag in read_excel():
            # Empty spreadsheet cells arrive as NaN; there is no page for them.
            if pd.isna(tag):
                continue
            tag = str(tag).strip()
            if not tag:
                continue
            # Percent-encode the tag so characters such as "#" (as in "c#")
            # stay in the URL path instead of starting a fragment.
            yield scrapy.Request(base_url.format(quote(tag, safe='')))

    def parse(self, response):
        """Yield the question count text of a tag page together with its URL.

        The yielded dict has two keys: ``questions`` (the extracted text with
        the trailing "questions" label removed) and ``url``
        (``response.url``).
        """
        # Position-based XPath: it relies on the exact nesting under the
        # element with id "mainbar". normalize-space() collapses whitespace.
        text = response.xpath('normalize-space(//*[@id="mainbar"]/div[4]/div/div[1]/text())').get(default='')
        yield {
            'questions': self._strip_label(text),
            'url': response.url
        }

    @staticmethod
    def _strip_label(text):
        """Remove a trailing "questions"/"question" label and outer whitespace.

        ``str.strip('questions')`` is not used because its argument is a set
        of characters rather than a suffix, and it leaves the space that
        separates the number from the label.
        """
        text = text.strip()
        for label in ('questions', 'question'):
            if text.endswith(label):
                return text[:-len(label)].rstrip()
        return text
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment