Skip to content

Instantly share code, notes, and snippets.

@abkosar
Created May 28, 2016 18:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save abkosar/dd972d3192a8647a34deda2fd4fb2a53 to your computer and use it in GitHub Desktop.
Collecting links
from scrapy import Spider
from indeed.items import IndeedItem
from scrapy.selector import Selector
from selenium import webdriver
from scrapy.http import TextResponse
import scrapy
class IndeedSpider(Spider):
    """Scrapy spider that drives a Selenium Chrome browser through Dice.com
    search-result pages and yields the job-posting links found on each page.

    NOTE(review): despite the spider name 'indeed', everything here targets
    dice.com — confirm which site is actually intended.
    """

    name = 'indeed'
    # Bare hostname only: Scrapy's offsite middleware matches domains, and a
    # value with a scheme/path ("http://www.dice.com/") disables the filter.
    allowed_domains = ["dice.com"]
    s1 = 'https://www.dice.com/jobs/q-data_scientist-limit-30-l-New_York%2C_NY-radius-30-startPage-1-limit-30-jobs?searchid=291607343849'
    start_urls = [s1]

    def __init__(self, *args, **kwargs):
        # Forward Scrapy's constructor arguments so the framework's spider
        # setup (name/kwargs binding) still runs; the original skipped this.
        super(IndeedSpider, self).__init__(*args, **kwargs)
        self.driver = webdriver.Chrome()

    def parse(self, response):
        """Walk up to 19 result pages, yielding one item per job link.

        The original collected links into a local list and never returned
        them, so every scraped URL was discarded; this version yields each
        href as a ``{'url': ...}`` item (a generator is a valid Scrapy
        callback return, so callers are unaffected).
        """
        self.driver.get(response.url)
        # Driver-level setting: tell Selenium to poll up to 10s for elements.
        # Set once here instead of on every loop iteration.
        self.driver.implicitly_wait(10)
        for _page in range(1, 20):
            # Re-wrap the live browser DOM so Scrapy selectors can run on it.
            page = TextResponse(url=self.driver.current_url,
                                body=self.driver.page_source,
                                encoding='utf-8')
            # Result rows 1..30 (the query asks for limit-30 per page).
            for row in range(1, 31):
                hrefs = page.xpath(
                    '//*[@class="col-md-9"]/div[1]/div[' + str(row) + ']/h3/a/@href'
                ).extract()
                for href in hrefs:
                    yield {'url': href}
            try:
                next_page = self.driver.find_element_by_xpath(
                    '//*[@title="Go to next page"]')
                next_page.click()
            except Exception:
                # No "next" control (last results page) — stop paginating
                # instead of crashing the spider.
                break

    def closed(self, reason):
        # Scrapy calls this when the spider finishes; quit the browser so
        # the Chrome process does not leak.
        self.driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment