Proxy For Scrapy
import datetime
import logging

import scrapy

import scraper.items as ProxyItems
from scraper.settings import PROXY_DATA
class ProxySpider(scrapy.Spider):
    name = 'proxy'
    allowed_domains = ['sslproxies.org']
    start_urls = ['http://sslproxies.org/']

    custom_settings = {
        # Write the scraped proxies to the CSV file configured in scraper.settings.
        'FEED_URI': PROXY_DATA,
        'FEED_FORMAT': 'csv',
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        'ITEM_PIPELINES': {
            'DeHaze.pipelines.ProxyPipeline': 100,
        },
    }
    def parse(self, response):
        logging.log(logging.INFO, "Scraping Page {}".format(response.request.url))
        # The proxy list table keeps the IP in the first column and the port in the second.
        ips = response.css('#proxylisttable td:nth-child(1)::text').extract()
        ports = response.css('#proxylisttable td:nth-child(2)::text').extract()
        # Timezone-aware UTC timestamp; utcnow().timestamp() would be offset by the local timezone.
        time = datetime.datetime.now(datetime.timezone.utc).timestamp()
        total_ips = len(ips)
        logging.log(logging.INFO, "Found {} Total IPs".format(total_ips))
        for i in range(total_ips):
            item = ProxyItems.ProxyItem(ip=ips[i], port=ports[i], timestamp=time)
            logging.log(logging.INFO, "Scraped Following Proxy {}".format(item))
            yield item
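
The spider builds a ProxyItems.ProxyItem with ip, port, and timestamp fields, but scraper/items.py is not part of this gist. The definition below is a minimal sketch of what that item likely looks like, inferred only from the field names used above; it is not the author's actual code.

# scraper/items.py -- hypothetical sketch; field names taken from the spider above.
import scrapy

class ProxyItem(scrapy.Item):
    ip = scrapy.Field()
    port = scrapy.Field()
    timestamp = scrapy.Field()

Assuming the rest of the project is in place (PROXY_DATA pointing at the output CSV, and the scrapy_splash middlewares and DeHaze.pipelines.ProxyPipeline importable), the spider can be run by name from the project directory with: scrapy crawl proxy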