Skip to content

Instantly share code, notes, and snippets.

View rafikahmed's full-sized avatar

Ahmed Rafik Djerah rafikahmed

View GitHub Profile
import json
import scrapy
from scrapy import FormRequest
class NeuvooSpider(scrapy.Spider):
name = 'neuvoo'
allowed_domains = ['neuvoo.com']
def start_requests(self):
# -*- coding: utf-8 -*-
import scrapy
class MetrocuadradoSpider(scrapy.Spider):
name = 'metroCuadrado'
allowed_domains = ['metrocuadrado.com']
def start_requests(self):
yield scrapy.Request(
# Make sure to add the proxy address, username and password
# Make sure to append the ProxyMiddleware class to the DOWNLOADER_MIDDLEWARES dict in the settings.py file
from w3lib.http import basic_auth_header
class ProxyMiddleware:
    """Downloader middleware that routes every outgoing request through an
    authenticated corporate proxy."""

    def process_request(self, request, spider):
        # Point the request at the proxy, then attach the Basic-Auth
        # credentials the proxy expects on the dedicated header.
        proxy_address = "COMPANY PROXY URL OR ADDRESS"
        credentials = basic_auth_header('USERNAME', 'PASSWORD')
        request.meta['proxy'] = proxy_address
        request.headers['Proxy-Authorization'] = credentials
class MySpider(scrapy.Spider):
name = 'example'
current_page = 1
script = '''
function main(splash, args)
splash.private_mode_enabled = false
url = args.url
assert(splash:go(url))
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
#
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import datetime
from scrapy.exceptions import DropItem
class DuplicatesPipeline(object):
    """Item pipeline that drops any item whose 'email' field has already
    been seen during this crawl."""

    def __init__(self):
        # Emails encountered so far; set membership is O(1) per item.
        self.emails_seen = set()

    def process_item(self, item, spider):
        """Return the item if its email is new; raise DropItem for repeats.

        Bug fix: the original never recorded new emails and fell off the
        end of the function (returning None), which silently dropped every
        item for later pipeline stages. Scrapy's pipeline contract requires
        returning the item to pass it along.
        """
        if item['email'] in self.emails_seen:
            raise DropItem("Duplicate item found: %s" % item)
        self.emails_seen.add(item['email'])
        return item
import scrapy
from scrapy.http import FormRequest
class SaeedSpider(scrapy.Spider):
    """Scrape the EPIAS transparency real-time production page."""
    name = 'saeed'
    start_urls = [
        'https://seffaflik.epias.com.tr/transparency/uretim/gerceklesen-uretim/gercek-zamanli-uretim.xhtml']

    def strip_output(self, text):
        """Return *text* stripped of surrounding whitespace, or '' if None.

        Bug fix: the original used `catch NoneType:` — a syntax error.
        Python's keyword is `except`, and calling .strip() on None raises
        AttributeError (NoneType is not an exception class), so that is
        the exception handled here.
        """
        try:
            return text.strip()
        except AttributeError:
            return ""

    def parse(self, response):
        # The XPath may match nothing, in which case .get() yields None
        # and strip_output normalizes it to an empty string.
        yield {
            "target_field": self.strip_output(response.xpath('//label[contains(text(),"Founded:")]/following-sibling::p/text()').get())
        }
# -*- coding: utf-8 -*-
import scrapy
# Spider for the blockchain.com explorer. Only the class header and crawl
# configuration are visible here; any parse callbacks are outside this view.
class BlockchainSpider(scrapy.Spider):
# Spider name used on the `scrapy crawl` command line.
name = 'blockchain'
# Restrict the crawl to the explorer's host.
allowed_domains = ['www.blockchain.com']
# Single entry point: the explorer landing page.
start_urls = ['https://www.blockchain.com/explorer']