attrs==23.1.0
Automat==22.10.0
certifi==2023.5.7
cffi==1.15.1
charset-normalizer==3.1.0
click==8.1.6
constantly==15.1.0
cryptography==41.0.1
cssselect==1.2.0
docker==6.1.3
fake-useragent==1.1.3
Faker==18.11.2
filelock==3.12.1
greenlet==2.0.2
hyperlink==21.0.0
idna==3.4
incremental==22.10.0
itemadapter==0.8.0
itemloaders==1.1.0
jmespath==1.0.1
lxml==4.9.2
packaging==23.1
parsel==1.8.1
playwright==1.34.0
Protego==0.2.1
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycparser==2.21
PyDispatcher==2.0.7
pyee==9.0.4
pyOpenSSL==23.2.0
python-dateutil==2.8.2
PyYAML==6.0.1
queuelib==1.6.2
requests==2.31.0
requests-file==1.5.1
retrying==1.3.4
scrapinghub==2.4.0
Scrapy==2.9.0
scrapy-fake-useragent==1.4.4
scrapy-playwright==0.0.26
scrapy-zyte-smartproxy==2.2.0
scrapyrt==0.13.0
service-identity==21.1.0
shub==2.14.5
six==1.16.0
tldextract==3.4.4
toml==0.10.2
tqdm==4.55.1
Twisted==22.10.0
typing_extensions==4.6.3
urllib3==2.0.3
w3lib==2.1.1
websocket-client==1.6.1
zope.interface==6.0
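The pinned stack above pairs Scrapy 2.9 with scrapy-playwright and a headless WebKit browser (selected by PLAYWRIGHT_BROWSER_TYPE in the settings below). A quick standalone sanity check, assuming the dependencies were installed with pip and the browser binary fetched with `playwright install webkit`, might look like this:

# Sanity-check sketch (assumes `pip install -r requirements.txt` and
# `playwright install webkit` have been run): launches the same headless
# WebKit browser the Scrapy settings below rely on and loads one page.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.webkit.launch(headless=True)
    page = browser.new_page()
    page.goto("https://example.com")
    print(page.title())  # expected output: "Example Domain"
    browser.close()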
# Scrapy settings for palscraper project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "palscraper"
SPIDER_MODULES = ["palscraper.spiders"]
NEWSPIDER_MODULE = "palscraper.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "palscraper (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Cookies are enabled by default; keep them on explicitly
COOKIES_ENABLED = True
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "palscraper.middlewares.PalscraperSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "palscraper.middlewares.PalscraperDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# "palscraper.pipelines.PalscraperPipeline": 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
PLAYWRIGHT_LAUNCH_OPTIONS = {"headless": True}
PLAYWRIGHT_BROWSER_TYPE = "webkit"
PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT = 100000  # milliseconds (100 s)
DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
# Enable the Zyte Smart Proxy Middleware
DOWNLOADER_MIDDLEWARES = {
    'scrapy_zyte_smartproxy.ZyteSmartProxyMiddleware': 610,
}
# Enable the Zyte Smart Proxy Manager
ZYTE_SMARTPROXY_ENABLED = True
# Set your Zyte Smart Proxy API Key
ZYTE_SMARTPROXY_APIKEY = 'f096****bc413f9bc2c71f6*******'
# Preserve the delay when using proxies
ZYTE_SMARTPROXY_PRESERVE_DELAY = True
CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS_PER_DOMAIN = 32
AUTOTHROTTLE_ENABLED = False
DOWNLOAD_TIMEOUT = 600  # seconds
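ITEM_PIPELINES is left commented out above, so the flight dicts yielded by the spider below go straight to the feed exporter. If post-processing were wanted, a hypothetical palscraper/pipelines.py matching the commented setting could be as small as this sketch (not part of the original gist):

# Hypothetical pipeline sketch for palscraper/pipelines.py; it would only run
# if the ITEM_PIPELINES block above were uncommented. It trims surrounding
# whitespace from every string field of the yielded flight dicts.
from itemadapter import ItemAdapter


class PalscraperPipeline:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        for field, value in adapter.asdict().items():
            if isinstance(value, str):
                adapter[field] = value.strip()
        return adapter.item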
import scrapy
from scrapy_playwright.page import PageMethod


class QuotesSpider(scrapy.Spider):
    name = 'united'

    def start_requests(self):
        # Spider arguments (passed with -a) with sensible defaults.
        depart_date = getattr(self, 'depart_date', '2023-08-25')
        depart_from = getattr(self, 'depart_from', 'JFK')
        arrival_to = getattr(self, 'arrival_to', 'DFW')
        print(f'depart_date {depart_date}')
        print(f'from {depart_from}')
        print(f'to {arrival_to}')
        headers = {
            # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': 'https://www.united.com',
        }
        yield scrapy.Request(
            headers=headers,
            url=f"https://www.united.com/en/us/fsr/choose-flights?f={depart_from}&t={arrival_to}&d={depart_date}&tt=1&at=1&sc=7&px=1&taxng=1&newHP=True&clm=7&st=bestmatches&tqp=A",
            meta={
                # 'dont_proxy': True,
                "playwright": True,
                "playwright_include_page": True,
                "playwright_page_methods": [
                    # PageMethod("wait_for_selector", '.search-form--fields-airports'),
                ],
            },
            errback=self.errback_close_page,
        )

    async def parse(self, response):
        # Playwright page instance (available because playwright_include_page=True).
        page = response.meta["playwright_page"]
        await page.wait_for_timeout(1000)
        # Close the login sidebar modal.
        await page.click('button#closeBtn')
        # Show all flights.
        # await page.click('.app-components-Shopping-ResultFooter-styles__buttonContainer--T6Hxj')
        # await page.wait_for_timeout(30000)
        flights = response.css(".app-components-Shopping-GridItem-styles__flightRow--1E4Sk")
        for index, flight in enumerate(flights):
            departure_time = flight.css(".app-components-Shopping-FlightInfoBlock-styles__departTime--oRDUv ::text").get()
            arrival_time = flight.css(".app-components-Shopping-FlightInfoBlock-styles__arrivalTime--1V4Lg ::text").get()
            duration = flight.css(".app-components-Shopping-FlightInfoBlock-styles__dividerLine--2s5M8 ::text").get()
            origin = flight.css(".app-components-Shopping-FlightInfoBlock-styles__departAirport--1V3Dd ::text").get()
            destination = flight.css(".app-components-Shopping-FlightInfoBlock-styles__arrivalAirport--2976a ::text").get()
            flight_stop = flight.css('.app-components-Shopping-FlightBaseCard-styles__flightHeaderRight--25F4- ::text').get()
            # Price / points per cabin.
            price_economy = flight.css('[aria-describedby="MIN-ECONOMY-SURP-OR-DISP"] .app-components-Shopping-Miles-styles__fontStyle--3swxB::text').get()
            price_premium = flight.css('[aria-describedby="ECO-PREMIUM-DISP"] .app-components-Shopping-Miles-styles__fontStyle--3swxB::text').get()
            price_business = flight.css('[aria-describedby="MIN-BUSINESS-SURP-OR-DISP"] .app-components-Shopping-Miles-styles__fontStyle--3swxB::text').get()
            yield {
                "index": index,
                "airline": "united",
                "depart_time": departure_time,
                "arrival_time": arrival_time,
                "flight_duration": duration,
                "depart_from": origin,
                "arrival_to": destination,
                "flight_stop": flight_stop,
                "price_economy": price_economy,
                "price_premium": price_premium,
                "price_business": price_business,
            }
        # await page.wait_for_timeout(3000)
        # Take a screenshot.
        # screenshot = await page.screenshot(path="./screenshots/united.png", full_page=True)
        # Release the Playwright page now that parsing is done.
        await page.close()

    async def errback_close_page(self, failure):
        page = failure.request.meta["playwright_page"]
        await page.close()
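The spider reads depart_date, depart_from and arrival_to with getattr, so they can be supplied as spider arguments, e.g. `scrapy crawl united -a depart_date=2023-09-15 -a depart_from=JFK -a arrival_to=LAX`. A minimal sketch of an equivalent programmatic run, assuming it is executed inside the palscraper project so get_project_settings() picks up the settings above:

# Programmatic run sketch; assumes the working directory is the palscraper
# project so get_project_settings() loads the settings shown earlier.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl(
    "united",                  # spider name defined above
    depart_date="2023-09-15",  # example arguments; defaults exist in the spider
    depart_from="JFK",
    arrival_to="LAX",
)
process.start()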