attrs==23.1.0
Automat==22.10.0
certifi==2023.5.7
cffi==1.15.1
charset-normalizer==3.1.0
click==8.1.6
constantly==15.1.0
cryptography==41.0.1
cssselect==1.2.0
docker==6.1.3
fake-useragent==1.1.3
Faker==18.11.2
filelock==3.12.1
greenlet==2.0.2
hyperlink==21.0.0
idna==3.4
incremental==22.10.0
itemadapter==0.8.0
itemloaders==1.1.0
jmespath==1.0.1
lxml==4.9.2
packaging==23.1
parsel==1.8.1
playwright==1.34.0
Protego==0.2.1
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycparser==2.21
PyDispatcher==2.0.7
pyee==9.0.4
pyOpenSSL==23.2.0
python-dateutil==2.8.2
PyYAML==6.0.1
queuelib==1.6.2
requests==2.31.0
requests-file==1.5.1
retrying==1.3.4
scrapinghub==2.4.0
Scrapy==2.9.0
scrapy-fake-useragent==1.4.4
scrapy-playwright==0.0.26
scrapy-zyte-smartproxy==2.2.0
scrapyrt==0.13.0
service-identity==21.1.0
shub==2.14.5
six==1.16.0
tldextract==3.4.4
toml==0.10.2
tqdm==4.55.1
Twisted==22.10.0
typing_extensions==4.6.3
urllib3==2.0.3
w3lib==2.1.1
websocket-client==1.6.1
zope.interface==6.0
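The pinned stack above pairs Scrapy 2.9 with scrapy-playwright and a headless WebKit browser (selected by PLAYWRIGHT_BROWSER_TYPE in the settings below). A quick standalone sanity check, assuming the dependencies were installed with pip and the browser binary fetched with `playwright install webkit`, might look like this:

# Sanity-check sketch (assumes `pip install -r requirements.txt` and
# `playwright install webkit` have been run): launches the same headless
# WebKit browser the Scrapy settings below rely on and loads one page.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.webkit.launch(headless=True)
    page = browser.new_page()
    page.goto("https://example.com")
    print(page.title())  # expected output: "Example Domain"
    browser.close()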
# Scrapy settings for palscraper project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "palscraper"
SPIDER_MODULES = ["palscraper.spiders"]
NEWSPIDER_MODULE = "palscraper.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "palscraper (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Cookies are enabled by default; keep them on explicitly
COOKIES_ENABLED = True
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "palscraper.middlewares.PalscraperSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "palscraper.middlewares.PalscraperDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# "palscraper.pipelines.PalscraperPipeline": 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
PLAYWRIGHT_LAUNCH_OPTIONS = {"headless": True}
PLAYWRIGHT_BROWSER_TYPE = "webkit"
PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT = 100000  # milliseconds (100 s)
DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
# Enable the Zyte Smart Proxy Middleware
DOWNLOADER_MIDDLEWARES = {
    'scrapy_zyte_smartproxy.ZyteSmartProxyMiddleware': 610,
}
# Enable the Zyte Smart Proxy Manager
ZYTE_SMARTPROXY_ENABLED = True
# Set your Zyte Smart Proxy API Key
ZYTE_SMARTPROXY_APIKEY = 'f096****bc413f9bc2c71f6*******'
# Preserve the delay when using proxies
ZYTE_SMARTPROXY_PRESERVE_DELAY = True
CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS_PER_DOMAIN = 32
AUTOTHROTTLE_ENABLED = False
DOWNLOAD_TIMEOUT = 600  # seconds
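ITEM_PIPELINES is left commented out above, so the flight dicts yielded by the spider below go straight to the feed exporter. If post-processing were wanted, a hypothetical palscraper/pipelines.py matching the commented setting could be as small as this sketch (not part of the original gist):

# Hypothetical pipeline sketch for palscraper/pipelines.py; it would only run
# if the ITEM_PIPELINES block above were uncommented. It trims surrounding
# whitespace from every string field of the yielded flight dicts.
from itemadapter import ItemAdapter


class PalscraperPipeline:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        for field, value in adapter.asdict().items():
            if isinstance(value, str):
                adapter[field] = value.strip()
        return adapter.item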
import scrapy
from scrapy_playwright.page import PageMethod


class QuotesSpider(scrapy.Spider):
    name = 'united'

    def start_requests(self):
        # Spider arguments (passed with -a) with sensible defaults.
        depart_date = getattr(self, 'depart_date', '2023-08-25')
        depart_from = getattr(self, 'depart_from', 'JFK')
        arrival_to = getattr(self, 'arrival_to', 'DFW')
        print(f'depart_date {depart_date}')
        print(f'from {depart_from}')
        print(f'to {arrival_to}')
        headers = {
            # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': 'https://www.united.com',
        }
        yield scrapy.Request(
            headers=headers,
            url=f"https://www.united.com/en/us/fsr/choose-flights?f={depart_from}&t={arrival_to}&d={depart_date}&tt=1&at=1&sc=7&px=1&taxng=1&newHP=True&clm=7&st=bestmatches&tqp=A",
            meta={
                # 'dont_proxy': True,
                "playwright": True,
                "playwright_include_page": True,
                "playwright_page_methods": [
                    # PageMethod("wait_for_selector", '.search-form--fields-airports'),
                ],
            },
            errback=self.errback_close_page,
        )

    async def parse(self, response):
        # Playwright page instance (available because playwright_include_page=True).
        page = response.meta["playwright_page"]
        await page.wait_for_timeout(1000)
        # Close the login sidebar modal.
        await page.click('button#closeBtn')
        # Show all flights.
        # await page.click('.app-components-Shopping-ResultFooter-styles__buttonContainer--T6Hxj')
        # await page.wait_for_timeout(30000)
        flights = response.css(".app-components-Shopping-GridItem-styles__flightRow--1E4Sk")
        for index, flight in enumerate(flights):
            departure_time = flight.css(".app-components-Shopping-FlightInfoBlock-styles__departTime--oRDUv ::text").get()
            arrival_time = flight.css(".app-components-Shopping-FlightInfoBlock-styles__arrivalTime--1V4Lg ::text").get()
            duration = flight.css(".app-components-Shopping-FlightInfoBlock-styles__dividerLine--2s5M8 ::text").get()
            origin = flight.css(".app-components-Shopping-FlightInfoBlock-styles__departAirport--1V3Dd ::text").get()
            destination = flight.css(".app-components-Shopping-FlightInfoBlock-styles__arrivalAirport--2976a ::text").get()
            flight_stop = flight.css('.app-components-Shopping-FlightBaseCard-styles__flightHeaderRight--25F4- ::text').get()
            # Price / points per cabin.
            price_economy = flight.css('[aria-describedby="MIN-ECONOMY-SURP-OR-DISP"] .app-components-Shopping-Miles-styles__fontStyle--3swxB::text').get()
            price_premium = flight.css('[aria-describedby="ECO-PREMIUM-DISP"] .app-components-Shopping-Miles-styles__fontStyle--3swxB::text').get()
            price_business = flight.css('[aria-describedby="MIN-BUSINESS-SURP-OR-DISP"] .app-components-Shopping-Miles-styles__fontStyle--3swxB::text').get()
            yield {
                "index": index,
                "airline": "united",
                "depart_time": departure_time,
                "arrival_time": arrival_time,
                "flight_duration": duration,
                "depart_from": origin,
                "arrival_to": destination,
                "flight_stop": flight_stop,
                "price_economy": price_economy,
                "price_premium": price_premium,
                "price_business": price_business,
            }
        # await page.wait_for_timeout(3000)
        # Take a screenshot.
        # screenshot = await page.screenshot(path="./screenshots/united.png", full_page=True)
        # Release the Playwright page now that parsing is done.
        await page.close()

    async def errback_close_page(self, failure):
        page = failure.request.meta["playwright_page"]
        await page.close()
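The spider reads depart_date, depart_from and arrival_to with getattr, so they can be supplied as spider arguments, e.g. `scrapy crawl united -a depart_date=2023-09-15 -a depart_from=JFK -a arrival_to=LAX`. A minimal sketch of an equivalent programmatic run, assuming it is executed inside the palscraper project so get_project_settings() picks up the settings above:

# Programmatic run sketch; assumes the working directory is the palscraper
# project so get_project_settings() loads the settings shown earlier.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl(
    "united",                  # spider name defined above
    depart_date="2023-09-15",  # example arguments; defaults exist in the spider
    depart_from="JFK",
    arrival_to="LAX",
)
process.start()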