@Granitosaurus
Created November 17, 2021 07:20
Aircraft scraper for the avbuyer.com website.
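The script below uses aiohttp for asynchronous HTTP requests and parsel for HTML parsing; assuming a standard Python 3.7+ environment, both can be installed with `pip install aiohttp parsel`.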
import asyncio

from aiohttp.client import ClientSession
from parsel import Selector


async def get_page(page: int, session: ClientSession):
    """Shortcut function that requests a specific page of the listing pagination."""
    url = f"https://www.avbuyer.com/aircraft/private-jets/page-{page}"
    print(f"requesting {url}")
    return await session.get(url)
async def parse_page(resp):
    """Parse function that extracts airplane details from a pagination page."""
    # first we need to find all airplane boxes on the page, there should be 20 of them
    airplanes = Selector(await resp.text()).css(".listing-item")
    # then we parse the boxes one by one for airplane data
    parsed = []
    for plane in airplanes:
        # plane.css() selects RELATIVE to the plane box,
        # so plane.css(".item-title") will only select nodes under the current plane box
        parsed.append({
            # note: the ::text pseudo-selector selects the text of a node rather than the whole node
            # note: parsel selectors have a convenient .get() method that takes the first value of the selected nodes
            "title": plane.css(".item-title ::text").get(),
            "price": plane.css(".price::text").get(),
            "updated": plane.css(".list-update::text").get(),
            # .get("") defaults to an empty string so .strip() can't fail on a missing node
            "description": plane.css(".list-item-para::text").get("").strip(),
            # since location can contain multiple nodes we join them explicitly
            "location": " ".join(
                plane.css(".list-item-location::text").extract()
            ).strip(),
            # same for other details
            "other_details": "\n".join(
                plane.css(".list-other-dtl ::text").extract()
            ).strip(),
            # etc. other fields just have different classes, so they are left up to you
        })
    return parsed
async def scrape():
    async with ClientSession() as session:
        # we start with the first page of the listing pagination, where we get the total amount
        # of pages, so we can scrape all pages concurrently rather than going to the next page
        # one by one, which is very slow!
        first_page = await get_page(1, session)
        first_page_sel = Selector(await first_page.text())
        total_pages = int(
            first_page_sel.xpath(
                '//li[@class="pagination-next"]/preceding-sibling::li[1]/a/text()'
            ).get()
        )
        # now that we have the total pages and the first page data we can extract
        # airplane data for the first page and...
        parsed = await parse_page(first_page)
        # ...schedule the remaining pages asynchronously.
        # This might appear complex, but asyncio.as_completed runs the coroutines concurrently
        # and yields their futures in completion order, i.e. first-come-first-serve
        for page_future in asyncio.as_completed(
            # range() excludes the end value, so +1 to include the last page
            [get_page(i, session) for i in range(2, total_pages + 1)]
        ):
            # since we get a future here, not a full response, we need to await it
            response = await page_future
            # once we have the response object all we have to do is parse the airplane data
            # from it with the function we wrote above!
            parsed.extend(await parse_page(response))
        # this will generate a list of dictionaries
        # you can further dump this as json with `json.dumps(parsed)`
        return parsed
if __name__ == "__main__":
    asyncio.run(scrape())
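As the final comment notes, the result is a plain list of dictionaries and can be dumped straight to JSON. A minimal usage sketch (the airplanes.json filename is just an illustration):

import asyncio
import json

# run the scraper and write the listings to a file
results = asyncio.run(scrape())
with open("airplanes.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)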