@Granitosaurus
Created November 17, 2021 07:20
Aircraft scraper for the avbuyer.com website.
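The script below uses aiohttp for asynchronous HTTP requests and parsel for HTML parsing; assuming a standard Python 3.7+ environment, both can be installed with `pip install aiohttp parsel`.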
import asyncio

from aiohttp.client import ClientSession
from parsel import Selector


async def get_page(page: int, session: ClientSession):
    """Shortcut function that requests a specific page of the listing pagination."""
    url = f"https://www.avbuyer.com/aircraft/private-jets/page-{page}"
    print(f"requesting {url}")
    return await session.get(url)
async def parse_page(resp):
    """Parse function that extracts airplane details from a pagination page."""
    # first we need to find all airplane boxes on the page, there should be 20 of them
    airplanes = Selector(await resp.text()).css(".listing-item")
    # then we parse the boxes one by one for airplane data
    parsed = []
    for plane in airplanes:
        # plane.css() selects RELATIVE to the plane box,
        # so plane.css(".item-title") will only select nodes under the current plane box
        parsed.append({
            # note: the ::text pseudo-selector selects the text of a node rather than the whole node
            # note: parsel selectors have a convenient .get() method that takes the first value of the selected nodes
            "title": plane.css(".item-title ::text").get(),
            "price": plane.css(".price::text").get(),
            "updated": plane.css(".list-update::text").get(),
            # .get("") defaults to an empty string so .strip() can't fail on a missing node
            "description": plane.css(".list-item-para::text").get("").strip(),
            # since location can contain multiple nodes we join them explicitly
            "location": " ".join(
                plane.css(".list-item-location::text").extract()
            ).strip(),
            # same for other details
            "other_details": "\n".join(
                plane.css(".list-other-dtl ::text").extract()
            ).strip(),
            # etc. other fields just have different classes, so they are left up to you
        })
    return parsed
async def scrape():
    async with ClientSession() as session:
        # we start with the first page of the listing pagination, where we get the total amount
        # of pages, so we can scrape all pages concurrently rather than going to the next page
        # one by one, which is very slow!
        first_page = await get_page(1, session)
        first_page_sel = Selector(await first_page.text())
        total_pages = int(
            first_page_sel.xpath(
                '//li[@class="pagination-next"]/preceding-sibling::li[1]/a/text()'
            ).get()
        )
        # now that we have the total pages and the first page data we can extract
        # airplane data for the first page and...
        parsed = await parse_page(first_page)
        # ...schedule the remaining pages asynchronously.
        # This might appear complex, but asyncio.as_completed runs the coroutines concurrently
        # and yields their futures in completion order, i.e. first-come-first-serve
        for page_future in asyncio.as_completed(
            # range() excludes the end value, so +1 to include the last page
            [get_page(i, session) for i in range(2, total_pages + 1)]
        ):
            # since we get a future here, not a full response, we need to await it
            response = await page_future
            # once we have the response object all we have to do is parse the airplane data
            # from it with the function we wrote above!
            parsed.extend(await parse_page(response))
        # this will generate a list of dictionaries
        # you can further dump this as json with `json.dumps(parsed)`
        return parsed
if __name__ == "__main__":
    asyncio.run(scrape())
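As the final comment notes, the result is a plain list of dictionaries and can be dumped straight to JSON. A minimal usage sketch (the airplanes.json filename is just an illustration):

import asyncio
import json

# run the scraper and write the listings to a file
results = asyncio.run(scrape())
with open("airplanes.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)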