Created
October 26, 2023 05:38
-
-
Save cjnghn/af6efa73ada0db3d3b48bce5a42827ed to your computer and use it in GitHub Desktop.
crawling examples: aiohttp + pagination + detail view
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import pandas as pd | |
from aiohttp import ClientSession | |
from bs4 import BeautifulSoup | |
from typing import List, Dict, Optional, Any | |
# Algolia multi-query search endpoint for the Milipol exhibitor catalogue.
ALGOLIA_URL = 'https://w1bghm6ujn-dsn.algolia.net/1/indexes/*/queries'

# Public search-only Algolia credentials, as used by the catalogue frontend.
HEADERS = {
    'x-algolia-agent': (
        'Algolia for JavaScript (3.35.1); Browser; instantsearch.js (3.7.0); '
        'Vue (2.6.11); Vue InstantSearch (2.6.0); JS Helper (2.28.0)'
    ),
    'x-algolia-application-id': 'W1BGHM6UJN',
    'x-algolia-api-key': 'eeb43ac4430d1b80eac787aca249803e',
}
async def fetch_list(session: ClientSession, pageNo: int = 1) -> List[Dict[str, Any]]:
    """Fetch one page of exhibitor hits from the Algolia search index.

    Args:
        session: Shared aiohttp client session.
        pageNo: Algolia page number to request.

    Returns:
        The list of hit dicts for the requested page, or ``[]`` on a
        non-200 response (the error status is printed, matching the
        script's best-effort style).
    """
    # NOTE(fix): the original hard-coded `page=1` inside the url-encoded
    # `params` string and passed the real page number only as a sibling
    # 'page' key. For the multi-query endpoint Algolia reads query options
    # from `params`, so every request presumably returned page 1 and the
    # caller's pagination loop could never advance. Interpolate pageNo here;
    # the sibling key is kept for backward compatibility with the old shape.
    params = (
        'query=&maxValuesPerFacet=100'
        f'&page={pageNo}'
        '&highlightPreTag=__ais-highlight__'
        '&highlightPostTag=__%2Fais-highlight__'
        '&facets=%5B%22targets%22%2C%22thematics%22%2C%22stands.sector%22%2C'
        '%22brands.name%22%2C%22address.country%22%2C%22isPressRelease%22%2C'
        '%22businessArea.categories.lvl0%22%5D&tagFilters='
    )
    payload = {
        'requests': [{
            'indexName': 'catalog.prod.milipol.exhibitors.en.name-asc',
            'params': params,
            'page': pageNo,
        }]
    }
    async with session.post(ALGOLIA_URL, headers=HEADERS, json=payload) as resp:
        if resp.status == 200:
            data = await resp.json()
            return data['results'][0]['hits']
        print('Error:', resp.status)
        return []
async def fetch_detail(session: ClientSession, item: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Fetch and scrape one exhibitor's detail page.

    Args:
        session: Shared aiohttp client session.
        item: A hit dict from :func:`fetch_list`; only its ``hreflang``
            entry (keys ``'en'``/``'fr'``) is read.

    Returns:
        A dict with url/name/address/country/stand/website/description/
        activity_fields, or ``None`` when no href is available or the
        page request fails.
    """
    # NOTE(fix): item['hreflang'] raised KeyError for hits without the key;
    # use .get so such hits are skipped like hits with an empty href.
    hreflang = item.get('hreflang') or {}
    href = hreflang.get('en') or hreflang.get('fr')
    if not href:
        return None

    url = f"https://en.milipol.com/Catalogue/Exhibitor/{href}"
    async with session.get(url) as resp:
        if resp.status != 200:
            print('Error:', resp.status)
            return None
        page_content = await resp.text()
    # Parse outside the context manager: the body is fully read, so the
    # connection can go back to the pool while we scrape.
    soup = BeautifulSoup(page_content, 'html.parser')

    # Name extraction
    name_element = soup.select_one('h1')
    name = name_element.text.strip() if name_element else None

    # Address extraction: the last space-separated token is treated as the
    # country, everything before it as the street address.
    address_elements = soup.select('.CatalogExhibitorStrip-address > span')
    full_address = ' '.join([p.text for p in address_elements])
    address = ' '.join(full_address.split(' ')[:-1]).strip()
    country = full_address.split(' ')[-1].strip() if full_address else None

    # Stand extraction
    stand_element = soup.select_one('.CatalogStandsBlock-stand')
    stand = stand_element.text.strip() if stand_element else None

    # Website extraction
    # NOTE(fix): attrs['href'] raised KeyError when the icon element had no
    # href attribute; .get returns None instead.
    # NOTE(review): the href is read off the icon element itself — confirm
    # it is not carried by a parent <a> on the live markup.
    website_icon = soup.select_one('.CatalogExhibitorStrip-socialitem > .cc2-icon-web')
    website = website_icon.get('href') if website_icon else None

    # Description extraction
    description_element = soup.select_one('.CatalogParagraph')
    description = description_element.text.strip() if description_element else None

    # Activity fields: one line per activity list, breadcrumb-joined labels.
    activity_elements = soup.select('.CatalogActivityList')
    activity_fields = '\n'.join([
        ' > '.join([field.text.strip() for field in activity.select('.CatalogLabel')])
        for activity in activity_elements
    ])

    return {
        'url': url,
        'name': name,
        'address': address,
        'country': country,
        'stand': stand,
        'website': website,
        'description': description,
        'activity_fields': activity_fields
    }
async def main_async():
    """Walk every exhibitor list page, scrape each detail page
    concurrently, and dump the collected records to an Excel file."""
    records: List[Dict[str, Any]] = []

    async def handle(session: ClientSession, hit: Dict[str, Any]) -> None:
        # Append each record as soon as its detail page is parsed.
        detail = await fetch_detail(session, hit)
        if detail:
            records.append(detail)
            print(detail.get('name'))

    async with ClientSession() as session:
        page = 1
        while True:
            print('[🐈 Page]:', page)
            hits = await fetch_list(session, page)
            if not hits:
                break
            # One task per hit; a page's details are fetched concurrently.
            await asyncio.gather(*(handle(session, hit) for hit in hits))
            page += 1

    df = pd.DataFrame(records)
    df.to_excel("결과.xlsx", index=False, engine='openpyxl')


if __name__ == '__main__':
    asyncio.run(main_async())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
aiohttp is superfast...