Skip to content

Instantly share code, notes, and snippets.

@cjnghn
Created October 26, 2023 05:38
Show Gist options
  • Save cjnghn/af6efa73ada0db3d3b48bce5a42827ed to your computer and use it in GitHub Desktop.
Save cjnghn/af6efa73ada0db3d3b48bce5a42827ed to your computer and use it in GitHub Desktop.
crawling examples: aiohttp + pagination + detail view
import asyncio
import pandas as pd
from aiohttp import ClientSession
from bs4 import BeautifulSoup
from typing import List, Dict, Optional, Any
# Algolia multi-queries endpoint for the Milipol exhibitor catalogue.
ALGOLIA_URL = 'https://w1bghm6ujn-dsn.algolia.net/1/indexes/*/queries'
# Request headers expected by Algolia. NOTE(review): the api-key here is
# presumably the site's public search-only key (it is sent by the browser
# frontend) — confirm before publishing; never embed an admin key.
HEADERS = {
    'x-algolia-agent': 'Algolia for JavaScript (3.35.1); Browser; instantsearch.js (3.7.0); Vue (2.6.11); Vue InstantSearch (2.6.0); JS Helper (2.28.0)',
    'x-algolia-application-id': 'W1BGHM6UJN',
    'x-algolia-api-key': 'eeb43ac4430d1b80eac787aca249803e'
}
async def fetch_list(session: ClientSession, pageNo: int = 1) -> List[Dict[str, Any]]:
    """Fetch one page of exhibitor search hits from the Algolia catalogue index.

    Args:
        session: Shared aiohttp client session.
        pageNo: Page index to request from Algolia.

    Returns:
        The list of hit dicts for the requested page, or an empty list on a
        non-200 response (the error status is printed, matching the script's
        best-effort style).
    """
    # BUG FIX: Algolia's multi-queries API reads pagination from the
    # URL-encoded 'params' string of each request object; the original body
    # hard-coded page=1 inside 'params' and put pageNo in a sibling 'page'
    # key that is not part of that schema, so every call returned the same
    # page. Encode the requested page number into 'params' instead.
    params = (
        'query=&maxValuesPerFacet=100'
        f'&page={pageNo}'
        '&highlightPreTag=__ais-highlight__'
        '&highlightPostTag=__%2Fais-highlight__'
        '&facets=%5B%22targets%22%2C%22thematics%22%2C%22stands.sector%22'
        '%2C%22brands.name%22%2C%22address.country%22%2C%22isPressRelease%22'
        '%2C%22businessArea.categories.lvl0%22%5D'
        '&tagFilters='
    )
    payload = {
        'requests': [{
            'indexName': 'catalog.prod.milipol.exhibitors.en.name-asc',
            'params': params,
        }]
    }
    async with session.post(ALGOLIA_URL, headers=HEADERS, json=payload) as resp:
        if resp.status == 200:
            data = await resp.json()
            return data['results'][0]['hits']
        print('Error:', resp.status)
        return []
async def fetch_detail(session: ClientSession, item: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Fetch and parse one exhibitor's detail page.

    Args:
        session: Shared aiohttp client session.
        item: A single Algolia hit; its 'hreflang' mapping supplies the
            page slug (English preferred, French as fallback).

    Returns:
        A flat dict of the scraped fields, or None when the hit has no
        usable slug or the page request fails.
    """
    slug = item['hreflang'].get('en') or item['hreflang'].get('fr')
    if not slug:
        return None

    url = f"https://en.milipol.com/Catalogue/Exhibitor/{slug}"
    async with session.get(url) as resp:
        if resp.status != 200:
            print('Error:', resp.status)
            return None
        html = await resp.text()

    soup = BeautifulSoup(html, 'html.parser')

    def first_text(selector: str) -> Optional[str]:
        # Stripped text of the first matching element, or None if absent.
        node = soup.select_one(selector)
        return node.text.strip() if node else None

    name = first_text('h1')
    stand = first_text('.CatalogStandsBlock-stand')
    description = first_text('.CatalogParagraph')

    # Address: join all <span> fragments; treat the final whitespace-separated
    # token as the country and the rest as the street address.
    spans = soup.select('.CatalogExhibitorStrip-address > span')
    full_address = ' '.join(span.text for span in spans)
    tokens = full_address.split(' ')
    address = ' '.join(tokens[:-1]).strip()
    country = tokens[-1].strip() if full_address else None

    # Website: the href lives on the web icon element itself here —
    # NOTE(review): confirm the href is on the icon, not a wrapping <a>.
    icon = soup.select_one('.CatalogExhibitorStrip-socialitem > .cc2-icon-web')
    website = icon.attrs['href'] if icon else None

    # Activity fields: one line per activity list, labels joined with ' > '.
    activity_fields = '\n'.join(
        ' > '.join(label.text.strip() for label in block.select('.CatalogLabel'))
        for block in soup.select('.CatalogActivityList')
    )

    return {
        'url': url,
        'name': name,
        'address': address,
        'country': country,
        'stand': stand,
        'website': website,
        'description': description,
        'activity_fields': activity_fields
    }
async def main_async():
    """Walk every listing page, scrape each exhibitor's detail page
    concurrently, and write the collected rows to an Excel file."""
    rows: List[Dict[str, Any]] = []

    async def collect(session: ClientSession, item: Dict[str, Any]) -> None:
        # Append the parsed detail record; hits without a page are skipped.
        detail = await fetch_detail(session, item)
        if detail:
            rows.append(detail)
            print(detail.get('name'))

    async with ClientSession() as session:
        page = 1
        while True:
            print('[🐈 Page]:', page)
            hits = await fetch_list(session, page)
            if not hits:
                break
            # One task per hit on this page; pages themselves are sequential.
            await asyncio.gather(*(collect(session, item) for item in hits))
            page += 1

    frame = pd.DataFrame(rows)
    frame.to_excel("결과.xlsx", index=False, engine='openpyxl')


if __name__ == '__main__':
    asyncio.run(main_async())
@cjnghn
Copy link
Author

cjnghn commented Oct 26, 2023

aiohttp is superfast...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment