Created
October 26, 2023 05:38
-
-
Save cjnghn/af6efa73ada0db3d3b48bce5a42827ed to your computer and use it in GitHub Desktop.
crawling examples: aiohttp + pagination + detail view
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import pandas as pd | |
from aiohttp import ClientSession | |
from bs4 import BeautifulSoup | |
from typing import List, Dict, Optional, Any | |
# Algolia multi-query search endpoint for the Milipol exhibitor catalogue.
ALGOLIA_URL = 'https://w1bghm6ujn-dsn.algolia.net/1/indexes/*/queries'

# Public search-only Algolia credentials, as used by the catalogue frontend.
HEADERS = {
    'x-algolia-agent': (
        'Algolia for JavaScript (3.35.1); Browser; instantsearch.js (3.7.0); '
        'Vue (2.6.11); Vue InstantSearch (2.6.0); JS Helper (2.28.0)'
    ),
    'x-algolia-application-id': 'W1BGHM6UJN',
    'x-algolia-api-key': 'eeb43ac4430d1b80eac787aca249803e',
}
async def fetch_list(session: ClientSession, pageNo: int = 1) -> List[Dict[str, Any]]:
    """Fetch one page of exhibitor hits from the Algolia search index.

    Args:
        session: Shared aiohttp client session.
        pageNo: Algolia page number to request.

    Returns:
        The list of hit dicts for the requested page, or ``[]`` on a
        non-200 response (the error status is printed, matching the
        script's best-effort style).
    """
    # NOTE(fix): the original hard-coded `page=1` inside the url-encoded
    # `params` string and passed the real page number only as a sibling
    # 'page' key. For the multi-query endpoint Algolia reads query options
    # from `params`, so every request presumably returned page 1 and the
    # caller's pagination loop could never advance. Interpolate pageNo here;
    # the sibling key is kept for backward compatibility with the old shape.
    params = (
        'query=&maxValuesPerFacet=100'
        f'&page={pageNo}'
        '&highlightPreTag=__ais-highlight__'
        '&highlightPostTag=__%2Fais-highlight__'
        '&facets=%5B%22targets%22%2C%22thematics%22%2C%22stands.sector%22%2C'
        '%22brands.name%22%2C%22address.country%22%2C%22isPressRelease%22%2C'
        '%22businessArea.categories.lvl0%22%5D&tagFilters='
    )
    payload = {
        'requests': [{
            'indexName': 'catalog.prod.milipol.exhibitors.en.name-asc',
            'params': params,
            'page': pageNo,
        }]
    }
    async with session.post(ALGOLIA_URL, headers=HEADERS, json=payload) as resp:
        if resp.status == 200:
            data = await resp.json()
            return data['results'][0]['hits']
        print('Error:', resp.status)
        return []
async def fetch_detail(session: ClientSession, item: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Fetch and scrape one exhibitor's detail page.

    Args:
        session: Shared aiohttp client session.
        item: A hit dict from :func:`fetch_list`; only its ``hreflang``
            entry (keys ``'en'``/``'fr'``) is read.

    Returns:
        A dict with url/name/address/country/stand/website/description/
        activity_fields, or ``None`` when no href is available or the
        page request fails.
    """
    # NOTE(fix): item['hreflang'] raised KeyError for hits without the key;
    # use .get so such hits are skipped like hits with an empty href.
    hreflang = item.get('hreflang') or {}
    href = hreflang.get('en') or hreflang.get('fr')
    if not href:
        return None

    url = f"https://en.milipol.com/Catalogue/Exhibitor/{href}"
    async with session.get(url) as resp:
        if resp.status != 200:
            print('Error:', resp.status)
            return None
        page_content = await resp.text()
    # Parse outside the context manager: the body is fully read, so the
    # connection can go back to the pool while we scrape.
    soup = BeautifulSoup(page_content, 'html.parser')

    # Name extraction
    name_element = soup.select_one('h1')
    name = name_element.text.strip() if name_element else None

    # Address extraction: the last space-separated token is treated as the
    # country, everything before it as the street address.
    address_elements = soup.select('.CatalogExhibitorStrip-address > span')
    full_address = ' '.join([p.text for p in address_elements])
    address = ' '.join(full_address.split(' ')[:-1]).strip()
    country = full_address.split(' ')[-1].strip() if full_address else None

    # Stand extraction
    stand_element = soup.select_one('.CatalogStandsBlock-stand')
    stand = stand_element.text.strip() if stand_element else None

    # Website extraction
    # NOTE(fix): attrs['href'] raised KeyError when the icon element had no
    # href attribute; .get returns None instead.
    # NOTE(review): the href is read off the icon element itself — confirm
    # it is not carried by a parent <a> on the live markup.
    website_icon = soup.select_one('.CatalogExhibitorStrip-socialitem > .cc2-icon-web')
    website = website_icon.get('href') if website_icon else None

    # Description extraction
    description_element = soup.select_one('.CatalogParagraph')
    description = description_element.text.strip() if description_element else None

    # Activity fields: one line per activity list, breadcrumb-joined labels.
    activity_elements = soup.select('.CatalogActivityList')
    activity_fields = '\n'.join([
        ' > '.join([field.text.strip() for field in activity.select('.CatalogLabel')])
        for activity in activity_elements
    ])

    return {
        'url': url,
        'name': name,
        'address': address,
        'country': country,
        'stand': stand,
        'website': website,
        'description': description,
        'activity_fields': activity_fields
    }
async def main_async():
    """Walk every exhibitor list page, scrape each detail page
    concurrently, and dump the collected records to an Excel file."""
    records: List[Dict[str, Any]] = []

    async def handle(session: ClientSession, hit: Dict[str, Any]) -> None:
        # Append each record as soon as its detail page is parsed.
        detail = await fetch_detail(session, hit)
        if detail:
            records.append(detail)
            print(detail.get('name'))

    async with ClientSession() as session:
        page = 1
        while True:
            print('[🐈 Page]:', page)
            hits = await fetch_list(session, page)
            if not hits:
                break
            # One task per hit; a page's details are fetched concurrently.
            await asyncio.gather(*(handle(session, hit) for hit in hits))
            page += 1

    df = pd.DataFrame(records)
    df.to_excel("결과.xlsx", index=False, engine='openpyxl')


if __name__ == '__main__':
    asyncio.run(main_async())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
aiohttp is superfast...