Skip to content

Instantly share code, notes, and snippets.

@ohahohah
Last active May 15, 2022 11:44
Show Gist options
  • Save ohahohah/b7f07813437dd235c5f9829188c6d6d5 to your computer and use it in GitHub Desktop.
Save ohahohah/b7f07813437dd235c5f9829188c6d6d5 to your computer and use it in GitHub Desktop.
스탬프투어참여_박물관미술관리스트 스크래핑 후 엑셀에 저장. 구글 내 지도에서 엑셀파일을 지도 마커로 불러올 수 있음. 결과지도 : https://www.google.com/maps/d/edit?mid=1F_gqeG2V5V6ac07dkczUZ8CpmMRhV9lc&usp=sharing
from datetime import datetime
from itertools import chain
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
# Site base URL (punycode form of https://뮤지엄위크.kr).
BASE_URL = 'https://xn--2d3b68pp1a79ecyl.kr' # https://뮤지엄위크.kr
# Event year; also used as the worksheet title.
YEAR = '2022'
# Region name -> row number of that region's <li> on the stamp-tour list page.
REGION_CODE = {'서울': 1, '경기': 2, '강원/인천': 3, '경상/부산/울산/대구': 4, '전라/광주/제주': 5, '충청/대전/세종': 6}
# Desktop Chrome User-Agent header so the site serves its desktop markup.
DESKTOP_CHROME = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
def scrap_museum_urls():
    """Collect institution URL links for every region from the list page."""
    return list(chain.from_iterable(
        scrap_museum_urls_region(code) for code in REGION_CODE.values()))
def scrap_museum_urls_region(region_code):
    """Scrape institution URL links for one region from the stamp-tour list page.

    Parameters:
        region_code (int): region code = the region's <li> row number on the list page

    Returns:
        list: unique institution URLs (deduplicated, so order is not guaranteed)
    """
    data = requests.get(f'{BASE_URL}/program/stamp', headers=DESKTOP_CHROME)
    soup = BeautifulSoup(data.text, 'html.parser')
    # CSS path down to the inner <tbody> that holds this region's institution cells.
    museum_list_tb = f'#app > main > div.container > div > div > div.info09_con > div.contents_con > div ' \
                     f'> table > tbody > tr > td:nth-child(2) > div.list_con > ul > li:nth-child({region_code}) > div' \
                     f'> table >tbody'
    museum_urls = []
    for museum_td in soup.select(f'{museum_list_tb}> tr > td'):
        for museum_li in museum_td.select('div > ul > li'):
            anchor = museum_li.select_one('a')
            if anchor is None:
                # List item without a link — the original crashed here (NoneType subscript).
                continue
            href = anchor.get('href')
            # Skip missing hrefs and 'javascript:;' placeholders.
            if href and is_url(href):
                museum_urls.append(convert_url(BASE_URL, href))
    # Deduplicate (the page can repeat an institution across cells).
    return list(set(museum_urls))
def scrap_museum_info(url):
    """Scrape one institution page for its details.

    Parameters:
        url (string): institution page URL to scrape

    Returns:
        dictionary:
            {'name': place name, 'address': address, 'phone': phone number,
             'homepage': homepage, 'url': institution URL,
             'opening_hrs': {opening-hours info}}
            Missing or empty fields are reported as 'NaN'.
    """
    data = requests.get(url, headers=DESKTOP_CHROME)
    soup = BeautifulSoup(data.text, 'html.parser')
    base_element = '#app > main > div.container.museum.detail > div.museum-header > div.info'

    def _field(selector, strip=False):
        # Text of the selected node under base_element, or 'NaN' when absent/empty.
        node = soup.select_one(f'{base_element}{selector}')
        if node is None:
            return 'NaN'
        text = node.text.strip() if strip else node.text
        return text if text != '' else 'NaN'

    name = _field('> div.name')
    address = _field('> ul > li:nth-child(1) > div', strip=True)
    phone = _field('> ul > li:nth-child(2) > div')
    homepage_raw = soup.select_one(f'{base_element}> ul > li:nth-child(3) > div:nth-child(1) > div > a')
    homepage = homepage_raw['href'] if homepage_raw is not None else 'NaN'
    if homepage == '':
        homepage = 'NaN'
    hr_operation_raw = soup.select_one(f'{base_element}> ul > li.info-group-item.operating')
    if hr_operation_raw is not None:
        # Non-empty lines alternate label/value; zip the iterator with itself to pair them.
        it = iter(filter(bool, hr_operation_raw.text.splitlines()))
        opening_hrs = dict(zip(it, it))
    else:
        opening_hrs = {'평일 관람시간': 'NaN', '공휴일 관람시간': 'NaN', '휴관일': 'NaN'}
    # Built explicitly instead of the original locals() trick, which silently breaks
    # under any local-variable rename.
    return {'name': name, 'address': address, 'phone': phone, 'homepage': homepage,
            'url': url if url != '' else 'NaN', 'opening_hrs': opening_hrs}
def save_xlsx():
    """Save the institutions participating in the museum stamp tour to an xlsx file."""
    xlsx_name = f'museum_stamp_{datetime.now().strftime("%Y%m%d_%H_%M_%S")}.xlsx'
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = YEAR
    headers = ['장소', '주소', '전화번호', '홈페이지', '뮤지엄위크 url', '평일 관람시간', '공휴일 관람시간', '휴관일', ]
    # Header row.
    for col, title in enumerate(headers, start=1):
        sheet.cell(row=1, column=col, value=title)
    # One data row per institution, starting under the header.
    for row, m_url in enumerate(scrap_museum_urls(), start=2):
        record = convert_dict_to_list(scrap_museum_info(m_url))
        print(record)
        for col, value in enumerate(record, start=1):
            sheet.cell(row=row, column=col, value=value)
    workbook.save(xlsx_name)
def convert_dict_to_list(nested_dict):
    """Flatten the info dict into a flat row: the scalar values in order,
    followed by the values of the trailing (nested) opening-hours mapping."""
    *scalars, nested = nested_dict.values()
    return [*scalars, *dict(nested).values()]
def convert_url(site_url, str_url):
    """Join a site base URL and a relative path into one absolute URL."""
    return site_url + str_url
def is_url(str_url):
    """Return True when *str_url* is a real link, i.e. not the 'javascript:;' placeholder."""
    # The comparison is already a bool — no need for the `True if ... else False` wrapper.
    return str_url != 'javascript:;'
if __name__ == '__main__':
    # Entry point: scrape all participating museums and write them to an xlsx file.
    save_xlsx()
def save_xlsx_with_keys():
    """Save the institutions participating in the museum stamp tour to an xlsx file.

    .. deprecated:: 2022.05.15
    """
    museum_urls = scrap_museum_urls()
    xlsx_name = f'museum_stamp_{datetime.now().strftime("%Y%m%d_%H_%M_%S")}.xlsx'
    year = '2022'
    wb = Workbook()
    work_sheet = wb.active
    work_sheet.title = year
    # Header row, addressed by cell name.
    headers = {'A1': '장소', 'B1': '주소', 'C1': '전화번호', 'D1': '홈페이지',
               'E1': '평일 관람시간', 'F1': '공휴일 관람시간', 'G1': '휴관일',
               'H1': '뮤지엄위크 url'}
    for cell, label in headers.items():
        work_sheet[cell] = label
    # One data row per institution, columns A..H matching the header order.
    for row, m_url in enumerate(museum_urls, start=2):
        info = scrap_museum_info(m_url)
        print(info)
        hrs = info['opening_hrs']
        values = (info['name'], info['address'], info['phone'], info['homepage'],
                  hrs['평일 관람시간'], hrs['공휴일 관람시간'], hrs['휴관일'], info['url'])
        for column, value in zip('ABCDEFGH', values):
            work_sheet[f'{column}{row}'] = value
    wb.save(xlsx_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment