#!/usr/bin/python
# -*- coding: utf-8 -*-
# Quick-and-dirty crawler (Python 2) for the museum directory at museum.or.kr.
import urllib2
from BeautifulSoup import BeautifulSoup


def crawlMuseum():
    museumList = []
    result = []
    baseUrl = "http://www.museum.or.kr/organ/"
    # Operator types: national, public, private, university.
    kind = ['국립', '공립', '사립', '대학']
    # Number of list pages per operator type; hard-coded for now.
    pages = [3, 14, 35, 12]

    print 'Scraping the list pages'
    for i in range(len(kind)):  # 1. national 2. public 3. private 4. university
        print kind[i]
        for j in range(pages[i]):
            print 'page %d' % (j + 1)
            url = baseUrl + "museums01.php?cmdProc=listform&page=%d&orderByField=MuseumIntroduce_idx&orderBySort=DESC&MuseumIntroduce_kind=%d&sub_menu=%d" % (j + 1, i + 1, i + 1)
            req = urllib2.Request(url)
            response = urllib2.urlopen(req, timeout=5)
            # The site serves CP949; re-encode to UTF-8 before parsing.
            html = unicode(response.read(), 'cp949').encode('utf-8')
            bs = BeautifulSoup(html)
            # Each list-row cell holds the link to a museum's detail page.
            for cell in bs('td', {'class': 'CommuniPd02'}):
                museumList.append(cell('a')[0]['href'])

    print 'Scraping the detail pages'
    for cnt, url in enumerate(museumList):
        req = urllib2.Request(baseUrl + url)
        response = urllib2.urlopen(req, timeout=5)
        html = unicode(response.read(), 'cp949').encode('utf-8')
        bs = BeautifulSoup(html)
        name = bs('td', {'class': 'DataFont01'})[0].string.strip()
        # The fifth 'PdTop5' cell on the detail page holds the address.
        addr = bs('td', {'class': 'PdTop5'})[4].string.strip()
        print '%5d/%-5d\t%s,%s' % (cnt + 1, len(museumList), name, addr)
        result.append([name, addr])

    print 'Writing the file'
    with open('museum.csv', 'w') as f:
        # Pipe-delimited rows: M|name|address
        for m in result:
            f.write(('M|%s|%s\n' % (m[0], m[1])).encode('utf-8'))


crawlMuseum()
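

# ----------------------------------------------------------------------
# For reference, a hedged Python 3 sketch of the same two-phase crawl;
# it is not part of the original gist. It assumes the third-party
# packages `requests` and `beautifulsoup4` and that the site still
# serves the same CP949-encoded markup with the same CSS classes. The
# imports stay inside the function so the Python 2 script above still
# runs untouched.
# ----------------------------------------------------------------------
def crawl_museums_py3(pages_per_kind=(3, 14, 35, 12)):
    import requests
    from bs4 import BeautifulSoup

    base_url = 'http://www.museum.or.kr/organ/'

    def fetch(url):
        # Tell requests the page encoding, then hand the decoded text to bs4.
        resp = requests.get(url, timeout=5)
        resp.encoding = 'cp949'
        return BeautifulSoup(resp.text, 'html.parser')

    # Phase 1: walk every list page per operator type, collect detail links.
    links = []
    for kind, pages in enumerate(pages_per_kind, start=1):
        for page in range(1, pages + 1):
            soup = fetch(base_url + 'museums01.php?cmdProc=listform'
                         '&page=%d&orderByField=MuseumIntroduce_idx'
                         '&orderBySort=DESC&MuseumIntroduce_kind=%d'
                         '&sub_menu=%d' % (page, kind, kind))
            links += [td.a['href'] for td in soup('td', {'class': 'CommuniPd02'})]

    # Phase 2: visit each detail page and write name/address rows.
    with open('museum.csv', 'w', encoding='utf-8') as f:
        for href in links:
            soup = fetch(base_url + href)
            name = soup('td', {'class': 'DataFont01'})[0].get_text(strip=True)
            addr = soup('td', {'class': 'PdTop5'})[4].get_text(strip=True)
            f.write('M|%s|%s\n' % (name, addr))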