Skip to content

Instantly share code, notes, and snippets.

@anemochore
Last active January 5, 2017 05:05
Show Gist options
  • Save anemochore/477c44dcc69fb507bbb532ba29192b26 to your computer and use it in GitHub Desktop.
Save anemochore/477c44dcc69fb507bbb532ba29192b26 to your computer and use it in GitHub Desktop.
파이썬으로 웹 크롤러 만들기_4장_097쪽 예제 전체 코드
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re
# Seed the PRNG from the current time. random.seed() only accepts None, int,
# float, str, bytes or bytearray (other types raise TypeError on Python 3.11+),
# so pass the float timestamp rather than the datetime object itself.
random.seed(datetime.datetime.now().timestamp())
def getLinks(articleUrl):
    """Return the article links found in the body of a Wikipedia page.

    Args:
        articleUrl: Site-relative path of the article, e.g.
            "/wiki/Python_(programming_language)". It is appended to
            "http://en.wikipedia.org".

    Returns:
        The <a> tags inside the div with id "bodyContent" whose href starts
        with "/wiki/" and contains no colon. An empty list is returned when
        the page has no such div.
    """
    # BeautifulSoup reads the whole response while constructing the tree, so
    # the connection can be closed as soon as parsing is done.
    with urlopen("http://en.wikipedia.org"+articleUrl) as html:
        bsObj = BeautifulSoup(html, "html.parser")
    bodyContent = bsObj.find("div", {"id": "bodyContent"})
    if bodyContent is None:
        # find() returns None when the div is absent; report "no links"
        # instead of failing with AttributeError.
        return []
    return bodyContent.findAll("a",
                               href=re.compile(r"^(/wiki/)((?!:).)*$"))
def getHistoryIPs(pageUrl):
    """Collect the IP addresses of anonymous editors of a Wikipedia page.

    The revision-history page URL has the following form:
    http://en.wikipedia.org/w/index.php?title=Title_in_URL&action=history

    Args:
        pageUrl: Site-relative article path such as "/wiki/Title_in_URL".

    Returns:
        A set containing the text of every link on the history page whose
        class is "mw-anonuserlink".
    """
    # Strip only the first "/wiki/" so that a title which itself contains
    # "/wiki/" is not mangled.
    pageTitle = pageUrl.replace("/wiki/", "", 1)
    historyUrl = "http://en.wikipedia.org/w/index.php?title="
    historyUrl += pageTitle + "&action=history"
    print("history url is: "+historyUrl)
    # Close the HTTP response once the document has been parsed.
    with urlopen(historyUrl) as html:
        bsObj = BeautifulSoup(html, "html.parser")
    # Find only the links with class mw-anonuserlink, which hold an IP
    # address instead of a user name.
    ipAddresses = bsObj.findAll("a", {"class": "mw-anonuserlink"})
    return {ipAddress.get_text() for ipAddress in ipAddresses}
# Up to here: part of the code from page 94
import json # 추가
from urllib.request import HTTPError # 추가
# From here on: the code from page 97
def getCountry(ipAddress):
    """Look up the country code of an IP address via freegeoip.net.

    Args:
        ipAddress: The address as a string; it is appended to
            "http://freegeoip.net/json/".

    Returns:
        The value of the "country_code" key in the JSON response, or None
        when the request fails, the body is not valid UTF-8 JSON, or the
        key is absent.
    """
    # Imported locally so the function does not depend on an extra
    # module-level import. URLError is the base class of HTTPError, so this
    # also covers DNS and connection failures, not just HTTP error statuses.
    from urllib.error import URLError

    try:
        with urlopen("http://freegeoip.net/json/"+ipAddress) as reply:
            rawBody = reply.read()
    except URLError:
        return None
    try:
        responseJson = json.loads(rawBody.decode('utf-8'))
    except ValueError:
        # Both UnicodeDecodeError and json.JSONDecodeError derive from
        # ValueError; treat an unusable body like a failed lookup.
        return None
    return responseJson.get("country_code")
# Crawl driver: start from the Python article, print the country of every
# anonymous editor of each linked article, then hop to a randomly chosen
# linked article and repeat until a page yields no links.
links = getLinks("/wiki/Python_(programming_language)")

while links:
    for link in links:
        print("-------------------")
        historyIPs = getHistoryIPs(link.attrs["href"])
        for historyIP in historyIPs:
            country = getCountry(historyIP)
            # getCountry returns None when the lookup fails; skip those.
            if country is not None:
                print(historyIP+" is from "+country)

    # Continue the walk from one of the links of the page just processed.
    newLink = random.choice(links).attrs["href"]
    links = getLinks(newLink)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment