Last active
April 27, 2017 17:33
-
-
Save naturale0/e04a4e5a518062e27bbcf7b76380d903 to your computer and use it in GitHub Desktop.
2017 대한민국 대선 여론조사 - 네이버 여론조사를 크롤링, 그래프로 간단히 시각화하는 Python class
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding=utf-8 | |
import re | |
import requests | |
import StringIO | |
import matplotlib as mpl | |
import matplotlib.pyplot as plt | |
import matplotlib.font_manager as fm | |
import pandas as pd | |
import numpy as np | |
from bs4 import BeautifulSoup | |
from collections import defaultdict | |
# Point matplotlib at the NanumGothic font file and make it the default font
# family -- presumably so the Korean axis labels and titles used by the plots
# in this module render correctly.
# NOTE(review): the path is a macOS system-font location; on another machine
# it has to be changed to wherever NanumGothic is installed.
font_location = "/Library/Fonts/NanumGothic.otf"
font_name = fm.FontProperties(
    fname=font_location,
).get_name()
mpl.rc("font", family=font_name)
class NaverPollCrawler(object):
    """Scrape Naver's 2017 presidential-election survey page and plot it.

    Constructing an instance downloads the survey page once and parses it
    into the attributes below.  The ``plot_*`` methods draw with
    ``matplotlib.pyplot`` and ``show`` displays the result.

    Attributes:
        url: Address of the page that was fetched.
        id_dict: ``{candidate id (int): (candidate name, colour)}``.
        poll_dict: ``{poll date: {candidate id (str): rating}}``.
        poll_df: ``poll_dict`` as a DataFrame -- one column per poll date,
            integer candidate ids as the (sorted) index.
        poll_important_df: One column per candidate listed in
            ``IMPORTANT_IDS`` (named after the candidate), one row per
            poll date.
        color_palette: Colours of the ``IMPORTANT_IDS`` candidates, in order.
        barchart_dict: ``{card title: {label: percentage}}`` read from the
            wide graph cards.
        relative_dict: ``{card title: {category: value}}`` read from the
            regular graph cards.
        region_dict: ``{poll date: {region: {candidate id (str): rating}}}``.
    """

    SURVEY_URL = "http://news.naver.com/main/election/president2017/trend/survey.nhn"

    # Candidate ids that get their own line in ``plot_important``.
    IMPORTANT_IDS = (1, 3, 5, 6, 10)

    # Positions of the inline <script> blocks the parser reads.  According to
    # the original author's notes: 5 = candidate ids (Naver's internal ids),
    # 6 = candidate ratings, 7 = scenarios where a candidate withdraws or
    # candidacies are merged, 8 = party ratings, 9 = ratings by region,
    # 10 = ratings by age, 11 = ratings by gender.  Only 5, 6 and 9 are used.
    _CANDIDATE_SCRIPT = 5
    _POLL_SCRIPT = 6
    _REGION_SCRIPT = 9

    # A "formatDate" line is followed by six metadata lines (office, agency,
    # info and url among them) that this class does not keep; parsing resumes
    # on the seventh line after the date.
    _HEADER_LINES = 7

    _NUMBER = re.compile(r"([0-9.]+)")
    _HANGUL = re.compile(u"([가-힣 ]+)")

    def __init__(self, timeout=10):
        """Download the survey page and parse every data set out of it.

        Args:
            timeout: Seconds to wait for the HTTP request before giving up.

        Raises:
            requests.RequestException: The page could not be fetched, or the
                server answered with an error status.
            IndexError: The page contains fewer inline scripts than expected.
            ValueError: An expected delimiter line is missing from a script.
        """
        self.url = self.SURVEY_URL
        response = requests.get(self.url, timeout=timeout)
        # Fail here with a clear HTTP error instead of later, while trying to
        # parse an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        scripts = soup.find_all("script", attrs={"type": "text/javascript"})
        if len(scripts) <= self._REGION_SCRIPT:
            raise IndexError(
                "expected at least {} inline scripts, found {}".format(
                    self._REGION_SCRIPT + 1, len(scripts)))
        candidate_lines = scripts[self._CANDIDATE_SCRIPT].text.split("\n")
        poll_lines = scripts[self._POLL_SCRIPT].text.split("\n")
        region_lines = scripts[self._REGION_SCRIPT].text.split("\n")

        # The candidate table starts on the third line of its script and runs
        # up to the line that closes the object literal.
        self.id_dict = self._parse_candidates(
            candidate_lines[2:candidate_lines.index("};")])

        self.poll_dict = self._parse_polls(
            poll_lines[poll_lines.index("{"):poll_lines.index("]);")])

        frame = pd.DataFrame(self.poll_dict)
        # Entries with an empty candidate id cannot be converted to int.
        # Drop exactly those rows rather than "whatever row comes first",
        # which depends on how pandas happens to order the index.
        frame = frame[frame.index != ""]
        frame.index = frame.index.astype(int)
        frame.sort_index(inplace=True)
        self.poll_df = frame

        self.poll_important_df = pd.concat(
            [self.poll_df.loc[identity] for identity in self.IMPORTANT_IDS],
            axis=1)
        self.poll_important_df.columns = [
            self.id_dict[identity][0] for identity in self.IMPORTANT_IDS]
        self.color_palette = [
            self.id_dict[identity][1] for identity in self.IMPORTANT_IDS]

        # Each kind of card exists in a visible and a "_block" variant; both
        # are parsed the same way.
        wide_cards = (
            soup.find_all("div", attrs={"class": "card_box graph graph--wide"})
            + soup.find_all(
                "div", attrs={"class": "_block card_box graph graph--wide"}))
        self.barchart_dict = self._parse_bar_cards(wide_cards)

        cards = (
            soup.find_all("div", attrs={"class": "card_box graph"})
            + soup.find_all("div", attrs={"class": "_block card_box graph"}))
        self.relative_dict = self._parse_relative_cards(cards)

        self.region_dict = self._parse_regions(
            region_lines[region_lines.index("{"):region_lines.index("]);")])

    @staticmethod
    def _quoted(line):
        """Return the text between the first pair of double quotes in line."""
        return line.split('"')[1]

    @staticmethod
    def _parse_candidates(lines):
        """Build ``{id: (name, colour)}`` from the candidate-table lines.

        An entry starts on a line beginning with a double quote (the id);
        the name is on the next line and the colour three lines after that.
        """
        quoted = NaverPollCrawler._quoted
        candidates = {}
        stream = iter(lines)
        for line in stream:
            if not line.startswith('"'):
                continue
            identity = int(quoted(line))
            name = quoted(next(stream, ""))
            next(stream, "")
            next(stream, "")
            color = quoted(next(stream, ""))
            candidates[identity] = (name, color)
        return candidates

    @staticmethod
    def _iter_ratings(lines, with_region=False):
        """Yield ``(date, region, candidate id, rating)`` for each rating line.

        ``date`` is taken from the most recent "formatDate" line.  ``region``
        is taken from the most recent "name" line when ``with_region`` is
        true and is ``None`` otherwise.  An empty rating string counts as 0.
        Parsing stops cleanly when the input ends, even in the middle of a
        candidate list.
        """
        quoted = NaverPollCrawler._quoted
        stream = iter(lines)

        def read():
            return next(stream, None)

        date = region = identity = None
        line = read()
        while line is not None:
            if line.startswith("formatDate"):
                date = quoted(line)
                for _ in range(NaverPollCrawler._HEADER_LINES):
                    line = read()
                if line is None:
                    break
            if with_region and line.startswith("name"):
                region = quoted(line)
            if line.startswith("candidateList:"):
                read()  # skip the line that opens the first entry
                line = read()
                while line is not None and not line.startswith("]"):
                    if line.startswith("id"):
                        identity = quoted(line)
                        line = read()
                        if line is None:
                            break
                    if line.startswith("rating"):
                        raw = quoted(line)
                        rating = float(raw) if raw != "" else 0.0
                        yield date, region, identity, rating
                    line = read()
            line = read()

    @classmethod
    def _parse_polls(cls, lines):
        """Build ``{date: {candidate id: rating}}`` from the poll script."""
        polls = defaultdict(dict)
        for date, _, identity, rating in cls._iter_ratings(lines):
            polls[date][identity] = rating
        return polls

    @classmethod
    def _parse_regions(cls, lines):
        """Build ``{date: {region: {candidate id: rating}}}``."""
        regions = defaultdict(lambda: defaultdict(dict))
        ratings = cls._iter_ratings(lines, with_region=True)
        for date, region, identity, rating in ratings:
            regions[date][region][identity] = rating
        return regions

    @staticmethod
    def _card_rows(card):
        """Return the non-empty text lines of a graph card."""
        return [row for row in card.text.split("\n") if row != ""]

    @classmethod
    def _parse_bar_cards(cls, cards):
        """Build ``{title: {label: percentage}}`` from wide graph cards.

        The first two rows form the title and the last two rows are not
        data.  The rows in between alternate between a percentage (with a
        one-character suffix) and its label; a trailing percentage without
        a label is ignored.
        """
        charts = defaultdict(dict)
        for card in cards:
            rows = cls._card_rows(card)
            title = rows[0] + " | " + rows[1]
            body = rows[2:-2]
            for percent, label in zip(body[0::2], body[1::2]):
                charts[title][label] = float(percent[:-1])
        return charts

    @classmethod
    def _parse_relative_cards(cls, cards):
        """Build ``{title: {category: value}}`` from regular graph cards.

        Data rows are those after the "상세 결과" marker row, excluding the
        last two rows of the card.  In each row the first number is the
        value and the first run of Hangul characters/spaces is the category.
        """
        tables = defaultdict(dict)
        for card in cards:
            rows = cls._card_rows(card)
            title = rows[0] + " | " + rows[1]
            start = rows.index(u"상세 결과") + 1
            for row in rows[start:-2]:
                value = cls._NUMBER.findall(row)[0]
                category = cls._HANGUL.findall(row)[0]
                tables[title][category] = float(value)
        return tables

    def _draw_ratings(self, series):
        """Draw one bar per candidate for a single poll.

        Args:
            series: Ratings indexed by candidate id; its name becomes the
                plot title.

        Raises:
            KeyError: A candidate id in ``series`` is not in ``id_dict``.
        """
        positions = []
        labels = []
        ratings = []
        # Positions are counted before the empty-id check, so a skipped entry
        # still leaves its slot on the x axis.
        for position, (identity, rating) in enumerate(series.items(), 1):
            if identity == '':
                continue
            label, color = self.id_dict[identity]
            plt.bar([position], [rating], align="center", color=color,
                    yerr=3, error_kw={'ecolor': 'k'})
            plt.annotate("{}%".format(rating),
                         xy=(position + .05, rating + 1),
                         horizontalalignment="left")
            positions.append(position)
            labels.append(label)
            ratings.append(rating)
        plt.xlim(None, len(series.index) + 1)
        if ratings:
            # Leave head-room above the tallest bar for the error bar and
            # the annotation.
            plt.ylim(0, max(ratings) + 3.5)
        plt.xticks(positions, labels, fontsize=11)
        plt.xlabel(u"후보자", fontsize=14)
        plt.ylabel(u"지지율(%)", fontsize=14)
        plt.title(series.name, fontsize=16)
        plt.tight_layout()

    def plot_latest(self):
        """Draw a bar chart of the last column of ``poll_df``."""
        self._draw_ratings(self.poll_df.iloc[:, -1].dropna())

    def plot_important(self):
        """Draw one line per ``IMPORTANT_IDS`` candidate across all polls."""
        axes = self.poll_important_df.plot(
            figsize=(12, 6), color=self.color_palette)
        # pandas' ``legend`` argument is only a flag, so the placement and
        # font size are applied to the axes directly.
        axes.legend(loc=2, fontsize=6)

    def plot_date(self, yyyymmdd):
        """Draw a bar chart of the poll whose column label is ``yyyymmdd``.

        Raises:
            KeyError: There is no poll for that date, or a candidate id in
                the poll is not in ``id_dict``.
        """
        # Only the column lookup is guarded, so that an unknown candidate id
        # is not reported as a missing poll.
        try:
            series = self.poll_df.loc[:, yyyymmdd].dropna()
        except KeyError:
            raise KeyError(u"이 날({})의 여론조사가 없습니다.".format(yyyymmdd))
        self._draw_ratings(series)

    def show(self):
        """Display everything drawn so far."""
        plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment