Skip to content

Instantly share code, notes, and snippets.

@naturale0
Last active April 27, 2017 17:33
Show Gist options
  • Save naturale0/e04a4e5a518062e27bbcf7b76380d903 to your computer and use it in GitHub Desktop.
Save naturale0/e04a4e5a518062e27bbcf7b76380d903 to your computer and use it in GitHub Desktop.
2017 대한민국 대선 여론조사 - 네이버 여론조사를 크롤링, 그래프로 간단히 시각화하는 Python class
#coding=utf-8
import os
import re
import StringIO
import warnings
from collections import defaultdict

import matplotlib as mpl
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Register the NanumGothic font with matplotlib so that the Korean labels used
# by the plots below can be rendered.
font_location = "/Library/Fonts/NanumGothic.otf"
if os.path.isfile(font_location):
    font_name = fm.FontProperties(fname=font_location).get_name()
    mpl.rc('font', family=font_name)
else:
    # Loading a missing font file raises, which would make the whole module
    # un-importable on machines without this exact path. Keep matplotlib's
    # default font instead and tell the user why glyphs may be missing.
    font_name = None
    warnings.warn(
        "Font file {} not found; Korean text may not render correctly "
        "in plots.".format(font_location)
    )
class NaverPollCrawler(object):
    """Scrape Naver's 2017 presidential-election survey page and plot the polls.

    Creating an instance downloads the page once and fills these attributes:

    * ``url`` -- address of the scraped page.
    * ``id_dict`` -- ``{candidate id (int): (candidate name, colour)}``.
    * ``poll_dict`` -- ``{date label: {candidate id (str): rating}}``.
    * ``poll_df`` -- ``poll_dict`` as a DataFrame; rows are integer candidate
      ids, columns are date labels (both sorted ascending).
    * ``poll_important_df`` / ``color_palette`` -- ratings (one column per
      candidate name) and colours of the candidates in ``IMPORTANT_IDS``.
    * ``barchart_dict`` / ``relative_dict`` -- numbers read from the page's
      chart cards, keyed by ``"<first line> | <second line>"`` of each card.
    * ``region_dict`` -- ``{date label: {region: {candidate id (str): rating}}}``.

    The parser is positional: it relies on the exact line layout of the
    JavaScript blocks embedded in the page.
    """

    # Candidate ids used for ``poll_important_df`` and ``color_palette``.
    IMPORTANT_IDS = (1, 3, 5, 6, 10)

    def __init__(self, timeout=10):
        """Download the survey page and parse everything it contains.

        Args:
            timeout: seconds to wait for the HTTP response before giving up.

        Raises:
            requests.RequestException: the page could not be fetched.
            IndexError, ValueError: the page layout differs from what the
                positional parser expects.
        """
        self.url = "http://news.naver.com/main/election/president2017/trend/survey.nhn"
        res = requests.get(self.url, timeout=timeout)
        # Fail on an HTTP error instead of trying to parse an error page.
        res.raise_for_status()
        soup = BeautifulSoup(res.content, "html.parser")

        # Position of each data set among the page's <script> tags:
        #   5: candidate ids (Naver's internal ids)    6: candidate ratings
        #   7: scenarios where a candidate withdraws or candidacies merge
        #   8: party ratings    9: ratings by region
        #  10: ratings by age  11: ratings by gender
        # Only 5, 6 and 9 are parsed.
        scripts = soup.find_all("script", attrs={"type": "text/javascript"})
        candidate_id = scripts[5].text.split("\n")
        candidate_poll = scripts[6].text.split("\n")
        by_region = scripts[9].text.split("\n")

        self.id_dict = self._parse_candidate_ids(
            candidate_id[2:candidate_id.index("};")])
        self.poll_dict = self._parse_polls(
            candidate_poll[candidate_poll.index("{"):candidate_poll.index("]);")])
        self.poll_df = self._build_poll_frame(self.poll_dict)

        important = list(self.IMPORTANT_IDS)
        self.poll_important_df = pd.concat(
            [self.poll_df.loc[i] for i in important], axis=1)
        self.poll_important_df.columns = [self.id_dict[i][0] for i in important]
        self.color_palette = [self.id_dict[i][1] for i in important]

        # Wide bar-chart cards, visible ones first, then the hidden ones.
        bar_cards = list(soup.find_all(
            "div", attrs={"class": "card_box graph graph--wide"}))
        bar_cards += soup.find_all(
            "div", attrs={"class": "_block card_box graph graph--wide"})
        self.barchart_dict = defaultdict(dict)
        for card in bar_cards:
            title, pairs = self._parse_bar_card(card.text)
            for label, value in pairs:
                self.barchart_dict[title][label] = value

        # Regular cards carrying a detailed-result section.
        detail_cards = list(soup.find_all(
            "div", attrs={"class": "card_box graph"}))
        detail_cards += soup.find_all(
            "div", attrs={"class": "_block card_box graph"})
        self.relative_dict = defaultdict(dict)
        for card in detail_cards:
            title, pairs = self._parse_relative_card(card.text)
            for label, value in pairs:
                self.relative_dict[title][label] = value

        self.region_dict = self._parse_regions(
            by_region[by_region.index("{"):by_region.index("]);")])

    @staticmethod
    def _quoted(line):
        """Return the text between the first pair of double quotes in *line*."""
        return line.split('"')[1]

    @classmethod
    def _parse_candidate_ids(cls, lines):
        """Build ``{id: (name, colour)}`` from the candidate-id script lines.

        An entry starts on a line beginning with ``"`` (the quoted id); the
        name is on the next line and the colour four lines after the id.
        """
        id_dict = {}
        pos = 0
        while pos < len(lines):
            if lines[pos].startswith('"'):
                identity = int(cls._quoted(lines[pos]))
                candidate_name = cls._quoted(lines[pos + 1])
                color = cls._quoted(lines[pos + 4])
                id_dict[identity] = (candidate_name, color)
                pos += 4  # continue after the colour line
            pos += 1
        return id_dict

    @classmethod
    def _iter_ratings(cls, lines):
        """Yield ``(date, region, candidate id, rating)`` from survey script lines.

        ``region`` is the value of the latest ``name`` line seen, or ``None``
        if there was none. An empty rating string is reported as ``0.0``.
        The walk stops at the end of *lines*, even inside a candidate list.

        Raises:
            ValueError: a rating appears before any date or candidate id.
        """
        date = region = identity = None
        pos, total = 0, len(lines)
        while pos < total:
            if lines[pos].startswith("formatDate"):
                date = cls._quoted(lines[pos])
                # The six lines after the date (office, agency, info, url and
                # two others) are not used; resume on the seventh.
                pos += 7
                if pos >= total:
                    break
            line = lines[pos]
            if line.startswith("name"):
                region = cls._quoted(line)
            if line.startswith("candidateList:"):
                pos += 2  # skip the list opener and the first entry opener
                while pos < total and not lines[pos].startswith("]"):
                    if lines[pos].startswith("id"):
                        identity = cls._quoted(lines[pos])
                        pos += 1
                        if pos >= total:
                            break
                    if lines[pos].startswith("rating"):
                        if date is None or identity is None:
                            raise ValueError(
                                "rating found before a date or candidate id")
                        raw = cls._quoted(lines[pos])
                        yield date, region, identity, (float(raw) if raw else 0.0)
                    pos += 1
            pos += 1

    @classmethod
    def _parse_polls(cls, lines):
        """Return ``{date: {candidate id: rating}}`` for the nationwide polls."""
        polls = defaultdict(dict)
        for date, _region, identity, rating in cls._iter_ratings(lines):
            polls[date][identity] = rating
        return polls

    @classmethod
    def _parse_regions(cls, lines):
        """Return ``{date: {region: {candidate id: rating}}}``."""
        regions = defaultdict(lambda: defaultdict(dict))
        for date, region, identity, rating in cls._iter_ratings(lines):
            regions[date][region][identity] = rating
        return regions

    @staticmethod
    def _build_poll_frame(poll_dict):
        """Turn ``poll_dict`` into a DataFrame indexed by integer candidate id.

        Rows whose id is the empty string are dropped explicitly, rather than
        by position. Rows and columns are sorted so that the last column is
        the greatest date label regardless of how pandas orders dict keys.
        NOTE(review): assumes date labels sort chronologically as strings.
        """
        frame = pd.DataFrame(poll_dict)
        frame = frame.loc[frame.index != ""].copy()
        frame.index = [int(key) for key in frame.index]
        return frame.sort_index(axis=0).sort_index(axis=1)

    @staticmethod
    def _parse_bar_card(text):
        """Return ``(title, [(label, value), ...])`` for a wide bar-chart card.

        The first two non-empty lines form the title and the last two are
        discarded; the rest alternate between a value with a one-character
        suffix (stripped before conversion) and its label.
        """
        entries = [chunk for chunk in text.split("\n") if chunk != ""]
        title = entries[0] + " | " + entries[1]
        body = entries[2:-2]
        pairs = [(body[k + 1], float(body[k][:-1]))
                 for k in range(0, len(body), 2)]
        return title, pairs

    @staticmethod
    def _parse_relative_card(text):
        """Return ``(title, [(category, value), ...])`` for a detail card.

        Only the lines after the detailed-result marker are read, excluding
        the card's last two lines. Each line yields its first number and its
        first run of Hangul characters and spaces.
        """
        entries = [chunk for chunk in text.split("\n") if chunk != ""]
        title = entries[0] + " | " + entries[1]
        start = entries.index(u"상세 결과") + 1
        pairs = []
        for item in entries[:-2][start:]:
            rate = re.findall(r"([0-9.]+)", item)[0]
            category = re.findall(u"([가-힣 ]+)", item)[0]
            pairs.append((category, float(rate)))
        return title, pairs

    def _plot_series(self, series):
        """Draw one bar per candidate for a single poll.

        *series* holds ratings indexed by integer candidate id; its name is
        used as the plot title.
        """
        xlocs = []
        xticks = []
        ratings = []
        for x, (identity, y) in enumerate(zip(series.index, series.tolist()), 1):
            xtick, color = self.id_dict[identity]
            # Every bar gets a fixed +/-3 point error bar.
            plt.bar([x], [y], align="center", color=color, yerr=3,
                    error_kw={'ecolor': 'k'})
            plt.annotate("{}%".format(y), xy=(x + .05, y + 1),
                         horizontalalignment="left")
            xlocs.append(x)
            xticks.append(xtick)
            ratings.append(y)
        plt.xlim(None, len(series.index) + 1)
        plt.ylim(0, max(ratings) + 3.5)
        plt.xticks(xlocs, xticks, fontsize=11)
        plt.xlabel(u"후보자", fontsize=14)
        plt.ylabel(u"지지율(%)", fontsize=14)
        plt.title(series.name, fontsize=16)
        plt.tight_layout()

    def plot_latest(self):
        """Plot the poll stored in the last column of ``poll_df``."""
        self._plot_series(self.poll_df.iloc[:, -1].dropna())

    def plot_important(self):
        """Plot the ``poll_important_df`` ratings as lines, one per candidate."""
        ax = self.poll_important_df.plot(figsize=(12, 6),
                                         color=self.color_palette, legend=True)
        # DataFrame.plot only accepts a flag for ``legend``; placement and font
        # size have to be set on the axes.
        ax.legend(loc=2, fontsize=6)

    def plot_date(self, yyyymmdd):
        """Plot the poll whose date label equals *yyyymmdd*.

        Raises:
            KeyError: no poll with that date label exists.
        """
        # Only the column lookup is guarded, so that an unknown candidate id
        # while plotting is not misreported as a missing date.
        try:
            series = self.poll_df.loc[:, yyyymmdd]
        except KeyError:
            raise KeyError(u"이 날({})의 여론조사가 없습니다.".format(yyyymmdd))
        self._plot_series(series.dropna())

    def show(self):
        """Display the current matplotlib figure(s)."""
        plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment