Created
March 11, 2018 13:09
-
-
Save Astro36/13806abcd85563376a3259f1fb7ebb32 to your computer and use it in GitHub Desktop.
Naver News Crawler for Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
#-*- coding: utf-8 -*- | |
import getopt | |
import json | |
import multiprocessing | |
import os | |
import re | |
import sys | |
import requests | |
from bs4 import BeautifulSoup | |
# Directory the JSON files are written into.  Defaults to the current working
# directory; the -o/--output option handled in the __main__ block replaces it.
output_path = os.getcwd() | |
def extract_text(element_title, element_content):
    """Turn a title element and a body element into clean text.

    Both arguments only need to provide ``get_text()``.  The title text is
    stripped of surrounding whitespace.  The body text has two boilerplate
    snippets removed (the ``// flash ...`` comment line and the empty
    ``_flash_removeCallback`` stub), every run of newlines collapsed into a
    single space, and is then stripped as well.

    Returns:
        A ``(title, content)`` tuple of strings.
    """
    headline = element_title.get_text().strip()

    body = element_content.get_text()
    # Script residue that ends up inside the extracted article text.
    boilerplate = (
        u'// flash 오류를 우회하기 위한 함수 추가',
        r'function _flash_removeCallback() {}',
    )
    for fragment in boilerplate:
        body = body.replace(fragment, '')

    # Flatten the article onto one line.
    body = re.sub(r'\n+', ' ', body).strip()
    return (headline, body)
def get_content(url):
    """Download one article page and store its title and body as JSON.

    The page is parsed with BeautifulSoup and probed with each known
    (title selector, body selector) pair in order; the first pair for which
    both elements exist is used.  On success the title is printed and the
    file ``<output_path>/<title>.json`` is written (``/`` in the title is
    replaced by a space) with the keys ``url``, ``title`` and ``content``.
    When no selector pair matches, the URL is printed instead so the miss
    shows up in the output.

    Args:
        url: Address of the article page to fetch.

    Returns:
        None.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # One (title, body) selector pair per page layout the crawler knows.
    selectors = (
        ('.article_header > .article_info > #articleTitle',
         '#articleBody > #articleBodyContents'),
        ('.content_area > .news_headline > .title', '.content_area > .news_end'),
        ('.end_ct > .end_ct_area > .end_tit',
         '.end_ct > .end_ct_area > .end_body_wrp > .article_body'),
    )

    for title_selector, content_selector in selectors:
        element_title = soup.select_one(title_selector)
        element_content = soup.select_one(content_selector)
        if element_title is not None and element_content is not None:
            break
    else:
        # No layout matched: report the URL and give up on this article.
        print(url)
        return

    title, content = extract_text(element_title, element_content)
    print(title)

    path = u'{}/{}.json'.format(output_path, title.replace('/', ' '))
    payload = json.dumps({
        'url': url,
        'title': title,
        'content': content
    }, indent=4, ensure_ascii=False)

    # The payload is encoded by hand, so the file must be opened in binary
    # mode: a text-mode file rejects bytes on Python 3.  The ``with`` block
    # closes the handle even if the write fails.
    with open(path, 'wb') as output_file:
        output_file.write(payload.encode('utf8'))
if __name__ == '__main__':
    # Command-line entry point: crawl every article ID in [start, stop] of
    # one newspaper, then record the crawled range in articles.json.
    usage = (
        'Usage: {} -t <newspaper name> --start <article id> '
        '[--stop <article id>] [-o <output directory>]'
    ).format(sys.argv[0])

    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ht:o:',
            ['help', 'type=', 'start=', 'start-id=', 'stop=', 'stop-id=', 'output='])
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(2)

    # Newspaper name -> value used for the ``oid`` query parameter.
    types = {
        u'경향신문': '032',
        u'국민일보': '005',
        u'동아일보': '020',
        u'문화일보': '021',
        u'서울신문': '081',
        u'세계일보': '022',
        u'조선일보': '023',
        u'중앙일보': '025',
        u'한겨레': '028',
        u'한국일보': '469'
    }

    type_name = None
    start_id = None
    stop_id = None

    for opt, arg in opts:
        # Python 2 passes byte strings; decode them once so the newspaper
        # name can match the unicode keys above.  Python 3 arguments are
        # already text and have no ``decode`` method.
        if isinstance(arg, bytes):
            arg = arg.decode('utf8')

        if opt in ('-h', '--help'):
            print(usage)
            sys.exit()
        elif opt in ('-t', '--type'):
            type_name = arg
        elif opt in ('--start', '--start-id', '--stop', '--stop-id'):
            try:
                parsed_id = int(arg)
            except ValueError:
                print('Error: {} expects an integer article ID.'.format(opt))
                sys.exit(2)
            if opt in ('--start', '--start-id'):
                start_id = parsed_id
            else:
                stop_id = parsed_id
        elif opt in ('-o', '--output'):
            output_path = arg

    if type_name not in types:
        print('Error: Can not find the type ID.')
        sys.exit(2)
    if start_id is None:
        # Without a lower bound there is no range to crawl.
        print('Error: --start is required.')
        print(usage)
        sys.exit(2)

    type_id = types[type_name]

    if stop_id is None:
        # No upper bound given: take the ID of the first headline on the
        # newspaper's list page.
        req = requests.get(
            'http://news.naver.com/main/list.nhn?mode=LPOD&mid=sec&oid={}'.format(type_id))
        soup = BeautifulSoup(req.text, 'html.parser')
        latest = soup.select_one('.type06_headline > li > dl > dt > a')
        # Pick out only the digits after ``aid=`` so that further query
        # parameters following it do not break the conversion.
        found = None
        if latest is not None:
            found = re.search(r'aid=(\d+)', latest.get('href', ''))
        if found is None:
            print('Error: Can not find the latest article ID.')
            sys.exit(2)
        stop_id = int(found.group(1))

    # Article IDs are zero-padded to ten digits in the URL.
    urls = [
        'http://news.naver.com/main/read.nhn?mode=LPOD&mid=sec&oid={}&aid={}'.format(
            type_id,
            str(article_id).rjust(10, '0'))
        for article_id in range(start_id, stop_id + 1)
    ]

    # NOTE(review): the workers read the module-level ``output_path``.  With
    # the "spawn" start method this block is not re-run in the workers, so a
    # value given via -o would not reach them -- confirm target platforms.
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    try:
        pool.map(get_content, urls)
    finally:
        # Always release the worker processes, even if a download raised.
        pool.close()
        pool.join()

    summary = json.dumps({
        'type': type_name,
        'start_id': start_id,
        'stop_id': stop_id
    }, indent=4, ensure_ascii=False)
    # Binary mode because the text is encoded by hand (see Python 3's
    # str/bytes split); ``with`` guarantees the file is closed.
    with open(u'{}/articles.json'.format(output_path), 'wb') as summary_file:
        summary_file.write(summary.encode('utf8'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment