Skip to content

Instantly share code, notes, and snippets.

@loslch
Created March 10, 2020 23:21
Show Gist options
  • Save loslch/aa0566098168a8ea86b6a58adf9f96d3 to your computer and use it in GitHub Desktop.
Save loslch/aa0566098168a8ea86b6a58adf9f96d3 to your computer and use it in GitHub Desktop.
Naver Cafe Article Crawler
#-*- coding:utf-8 -*-
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
def find_element_by_css_selector(driver, css_selector):
    """Return the first element matching ``css_selector``, or ``None``.

    Thin wrapper around ``driver.find_element_by_css_selector`` that turns
    a failed lookup into a ``None`` result instead of an exception, so
    callers can test for optional elements with a plain ``if``.

    Args:
        driver: Object exposing ``find_element_by_css_selector`` (the
            lookup is delegated to it unchanged).
        css_selector: CSS selector string to search for.

    Returns:
        Whatever the delegated lookup returns, or ``None`` when it raises
        ``NoSuchElementException``.
    """
    try:
        return driver.find_element_by_css_selector(css_selector)
    except NoSuchElementException:
        return None
# Identifier of the cafe to crawl; it is concatenated into the
# article-list URL as the ``search.clubid`` query parameter.
cafe_id = "12345"

# Chrome is started with the managed JavaScript content setting set to 2.
# NOTE(review): 2 presumably means "block JavaScript" - confirm against the
# Chrome preference documentation.
chrome_options = Options()
chrome_options.add_experimental_option(
    "prefs",
    {"profile.managed_default_content_settings.javascript": 2},
)

browser = webdriver.Chrome("chromedriver", options=chrome_options)
browser.implicitly_wait(3)
# Open the cafe front page before anything else is done with the session.
# NOTE(review): presumably required so that cookies for this domain can be
# added afterwards - confirm against the Selenium add_cookie documentation.
browser.get("https://cafe.naver.com/")
""" cookie.json
[
{"domain": "cafe.naver.com", "name": "JSESSIONID", "value": "...", "path": "/", "httpOnly": true, "secure": false},
{"domain": ".naver.com", "name": "NID_AUT", "value": "...", "path": "/", "httpOnly": true, "secure": false},
{"domain": ".naver.com", "name": "NID_JKL", "value": "...", "path": "/", "httpOnly": false, "secure": true},
{"domain": ".naver.com", "name": "NID_SES", "value": "...", "path": "/", "httpOnly": false, "secure": false},
{"domain": ".naver.com", "name": "NNB", "value": "...", "path": "/", "httpOnly": false, "secure": true},
{"domain": ".naver.com", "name": "NRTK", "value": "...", "path": "/", "httpOnly": false, "secure": false},
{"domain": ".cafe.naver.com", "name": "nci4", "value": "...", "path": "/", "httpOnly": false, "secure": false},
{"domain": ".cafe.naver.com", "name": "ncmc4", "value": "...", "path": "/", "httpOnly": false, "secure": false},
{"domain": ".cafe.naver.com", "name": "ncu", "value": "...", "path": "/", "httpOnly": false, "secure": false},
{"domain": ".cafe.naver.com", "name": "ncvc2", "value": "...", "path": "/", "httpOnly": false, "secure": false},
{"domain": ".naver.com", "name": "nid_inf", "value": "...", "path": "/", "httpOnly": false, "secure": false},
{"domain": ".naver.com", "name": "nx_ssl", "value": "...", "path": "/", "httpOnly": false, "secure": false}
]
"""
# Load the saved session cookies from cookie.json and attach each one to the
# running browser session. The file is read as UTF-8 explicitly so the result
# does not depend on the platform's default encoding.
with open("cookie.json", "r", encoding="utf-8") as cookie_file:
    saved_cookies = json.load(cookie_file)
for cookie in saved_cookies:
    browser.add_cookie(cookie)
# One dict per crawled article; dumped to result.json when the crawl ends
# (normally or through an error).
data = []
try:
    # Visit article-list pages 1..1000 of the configured cafe.
    for page in range(1, 1001):
        pagenum = str(page)
        print("Start crawling page #" + pagenum)
        url_article_list = (
            "https://cafe.naver.com/ArticleList.nhn?search.clubid=" + cafe_id
            + "&search.boardtype=C&search.page=" + pagenum
        )
        browser.get(url_article_list)

        # Each matching <li> is treated as one article entry.
        contents = browser.find_elements_by_css_selector(".article-movie-sub li")
        for content in contents:
            category = content.find_element_by_class_name("board_name")

            # Upper part of the entry: title and text (the text element also
            # supplies the article link through its href attribute).
            con_top = content.find_element_by_class_name("con_top")
            title = con_top.find_element_by_class_name("tit")
            text = con_top.find_element_by_class_name("txt")

            # Lower part of the entry: author, date and counters.
            con_bottom = content.find_element_by_class_name("con_bottom")
            name = con_bottom.find_element_by_class_name("p-nick")
            date = con_bottom.find_element_by_class_name("date")
            read = con_bottom.find_element_by_class_name("num")
            like_area = con_bottom.find_element_by_class_name("like_area")
            comments = like_area.find_element_by_class_name("num")
            likes = like_area.find_element_by_class_name("num-recomm")

            item = {
                "title": str(title.text),
                "category": str(category.text),
                "text": str(text.text),
                "link": str(text.get_attribute('href')),
                "name": str(name.text),
                "date": str(date.text),
                # Counters are displayed with a label prefix and thousands
                # separators; strip both before converting to int.
                "read": int(read.text.replace("조회 ", "").replace(",", "")),
                "comments": int(comments.text.replace(",", "")),
                "likes": int(likes.text.replace(",", "")),
            }

            # The thumbnail is optional: the helper returns None when the
            # entry has no image, in which case the key is simply omitted.
            image = find_element_by_css_selector(content, ".movie-img img")
            if image:
                item["image"] = image.get_attribute('src')

            print(" > processing :: " + str(title.text))
            data.append(item)
except Exception as err:
    # Best-effort crawl: report the failure and fall through to the dump
    # below so that everything collected so far is still saved.
    print("Error: ", err)
finally:
    try:
        with open("result.json", "w", encoding="UTF-8-sig") as result:
            result.write(json.dumps(data, ensure_ascii=False))
    finally:
        # quit() instead of close(): close() only closes the current window,
        # while quit() also ends the driver session. The nested finally makes
        # sure the browser is released even if writing the file fails.
        browser.quit()
@loslch
Copy link
Author

loslch commented Mar 10, 2020

Works on v3.7.6

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment