# loslch/crawl.py

## crawl.py
#-*- coding:utf-8 -*-

import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException

def find_element_by_css_selector(driver, css_selector):
	"""Return the first element matching ``css_selector``, or ``None`` if absent.

	Args:
		driver: Object to search within; anything exposing
			``find_element(by, value)`` (the script passes an article element).
		css_selector: CSS selector string to look up.

	Returns:
		The matching element, or ``None`` when the lookup raises
		``NoSuchElementException``.
	"""
	try:
		# "css selector" is the locator strategy name (the value of
		# By.CSS_SELECTOR). The generic find_element() call exists in both
		# Selenium 3 and 4, whereas find_element_by_css_selector() was removed
		# in Selenium 4.3.
		return driver.find_element("css selector", css_selector)
	except NoSuchElementException:
		return None

# Chrome preferences: set the managed JavaScript content setting to 2
# (presumably "block", i.e. pages load with JavaScript disabled -- confirm).
chrome_options = Options()
chrome_options.add_experimental_option(
	"prefs",
	{"profile.managed_default_content_settings.javascript": 2},
)
# No positional driver path: passing "chromedriver" as the first argument was
# removed in Selenium 4.10, while older releases already default to looking up
# "chromedriver", so this call behaves the same there.
browser = webdriver.Chrome(options=chrome_options)

# Element lookups wait up to 3 seconds before giving up.
browser.implicitly_wait(3)
# Open the site first; the cookies loaded afterwards are added to this session.
browser.get("https://cafe.naver.com/")

# Club id inserted into the article-list URL (placeholder value -- replace).
cafe_id = "12345"

""" cookie.json
[
	{"domain": "cafe.naver.com", "name": "JSESSIONID", "value": "...", "path": "/", "httpOnly": true, "secure": false},
	{"domain": ".naver.com", "name": "NID_AUT", "value": "...", "path": "/", "httpOnly": true, "secure": false},
	{"domain": ".naver.com", "name": "NID_JKL", "value": "...", "path": "/", "httpOnly": false, "secure": true},
	{"domain": ".naver.com", "name": "NID_SES", "value": "...", "path": "/", "httpOnly": false, "secure": false},
	{"domain": ".naver.com", "name": "NNB", "value": "...", "path": "/", "httpOnly": false, "secure": true},
	{"domain": ".naver.com", "name": "NRTK", "value": "...", "path": "/", "httpOnly": false, "secure": false},
	{"domain": ".cafe.naver.com", "name": "nci4", "value": "...", "path": "/", "httpOnly": false, "secure": false},
	{"domain": ".cafe.naver.com", "name": "ncmc4", "value": "...", "path": "/", "httpOnly": false, "secure": false},
	{"domain": ".cafe.naver.com", "name": "ncu", "value": "...", "path": "/", "httpOnly": false, "secure": false},
	{"domain": ".cafe.naver.com", "name": "ncvc2", "value": "...", "path": "/", "httpOnly": false, "secure": false},
	{"domain": ".naver.com", "name": "nid_inf", "value": "...", "path": "/", "httpOnly": false, "secure": false},
	{"domain": ".naver.com", "name": "nx_ssl", "value": "...", "path": "/", "httpOnly": false, "secure": false}
]
"""
# Add every cookie stored in cookie.json to the running browser session
# (presumably to reuse a logged-in session -- see the example layout above).
try:
	# Read the JSON as UTF-8 explicitly instead of the platform default.
	with open("cookie.json", "r", encoding="utf-8") as j:
		saved_cookies = json.load(j)
	# The file is closed before the browser is driven.
	for cookie in saved_cookies:
		browser.add_cookie(cookie)
except Exception:
	# The browser is already running at this point; shut it down instead of
	# leaving it orphaned when the cookie file is missing or invalid, then
	# let the original error propagate unchanged.
	browser.quit()
	raise

# Locator strategy names (the values of By.CSS_SELECTOR / By.CLASS_NAME).
# The generic find_element(by, value) API exists in Selenium 3 and 4, whereas
# the find_element_by_* shortcuts were removed in Selenium 4.3.
_BY_CSS = "css selector"
_BY_CLASS = "class name"


def _to_int(text):
	"""Convert a count such as ``"1,234"`` to an int by dropping the commas."""
	return int(text.replace(",", ""))


def _parse_article(content):
	"""Build the result record for one article list item.

	Args:
		content: Element of one ``.article-movie-sub li`` entry.

	Returns:
		dict with the keys ``title``, ``category``, ``text``, ``link``,
		``name``, ``date``, ``read``, ``comments`` and ``likes``, plus
		``image`` when the entry contains a ``.movie-img img`` element.

	Raises:
		NoSuchElementException: a required child element is missing.
		ValueError: a counter text is not a plain (comma-grouped) number.
	"""
	category = content.find_element(_BY_CLASS, "board_name")

	con_top = content.find_element(_BY_CLASS, "con_top")
	title = con_top.find_element(_BY_CLASS, "tit")
	text = con_top.find_element(_BY_CLASS, "txt")

	con_bottom = content.find_element(_BY_CLASS, "con_bottom")
	name = con_bottom.find_element(_BY_CLASS, "p-nick")
	date = con_bottom.find_element(_BY_CLASS, "date")
	# First "num" element inside con_bottom holds the read counter.
	read = con_bottom.find_element(_BY_CLASS, "num")

	like_area = con_bottom.find_element(_BY_CLASS, "like_area")
	comments = like_area.find_element(_BY_CLASS, "num")
	likes = like_area.find_element(_BY_CLASS, "num-recomm")

	item = {
		"title": title.text,
		"category": category.text,
		"text": text.text,
		# str() keeps a missing href as the string "None", as before.
		"link": str(text.get_attribute('href')),
		"name": name.text,
		"date": date.text,
		# The read counter carries a "조회 " label prefix that is stripped.
		"read": _to_int(read.text.replace("조회 ", "")),
		"comments": _to_int(comments.text),
		"likes": _to_int(likes.text),
	}

	# The thumbnail is optional; the helper returns None when it is absent.
	image = find_element_by_css_selector(content, ".movie-img img")
	if image:
		item["image"] = image.get_attribute('src')

	return item


# Crawl article-list pages 1..1000 and collect one record per article.
# Any error stops the crawl; whatever was collected so far is still saved.
data = []
try:
	for page in range(1, 1001):
		pagenum = str(page)
		print("Start crawling page #" + pagenum)
		url_article_list = (
			"https://cafe.naver.com/ArticleList.nhn?search.clubid=" + cafe_id
			+ "&search.boardtype=C&search.page=" + pagenum
		)
		browser.get(url_article_list)

		# NOTE(review): pages past the last one presumably yield no entries,
		# yet the loop keeps requesting up to page 1000 -- consider stopping
		# at the first empty page.
		for content in browser.find_elements(_BY_CSS, ".article-movie-sub li"):
			item = _parse_article(content)
			print(" > processing :: " + item["title"])
			data.append(item)
except Exception as err:
	print("Error: ", err)
finally:
	try:
		# "UTF-8-sig" writes a byte-order mark ahead of the JSON text.
		with open("result.json", "w", encoding="UTF-8-sig") as result:
			json.dump(data, result, ensure_ascii=False)
	finally:
		# quit() ends the whole driver session (close() only closes the
		# current window and leaves the driver process running); the nested
		# finally makes sure it runs even if writing the result file fails.
		browser.quit()
	#-*- coding:utf-8 -*-

	import json
	from selenium import webdriver
	from selenium.webdriver.chrome.options import Options
	from selenium.common.exceptions import NoSuchElementException

	def find_element_by_css_selector(driver, css_selector):
		"""Return the first element matching ``css_selector``, or ``None`` if absent.

		Args:
			driver: Object to search within; anything exposing
				``find_element(by, value)``.
			css_selector: CSS selector string to look up.

		Returns:
			The matching element, or ``None`` when the lookup raises
			``NoSuchElementException``.
		"""
		try:
			# "css selector" is the locator strategy name (the value of
			# By.CSS_SELECTOR). The generic find_element() call exists in both
			# Selenium 3 and 4, whereas find_element_by_css_selector() was
			# removed in Selenium 4.3.
			return driver.find_element("css selector", css_selector)
		except NoSuchElementException:
			return None

	# Chrome preferences: set the managed JavaScript content setting to 2
	# (presumably "block", i.e. pages load with JavaScript disabled -- confirm).
	chrome_options = Options()
	chrome_options.add_experimental_option(
		"prefs",
		{"profile.managed_default_content_settings.javascript": 2},
	)
	# No positional driver path: passing "chromedriver" as the first argument
	# was removed in Selenium 4.10, while older releases already default to
	# looking up "chromedriver", so this call behaves the same there.
	browser = webdriver.Chrome(options=chrome_options)

	# Element lookups wait up to 3 seconds before giving up.
	browser.implicitly_wait(3)
	# Open the site first; the cookies loaded afterwards are added to this session.
	browser.get("https://cafe.naver.com/")

	# Club id inserted into the article-list URL (placeholder value -- replace).
	cafe_id = "12345"

	""" cookie.json
	[
	{"domain": "cafe.naver.com", "name": "JSESSIONID", "value": "...", "path": "/", "httpOnly": true, "secure": false},
	{"domain": ".naver.com", "name": "NID_AUT", "value": "...", "path": "/", "httpOnly": true, "secure": false},
	{"domain": ".naver.com", "name": "NID_JKL", "value": "...", "path": "/", "httpOnly": false, "secure": true},
	{"domain": ".naver.com", "name": "NID_SES", "value": "...", "path": "/", "httpOnly": false, "secure": false},
	{"domain": ".naver.com", "name": "NNB", "value": "...", "path": "/", "httpOnly": false, "secure": true},
	{"domain": ".naver.com", "name": "NRTK", "value": "...", "path": "/", "httpOnly": false, "secure": false},
	{"domain": ".cafe.naver.com", "name": "nci4", "value": "...", "path": "/", "httpOnly": false, "secure": false},
	{"domain": ".cafe.naver.com", "name": "ncmc4", "value": "...", "path": "/", "httpOnly": false, "secure": false},
	{"domain": ".cafe.naver.com", "name": "ncu", "value": "...", "path": "/", "httpOnly": false, "secure": false},
	{"domain": ".cafe.naver.com", "name": "ncvc2", "value": "...", "path": "/", "httpOnly": false, "secure": false},
	{"domain": ".naver.com", "name": "nid_inf", "value": "...", "path": "/", "httpOnly": false, "secure": false},
	{"domain": ".naver.com", "name": "nx_ssl", "value": "...", "path": "/", "httpOnly": false, "secure": false}
	]
	"""
	# Add every cookie stored in cookie.json to the running browser session.
	try:
		# Read the JSON as UTF-8 explicitly instead of the platform default.
		with open("cookie.json", "r", encoding="utf-8") as j:
			saved_cookies = json.load(j)
		# The file is closed before the browser is driven.
		for cookie in saved_cookies:
			browser.add_cookie(cookie)
	except Exception:
		# The browser is already running at this point; shut it down instead
		# of leaving it orphaned when the cookie file is missing or invalid,
		# then let the original error propagate unchanged.
		browser.quit()
		raise

	# Locator strategy names (the values of By.CSS_SELECTOR / By.CLASS_NAME).
	# The generic find_element(by, value) API exists in Selenium 3 and 4,
	# whereas the find_element_by_* shortcuts were removed in Selenium 4.3.
	_BY_CSS = "css selector"
	_BY_CLASS = "class name"

	def _to_int(text):
		"""Convert a count such as ``"1,234"`` to an int by dropping the commas."""
		return int(text.replace(",", ""))

	def _parse_article(content):
		"""Build the result record for one article list item.

		Args:
			content: Element of one ``.article-movie-sub li`` entry.

		Returns:
			dict with the keys ``title``, ``category``, ``text``, ``link``,
			``name``, ``date``, ``read``, ``comments`` and ``likes``, plus
			``image`` when the entry contains a ``.movie-img img`` element.

		Raises:
			NoSuchElementException: a required child element is missing.
			ValueError: a counter text is not a plain (comma-grouped) number.
		"""
		category = content.find_element(_BY_CLASS, "board_name")

		con_top = content.find_element(_BY_CLASS, "con_top")
		title = con_top.find_element(_BY_CLASS, "tit")
		text = con_top.find_element(_BY_CLASS, "txt")

		con_bottom = content.find_element(_BY_CLASS, "con_bottom")
		name = con_bottom.find_element(_BY_CLASS, "p-nick")
		date = con_bottom.find_element(_BY_CLASS, "date")
		# First "num" element inside con_bottom holds the read counter.
		read = con_bottom.find_element(_BY_CLASS, "num")

		like_area = con_bottom.find_element(_BY_CLASS, "like_area")
		comments = like_area.find_element(_BY_CLASS, "num")
		likes = like_area.find_element(_BY_CLASS, "num-recomm")

		item = {
			"title": title.text,
			"category": category.text,
			"text": text.text,
			# str() keeps a missing href as the string "None", as before.
			"link": str(text.get_attribute('href')),
			"name": name.text,
			"date": date.text,
			# The read counter carries a "조회 " label prefix that is stripped.
			"read": _to_int(read.text.replace("조회 ", "")),
			"comments": _to_int(comments.text),
			"likes": _to_int(likes.text),
		}

		# The thumbnail is optional; the helper returns None when it is absent.
		image = find_element_by_css_selector(content, ".movie-img img")
		if image:
			item["image"] = image.get_attribute('src')

		return item

	# Crawl article-list pages 1..1000 and collect one record per article.
	# Any error stops the crawl; whatever was collected so far is still saved.
	data = []
	try:
		for page in range(1, 1001):
			pagenum = str(page)
			print("Start crawling page #" + pagenum)
			url_article_list = (
				"https://cafe.naver.com/ArticleList.nhn?search.clubid=" + cafe_id
				+ "&search.boardtype=C&search.page=" + pagenum
			)
			browser.get(url_article_list)

			for content in browser.find_elements(_BY_CSS, ".article-movie-sub li"):
				item = _parse_article(content)
				print(" > processing :: " + item["title"])
				data.append(item)
	except Exception as err:
		print("Error: ", err)
	finally:
		try:
			# "UTF-8-sig" writes a byte-order mark ahead of the JSON text.
			with open("result.json", "w", encoding="UTF-8-sig") as result:
				json.dump(data, result, ensure_ascii=False)
		finally:
			# quit() ends the whole driver session (close() only closes the
			# current window and leaves the driver process running); the
			# nested finally makes sure it runs even if the write fails.
			browser.quit()