# woodongk/crawling_naver_news_comments.py

## crawling_naver_news_comments.py
# 출처 - https://wikidocs.net/61221

from selenium import webdriver
import time

def get_comments(URL, imp_time=5, delay_time=0.1):
    """Scrape every comment rendered on a comment page.

    Loads ``URL`` in Chrome, keeps clicking the "more" button
    (``a.u_cbox_btn_more``) until it can no longer be found or clicked, and
    then parses the fully expanded page source.

    Args:
        URL: Address of the page whose comments should be collected.
        imp_time: Selenium implicit-wait timeout, in seconds.
        delay_time: Pause after each "more" click, in seconds.

    Returns:
        A list of ``(user_id, content, date)`` string tuples. Deleted
        comments (``u_cbox_delete_contents``) are included in the contents.

    NOTE(review): the three columns are selected independently and zipped, so
    if some comments lack a nickname or a date the rows may be misaligned or
    truncated -- confirm against the live DOM.
    NOTE(review): ``find_element_by_css_selector`` and the positional driver
    path are Selenium 3 style calls; confirm the pinned Selenium version.
    """
    # Imported lazily, as in the original, so the module can be imported
    # without bs4 installed.
    from bs4 import BeautifulSoup

    driver = webdriver.Chrome('/usr/local/bin/chromedriver')
    try:
        driver.implicitly_wait(imp_time)
        driver.get(URL)

        # Expand the comment list: click "more" until the button is gone.
        while True:
            try:
                more_button = driver.find_element_by_css_selector('a.u_cbox_btn_more')
                more_button.click()
                time.sleep(delay_time)
            except Exception:
                # Any lookup/click failure is treated as "nothing left to
                # load". Unlike a bare except, Ctrl-C still propagates.
                break

        html = driver.page_source
    finally:
        # Always release the browser, even if loading the page failed.
        driver.quit()

    soup = BeautifulSoup(html, 'lxml')

    # Authors.
    user_ids = [tag.text for tag in soup.select('span.u_cbox_nick')]

    # Comment bodies, including placeholders for deleted comments.
    content_tags = soup.findAll(
        "span", {"class": ["u_cbox_contents", "u_cbox_delete_contents"]}
    )
    contents = [tag.get_text() for tag in content_tags]

    # Timestamps.
    dates = [tag.text for tag in soup.select('span.u_cbox_date')]

    # The original built this list but never returned it.
    return list(zip(user_ids, contents, dates))

def get_user_profile_from_comment(URL, imp_time=5, delay_time=0.1):
    """Collect the user-info text behind each visible comment on a page.

    Loads ``URL`` in Chrome, expands the comment list by clicking the "more"
    button (``a.u_cbox_btn_more``) until it fails, jumps back to the top of
    the page, and then, for every comment ``<li>`` that does not contain
    ``visible:false``, clicks the button inside that comment, reads the text
    of all ``div.u_cbox_userinfo_wrap`` elements, and navigates back.

    Args:
        URL: Address of the page whose commenters should be inspected.
        imp_time: Selenium implicit-wait timeout, in seconds.
        delay_time: Pause after each click / back navigation, in seconds.

    Returns:
        A list with one entry per successfully processed comment; each entry
        is the list of ``div.u_cbox_userinfo_wrap`` texts found after the
        click. Comments that raise are reported on stdout and skipped.

    Raises:
        Whatever Selenium / BeautifulSoup raise if the "top" button or the
        comment counter is missing; the browser is closed before the
        exception propagates.

    NOTE(review): ``find_element(s)_by_*`` and the positional driver path are
    Selenium 3 style calls; confirm the pinned Selenium version.
    """
    # The original relied on a name that was only imported inside
    # get_comments(), which raised NameError here.
    from bs4 import BeautifulSoup

    driver = webdriver.Chrome('/usr/local/bin/chromedriver')
    try:
        driver.implicitly_wait(imp_time)
        driver.get(URL)

        # Expand the comment list: click "more" until the button is gone.
        while True:
            try:
                more_button = driver.find_element_by_css_selector('a.u_cbox_btn_more')
                more_button.click()
                time.sleep(delay_time)
            except Exception:
                print("더이상 더보기할 댓글이 없습니다.")
                break

        # Scroll back to the top so comments can be visited in order.
        btn_top = driver.find_element_by_css_selector("a.floating_btn_top")
        btn_top.click()

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Total number of comments as displayed by the page.
        total_comments = int(soup.select_one('span.u_cbox_count').text.replace(",", ""))
        print("총", total_comments, "개 댓글")
        print("-" * 50)

        total_profiles = []

        # Visit the comments one by one; `i` is the 1-based <li> position
        # used by the XPath, so skipped items must still advance it.
        cbox_list = soup.select("div.u_cbox_content_wrap > ul.u_cbox_list > li")
        for i, comment in enumerate(cbox_list, 1):
            # Comments deleted by their author (or otherwise hidden) have no
            # profile button; skip them.
            if "visible:false" in str(comment):
                continue

            try:
                button = '//*[@id="cbox_module"]/div[*]/div[*]/ul/li[%s]/div[1]/div/div[1]/span[1]/button' % i
                profile_button = driver.find_element_by_xpath(button)
                profile_button.click()

                profiles = driver.find_elements_by_css_selector('div.u_cbox_userinfo_wrap')
                total_profiles.append([profile.text for profile in profiles])

                driver.back()
                time.sleep(delay_time)
            except Exception as e:
                # Best effort: report the failing comment and carry on.
                print(e)
                print(i, "번째 댓글 오류 발생", comment.get_text())
    finally:
        # Always release the browser, even when scraping aborts midway.
        driver.quit()

    return total_profiles
	# 출처 - https://wikidocs.net/61221

	from selenium import webdriver
	import time

	def get_comments(URL, imp_time=5, delay_time=0.1):
		"""Scrape every comment rendered on a comment page.

		Loads ``URL`` in Chrome, keeps clicking the "more" button
		(``a.u_cbox_btn_more``) until it can no longer be found or clicked,
		and then parses the fully expanded page source.

		Args:
			URL: Address of the page whose comments should be collected.
			imp_time: Selenium implicit-wait timeout, in seconds.
			delay_time: Pause after each "more" click, in seconds.

		Returns:
			A list of ``(user_id, content, date)`` string tuples. Deleted
			comments (``u_cbox_delete_contents``) are included.

		NOTE(review): the three columns are selected independently and
		zipped, so rows may be misaligned if some comments lack a nickname
		or a date -- confirm against the live DOM.
		"""
		# Imported lazily so the module can be imported without bs4.
		from bs4 import BeautifulSoup

		driver = webdriver.Chrome('/usr/local/bin/chromedriver')
		try:
			driver.implicitly_wait(imp_time)
			driver.get(URL)

			# Expand the comment list: click "more" until the button is gone.
			while True:
				try:
					more_button = driver.find_element_by_css_selector('a.u_cbox_btn_more')
					more_button.click()
					time.sleep(delay_time)
				except Exception:
					# Any lookup/click failure means nothing is left to load.
					break

			html = driver.page_source
		finally:
			# Always release the browser, even if loading the page failed.
			driver.quit()

		soup = BeautifulSoup(html, 'lxml')

		# Authors.
		user_ids = [tag.text for tag in soup.select('span.u_cbox_nick')]

		# Comment bodies, including placeholders for deleted comments.
		content_tags = soup.findAll(
			"span", {"class": ["u_cbox_contents", "u_cbox_delete_contents"]}
		)
		contents = [tag.get_text() for tag in content_tags]

		# Timestamps.
		dates = [tag.text for tag in soup.select('span.u_cbox_date')]

		# The flattened original built this list but never returned it.
		return list(zip(user_ids, contents, dates))

	def get_user_profile_from_comment(URL, imp_time=5, delay_time=0.1):
		"""Collect the user-info text behind each visible comment on a page.

		Loads ``URL`` in Chrome, expands the comment list by clicking the
		"more" button (``a.u_cbox_btn_more``) until it fails, jumps back to
		the top of the page, and then, for every comment ``<li>`` that does
		not contain ``visible:false``, clicks the button inside that comment,
		reads the text of all ``div.u_cbox_userinfo_wrap`` elements, and
		navigates back.

		Args:
			URL: Address of the page whose commenters should be inspected.
			imp_time: Selenium implicit-wait timeout, in seconds.
			delay_time: Pause after each click / back navigation, in seconds.

		Returns:
			A list with one entry per successfully processed comment; each
			entry is the list of ``div.u_cbox_userinfo_wrap`` texts found
			after the click. Failing comments are printed and skipped.
		"""
		# BeautifulSoup was never imported in this function's scope.
		from bs4 import BeautifulSoup

		driver = webdriver.Chrome('/usr/local/bin/chromedriver')
		try:
			driver.implicitly_wait(imp_time)
			driver.get(URL)

			# Expand the comment list: click "more" until the button is gone.
			while True:
				try:
					more_button = driver.find_element_by_css_selector('a.u_cbox_btn_more')
					more_button.click()
					time.sleep(delay_time)
				except Exception:
					print("더이상 더보기할 댓글이 없습니다.")
					break

			# Scroll back to the top so comments can be visited in order.
			btn_top = driver.find_element_by_css_selector("a.floating_btn_top")
			btn_top.click()

			soup = BeautifulSoup(driver.page_source, 'html.parser')

			# Total number of comments as displayed by the page.
			total_comments = int(soup.select_one('span.u_cbox_count').text.replace(",", ""))
			print("총", total_comments, "개 댓글")
			print("-" * 50)

			total_profiles = []

			# `i` is the 1-based <li> position used by the XPath, so skipped
			# items must still advance it.
			cbox_list = soup.select("div.u_cbox_content_wrap > ul.u_cbox_list > li")
			for i, comment in enumerate(cbox_list, 1):
				# Deleted or hidden comments have no profile button; skip.
				if "visible:false" in str(comment):
					continue

				try:
					# The pasted literal had lost its `*` wildcards, which
					# made the XPath invalid; they are restored here.
					button = '//*[@id="cbox_module"]/div[*]/div[*]/ul/li[%s]/div[1]/div/div[1]/span[1]/button' % i
					profile_button = driver.find_element_by_xpath(button)
					profile_button.click()

					profiles = driver.find_elements_by_css_selector('div.u_cbox_userinfo_wrap')
					total_profiles.append([profile.text for profile in profiles])

					driver.back()
					time.sleep(delay_time)
				except Exception as e:
					# Best effort: report the failing comment and carry on.
					print(e)
					print(i, "번째 댓글 오류 발생", comment.get_text())
		finally:
			# Always release the browser, even when scraping aborts midway.
			driver.quit()

		return total_profiles