Skip to content

Instantly share code, notes, and snippets.

@woodongk
Last active April 6, 2020 23:28
Show Gist options
  • Save woodongk/638cd6986fbee0ecf8e826aa8ef3a5d9 to your computer and use it in GitHub Desktop.
Save woodongk/638cd6986fbee0ecf8e826aa8ef3a5d9 to your computer and use it in GitHub Desktop.
네이버 뉴스에서 댓글 긁어오기
# 출처 - https://wikidocs.net/61221
from selenium import webdriver
import time
def get_comments(URL, imp_time=5, delay_time=0.1):
    """Collect author, text and date of every comment on a comment page.

    The page is opened in Chrome, the "more" button (``a.u_cbox_btn_more``)
    is clicked until it can no longer be found or clicked, and the rendered
    HTML is then parsed with BeautifulSoup.

    Args:
        URL: Address of the comment page to load.
        imp_time: Selenium implicit-wait timeout, in seconds.
        delay_time: Seconds to sleep after each click on the "more" button.

    Returns:
        A list of ``(user_id, content, date)`` tuples in document order.
        The three element lists are combined with ``zip``, so the result is
        as long as the shortest of them.
    """
    # Function-scope import, as in the original code.
    from bs4 import BeautifulSoup

    driver = webdriver.Chrome('/usr/local/bin/chromedriver')
    try:
        driver.implicitly_wait(imp_time)
        driver.get(URL)

        # Keep expanding the list until the "more" button stops working.
        while True:
            try:
                more_button = driver.find_element_by_css_selector('a.u_cbox_btn_more')
                more_button.click()
                time.sleep(delay_time)
            except Exception:
                # Any failure to find/click the button is treated as "fully
                # expanded". Unlike a bare ``except``, this still lets
                # KeyboardInterrupt / SystemExit propagate.
                break

        html = driver.page_source
    finally:
        # Always shut the browser down, even when loading or expanding fails.
        driver.quit()

    soup = BeautifulSoup(html, 'lxml')

    # Comment authors.
    user_ids = [tag.text for tag in soup.select('span.u_cbox_nick')]

    # Comment bodies, including the placeholders shown for deleted comments.
    # NOTE(review): presumably the placeholders are kept so this list stays
    # aligned with the author/date lists - confirm against a live page.
    content_tags = soup.find_all(
        "span", {"class": ["u_cbox_contents", "u_cbox_delete_contents"]}
    )
    contents = [tag.get_text() for tag in content_tags]

    # Posting dates.
    dates = [tag.text for tag in soup.select('span.u_cbox_date')]

    # Previously the combined list was built but never returned.
    comments = list(zip(user_ids, contents, dates))
    return comments
def get_user_profile_from_comment(URL, imp_time=5, delay_time=0.1):
    """Open each visible comment's author button and collect the profile text.

    The page is opened in Chrome, the "more" button (``a.u_cbox_btn_more``)
    is clicked until it can no longer be found or clicked, and the page is
    scrolled back to the top. For every comment list item that is not marked
    ``visible:false``, the author button is clicked, the text of every
    ``div.u_cbox_userinfo_wrap`` element is read, and the browser navigates
    back.

    Args:
        URL: Address of the comment page to load.
        imp_time: Selenium implicit-wait timeout, in seconds.
        delay_time: Seconds to sleep after each "more" click and after each
            navigation back from a profile.

    Returns:
        A list with one entry per successfully processed comment; each entry
        is the list of profile texts found after clicking that comment's
        author button. Comments that raise an error are reported on stdout
        and skipped.
    """
    # BeautifulSoup is not imported at module level in this file, so bring it
    # into scope here; without this the parsing step raises NameError.
    from bs4 import BeautifulSoup

    driver = webdriver.Chrome('/usr/local/bin/chromedriver')
    try:
        driver.implicitly_wait(imp_time)
        driver.get(URL)

        # Keep expanding the list until the "more" button stops working.
        while True:
            try:
                more_button = driver.find_element_by_css_selector('a.u_cbox_btn_more')
                more_button.click()
                time.sleep(delay_time)
            except Exception:
                print("더이상 더보기할 댓글이 없습니다.")
                break

        # Jump back to the top of the page so the comments can be visited
        # in order.
        btn_top = driver.find_element_by_css_selector("a.floating_btn_top")
        btn_top.click()

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Total comment count as displayed on the page (informational only).
        total_comments = int(soup.select_one('span.u_cbox_count').text.replace(",", ""))
        print("총", total_comments, "개 댓글")
        print("-" * 50)

        total_profiles = []
        cbox_list = soup.select("div.u_cbox_content_wrap > ul.u_cbox_list > li")

        # XPath positions are 1-based, hence enumerate(..., 1).
        for i, comment in enumerate(cbox_list, 1):
            # Items whose markup contains "visible:false" are skipped; per the
            # original author's note these are comments deleted by their writer.
            if "visible:false" in str(comment):
                continue
            try:
                button = '//*[@id="cbox_module"]/div[*]/div[*]/ul/li[%s]/div[1]/div/div[1]/span[1]/button' % i
                author_button = driver.find_element_by_xpath(button)
                author_button.click()
                profiles = driver.find_elements_by_css_selector('div.u_cbox_userinfo_wrap')
                total_profiles.append([profile.text for profile in profiles])
                driver.back()
                time.sleep(delay_time)
            except Exception as e:
                # Best effort: report the failing comment and move on.
                print(e)
                print(i, "번째 댓글 오류 발생", comment.get_text())

        return total_profiles
    finally:
        # Always shut the browser down, even when a step above raises.
        driver.quit()
@woodongk
Copy link
Author

woodongk commented Apr 4, 2020

삭제된 댓글을 제외한 나머지 댓글의 유저 아이디와 댓글 내용, 작성날짜만 가져오는 법 고민

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment