Created
March 2, 2022 15:14
-
-
Save jgs03177/fa1a4fbbf3abed830ea9c6f64492c9cf to your computer and use it in GitHub Desktop.
지식인 크롤러 (Naver Knowledge-iN crawler)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import csv | |
# import unicodecsv as csv | |
from time import sleep | |
from random import normalvariate | |
from bs4 import BeautifulSoup | |
# Listing pages to crawl: pages 829 through 843 (range end is exclusive).
pagenum_range = range(829, 844)
# Output CSV path; opened in 'x' mode by main(), so it must not already exist.
filename = 'kin.csv'
def _polite_sleep(mean):
    """Sleep for a random, normally-distributed duration (sigma = 1 s).

    Resamples until the draw is at least 0.3 s so the crawler never
    fires requests back-to-back.
    """
    delay = 0.0
    while delay < 0.3:
        delay = normalvariate(mean, 1)
    sleep(delay)


def main():
    """Crawl Naver KIN 'best' listing pages and save questions/answers to CSV.

    For each page in ``pagenum_range``, fetches the listing, follows each of
    the up-to-20 question links, and writes one row per question to
    ``filename``:
    [page, category levels 1-4, title, question body, answerer rank,
    answer body, absolute link].
    Rows that still fail to encode are collected; their links are appended to
    the CSV and the rows printed at the end.
    """
    error_lines = []  # rows that raised UnicodeEncodeError when written
    error_links = []  # absolute links of those failed rows
    # 'x' mode refuses to overwrite an existing file.  Explicit utf-8 fixes
    # the root cause of the UnicodeEncodeError handling below (the platform
    # default codec, e.g. cp949 on Korean Windows, cannot encode all text).
    with open(filename, 'x', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        for pagenum in pagenum_range:
            page_url = 'https://kin.naver.com/best/listaha.nhn?page=' + str(pagenum)
            # timeout keeps a stalled connection from hanging the crawl.
            soup = BeautifulSoup(requests.get(page_url, timeout=30).text,
                                 'html.parser')
            for row in range(1, 21):  # up to 20 questions per listing page
                title_tag = soup.select_one(
                    '#au_board_list > tr:nth-of-type(' + str(row)
                    + ') > td:nth-of-type(1) > a:nth-of-type(1)')
                if title_tag is None:
                    continue  # page has fewer than 20 rows (original crashed)
                q_title = title_tag.text.strip()
                abs_link = 'https://kin.naver.com' + title_tag.get('href')
                soup2 = BeautifulSoup(requests.get(abs_link, timeout=30).text,
                                      'html.parser')
                # Category breadcrumb: up to 4 levels under #au_location.
                q_type_accessors = [
                    '#au_location > li:nth-of-type(' + str(i)
                    + ') > a:nth-of-type(1)'
                    for i in range(2, 6)]
                q_type = []
                for accessor in q_type_accessors:
                    tag = soup2.select_one(accessor)
                    if tag is not None:
                        q_type.append(tag.text)
                # Pad to exactly 4 category columns.  (The original
                # `while len(q_type) <= 4` padded to 5, leaving a spurious
                # always-empty column.)
                while len(q_type) < len(q_type_accessors):
                    q_type.append('')
                q_content_tag = soup2.select_one(
                    '#contents_layer_0 > div:nth-of-type(1) > div:nth-of-type(1)')
                # A missing question body usually means the post was reported.
                q_content = (q_content_tag.text.strip()
                             if q_content_tag is not None else '신고당한 게시물')
                rank_tag = soup2.select_one('a.info_text')
                a_rank = rank_tag.text if rank_tag is not None else '일반'
                a_content_tag = soup2.select_one(
                    '.first_answer > ._contentsLayer'
                    ' > div:nth-of-type(1) > div:nth-of-type(1)')
                # Guard against a missing first answer (original raised
                # AttributeError here).
                a_content = (a_content_tag.text.strip()
                             if a_content_tag is not None else '')
                output = [str(pagenum)]
                output.extend(q_type)
                output.extend([
                    q_title,
                    q_content.replace(u'\xa0', u' '),  # NBSP -> plain space
                    a_rank,
                    a_content.replace(u'\xa0', u' '),
                    abs_link,
                ])
                print(output)
                try:
                    writer.writerow(output)
                except UnicodeEncodeError:
                    # Best-effort: remember the row and its link rather than
                    # aborting the whole crawl.
                    error_lines.append(output)
                    error_links.append(abs_link)
                _polite_sleep(1)  # short pause between question fetches
            _polite_sleep(2)  # longer pause between listing pages
        # Record failed links as one-column rows.  (The original passed the
        # bare string, which csv split into one column per character.)
        for link in error_links:
            writer.writerow([link])
    if len(error_lines) != 0:
        print('unicode errors:')
        for line in error_lines:
            print(line)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment