Skip to content

Instantly share code, notes, and snippets.

@jgs03177
Created March 2, 2022 15:14
Show Gist options
  • Save jgs03177/fa1a4fbbf3abed830ea9c6f64492c9cf to your computer and use it in GitHub Desktop.
지식인 크롤러
import requests
import csv
# import unicodecsv as csv
from time import sleep
from random import normalvariate
from bs4 import BeautifulSoup
# Page numbers of the KIN "best" list to crawl (inclusive start, exclusive end).
pagenum_range = range(829, 844)
# Output CSV path; main() opens it in 'x' mode, so it must not already exist.
filename = 'kin.csv'
def _polite_sleep(mu):
    """Sleep for a normally-distributed random duration (mean *mu*, sd 1 s).

    Redraws until the duration is at least 0.3 s, so consecutive requests
    are never fired back-to-back at the server.
    """
    sleeptime = 0
    while sleeptime < 0.3:
        sleeptime = normalvariate(mu, 1)
    sleep(sleeptime)


def main():
    """Crawl Naver KIN (지식인) 'best' list pages into a CSV file.

    For every page number in ``pagenum_range`` the list page is fetched and
    each of its 20 question links is followed. One row per question is
    written to ``filename``:

        [page, category1..category5, title, question body,
         answerer rank, answer body, absolute link]

    Rows whose text the CSV writer cannot encode are remembered and printed
    at the end; their links are appended to the CSV instead.

    Raises:
        FileExistsError: if ``filename`` already exists ('x' open mode).
    """
    error_lines = list()   # rows that raised UnicodeEncodeError on write
    error_links = list()   # question links of those failed rows
    # NOTE(review): no explicit encoding= here — on a non-UTF-8 locale some
    # rows raise UnicodeEncodeError, which is exactly what the handler below
    # catches; confirm before adding encoding='utf-8'.
    with open(filename, 'x', newline='') as csvfile:
        spamwriter = csv.writer(csvfile, dialect='excel')
        for current_pagenum in pagenum_range:
            current_pagestr = 'https://kin.naver.com/best/listaha.nhn?page=' + str(current_pagenum)
            req = requests.get(current_pagestr)
            html = req.text
            soup = BeautifulSoup(html, 'html.parser')
            for row in range(1, 21):  # each list page holds 20 question rows
                # Anchor holding the row-th question's title and link.
                title_accessor = ('#au_board_list > tr:nth-of-type(' + str(row) +
                                  ') > td:nth-of-type(1) > a:nth-of-type(1)')
                title_ = soup.select_one(title_accessor)
                q_title = title_.text.strip()
                rellink_str = title_.get('href')
                abslink_str = 'https://kin.naver.com' + rellink_str
                req2 = requests.get(abslink_str)
                html2 = req2.text
                soup2 = BeautifulSoup(html2, 'html.parser')
                # Selectors inside the individual question page.
                q_content_accessor = '#contents_layer_0 > div:nth-of-type(1) > div:nth-of-type(1)'
                q_type_accessors = ['#au_location > li:nth-of-type(2) > a:nth-of-type(1)',
                                    '#au_location > li:nth-of-type(3) > a:nth-of-type(1)',
                                    '#au_location > li:nth-of-type(4) > a:nth-of-type(1)',
                                    '#au_location > li:nth-of-type(5) > a:nth-of-type(1)']
                a_rank_accessor = 'a.info_text'
                a_content_accessor = '.first_answer > ._contentsLayer > div:nth-of-type(1) > div:nth-of-type(1)'
                # Breadcrumb categories; missing levels are simply skipped.
                q_type = list()
                for q_type_accessor in q_type_accessors:
                    q_type_ = soup2.select_one(q_type_accessor)
                    if q_type_ is not None:
                        q_type.append(q_type_.text)
                # Pad to a fixed width of 5 category columns so every CSV row
                # lines up. NOTE(review): only 4 accessors exist, so the 5th
                # column is always '' — confirm 5 is the intended width.
                while len(q_type) <= 4:
                    q_type.append('')
                q_content_ = soup2.select_one(q_content_accessor)
                # '신고당한 게시물' = placeholder used when the body node is absent.
                q_content = q_content_.text.strip() if q_content_ is not None else '신고당한 게시물'
                a_rank_ = soup2.select_one(a_rank_accessor)
                a_rank = a_rank_.text if a_rank_ is not None else '일반'
                # FIX: guard against a missing answer node — the original
                # dereferenced select_one(...) directly and crashed with
                # AttributeError when it returned None.
                a_content_ = soup2.select_one(a_content_accessor)
                a_content = a_content_.text.strip() if a_content_ is not None else ''
                output_strings = [str(current_pagenum)]
                output_strings.extend(q_type)
                output_strings.extend(
                    [q_title,
                     q_content.replace(u'\xa0', u' '),
                     a_rank,
                     a_content.replace(u'\xa0', u' '),
                     abslink_str])
                print(output_strings)
                try:
                    spamwriter.writerow(output_strings)
                except UnicodeEncodeError:
                    error_lines.append(output_strings)
                    error_links.append(abslink_str)
                _polite_sleep(1)   # short pause between question pages
            _polite_sleep(2)       # longer pause between list pages
        # FIX: wrap each link in a list — writerow(e) on a bare string would
        # iterate it and emit every character as its own CSV column.
        for e in error_links:
            spamwriter.writerow([e])
    if len(error_lines) != 0:
        print('unicode errors:')
        for e in error_lines:
            print(e)
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment