Created
March 2, 2022 15:14
-
-
Save jgs03177/fa1a4fbbf3abed830ea9c6f64492c9cf to your computer and use it in GitHub Desktop.
지식인 크롤러 (Naver Knowledge-iN crawler)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import csv | |
# import unicodecsv as csv | |
from time import sleep | |
from random import normalvariate | |
from bs4 import BeautifulSoup | |
# Listing pages to crawl: pages 829 through 843 (range end is exclusive).
pagenum_range = range(829, 844)
# Output CSV path; opened in 'x' mode by main(), so it must not already exist.
filename = 'kin.csv'
def _polite_sleep(mean):
    """Sleep for a random, normally-distributed duration (sigma = 1 s).

    Resamples until the draw is at least 0.3 s so the crawler never
    fires requests back-to-back.
    """
    delay = 0.0
    while delay < 0.3:
        delay = normalvariate(mean, 1)
    sleep(delay)


def main():
    """Crawl Naver KIN 'best' listing pages and save questions/answers to CSV.

    For each page in ``pagenum_range``, fetches the listing, follows each of
    the up-to-20 question links, and writes one row per question to
    ``filename``:
    [page, category levels 1-4, title, question body, answerer rank,
    answer body, absolute link].
    Rows that still fail to encode are collected; their links are appended to
    the CSV and the rows printed at the end.
    """
    error_lines = []  # rows that raised UnicodeEncodeError when written
    error_links = []  # absolute links of those failed rows
    # 'x' mode refuses to overwrite an existing file.  Explicit utf-8 fixes
    # the root cause of the UnicodeEncodeError handling below (the platform
    # default codec, e.g. cp949 on Korean Windows, cannot encode all text).
    with open(filename, 'x', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        for pagenum in pagenum_range:
            page_url = 'https://kin.naver.com/best/listaha.nhn?page=' + str(pagenum)
            # timeout keeps a stalled connection from hanging the crawl.
            soup = BeautifulSoup(requests.get(page_url, timeout=30).text,
                                 'html.parser')
            for row in range(1, 21):  # up to 20 questions per listing page
                title_tag = soup.select_one(
                    '#au_board_list > tr:nth-of-type(' + str(row)
                    + ') > td:nth-of-type(1) > a:nth-of-type(1)')
                if title_tag is None:
                    continue  # page has fewer than 20 rows (original crashed)
                q_title = title_tag.text.strip()
                abs_link = 'https://kin.naver.com' + title_tag.get('href')
                soup2 = BeautifulSoup(requests.get(abs_link, timeout=30).text,
                                      'html.parser')
                # Category breadcrumb: up to 4 levels under #au_location.
                q_type_accessors = [
                    '#au_location > li:nth-of-type(' + str(i)
                    + ') > a:nth-of-type(1)'
                    for i in range(2, 6)]
                q_type = []
                for accessor in q_type_accessors:
                    tag = soup2.select_one(accessor)
                    if tag is not None:
                        q_type.append(tag.text)
                # Pad to exactly 4 category columns.  (The original
                # `while len(q_type) <= 4` padded to 5, leaving a spurious
                # always-empty column.)
                while len(q_type) < len(q_type_accessors):
                    q_type.append('')
                q_content_tag = soup2.select_one(
                    '#contents_layer_0 > div:nth-of-type(1) > div:nth-of-type(1)')
                # A missing question body usually means the post was reported.
                q_content = (q_content_tag.text.strip()
                             if q_content_tag is not None else '신고당한 게시물')
                rank_tag = soup2.select_one('a.info_text')
                a_rank = rank_tag.text if rank_tag is not None else '일반'
                a_content_tag = soup2.select_one(
                    '.first_answer > ._contentsLayer'
                    ' > div:nth-of-type(1) > div:nth-of-type(1)')
                # Guard against a missing first answer (original raised
                # AttributeError here).
                a_content = (a_content_tag.text.strip()
                             if a_content_tag is not None else '')
                output = [str(pagenum)]
                output.extend(q_type)
                output.extend([
                    q_title,
                    q_content.replace(u'\xa0', u' '),  # NBSP -> plain space
                    a_rank,
                    a_content.replace(u'\xa0', u' '),
                    abs_link,
                ])
                print(output)
                try:
                    writer.writerow(output)
                except UnicodeEncodeError:
                    # Best-effort: remember the row and its link rather than
                    # aborting the whole crawl.
                    error_lines.append(output)
                    error_links.append(abs_link)
                _polite_sleep(1)  # short pause between question fetches
            _polite_sleep(2)  # longer pause between listing pages
        # Record failed links as one-column rows.  (The original passed the
        # bare string, which csv split into one column per character.)
        for link in error_links:
            writer.writerow([link])
    if len(error_lines) != 0:
        print('unicode errors:')
        for line in error_lines:
            print(line)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment