# skyclad0x7b7/NaverMailCrawler.py

## NaverMailCrawler.py
# The MIT License
#
# Copyright (c) 2018 Sanghyeon Jeon
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import os
import time

from random import randrange

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from bs4 import *

import Utils

# ===== Config =====
# Credentials are read from the environment (NAVER_USER_ID / NAVER_PASSWORD)
# when set, so real secrets never have to be written into this file. The
# original placeholder literals remain the fallback, so editing them in place
# still works exactly as before.
HEADLESS       = False                            # passed to ChromeOptions.headless
SEARCH_KEYWORD = None                             # None disables the keyword search step
DOWNLOAD_PATH  = os.path.realpath("download")     # resolved against the current working directory
USER_ID        = os.environ.get("NAVER_USER_ID", "USER_ID")
PASSWORD       = os.environ.get("NAVER_PASSWORD", "PASSWORD")
LOGIN_URL      = "https://nid.naver.com/nidlogin.login"
# ==================

class NaverMailCrawler(object):
    """Selenium-driven crawler that logs in to Naver and downloads mail attachments.

    The Chrome driver is created lazily: every method decorated with
    ``DriverCheck`` calls ``SetDriver()`` first when ``self.driver`` is None.
    Call ``Close()`` when finished to shut the browser down.
    """

    def __init__(self):
        self.logger = Utils.CreateLogger("NaverMailCrawler")
        # Resolved against the current working directory.
        self.driverPath = os.path.realpath('chromedriver.exe')
        self.driver = None
        if not os.path.exists(DOWNLOAD_PATH):
            Utils.Mkdirs(DOWNLOAD_PATH)

    def __del__(self):
        # Best-effort cleanup so a forgotten Close() does not leave a browser
        # process behind. A destructor must never raise, and it may run while
        # the interpreter is shutting down, so every failure is ignored here.
        try:
            self.Close()
        except Exception:
            pass

    def Close(self):
        """Quit the browser if one is running. Safe to call more than once."""
        # getattr: __init__ may have failed before self.driver was assigned.
        driver = getattr(self, "driver", None)
        # Drop the reference first so a failing quit() is never retried.
        self.driver = None
        if driver is None:
            return
        try:
            driver.quit()
        except Exception as e:
            self.logger.error("Driver Quit Failed : Unknown Exception (%s)" % (str(e)))

    # Decorator for checking driver
    def DriverCheck(targetFunction):
        """Wrap a method so that SetDriver() runs first when no driver exists."""
        # Imported locally so this block does not depend on a file-level import.
        from functools import wraps

        @wraps(targetFunction)  # keep the wrapped method's name and docstring
        def wrapper(self, *args, **kwargs):
            if self.driver is None:
                self.logger.info("Driver not ready, calling SetDriver()...")
                self.SetDriver()
            return targetFunction(self, *args, **kwargs)
        return wrapper

    def SetDriver(self):
        """Create the Chrome driver and store it in ``self.driver``."""
        options = webdriver.ChromeOptions()
        options.headless = HEADLESS
        preferences = {
            "download.default_directory": DOWNLOAD_PATH,
            "directory_upgrade": True,
            "safebrowsing.enabled": True,
            # NOTE(review): Chrome content-setting values are commonly
            # 1 = allow, 2 = block; confirm that 2 is the intended value.
            "profile.default_content_setting_values.automatic_downloads": 2
        }
        options.add_experimental_option("prefs", preferences)
        self.driver = webdriver.Chrome(executable_path = self.driverPath, chrome_options = options)
        self.driver.set_page_load_timeout(15)  # 15 seconds timeout
        self.driver.set_window_size(1000, 600)
        self.driver.set_window_position(200, 200)
        self.logger.info("Driver Setting Completed")

    @DriverCheck
    def Login(self, userId, password):
        """Submit the login form at LOGIN_URL with the given credentials.

        Returns True when the form was submitted and no element with class
        ``link_login_help`` appeared afterwards. Returns False when that
        element is present (logged as a captcha) or when any exception is
        raised; exceptions are logged, never propagated.
        """
        try:
            self.driver.get("http://www.naver.com/")
            self.driver.implicitly_wait(5)
            self.logger.info("Trying to Login : %s" % (LOGIN_URL))
            self.driver.get(LOGIN_URL)
            self.driver.implicitly_wait(5)
            time.sleep(randrange(2,5)) # For bypass captcha

            # Type with pauses between the fields rather than all at once.
            self.driver.find_element_by_name('id').send_keys(userId)
            time.sleep(1)
            self.driver.find_element_by_name('pw').send_keys(password)
            time.sleep(randrange(2,4))
            self.driver.find_element_by_xpath('//*[@id="frmNIDLogin"]/fieldset/input').click()
            try:
                self.driver.find_element_by_class_name('link_login_help')
            except NoSuchElementException:
                # Element absent: this is the success path, nothing to handle.
                pass
            else:
                self.logger.error("Login Failed : Captcha Occured, Try Later...")
                return False

        except Exception as e:
            self.logger.error("Login Failed : Unknown Exception (%s)" % (str(e)))
            return False
        else:
            self.logger.info("Login Success")
            return True

    @DriverCheck
    def GetMailSNList(self, keyword=None, unreadOnly=False):
        """Return the ``mailsn`` values (as ints) of the mails listed on the page.

        keyword    -- when not None, typed into the search box and submitted.
        unreadOnly -- when true, the "unread" view filter is selected.

        Any exception is logged and an empty list is returned.
        """
        mailSNList = []
        try:
            self.logger.info("Start Getting MailSNList")
            self.driver.get('https://mail.naver.com')
            self.driver.implicitly_wait(5)

            if keyword is not None:
                self.driver.find_element_by_id('searchKeyWord').send_keys(keyword)
                self.driver.find_element_by_xpath('//*[@id="searchBtn"]').click()
                time.sleep(2)  # give the result list time to refresh

            if unreadOnly:
                # Open the view-filter menu, then choose the "unread" entry.
                self.driver.find_element_by_xpath('//*[@id="listBtnMenu"]/div[@class="buttonSet"]/button[6]').click()
                time.sleep(0.5)
                self.driver.find_element_by_xpath('//*[@id="changeViewFilterLayer"]/div/ul[@class="selector list_filtering"]/li[@data-viewfilter="unread"]').click()
                time.sleep(2)

            htmlSource = self.driver.page_source
            bs = BeautifulSoup(htmlSource, 'html5lib')
            # Only rows that carry a mailsn attribute are considered; a single
            # row without it would otherwise raise KeyError and, through the
            # except below, throw away every id that was found.
            mailSNList = [int(row['mailsn']) for row in bs.select('ol.mailList > li[mailsn]')]
            self.logger.info("GetMailSNList Success : %d found" % (len(mailSNList)))
        except Exception as e:
            self.logger.error("GetMailSNList Failed : Unknown Exception (%s)" % (str(e)))

        return mailSNList

    @DriverCheck
    def DownloadAttatchedFiles(self, mailSN):
        """Open the mail with id ``mailSN`` and click each attachment link.

        Returns the ``title`` attribute of every link that was clicked.
        Completion of the download itself is not verified. Any exception is
        logged, and the names collected up to that point are returned.
        """
        completedFileList = []
        try:
            self.driver.get('https://mail.naver.com/read/popup/?nMailId=%d' % (mailSN))
            self.driver.implicitly_wait(5)

            fileList = self.driver.find_elements_by_xpath('//*[@id="previewContent"]/div[@class="coverWrap"]/div[@class="attfile_area"]/div[@class="file_list"]/ul/li/span/a')

            for fileLink in fileList:
                fileName = fileLink.get_attribute('title')
                self.logger.info("Try Download : %s" % (fileName))
                fileLink.click()
                time.sleep(2)  # pause before triggering the next download
                completedFileList.append(fileName)

        except Exception as e:
            self.logger.error("DownloadAttatchedFiles Error : Unknown Exception (%s)" % (str(e)))

        return completedFileList

def main():
    """Log in, download the attachments of unread mails, then wait for Enter.

    Mails are optionally filtered by SEARCH_KEYWORD. The browser is shut
    down on the way out, including when the run is interrupted.
    """
    crawler = NaverMailCrawler()
    try:
        if crawler.Login(USER_ID, PASSWORD):
            mailSNList = crawler.GetMailSNList(keyword = SEARCH_KEYWORD, unreadOnly = True)
            for mailSN in mailSNList:
                crawler.DownloadAttatchedFiles(mailSN)

        # raw_input exists only on Python 2; on Python 3 the equivalent is
        # input. Python 2's own input() evaluates what is typed, so raw_input
        # must be preferred whenever it is available.
        try:
            prompt = raw_input
        except NameError:
            prompt = input
        # The browser stays open until the user confirms.
        prompt("[*] Press Enter to exit")
    finally:
        # Without this the Chrome/chromedriver processes outlive the script.
        if crawler.driver is not None:
            crawler.driver.quit()
            crawler.driver = None


# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()