Skip to content

Instantly share code, notes, and snippets.

@skyclad0x7b7
Last active December 7, 2020 04:30
Show Gist options
  • Save skyclad0x7b7/3f6ac6c66f4cd349d627123929bb1979 to your computer and use it in GitHub Desktop.
Save skyclad0x7b7/3f6ac6c66f4cd349d627123929bb1979 to your computer and use it in GitHub Desktop.
Simple Naver Mail Crawler using Python 2.7, selenium, BeautifulSoup4
# The MIT License
#
# Copyright (c) 2018 Sanghyeon Jeon
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
import functools
import os
import time
from random import randrange

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from bs4 import *

import Utils
# ===== Config =====
# Run Chrome without a visible window when True.
HEADLESS = False
# Keyword typed into the mailbox search field; None skips the search step.
SEARCH_KEYWORD = None
# Absolute path of the directory Chrome is told to save downloads into.
DOWNLOAD_PATH = os.path.realpath("download")
# Credentials can be supplied through the environment so that real secrets
# never have to be written into this file; the placeholders stay the default.
USER_ID = os.environ.get("NAVER_USER_ID", "USER_ID")
PASSWORD = os.environ.get("NAVER_PASSWORD", "PASSWORD")
LOGIN_URL = "https://nid.naver.com/nidlogin.login"
# ==================
class NaverMailCrawler():
    """Selenium-driven helper that logs in to Naver and downloads mail attachments.

    The Chrome driver is created lazily: every method decorated with
    ``DriverCheck`` calls ``SetDriver()`` first when ``self.driver`` is still
    ``None``.  Chrome is configured to save downloads into the module-level
    ``DOWNLOAD_PATH``.  Call ``Close()`` when finished to quit the browser.
    """

    def __init__(self):
        """Create the logger, resolve the chromedriver path and ensure DOWNLOAD_PATH exists."""
        self.logger = Utils.CreateLogger("NaverMailCrawler")
        self.driverPath = os.path.realpath('chromedriver.exe')
        self.driver = None
        if not os.path.exists(DOWNLOAD_PATH):
            Utils.Mkdirs(DOWNLOAD_PATH)

    def __del__(self):
        # Best-effort cleanup so the browser process is not left running.
        # A destructor cannot propagate errors (and may run on a partially
        # initialised instance or during interpreter shutdown), so any
        # failure here is deliberately ignored.
        try:
            self.Close()
        except Exception:
            pass

    def Close(self):
        """Quit the browser session if one was started.

        Safe to call more than once: ``self.driver`` is reset to ``None``
        before quitting, so a later call (or ``__del__``) is a no-op.
        """
        driver = getattr(self, "driver", None)
        if driver is None:
            return
        self.driver = None
        driver.quit()

    # Decorator for checking driver
    def DriverCheck(targetFunction):
        """Wrap a method so that the driver is created on demand before it runs."""
        @functools.wraps(targetFunction)  # keep the wrapped method's name/docstring
        def wrapper(self, *args, **kwargs):
            if self.driver is None:
                self.logger.info("Driver not ready, calling SetDriver()...")
                self.SetDriver()
            return targetFunction(self, *args, **kwargs)
        return wrapper

    def SetDriver(self):
        """Create the Chrome driver and apply download, timeout and window settings."""
        options = webdriver.ChromeOptions()
        options.headless = HEADLESS
        preferences = {
            "download.default_directory": DOWNLOAD_PATH,
            "directory_upgrade": True,
            "safebrowsing.enabled": True,
            # NOTE(review): in Chrome's content settings 2 normally means
            # "block" -- confirm this is the intended value.
            "profile.default_content_setting_values.automatic_downloads": 2,
        }
        options.add_experimental_option("prefs", preferences)
        # `options=` replaces the deprecated `chrome_options=` keyword.
        self.driver = webdriver.Chrome(executable_path=self.driverPath, options=options)
        self.driver.set_page_load_timeout(15)  # 15 seconds timeout
        # The implicit wait is a session-wide setting for element lookups,
        # so it is configured once here instead of after every page load.
        self.driver.implicitly_wait(5)
        self.driver.set_window_size(1000, 600)
        self.driver.set_window_position(200, 200)
        self.logger.info("Driver Setting Completed")

    @DriverCheck
    def Login(self, userId, password):
        """Submit the Naver login form with the given credentials.

        Returns:
            True when the form was submitted and no ``link_login_help``
            element is found afterwards; False when that element is present
            (logged as a captcha) or when any exception occurs (logged).
        """
        try:
            self.driver.get("http://www.naver.com/")
            self.logger.info("Trying to Login : %s" % (LOGIN_URL))
            self.driver.get(LOGIN_URL)
            time.sleep(randrange(2, 5))  # For bypass captcha
            self.driver.find_element_by_name('id').send_keys(userId)
            time.sleep(1)
            self.driver.find_element_by_name('pw').send_keys(password)
            time.sleep(randrange(2, 4))
            self.driver.find_element_by_xpath('//*[@id="frmNIDLogin"]/fieldset/input').click()
            # find_elements_* returns an empty list instead of raising when
            # nothing matches, so no try/except is needed for this probe.
            if self.driver.find_elements_by_class_name('link_login_help'):
                self.logger.error("Login Failed : Captcha Occured, Try Later...")
                return False
        except Exception as e:
            self.logger.error("Login Failed : Unknown Exception (%s)" % (str(e)))
            return False
        self.logger.info("Login Success")
        return True

    @DriverCheck
    def GetMailSNList(self, keyword = None, unreadOnly = False):
        """Collect the ``mailsn`` values of the mails listed in the mailbox page.

        Args:
            keyword: text typed into the search field before listing; the
                search step is skipped when None.
            unreadOnly: when True, the "unread" view filter is selected first.

        Returns:
            A list of ints, one per ``ol.mailList > li`` element that carries
            a ``mailsn`` attribute.  Any exception is logged and an empty
            list is returned.
        """
        mailSNList = []
        try:
            self.logger.info("Start Getting MailSNList")
            self.driver.get('https://mail.naver.com')
            if keyword is not None:
                self.driver.find_element_by_id('searchKeyWord').send_keys(keyword)
                self.driver.find_element_by_xpath('//*[@id="searchBtn"]').click()
                time.sleep(2)
            if unreadOnly:
                self.driver.find_element_by_xpath('//*[@id="listBtnMenu"]/div[@class="buttonSet"]/button[6]').click()
                time.sleep(0.5)
                self.driver.find_element_by_xpath('//*[@id="changeViewFilterLayer"]/div/ul[@class="selector list_filtering"]/li[@data-viewfilter="unread"]').click()
                time.sleep(2)
            bs = BeautifulSoup(self.driver.page_source, 'html5lib')
            # Skip rows without a mailsn attribute so that a single such row
            # does not raise KeyError and discard the whole result.
            mailSNList = [
                int(item['mailsn'])
                for item in bs.select('ol.mailList > li')
                if item.has_attr('mailsn')
            ]
            self.logger.info("GetMailSNList Success : %d found" % (len(mailSNList)))
        except Exception as e:
            self.logger.error("GetMailSNList Failed : Unknown Exception (%s)" % (str(e)))
        return mailSNList

    @DriverCheck
    def DownloadAttatchedFiles(self, mailSN):
        """Open one mail in the popup reader and click each attachment link.

        Args:
            mailSN: integer mail id, formatted into the reader URL with ``%d``.

        Returns:
            The ``title`` attributes of the links that were clicked.  If an
            exception occurs part-way it is logged and the names collected
            so far are returned.
        """
        completedFileList = []
        try:
            self.driver.get('https://mail.naver.com/read/popup/?nMailId=%d' % (mailSN))
            fileList = self.driver.find_elements_by_xpath('//*[@id="previewContent"]/div[@class="coverWrap"]/div[@class="attfile_area"]/div[@class="file_list"]/ul/li/span/a')
            for fileLink in fileList:
                fileName = fileLink.get_attribute('title')
                self.logger.info("Try Download : %s" % (fileName))
                fileLink.click()
                time.sleep(2)  # pause before clicking the next attachment
                completedFileList.append(fileName)
        except Exception as e:
            self.logger.error("DownloadAttatchedFiles Error : Unknown Exception (%s)" % (str(e)))
        return completedFileList
def main():
    """Log in and download the attachments of every unread mail found.

    Uses the module-level USER_ID / PASSWORD / SEARCH_KEYWORD settings and
    then waits for Enter so the console (and browser) stay open until the
    user is done.
    """
    crawler = NaverMailCrawler()
    if crawler.Login(USER_ID, PASSWORD):
        for mailSN in crawler.GetMailSNList(keyword=SEARCH_KEYWORD, unreadOnly=True):
            crawler.DownloadAttatchedFiles(mailSN)
    # raw_input only exists on Python 2; fall back to input() on Python 3 so
    # the script does not end with a NameError after the work is done.
    try:
        waitForEnter = raw_input
    except NameError:
        waitForEnter = input
    waitForEnter("[*] Press Enter to exit")


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment