Skip to content

Instantly share code, notes, and snippets.

@kagamimoe
Last active July 30, 2017 05:17
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save kagamimoe/42a3f1b8ff1dc60304b43e9bc64409cf to your computer and use it in GitHub Desktop.
Save kagamimoe/42a3f1b8ff1dc60304b43e9bc64409cf to your computer and use it in GitHub Desktop.
A Python crawler that finds job postings on V2EX and emails them
# -*- coding: utf-8 -*-
import urllib2
import sys, smtplib, poplib
from email.mime.text import MIMEText
# Python 2 only: make UTF-8 the interpreter-wide default codec used for
# implicit str <-> unicode conversions (the script mixes UTF-8 byte literals
# with unicode text returned by BeautifulSoup).
# NOTE(review): reload(sys) is presumably needed because setdefaultencoding is
# not reachable on the already-initialised sys module -- confirm against the
# Python 2 `sys` documentation.
reload(sys)
sys.setdefaultencoding('utf-8')
from bs4 import BeautifulSoup
# Raw header lines for the notification mail (not referenced by the code
# visible in this file).
msg_head = [
    'From:xxx@163.com',
    'To:xxx@qq.com',
    'Subject:V2EX酷工作节点 每日职位信息',
]
# HTML body of the mail; filled in by the __main__ block.
msg_body = ''

# Listing URL prefix; the page number is appended to it.
base_url = 'https://www.v2ex.com/go/jobs?p='
# Site root, prepended to each topic's href to build the link.
href_url = 'https://www.v2ex.com'

# Crawl results: matched titles and, at the same index, their links.
t_list = []
link_list = []

# Keywords to look for in topic titles.
words = [u'测试', u'QA']

# Mail settings.
mailto_list = ['xxx@qq.com']  # recipient addresses
mail_host = 'smtp.163.com'    # SMTP server
mail_user = 'xxx@163.com'     # sender mailbox
mail_pass = 'xxx'             # sender mailbox password
mail_postfix = '163.com'      # domain suffix of the sender mailbox
def get_house_list(max_pages=14):
    """Crawl the jobs listing and collect topics whose title contains a keyword.

    Pages ``base_url + "1"`` through ``base_url + str(max_pages)`` are fetched
    in order. In every ``<span class="item_title">`` the first ``<a>`` is
    inspected; when its text contains any entry of ``words``, the text is
    appended to the module-level ``t_list`` and the absolute link
    (``href_url`` + the anchor's ``href``) to ``link_list``.

    Args:
        max_pages: Number of listing pages to fetch, starting at page 1.
            Defaults to 14, the count the loop used when it was hard-coded.

    Returns:
        The module-level ``(t_list, link_list)`` pair. The two lists are
        parallel: ``link_list[k]`` is the link for ``t_list[k]``. Results
        accumulate across calls because the lists are never cleared here.

    Raises:
        Any exception raised by ``urllib2.urlopen`` / ``read`` or by decoding
        the page as UTF-8 propagates to the caller.
    """
    # The lists are only mutated in place, so no ``global`` statement is needed.
    fetched = 0
    for page_no in range(1, max_pages + 1):
        print('*****第' + str(page_no) + '次抓取开始*****')
        page = urllib2.urlopen(base_url + str(page_no))
        try:
            html = page.read()
        finally:
            # Release the connection even if reading fails.
            page.close()
        fetched += 1

        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(html.decode("utf-8"), "html.parser")
        for tag in soup.find_all("span", class_="item_title"):
            anchor = tag.find("a")
            if anchor is None or not anchor.get("href"):
                # Not a regular topic entry; nothing to link to.
                continue
            title = anchor.get_text()
            # any() records a topic once even if several keywords match it.
            if any(word in title for word in words):
                t_list.append(title)
                link_list.append(href_url + anchor["href"])

    # Report the number of pages actually fetched, not the loop bound.
    print('*****抓取完毕,总计抓取' + str(fetched) + '页*****')
    return t_list, link_list
def send_mail(to_list, sub, content):
    """Send an HTML mail through the configured SMTP server.

    Connection settings come from the module-level ``mail_host``,
    ``mail_user``, ``mail_pass`` and ``mail_postfix``.

    Args:
        to_list: List of recipient addresses.
        sub: Subject line.
        content: Mail body, sent as ``text/html`` with UTF-8 charset.

    Returns:
        True when the message was handed to the server, False when any step
        failed (the error text is printed).
    """
    # Local import keeps this function self-contained.
    from email.header import Header

    # mail_user may be a bare account name or already a full address; only
    # append the domain in the former case so it is never duplicated.
    if "@" in mail_user:
        sender_addr = mail_user
    else:
        sender_addr = mail_user + "@" + mail_postfix
    # "admin" is the display name shown to the recipient; it can be anything.
    me = "admin" + "<" + sender_addr + ">"

    msg = MIMEText(content, _subtype='html', _charset='utf-8')
    # Encode the subject explicitly so non-ASCII text is a valid header.
    msg['Subject'] = Header(sub, 'utf-8')
    msg['From'] = me
    # Address lists in mail headers are comma-separated.
    msg['To'] = ", ".join(to_list)

    s = smtplib.SMTP()
    try:
        s.connect(mail_host)             # connect to the SMTP server
        s.login(mail_user, mail_pass)    # authenticate
        s.sendmail(sender_addr, to_list, msg.as_string())
        return True
    except Exception as e:
        # Best-effort delivery: report the failure instead of crashing.
        print(str(e))
        return False
    finally:
        # Always release the socket, including on failure paths.
        s.close()
def build_mail_body(titles, links):
    """Render matched topics as the HTML body of the notification mail.

    Args:
        titles: Topic titles.
        links: Absolute topic URLs, parallel to ``titles``.

    Returns:
        One ``title<br><a href="url">url</a><br><br>`` fragment per topic,
        concatenated; an empty string when there are no topics. Titles and
        URLs are HTML-escaped because they come from a scraped page.
    """
    from xml.sax.saxutils import escape, quoteattr

    return ''.join(
        '%s<br><a href=%s>%s</a><br><br>'
        % (escape(title), quoteattr(link), escape(link))
        for title, link in zip(titles, links)
    )


if __name__ == '__main__':
    # Crawl, build the HTML body, then mail it and report the outcome.
    titles, links = get_house_list()
    msg_body = build_mail_body(titles, links)
    if send_mail(mailto_list, "V2EX酷工作节点 每日职位信息 ", msg_body):
        print("邮件发送成功!")
    else:
        print("邮件发送失败!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment