Last active
March 13, 2018 17:23
-
-
Save andy23512/5d17442e8688c9e3e42c6e0e0de0bf39 to your computer and use it in GitHub Desktop.
crawler example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# import 建議放最上面\n", | |
"import requests\n", | |
"import re\n", | |
"from bs4 import BeautifulSoup" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"naibin@ms7.hinet.net\n", | |
"jacob@mail.ypu.edu.tw\n", | |
"reiheng@gmail.com\n", | |
"suyu9966@yahoo.com.tw\n", | |
"E-Mail:csie@mail.ypu.edu.tw\n", | |
"cyberpaul747@mail.ypu.edu.tw\n", | |
"djchou@mail.ypu.edu.tw\n", | |
"E-MAIL: kuofeng@mail.ypu.edu.tw\n", | |
"wyliu@mail.ypu.edu.tw\n", | |
"ytchen@mail.ypu.edu.tw\n", | |
"yitingchuang@yahoo.com\n", | |
"neanben@mail.ypu.edu.tw\n", | |
"wangts@seed.net.tw\n", | |
"skchen@mail.ypu.edu.tw\n", | |
"\n", | |
"E-mail: howa@mail.ypu.edu.tw\n", | |
"hcchan@mail.ypu.edu.tw\n", | |
"cjchen@mail.ypu.edu.tw\n", | |
"ikiyama@gmail.com\n", | |
"lucyslin@gmail.com\n", | |
"mhhuang@mail.ypu.edu.tw\n" | |
] | |
} | |
], | |
"source": [ | |
"def crawler():\n", | |
" for url01 in url_list:\n", | |
" pattern=re.compile(r'[a-zA-Z0-9_]+@[a-zA-Z0-9\\._]+')\n", | |
" res=requests.get(url01)\n", | |
" res.encoding = 'utf8'\n", | |
" soup=BeautifulSoup(res.text,'html.parser')\n", | |
" articles=soup.select(web_tag)\n", | |
" email_set=set()\n", | |
" for art in articles:\n", | |
" res2=requests.get(art['href'])\n", | |
" res2.encoding = 'utf8'\n", | |
" soup2=BeautifulSoup(res2.text,'html.parser')\n", | |
" email_set.add(soup2.find(text=pattern))\n", | |
" email_list=((\",\".join(list(email_set))).replace(\",\",\"\\n\")).replace(\";\",\"\\n\")\n", | |
" print(email_list)\n", | |
" email_set.clear() # 需要用email_set.clear()呼叫函式才會clear,你如果只有用email_set.clear什麼事情都不會發生\n", | |
" \n", | |
"url_list=[\n", | |
" \"https://ndma.ypu.edu.tw/files/11-1077-82.php?Lang=zh-tw\",\n", | |
"]\n", | |
"\n", | |
"url02 = \"http://civil.csu.edu.tw/wSite/\"\n", | |
"web_tag = 'div.h5.title-noicon.title-center a' # 你要選到你說的那種元素,選擇器要這樣寫,建議去弄懂css selector的寫法\n", | |
"\n", | |
"crawler()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment