Skip to content

Instantly share code, notes, and snippets.

@andy23512
Last active March 13, 2018 17:23
Show Gist options
  • Save andy23512/5d17442e8688c9e3e42c6e0e0de0bf39 to your computer and use it in GitHub Desktop.
Save andy23512/5d17442e8688c9e3e42c6e0e0de0bf39 to your computer and use it in GitHub Desktop.
crawler example
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# import 建議放最上面\n",
"import requests\n",
"import re\n",
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"naibin@ms7.hinet.net\n",
"jacob@mail.ypu.edu.tw\n",
"reiheng@gmail.com\n",
"suyu9966@yahoo.com.tw\n",
"E-Mail:csie@mail.ypu.edu.tw\n",
"cyberpaul747@mail.ypu.edu.tw\n",
"djchou@mail.ypu.edu.tw\n",
"E-MAIL: kuofeng@mail.ypu.edu.tw\n",
"wyliu@mail.ypu.edu.tw\n",
"ytchen@mail.ypu.edu.tw\n",
"yitingchuang@yahoo.com\n",
"neanben@mail.ypu.edu.tw\n",
"wangts@seed.net.tw\n",
"skchen@mail.ypu.edu.tw\n",
"\n",
"E-mail: howa@mail.ypu.edu.tw\n",
"hcchan@mail.ypu.edu.tw\n",
"cjchen@mail.ypu.edu.tw\n",
"ikiyama@gmail.com\n",
"lucyslin@gmail.com\n",
"mhhuang@mail.ypu.edu.tw\n"
]
}
],
"source": [
"def crawler():\n",
"    \"\"\"Print the e-mail addresses found on the pages linked from each listing URL.\n",
"\n",
"    Reads the module-level `url_list` (listing pages) and `web_tag` (CSS\n",
"    selector for the links on each listing page). For every listing page,\n",
"    each linked page is fetched, its first text node containing an address\n",
"    is located, and the unique addresses are printed one per line.\n",
"    \"\"\"\n",
"    from urllib.parse import urljoin  # local import keeps this cell self-contained\n",
"\n",
"    # Compiled once, outside the loops. Allows '.', '+' and '-' in the local\n",
"    # part and requires a dotted domain, so only the address itself matches.\n",
"    pattern = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\\.[a-zA-Z0-9-]+)+')\n",
"\n",
"    for url01 in url_list:\n",
"        res = requests.get(url01, timeout=30)\n",
"        res.encoding = 'utf8'\n",
"        soup = BeautifulSoup(res.text, 'html.parser')\n",
"\n",
"        email_set = set()  # fresh set per listing page, so no clear() is needed\n",
"        for art in soup.select(web_tag):\n",
"            href = art.get('href')\n",
"            if not href:\n",
"                continue  # anchor without a target\n",
"            # Resolve relative links against the listing page URL.\n",
"            res2 = requests.get(urljoin(url01, href), timeout=30)\n",
"            res2.encoding = 'utf8'\n",
"            soup2 = BeautifulSoup(res2.text, 'html.parser')\n",
"            text_node = soup2.find(string=pattern)\n",
"            if text_node is None:\n",
"                continue  # page has no address; never put None in the set\n",
"            # Keep only the addresses: drops labels such as \"E-Mail:\" and\n",
"            # splits text nodes that hold several addresses.\n",
"            email_set.update(pattern.findall(text_node))\n",
"\n",
"        # Sorted so the printed order is stable between runs.\n",
"        print(\"\\n\".join(sorted(email_set)))\n",
"\n",
"\n",
"url_list = [\n",
"    \"https://ndma.ypu.edu.tw/files/11-1077-82.php?Lang=zh-tw\",\n",
"]\n",
"\n",
"url02 = \"http://civil.csu.edu.tw/wSite/\"\n",
"# The selector has to be written this way to match the intended elements;\n",
"# see a CSS selector reference for the syntax.\n",
"web_tag = 'div.h5.title-noicon.title-center a'\n",
"\n",
"crawler()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment