Created
January 25, 2019 00:38
-
-
Save rs6000/80ab04cf71a00962d609d539595aea19 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-01-18T11:06:44.787548Z", | |
"start_time": "2019-01-18T11:06:44.480996Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"import requests, re, os, csv, wget, time\n", | |
"\n", | |
"from bs4 import BeautifulSoup\n", | |
"base_url = \"http://stockmarketpilipinas.com/\"\n", | |
"url='http://stockmarketpilipinas.com/thread-337.html'\n", | |
"#url2='http://stockmarketpilipinas.com/thread-337-page-453.html'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-01-18T11:06:45.338766Z", | |
"start_time": "2019-01-18T11:06:45.326742Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"runat=time.strftime(\"%Y-%m-%d %H:%M:%S\", time.localtime())\n", | |
"workpath = os.getcwd()\n", | |
"# 存檔路徑\n", | |
"mydir = os.path.join(workpath, \"daily_csv\")\n", | |
"#重覆的檔案放這邊\n", | |
"mydir2 = os.path.join(workpath, \"duplicate_files\")\n", | |
"DailyReport=''\n", | |
"\n", | |
"\n", | |
"filelist=[]\n", | |
"with open('download_files.txt', 'r') as f:\n", | |
" data = f.readlines()\n", | |
" for line in data:\n", | |
" filelist.append(line.strip())\n", | |
"last_download_page=filelist[0]\n", | |
"#print(\"上次下載的頁面是在{}\".format(last_download_page))\n", | |
"#print(len(filelist), filelist)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-01-18T11:06:46.678437Z", | |
"start_time": "2019-01-18T11:06:46.229195Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"res=requests.get(url)\n", | |
"soup=BeautifulSoup(res.content, 'html5lib')\n", | |
"get_lastpage=soup.find(\"span\",{\"class\":\"pages\"})\n", | |
"get_curren=soup.find(\"span\",{\"class\":\"pagination_current\"})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-01-18T11:06:47.415641Z", | |
"start_time": "2019-01-18T11:06:47.405565Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"目前在討論版的第1頁 \n", | |
"最後一頁是:453頁\n" | |
] | |
} | |
], | |
"source": [ | |
"last_page=re.sub(r'\\D','',get_lastpage.text)\n", | |
"curren_page=re.sub(r'\\D','',get_curren.text)\n", | |
"print('目前在討論版的第{}頁 \\n最後一頁是:{}頁'.format(get_curren.text,last_page))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-01-18T11:07:51.328180Z", | |
"start_time": "2019-01-18T11:07:51.319053Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"page_list = []\n", | |
"# 起始的頁數\n", | |
"pg = int(last_download_page)\n", | |
"#結束的頁數\n", | |
"max_num = int(last_page)\n", | |
"\n", | |
"download_files = []\n", | |
"error_page = []\n", | |
"\n", | |
"if max_num != pg:\n", | |
" for i in range(pg, max_num+1):\n", | |
" get_page = str(pg)\n", | |
" pg += 1\n", | |
" page_list.append(base_url+'thread-337-page-'+get_page+'.html')\n", | |
"else:\n", | |
" page_list.append(base_url+'thread-337-page-'+str(max_num)+'.html')\n", | |
" \n", | |
"#print(\"PG={}\\nmax_num={}\".format(pg,max_num))\n", | |
"#print(page_list)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-01-18T01:25:21.379217Z", | |
"start_time": "2019-01-18T01:25:18.917089Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"for crawling_page in page_list:\n", | |
" page_html = requests.get(crawling_page)\n", | |
" page_soup = BeautifulSoup(page_html.text, 'lxml')\n", | |
" title = page_soup.find('div', {'id': 'posts'}).find_all('fieldset')\n", | |
" if title:\n", | |
" print(\"開始從 {} 下載資料:\".format(crawling_page))\n", | |
" for i in title:\n", | |
" try:\n", | |
" #取得檔名 + 轉成小寫\n", | |
" f_name = i.find('a').text.lower()\n", | |
" # 取得檔案連結\n", | |
" f_href = base_url+i.find('a')['href']\n", | |
" #先檢查檔案是否在上次下載的檔案list裡面\n", | |
" if f_name in filelist:\n", | |
" print(\"已有檔案: {}\".format(f_name))\n", | |
" #有就跳出本次的迴圈,檢查下一個\n", | |
" continue\n", | |
" else:\n", | |
" #檢查檔案是否存在\n", | |
" isExists = os.path.exists(os.path.join(mydir, f_name))\n", | |
" if not isExists:\n", | |
" # 下載檔案\n", | |
" print(\"下載檔案:\", f_name)\n", | |
" #本次下載的檔案清單\n", | |
" download_files.append(f_name)\n", | |
" #所有下載的檔案清單\n", | |
" filelist.append(f_name)\n", | |
" wget.download(f_href, out=os.path.join(mydir, f_name))\n", | |
" #下載檔案後,暫時0.3秒\n", | |
" time.sleep(0.3)\n", | |
" else:\n", | |
" #如果檔案存在就下載到其他資料夾\n", | |
" print(\"已有檔案: {}\".format(f_name))\n", | |
" #wget.download(f_href, out=os.path.join(mydir2, f_name))\n", | |
" except Exception as e:\n", | |
" #msg = 'error: {0} {1} \\n'.format(crawling_page, f_name)\n", | |
" error_page.append(e)\n", | |
" continue\n", | |
" else:\n", | |
" print(\"沒有資料:\",crawling_page)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-01-18T01:22:40.191446Z", | |
"start_time": "2019-01-18T01:22:40.183128Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"#檢查\n", | |
"#print(\"本次下載檔案{}\\n錯誤訊息:{}\".format(download_files ,error_page))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-01-18T01:25:25.377986Z", | |
"start_time": "2019-01-18T01:25:25.370771Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"msg=''\n", | |
"if len(download_files):\n", | |
" for i in download_files:\n", | |
" msg=msg+i+'\\n'\n", | |
"else:\n", | |
" msg='本次沒有沒有新檔案需要下載'\n", | |
"#print(msg)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-01-18T01:25:26.428668Z", | |
"start_time": "2019-01-18T01:25:26.421285Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"msg2=''\n", | |
"if len(error_page):\n", | |
" for i in error_page:\n", | |
" msg2=msg2+i+'\\n'\n", | |
" print(msg2)\n", | |
"else:\n", | |
" msg2='沒有錯誤訊息!!!'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-01-18T10:31:45.805078Z", | |
"start_time": "2019-01-18T10:31:45.797679Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"#建立工作log檔\n", | |
"s='================================='\n", | |
"DailyReport='每日更新報告 @ {} \\n{}\\n本次下載的檔案:{}\\n錯誤訊息:{}\\n{}\\n'.format(runat,s,msg ,msg2,s)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-01-18T10:28:29.030560Z", | |
"start_time": "2019-01-18T10:28:29.021325Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"#檢查\n", | |
"#DailyReport" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-01-18T10:23:52.315364Z", | |
"start_time": "2019-01-18T10:23:52.306628Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"#filelist[0]紀錄最後抓取的頁數\n", | |
"filelist[0]=str(max_num)\n", | |
"#print(len(filelist), filelist)\n", | |
"\n", | |
"#把list寫入文字檔,更新filelist[0]的內容\n", | |
"with open('download_files.txt', 'w') as f:\n", | |
" for item in filelist:\n", | |
" f.write(\"%s\\n\" % item)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-01-18T10:32:26.656315Z", | |
"start_time": "2019-01-18T10:32:26.649939Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"#以附加的方式將新增檔案的名稱寫入文字檔\n", | |
"with open('DailyReport.txt', 'a+') as f:\n", | |
" for item in DailyReport:\n", | |
" f.write(\"%s\" % item)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-01-18T10:24:26.267391Z", | |
"start_time": "2019-01-18T10:24:26.258991Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"#以下用不到\n", | |
"#將文字檔傳進list\n", | |
"'''\n", | |
"filelist=[]\n", | |
"with open('download_files.txt', 'r') as f:\n", | |
" data = f.readlines()\n", | |
" for line in data:\n", | |
" filelist.append(line.strip())\n", | |
"\n", | |
"#filelist[0] 最後更新的頁數 \n", | |
"filelist[0]=max_num\n", | |
"'''" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment