Skip to content

Instantly share code, notes, and snippets.

@rs6000
Created January 25, 2019 00:38
Show Gist options
  • Save rs6000/80ab04cf71a00962d609d539595aea19 to your computer and use it in GitHub Desktop.
Save rs6000/80ab04cf71a00962d609d539595aea19 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2019-01-18T11:06:44.787548Z",
"start_time": "2019-01-18T11:06:44.480996Z"
}
},
"outputs": [],
"source": [
"import requests, re, os, csv, wget, time\n",
"\n",
"from bs4 import BeautifulSoup\n",
"base_url = \"http://stockmarketpilipinas.com/\"\n",
"url='http://stockmarketpilipinas.com/thread-337.html'\n",
"#url2='http://stockmarketpilipinas.com/thread-337-page-453.html'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2019-01-18T11:06:45.338766Z",
"start_time": "2019-01-18T11:06:45.326742Z"
}
},
"outputs": [],
"source": [
"runat=time.strftime(\"%Y-%m-%d %H:%M:%S\", time.localtime())\n",
"workpath = os.getcwd()\n",
"# 存檔路徑\n",
"mydir = os.path.join(workpath, \"daily_csv\")\n",
"#重覆的檔案放這邊\n",
"mydir2 = os.path.join(workpath, \"duplicate_files\")\n",
"DailyReport=''\n",
"\n",
"\n",
"filelist=[]\n",
"with open('download_files.txt', 'r') as f:\n",
" data = f.readlines()\n",
" for line in data:\n",
" filelist.append(line.strip())\n",
"last_download_page=filelist[0]\n",
"#print(\"上次下載的頁面是在{}\".format(last_download_page))\n",
"#print(len(filelist), filelist)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2019-01-18T11:06:46.678437Z",
"start_time": "2019-01-18T11:06:46.229195Z"
}
},
"outputs": [],
"source": [
"res=requests.get(url)\n",
"soup=BeautifulSoup(res.content, 'html5lib')\n",
"get_lastpage=soup.find(\"span\",{\"class\":\"pages\"})\n",
"get_curren=soup.find(\"span\",{\"class\":\"pagination_current\"})"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2019-01-18T11:06:47.415641Z",
"start_time": "2019-01-18T11:06:47.405565Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"目前在討論版的第1頁 \n",
"最後一頁是:453頁\n"
]
}
],
"source": [
"last_page=re.sub(r'\\D','',get_lastpage.text)\n",
"curren_page=re.sub(r'\\D','',get_curren.text)\n",
"print('目前在討論版的第{}頁 \\n最後一頁是:{}頁'.format(get_curren.text,last_page))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2019-01-18T11:07:51.328180Z",
"start_time": "2019-01-18T11:07:51.319053Z"
}
},
"outputs": [],
"source": [
"page_list = []\n",
"# 起始的頁數\n",
"pg = int(last_download_page)\n",
"#結束的頁數\n",
"max_num = int(last_page)\n",
"\n",
"download_files = []\n",
"error_page = []\n",
"\n",
"if max_num != pg:\n",
" for i in range(pg, max_num+1):\n",
" get_page = str(pg)\n",
" pg += 1\n",
" page_list.append(base_url+'thread-337-page-'+get_page+'.html')\n",
"else:\n",
" page_list.append(base_url+'thread-337-page-'+str(max_num)+'.html')\n",
" \n",
"#print(\"PG={}\\nmax_num={}\".format(pg,max_num))\n",
"#print(page_list)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-01-18T01:25:21.379217Z",
"start_time": "2019-01-18T01:25:18.917089Z"
}
},
"outputs": [],
"source": [
"for crawling_page in page_list:\n",
" page_html = requests.get(crawling_page)\n",
" page_soup = BeautifulSoup(page_html.text, 'lxml')\n",
" title = page_soup.find('div', {'id': 'posts'}).find_all('fieldset')\n",
" if title:\n",
" print(\"開始從 {} 下載資料:\".format(crawling_page))\n",
" for i in title:\n",
" try:\n",
" #取得檔名 + 轉成小寫\n",
" f_name = i.find('a').text.lower()\n",
" # 取得檔案連結\n",
" f_href = base_url+i.find('a')['href']\n",
" #先檢查檔案是否在上次下載的檔案list裡面\n",
" if f_name in filelist:\n",
" print(\"已有檔案: {}\".format(f_name))\n",
" #有就跳出本次的迴圈,檢查下一個\n",
" continue\n",
" else:\n",
" #檢查檔案是否存在\n",
" isExists = os.path.exists(os.path.join(mydir, f_name))\n",
" if not isExists:\n",
" # 下載檔案\n",
" print(\"下載檔案:\", f_name)\n",
" #本次下載的檔案清單\n",
" download_files.append(f_name)\n",
" #所有下載的檔案清單\n",
" filelist.append(f_name)\n",
" wget.download(f_href, out=os.path.join(mydir, f_name))\n",
"#下載檔案後,暫停0.3秒\n",
" time.sleep(0.3)\n",
" else:\n",
" #如果檔案存在就下載到其他資料夾\n",
" print(\"已有檔案: {}\".format(f_name))\n",
" #wget.download(f_href, out=os.path.join(mydir2, f_name))\n",
" except Exception as e:\n",
" #msg = 'error: {0} {1} \\n'.format(crawling_page, f_name)\n",
" error_page.append(e)\n",
" continue\n",
" else:\n",
" print(\"沒有資料:\",crawling_page)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-01-18T01:22:40.191446Z",
"start_time": "2019-01-18T01:22:40.183128Z"
}
},
"outputs": [],
"source": [
"#檢查\n",
"#print(\"本次下載檔案{}\\n錯誤訊息:{}\".format(download_files ,error_page))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-01-18T01:25:25.377986Z",
"start_time": "2019-01-18T01:25:25.370771Z"
}
},
"outputs": [],
"source": [
"msg=''\n",
"if len(download_files):\n",
" for i in download_files:\n",
" msg=msg+i+'\\n'\n",
"else:\n",
"msg='本次沒有新檔案需要下載'\n",
"#print(msg)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-01-18T01:25:26.428668Z",
"start_time": "2019-01-18T01:25:26.421285Z"
}
},
"outputs": [],
"source": [
"msg2=''\n",
"if len(error_page):\n",
" for i in error_page:\n",
"msg2=msg2+str(i)+'\\n'\n",
" print(msg2)\n",
"else:\n",
" msg2='沒有錯誤訊息!!!'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-01-18T10:31:45.805078Z",
"start_time": "2019-01-18T10:31:45.797679Z"
}
},
"outputs": [],
"source": [
"#建立工作log檔\n",
"s='================================='\n",
"DailyReport='每日更新報告 @ {} \\n{}\\n本次下載的檔案:{}\\n錯誤訊息:{}\\n{}\\n'.format(runat,s,msg ,msg2,s)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-01-18T10:28:29.030560Z",
"start_time": "2019-01-18T10:28:29.021325Z"
}
},
"outputs": [],
"source": [
"#檢查\n",
"#DailyReport"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-01-18T10:23:52.315364Z",
"start_time": "2019-01-18T10:23:52.306628Z"
}
},
"outputs": [],
"source": [
"#filelist[0]紀錄最後抓取的頁數\n",
"filelist[0]=str(max_num)\n",
"#print(len(filelist), filelist)\n",
"\n",
"#把list寫入文字檔,更新filelist[0]的內容\n",
"with open('download_files.txt', 'w') as f:\n",
" for item in filelist:\n",
" f.write(\"%s\\n\" % item)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-01-18T10:32:26.656315Z",
"start_time": "2019-01-18T10:32:26.649939Z"
}
},
"outputs": [],
"source": [
"#以附加的方式將新增檔案的名稱寫入文字檔\n",
"with open('DailyReport.txt', 'a+') as f:\n",
" f.write(DailyReport)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-01-18T10:24:26.267391Z",
"start_time": "2019-01-18T10:24:26.258991Z"
}
},
"outputs": [],
"source": [
"#以下用不到\n",
"#將文字檔傳進list\n",
"'''\n",
"filelist=[]\n",
"with open('download_files.txt', 'r') as f:\n",
" data = f.readlines()\n",
" for line in data:\n",
" filelist.append(line.strip())\n",
"\n",
"#filelist[0] 最後更新的頁數 \n",
"filelist[0]=max_num\n",
"'''"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment