rs6000/2019_PSE_0_CheckNewCsv.ipynb

## 2019_PSE_0_CheckNewCsv.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-18T11:06:44.787548Z",
     "start_time": "2019-01-18T11:06:44.480996Z"
    }
   },
   "outputs": [],
   "source": [
    "import requests, re, os, csv, wget, time\n",
    "\n",
    "from bs4 import BeautifulSoup\n",
    "base_url = \"http://stockmarketpilipinas.com/\"\n",
    "url='http://stockmarketpilipinas.com/thread-337.html'\n",
    "#url2='http://stockmarketpilipinas.com/thread-337-page-453.html'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-18T11:06:45.338766Z",
     "start_time": "2019-01-18T11:06:45.326742Z"
    }
   },
   "outputs": [],
   "source": [
    "runat=time.strftime(\"%Y-%m-%d %H:%M:%S\", time.localtime())\n",
    "workpath = os.getcwd()\n",
    "# 存檔路徑\n",
    "mydir = os.path.join(workpath, \"daily_csv\")\n",
    "#重覆的檔案放這邊\n",
    "mydir2 = os.path.join(workpath, \"duplicate_files\")\n",
    "DailyReport=''\n",
    "\n",
    "\n",
    "filelist=[]\n",
    "with open('download_files.txt', 'r') as f:\n",
    "    data = f.readlines()\n",
    "    for line in data:\n",
    "        filelist.append(line.strip())\n",
    "last_download_page=filelist[0]\n",
    "#print(\"上次下載的頁面是在{}\".format(last_download_page))\n",
    "#print(len(filelist), filelist)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-18T11:06:46.678437Z",
     "start_time": "2019-01-18T11:06:46.229195Z"
    }
   },
   "outputs": [],
   "source": [
    "res=requests.get(url)\n",
    "soup=BeautifulSoup(res.content, 'html5lib')\n",
    "get_lastpage=soup.find(\"span\",{\"class\":\"pages\"})\n",
    "get_curren=soup.find(\"span\",{\"class\":\"pagination_current\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-18T11:06:47.415641Z",
     "start_time": "2019-01-18T11:06:47.405565Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "目前在討論版的第1頁 \n",
      "最後一頁是:453頁\n"
     ]
    }
   ],
   "source": [
    "last_page=re.sub(r'\\D','',get_lastpage.text)\n",
    "curren_page=re.sub(r'\\D','',get_curren.text)\n",
    "print('目前在討論版的第{}頁 \\n最後一頁是:{}頁'.format(get_curren.text,last_page))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-18T11:07:51.328180Z",
     "start_time": "2019-01-18T11:07:51.319053Z"
    }
   },
   "outputs": [],
   "source": [
    "page_list = []\n",
    "# 起始的頁數\n",
    "pg = int(last_download_page)\n",
    "#結束的頁數\n",
    "max_num = int(last_page)\n",
    "\n",
    "download_files = []\n",
    "error_page = []\n",
    "\n",
    "if max_num != pg:\n",
    "    for i in range(pg, max_num+1):\n",
    "        get_page = str(pg)\n",
    "        pg += 1\n",
    "        page_list.append(base_url+'thread-337-page-'+get_page+'.html')\n",
    "else:\n",
    "    page_list.append(base_url+'thread-337-page-'+str(max_num)+'.html')\n",
    "  \n",
    "#print(\"PG={}\\nmax_num={}\".format(pg,max_num))\n",
    "#print(page_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-18T01:25:21.379217Z",
     "start_time": "2019-01-18T01:25:18.917089Z"
    }
   },
   "outputs": [],
   "source": [
    "for crawling_page in page_list:\n",
    "    page_html = requests.get(crawling_page)\n",
    "    page_soup = BeautifulSoup(page_html.text, 'lxml')\n",
    "    title = page_soup.find('div', {'id': 'posts'}).find_all('fieldset')\n",
    "    if title:\n",
    "        print(\"開始從 {} 下載資料:\".format(crawling_page))\n",
    "        for i in title:\n",
    "            try:\n",
    "                #取得檔名 + 轉成小寫\n",
    "                f_name = i.find('a').text.lower()\n",
    "                # 取得檔案連結\n",
    "                f_href = base_url+i.find('a')['href']\n",
    "                #先檢查檔案是否在上次下載的檔案list裡面\n",
    "                if f_name in filelist:\n",
    "                    print(\"已有檔案: {}\".format(f_name))\n",
    "                    #有就跳出本次的迴圈，檢查下一個\n",
    "                    continue\n",
    "                else:\n",
    "                    #檢查檔案是否存在\n",
    "                    isExists = os.path.exists(os.path.join(mydir, f_name))\n",
    "                    if not isExists:\n",
    "                        # 下載檔案\n",
    "                        print(\"下載檔案:\", f_name)\n",
    "                        #本次下載的檔案清單\n",
    "                        download_files.append(f_name)\n",
    "                        #所有下載的檔案清單\n",
    "                        filelist.append(f_name)\n",
    "                        wget.download(f_href, out=os.path.join(mydir, f_name))\n",
    "                        #下載檔案後，暫時0.3秒\n",
    "                        time.sleep(0.3)\n",
    "                    else:\n",
    "                    #如果檔案存在就下載到其他資料夾\n",
    "                        print(\"已有檔案: {}\".format(f_name))\n",
    "                        #wget.download(f_href, out=os.path.join(mydir2, f_name))\n",
    "            except Exception as e:\n",
    "                #msg = 'error: {0} {1} \\n'.format(crawling_page, f_name)\n",
    "                error_page.append(e)\n",
    "                continue\n",
    "    else:\n",
    "        print(\"沒有資料:\",crawling_page)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-18T01:22:40.191446Z",
     "start_time": "2019-01-18T01:22:40.183128Z"
    }
   },
   "outputs": [],
   "source": [
    "#檢查\n",
    "#print(\"本次下載檔案{}\\n錯誤訊息:{}\".format(download_files ,error_page))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-18T01:25:25.377986Z",
     "start_time": "2019-01-18T01:25:25.370771Z"
    }
   },
   "outputs": [],
   "source": [
    "msg=''\n",
    "if len(download_files):\n",
    "    for i in download_files:\n",
    "        msg=msg+i+'\\n'\n",
    "else:\n",
    "    msg='本次沒有沒有新檔案需要下載'\n",
    "#print(msg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-18T01:25:26.428668Z",
     "start_time": "2019-01-18T01:25:26.421285Z"
    }
   },
   "outputs": [],
   "source": [
    "msg2=''\n",
    "if len(error_page):\n",
    "    for i in error_page:\n",
    "        msg2=msg2+i+'\\n'\n",
    "    print(msg2)\n",
    "else:\n",
    "    msg2='沒有錯誤訊息!!!'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-18T10:31:45.805078Z",
     "start_time": "2019-01-18T10:31:45.797679Z"
    }
   },
   "outputs": [],
   "source": [
    "#建立工作log檔\n",
    "s='================================='\n",
    "DailyReport='每日更新報告 @ {} \\n{}\\n本次下載的檔案:{}\\n錯誤訊息:{}\\n{}\\n'.format(runat,s,msg ,msg2,s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-18T10:28:29.030560Z",
     "start_time": "2019-01-18T10:28:29.021325Z"
    }
   },
   "outputs": [],
   "source": [
    "#檢查\n",
    "#DailyReport"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-18T10:23:52.315364Z",
     "start_time": "2019-01-18T10:23:52.306628Z"
    }
   },
   "outputs": [],
   "source": [
    "#filelist[0]紀錄最後抓取的頁數\n",
    "filelist[0]=str(max_num)\n",
    "#print(len(filelist), filelist)\n",
    "\n",
    "#把list寫入文字檔,更新filelist[0]的內容\n",
    "with open('download_files.txt', 'w') as f:\n",
    "    for item in filelist:\n",
    "        f.write(\"%s\\n\" % item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-18T10:32:26.656315Z",
     "start_time": "2019-01-18T10:32:26.649939Z"
    }
   },
   "outputs": [],
   "source": [
    "#以附加的方式將新增檔案的名稱寫入文字檔\n",
    "with open('DailyReport.txt', 'a+') as f:\n",
    "    for item in DailyReport:\n",
    "        f.write(\"%s\" % item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-18T10:24:26.267391Z",
     "start_time": "2019-01-18T10:24:26.258991Z"
    }
   },
   "outputs": [],
   "source": [
    "#以下用不到\n",
    "#將文字檔傳進list\n",
    "'''\n",
    "filelist=[]\n",
    "with open('download_files.txt', 'r') as f:\n",
    "    data = f.readlines()\n",
    "    for line in data:\n",
    "        filelist.append(line.strip())\n",
    "\n",
    "#filelist[0] 最後更新的頁數        \n",
    "filelist[0]=max_num\n",
    "'''"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2019-01-18T11:06:44.787548Z",
	"start_time": "2019-01-18T11:06:44.480996Z"
	}
	},
	"outputs": [],
	"source": [
	"import requests, re, os, csv, wget, time\n",
	"\n",
	"from bs4 import BeautifulSoup\n",
	"base_url = \"http://stockmarketpilipinas.com/\"\n",
	"url='http://stockmarketpilipinas.com/thread-337.html'\n",
	"#url2='http://stockmarketpilipinas.com/thread-337-page-453.html'"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2019-01-18T11:06:45.338766Z",
	"start_time": "2019-01-18T11:06:45.326742Z"
	}
	},
	"outputs": [],
	"source": [
	"runat=time.strftime(\"%Y-%m-%d %H:%M:%S\", time.localtime())\n",
	"workpath = os.getcwd()\n",
	"# 存檔路徑\n",
	"mydir = os.path.join(workpath, \"daily_csv\")\n",
	"#重覆的檔案放這邊\n",
	"mydir2 = os.path.join(workpath, \"duplicate_files\")\n",
	"DailyReport=''\n",
	"\n",
	"\n",
	"filelist=[]\n",
	"with open('download_files.txt', 'r') as f:\n",
	" data = f.readlines()\n",
	" for line in data:\n",
	" filelist.append(line.strip())\n",
	"last_download_page=filelist[0]\n",
	"#print(\"上次下載的頁面是在{}\".format(last_download_page))\n",
	"#print(len(filelist), filelist)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2019-01-18T11:06:46.678437Z",
	"start_time": "2019-01-18T11:06:46.229195Z"
	}
	},
	"outputs": [],
	"source": [
	"res=requests.get(url)\n",
	"soup=BeautifulSoup(res.content, 'html5lib')\n",
	"get_lastpage=soup.find(\"span\",{\"class\":\"pages\"})\n",
	"get_curren=soup.find(\"span\",{\"class\":\"pagination_current\"})"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2019-01-18T11:06:47.415641Z",
	"start_time": "2019-01-18T11:06:47.405565Z"
	}
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"目前在討論版的第1頁 \n",
	"最後一頁是:453頁\n"
	]
	}
	],
	"source": [
	"last_page=re.sub(r'\\D','',get_lastpage.text)\n",
	"curren_page=re.sub(r'\\D','',get_curren.text)\n",
	"print('目前在討論版的第{}頁 \\n最後一頁是:{}頁'.format(get_curren.text,last_page))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2019-01-18T11:07:51.328180Z",
	"start_time": "2019-01-18T11:07:51.319053Z"
	}
	},
	"outputs": [],
	"source": [
	"page_list = []\n",
	"# 起始的頁數\n",
	"pg = int(last_download_page)\n",
	"#結束的頁數\n",
	"max_num = int(last_page)\n",
	"\n",
	"download_files = []\n",
	"error_page = []\n",
	"\n",
	"if max_num != pg:\n",
	" for i in range(pg, max_num+1):\n",
	" get_page = str(pg)\n",
	" pg += 1\n",
	" page_list.append(base_url+'thread-337-page-'+get_page+'.html')\n",
	"else:\n",
	" page_list.append(base_url+'thread-337-page-'+str(max_num)+'.html')\n",
	" \n",
	"#print(\"PG={}\\nmax_num={}\".format(pg,max_num))\n",
	"#print(page_list)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2019-01-18T01:25:21.379217Z",
	"start_time": "2019-01-18T01:25:18.917089Z"
	}
	},
	"outputs": [],
	"source": [
	"for crawling_page in page_list:\n",
	" page_html = requests.get(crawling_page)\n",
	" page_soup = BeautifulSoup(page_html.text, 'lxml')\n",
	" title = page_soup.find('div', {'id': 'posts'}).find_all('fieldset')\n",
	" if title:\n",
	" print(\"開始從 {} 下載資料:\".format(crawling_page))\n",
	" for i in title:\n",
	" try:\n",
	" #取得檔名 + 轉成小寫\n",
	" f_name = i.find('a').text.lower()\n",
	" # 取得檔案連結\n",
	" f_href = base_url+i.find('a')['href']\n",
	" #先檢查檔案是否在上次下載的檔案list裡面\n",
	" if f_name in filelist:\n",
	" print(\"已有檔案: {}\".format(f_name))\n",
	" #有就跳出本次的迴圈，檢查下一個\n",
	" continue\n",
	" else:\n",
	" #檢查檔案是否存在\n",
	" isExists = os.path.exists(os.path.join(mydir, f_name))\n",
	" if not isExists:\n",
	" # 下載檔案\n",
	" print(\"下載檔案:\", f_name)\n",
	" #本次下載的檔案清單\n",
	" download_files.append(f_name)\n",
	" #所有下載的檔案清單\n",
	" filelist.append(f_name)\n",
	" wget.download(f_href, out=os.path.join(mydir, f_name))\n",
	" #下載檔案後，暫時0.3秒\n",
	" time.sleep(0.3)\n",
	" else:\n",
	" #如果檔案存在就下載到其他資料夾\n",
	" print(\"已有檔案: {}\".format(f_name))\n",
	" #wget.download(f_href, out=os.path.join(mydir2, f_name))\n",
	" except Exception as e:\n",
	" #msg = 'error: {0} {1} \\n'.format(crawling_page, f_name)\n",
	" error_page.append(e)\n",
	" continue\n",
	" else:\n",
	" print(\"沒有資料:\",crawling_page)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2019-01-18T01:22:40.191446Z",
	"start_time": "2019-01-18T01:22:40.183128Z"
	}
	},
	"outputs": [],
	"source": [
	"#檢查\n",
	"#print(\"本次下載檔案{}\\n錯誤訊息:{}\".format(download_files ,error_page))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2019-01-18T01:25:25.377986Z",
	"start_time": "2019-01-18T01:25:25.370771Z"
	}
	},
	"outputs": [],
	"source": [
	"msg=''\n",
	"if len(download_files):\n",
	" for i in download_files:\n",
	" msg=msg+i+'\\n'\n",
	"else:\n",
	" msg='本次沒有沒有新檔案需要下載'\n",
	"#print(msg)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2019-01-18T01:25:26.428668Z",
	"start_time": "2019-01-18T01:25:26.421285Z"
	}
	},
	"outputs": [],
	"source": [
	"msg2=''\n",
	"if len(error_page):\n",
	" for i in error_page:\n",
	" msg2=msg2+i+'\\n'\n",
	" print(msg2)\n",
	"else:\n",
	" msg2='沒有錯誤訊息!!!'"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2019-01-18T10:31:45.805078Z",
	"start_time": "2019-01-18T10:31:45.797679Z"
	}
	},
	"outputs": [],
	"source": [
	"#建立工作log檔\n",
	"s='================================='\n",
	"DailyReport='每日更新報告 @ {} \\n{}\\n本次下載的檔案:{}\\n錯誤訊息:{}\\n{}\\n'.format(runat,s,msg ,msg2,s)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2019-01-18T10:28:29.030560Z",
	"start_time": "2019-01-18T10:28:29.021325Z"
	}
	},
	"outputs": [],
	"source": [
	"#檢查\n",
	"#DailyReport"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2019-01-18T10:23:52.315364Z",
	"start_time": "2019-01-18T10:23:52.306628Z"
	}
	},
	"outputs": [],
	"source": [
	"#filelist[0]紀錄最後抓取的頁數\n",
	"filelist[0]=str(max_num)\n",
	"#print(len(filelist), filelist)\n",
	"\n",
	"#把list寫入文字檔,更新filelist[0]的內容\n",
	"with open('download_files.txt', 'w') as f:\n",
	" for item in filelist:\n",
	" f.write(\"%s\\n\" % item)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2019-01-18T10:32:26.656315Z",
	"start_time": "2019-01-18T10:32:26.649939Z"
	}
	},
	"outputs": [],
	"source": [
	"#以附加的方式將新增檔案的名稱寫入文字檔\n",
	"with open('DailyReport.txt', 'a+') as f:\n",
	" for item in DailyReport:\n",
	" f.write(\"%s\" % item)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"ExecuteTime": {
	"end_time": "2019-01-18T10:24:26.267391Z",
	"start_time": "2019-01-18T10:24:26.258991Z"
	}
	},
	"outputs": [],
	"source": [
	"#以下用不到\n",
	"#將文字檔傳進list\n",
	"'''\n",
	"filelist=[]\n",
	"with open('download_files.txt', 'r') as f:\n",
	" data = f.readlines()\n",
	" for line in data:\n",
	" filelist.append(line.strip())\n",
	"\n",
	"#filelist[0] 最後更新的頁數 \n",
	"filelist[0]=max_num\n",
	"'''"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.5"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}