Skip to content

Instantly share code, notes, and snippets.

@rs6000
Created January 12, 2019 21:52
Show Gist options
  • Save rs6000/ebf44b8f5aee07c934a09e8f8bea4b55 to your computer and use it in GitHub Desktop.
Save rs6000/ebf44b8f5aee07c934a09e8f8bea4b55 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-01-12T21:33:15.882963Z",
"start_time": "2019-01-12T21:33:15.336780Z"
}
},
"outputs": [],
"source": [
"import os\n",
"import re\n",
"import requests\n",
"import shutil\n",
"import wget\n",
"import time\n",
"from bs4 import BeautifulSoup\n",
"import pandas as pd\n",
"from pandas import DataFrame \n",
"import sqlite3 as lite\n",
"import csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-01-12T21:33:16.142673Z",
"start_time": "2019-01-12T21:33:16.134738Z"
}
},
"outputs": [],
"source": [
"workpath = os.getcwd()\n",
"# 存檔路徑\n",
"mydir = os.path.join(workpath, \"save\")\n",
"#重覆的檔案放這邊\n",
"mydir2 = os.path.join(workpath, \"duplicate_files\")\n",
"'''\n",
"頁數起點到終點\n",
"http://stockmarketpilipinas.com/thread-337-page-2.html\n",
"http://stockmarketpilipinas.com/thread-337-page-452.html\n",
"'''\n",
"base_url = \"http://stockmarketpilipinas.com/\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-01-12T21:33:25.886144Z",
"start_time": "2019-01-12T21:33:25.877814Z"
}
},
"outputs": [],
"source": [
"# 頁數\n",
"page_list = []\n",
"# 起始的頁數\n",
"pg = 2\n",
"#結束的頁數\n",
"max_num = 452\n",
"\n",
"#產生要抓取的網址清單\n",
"for i in range(2, max_num+1):\n",
" get_page = str(pg)\n",
" pg += 1\n",
" page_list.append(base_url+'thread-337-page-'+get_page+'.html')\n",
"\n",
"# 測試\n",
"# print(len(page_list),page_list)\n",
"download_files = []\n",
"error_page = []"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-01-12T21:45:00.153430Z",
"start_time": "2019-01-12T21:33:32.878308Z"
}
},
"outputs": [],
"source": [
"for crawling_page in page_list:\n",
" page_html = requests.get(crawling_page)\n",
" page_soup = BeautifulSoup(page_html.text, 'lxml')\n",
" title = page_soup.find('div', {'id': 'posts'}).find_all('fieldset')\n",
" if title:\n",
" print(\"開始從 {} 下載資料:\".format(crawling_page))\n",
" for i in title:\n",
" try:\n",
" #取得檔名 + 轉成小寫\n",
" f_name = i.find('a').text.lower()\n",
" # 取得檔案連結\n",
" f_href = base_url+i.find('a')['href']\n",
" #檢查檔案是否存在\n",
" isExists = os.path.exists(os.path.join(mydir, f_name))\n",
" if not isExists:\n",
" # 下載檔案\n",
" print(\"下載檔案:\", f_name)\n",
" download_files.append(f_name)\n",
" wget.download(f_href, out=os.path.join(mydir, f_name))\n",
" #下載檔案後,暫停0.3秒\n",
" time.sleep(0.3)\n",
" else:\n",
" #如果檔案存在就下載到其他資料夾\n",
" print(\"已有檔案: {}\".format(f_name))\n",
" wget.download(f_href, out=os.path.join(mydir2, f_name))\n",
" except:\n",
" msg = 'error: {0} {1} \\n'.format(crawling_page, f_name)\n",
" error_page.append(msg)\n",
" pass\n",
" else:\n",
" print(\"沒有資料:\",crawling_page)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-01-12T21:47:23.443295Z",
"start_time": "2019-01-12T21:47:23.434813Z"
}
},
"outputs": [],
"source": [
"#印出已下載的檔案列表\n",
"print(download_files)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-01-08T04:01:49.181028Z",
"start_time": "2019-01-08T04:01:49.171557Z"
}
},
"outputs": [],
"source": [
"#其他功能測試區,請忽略以下程式\n",
"file = open('files.txt', 'w') # w 寫入 a 附加 \n",
"newlines=[str(line)+'\\n' for line in download_files] \n",
"file.writelines(newlines)\n",
"file.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-01-08T04:00:31.982984Z",
"start_time": "2019-01-08T04:00:31.904588Z"
}
},
"outputs": [],
"source": [
"print(\"下載檔案數量:{}\".format(len(download_files)))\n",
"for f in download_files:\n",
" totalCount = re.sub(\"\\D\", \"\", f)\n",
" if len(totalCount) > 6:\n",
" print(totalCount)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-01-07T06:10:32.396094Z",
"start_time": "2019-01-07T06:10:32.388663Z"
}
},
"outputs": [],
"source": [
"os.makedirs(\"/root/aa/b2\")\n",
"t_path=os.getcwd()\n",
"#print(t_path)\n",
"saveto=os.path.join(t_path,\"2019\",\"01\")\n",
"#print(saveto)\n",
"os.makedirs(saveto)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment