Created
November 10, 2017 23:41
-
-
Save rubyu/0560e7e1958cc35c1f9771791822d3ef to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"! conda create -y -n request-crawler python=3.6 jupyter requests tqdm bs4 && conda clean -tipsy" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"asn1crypto==0.22.0\r\n", | |
"beautifulsoup4==4.6.0\r\n", | |
"bleach==2.0.0\r\n", | |
"bs4==0.0.1\r\n", | |
"certifi==2017.7.27.1\r\n", | |
"cffi==1.10.0\r\n", | |
"chardet==3.0.4\r\n", | |
"cryptography==2.0.3\r\n", | |
"decorator==4.1.2\r\n", | |
"entrypoints==0.2.3\r\n", | |
"html5lib==0.999999999\r\n", | |
"idna==2.6\r\n", | |
"ipykernel==4.6.1\r\n", | |
"ipython==6.2.1\r\n", | |
"ipython-genutils==0.2.0\r\n", | |
"ipywidgets==7.0.3\r\n", | |
"jedi==0.10.2\r\n", | |
"Jinja2==2.9.6\r\n", | |
"jsonschema==2.6.0\r\n", | |
"jupyter-client==5.1.0\r\n", | |
"jupyter-console==5.2.0\r\n", | |
"jupyter-core==4.4.0\r\n", | |
"MarkupSafe==1.0\r\n", | |
"mistune==0.8\r\n", | |
"nbconvert==5.3.1\r\n", | |
"nbformat==4.4.0\r\n", | |
"notebook==5.2.1\r\n", | |
"pandocfilters==1.4.1\r\n", | |
"pexpect==4.2.1\r\n", | |
"pickleshare==0.7.4\r\n", | |
"prompt-toolkit==1.0.15\r\n", | |
"ptyprocess==0.5.2\r\n", | |
"pycparser==2.18\r\n", | |
"Pygments==2.2.0\r\n", | |
"pyOpenSSL==17.2.0\r\n", | |
"PySocks==1.6.7\r\n", | |
"python-dateutil==2.6.1\r\n", | |
"pyzmq==16.0.2\r\n", | |
"qtconsole==4.3.1\r\n", | |
"requests==2.18.4\r\n", | |
"simplegeneric==0.8.1\r\n", | |
"six==1.11.0\r\n", | |
"terminado==0.6\r\n", | |
"testpath==0.3.1\r\n", | |
"tornado==4.5.2\r\n", | |
"tqdm==4.19.4\r\n", | |
"traitlets==4.3.2\r\n", | |
"urllib3==1.22\r\n", | |
"wcwidth==0.1.7\r\n", | |
"webencodings==0.5\r\n", | |
"widgetsnbextension==3.0.6\r\n" | |
] | |
} | |
], | |
"source": [ | |
"! pip freeze" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"--2017-11-10 17:59:49-- https://s3.amazonaws.com/JP_AM/JP_monthly_top1000/201709_monthly/Top1000_14.txt\r\n", | |
"Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.225.115\r\n", | |
"Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.225.115|:443... connected.\r\n", | |
"HTTP request sent, awaiting response... 200 OK\r\n", | |
"Length: 86309 (84K) [text/plain]\r\n", | |
"Saving to: ‘Top1000_14.txt’\r\n", | |
"\r\n", | |
"\r", | |
"Top1000_14.txt 0%[ ] 0 --.-KB/s \r", | |
"Top1000_14.txt 29%[====> ] 24.59K 65.5KB/s \r", | |
"Top1000_14.txt 69%[============> ] 58.59K 78.0KB/s \r", | |
"Top1000_14.txt 100%[===================>] 84.29K 112KB/s in 0.8s \r\n", | |
"\r\n", | |
"2017-11-10 17:59:50 (112 KB/s) - ‘Top1000_14.txt’ saved [86309/86309]\r\n", | |
"\r\n" | |
] | |
} | |
], | |
"source": [ | |
"! wget \"https://s3.amazonaws.com/JP_AM/JP_monthly_top1000/201709_monthly/Top1000_14.txt\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import logging\n", | |
"logging.basicConfig(\n", | |
" level=logging.ERROR,\n", | |
" format=\"%(levelname)-8s %(module)-16s %(funcName)-16s@%(lineno)d - %(message)s\"\n", | |
")\n", | |
"\n", | |
"def storage_dir():\n", | |
" from pathlib import Path\n", | |
" return Path(\"./amazon-jp-ranking\")\n", | |
"\n", | |
"def to_file_url(year, month, category_id):\n", | |
" return f\"https://s3.amazonaws.com/JP_AM/JP_monthly_top1000/{year}{month:0>2}_monthly/Top1000_{category_id}.txt\"\n", | |
"\n", | |
"def to_file_path(filename):\n", | |
" return storage_dir() / filename\n", | |
"\n", | |
"def wait(min_wait, max_wait):\n", | |
" import random\n", | |
" import time\n", | |
" t = random.uniform(min_wait, max_wait)\n", | |
" logging.debug(\"waiting %s sec\", t)\n", | |
" time.sleep(t)\n", | |
" \n", | |
"def save_remote_file(url, file):\n", | |
" import os\n", | |
" import requests\n", | |
" if (file.exists()):\n", | |
" logging.debug(f\"{file} is already exists\")\n", | |
" return\n", | |
" r = requests.get(url, stream=True)\n", | |
" if r.status_code != 200:\n", | |
" raise Exception()\n", | |
" with file.open(mode=\"wb\") as f:\n", | |
" for chunk in r:\n", | |
" f.write(chunk)\n", | |
" \n", | |
"def init():\n", | |
" sdir = storage_dir()\n", | |
" if not sdir.exists():\n", | |
" sdir.mkdir()\n", | |
" for cat_name, cat_id in categories():\n", | |
" cat_dir = storage_dir() / f\"{cat_id}-{cat_name}\"\n", | |
" if not cat_dir.exists():\n", | |
" cat_dir.mkdir()\n", | |
" \n", | |
"def categories():\n", | |
" return [\n", | |
" (\"本\", \"14\"),\n", | |
" (\"ミュージック\", \"15\"),\n", | |
" (\"おもちゃ&ホビー\", \"21\"),\n", | |
" (\"AV機器・携帯電話\", \"23\"),\n", | |
" (\"DIY・工具\", \"60\"),\n", | |
" (\"TVゲーム\", \"63\"),\n", | |
" (\"PCソフト\", \"65\"),\n", | |
" (\"DVD\", \"74\"),\n", | |
" (\"ベビー・マタニティ用品\", \"75\"),\n", | |
" (\"ホーム&キッチン\", \"79\"),\n", | |
" (\"ドラッグストア\", \"121\"),\n", | |
" (\"パソコン・周辺機器\", \"147\"),\n", | |
" (\"アパレル\", \"193\"),\n", | |
" (\"コスメ\", \"194\"),\n", | |
" (\"ジュエリー\", \"197\"),\n", | |
" (\"ペット用品\", \"199\"),\n", | |
" (\"スポーツ\", \"200\"),\n", | |
" (\"文房具・オフィス用品\", \"229\"),\n", | |
" (\"時計\", \"241\"),\n", | |
" (\"カー&バイク用品\", \"263\"),\n", | |
" (\"楽器\", \"267\"),\n", | |
" (\"シューズ&バッグ\", \"309\"),\n", | |
" (\"食品&飲料\", \"325\"),\n", | |
" (\"カメラ\", \"421\"),\n", | |
" ]\n", | |
" \n", | |
"def crawl_start():\n", | |
" init()\n", | |
" for year in range(2011, 2018):\n", | |
" for month in range(1, 13):\n", | |
" for cat_name, cat_id in categories():\n", | |
" cat_dir = storage_dir() / f\"{cat_id}-{cat_name}\"\n", | |
" url = to_file_url(year, month, cat_id)\n", | |
" filename = f\"Monthly-Top1000-{year}-{month:0>2}-{cat_id}.tsv\"\n", | |
" try:\n", | |
" print(url)\n", | |
" print(cat_dir / filename)\n", | |
" save_remote_file(url, cat_dir / filename)\n", | |
" except Exception as err:\n", | |
" print(err)\n", | |
" wait(1, 5)\n", | |
"crawl_start()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Environment (conda_request-crawler)", | |
"language": "python", | |
"name": "conda_request-crawler" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment