Skip to content

Instantly share code, notes, and snippets.

@rubyu
Created November 10, 2017 23:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rubyu/0560e7e1958cc35c1f9771791822d3ef to your computer and use it in GitHub Desktop.
Save rubyu/0560e7e1958cc35c1f9771791822d3ef to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! conda create -y -n request-crawler python=3.6 jupyter requests tqdm bs4 && conda clean -tipsy"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"asn1crypto==0.22.0\r\n",
"beautifulsoup4==4.6.0\r\n",
"bleach==2.0.0\r\n",
"bs4==0.0.1\r\n",
"certifi==2017.7.27.1\r\n",
"cffi==1.10.0\r\n",
"chardet==3.0.4\r\n",
"cryptography==2.0.3\r\n",
"decorator==4.1.2\r\n",
"entrypoints==0.2.3\r\n",
"html5lib==0.999999999\r\n",
"idna==2.6\r\n",
"ipykernel==4.6.1\r\n",
"ipython==6.2.1\r\n",
"ipython-genutils==0.2.0\r\n",
"ipywidgets==7.0.3\r\n",
"jedi==0.10.2\r\n",
"Jinja2==2.9.6\r\n",
"jsonschema==2.6.0\r\n",
"jupyter-client==5.1.0\r\n",
"jupyter-console==5.2.0\r\n",
"jupyter-core==4.4.0\r\n",
"MarkupSafe==1.0\r\n",
"mistune==0.8\r\n",
"nbconvert==5.3.1\r\n",
"nbformat==4.4.0\r\n",
"notebook==5.2.1\r\n",
"pandocfilters==1.4.1\r\n",
"pexpect==4.2.1\r\n",
"pickleshare==0.7.4\r\n",
"prompt-toolkit==1.0.15\r\n",
"ptyprocess==0.5.2\r\n",
"pycparser==2.18\r\n",
"Pygments==2.2.0\r\n",
"pyOpenSSL==17.2.0\r\n",
"PySocks==1.6.7\r\n",
"python-dateutil==2.6.1\r\n",
"pyzmq==16.0.2\r\n",
"qtconsole==4.3.1\r\n",
"requests==2.18.4\r\n",
"simplegeneric==0.8.1\r\n",
"six==1.11.0\r\n",
"terminado==0.6\r\n",
"testpath==0.3.1\r\n",
"tornado==4.5.2\r\n",
"tqdm==4.19.4\r\n",
"traitlets==4.3.2\r\n",
"urllib3==1.22\r\n",
"wcwidth==0.1.7\r\n",
"webencodings==0.5\r\n",
"widgetsnbextension==3.0.6\r\n"
]
}
],
"source": [
"! pip freeze"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2017-11-10 17:59:49-- https://s3.amazonaws.com/JP_AM/JP_monthly_top1000/201709_monthly/Top1000_14.txt\r\n",
"Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.225.115\r\n",
"Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.225.115|:443... connected.\r\n",
"HTTP request sent, awaiting response... 200 OK\r\n",
"Length: 86309 (84K) [text/plain]\r\n",
"Saving to: ‘Top1000_14.txt’\r\n",
"\r\n",
"\r",
"Top1000_14.txt 0%[ ] 0 --.-KB/s \r",
"Top1000_14.txt 29%[====> ] 24.59K 65.5KB/s \r",
"Top1000_14.txt 69%[============> ] 58.59K 78.0KB/s \r",
"Top1000_14.txt 100%[===================>] 84.29K 112KB/s in 0.8s \r\n",
"\r\n",
"2017-11-10 17:59:50 (112 KB/s) - ‘Top1000_14.txt’ saved [86309/86309]\r\n",
"\r\n"
]
}
],
"source": [
"! wget \"https://s3.amazonaws.com/JP_AM/JP_monthly_top1000/201709_monthly/Top1000_14.txt\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"logging.basicConfig(\n",
" level=logging.ERROR,\n",
" format=\"%(levelname)-8s %(module)-16s %(funcName)-16s@%(lineno)d - %(message)s\"\n",
")\n",
"\n",
"def storage_dir():\n",
" from pathlib import Path\n",
" return Path(\"./amazon-jp-ranking\")\n",
"\n",
"def to_file_url(year, month, category_id):\n",
" return f\"https://s3.amazonaws.com/JP_AM/JP_monthly_top1000/{year}{month:0>2}_monthly/Top1000_{category_id}.txt\"\n",
"\n",
"def to_file_path(filename):\n",
" return storage_dir() / filename\n",
"\n",
"def wait(min_wait, max_wait):\n",
" import random\n",
" import time\n",
" t = random.uniform(min_wait, max_wait)\n",
" logging.debug(\"waiting %s sec\", t)\n",
" time.sleep(t)\n",
" \n",
"def save_remote_file(url, file):\n",
" import os\n",
" import requests\n",
" if (file.exists()):\n",
" logging.debug(f\"{file} is already exists\")\n",
" return\n",
" r = requests.get(url, stream=True)\n",
" if r.status_code != 200:\n",
" raise Exception()\n",
" with file.open(mode=\"wb\") as f:\n",
" for chunk in r:\n",
" f.write(chunk)\n",
" \n",
"def init():\n",
" sdir = storage_dir()\n",
" if not sdir.exists():\n",
" sdir.mkdir()\n",
" for cat_name, cat_id in categories():\n",
" cat_dir = storage_dir() / f\"{cat_id}-{cat_name}\"\n",
" if not cat_dir.exists():\n",
" cat_dir.mkdir()\n",
" \n",
"def categories():\n",
" return [\n",
" (\"本\", \"14\"),\n",
" (\"ミュージック\", \"15\"),\n",
" (\"おもちゃ&ホビー\", \"21\"),\n",
" (\"AV機器・携帯電話\", \"23\"),\n",
" (\"DIY・工具\", \"60\"),\n",
" (\"TVゲーム\", \"63\"),\n",
" (\"PCソフト\", \"65\"),\n",
" (\"DVD\", \"74\"),\n",
" (\"ベビー・マタニティ用品\", \"75\"),\n",
" (\"ホーム&キッチン\", \"79\"),\n",
" (\"ドラッグストア\", \"121\"),\n",
" (\"パソコン・周辺機器\", \"147\"),\n",
" (\"アパレル\", \"193\"),\n",
" (\"コスメ\", \"194\"),\n",
" (\"ジュエリー\", \"197\"),\n",
" (\"ペット用品\", \"199\"),\n",
" (\"スポーツ\", \"200\"),\n",
" (\"文房具・オフィス用品\", \"229\"),\n",
" (\"時計\", \"241\"),\n",
" (\"カー&バイク用品\", \"263\"),\n",
" (\"楽器\", \"267\"),\n",
" (\"シューズ&バッグ\", \"309\"),\n",
" (\"食品&飲料\", \"325\"),\n",
" (\"カメラ\", \"421\"),\n",
" ]\n",
" \n",
"def crawl_start():\n",
" init()\n",
" for year in range(2011, 2018):\n",
" for month in range(1, 13):\n",
" for cat_name, cat_id in categories():\n",
" cat_dir = storage_dir() / f\"{cat_id}-{cat_name}\"\n",
" url = to_file_url(year, month, cat_id)\n",
" filename = f\"Monthly-Top1000-{year}-{month:0>2}-{cat_id}.tsv\"\n",
" try:\n",
" print(url)\n",
" print(cat_dir / filename)\n",
" save_remote_file(url, cat_dir / filename)\n",
" except Exception as err:\n",
" print(err)\n",
" wait(1, 5)\n",
"crawl_start()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Environment (conda_request-crawler)",
"language": "python",
"name": "conda_request-crawler"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment