Last active
May 6, 2019 01:03
-
-
Save sorami/c3c3ec33147751555de5c98be996f9b6 to your computer and use it in GitHub Desktop.
ちくまプリマー新書 一覧情報取得
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import time\n", | |
"\n", | |
"import requests\n", | |
"from lxml import html\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"http://www.chikumashobo.co.jp/search/result?p=&g=&a=&t=&k=402&s=&isbn=&order=&v=&desc=false\n", | |
"http://www.chikumashobo.co.jp/search/result?p=&k=402&s=&t=&a=&g=&isbn=&order=&v=&page=2&desc=false\n", | |
"skip: プリマー新書A全20冊\n", | |
"http://www.chikumashobo.co.jp/search/result?p=&k=402&s=&t=&a=&g=&isbn=&order=&v=&page=3&desc=false\n", | |
"http://www.chikumashobo.co.jp/search/result?p=&k=402&s=&t=&a=&g=&isbn=&order=&v=&page=4&desc=false\n", | |
"http://www.chikumashobo.co.jp/search/result?p=&k=402&s=&t=&a=&g=&isbn=&order=&v=&page=5&desc=false\n", | |
"http://www.chikumashobo.co.jp/search/result?p=&k=402&s=&t=&a=&g=&isbn=&order=&v=&page=6&desc=false\n", | |
"skip: プリマー新書D全20冊\n", | |
"skip: プリマー新書E全20冊\n", | |
"http://www.chikumashobo.co.jp/search/result?p=&k=402&s=&t=&a=&g=&isbn=&order=&v=&page=7&desc=false\n", | |
"http://www.chikumashobo.co.jp/search/result?p=&k=402&s=&t=&a=&g=&isbn=&order=&v=&page=8&desc=false\n", | |
"skip: 新書で学ぶ 中学生からの教養 全14冊\n", | |
"http://www.chikumashobo.co.jp/search/result?p=&k=402&s=&t=&a=&g=&isbn=&order=&v=&page=9&desc=false\n", | |
"skip: プリマー新書G全20冊\n", | |
"http://www.chikumashobo.co.jp/search/result?p=&k=402&s=&t=&a=&g=&isbn=&order=&v=&page=10&desc=false\n", | |
"http://www.chikumashobo.co.jp/search/result?p=&k=402&s=&t=&a=&g=&isbn=&order=&v=&page=11&desc=false\n", | |
"http://www.chikumashobo.co.jp/search/result?p=&k=402&s=&t=&a=&g=&isbn=&order=&v=&page=12&desc=false\n", | |
"http://www.chikumashobo.co.jp/search/result?p=&k=402&s=&t=&a=&g=&isbn=&order=&v=&page=13&desc=false\n", | |
"skip: 中学生からの大学講義 全5巻セット\n", | |
"http://www.chikumashobo.co.jp/search/result?p=&k=402&s=&t=&a=&g=&isbn=&order=&v=&page=14&desc=false\n", | |
"http://www.chikumashobo.co.jp/search/result?p=&k=402&s=&t=&a=&g=&isbn=&order=&v=&page=15&desc=false\n", | |
"http://www.chikumashobo.co.jp/search/result?p=&k=402&s=&t=&a=&g=&isbn=&order=&v=&page=16&desc=false\n", | |
"skip: 続・中学生からの大学講義セット\n", | |
"http://www.chikumashobo.co.jp/search/result?p=&k=402&s=&t=&a=&g=&isbn=&order=&v=&page=17&desc=false\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>title</th>\n", | |
" <th>url</th>\n", | |
" <th>author_list</th>\n", | |
" <th>date</th>\n", | |
" <th>isbn</th>\n", | |
" <th>jan_code</th>\n", | |
" <th>pages</th>\n", | |
" <th>price</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>ちゃんと話すための敬語の本</td>\n", | |
" <td>http://www.chikumashobo.co.jp/product/97844806...</td>\n", | |
" <td>[[橋本 治, http://www.chikumashobo.co.jp/author/0...</td>\n", | |
" <td>2005/01/25</td>\n", | |
" <td>4-480-68701-7</td>\n", | |
" <td>9784480687012</td>\n", | |
" <td>128頁</td>\n", | |
" <td>定価:本体680 円+税</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>先生はえらい</td>\n", | |
" <td>http://www.chikumashobo.co.jp/product/97844806...</td>\n", | |
" <td>[[内田 樹, http://www.chikumashobo.co.jp/author/0...</td>\n", | |
" <td>2005/01/25</td>\n", | |
" <td>978-4-480-68702-9</td>\n", | |
" <td>9784480687029</td>\n", | |
" <td>176頁</td>\n", | |
" <td>定価:本体780 円+税</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>死んだらどうなるの?</td>\n", | |
" <td>http://www.chikumashobo.co.jp/product/97844806...</td>\n", | |
" <td>[[玄侑 宗久, http://www.chikumashobo.co.jp/author/...</td>\n", | |
" <td>2005/01/25</td>\n", | |
" <td>4-480-68703-3</td>\n", | |
" <td>9784480687036</td>\n", | |
" <td>160頁</td>\n", | |
" <td>定価:本体760 円+税</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>熱烈応援!スポーツ天国</td>\n", | |
" <td>http://www.chikumashobo.co.jp/product/97844806...</td>\n", | |
" <td>[[最相 葉月, http://www.chikumashobo.co.jp/author/...</td>\n", | |
" <td>2005/01/25</td>\n", | |
" <td>4-480-68704-1</td>\n", | |
" <td>9784480687043</td>\n", | |
" <td>160頁</td>\n", | |
" <td>定価:本体720 円+税</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>事物はじまりの物語</td>\n", | |
" <td>http://www.chikumashobo.co.jp/product/97844806...</td>\n", | |
" <td>[[吉村 昭, http://www.chikumashobo.co.jp/author/0...</td>\n", | |
" <td>2005/01/25</td>\n", | |
" <td>4-480-68705-X</td>\n", | |
" <td>9784480687050</td>\n", | |
" <td>128頁</td>\n", | |
" <td>定価:本体680 円+税</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" title url \\\n", | |
"0 ちゃんと話すための敬語の本 http://www.chikumashobo.co.jp/product/97844806... \n", | |
"1 先生はえらい http://www.chikumashobo.co.jp/product/97844806... \n", | |
"2 死んだらどうなるの? http://www.chikumashobo.co.jp/product/97844806... \n", | |
"3 熱烈応援!スポーツ天国 http://www.chikumashobo.co.jp/product/97844806... \n", | |
"4 事物はじまりの物語 http://www.chikumashobo.co.jp/product/97844806... \n", | |
"\n", | |
" author_list date \\\n", | |
"0 [[橋本 治, http://www.chikumashobo.co.jp/author/0... 2005/01/25 \n", | |
"1 [[内田 樹, http://www.chikumashobo.co.jp/author/0... 2005/01/25 \n", | |
"2 [[玄侑 宗久, http://www.chikumashobo.co.jp/author/... 2005/01/25 \n", | |
"3 [[最相 葉月, http://www.chikumashobo.co.jp/author/... 2005/01/25 \n", | |
"4 [[吉村 昭, http://www.chikumashobo.co.jp/author/0... 2005/01/25 \n", | |
"\n", | |
" isbn jan_code pages price \n", | |
"0 4-480-68701-7 9784480687012 128頁 定価:本体680 円+税 \n", | |
"1 978-4-480-68702-9 9784480687029 176頁 定価:本体780 円+税 \n", | |
"2 4-480-68703-3 9784480687036 160頁 定価:本体760 円+税 \n", | |
"3 4-480-68704-1 9784480687043 160頁 定価:本体720 円+税 \n", | |
"4 4-480-68705-X 9784480687050 128頁 定価:本体680 円+税 " | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"l4df = []\n", | |
"search_url = \"http://www.chikumashobo.co.jp/search/result?p=&g=&a=&t=&k=402&s=&isbn=&order=&v=&desc=false\"\n", | |
"\n", | |
"for i in range(100):\n", | |
" print(search_url)\n", | |
" r = requests.get(search_url)\n", | |
" assert r.status_code == 200, (url, r.status_code)\n", | |
" dom = html.fromstring(r.content, \"lxml\")\n", | |
"\n", | |
"\n", | |
" for el in dom.cssselect(\"#result\"):\n", | |
" assert len(el.cssselect(\".title\")) == 1, el.cssselect(\".title\")\n", | |
" title = el.cssselect(\".title>a\")[0].text.strip()\n", | |
" book_url = \"http://www.chikumashobo.co.jp\" + el.cssselect(\".title>a\")[0].attrib[\"href\"]\n", | |
" \n", | |
" if \"全20冊\" in title or \"全14冊\" in title or \"全5巻セット\" in title or \"大学講義セット\" in title:\n", | |
" print(f\"skip: {title}\")\n", | |
" continue\n", | |
"\n", | |
" assert len(el.cssselect(\".title + * > *\")) > 0, title\n", | |
" author_list = []\n", | |
" for author_el in el.cssselect(\".title + * > *\"):\n", | |
" author = author_el.text\n", | |
" author_url = \"http://www.chikumashobo.co.jp\" + author_el.attrib[\"href\"]\n", | |
" author_list.append( [author, author_url] )\n", | |
"\n", | |
" assert len(el.cssselect(\".date\")) == 1, el.cssselect(\".date\")\n", | |
" desc = el.cssselect(\".date\")[0].text_content()\n", | |
" assert len(desc.split(\"\\n\")) == 6, desc.split(\"\\n\")\n", | |
" _, _, _, pages, _, etc = desc.split(\"\\n\")\n", | |
" pages = pages.strip()\n", | |
" _, date, _, etc2, jan_code = etc.split()\n", | |
" isbn = etc2.replace(\"JANコード\", \"\")\n", | |
"\n", | |
" assert len(el.cssselect(\".price\")) == 1, el.cssselect(\".price\")\n", | |
" price = el.cssselect(\".price\")[0].text\n", | |
"\n", | |
" l4df.append([title, book_url, author_list, date, isbn, jan_code, pages, price])\n", | |
" \n", | |
" if dom.cssselect(\"#more-result *.next *\")[-1].text != \"次へ\":\n", | |
" break\n", | |
" search_url = \"http://www.chikumashobo.co.jp\" + dom.cssselect(\"#more-result *.next *\")[-1].attrib[\"href\"]\n", | |
" time.sleep(2)\n", | |
"\n", | |
" \n", | |
"df = pd.DataFrame(l4df, columns=[\"title\", \"url\", \"author_list\", \n", | |
" \"date\", \"isbn\", \"jan_code\", \"pages\", \"price\"])\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df.to_csv(\"all.csv\", index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"title,url,author_list,date,isbn,jan_code,pages,price\n", | |
"ちゃんと話すための敬語の本,http://www.chikumashobo.co.jp/product/9784480687012/,\"[['橋本 治', 'http://www.chikumashobo.co.jp/author/000967/']]\",2005/01/25,4-480-68701-7,9784480687012,128頁,定価:本体680 円+税\n", | |
"先生はえらい,http://www.chikumashobo.co.jp/product/9784480687029/,\"[['内田 樹', 'http://www.chikumashobo.co.jp/author/002177/']]\",2005/01/25,978-4-480-68702-9,9784480687029,176頁,定価:本体780 円+税\n", | |
"死んだらどうなるの?,http://www.chikumashobo.co.jp/product/9784480687036/,\"[['玄侑 宗久', 'http://www.chikumashobo.co.jp/author/001026/']]\",2005/01/25,4-480-68703-3,9784480687036,160頁,定価:本体760 円+税\n", | |
"熱烈応援!スポーツ天国,http://www.chikumashobo.co.jp/product/9784480687043/,\"[['最相 葉月', 'http://www.chikumashobo.co.jp/author/001185/']]\",2005/01/25,4-480-68704-1,9784480687043,160頁,定価:本体720 円+税\n", | |
"事物はじまりの物語,http://www.chikumashobo.co.jp/product/9784480687050/,\"[['吉村 昭', 'http://www.chikumashobo.co.jp/author/000892/']]\",2005/01/25,4-480-68705-X,9784480687050,128頁,定価:本体680 円+税\n", | |
"勉強ができなくても恥ずかしくない 1 ─どうしよう・・・の巻,http://www.chikumashobo.co.jp/product/9784480687067/,\"[['橋本 治', 'http://www.chikumashobo.co.jp/author/000967/']]\",2005/03/07,4-480-68706-8,9784480687067,112頁,定価:本体680 円+税\n", | |
"学校で教えない性教育の本,http://www.chikumashobo.co.jp/product/9784480687098/,\"[['河野 美香', 'http://www.chikumashobo.co.jp/author/000789/']]\",2005/03/07,4-480-68709-2,9784480687098,128頁,定価:本体680 円+税\n", | |
"奇跡を起こした村のはなし,http://www.chikumashobo.co.jp/product/9784480687104/,\"[['吉岡 忍', 'http://www.chikumashobo.co.jp/author/000880/']]\",2005/03/07,4-480-68710-6,9784480687104,176頁,定価:本体760 円+税\n", | |
"勉強ができなくても恥ずかしくない 2 ─やっちまえ!の巻,http://www.chikumashobo.co.jp/product/9784480687074/,\"[['橋本 治', 'http://www.chikumashobo.co.jp/author/000967/']]\",2005/04/05,4-480-68707-6,9784480687074,112頁,定価:本体680 円+税\n" | |
] | |
} | |
], | |
"source": [ | |
"!head all.csv" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>title</th>\n", | |
" <th>authors</th>\n", | |
" <th>date</th>\n", | |
" <th>pages</th>\n", | |
" <th>price</th>\n", | |
" <th>isbn</th>\n", | |
" <th>url</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>ちゃんと話すための敬語の本</td>\n", | |
" <td>橋本 治</td>\n", | |
" <td>2005/01/25</td>\n", | |
" <td>128</td>\n", | |
" <td>680</td>\n", | |
" <td>4-480-68701-7</td>\n", | |
" <td>http://www.chikumashobo.co.jp/product/97844806...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>先生はえらい</td>\n", | |
" <td>内田 樹</td>\n", | |
" <td>2005/01/25</td>\n", | |
" <td>176</td>\n", | |
" <td>780</td>\n", | |
" <td>978-4-480-68702-9</td>\n", | |
" <td>http://www.chikumashobo.co.jp/product/97844806...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>死んだらどうなるの?</td>\n", | |
" <td>玄侑 宗久</td>\n", | |
" <td>2005/01/25</td>\n", | |
" <td>160</td>\n", | |
" <td>760</td>\n", | |
" <td>4-480-68703-3</td>\n", | |
" <td>http://www.chikumashobo.co.jp/product/97844806...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>熱烈応援!スポーツ天国</td>\n", | |
" <td>最相 葉月</td>\n", | |
" <td>2005/01/25</td>\n", | |
" <td>160</td>\n", | |
" <td>720</td>\n", | |
" <td>4-480-68704-1</td>\n", | |
" <td>http://www.chikumashobo.co.jp/product/97844806...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>事物はじまりの物語</td>\n", | |
" <td>吉村 昭</td>\n", | |
" <td>2005/01/25</td>\n", | |
" <td>128</td>\n", | |
" <td>680</td>\n", | |
" <td>4-480-68705-X</td>\n", | |
" <td>http://www.chikumashobo.co.jp/product/97844806...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" title authors date pages price isbn \\\n", | |
"0 ちゃんと話すための敬語の本 橋本 治 2005/01/25 128 680 4-480-68701-7 \n", | |
"1 先生はえらい 内田 樹 2005/01/25 176 780 978-4-480-68702-9 \n", | |
"2 死んだらどうなるの? 玄侑 宗久 2005/01/25 160 760 4-480-68703-3 \n", | |
"3 熱烈応援!スポーツ天国 最相 葉月 2005/01/25 160 720 4-480-68704-1 \n", | |
"4 事物はじまりの物語 吉村 昭 2005/01/25 128 680 4-480-68705-X \n", | |
"\n", | |
" url \n", | |
"0 http://www.chikumashobo.co.jp/product/97844806... \n", | |
"1 http://www.chikumashobo.co.jp/product/97844806... \n", | |
"2 http://www.chikumashobo.co.jp/product/97844806... \n", | |
"3 http://www.chikumashobo.co.jp/product/97844806... \n", | |
"4 http://www.chikumashobo.co.jp/product/97844806... " | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_tidy = df.copy()\n", | |
"df_tidy[\"authors\"] = df_tidy[\"author_list\"].apply(lambda l: \", \".join([x[0] for x in l]))\n", | |
"df_tidy[\"pages\"] = df_tidy[\"pages\"].apply(lambda x: int(x.replace(\"頁\", \"\")))\n", | |
"df_tidy[\"price\"] = df_tidy[\"price\"].apply(lambda x: int(x.replace(\"定価:本体\",\"\").replace(\" 円+税\", \"\").replace(\",\", \"\")))\n", | |
"df_tidy = df_tidy[ [\"title\", \"authors\", \"date\", \"pages\", \"price\", \"isbn\", \"url\"] ]\n", | |
"df_tidy.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_tidy.to_csv(\"tidy.csv\", index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"title,authors,date,pages,price,isbn,url\n", | |
"ちゃんと話すための敬語の本,橋本 治,2005/01/25,128,680,4-480-68701-7,http://www.chikumashobo.co.jp/product/9784480687012/\n", | |
"先生はえらい,内田 樹,2005/01/25,176,780,978-4-480-68702-9,http://www.chikumashobo.co.jp/product/9784480687029/\n", | |
"死んだらどうなるの?,玄侑 宗久,2005/01/25,160,760,4-480-68703-3,http://www.chikumashobo.co.jp/product/9784480687036/\n", | |
"熱烈応援!スポーツ天国,最相 葉月,2005/01/25,160,720,4-480-68704-1,http://www.chikumashobo.co.jp/product/9784480687043/\n", | |
"事物はじまりの物語,吉村 昭,2005/01/25,128,680,4-480-68705-X,http://www.chikumashobo.co.jp/product/9784480687050/\n", | |
"勉強ができなくても恥ずかしくない 1 ─どうしよう・・・の巻,橋本 治,2005/03/07,112,680,4-480-68706-8,http://www.chikumashobo.co.jp/product/9784480687067/\n", | |
"学校で教えない性教育の本,河野 美香,2005/03/07,128,680,4-480-68709-2,http://www.chikumashobo.co.jp/product/9784480687098/\n", | |
"奇跡を起こした村のはなし,吉岡 忍,2005/03/07,176,760,4-480-68710-6,http://www.chikumashobo.co.jp/product/9784480687104/\n", | |
"勉強ができなくても恥ずかしくない 2 ─やっちまえ!の巻,橋本 治,2005/04/05,112,680,4-480-68707-6,http://www.chikumashobo.co.jp/product/9784480687074/\n" | |
] | |
} | |
], | |
"source": [ | |
"!head tidy.csv" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment