Skip to content

Instantly share code, notes, and snippets.

@jshirius
Last active November 20, 2023 16:26
Show Gist options
  • Save jshirius/e8992c0e7620de098a43d77e4bd91859 to your computer and use it in GitHub Desktop.
Save jshirius/e8992c0e7620de098a43d77e4bd91859 to your computer and use it in GitHub Desktop.
Yahoo知恵袋のスクレイピングのPythonサンプルプログラム
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Yahoo知恵袋のスクレイピングのPythonサンプルプログラム\n",
"\n",
"seleniumをつかったyahoo知恵袋のスクレイピングのサンプルプログラムです。<br>\n",
"質問の検索結果まで出力できます。<br>\n",
"出力結果は、csvファイルに書き出します。<br>\n",
"\n",
"\n",
"参考にしたコード<br>\n",
"【Python×Selenium】超簡単にWebサイトをスクレイピングしてみる<br>\n",
"https://miyanetdev.com/archives/327"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from selenium import webdriver\n",
"from time import sleep\n",
"import urllib\n",
"import re\n",
"import pandas as pd\n",
"\n",
"PAGE_LIMIT = 20 #ページ遷移の最大の回数\n",
"SEARCH_QUERY = \"プログラミング\"\n",
"SQRAPING_URL = \"https://chiebukuro.yahoo.co.jp/\"\n",
"\n",
"#出力結果を格納数csvファイル\n",
"csv_file_name = SEARCH_QUERY + \".csv\"\n",
"\n",
"#ドライバーを設定する\n",
"#linuxなどGUIがない環境で動かす場合は、ヘッドレスモードを入れておく\n",
"#options = webdriver.ChromeOptions()\n",
"#options.add_argument('--headless')\n",
"\n",
"\n",
"#driver = webdriver.Chrome('./chromedriver', options)\n",
"driver = webdriver.Chrome('./chromedriver')\n",
"\n",
"#知恵袋ページを読み込む\n",
"driver.get(SQRAPING_URL)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#該当ページを解析する\n",
"def analysis_action():\n",
"\n",
" elems = driver.find_elements_by_xpath('//*[@id=\"sr\"]/ul/li[*]')\n",
" # 取得した要素を1つずつ表示\n",
"\n",
" out_puts = []\n",
"\n",
" if(len(elems) == 0):\n",
" print(\"ページは存在しないよ〜\")\n",
" else:\n",
" for elem in elems:\n",
" out_dic ={}\n",
" out_dic['query_key'] = SEARCH_QUERY\n",
" out_dic['rs_title'] = elem.find_elements_by_xpath('h3/a')[0].text\n",
" out_dic['rs_link'] = elem.find_elements_by_xpath('h3/a')[0].get_attribute('href')\n",
" out_dic['rs_summary'] = elem.find_elements_by_xpath('p[1]')[0].text\n",
" #print(out_dic)\n",
" out_puts.append(out_dic)\n",
" #print(\"*\" * 60)\n",
" \n",
" return out_puts"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def next_page_action():\n",
" \"\"\"\n",
" 現在のページから次のページを読み込むアクションを実行する\n",
" \"\"\"\n",
" rtn = False\n",
" \n",
" #次へボタンのクリック\n",
" elems = driver.find_elements_by_xpath('//*[@id=\"pg_low\"]/div/a[*]')\n",
"\n",
" #現在のページ\n",
" print(\"ページ遷移前のurl:\")\n",
" print(driver.current_url)\n",
" if(len(elems) == 0):\n",
" print(\"次のページは存在しないよ〜\")\n",
" else:\n",
" for elem in elems:\n",
" #print(elem.text)\n",
" if(elem.text != \"次へ\"):\n",
" continue\n",
" url = elem.get_attribute('href')\n",
" driver.get(url)\n",
" rtn = True\n",
" break\n",
"\n",
" return rtn\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 最初の検索を実行する\n",
"search_box = driver.find_element_by_css_selector('input.txtKeyword')\n",
"search_box.send_keys(SEARCH_QUERY)\n",
"search_button_container = driver.find_element_by_css_selector('p.btnSearch')\n",
"search_button = search_button_container.find_element_by_css_selector('input')\n",
"search_button.click()\n",
"sleep(2)\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#知恵袋の検索結果の一覧をpandasに格納してcsvに書き出す\n",
"#csvには、途中で止まっても良いように、1ページ終わったら書き出すようにしている\n",
"\n",
"d = analysis_action()\n",
"df=pd.DataFrame(d) \n",
"df.to_csv(csv_file_name, encoding=\"utf_8_sig\")\n",
"\n",
"analysis_list = []\n",
"analysis_list.extend(d)\n",
"\n",
"for page in range(PAGE_LIMIT):\n",
" \n",
" print(\"ページ %dを実行中\" % page)\n",
" sleep(5)\n",
" \n",
" #次のページに遷移する\n",
" rtn = next_page_action()\n",
" if(rtn == False):\n",
" break\n",
" \n",
" #知恵袋の質問リストを格納する\n",
" d = analysis_action()\n",
" if(len(d) > 0):\n",
" analysis_list.extend(d)\n",
" df=pd.DataFrame(analysis_list) \n",
" df.to_csv(csv_file_name, encoding=\"utf_8_sig\")\n",
" \n",
"driver.close()\n",
"driver.quit()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment