Skip to content

Instantly share code, notes, and snippets.

@sin-tanaka
Created July 30, 2018 07:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sin-tanaka/12bbe518e5df7f2c17c8137feb511723 to your computer and use it in GitHub Desktop.
Save sin-tanaka/12bbe518e5df7f2c17c8137feb511723 to your computer and use it in GitHub Desktop.
requests-htmlを触ってみる
Display the source blob
Display the rendered blob
Raw
{
"cells": [
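{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Official repository: https://github.com/kennethreitz/requests-html\n",
"# Reference article: https://devlights.hatenablog.com/entry/2018/02/26/024346\n",
"# Early releases apparently let you import a ready-made `session` object; the API now seems to require instantiating the HTMLSession class instead.\n",
"# TODO: find out in which release this changed.\n",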
"from requests_html import HTMLSession\n",
"session = HTMLSession()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"requests_html.HTMLResponse"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r = session.get('https://python.org/')\n",
"type(r)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'/',\n",
" '//docs.python.org/3/tutorial/',\n",
" '//docs.python.org/3/tutorial/controlflow.html',\n",
" '//docs.python.org/3/tutorial/controlflow.html#defining-functions',\n",
" '//docs.python.org/3/tutorial/introduction.html#lists',\n",
" '//jobs.python.org',\n",
" '/about/',\n",
" '/about/apps',\n",
" '/about/apps/',\n",
" '/about/gettingstarted/',\n",
" '/about/help/',\n",
" '/about/legal/',\n",
" '/about/quotes/',\n",
" '/about/success/',\n",
" '/about/success/#arts',\n",
" '/about/success/#business',\n",
" '/about/success/#education',\n",
" '/about/success/#engineering',\n",
" '/about/success/#government',\n",
" '/about/success/#scientific',\n",
" '/about/success/#software-development',\n",
" '/accounts/login/',\n",
" '/accounts/signup/',\n",
" '/blogs/',\n",
" '/community/',\n",
" '/community/awards',\n",
" '/community/diversity/',\n",
" '/community/forums/',\n",
" '/community/irc/',\n",
" '/community/lists/',\n",
" '/community/logos/',\n",
" '/community/merchandise/',\n",
" '/community/sigs/',\n",
" '/community/workshops/',\n",
" '/dev/',\n",
" '/dev/core-mentorship/',\n",
" '/dev/peps/',\n",
" '/dev/peps/peps.rss',\n",
" '/doc/',\n",
" '/doc/av',\n",
" '/doc/essays/',\n",
" '/download/alternatives',\n",
" '/download/other/',\n",
" '/downloads/',\n",
" '/downloads/mac-osx/',\n",
" '/downloads/release/python-370/',\n",
" '/downloads/source/',\n",
" '/downloads/windows/',\n",
" '/events/',\n",
" '/events/calendars/',\n",
" '/events/python-events',\n",
" '/events/python-events/695/',\n",
" '/events/python-events/702/',\n",
" '/events/python-events/711/',\n",
" '/events/python-events/718/',\n",
" '/events/python-events/past/',\n",
" '/events/python-user-group/',\n",
" '/events/python-user-group/744/',\n",
" '/events/python-user-group/past/',\n",
" '/jobs/',\n",
" '/privacy/',\n",
" '/psf-landing/',\n",
" '/psf/',\n",
" '/psf/donations/',\n",
" '/psf/sponsorship/sponsors/',\n",
" '/shell/',\n",
" '/success-stories/',\n",
" '/success-stories/industrial-light-magic-runs-python/',\n",
" '/users/membership/',\n",
" 'http://blog.python.org',\n",
" 'http://bottlepy.org',\n",
" 'http://brochure.getpython.info/',\n",
" 'http://buildbot.net/',\n",
" 'http://docs.python.org/3/tutorial/introduction.html#using-python-as-a-calculator',\n",
" 'http://feedproxy.google.com/~r/PythonInsider/~3/BMstxEjkOt0/python-2715-released.html',\n",
" 'http://feedproxy.google.com/~r/PythonInsider/~3/PuHgTVhNAAE/python-370rc1-and-366rc1-now-available.html',\n",
" 'http://feedproxy.google.com/~r/PythonInsider/~3/RMqgTQsV720/python-3.html',\n",
" 'http://feedproxy.google.com/~r/PythonInsider/~3/rPQiRIs2Qhg/python-370b5-bonus-beta-is-now.html',\n",
" 'http://feedproxy.google.com/~r/PythonInsider/~3/vo7OgsISIdQ/python-370b4-final-37-beta-now.html',\n",
" 'http://flask.pocoo.org/',\n",
" 'http://ipython.org',\n",
" 'http://pandas.pydata.org/',\n",
" 'http://planetpython.org/',\n",
" 'http://plus.google.com/+Python',\n",
" 'http://pycon.blogspot.com/',\n",
" 'http://pyfound.blogspot.com/',\n",
" 'http://python.org/dev/peps/',\n",
" 'http://roundup.sourceforge.net/',\n",
" 'http://tornadoweb.org',\n",
" 'http://trac.edgewall.org/',\n",
" 'http://twitter.com/ThePSF',\n",
" 'http://wiki.python.org/moin/Languages',\n",
" 'http://wiki.python.org/moin/TkInter',\n",
" 'http://www.ansible.com',\n",
" 'http://www.djangoproject.com/',\n",
" 'http://www.facebook.com/pythonlang?fref=ts',\n",
" 'http://www.pylonsproject.org/',\n",
" 'http://www.riverbankcomputing.co.uk/software/pyqt/intro',\n",
" 'http://www.saltstack.com',\n",
" 'http://www.scipy.org',\n",
" 'http://www.web2py.com/',\n",
" 'http://www.wxpython.org/',\n",
" 'https://bugs.python.org/',\n",
" 'https://devguide.python.org/',\n",
" 'https://docs.python.org',\n",
" 'https://docs.python.org/3/license.html',\n",
" 'https://docs.python.org/faq/',\n",
" 'https://github.com/python/pythondotorg/issues',\n",
" 'https://kivy.org/',\n",
" 'https://mail.python.org/mailman/listinfo/python-dev',\n",
" 'https://pypi.python.org/',\n",
" 'https://status.python.org/',\n",
" 'https://wiki.gnome.org/Projects/PyGObject',\n",
" 'https://wiki.python.org/moin/',\n",
" 'https://wiki.python.org/moin/BeginnersGuide',\n",
" 'https://wiki.python.org/moin/PythonBooks',\n",
" 'https://wiki.python.org/moin/PythonEventsCalendar#Submitting_an_Event',\n",
" 'https://wiki.qt.io/PySide',\n",
" 'https://www.openstack.org',\n",
" 'https://www.python.org/psf/codeofconduct/'}"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# link全取得\n",
"r.html.links"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'http://blog.python.org',\n",
" 'http://bottlepy.org',\n",
" 'http://brochure.getpython.info/',\n",
" 'http://buildbot.net/',\n",
" 'http://docs.python.org/3/tutorial/introduction.html#using-python-as-a-calculator',\n",
" 'http://feedproxy.google.com/~r/PythonInsider/~3/BMstxEjkOt0/python-2715-released.html',\n",
" 'http://feedproxy.google.com/~r/PythonInsider/~3/PuHgTVhNAAE/python-370rc1-and-366rc1-now-available.html',\n",
" 'http://feedproxy.google.com/~r/PythonInsider/~3/RMqgTQsV720/python-3.html',\n",
" 'http://feedproxy.google.com/~r/PythonInsider/~3/rPQiRIs2Qhg/python-370b5-bonus-beta-is-now.html',\n",
" 'http://feedproxy.google.com/~r/PythonInsider/~3/vo7OgsISIdQ/python-370b4-final-37-beta-now.html',\n",
" 'http://flask.pocoo.org/',\n",
" 'http://ipython.org',\n",
" 'http://pandas.pydata.org/',\n",
" 'http://planetpython.org/',\n",
" 'http://plus.google.com/+Python',\n",
" 'http://pycon.blogspot.com/',\n",
" 'http://pyfound.blogspot.com/',\n",
" 'http://python.org/dev/peps/',\n",
" 'http://roundup.sourceforge.net/',\n",
" 'http://tornadoweb.org',\n",
" 'http://trac.edgewall.org/',\n",
" 'http://twitter.com/ThePSF',\n",
" 'http://wiki.python.org/moin/Languages',\n",
" 'http://wiki.python.org/moin/TkInter',\n",
" 'http://www.ansible.com',\n",
" 'http://www.djangoproject.com/',\n",
" 'http://www.facebook.com/pythonlang?fref=ts',\n",
" 'http://www.pylonsproject.org/',\n",
" 'http://www.riverbankcomputing.co.uk/software/pyqt/intro',\n",
" 'http://www.saltstack.com',\n",
" 'http://www.scipy.org',\n",
" 'http://www.web2py.com/',\n",
" 'http://www.wxpython.org/',\n",
" 'https://bugs.python.org/',\n",
" 'https://devguide.python.org/',\n",
" 'https://docs.python.org',\n",
" 'https://docs.python.org/3/license.html',\n",
" 'https://docs.python.org/3/tutorial/',\n",
" 'https://docs.python.org/3/tutorial/controlflow.html',\n",
" 'https://docs.python.org/3/tutorial/controlflow.html#defining-functions',\n",
" 'https://docs.python.org/3/tutorial/introduction.html#lists',\n",
" 'https://docs.python.org/faq/',\n",
" 'https://github.com/python/pythondotorg/issues',\n",
" 'https://jobs.python.org',\n",
" 'https://kivy.org/',\n",
" 'https://mail.python.org/mailman/listinfo/python-dev',\n",
" 'https://pypi.python.org/',\n",
" 'https://status.python.org/',\n",
" 'https://wiki.gnome.org/Projects/PyGObject',\n",
" 'https://wiki.python.org/moin/',\n",
" 'https://wiki.python.org/moin/BeginnersGuide',\n",
" 'https://wiki.python.org/moin/PythonBooks',\n",
" 'https://wiki.python.org/moin/PythonEventsCalendar#Submitting_an_Event',\n",
" 'https://wiki.qt.io/PySide',\n",
" 'https://www.openstack.org',\n",
" 'https://www.python.org/',\n",
" 'https://www.python.org/about/',\n",
" 'https://www.python.org/about/apps',\n",
" 'https://www.python.org/about/apps/',\n",
" 'https://www.python.org/about/gettingstarted/',\n",
" 'https://www.python.org/about/help/',\n",
" 'https://www.python.org/about/legal/',\n",
" 'https://www.python.org/about/quotes/',\n",
" 'https://www.python.org/about/success/',\n",
" 'https://www.python.org/about/success/#arts',\n",
" 'https://www.python.org/about/success/#business',\n",
" 'https://www.python.org/about/success/#education',\n",
" 'https://www.python.org/about/success/#engineering',\n",
" 'https://www.python.org/about/success/#government',\n",
" 'https://www.python.org/about/success/#scientific',\n",
" 'https://www.python.org/about/success/#software-development',\n",
" 'https://www.python.org/accounts/login/',\n",
" 'https://www.python.org/accounts/signup/',\n",
" 'https://www.python.org/blogs/',\n",
" 'https://www.python.org/community/',\n",
" 'https://www.python.org/community/awards',\n",
" 'https://www.python.org/community/diversity/',\n",
" 'https://www.python.org/community/forums/',\n",
" 'https://www.python.org/community/irc/',\n",
" 'https://www.python.org/community/lists/',\n",
" 'https://www.python.org/community/logos/',\n",
" 'https://www.python.org/community/merchandise/',\n",
" 'https://www.python.org/community/sigs/',\n",
" 'https://www.python.org/community/workshops/',\n",
" 'https://www.python.org/dev/',\n",
" 'https://www.python.org/dev/core-mentorship/',\n",
" 'https://www.python.org/dev/peps/',\n",
" 'https://www.python.org/dev/peps/peps.rss',\n",
" 'https://www.python.org/doc/',\n",
" 'https://www.python.org/doc/av',\n",
" 'https://www.python.org/doc/essays/',\n",
" 'https://www.python.org/download/alternatives',\n",
" 'https://www.python.org/download/other/',\n",
" 'https://www.python.org/downloads/',\n",
" 'https://www.python.org/downloads/mac-osx/',\n",
" 'https://www.python.org/downloads/release/python-370/',\n",
" 'https://www.python.org/downloads/source/',\n",
" 'https://www.python.org/downloads/windows/',\n",
" 'https://www.python.org/events/',\n",
" 'https://www.python.org/events/calendars/',\n",
" 'https://www.python.org/events/python-events',\n",
" 'https://www.python.org/events/python-events/695/',\n",
" 'https://www.python.org/events/python-events/702/',\n",
" 'https://www.python.org/events/python-events/711/',\n",
" 'https://www.python.org/events/python-events/718/',\n",
" 'https://www.python.org/events/python-events/past/',\n",
" 'https://www.python.org/events/python-user-group/',\n",
" 'https://www.python.org/events/python-user-group/744/',\n",
" 'https://www.python.org/events/python-user-group/past/',\n",
" 'https://www.python.org/jobs/',\n",
" 'https://www.python.org/privacy/',\n",
" 'https://www.python.org/psf-landing/',\n",
" 'https://www.python.org/psf/',\n",
" 'https://www.python.org/psf/codeofconduct/',\n",
" 'https://www.python.org/psf/donations/',\n",
" 'https://www.python.org/psf/sponsorship/sponsors/',\n",
" 'https://www.python.org/shell/',\n",
" 'https://www.python.org/success-stories/',\n",
" 'https://www.python.org/success-stories/industrial-light-magic-runs-python/',\n",
" 'https://www.python.org/users/membership/'}"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Get every link on the page, resolved to an absolute URL\n",
"r.html.absolute_links"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"set"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# set型\n",
"type(r.html.links)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'requests_html.Element'>\n"
]
}
],
"source": [
"# Select an element with a jQuery-style CSS selector\n",
"about = r.html.find('#about', first=True)\n",
"\n",
"# The result is a requests_html.Element instance\n",
"print(type(about))"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"About\n",
"Applications\n",
"Quotes\n",
"Getting Started\n",
"Help\n",
"Python Brochure\n"
]
}
],
"source": [
"print(about.text)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<li id=\"about\" class=\"tier-1 element-1 \" aria-haspopup=\"true\">\n",
" <a href=\"/about/\" title=\"\" class=\"\">About</a>\n",
" \n",
" \n",
"\n",
"<ul class=\"subnav menu\" role=\"menu\" aria-hidden=\"true\">\n",
" \n",
" <li class=\"tier-2 element-1\" role=\"treeitem\"><a href=\"/about/apps/\" title=\"\">Applications</a></li>\n",
" \n",
" <li class=\"tier-2 element-2\" role=\"treeitem\"><a href=\"/about/quotes/\" title=\"\">Quotes</a></li>\n",
" \n",
" <li class=\"tier-2 element-3\" role=\"treeitem\"><a href=\"/about/gettingstarted/\" title=\"\">Getting Started</a></li>\n",
" \n",
" <li class=\"tier-2 element-4\" role=\"treeitem\"><a href=\"/about/help/\" title=\"\">Help</a></li>\n",
" \n",
" <li class=\"tier-2 element-5\" role=\"treeitem\"><a href=\"http://brochure.getpython.info/\" title=\"\">Python Brochure</a></li>\n",
" \n",
"</ul>\n",
"\n",
" \n",
" </li>\n"
]
}
],
"source": [
"print(about.html)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'id': 'about', 'class': ('tier-1', 'element-1'), 'aria-haspopup': 'true'}\n"
]
}
],
"source": [
"print(about.attrs)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<Element 'a' href='/about/' title='' class=()>,\n",
" <Element 'a' href='/about/apps/' title=''>,\n",
" <Element 'a' href='/about/quotes/' title=''>,\n",
" <Element 'a' href='/about/gettingstarted/' title=''>,\n",
" <Element 'a' href='/about/help/' title=''>,\n",
" <Element 'a' href='http://brochure.getpython.info/' title=''>]"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 取得した要素内を、要素名で検索\n",
"about.find('a')"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'http://brochure.getpython.info/',\n",
" 'https://www.python.org/about/',\n",
" 'https://www.python.org/about/apps/',\n",
" 'https://www.python.org/about/gettingstarted/',\n",
" 'https://www.python.org/about/help/',\n",
" 'https://www.python.org/about/quotes/'}"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# URLが欲しい場合これでもOK\n",
"about.absolute_links"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'programming'"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r.html.search('Python is a {} language')[0]"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'http://blog.hatena.ne.jp/-/campaign/gnavi201807',\n",
" 'http://blog.hatena.ne.jp/-/odai/10257846132605475586',\n",
" 'http://blog.hatenablog.com/entry/2018/02/26/110000',\n",
" 'http://chanshu61ssb.hatenablog.com/entry/2018/07/29/191703',\n",
" 'http://daikanyama.blog.houyhnhnm.jp/entry/2018/07/30/103005',\n",
" 'http://ibaya.hatenablog.com/entry/2018/07/30/105806',\n",
" 'http://katsumakazuyo.hatenablog.com/entry/2018/07/30/120220',\n",
" 'http://tenten69.hatenablog.com/entry/2018/07/29/030504',\n",
" 'http://yamafashion.hatenadiary.jp/entry/2018/07/28/202539',\n",
" 'http://yasumi-08.hatenablog.com/entry/2018/07/30/120000',\n",
" 'https://aille.hatenablog.com/entry/2018/07/28/004617',\n",
" 'https://blog.notsobad.jp/entry/2018/07/30/110000',\n",
" 'https://k0kubun.hatenablog.com/entry/mjit-compaction',\n",
" 'https://mamichansan.hatenablog.com/entry/2018/07/29/200834',\n",
" 'https://peko-pekkopeko.hateblo.jp/entry/2018/07/28/135059',\n",
" 'https://sasayamyam.hatenadiary.jp/entry/2018/07/28/223934',\n",
" 'https://www.anomaly3-movie.com/entry/RANKING_MUNAKUSO',\n",
" 'https://www.zentei-happy-end.com/entry/2018/07/29/123316'}"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# dev-toolsからコピペしたセレクタもそのまま使える\n",
"r = session.get('http://hatenablog.com/')\n",
"sel = 'body > div:nth-child(2) > section.serviceTop-recommend'\n",
"recommends = r.html.find(sel, first=True)\n",
"recommends.absolute_links"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
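],
"source": [
"# JavaScript can also be executed, via r.html.render() (see the next cell).\n",
"# Rendering runs Chromium behind the scenes;\n",
"# if Chromium is not installed, it is downloaded automatically.\n",
"# Before render() is called, find('a') on this SPA returns [] because the links are only created by JavaScript.\n",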
"r = session.get('https://mya-ake.com/samples/vuejs-spa/')\n",
"r.html.find('a')"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<Element 'a' href='/samples/vuejs-spa/about' class=('mdl-navigation__link',)>,\n",
" <Element 'a' href='/samples/vuejs-spa/api-sample' class=('mdl-navigation__link',)>,\n",
" <Element 'a' href='/samples/vuejs-spa/mdl-samples' class=('mdl-navigation__link',)>,\n",
" <Element 'a' href='/samples/vuejs-spa/libraries' class=('mdl-navigation__link',)>,\n",
" <Element 'a' href='/samples/vuejs-spa/credit' class=('mdl-navigation__link',)>,\n",
" <Element 'a' href='/samples/vuejs-spa/about' class=('mdl-navigation__link',)>,\n",
" <Element 'a' href='/samples/vuejs-spa/api-sample' class=('mdl-navigation__link',)>,\n",
" <Element 'a' href='/samples/vuejs-spa/mdl-samples' class=('mdl-navigation__link',)>,\n",
" <Element 'a' href='/samples/vuejs-spa/libraries' class=('mdl-navigation__link',)>,\n",
" <Element 'a' href='/samples/vuejs-spa/credit' class=('mdl-navigation__link',)>,\n",
" <Element 'a' href='/samples/vuejs-spa/libraries' class=()>,\n",
" <Element 'a' href='https://github.com/mya-ake/vuejs-spa-dev-env'>]"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r.html.render()\n",
"r.html.find('a')"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# The markdown feature also appears to have been removed to keep the library lightweight: https://github.com/kennethreitz/requests-html/issues/15\n",
"# [elm.markdown for elm in r.html.find('a')]\n",
"# No major version has been released yet, so expect to keep up with API changes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment