Skip to content

Instantly share code, notes, and snippets.

@taldcroft
Created December 14, 2023 15:03
Show Gist options
  • Save taldcroft/00911ca21cddf43f82c0c3365ea3b219 to your computer and use it in GitHub Desktop.
Save taldcroft/00911ca21cddf43f82c0c3365ea3b219 to your computer and use it in GitHub Desktop.
Try to get an ApJ data table programmatically
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"\n",
"from astropy.table import Table"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"url = \"https://iopscience.iop.org/article/10.3847/1538-4357/ac743f#apjac743ft2\"\n",
"doi = \"10.3847/1538-4357/ac743f\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"req = requests.get(f\"https://doi.org/api/handles/{doi}\", timeout=30)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'https://iopscience.iop.org/article/10.3847/1538-4357/ac743f'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"url = req.json()[\"values\"][0][\"data\"][\"value\"]\n",
"url"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"html = requests.get(url, timeout=30).text"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"doc = BeautifulSoup(html, \"html.parser\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"502211"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Path(\"test-pretty.html\").write_text(doc.prettify())"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Collect every <a> tag in doc whose text content is exactly \"ASCII\" or\n",
"# \"Data\" (the table download links) and keep them all in a list.\n",
"data_links = [link for link in doc.find_all(\"a\") if link.text in [\"ASCII\", \"Data\"]]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<a class=\"btn btn-primary wd-jnl-art-btn-ascii\" href=\"/0004-637X/933/2/180/suppdata/apjac743ft1_ascii.txt?doi=10.3847/1538-4357/ac743f\" target=\"_blank\">ASCII</a>,\n",
" <a class=\"btn btn-primary wd-jnl-art-btn-table-data\" href=\"https://content.cld.iop.org/journals/0004-637X/933/2/180/revision2/apjac743ft2_mrt.txt?Expires=1703165700&amp;Signature=CNsCVx1TltrqEbrZzv4n16lFOlNwk~gmts6UAH7aqJm9hd2qLURJ-JEfURgPb~bhe2wZLZ4eRxHVjtXSkUKgomu~WMUrLbidFs~KDvoAtU7i2QFOn3fz1a8bMR~-X8YXQ9ND82J64SZ0MhiawfNTIiNWcEfJkA~aJoEMbENrAVmpv9iAsvypjjldaE2ZQMvQGD7ve0chRVvE5jBw1e-XTvxRKK2ZMSytUeHIPtZ4SCSWDPZvuW55Y3CBp6AirG577HXvrkZfy49MhZSLV2D045bVOolQm~JfQiMThmAjfJFhCvuhoYEgK0Jz~rMbOWZ36NgkvCWS32u2hHkgfpbASw__&amp;Key-Pair-Id=KL1D8TIY3N7T8\" target=\"_blank\">Data</a>]"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_links"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/0004-637X/933/2/180/suppdata/apjac743ft1_ascii.txt?doi=10.3847/1538-4357/ac743f'"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
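"# Get the href attribute of a link.\n",
"# NOTE: `link` is not assigned in any cell saved in this notebook; the recorded\n",
"# output matches data_links[0][\"href\"], so it was presumably that element.\n",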
"link[\"href\"]"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"def parse_apj_ascii(text):\n",
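" \"\"\"Parse a tab-separated ASCII table from the ApJ website.\n",
"\n",
" NOTE: parsing is not implemented yet; this currently just returns the\n",
" lines of ``text`` as a list of strings.\n",
"\n",
" For example::\n",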
"\n",
" Table 1\n",
" CLQ Candidate Selection\n",
"\n",
"\n",
" TDSS Target Type\tN _QSOs\tN _Cand\t% Cand\tN _CLQ\t% CLQ\n",
" A\t17,511\t0\t0.0\t0\t0\n",
" B\t44,832\t1\t0.0\t0\t0\n",
" FES_DE\t653\t9\t1.4\t6\t0.9\n",
" FES_HYPQSO\t1237\t23\t2.0\t4\t0.3\n",
" FES_MGII\t64\t1\t1.6\t1\t1.5\n",
" FES_NQHISN\t744\t2\t0.3\t2\t0.3\n",
" RQS1\t10,520\t18\t0.2\t5\t0.0\n",
" RQS2\t2243\t1\t0.0\t1\t0.0\n",
" RQS2v\t1111\t2\t0.1\t0\t0.0\n",
" RQS3\t1056\t0\t0\t0\t0\n",
" RQS3v\t1579\t4\t0.3\t0\t0\n",
" Total\t64,039\t61\t0.1\t19\t0.03\n",
" Note. Target selection methods for our TDSS CLQs. These abbreviations correspond to several different target selection algorithms used for the TDSS program of SDSS-IV, and are described briefly in Section 2. Some QSOs were selected by TDSS using more than one method. Small number statistics dominate the percentages listed, which are derived simply by dividing by N _QSOs. See the discussion in Section 2.\n",
"\n",
" \"\"\"\n",
" lines = text.splitlines()\n",
" return lines"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"def get_apj_data_text(link: BeautifulSoup) -> Table:\n",
" url = link[\"href\"]\n",
"\n",
" if url.startswith(\"/\"):\n",
" url = f\"https://iopscience.iop.org{url}\"\n",
"\n",
" req = requests.get(url, timeout=30)\n",
" req.raise_for_status()\n",
" text = req.text\n",
" return text"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"def get_apj_data_table(link: BeautifulSoup) -> Table:\n",
" text = get_apj_data_text(link)\n",
" if link.text == \"ASCII\":\n",
" out = parse_apj_ascii(text)\n",
" elif link.text == \"Data\":\n",
" out = Table.read(text, format=\"ascii.mrt\")\n",
" else:\n",
" raise ValueError(f\"Unknown link type {link.text}\")\n",
" return out"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['<head><title>ShieldSquare Captcha</title><script type=\"text/javascript\">',\n",
" '\\twindow.SSJSInternal = 13231;',\n",
" '',\n",
" ' var __uzdbm_1 = \"13e9e8e3-8fae-41ac-a6d2-aac952b173c3\";',\n",
" ' var __uzdbm_2 = \"M2M3MTA4ZmMtODQyNy00YzBjLWJhODAtYzJjZjkzMDQwZmFmJDE3My40OC4xMjAuMg==\";',\n",
" '',\n",
" '\\t(function(w, d, e, u, c, g, a, b){',\n",
" '\\t\\tw[\"SSJSConnectorObj\"] = w[\"SSJSConnectorObj\"] || {ss_cid : c, domain_info: g};',\n",
" '\\t\\ta = d.createElement(e);',\n",
" '\\t\\ta.async = true;',\n",
" '\\t\\ta.src = u;',\n",
" '\\t\\tb = d.getElementsByTagName(e)[0];',\n",
" '\\t\\tb.parentNode.insertBefore(a, b);',\n",
" '\\t})(window,document,\"script\",\"https://cdn.perfdrive.com/aperture/aperture.js\",\"b8c3\",\"auto\");',\n",
" '</script>',\n",
" '</head><div><link rel=\"stylesheet\" type=\"text/css\" href=\"https://captcha.perfdrive.com/captcha-public/css/shieldsquare_styles.min.css\">',\n",
" ' </div><div class=\"container\">',\n",
" ' <div>',\n",
" ' <div>',\n",
" ' <img src=\"https://cdn.images.iop.org/website-logos/IOP-Publishing-300x83.jpg\" alt=\"Captcha Page\">',\n",
" ' </div>',\n",
" ' </div>',\n",
" ' <div>',\n",
" ' <div>',\n",
" ' <h2>We apologize for the inconvenience...</h2>',\n",
" ' <p>To ensure we keep this website safe, please can you confirm you are a human by ticking the box below. </p>',\n",
" '\\t\\t\\t\\t\\t\\t <p>If you are unable to complete the above request please contact us using the below link, providing a screenshot of your experience.</p>',\n",
" '\\t\\t\\t\\t\\t\\t <p><a href=\"https://ioppublishing.org/contacts/\">https://ioppublishing.org/contacts/</a></p>',\n",
" ' </div>',\n",
" ' </div>',\n",
" ' <div class=\"row\">',\n",
" ' <center> <strong>Incident ID: 3c7108fc-8427-4c0c-ba80-c2cf93040faf</strong> </center>',\n",
" ' <br>',\n",
" ' <div>',\n",
" '<!-- <center>',\n",
" ' <p><strong> Please solve this CAPTCHA to request unblock to the website </strong></p>',\n",
" ' </center> -->',\n",
" ' </div>',\n",
" ' <br>',\n",
" ' <div class=\"captcha-mid\">',\n",
" \" <!-- *Required - Uncomment the below commented php code and remove 'echo 1234' before submit -->\",\n",
" ' <form action=\"\" method=POST style=\"width:300px;margin-left:auto;margin-right:auto;\">',\n",
" ' <script type=\"text/javascript\" src=\"https://hcaptcha.com/1/api.js\"></script>',\n",
" ' <div class=\"h-captcha\" data-sitekey=\"ae73173b-7003-44e0-bc87-654d0dab8b75\"></div>',\n",
" ' <input type=\"hidden\" id=\"recaptcha_response\" name=\"recaptcha_response\"><br>',\n",
" ' <center><br><input type=\"submit\" class=\"btn btn-success btn-sm\" Value=\"Submit\"><br></center>',\n",
" ' </form>',\n",
" ' <br></div>',\n",
" ' </div>',\n",
" ' </div>']"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_apj_data_table(data_links[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "astropy",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment