Skip to content

Instantly share code, notes, and snippets.

@omimo
Forked from bshillingford/arxiv2kindle.ipynb
Created September 20, 2019 18:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save omimo/5c97cf23b50824a4b2b243abd6f5d250 to your computer and use it in GitHub Desktop.
Save omimo/5c97cf23b50824a4b2b243abd6f5d250 to your computer and use it in GitHub Desktop.
arxiv2kindle: recompiles an arxiv paper for kindle-sized screens, and sends it to your wifi-enabled kindle
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import requests\n",
"import lxml.html as html\n",
"import re\n",
"import urllib\n",
"import os, sys, subprocess, os.path\n",
"import glob\n",
"import IPython.display\n",
"import getpass\n",
"import tempfile"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Change the following:\n",
"The query can be an arxiv URL or any string containing an arxiv ID.\n",
"\n",
"It will prompt you for the Gmail account's password. Note that the account's security settings must have \"Allow less secure apps\" enabled (or you must use an app password) so that the notebook is permitted to use the Gmail SMTP server with TLS."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"query = \"http://arxiv.org/abs/1511.08228\"\n",
"kindle_email = 'ADDRESS_HERE@kindle.com'\n",
"your_gmail = 'ADDRESS_HERE@gmail.com'\n",
"# prompted interactively so the password is never written into the notebook\n",
"gmailpass = getpass.getpass()\n",
"\n",
"# paper settings (decrease width/height to increase font)\n",
"landscape = True\n",
"width = \"6in\"\n",
"height = \"4in\"\n",
"margin = \"0.2in\"\n",
"# settings for latex geometry package; portrait simply swaps the two paper dimensions:\n",
"paper_w, paper_h = (width, height) if landscape else (height, width)\n",
"geom_settings = dict(paperwidth=paper_w, paperheight=paper_h, margin=margin)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"----------"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# The ID may appear anywhere in the query: a bare ID, an http:// URL or an https:// URL.\n",
"id_match = re.search(r'(?P<id>\\d{4}\\.\\d{4,5}(v\\d{1,2})?)', query)\n",
"if id_match is None:\n",
"    raise ValueError('no arXiv ID found in query: %r' % (query,))\n",
"arxiv_id = id_match.group('id')\n",
"arxiv_abs = 'http://arxiv.org/abs/' + arxiv_id\n",
"arxiv_pdf = 'http://arxiv.org/pdf/' + arxiv_id\n",
"arxiv_pgtitle = html.fromstring(requests.get(arxiv_abs).text.encode('utf8')).xpath('/html/head/title/text()')[0]\n",
"# Drop the leading \"[...]\" prefix of the page title, then collapse whitespace runs.\n",
"# No flags are passed: the 4th positional argument of re.sub is `count`, so the\n",
"# re.DOTALL that used to sit there silently capped the number of substitutions at 16.\n",
"arxiv_title = re.sub(r'\\s+', ' ', re.sub(r'^\\[[^]]+\\]\\s*', '', arxiv_pgtitle))\n",
"# Title reduced to [-_A-Za-z0-9]; used later in the attachment file name.\n",
"arxiv_title_scrubbed = re.sub('[^-_A-Za-z0-9]+', '_', arxiv_title)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Clickable summary of the paper that was resolved from the query.\n",
"paper_header = '''\n",
"<h2><a href=\"{abs}\">[{id}] {title}</a><br />\n",
"[<a href=\"{pdf}\">pdf</a>]</h2>\n",
"'''\n",
"IPython.display.HTML(paper_header.format(title=arxiv_title, id=arxiv_id, abs=arxiv_abs, pdf=arxiv_pdf))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---------------"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Scratch directory that receives the paper's source archive.\n",
"d = tempfile.mkdtemp(prefix='arxiv2kindle_')\n",
"archive = os.path.join(d, 'src.tar.gz')\n",
"\n",
"url = 'http://arxiv.org/e-print/' + arxiv_id\n",
"!wget -O {archive} --user-agent=\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0\" {url}\n",
"\n",
"# Make the scratch directory the working directory and unpack the archive there.\n",
"os.chdir(d)\n",
"!tar xvf src.tar.gz"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Pick the main LaTeX file: the one whose first non-blank, non-comment line\n",
"# mentions documentclass. Leaves `texfile` (its path) and `src` (its lines) set.\n",
"texfiles = glob.glob(os.path.join(d, '*.tex'))\n",
"for texfile in texfiles:\n",
"    with open(texfile, 'r') as f:\n",
"        src = f.readlines()\n",
"    # '' default keeps empty / comment-only files from raising\n",
"    first_code = next((line for line in src if line.strip() and not line.startswith('%')), '')\n",
"    if 'documentclass' in first_code:\n",
"        print('correct file: ' + texfile)\n",
"        break\n",
"else:\n",
"    # without this, the last file inspected would silently be used by the next cells\n",
"    raise RuntimeError('no .tex file starting with a documentclass line found in ' + d)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Drop full-line comments for easier debugging. Blank lines are kept on purpose:\n",
"# in LaTeX a blank line ends a paragraph, so deleting them would merge paragraphs.\n",
"src = [line for line in src if not line.startswith('%')]\n",
"# Discard leading blank lines so that src[0] is the first line of actual LaTeX\n",
"# (expected to be the documentclass line).\n",
"while src and not src[0].strip():\n",
"    del src[0]\n",
"assert src, 'the main .tex file has no LaTeX content'\n",
"\n",
"# strip font size, column stuff, and paper size stuff in documentclass line:\n",
"src[0] = re.sub(r'\\b\\d+pt\\b', '', src[0])\n",
"src[0] = re.sub(r'\\b\\w+column\\b', '', src[0])\n",
"src[0] = re.sub(r'\\b\\w+paper\\b', '', src[0])\n",
"src[0] = re.sub(r'(?<=\\[),', '', src[0]) # remove extraneous starting commas\n",
"src[0] = re.sub(r',(?=[\\],])', '', src[0]) # remove extraneous middle/ending commas\n",
"\n",
"# find begin{document} (leading whitespace is tolerated):\n",
"begindocs = [i for i, line in enumerate(src) if line.lstrip().startswith(r'\\begin{document}')]\n",
"assert len(begindocs) == 1, 'expected exactly one begin{document} line, found %d' % len(begindocs)\n",
"\n",
"# preamble additions, in the order they end up in the file, placed right before begin{document}:\n",
"preamble = []\n",
"if landscape:\n",
"    preamble.append('\\\\usepackage{pdflscape}\\n')\n",
"preamble.append('\\\\pagestyle{empty}\\n')\n",
"preamble.append('\\\\usepackage{times}\\n')\n",
"preamble.append('\\\\usepackage['+','.join(k+'='+v for k,v in geom_settings.items())+']{geometry}\\n')\n",
"src[begindocs[0]:begindocs[0]] = preamble\n",
"\n",
"# shrink figures to be at most the size of the page; the \\1 backreference lets every\n",
"# includegraphics on a line keep its own width factor:\n",
"figure_width = re.compile(r'\\\\includegraphics\\[width=([.\\d]+)\\\\(line|text)width\\]')\n",
"figure_fit = r'\\\\includegraphics[width=\\1\\\\textwidth,height=\\1\\\\textheight,keepaspectratio]'\n",
"src = [figure_width.sub(figure_fit, line) for line in src]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Keep the original source as <name>.tex.bak, then write the rewritten lines in its place.\n",
"backup_path = texfile + '.bak'\n",
"os.rename(texfile, backup_path)\n",
"with open(texfile, 'w') as out:\n",
"    out.write(''.join(src))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Run pdflatex up to three times so cross-references can settle, stopping at the\n",
"# first failing pass (same as the `&&` chain). An argument list is used instead of a\n",
"# shell line so that spaces or shell metacharacters in the downloaded file name are\n",
"# not interpreted, and nonstopmode keeps pdflatex from waiting for keyboard input\n",
"# when the document has an error.\n",
"texout = []\n",
"for _pass in range(3):\n",
"    proc = subprocess.Popen(['pdflatex', '-interaction=nonstopmode', texfile],\n",
"                            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)\n",
"    output = proc.communicate()[0]\n",
"    texout.extend(output.decode('utf8', 'replace').splitlines())\n",
"    if proc.returncode != 0:\n",
"        break\n",
"# show the tail of the log\n",
"texout[-8:]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"------"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Open the compiled PDF in the desktop's default viewer.\n",
"pdffilename = os.path.splitext(texfile)[0] + '.pdf'\n",
"opener = 'open' if sys.platform == 'darwin' else 'xdg-open'\n",
"# argument list instead of a shell string: paths with spaces or metacharacters stay intact\n",
"subprocess.call([opener, pdffilename])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"-------"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from email.mime.application import MIMEApplication\n",
"from email.mime.multipart import MIMEMultipart\n",
"# Message whose only part is the compiled PDF as an attachment.\n",
"msg = MIMEMultipart()\n",
"# `with` closes the PDF file handle as soon as its bytes are read\n",
"with open(texfile[:-4]+'.pdf', 'rb') as pdf_file:\n",
"    pdf_part = MIMEApplication(pdf_file.read(), _subtype='pdf')\n",
"pdf_part.add_header('Content-Disposition', 'attachment', filename=arxiv_id+\"_\"+arxiv_title_scrubbed+\".pdf\")\n",
"msg.attach(pdf_part)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import smtplib\n",
"# Send the message through Gmail's SMTP server over STARTTLS.\n",
"server = smtplib.SMTP('smtp.gmail.com:587')\n",
"try:\n",
"    server.starttls()\n",
"    server.login(your_gmail, gmailpass)\n",
"    server.sendmail(your_gmail, kindle_email, msg.as_string())\n",
"    server.quit()  # end the SMTP session with a proper QUIT\n",
"finally:\n",
"    server.close()  # always release the socket, even if login or sending failed"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"------------"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment