Skip to content

Instantly share code, notes, and snippets.

@bshillingford
Last active March 1, 2024 12:50
Show Gist options
  • Star 37 You must be signed in to star a gist
  • Fork 10 You must be signed in to fork a gist
  • Save bshillingford/6259986edca707ca58dd to your computer and use it in GitHub Desktop.
Save bshillingford/6259986edca707ca58dd to your computer and use it in GitHub Desktop.
arxiv2kindle: recompiles an arxiv paper for kindle-sized screens, and sends it to your wifi-enabled kindle
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import requests\n",
"import lxml.html as html\n",
"import re\n",
"import urllib\n",
"import os, sys, subprocess, os.path\n",
"import glob\n",
"import IPython.display\n",
"import getpass\n",
"import tempfile"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Change the following:\n",
"The query can be an arxiv URL or any string containing an arxiv ID.\n",
"\n",
"It will prompt you for the Gmail account's password. Note that the account's security settings must \"allow less secure apps\" so that the notebook is permitted to log in to the Gmail SMTP server over TLS."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# --- what to fetch and where to send it ---\n",
"query = \"http://arxiv.org/abs/1511.08228\"\n",
"kindle_email = 'ADDRESS_HERE@kindle.com'\n",
"your_gmail = 'ADDRESS_HERE@gmail.com'\n",
"gmailpass = getpass.getpass()\n",
"\n",
"# --- page layout (decrease width/height to increase the apparent font size) ---\n",
"landscape = True\n",
"width = \"6in\"\n",
"height = \"4in\"\n",
"margin = \"0.2in\"\n",
"\n",
"# Options handed to the LaTeX geometry package; portrait simply swaps the two paper dimensions.\n",
"geom_settings = {\n",
"    'paperwidth': width if landscape else height,\n",
"    'paperheight': height if landscape else width,\n",
"    'margin': margin,\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"----------"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Accept any string that contains a new-style arXiv ID (a bare ID, or an http/https URL).\n",
"# re.search is used because re.match only matches at the very start of the string.\n",
"id_match = re.search(r'(?<!\\d)(?P<id>\\d{4}\\.\\d{4,5}(v\\d{1,2})?)', query)\n",
"if id_match is None:\n",
"    raise ValueError('no arXiv ID found in query: %r' % (query,))\n",
"arxiv_id = id_match.group('id')\n",
"arxiv_abs = 'http://arxiv.org/abs/' + arxiv_id\n",
"arxiv_pdf = 'http://arxiv.org/pdf/' + arxiv_id\n",
"\n",
"# The abstract page's <title> looks like '[<id>] <paper title>'; drop the bracketed prefix.\n",
"arxiv_pgtitle = html.fromstring(requests.get(arxiv_abs).text.encode('utf8')).xpath('/html/head/title/text()')[0]\n",
"# NOTE: the 4th positional argument of re.sub is `count`, not `flags`, so no flag is passed here\n",
"# (re.DOTALL would have been read as count=16 and capped the number of replacements).\n",
"arxiv_title = re.sub(r'\\s+', ' ', re.sub(r'^\\[[^]]+\\]\\s*', '', arxiv_pgtitle))\n",
"# file-name-safe variant of the title, used for the e-mail attachment name\n",
"arxiv_title_scrubbed = re.sub('[^-_A-Za-z0-9]+', '_', arxiv_title)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Render a heading that links to the abstract page and to the PDF.\n",
"link_markup = '''\n",
"<h2><a href=\"{abs}\">[{id}] {title}</a><br />\n",
"[<a href=\"{pdf}\">pdf</a>]</h2>\n",
"'''\n",
"IPython.display.HTML(link_markup.format(title=arxiv_title, pdf=arxiv_pdf, abs=arxiv_abs, id=arxiv_id))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---------------"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Fresh scratch directory; the later cells look for the TeX sources in `d`.\n",
"d = tempfile.mkdtemp(prefix='arxiv2kindle_')\n",
"url = 'http://arxiv.org/e-print/' + arxiv_id\n",
"src_archive = os.path.join(d, 'src.tar.gz')\n",
"\n",
"# Download the e-print source archive into the scratch directory.\n",
"!wget -O {src_archive} --user-agent=\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0\" {url}\n",
"\n",
"# Unpack in place; note this also changes the kernel's working directory to `d`.\n",
"os.chdir(d)\n",
"!tar xvf src.tar.gz"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Pick the main TeX file: the one whose first non-blank, non-comment line mentions documentclass.\n",
"# Leaves `texfile` (its path) and `src` (its lines, unmodified) set for the following cells.\n",
"texfiles = sorted(glob.glob(os.path.join(d, '*.tex')))\n",
"for texfile in texfiles:\n",
"    with open(texfile, 'r') as f:\n",
"        src = f.readlines()\n",
"    # Ignore leading comments/blank lines; this also copes with empty files,\n",
"    # for which indexing src[0] directly would raise IndexError.\n",
"    code_lines = [line for line in src if line.strip() and not line.startswith('%')]\n",
"    if code_lines and 'documentclass' in code_lines[0]:\n",
"        print('correct file: ' + texfile)\n",
"        break\n",
"else:\n",
"    # Fail loudly instead of silently carrying on with whichever file was read last.\n",
"    raise RuntimeError('no .tex file starting with a documentclass line found in ' + d)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Drop full-line comments for easier debugging. Blank lines are kept on purpose:\n",
"# in LaTeX a blank line ends a paragraph, so deleting them would merge paragraphs.\n",
"src = [line for line in src if not line.startswith('%')]\n",
"\n",
"# The documentclass line is taken to be the first non-blank line that survives the filter.\n",
"docclass_idx = next(i for i, line in enumerate(src) if line.strip())\n",
"\n",
"# strip font size, column stuff, and paper size stuff in documentclass line:\n",
"docclass = src[docclass_idx]\n",
"for pattern in (r'\\b\\d+pt\\b', r'\\b\\w+column\\b', r'\\b\\w+paper\\b'):\n",
"    docclass = re.sub(pattern, '', docclass)\n",
"docclass = re.sub(r'(?<=\\[),', '', docclass)  # remove extraneous starting commas\n",
"docclass = re.sub(r',(?=[\\],])', '', docclass)  # remove extraneous middle/ending commas\n",
"src[docclass_idx] = docclass\n",
"\n",
"# find begin{document}:\n",
"begindocs = [i for i, line in enumerate(src) if line.startswith(r'\\begin{document}')]\n",
"assert len(begindocs) == 1, 'expected exactly one begin{document} line, found %d' % len(begindocs)\n",
"\n",
"# Preamble additions, listed in the order they end up in the file (right before begin{document}).\n",
"geometry_opts = ','.join(k + '=' + v for k, v in sorted(geom_settings.items()))\n",
"preamble = [\n",
"    '\\\\pagestyle{empty}\\n',\n",
"    '\\\\usepackage{times}\\n',\n",
"    '\\\\usepackage[' + geometry_opts + ']{geometry}\\n',\n",
"]\n",
"if landscape:\n",
"    preamble.insert(0, '\\\\usepackage{pdflscape}\\n')\n",
"src[begindocs[0]:begindocs[0]] = preamble\n",
"\n",
"# shrink figures to be at most the size of the page:\n",
"# The multiplier is carried over per match via the backreference, so several includegraphics\n",
"# on one line each keep their own factor; a missing multiplier (plain textwidth) is allowed too.\n",
"graphics_pat = re.compile(r'\\\\includegraphics\\[width=([.\\d]*)\\\\(?:line|text|column)width\\]')\n",
"graphics_repl = r'\\\\includegraphics[width=\\1\\\\textwidth,height=\\1\\\\textheight,keepaspectratio]'\n",
"src = [graphics_pat.sub(graphics_repl, line) for line in src]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Keep a pristine copy of the original source next to the rewritten file.\n",
"# Only back up once: re-running this cell must not replace the original backup\n",
"# with the already-modified file.\n",
"backup_path = texfile + '.bak'\n",
"if not os.path.exists(backup_path):\n",
"    os.rename(texfile, backup_path)\n",
"with open(texfile, 'w') as f:\n",
"    f.writelines(src)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Run pdflatex three times in a row; each pass only starts if the previous one succeeded.\n",
"latex_cmd = 'pdflatex ' + texfile\n",
"texout = !{latex_cmd} && {latex_cmd} && {latex_cmd}\n",
"# show the tail of the captured output\n",
"texout[-8:]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"------"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Open the compiled PDF with the platform's viewer launcher.\n",
"pdffilename = os.path.splitext(texfile)[0] + '.pdf'\n",
"opener = 'open' if sys.platform == 'darwin' else 'xdg-open'\n",
"# Pass an argument list instead of building a shell string: the file name comes from the\n",
"# downloaded archive, so it must not be interpreted (or word-split) by a shell.\n",
"viewer_status = subprocess.call([opener, pdffilename])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"-------"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from email.mime.application import MIMEApplication\n",
"from email.mime.multipart import MIMEMultipart\n",
"\n",
"# Read the compiled PDF inside a context manager so the file handle is closed right away.\n",
"with open(texfile[:-4]+'.pdf', 'rb') as pdf_file:\n",
"    pdf_bytes = pdf_file.read()\n",
"\n",
"# Build a multipart message whose only part is the PDF attachment,\n",
"# named after the arXiv ID and the scrubbed title.\n",
"msg = MIMEMultipart()\n",
"pdf_part = MIMEApplication(pdf_bytes, _subtype='pdf')\n",
"pdf_part.add_header('Content-Disposition', 'attachment', filename=arxiv_id+\"_\"+arxiv_title_scrubbed+\".pdf\")\n",
"msg.attach(pdf_part)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import smtplib\n",
"import getpass\n",
"\n",
"# Send the message through Gmail's SMTP server, upgrading the connection to TLS before logging in.\n",
"server = smtplib.SMTP('smtp.gmail.com:587')\n",
"try:\n",
"    server.starttls()\n",
"    server.login(your_gmail, gmailpass)\n",
"    server.sendmail(your_gmail, kindle_email, msg.as_string())\n",
"    server.quit()  # end the SMTP session cleanly once the mail is handed over\n",
"finally:\n",
"    # Always release the socket, including when TLS, login or sending fails;\n",
"    # close() is harmless after a successful quit().\n",
"    server.close()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"------------"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
@sergeyf
Copy link

sergeyf commented Jan 17, 2016

Thanks for this great example!

To make the commented out part work you need quadruple backslash:

        src[i] = re.sub(r'\\includegraphics\[width=([.\d]+)\\(line|text)width\]',
                   '\\includegraphics[width={mul}\\\\textwidth,height={mul}\\\\textheight,keepaspectratio]'.format(mul=mul),
                   line)

@bshillingford
Copy link
Author

@sergeyf Good catch, thanks!

@stared
Copy link

stared commented May 14, 2016

Great! (Though, does not work on all papers, for understandable reasons.)
In any case, while I'm in love with Jupyter Notebook, for such tool it seems more natural to have a standalone script (pip-installable?), or something for Calibre.

@deepwilson
Copy link

Gives me following error:
! LaTeX Error: File "eso-pic.sty\" not found.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment