omimo/arxiv2kindle.ipynb

## arxiv2kindle.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import requests\n",
    "import lxml.html as html\n",
    "import re\n",
    "import urllib\n",
    "import os, sys, subprocess, os.path\n",
    "import glob\n",
    "import IPython.display\n",
    "import getpass\n",
    "import tempfile"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Change the following:\n",
    "The query can be an arXiv URL or any string containing an arXiv ID.\n",
    "\n",
    "It will prompt you for the Gmail account's password. Note that the account's security settings must have \"Allow less secure apps\" enabled, otherwise the Gmail SMTP server (used here over TLS) will reject the login."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# what to fetch and where to deliver it\n",
    "query = \"http://arxiv.org/abs/1511.08228\"\n",
    "kindle_email = 'ADDRESS_HERE@kindle.com'\n",
    "your_gmail = 'ADDRESS_HERE@gmail.com'\n",
    "gmailpass = getpass.getpass()  # prompted for at run time rather than hard-coded\n",
    "\n",
    "# paper settings (decrease width/height to increase font)\n",
    "landscape = True\n",
    "width = \"6in\"\n",
    "height = \"4in\"\n",
    "margin = \"0.2in\"\n",
    "\n",
    "# settings for latex geometry package; portrait mode swaps the two paper dimensions\n",
    "geom_settings = dict(\n",
    "    paperwidth=width if landscape else height,\n",
    "    paperheight=height if landscape else width,\n",
    "    margin=margin\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "----------"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# The ID may appear anywhere in `query` (a URL with any scheme, or free text), so search\n",
    "# for it rather than matching from the start of the string.\n",
    "id_match = re.search(r'(?<!\\d)(?P<id>\\d{4}\\.\\d{4,5}(v\\d{1,2})?)', query)\n",
    "if id_match is None:\n",
    "    raise ValueError('no arXiv ID found in query: %r' % (query,))\n",
    "arxiv_id = id_match.group('id')\n",
    "arxiv_abs = 'http://arxiv.org/abs/' + arxiv_id\n",
    "arxiv_pdf = 'http://arxiv.org/pdf/' + arxiv_id\n",
    "arxiv_pgtitle = html.fromstring(requests.get(arxiv_abs).text.encode('utf8')).xpath('/html/head/title/text()')[0]\n",
    "# Drop the leading bracketed prefix from the page title, then collapse whitespace runs.\n",
    "# (No flags argument: the 4th positional parameter of re.sub is `count`, not `flags`,\n",
    "# so passing re.DOTALL there would cap the number of substitutions at 16.)\n",
    "arxiv_title = re.sub(r'\\s+', ' ', re.sub(r'^\\[[^]]+\\]\\s*', '', arxiv_pgtitle))\n",
    "# file-name-safe variant: every run of other characters becomes a single underscore\n",
    "arxiv_title_scrubbed = re.sub('[^-_A-Za-z0-9]+', '_', arxiv_title)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Show the paper's title as a link to its abstract page, with a PDF link beneath it.\n",
    "link_template = '''\n",
    "<h2><a href=\"{abs}\">[{id}] {title}</a><br />\n",
    "[<a href=\"{pdf}\">pdf</a>]</h2>\n",
    "'''\n",
    "IPython.display.HTML(link_template.format(title=arxiv_title, id=arxiv_id, abs=arxiv_abs, pdf=arxiv_pdf))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---------------"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# scratch directory for the downloaded sources and everything built from them\n",
    "d = tempfile.mkdtemp(prefix='arxiv2kindle_')\n",
    "archive_path = os.path.join(d, 'src.tar.gz')\n",
    "\n",
    "# fetch the e-print archive with a browser-like user agent\n",
    "url = 'http://arxiv.org/e-print/' + arxiv_id\n",
    "!wget -O {archive_path} --user-agent=\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0\" {url}\n",
    "\n",
    "# unpack inside the scratch directory, which stays the working directory from here on\n",
    "os.chdir(d)\n",
    "!tar xvf src.tar.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Pick the main LaTeX file: the one whose first line of real content (skipping comment\n",
    "# lines and blank lines) is the documentclass line. After the loop, `texfile` and `src`\n",
    "# hold that file's path and its lines.\n",
    "texfiles = glob.glob(os.path.join(d, '*.tex'))\n",
    "for texfile in texfiles:\n",
    "    with open(texfile, 'r') as f:\n",
    "        src = f.readlines()\n",
    "    # empty default keeps an empty or comment-only file from raising\n",
    "    first_content = next((ln for ln in src if ln.strip() and ln[0] != '%'), '')\n",
    "    if 'documentclass' in first_content:\n",
    "        print('correct file: ' + texfile)\n",
    "        break\n",
    "else:\n",
    "    # Without this, a failed search would silently leave `texfile`/`src` pointing at the\n",
    "    # last file examined (or undefined when the archive has no .tex files at all).\n",
    "    raise RuntimeError('no .tex file starting with a documentclass line found in ' + d)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Drop comment lines for easier debugging, plus any blank lines at the very top so that\n",
    "# src[0] is the documentclass line. Blank lines further down are kept on purpose: LaTeX\n",
    "# treats them as paragraph breaks, so deleting them would run all paragraphs together.\n",
    "src = [line for line in src if line[0] != '%']\n",
    "while src and not src[0].strip():\n",
    "    del src[0]\n",
    "\n",
    "# strip font size, column stuff, and paper size stuff in documentclass line:\n",
    "src[0] = re.sub(r'\\b\\d+pt\\b', '', src[0])\n",
    "src[0] = re.sub(r'\\b\\w+column\\b', '', src[0])\n",
    "src[0] = re.sub(r'\\b\\w+paper\\b', '', src[0])\n",
    "src[0] = re.sub(r'(?<=\\[),', '', src[0]) # remove extraneous starting commas\n",
    "src[0] = re.sub(r',(?=[\\],])', '', src[0]) # remove extraneous middle/ending commas\n",
    "\n",
    "# find begin{document}:\n",
    "begindocs = [i for i, line in enumerate(src) if line.startswith(r'\\begin{document}')]\n",
    "assert(len(begindocs) == 1)\n",
    "# Each insert lands at the same index, so the lines end up in reverse order of insertion;\n",
    "# all of them sit directly above begin{document}.\n",
    "src.insert(begindocs[0], '\\\\usepackage['+','.join(k+'='+v for k,v in geom_settings.items())+']{geometry}\\n')\n",
    "src.insert(begindocs[0], '\\\\usepackage{times}\\n')\n",
    "src.insert(begindocs[0], '\\\\pagestyle{empty}\\n')\n",
    "if landscape:\n",
    "    src.insert(begindocs[0], '\\\\usepackage{pdflscape}\\n')\n",
    "\n",
    "# shrink figures to be at most the size of the page: cap the height with the same factor\n",
    "# as the width. The back-reference gives every match its own factor, including when one\n",
    "# line holds several includegraphics commands with different widths.\n",
    "fig_width_re = re.compile(r'\\\\includegraphics\\[width=([.\\d]+)\\\\(line|text)width\\]')\n",
    "fig_capped = r'\\\\includegraphics[width=\\g<1>\\\\textwidth,height=\\g<1>\\\\textheight,keepaspectratio]'\n",
    "src = [fig_width_re.sub(fig_capped, line) for line in src]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# keep the untouched original next to the rewritten file\n",
    "backup_path = texfile + '.bak'\n",
    "os.rename(texfile, backup_path)\n",
    "with open(texfile, 'w') as out:\n",
    "    out.write(''.join(src))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# nonstopmode keeps pdflatex from pausing for keyboard input when it hits an error; a\n",
    "# notebook shell command has nobody to answer that prompt. The repeated runs are the\n",
    "# usual way to let LaTeX settle cross-references.\n",
    "texout = !pdflatex -interaction=nonstopmode {texfile} && pdflatex -interaction=nonstopmode {texfile} && pdflatex -interaction=nonstopmode {texfile}\n",
    "# show only the tail of the log, where pdflatex reports the outcome\n",
    "texout[-8:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "------"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Open the result in the desktop's default PDF viewer. The path is handed over as an\n",
    "# argument list instead of being pasted into a shell command line, so spaces or shell\n",
    "# metacharacters in it cannot break (or alter) the command.\n",
    "pdffilename = texfile[:-4] + '.pdf'\n",
    "viewer = 'open' if sys.platform == 'darwin' else 'xdg-open'\n",
    "try:\n",
    "    subprocess.call([viewer, pdffilename])\n",
    "except OSError as err:\n",
    "    # previewing is a convenience only; a missing launcher should not stop the notebook\n",
    "    print('could not launch ' + viewer + ': ' + str(err))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "-------"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from email.mime.application import MIMEApplication\n",
    "from email.mime.multipart import MIMEMultipart\n",
    "# Build a multipart message whose only part is the generated PDF.\n",
    "msg = MIMEMultipart()\n",
    "# explicit originator/recipient headers; the SMTP envelope alone does not put them into the message\n",
    "msg['From'] = your_gmail\n",
    "msg['To'] = kindle_email\n",
    "# read the PDF inside a with-block so the file handle is closed right away\n",
    "with open(texfile[:-4]+'.pdf', 'rb') as pdf_file:\n",
    "    pdf_part = MIMEApplication(pdf_file.read(), _subtype='pdf')\n",
    "pdf_part.add_header('Content-Disposition', 'attachment', filename=arxiv_id+\"_\"+arxiv_title_scrubbed+\".pdf\")\n",
    "msg.attach(pdf_part)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import smtplib\n",
    "import getpass\n",
    "# Send the message through Gmail's SMTP server, upgrading the connection to TLS first.\n",
    "server = smtplib.SMTP('smtp.gmail.com:587')\n",
    "try:\n",
    "    server.starttls()\n",
    "    server.login(your_gmail, gmailpass)\n",
    "    server.sendmail(your_gmail, kindle_email, msg.as_string())\n",
    "    server.quit()  # end the SMTP session properly once the mail is handed over\n",
    "finally:\n",
    "    # always release the socket, also when TLS setup, login or sending fails;\n",
    "    # close() is harmless after a successful quit()\n",
    "    server.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "------------"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import requests\n",
	"import lxml.html as html\n",
	"import re\n",
	"import urllib\n",
	"import os, sys, subprocess, os.path\n",
	"import glob\n",
	"import IPython.display\n",
	"import getpass\n",
	"import tempfile"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Change the following:\n",
	"The query can be an arXiv URL or any string containing an arXiv ID.\n",
	"\n",
	"It will prompt you for the Gmail account's password. Note that the account's security settings must have \"Allow less secure apps\" enabled, otherwise the Gmail SMTP server (used here over TLS) will reject the login."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# what to fetch and where to deliver it\n",
	"query = \"http://arxiv.org/abs/1511.08228\"\n",
	"kindle_email = 'ADDRESS_HERE@kindle.com'\n",
	"your_gmail = 'ADDRESS_HERE@gmail.com'\n",
	"gmailpass = getpass.getpass()  # prompted for at run time rather than hard-coded\n",
	"\n",
	"# paper settings (decrease width/height to increase font)\n",
	"landscape = True\n",
	"width = \"6in\"\n",
	"height = \"4in\"\n",
	"margin = \"0.2in\"\n",
	"\n",
	"# settings for latex geometry package; portrait mode swaps the two paper dimensions\n",
	"geom_settings = dict(\n",
	"    paperwidth=width if landscape else height,\n",
	"    paperheight=height if landscape else width,\n",
	"    margin=margin\n",
	")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"----------"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# The ID may appear anywhere in `query` (a URL with any scheme, or free text), so search\n",
	"# for it rather than matching from the start of the string.\n",
	"id_match = re.search(r'(?<!\\d)(?P<id>\\d{4}\\.\\d{4,5}(v\\d{1,2})?)', query)\n",
	"if id_match is None:\n",
	"    raise ValueError('no arXiv ID found in query: %r' % (query,))\n",
	"arxiv_id = id_match.group('id')\n",
	"arxiv_abs = 'http://arxiv.org/abs/' + arxiv_id\n",
	"arxiv_pdf = 'http://arxiv.org/pdf/' + arxiv_id\n",
	"arxiv_pgtitle = html.fromstring(requests.get(arxiv_abs).text.encode('utf8')).xpath('/html/head/title/text()')[0]\n",
	"# Drop the leading bracketed prefix from the page title, then collapse whitespace runs.\n",
	"# (No flags argument: the 4th positional parameter of re.sub is `count`, not `flags`,\n",
	"# so passing re.DOTALL there would cap the number of substitutions at 16.)\n",
	"arxiv_title = re.sub(r'\\s+', ' ', re.sub(r'^\\[[^]]+\\]\\s*', '', arxiv_pgtitle))\n",
	"# file-name-safe variant: every run of other characters becomes a single underscore\n",
	"arxiv_title_scrubbed = re.sub('[^-_A-Za-z0-9]+', '_', arxiv_title)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Show the paper's title as a link to its abstract page, with a PDF link beneath it.\n",
	"link_template = '''\n",
	"<h2><a href=\"{abs}\">[{id}] {title}</a><br />\n",
	"[<a href=\"{pdf}\">pdf</a>]</h2>\n",
	"'''\n",
	"IPython.display.HTML(link_template.format(title=arxiv_title, id=arxiv_id, abs=arxiv_abs, pdf=arxiv_pdf))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"---------------"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# scratch directory for the downloaded sources and everything built from them\n",
	"d = tempfile.mkdtemp(prefix='arxiv2kindle_')\n",
	"archive_path = os.path.join(d, 'src.tar.gz')\n",
	"\n",
	"# fetch the e-print archive with a browser-like user agent\n",
	"url = 'http://arxiv.org/e-print/' + arxiv_id\n",
	"!wget -O {archive_path} --user-agent=\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0\" {url}\n",
	"\n",
	"# unpack inside the scratch directory, which stays the working directory from here on\n",
	"os.chdir(d)\n",
	"!tar xvf src.tar.gz"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Pick the main LaTeX file: the one whose first line of real content (skipping comment\n",
	"# lines and blank lines) is the documentclass line. After the loop, `texfile` and `src`\n",
	"# hold that file's path and its lines.\n",
	"texfiles = glob.glob(os.path.join(d, '*.tex'))\n",
	"for texfile in texfiles:\n",
	"    with open(texfile, 'r') as f:\n",
	"        src = f.readlines()\n",
	"    # empty default keeps an empty or comment-only file from raising\n",
	"    first_content = next((ln for ln in src if ln.strip() and ln[0] != '%'), '')\n",
	"    if 'documentclass' in first_content:\n",
	"        print('correct file: ' + texfile)\n",
	"        break\n",
	"else:\n",
	"    # Without this, a failed search would silently leave `texfile`/`src` pointing at the\n",
	"    # last file examined (or undefined when the archive has no .tex files at all).\n",
	"    raise RuntimeError('no .tex file starting with a documentclass line found in ' + d)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Drop comment lines for easier debugging, plus any blank lines at the very top so that\n",
	"# src[0] is the documentclass line. Blank lines further down are kept on purpose: LaTeX\n",
	"# treats them as paragraph breaks, so deleting them would run all paragraphs together.\n",
	"src = [line for line in src if line[0] != '%']\n",
	"while src and not src[0].strip():\n",
	"    del src[0]\n",
	"\n",
	"# strip font size, column stuff, and paper size stuff in documentclass line:\n",
	"src[0] = re.sub(r'\\b\\d+pt\\b', '', src[0])\n",
	"src[0] = re.sub(r'\\b\\w+column\\b', '', src[0])\n",
	"src[0] = re.sub(r'\\b\\w+paper\\b', '', src[0])\n",
	"src[0] = re.sub(r'(?<=\\[),', '', src[0]) # remove extraneous starting commas\n",
	"src[0] = re.sub(r',(?=[\\],])', '', src[0]) # remove extraneous middle/ending commas\n",
	"\n",
	"# find begin{document}:\n",
	"begindocs = [i for i, line in enumerate(src) if line.startswith(r'\\begin{document}')]\n",
	"assert(len(begindocs) == 1)\n",
	"# Each insert lands at the same index, so the lines end up in reverse order of insertion;\n",
	"# all of them sit directly above begin{document}.\n",
	"src.insert(begindocs[0], '\\\\usepackage['+','.join(k+'='+v for k,v in geom_settings.items())+']{geometry}\\n')\n",
	"src.insert(begindocs[0], '\\\\usepackage{times}\\n')\n",
	"src.insert(begindocs[0], '\\\\pagestyle{empty}\\n')\n",
	"if landscape:\n",
	"    src.insert(begindocs[0], '\\\\usepackage{pdflscape}\\n')\n",
	"\n",
	"# shrink figures to be at most the size of the page: cap the height with the same factor\n",
	"# as the width. The back-reference gives every match its own factor, including when one\n",
	"# line holds several includegraphics commands with different widths.\n",
	"fig_width_re = re.compile(r'\\\\includegraphics\\[width=([.\\d]+)\\\\(line|text)width\\]')\n",
	"fig_capped = r'\\\\includegraphics[width=\\g<1>\\\\textwidth,height=\\g<1>\\\\textheight,keepaspectratio]'\n",
	"src = [fig_width_re.sub(fig_capped, line) for line in src]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# keep the untouched original next to the rewritten file\n",
	"backup_path = texfile + '.bak'\n",
	"os.rename(texfile, backup_path)\n",
	"with open(texfile, 'w') as out:\n",
	"    out.write(''.join(src))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# nonstopmode keeps pdflatex from pausing for keyboard input when it hits an error; a\n",
	"# notebook shell command has nobody to answer that prompt. The repeated runs are the\n",
	"# usual way to let LaTeX settle cross-references.\n",
	"texout = !pdflatex -interaction=nonstopmode {texfile} && pdflatex -interaction=nonstopmode {texfile} && pdflatex -interaction=nonstopmode {texfile}\n",
	"# show only the tail of the log, where pdflatex reports the outcome\n",
	"texout[-8:]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"------"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Open the result in the desktop's default PDF viewer. The path is handed over as an\n",
	"# argument list instead of being pasted into a shell command line, so spaces or shell\n",
	"# metacharacters in it cannot break (or alter) the command.\n",
	"pdffilename = texfile[:-4] + '.pdf'\n",
	"viewer = 'open' if sys.platform == 'darwin' else 'xdg-open'\n",
	"try:\n",
	"    subprocess.call([viewer, pdffilename])\n",
	"except OSError as err:\n",
	"    # previewing is a convenience only; a missing launcher should not stop the notebook\n",
	"    print('could not launch ' + viewer + ': ' + str(err))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"-------"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"from email.mime.application import MIMEApplication\n",
	"from email.mime.multipart import MIMEMultipart\n",
	"# Build a multipart message whose only part is the generated PDF.\n",
	"msg = MIMEMultipart()\n",
	"# explicit originator/recipient headers; the SMTP envelope alone does not put them into the message\n",
	"msg['From'] = your_gmail\n",
	"msg['To'] = kindle_email\n",
	"# read the PDF inside a with-block so the file handle is closed right away\n",
	"with open(texfile[:-4]+'.pdf', 'rb') as pdf_file:\n",
	"    pdf_part = MIMEApplication(pdf_file.read(), _subtype='pdf')\n",
	"pdf_part.add_header('Content-Disposition', 'attachment', filename=arxiv_id+\"_\"+arxiv_title_scrubbed+\".pdf\")\n",
	"msg.attach(pdf_part)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import smtplib\n",
	"import getpass\n",
	"# Send the message through Gmail's SMTP server, upgrading the connection to TLS first.\n",
	"server = smtplib.SMTP('smtp.gmail.com:587')\n",
	"try:\n",
	"    server.starttls()\n",
	"    server.login(your_gmail, gmailpass)\n",
	"    server.sendmail(your_gmail, kindle_email, msg.as_string())\n",
	"    server.quit()  # end the SMTP session properly once the mail is handed over\n",
	"finally:\n",
	"    # always release the socket, also when TLS setup, login or sending fails;\n",
	"    # close() is harmless after a successful quit()\n",
	"    server.close()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"------------"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.10"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}