sergeyf/arxiv2kindle.py

## arxiv2kindle.py
'''
Based on: https://gist.github.com/bshillingford/6259986edca707ca58dd
Modified to work on Windows by: Sergey Feldman
Jan 17, 2016

Requirements: pdflatex, bibtex
'''

import requests
import lxml.html as html
import re
import os, os.path
import glob
import getpass
import urllib
import tarfile
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
import smtplib
import shutil

# Fill in with your own info
query = 'http://arxiv.org/abs/1506.05908'
kindle_email = 'YOUR_KINDLE_NAME@kindle.com'
your_gmail = 'YOUR_EMAIL@EMAIL.COM'
temp_dir = '\\temp' # where the intermediate files are stored

# paper settings (decrease width/height to increase font)
landscape = True
width = "6in"
height = "4in"
margin = "0.1in"
# settings for latex geometry package:
if landscape:
    geom_settings = dict(paperwidth=width, paperheight=height, margin=margin)
else:
    geom_settings = dict(paperwidth=height, paperheight=width, margin=margin)

# optional http(s) prefix followed by an arXiv id such as 1506.05908v2
ARXIV_ID_RE = re.compile(r'(https?://.*?/)?(?P<id>\d{4}\.\d{4,5}(v\d{1,2})?)')
# \includegraphics whose only option is a width given as a multiple of
# \linewidth or \textwidth
FIGURE_WIDTH_RE = re.compile(r'\\includegraphics\[width=([.\d]+)\\(line|text)width\]')


def parse_arxiv_id(query):
    '''Return the arXiv id (with version suffix, if any) found at the start of query.

    query may be a bare id or an http/https link ending in the id.
    Raises ValueError when no id can be found.
    '''
    m = ARXIV_ID_RE.match(query)
    if m is None:
        raise ValueError('no arXiv id found in query: ' + repr(query))
    return m.group('id')


def clean_title(page_title):
    '''Remove the leading "[...]" tag from a page title and collapse whitespace runs.'''
    # no flags argument here: the fourth positional parameter of re.sub is
    # `count`, so passing re.DOTALL there capped the substitutions at 16
    return re.sub(r'\s+', ' ', re.sub(r'^\[[^]]+\]\s*', '', page_title))


def scrub_title(title):
    '''Replace every run of characters other than letters, digits, - and _ with "_".'''
    return re.sub('[^-_A-Za-z0-9]+', '_', title)


def fetch_title(arxiv_id):
    '''Download the abstract page of arxiv_id and return its cleaned-up <title> text.'''
    page = requests.get('http://arxiv.org/abs/' + arxiv_id)
    page.raise_for_status()
    titles = html.fromstring(page.text.encode('utf8')).xpath('/html/head/title/text()')
    if not titles:
        raise RuntimeError('abstract page for ' + arxiv_id + ' has no <title>')
    return clean_title(titles[0])


def download_source(arxiv_id, workdir):
    '''Download the e-print archive of arxiv_id into workdir and return its path.'''
    # urlretrieve moved between Python 2 and 3; import whichever exists
    try:
        from urllib.request import urlretrieve # Python 3
    except ImportError:
        from urllib import urlretrieve # Python 2
    tar_fn = os.path.join(workdir, arxiv_id + '.tar.gz')
    urlretrieve('http://arxiv.org/e-print/' + arxiv_id, tar_fn)
    return tar_fn


def extract_source(tar_fn, workdir):
    '''Extract the gzipped tar archive tar_fn into workdir.

    Raises ValueError if a member would be written outside of workdir.
    '''
    root = os.path.realpath(workdir)
    with tarfile.open(tar_fn, 'r:gz') as tar:
        for item in tar:
            target = os.path.realpath(os.path.join(root, item.name))
            # the archive is downloaded content: refuse names such as
            # ../../x or absolute paths that escape the working directory
            if target != root and not target.startswith(root + os.sep):
                raise ValueError('archive member escapes ' + root + ': ' + item.name)
            tar.extract(item, root)


def drop_comment_lines(lines):
    '''Return lines without those that start with "%".

    Blank lines are kept on purpose: in LaTeX they are paragraph breaks.
    '''
    return [line for line in lines if not line.startswith('%')]


def find_documentclass(lines):
    '''Return the index of the first line starting with \\documentclass, or None.'''
    for i, line in enumerate(lines):
        if line.lstrip().startswith('\\documentclass'):
            return i
    return None


def find_main_tex(workdir):
    '''Return (path, lines) of the first .tex file in workdir that has a \\documentclass line.

    Raises RuntimeError when no such file exists.
    '''
    for path in sorted(glob.glob(os.path.join(workdir, '*.tex'))):
        with open(path, 'r') as f:
            lines = f.readlines()
        if find_documentclass(drop_comment_lines(lines)) is not None:
            return path, lines
    raise RuntimeError('no .tex file with \\documentclass found in ' + workdir)


def strip_class_options(line):
    '''Remove font size, column and paper size options from a \\documentclass line.'''
    line = re.sub(r'\b\d+pt\b', '', line)
    line = re.sub(r'\b\w+column\b', '', line)
    line = re.sub(r'\b\w+paper\b', '', line)
    line = re.sub(r'(?<=\[),', '', line) # remove extraneous starting commas
    line = re.sub(r',(?=[\],])', '', line) # remove extraneous middle/ending commas
    return line


def shrink_figures(line):
    '''Bound width-scaled \\includegraphics in line by the page width and height.'''
    def fit_to_page(m):
        mul = m.group(1)
        # built by concatenation and returned from a function so that the
        # backslashes are not interpreted as regex replacement escapes
        return ('\\includegraphics[width=' + mul + '\\textwidth,height='
                + mul + '\\textheight,keepaspectratio]')
    return FIGURE_WIDTH_RE.sub(fit_to_page, line)


def prepare_source(lines, geometry=None, use_landscape=None):
    '''Return a copy of the LaTeX source lines adapted to the small page size.

    lines         -- lines of the main .tex file, as returned by readlines()
    geometry      -- options for the geometry package (default: geom_settings)
    use_landscape -- whether to load pdflscape (default: landscape)

    Raises ValueError unless the source has a \\documentclass line and
    exactly one line starting with \\begin{document}.
    '''
    if geometry is None:
        geometry = geom_settings
    if use_landscape is None:
        use_landscape = landscape

    src = drop_comment_lines(lines)

    class_idx = find_documentclass(src)
    if class_idx is None:
        raise ValueError('no \\documentclass line found')
    src[class_idx] = strip_class_options(src[class_idx])

    begindocs = [i for i, line in enumerate(src) if line.startswith(r'\begin{document}')]
    if len(begindocs) != 1:
        raise ValueError('expected exactly one \\begin{document}, found %d' % len(begindocs))

    # inserted right before \begin{document}, in this order
    preamble = []
    if use_landscape:
        preamble.append('\\usepackage{pdflscape}\n')
    preamble.append('\\usepackage{epstopdf}\n') # so eps will work with pdflatex
    preamble.append('\\pagestyle{empty}\n')
    preamble.append('\\usepackage{times}\n')
    preamble.append('\\usepackage[' + ','.join(k + '=' + v for k, v in geometry.items()) + ']{geometry}\n')
    src[begindocs[0]:begindocs[0]] = preamble

    # shrink figures to be at most the size of the page
    return [shrink_figures(line) for line in src]


def compile_pdf(texfile, workdir):
    '''Run pdflatex/bibtex on texfile inside workdir and return the path of the PDF.

    Raises RuntimeError if no PDF was produced.
    '''
    import subprocess

    base = os.path.splitext(os.path.basename(texfile))[0]
    # argument lists instead of shell strings: the file name comes from the
    # downloaded archive; nonstopmode keeps pdflatex from waiting for input
    latex = ['pdflatex', '-interaction=nonstopmode', base + '.tex']
    # exit codes are ignored on purpose, e.g. bibtex fails when the paper
    # has no bibliography; the check for the PDF below is what matters
    subprocess.call(latex, cwd=workdir)
    subprocess.call(['bibtex', base], cwd=workdir) # bibtex takes the job name, not the .tex file
    subprocess.call(latex, cwd=workdir)
    subprocess.call(latex, cwd=workdir)

    pdf_path = os.path.join(workdir, base + '.pdf')
    if not os.path.isfile(pdf_path):
        raise RuntimeError('pdflatex did not produce ' + pdf_path)
    return pdf_path


def send_to_kindle(pdf_path, password):
    '''Email the file at pdf_path from your_gmail to kindle_email through Gmail.'''
    file_name = os.path.basename(pdf_path)
    msg = MIMEMultipart()
    msg['From'] = your_gmail
    msg['To'] = kindle_email
    msg['Subject'] = file_name
    with open(pdf_path, 'rb') as f:
        pdf_part = MIMEApplication(f.read(), _subtype='pdf')
    pdf_part.add_header('Content-Disposition', 'attachment', filename=file_name)
    msg.attach(pdf_part)

    server = smtplib.SMTP('smtp.gmail.com:587')
    try:
        server.starttls()
        server.login(your_gmail, password)
        server.sendmail(your_gmail, kindle_email, msg.as_string())
        server.quit()
    finally:
        server.close()


def main():
    '''Download the paper named by query, re-typeset it and email the PDF.'''
    arxiv_id = parse_arxiv_id(query)
    gmailpass = getpass.getpass()
    file_name = scrub_title(fetch_title(arxiv_id)) + ".pdf"

    # make a temporary directory to store the tex files and download
    # tar.gz of the source
    d = os.path.join(temp_dir, arxiv_id)
    if os.path.isdir(d):
        shutil.rmtree(d) # left over from an earlier, interrupted run
    os.makedirs(d)
    try:
        extract_source(download_source(arxiv_id, d), d)

        texfile, src = find_main_tex(d)
        print('correct file: ' + texfile)
        src = prepare_source(src)

        # write updated tex
        os.rename(texfile, texfile + '.bak')
        with open(texfile, 'w') as f:
            f.writelines(src)

        pdf_path = os.path.join(d, file_name)
        os.rename(compile_pdf(texfile, d), pdf_path)
        send_to_kindle(pdf_path, gmailpass)
    finally:
        # clean up - delete the directory and its files
        shutil.rmtree(d, ignore_errors=True)


if __name__ == '__main__':
    main()
	'''
	Based on: https://gist.github.com/bshillingford/6259986edca707ca58dd
	Modified to work on Windows by: Sergey Feldman
	Jan 17, 2016

	Requirements: pdflatex, bibtex
	'''

	import requests
	import lxml.html as html
	import re
	import os, os.path
	import glob
	import getpass
	import urllib
	import tarfile
	from email.mime.application import MIMEApplication
	from email.mime.multipart import MIMEMultipart
	import smtplib
	import shutil

	# Fill in with your own info
	query = 'http://arxiv.org/abs/1506.05908'
	kindle_email = 'YOUR_KINDLE_NAME@kindle.com'
	your_gmail = 'YOUR_EMAIL@EMAIL.COM'
	gmailpass = getpass.getpass()
	temp_dir = '\\temp' # where the intermediate files are stored

	# paper settings (decrease width/height to increase font)
	landscape = True
	width = "6in"
	height = "4in"
	margin = "0.1in"
	# settings for latex geometry package:
	if landscape:
	    geom_settings = dict(paperwidth=width, paperheight=height, margin=margin)
	else:
	    geom_settings = dict(paperwidth=height, paperheight=width, margin=margin)

	# the link may be http or https; fail with a clear message when no id is found
	id_match = re.match(r'(https?://.*?/)?(?P<id>\d{4}\.\d{4,5}(v\d{1,2})?)', query)
	if id_match is None:
	    raise ValueError('no arXiv id found in query: ' + repr(query))
	arxiv_id = id_match.group('id')
	arxiv_abs = 'http://arxiv.org/abs/' + arxiv_id
	arxiv_pdf = 'http://arxiv.org/pdf/' + arxiv_id
	arxiv_pgtitle = html.fromstring(requests.get(arxiv_abs).text.encode('utf8')).xpath('/html/head/title/text()')[0]
	# no flags argument: the fourth positional parameter of re.sub is `count`,
	# so passing re.DOTALL there capped the number of substitutions at 16
	arxiv_title = re.sub(r'\s+', ' ', re.sub(r'^\[[^]]+\]\s*', '', arxiv_pgtitle))
	arxiv_title_scrubbed = re.sub('[^-_A-Za-z0-9]+', '_', arxiv_title)

	# make a temporary directory to store the tex files and download
	# tar.gz of the source
	d = temp_dir + '\\' + arxiv_id
	os.mkdir(d)
	cwd = os.getcwd()
	os.chdir(d)
	try:
	    tar_fn = arxiv_id + '.tar.gz'
	    url = 'http://arxiv.org/e-print/' + arxiv_id
	    # urlretrieve moved between Python 2 and 3; import whichever exists
	    try:
	        from urllib.request import urlretrieve # Python 3
	    except ImportError:
	        from urllib import urlretrieve # Python 2
	    urlretrieve(url, tar_fn)
	    with tarfile.open(tar_fn, 'r:gz') as tar:
	        for item in tar:
	            tar.extract(item)

	    # find the files with .tex
	    # and get the main
	    texfiles = glob.glob(os.path.join(d, '*.tex'))
	    for texfile in texfiles:
	        with open(texfile, 'r') as f:
	            src = f.readlines()
	        if src and 'documentclass' in src[0]:
	            print('correct file: ' + texfile)
	            break
	    else:
	        # without this the last file looked at would be used silently
	        raise RuntimeError('no .tex file starting with \\documentclass found in ' + d)

	    # filter full-line comments for easier debugging; blank lines are
	    # kept because they are paragraph breaks in LaTeX
	    src = [line for line in src if not line.startswith('%')]

	    # strip font size, column stuff, and paper size stuff in documentclass line:
	    src[0] = re.sub(r'\b\d+pt\b', '', src[0])
	    src[0] = re.sub(r'\b\w+column\b', '', src[0])
	    src[0] = re.sub(r'\b\w+paper\b', '', src[0])
	    src[0] = re.sub(r'(?<=\[),', '', src[0]) # remove extraneous starting commas
	    src[0] = re.sub(r',(?=[\],])', '', src[0]) # remove extraneous middle/ending commas

	    # find begin{document}:
	    begindocs = [i for i, line in enumerate(src) if line.startswith(r'\begin{document}')]
	    if len(begindocs) != 1:
	        raise ValueError('expected exactly one \\begin{document}, found %d' % len(begindocs))
	    src.insert(begindocs[0], '\\usepackage['+','.join(k+'='+v for k,v in geom_settings.items())+']{geometry}\n')
	    src.insert(begindocs[0], '\\usepackage{times}\n')
	    src.insert(begindocs[0], '\\pagestyle{empty}\n')
	    src.insert(begindocs[0], '\\usepackage{epstopdf}\n') # so eps will work with pdflatex
	    if landscape:
	        src.insert(begindocs[0], '\\usepackage{pdflscape}\n')

	    # shrink figures to be at most the size of the page, now that it's landscape
	    figure_re = re.compile(r'\\includegraphics\[width=([.\d]+)\\(line|text)width\]')

	    def fit_to_page(m):
	        # returned from a function so that the backslashes are not
	        # interpreted as regex replacement escapes
	        mul = m.group(1)
	        return ('\\includegraphics[width=' + mul + '\\textwidth,height='
	                + mul + '\\textheight,keepaspectratio]')

	    src = [figure_re.sub(fit_to_page, line) for line in src]

	    # write updated tex
	    os.rename(texfile, texfile+'.bak')
	    with open(texfile, 'w') as f:
	        f.writelines(src)

	    # compile; nonstopmode keeps pdflatex from waiting for input on errors
	    import subprocess
	    base = os.path.splitext(os.path.basename(texfile))[0]
	    # argument lists instead of shell strings: the file name comes from
	    # the downloaded archive
	    latex = ['pdflatex', '-interaction=nonstopmode', base + '.tex']
	    subprocess.call(latex)
	    subprocess.call(['bibtex', base]) # bibtex takes the job name, not the .tex file
	    subprocess.call(latex)
	    subprocess.call(latex)
	    file_name = arxiv_title_scrubbed+".pdf"
	    os.rename(base+'.pdf', file_name)

	    # send the email
	    msg = MIMEMultipart()
	    with open(file_name, 'rb') as f:
	        pdf_part = MIMEApplication(f.read(), _subtype='pdf')
	    pdf_part.add_header('Content-Disposition', 'attachment', filename=file_name)
	    msg.attach(pdf_part)

	    server = smtplib.SMTP('smtp.gmail.com:587')
	    try:
	        server.starttls()
	        server.login(your_gmail, gmailpass)
	        server.sendmail(your_gmail, kindle_email, msg.as_string())
	    finally:
	        server.close()
	finally:
	    # clean up - delete the directory and its files, even after a failure
	    os.chdir(cwd)
	    shutil.rmtree(d, ignore_errors=True)