Skip to content

Instantly share code, notes, and snippets.

@cerisara
Last active October 25, 2016 19:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cerisara/32b96789fb854f56adaabaa70b0c5667 to your computer and use it in GitHub Desktop.
Save cerisara/32b96789fb854f56adaabaa70b0c5667 to your computer and use it in GitHub Desktop.
Arxiv2Kindle
#!/bin/bash
# Arxiv2Kindle driver.
#
# Usage: <this script> <directory>
#
# Collects arXiv paper IDs with getarxivid.py, then for each ID downloads the
# LaTeX source (arxiv2md), converts it to Markdown (tex2md.py), to EPUB
# (pandoc) and finally to MOBI (ebook-convert) under papers/.
# getarxivid.py, arxiv2md and tex2md.py are expected in the current directory.

if [ $# -lt 1 ]; then
    echo "usage: $0 <directory>" >&2
    exit 1
fi

# arxiv paper ID
python2 getarxivid.py "$1" > arxivids || exit 1

# -p: a papers/ directory left over from an earlier run is not an error.
mkdir -p papers

# tex2md.py is invoked as ../tex2md.py from inside /tmp/tmp-arxiv2md.
cp tex2md.py /tmp/ || exit 1

for i in $(cat arxivids)
do
    # download latex
    if ! ./arxiv2md "$i" > /tmp/aa.log; then
        echo "skipping $i: arxiv2md failed" >&2
        continue
    fi

    # arxiv2md prints "<prog>:Found LaTeX source: <file>"; the third
    # ':'-separated field is the file name preceded by one space.
    ma=$(grep 'Found LaTeX source: ' /tmp/aa.log | cut -d':' -f3-)
    ma=${ma# }
    if [ -z "$ma" ]; then
        echo "skipping $i: no LaTeX source reported" >&2
        continue
    fi
    echo "$ma"

    # convert to markdown, then to epub. The subshell keeps the working
    # directory of the loop unchanged. Any failure skips this paper, so an
    # aa.epub left over from the previous paper is never saved under this ID.
    if ! ( cd /tmp/tmp-arxiv2md \
           && python2 ../tex2md.py "$ma" > aa.md \
           && pandoc -f markdown -t epub -o ../aa.epub aa.md ); then
        echo "skipping $i: conversion to epub failed" >&2
        continue
    fi

    # convert to mobi
    ebook-convert /tmp/aa.epub "papers/$i.mobi"
done
#!/usr/bin/perl
use Getopt::Std;
use Switch 'Perl6'; use Sys::Hostname; use Time::HiRes qw(usleep);#usleep($microseconds)
use File::Basename; use File::Copy; use File::Find; use Cwd;use File::Path;
# --------------------- OPTIONS --------------------------------
# Record the starting directory and the script's own name. $prog prefixes every
# message and also names the scratch directory, so it stays a package variable.
$cdir = cwd();
chomp($cdir);
$prog = basename($0);

# The spec lists only the switches that take an argument.
getopt('Tt', \%opts);

# Any switch that was parsed ends the run through usage(): -h asks for the
# help text, every other switch is reported as invalid.
for $opt (keys %opts) {
    if ($opt eq 'h') {
        usage();
        last;
    }
    else {
        usage("Not a valid option: $opt");
    }
}
# -------------------------------------------------------------------
# Exactly one positional argument is needed: the arXiv number.
if (@ARGV < 1) { usage(); }

$arxiv_no = $ARGV[0];

# The number ends up in command lines and file names further down, so accept
# only letters, digits, '.', '-' and at most one '/' (archive/number form).
if ($arxiv_no !~ m{\A[A-Za-z0-9.\-]+(?:/[A-Za-z0-9.\-]+)?\z}) {
    usage("Not a valid arXiv number: $arxiv_no");
}

$arxiv_abs = "http://arxiv.org/abs/";                  # url where abstract is
$arxiv_url = "http://arxiv.org/e-print/" . $arxiv_no;  # url where source of the paper is
# html page with meta information (title/authors). $arxiv_abs already ends in
# '/', so no extra separator is inserted.
$meta_url  = $arxiv_abs . $arxiv_no;

# In case the paper number is in the archive/000000 format, split it:
# $arxiv_ar receives the archive part and $arxiv_no the bare number.
$arxiv_ar = $arxiv_no;
if ($arxiv_no =~ m{\A(.*?)/(.*)\z}s) {
    $arxiv_ar = $1;
    $arxiv_no = $2;
}
print "${prog}:arxiv_ar = $arxiv_ar ; arxiv_no = $arxiv_no\n";
$mobi = "${arxiv_no}.mobi"; # final output file!
#--------------------------------------------------------------------
# Work in a fresh scratch directory named after the script.
$tmpdir = "/tmp/tmp-${prog}";
if (-d $tmpdir) {
    print "${prog}>rm -rf $tmpdir\n";
    rmtree($tmpdir);    # File::Path; no shell involved
    die "${prog}:could not remove $tmpdir\n" if -d $tmpdir;
}
mkdir $tmpdir or die "${prog}:cannot create $tmpdir: $!\n";
chdir $tmpdir or die "${prog}:cannot enter $tmpdir: $!\n";

# Fetch the e-print. The list form of system() hands each argument to wget
# unchanged, so nothing in the URL is interpreted by a shell.
print "${prog}>wget --user-agent=Mozilla/1.2 $arxiv_url\n";
system('wget', '--user-agent=Mozilla/1.2', $arxiv_url) == 0
    or die "${prog}:download of $arxiv_url failed\n";

# wget stores the download under the last URL component, i.e. $arxiv_no.
# A failed unpack is only reported here; the search for a .tex file that
# follows decides whether the run can continue.
print "${prog}>tar xvf $arxiv_no\n";
system('tar', 'xvf', $arxiv_no) == 0
    or warn "${prog}:tar could not unpack $arxiv_no\n";

$meta_source = "${arxiv_no}.html"; # html file with meta information (title/authors)
print "${prog}>wget --user-agent=Mozilla/1.2 -O $meta_source $meta_url\n";
# The abstract page only supplies title/authors for display, so a failed
# fetch is reported but does not stop the run.
system('wget', '--user-agent=Mozilla/1.2', '-O', $meta_source, $meta_url) == 0
    or warn "${prog}:could not fetch $meta_url\n";
# Locate the main LaTeX file: the first *.tex file (in glob order) that
# contains a \documentclass or \documentstyle declaration.
$latex_source = "";
foreach my $tex_file (glob('*.tex')) {
    # Scan the file in Perl rather than through a shell grep, so file names
    # with spaces or shell metacharacters are handled safely.
    open(my $tex_fh, '<', $tex_file) or next;
    my $is_main = 0;
    while (my $line = <$tex_fh>) {
        if ($line =~ /\\document(?:class|style)/) {
            $is_main = 1;
            last;
        }
    }
    close($tex_fh);
    if ($is_main) {
        $latex_source = $tex_file;
        last;    # stop at the first match ('break' is not a Perl loop control)
    }
}
if ($latex_source) {
    # NOTE: the bash driver greps for this exact line; keep its wording.
    print "${prog}:Found LaTeX source: $latex_source\n";
}
else {
    die "${prog}:LaTeX source not found!";
}
# Determine title and authors from the citation_* meta tags of the abstract
# page. $title always starts with "[<number>] "; $authors is the list of
# author names joined with '&' (empty if none were found).
$title = "[${arxiv_no}] ";
my @author_names;
if (open(my $meta_fh, '<', $meta_source)) {
    while (my $line = <$meta_fh>) {
        # Use $1 only after the content="..." match itself has succeeded.
        if ($line =~ /citation_title/ && $line =~ /content=\"(.*?)\"/) {
            $title .= $1;
        }
        if ($line =~ /citation_author/ && $line =~ /content=\"(.*?)\"/) {
            push @author_names, $1;
        }
    }
    close($meta_fh);
}
else {
    # Metadata is informational only, so a missing page is not fatal.
    warn "${prog}:cannot read $meta_source: $!\n";
}
$authors = join('&', @author_names);
print "${prog}:title= $title\n${prog}:authors= $authors\n";
print "--------------------------------------------------------\n";
print "${prog}:Temporary files can be found in $tmpdir\n";
print "${prog}:Title: $title\n";
print "${prog}:Authors: $authors\n";
print "${prog}:arXiv: $ARGV[0]\n";
# ----------------------- HELP MESSAGE ------------------------------
# usage([$error])
#   Prints the optional $error line followed by the usage text on STDERR and
#   terminates the script with exit status 1. Never returns.
#   Reads the package variable $prog for the script name.
sub usage {
    my $message = '';
    if (@_) { $message = shift(@_) . "\n"; }
    $message .= <<"EOF";
Usage: ${prog} <arxiv number>
e.g. ${prog} 1010.0957
${prog} hep-lat/0402031
Uses htlatex and ebook-convert (assumes TeX4ht and Calibre exist on your system)
On Ubuntu: sudo apt-get install texlive-full calibre
K. N. Anagnostopoulos, NTUA 2013
EOF
    print STDERR $message;
    exit(1);
}
# Hook for --help: it only forwards to usage(). The original author noted it
# may not take effect in every configuration.
sub main::HELP_MESSAGE() {
    usage();
}
# -------------------------------------------------------------------
# Print the arXiv paper IDs found in the *.txt files under the directory given as the first argument
import glob
import sys
import re
def find_arxiv_id(line):
    """Return the first new-style arXiv ID (NNNN.NNNNN) in `line`, or None.

    The separator must be a literal dot, so a plain run of digits such as
    "1234567890" is not mistaken for an ID.
    """
    m = re.search(r'\d{4}\.\d{5}', line)
    return m.group(0) if m else None


if __name__ == '__main__':
    # NOTE(review): without recursive matching, glob treats '**' like '*', so
    # only *.txt files exactly one directory below argv[1] are scanned.
    # Confirm that this is the intended layout.
    for path in glob.glob(sys.argv[1] + '/**/*.txt'):
        with open(path, 'r') as fh:
            for line in fh:
                # At most one ID (the first) is reported per line.
                arxiv_id = find_arxiv_id(line)
                if arxiv_id is not None:
                    print(arxiv_id)
import sys
import os.path
import glob
def getText(l):
    """Return `l` with LaTeX citation command names removed.

    Every \\cite-family command name (\\cite, \\citep, \\citet, \\citeauthor,
    starred forms, ...) is deleted as a whole, so no stray letters such as
    the "t" of \\citet are left behind. Only the command name is removed;
    its braced argument stays in the text.
    """
    import re  # local import: this script's import block does not include re
    return re.sub(r'\\cite[A-Za-z]*\*?', '', l)
def checkFig(l):
    """Emit a Markdown image line for an \\includegraphics command in `l`.

    The file name is the text between the first '{' after '\\includegraph'
    and the next '}'. A name ending in .pdf is converted to .png with the
    external `convert` program and the .png name is used instead. If the
    resulting name does not exist, the first file (in sorted order) whose
    name starts with it is used. Nothing is printed when the line holds no
    such command or no matching file is found.

    Diagnostics go to stderr so they do not end up in the generated Markdown.
    """
    import subprocess  # local import: not part of this script's import block

    start = l.find('\\includegraph')
    if start < 0:
        return
    lbrace = l.find('{', start)
    if lbrace < 0:
        return
    rbrace = l.find('}', lbrace)
    if rbrace < 0:
        return

    ff = l[lbrace + 1:rbrace]
    if ff.lower().endswith(".pdf"):
        png = ff[0:-4] + ".png"
        try:
            # Argument list instead of a shell string: the file name comes
            # from the document and may contain spaces or shell characters.
            subprocess.call(['convert', ff, png])
        except OSError:
            # `convert` is not available; the lookup below simply finds no png.
            pass
        ff = png

    if os.path.exists(ff):
        print("\n![img](" + ff + ")\n")
    else:
        # Sorted so the choice among several candidates is reproducible.
        ffs = sorted(glob.glob(ff + '*'))
        if len(ffs) > 1:
            sys.stderr.write("WARNING ambiguous pic %s\n" % (ffs,))
        if len(ffs) >= 1:
            print("\n![img](" + ffs[0] + ")\n")
# Convert one LaTeX file to Markdown-like text printed on stdout.
# tfich: path of the .tex file to read. A line starting with \input{name}
# makes the function call itself on name + '.tex'.
# NOTE(review): the indentation of this function is missing in this copy, so
# the nesting of the branches below must be restored before it can run. The
# comments describe individual statements, not a confirmed nesting.
def texfile(tfich):
# Read all lines of the file (opened in binary mode).
with open(tfich,'rb') as f : ls = f.readlines()
# empty: set after a blank line has been printed, cleared after a text line is
# printed -- presumably to avoid emitting several blank lines in a row.
empty=False
# eq: set by a line starting with \begin{eq and cleared by one starting with \end{eq.
eq=False
for l in ls:
l=l.strip()
# Blank input line: print one blank line unless one was just printed.
if len(l)==0:
if not empty:
empty=True
print("")
continue
# Line starting with a backslash (a LaTeX command).
if l[0]=='\\':
if eq:
if l.startswith('\\end{eq'): eq=False
else: print(l)
# Headings: the slice starts after '\section{' (9 characters),
# '\subsection{' (12) or '\subsubsection{' (15) and drops the last
# character -- assumes the line ends with the closing brace; TODO confirm.
if l.startswith('\\section'):
print('\n# '+getText(l[9:-1])+'\n')
elif l.startswith('\\subsection'):
print('\n## '+getText(l[12:-1])+'\n')
elif l.startswith('\\subsubsection'):
print('\n### '+getText(l[15:-1])+'\n')
elif l.startswith('\\begin{eq'): eq=True
elif l.startswith('\\input{'):
# Recurse into the included file; '.tex' is appended to the name in braces.
i=l.find('{')
if i>=0:
j=l.find('}',i)
if j>=0:
texfile(l[i+1:j]+'.tex')
continue
# drop every line that starts with a backslash
# (checkFig may still print an image line for it)
checkFig(l)
if not empty:
empty=True
print("")
elif l[0]=='%':
# drop every line that starts with %
# NOTE(review): the print(getText(l)) below sits under a comment saying such
# lines are dropped; which branch it belongs to cannot be told from this copy.
if not empty:
empty=True
print(getText(l))
else:
print(l)
empty=False
# Script entry point: convert the LaTeX file named by the first command-line argument.
texfile(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment