cerisara/Arxiv2Kindle.sh

## Arxiv2Kindle.sh
#!/bin/bash
#
# Arxiv2Kindle.sh - build a .mobi e-book for every arXiv paper referenced in
# a directory of text files.
#
# Usage: Arxiv2Kindle.sh <directory>
#
# For each arXiv ID printed by getarxivid.py:
#   1. ./arxiv2md downloads and unpacks the LaTeX source in /tmp/tmp-arxiv2md
#   2. tex2md.py converts the main .tex file to Markdown
#   3. pandoc converts the Markdown to EPUB
#   4. ebook-convert converts the EPUB to papers/<id>.mobi
#
# getarxivid.py, arxiv2md and tex2md.py are referenced with relative paths,
# so the script has to be started from the directory that contains them.

if [ "$#" -lt 1 ]; then
    echo "Usage: $0 <directory>" >&2
    exit 1
fi

# arxiv paper ID
python2 getarxivid.py "$1" > arxivids || exit 1

# -p: an existing papers/ directory from an earlier run is not an error
mkdir -p papers || exit 1

# tex2md.py is run below as ../tex2md.py from inside /tmp/tmp-arxiv2md;
# copying it once is enough, it does not change between papers
cp tex2md.py /tmp/ || exit 1

# The ID list is read on file descriptor 3 so that the commands in the loop
# body cannot swallow it through their standard input.
while IFS= read -r i <&3
do
    [ -n "$i" ] || continue

    # download latex
    if ! ./arxiv2md "$i" > /tmp/aa.log; then
        echo "$0: arxiv2md failed for $i, skipping" >&2
        continue
    fi
    # The log line reads "<prog>:Found LaTeX source: <file>": keep what
    # follows the second ':' and drop the blank in front of the file name.
    ma=$(grep 'Found LaTeX source: ' /tmp/aa.log | cut -d':' -f3- | sed 's/^ *//')
    if [ -z "$ma" ]; then
        echo "$0: no LaTeX source reported for $i, skipping" >&2
        continue
    fi
    echo "$ma"

    # convert to markdown, then to epub; the subshell confines the cd, and a
    # failure skips the paper instead of converting a stale /tmp/aa.epub
    if ! (
        cd /tmp/tmp-arxiv2md || exit 1
        python2 ../tex2md.py "$ma" > aa.md || exit 1
        pandoc -f markdown -t epub -o ../aa.epub aa.md
    ); then
        echo "$0: conversion to epub failed for $i, skipping" >&2
        continue
    fi

    # convert to mobi
    ebook-convert /tmp/aa.epub "papers/$i.mobi"
done 3< arxivids

## arxiv2md
#!/usr/bin/perl
# Download the source of one arXiv paper into /tmp/tmp-<program name>, unpack
# it, locate the main LaTeX file and report it together with the title and
# the authors taken from the abstract page.
#
# The line "<prog>:Found LaTeX source: <file>" printed on STDOUT is parsed by
# Arxiv2Kindle.sh, so its wording must not change.
use Getopt::Std;
use Switch 'Perl6'; use Sys::Hostname; use Time::HiRes qw(usleep);#usleep($microseconds)
use File::Basename; use File::Copy; use File::Find; use Cwd;use File::Path;
# --------------------- OPTIONS       --------------------------------
$cdir = cwd;chomp($cdir);
$prog = basename($0);
getopt('Tt', \%opts); #here we add only options with args
foreach $opt (keys %opts){
 given($opt){
  when 'h' {usage();last;}
  default  {usage("Not a valid option: $opt");}
 }
}
# -------------------------------------------------------------------
if($#ARGV < 0){usage();}
$arxiv_url  = "http://arxiv.org/e-print/"; # url where source of papers is
$arxiv_abs  = "http://arxiv.org/abs/";     # url where abstract is
$arxiv_no   = $ARGV[0];

$arxiv_url .= $arxiv_no;
# $arxiv_abs already ends with '/', so no extra separator is inserted here
$meta_url   = "${arxiv_abs}${arxiv_no}";# html file with meta information (title/authors)
# in case the paper number is in the arxiv/000000 format, then redefine $arxiv_no:
$arxiv_ar   = "$arxiv_no";
if( $arxiv_no =~ /(.*?)\/(.*?)$/){ $arxiv_ar = $1; $arxiv_no = $2;}
print "${prog}:arxiv_ar = $arxiv_ar ; arxiv_no = $arxiv_no\n";
$mobi       = "${arxiv_no}.mobi"; # final output file!
#--------------------------------------------------------------------
# Start from an empty working directory on every run.
$tmpdir = "/tmp/tmp-${prog}";
if( -d $tmpdir){
 print "${prog}>rm -rf $tmpdir\n";
 rmtree($tmpdir);
}
mkdir($tmpdir) or die "${prog}:cannot create $tmpdir: $!\n";
chdir($tmpdir) or die "${prog}:cannot enter $tmpdir: $!\n";
# External commands are run in list form: no shell is involved, so nothing in
# the paper number can be interpreted as shell syntax.
print "${prog}>wget  --user-agent=Mozilla/1.2 $arxiv_url\n";
system("wget", "--user-agent=Mozilla/1.2", $arxiv_url) == 0
  or die "${prog}:download of $arxiv_url failed\n";
print "${prog}>tar xvf $arxiv_no\n";
# Not fatal: the search for a .tex file below decides whether to go on.
system("tar", "xvf", $arxiv_no) == 0
  or warn "${prog}:tar could not unpack $arxiv_no\n";
$meta_source = "${arxiv_no}.html"; # html file with meta information (title/authors)
print "${prog}>wget  --user-agent=Mozilla/1.2 -O $meta_source $meta_url\n";
# Not fatal: only the title and the authors depend on this page.
system("wget", "--user-agent=Mozilla/1.2", "-O", $meta_source, $meta_url) == 0
  or warn "${prog}:download of $meta_url failed\n";
# The main file is the first *.tex that declares a document class or style.
# The files are read directly rather than handed to an external grep, as
# their names come from the downloaded archive.
$latex_source = "";
foreach $f (<*.tex>){
 open(TEX, "<", $f) or next;
 $results = grep { /\\document(?:class|style)/ } <TEX>;
 close(TEX);
 if($results){$latex_source = $f;last;}
}
if( $latex_source){
 print "${prog}:Found LaTeX source: $latex_source\n";
} else{
 die "${prog}:LaTeX source not found!";
}
# Determine title and authors:
$title = "[${arxiv_no}] "; $authors = "";
if(open(META, "<", $meta_source)){
 while(<META>){
  # $1 is only used once the content= match itself has succeeded
  if(/citation_title/  && /content=\"(.*?)\"/){ $title   .= $1;}
  if(/citation_author/ && /content=\"(.*?)\"/){ $authors .= "$1&";}
 }
 close(META);
} else{
 warn "${prog}:cannot read $meta_source: $!\n";
}
$authors =~ s/&$//; #remove trailing &
print "${prog}:title= $title\n${prog}:authors= $authors\n";

print "--------------------------------------------------------\n";
print "${prog}:Temporary files can be found in $tmpdir\n";
print "${prog}:Title:   $title\n";
print "${prog}:Authors: $authors\n";
print "${prog}:arXiv:   $ARGV[0]\n";
# ----------------------- HELP MESSAGE ------------------------------
# Print an optional error message (first argument) followed by the usage
# text on STDERR, then terminate the program with exit status 1.
# Declared without a prototype because it is called both with and without
# an argument.
sub usage{
 my $message = "";
 if(@_){$message = shift(@_)."\n";}
 $message .= <<"EOF";
Usage:   ${prog} <arxiv number>
    e.g. ${prog} 1010.0957
         ${prog} hep-lat/0402031

Uses htlatex and ebook-convert  (assumes TeX4ht and Calibre exist on your system)
On Ubuntu: sudo apt-get  install texlive-full calibre
K. N. Anagnostopoulos, NTUA 2013
EOF
 print STDERR $message;
 exit(1);
}
sub main::HELP_MESSAGE(){ usage();} #for --help (does not work when default?)
# -------------------------------------------------------------------

## getarxivid.py
# Print the arXiv paper IDs found in the .txt files under the directory given as first argument

import glob
import sys
import re

# New-style arXiv identifier: YYMM.NNNN or YYMM.NNNNN.  The dot is escaped so
# that only a literal '.' separates the two halves, and the look-arounds
# reject candidates embedded in a longer run of digits.
ARXIV_ID = re.compile(r'(?<![0-9])[0-9]{4}\.[0-9]{4,5}(?![0-9])')


def first_arxiv_id(line):
    """Return the first arXiv identifier found in ``line``, or None."""
    m = ARXIV_ID.search(line)
    if m is None:
        return None
    return m.group(0)


def print_arxiv_ids(root):
    """Print the arXiv identifiers found in the .txt files below ``root``.

    Every directory level under ``root`` is searched.  At most one identifier
    (the first) is taken from each line, and an identifier that was already
    printed is not printed again.
    """
    import io
    import os

    seen = set()
    for dirpath, dirnames, filenames in os.walk(root):
        # like a glob '*', neither descend into nor read hidden entries;
        # sorting makes the output order reproducible
        dirnames[:] = sorted(d for d in dirnames if not d.startswith('.'))
        for name in sorted(filenames):
            if name.startswith('.') or not name.endswith('.txt'):
                continue
            path = os.path.join(dirpath, name)
            # undecodable bytes must not abort the scan of the other files
            with io.open(path, encoding='utf-8', errors='replace') as fh:
                for line in fh:
                    ident = first_arxiv_id(line)
                    if ident is not None and ident not in seen:
                        seen.add(ident)
                        print(ident)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('usage: getarxivid.py <directory>')
    print_arxiv_ids(sys.argv[1])

## tex2md.py
import sys
import os.path
import glob

def getText(l):
    # Return `l` with every literal '\citep' and '\cite' removed.
    # Only the command name is stripped; a following '{key}' argument stays.
    # '\citep' has to be handled first: stripping '\cite' first would leave
    # a stray 'p' behind.
    return l.replace('\\citep', '').replace('\\cite', '')

def checkFig(l):
    # Print a Markdown image link for the first \includegraphics found on
    # line `l`; print nothing when the line has none or no file matches.
    #
    # The file name is the text between the first '{' after the command and
    # the next '}'.  A name ending in .pdf is converted to .png with the
    # external `convert` program first.  When the name does not exist as
    # given, the first file (in sorted order) whose name starts with it is
    # used instead.
    import subprocess

    i = l.find('\\includegraph')
    if i < 0:
        return
    j = l.find('{', i)
    if j < 0:
        return
    k = l.find('}', j)
    if k < 0:
        return

    ff = l[j+1:k]
    if ff.lower().endswith(".pdf"):
        png = ff[0:-4] + ".png"
        try:
            # argument list instead of a shell string: names with blanks or
            # shell metacharacters reach `convert` unchanged
            subprocess.call(['convert', ff, png])
        except OSError:
            # `convert` cannot be started: stay best-effort, the lookup
            # below then simply finds no picture
            pass
        ff = png

    if os.path.exists(ff):
        print("\n![img]("+ff+")\n")
        return

    ffs = sorted(glob.glob(ff+'*'))
    if len(ffs) > 1:
        # stdout is the Markdown document, so the warning goes to stderr
        sys.stderr.write("WARNING ambiguous pic " + repr(ffs) + "\n")
    if len(ffs) >= 1:
        print("\n![img]("+ffs[0]+")\n")

def _braceArg(l):
    # Return the text of the first {...} group on `l`, honouring nested
    # braces; '' when there is no '{', the rest of the line when the group
    # is not closed on this line.
    start = l.find('{')
    if start < 0:
        return ''
    depth = 0
    for pos in range(start, len(l)):
        c = l[pos]
        if c == '{':
            depth += 1
        elif c == '}':
            depth -= 1
            if depth == 0:
                return l[start+1:pos]
    return l[start+1:]


def _inputTarget(name):
    # Resolve the argument of \input: "<name>.tex" is tried first, then
    # "<name>" itself.  Returns None when neither is a file.
    for cand in (name + '.tex', name):
        if os.path.isfile(cand):
            return cand
    return None


def texfile(tfich):
    # Print a rough Markdown rendering of the LaTeX file `tfich` on stdout.
    #
    # - \section / \subsection / \subsubsection become #, ## and ### headings
    # - \input{...} files are rendered in place (paths relative to the cwd)
    # - \includegraphics lines become image links (see checkFig)
    # - every other line starting with '\' or '%' is dropped
    # - lines inside an environment whose name starts with "eq" are printed
    # - all remaining lines are printed unchanged
    # Runs of blank or dropped lines are collapsed into one blank line.
    with open(tfich,'rb') as f : data = f.read()
    if not isinstance(data, str):
        # Python 3 returns bytes here; the string tests below need text
        data = data.decode('utf-8', 'replace')
    ls = data.split('\n')
    if ls and ls[-1] == '':
        # the text after the final newline is not a line of its own
        ls.pop()

    empty=False   # True once a blank line has been printed for the current gap
    eq=False      # True between \begin{eq... and \end{eq...
    for l in ls:
        l=l.strip()
        if len(l)==0:
            if not empty:
                empty=True
                print("")
            continue
        if l[0]=='\\':
            if eq:
                if l.startswith('\\end{eq'): eq=False
                else: print(l)
            if l.startswith('\\section'):
                print('\n# '+getText(_braceArg(l))+'\n')
            elif l.startswith('\\subsection'):
                print('\n## '+getText(_braceArg(l))+'\n')
            elif l.startswith('\\subsubsection'):
                print('\n### '+getText(_braceArg(l))+'\n')
            elif l.startswith('\\begin{eq'): eq=True
            elif l.startswith('\\input{'):
                i=l.find('{')
                if i>=0:
                    j=l.find('}',i)
                    if j>=0:
                        target=_inputTarget(l[i+1:j])
                        if target is None:
                            # a missing include must not abort the whole paper
                            sys.stderr.write("WARNING cannot find input file "+l[i+1:j]+"\n")
                        else:
                            texfile(target)
                        continue
            # drop every line that starts with \
            checkFig(l)
            if not empty:
                empty=True
                print("")
        elif l[0]=='%':
            # drop every line that starts with %
            if not empty:
                empty=True
                print("")
        else:
            print(l)
            empty=False

# Command-line entry point: tex2md.py <main .tex file>
# The guard keeps the conversion from running when the file is imported.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('usage: tex2md.py <main .tex file>')
    texfile(sys.argv[1])
	#!/bin/bash
	#
	# Arxiv2Kindle.sh - build a .mobi e-book for every arXiv paper referenced in
	# a directory of text files.
	#
	# Usage: Arxiv2Kindle.sh <directory>
	#
	# For each arXiv ID printed by getarxivid.py:
	#   1. ./arxiv2md downloads and unpacks the LaTeX source in /tmp/tmp-arxiv2md
	#   2. tex2md.py converts the main .tex file to Markdown
	#   3. pandoc converts the Markdown to EPUB
	#   4. ebook-convert converts the EPUB to papers/<id>.mobi
	#
	# getarxivid.py, arxiv2md and tex2md.py are referenced with relative paths,
	# so the script has to be started from the directory that contains them.

	if [ "$#" -lt 1 ]; then
	    echo "Usage: $0 <directory>" >&2
	    exit 1
	fi

	# arxiv paper ID
	python2 getarxivid.py "$1" > arxivids || exit 1

	# -p: an existing papers/ directory from an earlier run is not an error
	mkdir -p papers || exit 1

	# tex2md.py is run below as ../tex2md.py from inside /tmp/tmp-arxiv2md;
	# copying it once is enough, it does not change between papers
	cp tex2md.py /tmp/ || exit 1

	# The ID list is read on file descriptor 3 so that the commands in the loop
	# body cannot swallow it through their standard input.
	while IFS= read -r i <&3
	do
	    [ -n "$i" ] || continue

	    # download latex
	    if ! ./arxiv2md "$i" > /tmp/aa.log; then
	        echo "$0: arxiv2md failed for $i, skipping" >&2
	        continue
	    fi
	    # The log line reads "<prog>:Found LaTeX source: <file>": keep what
	    # follows the second ':' and drop the blank in front of the file name.
	    # (The pipe must be a plain '|'; an escaped one is passed to grep as a
	    # file name.)
	    ma=$(grep 'Found LaTeX source: ' /tmp/aa.log | cut -d':' -f3- | sed 's/^ *//')
	    if [ -z "$ma" ]; then
	        echo "$0: no LaTeX source reported for $i, skipping" >&2
	        continue
	    fi
	    echo "$ma"

	    # convert to markdown, then to epub; the subshell confines the cd, and a
	    # failure skips the paper instead of converting a stale /tmp/aa.epub
	    if ! (
	        cd /tmp/tmp-arxiv2md || exit 1
	        python2 ../tex2md.py "$ma" > aa.md || exit 1
	        pandoc -f markdown -t epub -o ../aa.epub aa.md
	    ); then
	        echo "$0: conversion to epub failed for $i, skipping" >&2
	        continue
	    fi

	    # convert to mobi
	    ebook-convert /tmp/aa.epub "papers/$i.mobi"
	done 3< arxivids
	#!/usr/bin/perl
	# Download the source of one arXiv paper into /tmp/tmp-<program name>, unpack
	# it, locate the main LaTeX file and report it together with the title and
	# the authors taken from the abstract page.
	#
	# The line "<prog>:Found LaTeX source: <file>" printed on STDOUT is parsed by
	# Arxiv2Kindle.sh, so its wording must not change.
	use Getopt::Std;
	use Switch 'Perl6'; use Sys::Hostname; use Time::HiRes qw(usleep);#usleep($microseconds)
	use File::Basename; use File::Copy; use File::Find; use Cwd;use File::Path;
	# --------------------- OPTIONS --------------------------------
	$cdir = cwd;chomp($cdir);
	$prog = basename($0);
	getopt('Tt', \%opts); #here we add only options with args
	foreach $opt (keys %opts){
	 given($opt){
	  when 'h' {usage();last;}
	  default {usage("Not a valid option: $opt");}
	 }
	}
	# -------------------------------------------------------------------
	if($#ARGV < 0){usage();}
	$arxiv_url = "http://arxiv.org/e-print/"; # url where source of papers is
	$arxiv_abs = "http://arxiv.org/abs/"; # url where abstract is
	$arxiv_no = $ARGV[0];

	$arxiv_url .= $arxiv_no;
	# $arxiv_abs already ends with '/', so no extra separator is inserted here
	$meta_url = "${arxiv_abs}${arxiv_no}";# html file with meta information (title/authors)
	# in case the paper number is in the arxiv/000000 format, then redefine $arxiv_no:
	$arxiv_ar = "$arxiv_no";
	# both halves need '.*?': a bare '.?' only accepts one-character parts
	if( $arxiv_no =~ /(.*?)\/(.*?)$/){ $arxiv_ar = $1; $arxiv_no = $2;}
	print "${prog}:arxiv_ar = $arxiv_ar ; arxiv_no = $arxiv_no\n";
	$mobi = "${arxiv_no}.mobi"; # final output file!
	#--------------------------------------------------------------------
	# Start from an empty working directory on every run.
	$tmpdir = "/tmp/tmp-${prog}";
	if( -d $tmpdir){
	 print "${prog}>rm -rf $tmpdir\n";
	 rmtree($tmpdir);
	}
	mkdir($tmpdir) or die "${prog}:cannot create $tmpdir: $!\n";
	chdir($tmpdir) or die "${prog}:cannot enter $tmpdir: $!\n";
	# External commands are run in list form: no shell is involved, so nothing in
	# the paper number can be interpreted as shell syntax.
	print "${prog}>wget --user-agent=Mozilla/1.2 $arxiv_url\n";
	system("wget", "--user-agent=Mozilla/1.2", $arxiv_url) == 0
	  or die "${prog}:download of $arxiv_url failed\n";
	print "${prog}>tar xvf $arxiv_no\n";
	# Not fatal: the search for a .tex file below decides whether to go on.
	system("tar", "xvf", $arxiv_no) == 0
	  or warn "${prog}:tar could not unpack $arxiv_no\n";
	$meta_source = "${arxiv_no}.html"; # html file with meta information (title/authors)
	print "${prog}>wget --user-agent=Mozilla/1.2 -O $meta_source $meta_url\n";
	# Not fatal: only the title and the authors depend on this page.
	system("wget", "--user-agent=Mozilla/1.2", "-O", $meta_source, $meta_url) == 0
	  or warn "${prog}:download of $meta_url failed\n";
	# The main file is the first *.tex that declares a document class or style.
	# The files are read directly rather than handed to an external grep, as
	# their names come from the downloaded archive.
	$latex_source = "";
	foreach $f (<*.tex>){
	 open(TEX, "<", $f) or next;
	 $results = grep { /\\document(?:class|style)/ } <TEX>;
	 close(TEX);
	 if($results){$latex_source = $f;last;}
	}
	if( $latex_source){
	 print "${prog}:Found LaTeX source: $latex_source\n";
	} else{
	 die "${prog}:LaTeX source not found!";
	}
	# Determine title and authors:
	$title = "[${arxiv_no}] "; $authors = "";
	if(open(META, "<", $meta_source)){
	 while(<META>){
	  # $1 is only used once the content= match itself has succeeded
	  if(/citation_title/  && /content=\"(.*?)\"/){ $title .= $1;}
	  if(/citation_author/ && /content=\"(.*?)\"/){ $authors .= "$1&";}
	 }
	 close(META);
	} else{
	 warn "${prog}:cannot read $meta_source: $!\n";
	}
	$authors =~ s/&$//; #remove trailing &
	print "${prog}:title= $title\n${prog}:authors= $authors\n";

	print "--------------------------------------------------------\n";
	print "${prog}:Temporary files can be found in $tmpdir\n";
	print "${prog}:Title: $title\n";
	print "${prog}:Authors: $authors\n";
	print "${prog}:arXiv: $ARGV[0]\n";
	# ----------------------- HELP MESSAGE ------------------------------
	# Print an optional error message (first argument) followed by the usage
	# text on STDERR, then terminate the program with exit status 1.
	# Declared without a prototype because it is called both with and without
	# an argument.  The here-document and its terminator start in the first
	# column: an indented terminator would not end the text.
	sub usage{
	 my $message = "";
	 if(@_){$message = shift(@_)."\n";}
	 $message .= <<"EOF";
Usage:   ${prog} <arxiv number>
    e.g. ${prog} 1010.0957
         ${prog} hep-lat/0402031

Uses htlatex and ebook-convert  (assumes TeX4ht and Calibre exist on your system)
On Ubuntu: sudo apt-get  install texlive-full calibre
K. N. Anagnostopoulos, NTUA 2013
EOF
	 print STDERR $message;
	 exit(1);
	}
	sub main::HELP_MESSAGE(){ usage();} #for --help (does not work when default?)
	# -------------------------------------------------------------------
	# Print the arXiv paper IDs found in the .txt files under the directory given as first argument

	import glob
	import sys
	import re

	# New-style arXiv identifier: YYMM.NNNN or YYMM.NNNNN.  The dot is escaped so
	# that only a literal '.' separates the two halves, and the look-arounds
	# reject candidates embedded in a longer run of digits.
	ARXIV_ID = re.compile(r'(?<![0-9])[0-9]{4}\.[0-9]{4,5}(?![0-9])')


	def first_arxiv_id(line):
	    """Return the first arXiv identifier found in ``line``, or None."""
	    m = ARXIV_ID.search(line)
	    if m is None:
	        return None
	    return m.group(0)


	def print_arxiv_ids(root):
	    """Print the arXiv identifiers found in the .txt files below ``root``.

	    Every directory level under ``root`` is searched.  At most one identifier
	    (the first) is taken from each line, and an identifier that was already
	    printed is not printed again.
	    """
	    import io
	    import os

	    seen = set()
	    for dirpath, dirnames, filenames in os.walk(root):
	        # like a glob '*', neither descend into nor read hidden entries;
	        # sorting makes the output order reproducible
	        dirnames[:] = sorted(d for d in dirnames if not d.startswith('.'))
	        for name in sorted(filenames):
	            if name.startswith('.') or not name.endswith('.txt'):
	                continue
	            path = os.path.join(dirpath, name)
	            # undecodable bytes must not abort the scan of the other files
	            with io.open(path, encoding='utf-8', errors='replace') as fh:
	                for line in fh:
	                    ident = first_arxiv_id(line)
	                    if ident is not None and ident not in seen:
	                        seen.add(ident)
	                        print(ident)


	if __name__ == '__main__':
	    if len(sys.argv) < 2:
	        sys.exit('usage: getarxivid.py <directory>')
	    print_arxiv_ids(sys.argv[1])
	import sys
	import os.path
	import glob

	def getText(l):
	    # Return `l` with every literal '\citep' and '\cite' removed.
	    # Only the command name is stripped; a following '{key}' argument stays.
	    # '\citep' has to be handled first: stripping '\cite' first would leave
	    # a stray 'p' behind.
	    return l.replace('\\citep', '').replace('\\cite', '')

	def checkFig(l):
	    # Print a Markdown image link for the first \includegraphics found on
	    # line `l`; print nothing when the line has none or no file matches.
	    #
	    # The file name is the text between the first '{' after the command and
	    # the next '}'.  A name ending in .pdf is converted to .png with the
	    # external `convert` program first.  When the name does not exist as
	    # given, the first file (in sorted order) whose name starts with it is
	    # used instead.
	    import subprocess

	    i = l.find('\\includegraph')
	    if i < 0:
	        return
	    j = l.find('{', i)
	    if j < 0:
	        return
	    k = l.find('}', j)
	    if k < 0:
	        return

	    ff = l[j+1:k]
	    if ff.lower().endswith(".pdf"):
	        png = ff[0:-4] + ".png"
	        try:
	            # argument list instead of a shell string: names with blanks or
	            # shell metacharacters reach `convert` unchanged
	            subprocess.call(['convert', ff, png])
	        except OSError:
	            # `convert` cannot be started: stay best-effort, the lookup
	            # below then simply finds no picture
	            pass
	        ff = png

	    if os.path.exists(ff):
	        print("\n![img]("+ff+")\n")
	        return

	    ffs = sorted(glob.glob(ff+'*'))
	    if len(ffs) > 1:
	        # stdout is the Markdown document, so the warning goes to stderr
	        sys.stderr.write("WARNING ambiguous pic " + repr(ffs) + "\n")
	    if len(ffs) >= 1:
	        print("\n![img]("+ffs[0]+")\n")

	def _braceArg(l):
	    # Return the text of the first {...} group on `l`, honouring nested
	    # braces; '' when there is no '{', the rest of the line when the group
	    # is not closed on this line.
	    start = l.find('{')
	    if start < 0:
	        return ''
	    depth = 0
	    for pos in range(start, len(l)):
	        c = l[pos]
	        if c == '{':
	            depth += 1
	        elif c == '}':
	            depth -= 1
	            if depth == 0:
	                return l[start+1:pos]
	    return l[start+1:]


	def _inputTarget(name):
	    # Resolve the argument of \input: "<name>.tex" is tried first, then
	    # "<name>" itself.  Returns None when neither is a file.
	    for cand in (name + '.tex', name):
	        if os.path.isfile(cand):
	            return cand
	    return None


	def texfile(tfich):
	    # Print a rough Markdown rendering of the LaTeX file `tfich` on stdout.
	    #
	    # - \section / \subsection / \subsubsection become #, ## and ### headings
	    # - \input{...} files are rendered in place (paths relative to the cwd)
	    # - \includegraphics lines become image links (see checkFig)
	    # - every other line starting with '\' or '%' is dropped
	    # - lines inside an environment whose name starts with "eq" are printed
	    # - all remaining lines are printed unchanged
	    # Runs of blank or dropped lines are collapsed into one blank line.
	    with open(tfich,'rb') as f : data = f.read()
	    if not isinstance(data, str):
	        # Python 3 returns bytes here; the string tests below need text
	        data = data.decode('utf-8', 'replace')
	    ls = data.split('\n')
	    if ls and ls[-1] == '':
	        # the text after the final newline is not a line of its own
	        ls.pop()

	    empty=False   # True once a blank line has been printed for the current gap
	    eq=False      # True between \begin{eq... and \end{eq...
	    for l in ls:
	        l=l.strip()
	        if len(l)==0:
	            if not empty:
	                empty=True
	                print("")
	            continue
	        if l[0]=='\\':
	            if eq:
	                if l.startswith('\\end{eq'): eq=False
	                else: print(l)
	            if l.startswith('\\section'):
	                print('\n# '+getText(_braceArg(l))+'\n')
	            elif l.startswith('\\subsection'):
	                print('\n## '+getText(_braceArg(l))+'\n')
	            elif l.startswith('\\subsubsection'):
	                print('\n### '+getText(_braceArg(l))+'\n')
	            elif l.startswith('\\begin{eq'): eq=True
	            elif l.startswith('\\input{'):
	                i=l.find('{')
	                if i>=0:
	                    j=l.find('}',i)
	                    if j>=0:
	                        target=_inputTarget(l[i+1:j])
	                        if target is None:
	                            # a missing include must not abort the whole paper
	                            sys.stderr.write("WARNING cannot find input file "+l[i+1:j]+"\n")
	                        else:
	                            texfile(target)
	                        continue
	            # drop every line that starts with \
	            checkFig(l)
	            if not empty:
	                empty=True
	                print("")
	        elif l[0]=='%':
	            # drop every line that starts with %
	            if not empty:
	                empty=True
	                print("")
	        else:
	            print(l)
	            empty=False

	# Command-line entry point: tex2md.py <main .tex file>
	# The guard keeps the conversion from running when the file is imported.
	if __name__ == '__main__':
	    if len(sys.argv) < 2:
	        sys.exit('usage: tex2md.py <main .tex file>')
	    texfile(sys.argv[1])