Last active
October 25, 2016 19:28
-
-
Save cerisara/32b96789fb854f56adaabaa70b0c5667 to your computer and use it in GitHub Desktop.
Arxiv2Kindle
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Convert every arXiv paper referenced below directory $1 into a Kindle file.
#
# Pipeline, per paper ID printed by getarxivid.py:
#   1. ./arxiv2md downloads the LaTeX source into /tmp/tmp-arxiv2md and logs
#      the name of the main .tex file ("Found LaTeX source: <file>").
#   2. tex2md.py turns that .tex file into markdown.
#   3. pandoc turns the markdown into an epub.
#   4. ebook-convert turns the epub into papers/<id>.mobi.

# arxiv paper ID
python2 getarxivid.py "$1" > arxivids

# -p: do not fail when the directory is left over from a previous run
mkdir -p papers

# tex2md.py is called as ../tex2md.py from inside /tmp/tmp-arxiv2md, so it
# has to live in /tmp; copying it once is enough for all papers.
cp tex2md.py /tmp/

for i in $(cat arxivids)
do
    # download latex
    ./arxiv2md "$i" > /tmp/aa.log
    # The log line is "<prog>:Found LaTeX source: <file>"; field 3 keeps a
    # leading blank, which sed removes so the name can be quoted safely.
    ma=$(grep 'Found LaTeX source: ' /tmp/aa.log | cut -d':' -f3- | sed 's/^[[:space:]]*//')
    if [ -z "$ma" ]; then
        echo "No LaTeX source found for $i, skipping" >&2
        continue
    fi
    echo "$ma"

    # Remove the previous paper's epub so a failed conversion cannot be
    # published under the current ID.
    rm -f /tmp/aa.epub

    # convert to markdown
    pushd /tmp/tmp-arxiv2md || continue
    python2 ../tex2md.py "$ma" > aa.md
    # convert to epub
    pandoc -f markdown -t epub -o ../aa.epub aa.md
    popd

    # convert to mobi
    if [ -f /tmp/aa.epub ]; then
        ebook-convert /tmp/aa.epub "papers/$i.mobi"
    else
        echo "epub conversion failed for $i, skipping" >&2
    fi
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use Getopt::Std; | |
use Switch 'Perl6'; use Sys::Hostname; use Time::HiRes qw(usleep);#usleep($microseconds) | |
use File::Basename; use File::Copy; use File::Find; use Cwd;use File::Path; | |
# --------------------- OPTIONS -------------------------------- | |
$cdir = cwd; chomp($cdir);   # directory the script was started from
$prog = basename($0);        # script name, used as prefix of every message
getopt('Tt', \%opts);        # here we add only options with args
# Every parsed option ends in usage(), which exits: -h prints the plain usage
# text, any other option prints it together with an error line.
foreach $opt (keys %opts){
    if($opt eq 'h'){ usage(); }
    else           { usage("Not a valid option: $opt"); }
}
# -------------------------------------------------------------------
if($#ARGV < 0){usage();}
$arxiv_url  = "http://arxiv.org/e-print/"; # url where source of papers is
$arxiv_abs  = "http://arxiv.org/abs/";     # url where abstract is
$arxiv_no   = $ARGV[0];
$arxiv_url .= $arxiv_no;
# html file with meta information (title/authors); $arxiv_abs already ends
# with a slash, so none is inserted here.
$meta_url   = "${arxiv_abs}${arxiv_no}";
# in case the paper number is in the arxiv/000000 format, then redefine $arxiv_no:
$arxiv_ar   = "$arxiv_no";
if( $arxiv_no =~ /(.*?)\/(.*?)$/){ $arxiv_ar = $1; $arxiv_no = $2;}
print "${prog}:arxiv_ar = $arxiv_ar ; arxiv_no = $arxiv_no\n";
$mobi       = "${arxiv_no}.mobi"; # final output file!
#--------------------------------------------------------------------
# Work in a fresh scratch directory named after the script.
$tmpdir = "/tmp/tmp-${prog}";
if( -d $tmpdir){
    print "${prog}>rm -rf $tmpdir\n";
    # List form of system(): arguments are passed to the command directly,
    # no shell ever parses them.
    system("rm", "-rf", $tmpdir);
}
mkdir $tmpdir or die "${prog}:cannot create $tmpdir: $!\n";
chdir $tmpdir or die "${prog}:cannot enter $tmpdir: $!\n";

# Download the e-print; wget stores it under the last URL component, which
# equals $arxiv_no after the split above.
print "${prog}>wget --user-agent=Mozilla/1.2 $arxiv_url\n";
system("wget", "--user-agent=Mozilla/1.2", $arxiv_url) == 0
    or die "${prog}:could not download $arxiv_url\n";
print "${prog}>tar xvf $arxiv_no\n";
system("tar", "xvf", $arxiv_no) == 0
    or warn "${prog}:tar could not unpack $arxiv_no\n";

$meta_source = "${arxiv_no}.html"; # html file with meta information (title/authors)
print "${prog}>wget --user-agent=Mozilla/1.2 -O $meta_source $meta_url\n";
system("wget", "--user-agent=Mozilla/1.2", "-O", $meta_source, $meta_url) == 0
    or warn "${prog}:could not download $meta_url\n";

# The main LaTeX file is the first *.tex file (in glob order) that contains
# an uncommented \documentclass or \documentstyle.
$latex_source = "";
foreach $f (<*.tex>){
    open(my $tex, '<', $f) or next;
    my $is_main = grep { /^[^%]*\\document(?:class|style)/ } <$tex>;
    close($tex);
    if($is_main){ $latex_source = $f; last; }
}
if( $latex_source){
    # NOTE: the calling shell script greps for this exact message.
    print "${prog}:Found LaTeX source: $latex_source\n";
} else{
    die "${prog}:LaTeX source not found!";
}

# Determine title and authors from the citation_* meta tags of the abstract
# page. The capture is only used when the content="..." match succeeded.
$title = "[${arxiv_no}] "; $authors = "";
if(open(my $meta, '<', $meta_source)){
    while(<$meta>){
        if(/citation_title/  && /content=\"(.*?)\"/){ $title   .= $1;    }
        if(/citation_author/ && /content=\"(.*?)\"/){ $authors .= "$1&"; }
    }
    close($meta);
} else {
    warn "${prog}:cannot read $meta_source: $!\n";
}
$authors =~ s/&$//; #remove trailing &
print "${prog}:title= $title\n${prog}:authors= $authors\n";
print "--------------------------------------------------------\n";
print "${prog}:Temporary files can be found in $tmpdir\n";
print "${prog}:Title:   $title\n";
print "${prog}:Authors: $authors\n";
print "${prog}:arXiv:   $ARGV[0]\n";
# ----------------------- HELP MESSAGE ------------------------------ | |
# Print the usage text on STDERR and exit with status 1.
# An optional first argument is an error message printed before the text.
# No prototype: the sub is called both with and without an argument.
sub usage {
    my $message = @_ ? shift(@_)."\n" : "";
    $message .= <<"EOF";
Usage: ${prog} <arxiv number>
       e.g. ${prog} 1010.0957
            ${prog} hep-lat/0402031

Uses htlatex and ebook-convert (assumes TeX4ht and Calibre exist on your system)
On Ubuntu: sudo apt-get install texlive-full calibre
K. N. Anagnostopoulos, NTUA 2013
EOF
    print STDERR $message;
    exit(1);
}
# Handler for --help: simply shows the usage text.
# (Original author's note: does not work when default?)
sub main::HELP_MESSAGE() {
    usage();
}
# ------------------------------------------------------------------- |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# get the paper IDs from a text file | |
import glob | |
import sys | |
import re | |
# New-style arXiv identifier: YYMM.NNNN (4 digits) or YYMM.NNNNN (5 digits).
# The dot is escaped so it only matches a literal '.', and the lookarounds
# keep the match from starting or ending inside a longer run of digits.
ARXIV_ID_RE = re.compile(r'(?<!\d)\d{4}\.\d{4,5}(?!\d)')


def find_arxiv_id(line):
    """Return the first new-style arXiv identifier found in *line*, or None."""
    m = ARXIV_ID_RE.search(line)
    if m is None:
        return None
    return m.group(0)


def main(root):
    """Print the first arXiv ID of each line of the text files below *root*.

    Files are selected with the glob pattern ``<root>/**/*.txt``. Without
    glob's recursive mode, ``**`` behaves like ``*``, so only ``.txt`` files
    exactly one directory below *root* are read. Files are processed in
    sorted order so the output is reproducible.
    """
    for path in sorted(glob.glob(root + '/**/*.txt')):
        with open(path, 'r') as fh:
            for line in fh:
                arxiv_id = find_arxiv_id(line)
                if arxiv_id is not None:
                    print(arxiv_id)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.stderr.write('Usage: getarxivid.py <directory>\n')
        sys.exit(1)
    main(sys.argv[1])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os.path | |
import glob | |
def getText(l):
    """Return *l* with every ``\\citep`` and ``\\cite`` command name removed.

    Only the command names are deleted; whatever follows them (for example
    the ``{key}`` argument) is left in place. ``\\citep`` is handled first so
    that no stray ``p`` is left behind by the shorter ``\\cite``.
    """
    for command in ('\\citep', '\\cite'):
        l = l.replace(command, '')
    return l
def checkFig(l):
    """Print a markdown image link for the figure included on line *l*.

    The figure path is the text between the first ``{`` and the next ``}``
    after ``\\includegraph``. Lines without such a command, or with an empty
    path, print nothing.

    * A ``.pdf`` figure is first converted to ``.png`` with the external
      ``convert`` program, and the link points at the ``.png`` name.
    * If the resulting path exists, it is linked as is.
    * Otherwise the first file (in sorted order) whose name starts with the
      path is linked; when several match, a warning goes to stderr so that it
      does not end up in the generated markdown.
    """
    start = l.find('\\includegraph')
    if start < 0:
        return
    lbrace = l.find('{', start)
    if lbrace < 0:
        return
    rbrace = l.find('}', lbrace)
    if rbrace < 0:
        return
    ff = l[lbrace + 1:rbrace]
    if not ff:
        # An empty path would make the glob below match every file.
        return

    if ff.lower().endswith(".pdf"):
        import subprocess
        png = ff[0:-4] + ".png"
        # Argument list instead of a shell string: the file name comes from
        # a downloaded paper and must never be interpreted by a shell.
        try:
            subprocess.call(['convert', ff, png])
        except OSError:
            # 'convert' is not installed; stay best-effort like before.
            pass
        ff = png

    if os.path.exists(ff):
        print("\n![img](" + ff + ")\n")
        return

    ffs = sorted(glob.glob(ff + '*'))
    if len(ffs) > 1:
        sys.stderr.write("WARNING ambiguous pic " + repr(ffs) + "\n")
    if ffs:
        print("\n![img](" + ffs[0] + ")\n")
def _braced_arg(line):
    """Return the content of the first balanced ``{...}`` group in *line*, or None."""
    start = line.find('{')
    if start < 0:
        return None
    depth = 0
    for pos in range(start, len(line)):
        ch = line[pos]
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return line[start + 1:pos]
    return None


def _heading(line, offset):
    """Return the cleaned title of a sectioning command found on *line*.

    The title is the first balanced brace group, so ``\\section*{T}`` and
    ``\\section{T}\\label{x}`` both give ``T``. When no balanced group is
    found, the fixed-offset slice ``line[offset:-1]`` is used instead.
    """
    title = _braced_arg(line)
    if title is None:
        title = line[offset:-1]
    return getText(title)


def texfile(tfich):
    """Print a rough markdown rendering of the LaTeX file *tfich* on stdout.

    Each line is stripped, then handled by its first character:

    * blank lines: runs of blank or dropped lines give a single empty line;
    * lines starting with a backslash: ``\\section``, ``\\subsection`` and
      ``\\subsubsection`` become ``#``, ``##`` and ``###`` headings;
      ``\\begin{eq``/``\\end{eq`` switch equation mode on and off (inside an
      equation the command line is printed as is); ``\\input{name}`` is
      processed recursively; every other command line is dropped, after
      ``checkFig`` has had a chance to emit an image link for it;
    * lines starting with ``%`` (comments) are dropped;
    * any other line is printed unchanged.

    The file is read as bytes; where bytes are not ``str`` (Python 3) each
    line is decoded as UTF-8 with replacement characters, so the string
    comparisons below work on both Python 2 and Python 3.
    """
    with open(tfich, 'rb') as f:
        ls = f.readlines()
    empty = False   # True while the last thing emitted was a blank separator
    eq = False      # True while inside an equation-like environment
    for l in ls:
        if not isinstance(l, str):
            l = l.decode('utf-8', 'replace')
        l = l.strip()
        if len(l) == 0:
            if not empty:
                empty = True
                print("")
            continue
        if l[0] == '\\':
            if eq:
                if l.startswith('\\end{eq'):
                    eq = False
                else:
                    print(l)
            if l.startswith('\\section'):
                print('\n# ' + _heading(l, 9) + '\n')
            elif l.startswith('\\subsection'):
                print('\n## ' + _heading(l, 12) + '\n')
            elif l.startswith('\\subsubsection'):
                print('\n### ' + _heading(l, 15) + '\n')
            elif l.startswith('\\begin{eq'):
                eq = True
            elif l.startswith('\\input{'):
                name = _braced_arg(l)
                if name:
                    # Append '.tex' only when the name does not already
                    # carry it, then fall back to the name as written.
                    if name.endswith('.tex'):
                        candidates = [name]
                    else:
                        candidates = [name + '.tex', name]
                    for candidate in candidates:
                        if os.path.isfile(candidate):
                            texfile(candidate)
                            break
                    else:
                        sys.stderr.write(
                            "WARNING cannot find input file " + name + "\n")
                continue
            # drop every line that starts with a backslash
            checkFig(l)
            if not empty:
                empty = True
                print("")
        elif l[0] == '%':
            # drop every line that starts with % (only a blank separator is
            # emitted, so the comment text never reaches the markdown)
            if not empty:
                empty = True
                print("")
        else:
            print(l)
            empty = False
# Script entry point: convert the LaTeX file named on the command line.
# The guard keeps the conversion from running when the module is imported.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.stderr.write('Usage: tex2md.py <file.tex>\n')
        sys.exit(1)
    texfile(sys.argv[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment