Skip to content

Instantly share code, notes, and snippets.

@cryptospectrum
Last active February 1, 2017 06:57
Show Gist options
  • Save cryptospectrum/5996164 to your computer and use it in GitHub Desktop.
Save cryptospectrum/5996164 to your computer and use it in GitHub Desktop.
Scraping Latex Equations from Wikipedia Pages
import sys
from xml.sax.saxutils import escape

import requests
# requests is a third party module, more robust that urllib...
#http://docs.python-requests.org/en/latest/
from lxml import html
# Resolve the page to scrape (url_name) and the report file (html_file)
# from the command line:
#   script.py                 -> built-in sample page, test report file
#   script.py URL             -> URL, default report file
#   script.py URL FILE [...]  -> URL and FILE (further arguments are ignored)
#
# Other sample pages that have been used:
#   https://en.wikipedia.org/wiki/Fa%C3%A0_di_Bruno%27s_formula
#   https://secure.wikimedia.org/wikipedia/en/wiki/Euler%E2%80%93Maclaurin_formula
#   https://en.wikipedia.org/wiki/Generalized_normal_distribution
#   https://en.wikipedia.org/wiki/Generalised_logistic_function
# Other report names that have been used:
#   wiki_strip_F2.html, wiki_strip_logisitcmap.html
argc = len(sys.argv)
if argc == 1:
    url_name = "https://en.wikipedia.org/wiki/Jacobi_elliptic_functions"
    html_file = "wiki_strip_F_test.html"
else:
    url_name = sys.argv[1]
    html_file = 'wiki_strip_F.html' if argc == 2 else sys.argv[2]
#__HTML__________________________________________________________________
# Static markup written before the scraped content: page title, a small
# canvas logo, and the opening of the "Latex Tags" table.  The template ends
# inside an open <tr><td>, which html_footer closes.
# Double quotes need no escaping inside a triple-quoted literal, so they are
# written plainly here; the resulting string is unchanged.
html_header = """
<html><head>
<title>Wiki Latex Scraper</title>
<style type="text/css">
body
{background-color:#E7E7E7;}
h1{color:orange;text-align:center;}
p{font-family:"verdana";font-size:20px;}
</style>
<body>
<table border="0" WIDTH="100%"><tr><td><b><font size="8" face="verdana" color="FF0000">
Wiki Latex Scraper<br>
<canvas id="myCanvas2" width="125" height="125" style="border:1px solid #c3c3c3;">
Your browser does not support the canvas element.</canvas>
<script type="text/javascript">
var c=document.getElementById("myCanvas2");
var cxt=c.getContext("2d");
var grd=cxt.createRadialGradient(0,0,1,50,50,90);
grd.addColorStop(0.3,"#FF0000");
grd.addColorStop(0.9,"#000000");
cxt.fillStyle=grd;
cxt.fillRect(0,0,125,125);
cxt.font = "bold 30px sans-serif";
cxt.fillText("J", 110, 11);
</script><br></font>
<font size="6" face="verdana" color="FF0000">
Scraping with Python</font>
<br><DIV ALIGN=LEFT>
<!--<canvas id="myCanvas2" width="100" height="100" style="border:1px solid #c3c3c3;">
Your browser does not support the canvas element.
</canvas>
<script type="text/javascript">
var c=document.getElementById("myCanvas2");
var cxt=c.getContext("2d");
var grd=cxt.createRadialGradient(0,0,1,75,50,20);
grd.addColorStop(0,"#FF0000");
grd.addColorStop(1,"#000000");
cxt.fillStyle=grd;
cxt.fillRect(0,0,475,400);
cxt.font = "bold 30px sans-serif";
cxt.fillText("J", 50, 50);
</script>--></DIV></td></tr> </table>
<!-------------------------------------------------------------->
<table border="0" WIDTH="80%" ALIGN=CENTER>
<tr><td colspan=2><br></b></td></tr></table>
<table border="1" ALIGN=CENTER WIDTH="80%" BORDERCOLOR=#000000 BORDERCOLORLIGHT=#33CCFF BORDERCOLORDARK=#0000CC>
<tr BGCOLOR=#FF0000>
<td><b><font face="verdana" color="black" size="4.5">Latex Tags</font></b></td></tr>
<tr><td>
"""
# Closes the table cell/row, table, body and document opened by html_header.
html_footer = "</td></tr></table></body></html>"
#_________________________________________________________________
# Download the page.  requests applies no timeout unless one is given, so
# without it an unresponsive server would hang the script indefinitely.
htmlRaw = requests.get(url_name, timeout=30)

# Diagnostics about the response (single-argument print() behaves the same
# under Python 2 and Python 3).
print(htmlRaw.status_code)
print(htmlRaw.headers.get('content-type'))  # None when the header is absent
print(htmlRaw.encoding)

# Stop on a 4xx/5xx answer instead of scraping the server's error page.
htmlRaw.raise_for_status()

# Build the element tree from the undecoded response body.
tree = html.fromstring(htmlRaw.content)

# Future: ?Check if html already exists

# Text nodes pulled from the page; each xpath() call returns a list of strings.
titletext = tree.xpath('//head//title/text()')
print(titletext)
refbooktext = tree.xpath('//span[@class="reference-text"]//text()')
# An earlier lookup of //cite[@id="Refernces"] was overwritten by this one
# before it was ever read, so only this query is kept.
reftext = tree.xpath('//cite//text()')
divtext = tree.xpath('//div[@id="mw-content-text"]//p/text()')
# # print divtext
# # print reftext
# # print refbooktext
# Text nodes longer than this many characters are written on a line of their
# own; shorter fragments are joined with single spaces.
LONG_RUN_CHARS = 35

# The original note here records that the class attribute on formula images
# keeps changing, even on the same page (for example
# "mwe-math-fallback-image-inline" alone, or followed by "mw-math-element").
# Matching the class as a whitespace-separated token accepts all of these,
# where an exact string comparison only matched the first.
MATH_IMG_XPATH = (
    '//img[contains(concat(" ", normalize-space(@class), " "), '
    '" mwe-math-fallback-image-inline ")]'
)


def ascii_html(text):
    """Return *text* as HTML-safe, ASCII-only markup.

    ``&``, ``<``, ``>`` and ``"`` are replaced by entities so that scraped
    text (LaTeX routinely contains ``&`` and ``<``) cannot break the
    report's markup.  Non-ASCII characters become numeric character
    references such as ``&#224;``, which keeps the file plain ASCII without
    turning those characters into ``?``.
    """
    escaped = escape(text, {'"': '&quot;'})
    if isinstance(escaped, bytes):
        # Python 2 byte string: already encoded, so it is written unchanged.
        return escaped
    return escaped.encode('ascii', 'xmlcharrefreplace').decode('ascii')


def write_text_runs(out, runs):
    """Write every text node in *runs* to the open file *out*.

    Nodes longer than LONG_RUN_CHARS are wrapped in ``<br>`` tags; shorter
    ones are followed by a single space.
    """
    for run in runs:
        safe = ascii_html(run)
        if len(run) > LONG_RUN_CHARS:
            out.write('<br>' + safe + '<br>')
        else:
            out.write(safe + ' ')


# Read alt (the LaTeX source) and src from the same <img> element so the two
# lists always line up; a missing attribute becomes '' rather than shifting
# every later pair by one.
math_images = tree.xpath(MATH_IMG_XPATH)
tex_term = [img.get('alt', '') for img in math_images]
img_term = [img.get('src', '') for img in math_images]
print(tex_term)
print(img_term)

# A page without a <title> falls back to its URL as the link text.
page_title = titletext[0] if titletext else url_name

# The with-block closes the report even if a write fails part-way through.
with open(html_file, 'w') as fileout:
    fileout.write(html_header)
    fileout.write('<br><a href="' + ascii_html(url_name) + '">'
                  + ascii_html(page_title) + '</a><hr><br>')

    # One table row per formula: rendered image, inline $...$ form, and an
    # equation environment.
    for tex, src in zip(tex_term, img_term):
        if not tex:
            print('skipping formula image with empty alt text')
            continue
        tex = ascii_html(tex)
        fileout.write('<tr><td><img src="' + ascii_html(src) + '"><hr><br>$' + tex + '$')
        fileout.write('<br><hr><br>\\begin{equation}<br>')
        fileout.write(tex)
        fileout.write('<br>\\end{equation}<br><br></td></tr>')

    fileout.write('<tr><td>References<br><br><br></td></tr><tr><td>')
    write_text_runs(fileout, refbooktext)

    fileout.write('</td></tr><tr><td>Text<br><br><br>')
    write_text_runs(fileout, divtext)

    fileout.write('<br><br></td></tr>')
    fileout.write(html_footer)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment