Last active
February 1, 2017 06:57
-
-
Save cryptospectrum/5996164 to your computer and use it in GitHub Desktop.
Scraping Latex Equations from Wikipedia Pages
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys
from xml.sax.saxutils import escape

import requests
# requests is a third party module, more robust than urllib...
# http://docs.python-requests.org/en/latest/
from lxml import html
# Pick the page to scrape and the report file from the command line:
#   no arguments   -> built-in sample page, test output file
#   one argument   -> that URL, default output file
#   two arguments  -> that URL and that output file (extras are ignored)
argc = len(sys.argv)
if argc == 1:
    # Alternative sample pages:
    #   https://en.wikipedia.org/wiki/Fa%C3%A0_di_Bruno%27s_formula
    #   https://secure.wikimedia.org/wikipedia/en/wiki/Euler%E2%80%93Maclaurin_formula
    #   https://en.wikipedia.org/wiki/Generalized_normal_distribution
    #   https://en.wikipedia.org/wiki/Generalised_logistic_function
    url_name = "https://en.wikipedia.org/wiki/Jacobi_elliptic_functions"
    # Alternative output names: wiki_strip_F2.html, wiki_strip_logisitcmap.html
    html_file = "wiki_strip_F_test.html"
else:
    url_name = sys.argv[1]
    html_file = 'wiki_strip_F.html' if argc == 2 else sys.argv[2]
#__HTML__________________________________________________________________
# Static opening part of the report: page styling, the banner with its canvas
# logo, and the start of the results table.  It deliberately ends inside an
# open <tr><td>, which html_footer closes.
html_header = """
<html><head>
<title>Wiki Latex Scraper</title>
<style type="text/css">
body
{background-color:#E7E7E7;}
h1{color:orange;text-align:center;}
p{font-family:"verdana";font-size:20px;}
</style>
<body>
<table border="0" WIDTH="100%"><tr><td><b><font size="8" face="verdana" color="FF0000">
Wiki Latex Scraper<br>
<canvas id="myCanvas2" width="125" height="125" style="border:1px solid #c3c3c3;">
Your browser does not support the canvas element.</canvas>
<script type="text/javascript">
var c=document.getElementById("myCanvas2");
var cxt=c.getContext("2d");
var grd=cxt.createRadialGradient(0,0,1,50,50,90);
grd.addColorStop(0.3,"#FF0000");
grd.addColorStop(0.9,"#000000");
cxt.fillStyle=grd;
cxt.fillRect(0,0,125,125);
cxt.font = "bold 30px sans-serif";
cxt.fillText("J", 110, 11);
</script><br></font>
<font size="6" face="verdana" color="FF0000">
Scraping with Python</font>
<br><DIV ALIGN=LEFT>
<!--<canvas id="myCanvas2" width="100" height="100" style="border:1px solid #c3c3c3;">
Your browser does not support the canvas element.
</canvas>
<script type="text/javascript">
var c=document.getElementById("myCanvas2");
var cxt=c.getContext("2d");
var grd=cxt.createRadialGradient(0,0,1,75,50,20);
grd.addColorStop(0,"#FF0000");
grd.addColorStop(1,"#000000");
cxt.fillStyle=grd;
cxt.fillRect(0,0,475,400);
cxt.font = "bold 30px sans-serif";
cxt.fillText("J", 50, 50);
</script>--></DIV></td></tr> </table>
<!-------------------------------------------------------------->
<table border="0" WIDTH="80%" ALIGN=CENTER>
<tr><td colspan=2><br></b></td></tr></table>
<table border="1" ALIGN=CENTER WIDTH="80%" BORDERCOLOR=#000000 BORDERCOLORLIGHT=#33CCFF BORDERCOLORDARK=#0000CC>
<tr BGCOLOR=#FF0000>
<td><b><font face="verdana" color="black" size="4.5">Latex Tags</font></b></td></tr>
<tr><td>
"""
# Closes the cell, row, table and document left open by html_header.
html_footer = "</td></tr></table></body></html>"
#_________________________________________________________________
# Fetch the page, collect the LaTeX of every inline math image together with
# the reference and paragraph text, and write it all into one HTML report.
# Written to run unchanged on Python 2 and Python 3.

# Text fragments longer than this many characters get a line of their own in
# the report; shorter ones are run together, separated by spaces.
LONG_FRAGMENT_CHARS = 35


def as_text(value):
    """Return *value* as a text string, decoding byte strings as UTF-8.

    On Python 2 plain literals are byte strings and lxml may hand back
    either byte or unicode strings; normalising here means the report is
    assembled as text and encoded exactly once, when it is written.
    """
    if isinstance(value, bytes):
        return value.decode('utf-8', 'replace')
    return value


def html_escape(value, quote=False):
    """Return *value* as text with HTML-special characters escaped.

    Scraped LaTeX and prose contain ``<``, ``>`` and ``&`` which would
    otherwise be parsed as markup.  With *quote* set, double quotes are
    escaped as well so the result can sit inside an attribute value.
    """
    extra = {'"': '&quot;'} if quote else {}
    return escape(as_text(value), extra)


def join_fragments(fragments):
    """Return the report HTML for a list of scraped text *fragments*."""
    pieces = []
    for fragment in fragments:
        text = as_text(fragment)
        if len(text) > LONG_FRAGMENT_CHARS:
            pieces.append(u'<br>' + html_escape(text) + u'<br>')
        else:
            pieces.append(html_escape(text) + u' ')
    return u''.join(pieces)


def equation_row(src, tex):
    """Return one table row: the rendered image, then its LaTeX twice
    (inline ``$...$`` form and ``equation`` environment form)."""
    safe_tex = html_escape(tex)
    return u''.join([
        u'<tr><td><img src="', html_escape(src, quote=True), u'">',
        u'<hr><br>$', safe_tex, u'$',
        u'<br><hr><br>\\begin{equation}<br>', safe_tex,
        u'<br>\\end{equation}<br><br></td></tr>',
    ])


# A timeout keeps the script from hanging forever on an unresponsive server.
htmlRaw = requests.get(url_name, timeout=30)
print(htmlRaw.status_code)
print(htmlRaw.headers.get('content-type'))
print(htmlRaw.encoding)
tree = html.fromstring(htmlRaw.content)
#_________________________________________________________
# Future: ?Check if html already exists
titletext = tree.xpath('//head//title/text()')
print(titletext)
refbooktext = tree.xpath('//span[@class="reference-text"]//text()')
divtext = tree.xpath('//div[@id="mw-content-text"]//p/text()')

# Wiki-Issue: the class attribute of the math images keeps changing (extra
# class names get appended), so match it by substring instead of exactly.
math_images = tree.xpath(
    '//img[contains(@class, "mwe-math-fallback-image-inline")]')
# Read src and alt from the same element so that an image lacking one of
# them cannot shift the image/LaTeX pairing of every later equation.
equations = [(img.get('src', ''), img.get('alt', '')) for img in math_images]
tex_term = [tex for _, tex in equations]
img_term = [src for src, _ in equations]
print(tex_term)
print(img_term)

# Fall back to the URL as link text when the page has no <title>.
page_title = titletext[0] if titletext else url_name
parts = [
    html_header,
    u'<br><a href="', html_escape(url_name, quote=True), u'">',
    html_escape(page_title), u'</a><hr><br>',
]
for src, tex in equations:
    if tex:
        parts.append(equation_row(src, tex))
    else:
        print('skipping math image with empty alt text')
parts.append(u'<tr><td>References<br><br><br></td></tr><tr><td>')
parts.append(join_fragments(refbooktext))
parts.append(u'</td></tr><tr><td>Text<br><br><br>')
parts.append(join_fragments(divtext))
parts.append(u'<br><br></td></tr>')
parts.append(html_footer)

document = u''.join(as_text(part) for part in parts)
# The file stays pure ASCII; non-ASCII characters become numeric character
# references (&#NNN;), which browsers render correctly without a charset.
with open(html_file, 'wb') as fileout:
    fileout.write(document.encode('ascii', 'xmlcharrefreplace'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment