Skip to content

Instantly share code, notes, and snippets.

@dietmarw
Created February 23, 2012 14:35
Show Gist options
  • Save dietmarw/1893109 to your computer and use it in GitHub Desktop.
Save dietmarw/1893109 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
from __future__ import with_statement
from BeautifulSoup import BeautifulSoup
import subprocess as sub
import re
import glob
import sys
import os
# check_local_files() tests relative link targets with os.path.exists(), so the
# process has to run from the help directory that holds the HTML files.
os.chdir('/tmp/MSLrelease/Modelica_aster/Modelica aster/Resources/help/') # this is needed for check_local_files()
# Ordered (compiled pattern, replacement) pairs that linkreplace() applies one
# after another. The order matters: each rule sees the output of the rules
# before it. The URI scheme is matched case-insensitively via character
# classes; everything after the scheme is matched case-sensitively.
repls = [
    # modelica://<Library>/... resource URIs become paths relative to the
    # help directory.
    (re.compile(r'^[Mm][Oo][Dd][Ee][Ll][Ii][Cc][Aa]://Modelica/'), '../../../Modelica/'),
    (re.compile(r'^[Mm][Oo][Dd][Ee][Ll][Ii][Cc][Aa]://ModelicaReference/'), '../../../ModelicaReference/'),
    (re.compile(r'^[Mm][Oo][Dd][Ee][Ll][Ii][Cc][Aa]://ModelicaServices/'), '../../../ModelicaServices/'),
    # Point /Modelica/ path components at the "Modelica aster" directory.
    (re.compile(r'/Modelica/'), r'/Modelica%20aster/'),
    # Percent-encode blanks.
    (re.compile(r' '), r'%20'),
    # Class reference with a fragment:
    #   modelica://Pkg.Class#frag -> Pkg.Class.html#frag
    # The '#' is matched outside the group; capturing it would emit
    # 'Pkg.Class#.html#frag'.
    (re.compile(r'[Mm][Oo][Dd][Ee][Ll][Ii][Cc][Aa]://([A-Za-z0-9.\'()_]*)#'), r'\1.html#'),
    # Class reference without a fragment:
    #   modelica://Pkg.Class -> Pkg.Class.html
    (re.compile(r'[Mm][Oo][Dd][Ee][Ll][Ii][Cc][Aa]://([A-Za-z0-9.\'()_]*)'), r'\1.html'),
]
def linkreplace(link):
    """Return `link` after applying every (pattern, replacement) pair in `repls`.

    The pairs are applied in list order, each one to the result of the
    previous one. A value that is not a string is returned unchanged.
    """
    for regex, repl in repls:
        try:
            link = regex.sub(repl, link)
        except TypeError:
            # regex.sub() rejects non-string input. There is nothing to
            # rewrite in that case and no later pattern would accept it
            # either, so stop instead of swallowing every kind of exception.
            break
    return link
def check_local_files(uri, tag=None):
    """Report a link target that does not exist on the local file system.

    Targets starting with http://, https://, mailto:, ftp:// or '#' are not
    checked. Anything else is treated as a path relative to the current
    working directory; if it does not exist, a "Not found" line is written
    to stdout and added to MissingFiles.log.

    uri -- link target as found in the HTML; '%20' is decoded to a blank.
    tag -- text placed in front of the message. When omitted, the
           module-level variable `tag` is used if it exists, otherwise an
           empty string.
    """
    if not check_local_files._log_started:
        # Start every run with an empty log, but truncate only once: opening
        # with mode 'w' on each call wiped all entries written before it.
        open('MissingFiles.log', 'w').close()
        check_local_files._log_started = True
    if tag is None:
        # Existing callers rely on a module-level `tag` naming the file that
        # is currently being processed.
        tag = globals().get('tag', '')
    uri = uri.replace('%20', ' ')
    if uri.startswith(('http://', 'https://', 'mailto:', 'ftp://', '#')):
        return
    # Only the part in front of '#' names a file; with the fragment attached
    # a target such as 'Page.html#anchor' could never be found on disk.
    path = uri.split('#', 1)[0]
    if os.path.exists(path):
        return
    log_msg = tag + 'Not found: %s \n' % uri
    sys.stdout.write(log_msg)
    with open('MissingFiles.log', 'a') as log_file:
        log_file.write(log_msg)


# Set once the log file has been truncated for the current run.
check_local_files._log_started = False
def _rewrite_links(soup, element_name, attribute):
    """Rewrite and check `attribute` on every `element_name` tag of `soup`.

    Each attribute value is passed through linkreplace(), stored back on the
    tag, and then handed to check_local_files(). Tags that do not carry the
    attribute are skipped.
    """
    for element in soup.findAll(element_name):
        try:
            target = element[attribute]
        except KeyError:
            # The tag has no such attribute, so there is nothing to rewrite.
            continue
        element[attribute] = linkreplace(target)
        check_local_files(element[attribute])


# Tidy every HTML help file in place, then rewrite its <a href> and <img src>
# targets and report the local targets that are missing.
for filepath in sorted(glob.glob('/tmp/MSLrelease/Modelica_aster/Modelica aster/Resources/help/*.html')):
    # `tag` has to stay a module-level name: check_local_files() reads it to
    # prefix its messages with the file being processed.
    tag = '[Checking file %s]:\n' % filepath
    # Progress goes to stderr; stdout is left to the "Not found" reports.
    sys.stderr.write(tag)
    # sub.call() returns tidy's exit status; the value 2 is treated as
    # failure (presumably tidy's "errors" status -- see the tidy docs).
    status = sub.call(['tidy', '-modify', '-quiet', filepath])
    if status == 2:
        print('Tidy failed with %s, skipping link-replacement for %s!' % (status, filepath))
        continue
    with open(filepath, 'r') as html_file:
        soup = BeautifulSoup(html_file)
    _rewrite_links(soup, 'a', 'href')
    _rewrite_links(soup, 'img', 'src')
    with open(filepath, 'w') as html_file:
        html_file.write(str(soup))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment