Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Fix rel canonical on matplotlib docs
import re
from glob import glob
# Matches a leading "MAJOR.MINOR.PATCH" release number such as "1.5.3" or
# "3.10.0".  Raw string: "\d" and "\." are regex escapes, not Python string
# escapes.  Each component may have several digits.  The pattern is only ever
# used with ``match`` (prefix match), so names with a suffix after the patch
# number still match.
pat = re.compile(r'\d+\.\d+\.\d+')


def is_versioned_doc(path):
    """Return True when the first component of *path* looks like a version.

    *path* is a '/'-separated relative path; only the part before the first
    '/' is tested against ``pat``.
    """
    return pat.match(path.split('/')[0]) is not None
# Map every non-versioned HTML page (path relative to the current directory)
# to the absolute URL that is treated as its canonical location.
all_files = glob('**/*.html', recursive=True)
subpath_rel = {
    html_path: 'https://matplotlib.org/' + html_path
    for html_path in all_files
    if not is_versioned_doc(html_path)
    # the 'examples' and 'xkcd' trees are left out here; examples are now in
    # the gallery.
    and html_path.split('/')[0] not in ('examples', 'xkcd')
}
print(f"{len(subpath_rel)} cannonical files found at top level")
from collections import Counter, defaultdict

# Group the canonical page paths by their bare file name (last path component).
paths_by_name = defaultdict(list)
for canonical_path in subpath_rel:
    paths_by_name[canonical_path.rpartition('/')[2]].append(canonical_path)

# Keep only the file names that identify exactly one canonical page:
# file name -> path of that page.
uniquename = {
    name: paths[0] for name, paths in paths_by_name.items() if len(paths) == 1
}
print(len(uniquename))

## ok, so now let's iterate on examples
# Debugging aid, kept disabled: shows the most frequent file names.
#print(Counter([k.split('/')[-1] for k in subpath_rel.keys()]).most_common(172))
def update_file_with_relcannonical(fname, rel):
    """Rewrite the HTML file *fname* in place so it declares *rel* as canonical.

    Every line containing ``rel="canonical"`` is dropped, and a
    ``<link rel="canonical" href="...">`` line is written immediately before
    each line containing ``</head>``.  All other lines are written back
    unchanged, with their original line terminators.

    Args:
        fname: path of the HTML file to rewrite.
        rel: URL placed in the ``href`` attribute of the inserted link.

    Returns:
        None.  A message is printed when the number of inserted links is not
        exactly one (no ``</head>`` line, or several); the file is rewritten
        either way.
    """
    with open(fname) as f:
        data = f.read()

    link_line = f' <link rel="canonical" href="{rel}" />\n'
    output = []
    c = 0
    # keepends=True keeps each line's own terminator, so unusual separators
    # (form feed, U+2028, ...) are preserved instead of becoming '\n', a
    # missing final newline stays missing, and an empty file yields no lines
    # instead of an IndexError.  The last line is processed like any other.
    for line in data.splitlines(keepends=True):
        if 'rel="canonical"' in line:
            continue
        if '</head>' in line:
            output.append(link_line)
            c += 1
        output.append(line)

    # The new content is fully built before the file is opened for writing
    # (which truncates it), so a failure above cannot leave a half-written page.
    with open(fname, 'w') as f:
        f.writelines(output)

    if c != 1:
        print('something went wrong with', fname, c)
# Assign a canonical URL to every page under the top-level ``examples/`` tree
# and record it in ``subpath_rel`` (keyed by the example's own path).
we = 0   # examples with a page at the same sub-path under gallery/
wne = 0  # examples with no equivalent found
wue = 0  # examples matched through a unique file name
for path in glob('examples/**/*.html', recursive=True):
    # Same sub-path, but under gallery/ instead of examples/.
    subpath = 'gallery/' + '/'.join(path.split('/')[1:])
    gallery_equiv = subpath_rel.get(subpath)
    if gallery_equiv:
        update_file_with_relcannonical(path, gallery_equiv)
        we += 1
        subpath_rel[path] = gallery_equiv
        continue

    fname = path.split('/')[-1]
    uname_match = uniquename.get(fname)
    if uname_match:
        wue += 1
        # ``uniquename`` holds relative paths.  Every value in ``subpath_rel``
        # must be an absolute URL because it is later written verbatim as the
        # href of a canonical link; a relative href would resolve against the
        # page that contains it.
        subpath_rel[path] = 'https://matplotlib.org/' + uname_match
    else:
        wne += 1
        # No equivalent page: the example is its own canonical location,
        # stored as an absolute URL for the same reason as above.
        subpath_rel[path] = 'https://matplotlib.org/' + path
print(f"{we} with equivalent, {wue} based on filename, {wne} without")
import sys
#sys.exit("Done")
########
from os import scandir

# Directories of the current directory whose name starts with a version
# number, in reverse lexicographic order.
version_docs = sorted(
    (
        entry.name
        for entry in scandir('.')
        if entry.is_dir() and pat.match(entry.name) is not None
    ),
    reverse=True,
)
print(version_docs)

# For each versioned page that has no canonical link yet, add one when the
# same sub-path (the path with the version directory removed) has a known
# canonical URL.
subv = 0
for version in version_docs:
    for path in glob(f'{version}/**/*.html', recursive=True):
        with open(path) as f:
            data = f.read()
        if 'rel="canonical"' in data:
            continue
        # Everything after the first '/', i.e. the path without its version.
        subpath = path.partition('/')[2]
        if subpath not in subpath_rel:
            continue
        update_file_with_relcannonical(path, subpath_rel[subpath])
        subv += 1
print('Done', subv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.