Bethe_Ansatz/scripts/cleanup_links.py

113 lines
5.3 KiB
Python
Executable File

#! /usr/bin/env python
import os
from lxml.html import parse, etree, tostring
# Inline SVG "link" glyph (16x16, drawn in the surrounding text colour via
# fill="currentColor") that is embedded in every permalink generated below.
# The `bi bi-link` classes suggest it was taken from Bootstrap Icons.
link_svg = "\n".join([
    '<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-link" viewBox="0 0 16 16">',
    '<path d="M6.354 5.5H4a3 3 0 0 0 0 6h3a3 3 0 0 0 2.83-4H9c-.086 0-.17.01-.25.031A2 2 0 0 1 7 10.5H4a2 2 0 1 1 0-4h1.535c.218-.376.495-.714.82-1z"/>',
    '<path d="M9 5.5a3 3 0 0 0-2.83 4h1.098A2 2 0 0 1 9 6.5h3a2 2 0 1 1 0 4h-1.535a4.02 4.02 0 0 1-.82 1H12a3 3 0 1 0 0-6H9z"/>',
    '</svg>',
])
# Every path used from here on is relative to the HTML export directory.
os.chdir('export/html')
# Base names (extension removed) of all exported pages.  The suffix test
# includes the dot: a directory entry that merely ends in "html" (e.g.
# `xhtml` or `page.xhtml`) would otherwise be accepted and reduced to an
# empty base name, which later breaks the `parse(f'{filename}.html')` call.
html_suffix = '.html'
filenames = [
    entry[:-len(html_suffix)]
    for entry in os.listdir()
    if entry.endswith(html_suffix)
]
# Scan every exported page once and record, keyed by the page's base name:
#   dl -- ids of the dedicated link targets (`<a id="...">`) found in it
#   hl -- its headlines, each a dict with the keys 'tag', 'id' and 'text'
#   cl -- placeholder for child-section links; nothing is collected yet
#         TODO: fill from the `<li><a href="....html">` items inside
#         `<ul class="child-link-list">`.
dl = {}
hl = {}
cl = {}
headline_tags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
for filename in filenames:
    dl[filename] = []
    hl[filename] = []
    # hl[filename] stores dicts, so a plain `id in hl[filename]` test can
    # never succeed; keep the ids seen so far in a separate set instead.
    seen_headline_ids = set()
    tree = parse(f'{filename}.html')
    for el in tree.iter():
        # find all dedicated links, which are of form 'a id="..."'
        # (they are the only links with an id)
        if el.tag == 'a' and 'id' in el.attrib:
            link_id = el.attrib['id']
            # raise flag if id coincides with a filename:
            if link_id in filenames:
                print("** Error: dedicated link name clashes with "
                      f"headline CUSTOM_ID {link_id} **")
            # raise flag if this key already exists in this file:
            if link_id in dl[filename]:
                print(f"** Error: multiply-defined label {link_id} **")
            else:  # add this dedicated link to our dictionary
                dl[filename].append(link_id)
        # find the headline links, which are of form '<h[1-6] id="...">'
        if el.tag in headline_tags and 'id' in el.attrib:
            headline_id = el.attrib['id']
            if headline_id in seen_headline_ids:
                print(f"** Error: multiply-defined headline {headline_id} **")
            elif len(el) == 0:
                # The headline text is read from the first child (expected
                # to be the `<a>` tag); without one there is nothing to
                # record, so report it rather than fail with IndexError.
                seen_headline_ids.add(headline_id)
                print(f"** Warning: headline {headline_id} has no child "
                      "element; skipped **")
            else:
                seen_headline_ids.add(headline_id)
                hl[filename].append({
                    'tag': el.tag,
                    'id': headline_id,
                    'text': el[0].text  # el[0] is the a tag (only child)
                })
def _append_once(text, anchor, suffix):
    """Return *text* with *suffix* placed after every *anchor*.

    Any `anchor + suffix` already present is first reduced back to `anchor`,
    so applying the same substitution to a page that was processed before
    leaves it unchanged instead of stacking a second copy of *suffix*.
    """
    return text.replace(anchor + suffix, anchor).replace(anchor, anchor + suffix)


# Perform all substitutions, rewriting each exported page in place.
# NOTE(review): the pages are read and written with the platform's default
# text encoding; consider passing `encoding=` explicitly once the encoding of
# the export is confirmed.
for filename in filenames:
    with open(f'{filename}.html', 'r') as file:
        content = file.read()
    # cleanup any stray type in style/script tags coming from old-fashioned
    # org export
    content = content.replace('style type="text/css"', 'style ')
    content = content.replace('script type="text/javascript"', 'script ')
    # remove validation link if present
    content = content.replace(
        '<a href="https://validator.w3.org/check?uri=referer">Validate</a>',
        '')
    # section link substitutions: a same-page reference to an id that is
    # also a page name is pointed at that page, linking directly to the
    # headline at `#{val}` even if it's top-level
    for val in filenames:
        content = content.replace(
            f'href="#{val}"', f'href="./{val}.html#{val}"')
    # equation link substitutions: point references to a dedicated link at
    # the page (`key`) in which that link target was found
    for key, vals in dl.items():
        for val in vals:
            content = content.replace(
                f'href="#{val}"', f'href="./{key}.html#{val}"')
            # add a permalink icon right after the (empty) link target
            content = _append_once(
                content,
                f'<a id="{val}"></a>',
                f'<a href="./{key}.html#{val}">{link_svg}</a>')
    # add permalinks to headlines
    for key, vals in hl.items():
        for val in vals:
            el_tag = val['tag']
            el_id = val['id']
            el_text = val['text']
            # the displayed section id is the headline id with `_` shown as `.`
            dotted_id = el_id.replace("_", ".")
            id_span = f'<span class="headline-id">{dotted_id}</span>'
            # Only headlines whose link already has the
            # `./{key}.html#{el_id}` form match here; the link text becomes
            # plain text, followed by the permalink icon and the section id.
            content = content.replace(
                (f'<{el_tag} id="{el_id}">'
                 f'<a href="./{key}.html#{el_id}">{el_text}</a></{el_tag}>'),
                (f'<{el_tag} id="{el_id}">{el_text}'
                 f'<a class="headline-permalink" href="./{key}.html#{el_id}">'
                 f'{link_svg}</a>'
                 f'{id_span}'
                 f'</{el_tag}>')
            )
            # add section ids to all `child-link-list`s (the search string
            # ends in `</a></li>`, which no longer occurs after the
            # substitution, so this one is naturally applied only once)
            content = content.replace(
                f'<li><a href="{el_id}.html">{el_text}</a></li>',
                f'<li><a href="{el_id}.html">{el_text}</a>{id_span}</li>'
            )
            # add section ids to items in the toc
            content = _append_once(
                content,
                f'<a class="toc-a" href="./{key}.html#{el_id}">{el_text}</a>',
                id_span)
    # rewrite file
    with open(f'{filename}.html', 'w') as outfile:
        outfile.write(content)