Bethe_Ansatz/scripts/export_split.py

320 lines
11 KiB
Python
Executable File

#! /usr/bin/env python
import os
from lxml.html import parse, etree, tostring
# HTML snippet with the Creative Commons BY 4.0 license notice;
# write_file_end() writes it into every exported page, right before the
# postamble.
copyright_string = """
<div class="license">
<a rel="license noopener" href="https://creativecommons.org/licenses/by/4.0/"
target="_blank" class="m-2">
<img alt="Creative Commons License" style="border-width:0"
src="https://licensebuttons.net/l/by/4.0/80x15.png"/>
</a>
Except where otherwise noted, all content is licensed under a
<a rel="license noopener" href="https://creativecommons.org/licenses/by/4.0/"
target="_blank">Creative Commons Attribution 4.0 International License</a>.
</div>
"""
# All paths are relative to the directory the script is run from.
BASEDIR = "./"
SRCDIR = f"{BASEDIR}src/org/"

# Parse the single-page HTML export that gets split into separate pages.
tree = parse(f"{SRCDIR}index.html")

# Landmark elements of the source page. Each one stays None when the scan
# below finds no match; when several elements match, the last one in
# document order wins.
el_head = el_header = el_title = el_nav = None
el_ttoc = el_content = el_postamble = None
sections_2 = []  # every element whose class starts with 'outline-2'

for el in tree.iter():
    attrs = el.attrib
    css_class = attrs.get('class', '')
    el_id = attrs.get('id')

    # landmarks recognised by tag name
    if el.tag == 'head':
        el_head = el
    elif el.tag == 'header':
        el_header = el
    elif el.tag == 'nav':
        el_nav = el

    # landmarks recognised by class
    if css_class == 'title':
        el_title = el
    elif css_class.startswith('outline-2'):
        sections_2.append(el)

    # landmarks recognised by id
    if el_id == 'text-table-of-contents':
        el_ttoc = el
    elif el_id == 'content':
        el_content = el
    elif el_id == 'postamble':
        el_postamble = el

# Make sure the output directory exists before any page is written.
export_dir = f"{BASEDIR}export/html"
os.makedirs(export_dir, exist_ok=True)
def ordered_pages(el):
    """
    Collect the pages listed in the table of contents, in document order.

    `el` should be the text-table-of-contents element. Every `a` element
    found in it (at any depth) that carries an `href` contributes one page.

    Returns a pair of parallel lists `(page_ids, page_texts)`:
    `page_ids` holds the link targets with the leading '#' removed,
    `page_texts` holds the corresponding link texts. The position of a page
    in these lists is what previous/next/up links are derived from.
    """
    page_ids = []
    page_texts = []
    for anchor in el.iter('a'):
        href = anchor.attrib.get('href')
        if href is None:  # an <a> without target does not denote a page
            continue
        page_ids.append(href.lstrip('#'))
        # `anchor.text` stops at the first nested element (and is None when
        # the link starts with one), so titles containing inline markup such
        # as <i> or <code> would be truncated; join all text pieces instead.
        page_texts.append(''.join(anchor.itertext()))
    return page_ids, page_texts
# Parallel, document-ordered lists of page ids and page titles taken from the
# table of contents; the title, breadcrumb and link helpers read them as
# module-level globals.
page_ids, page_texts = ordered_pages(el_ttoc)
def head_at_location(el_head, page_id):
    """
    Serialize the `head` element for the page `page_id`.

    For a page listed in `page_ids` other than the first one, the page title
    is appended to the document title as ': <page title>'. The first page and
    ids that are not in `page_ids` get the head unchanged.
    """
    head_html = tostring(el_head, pretty_print=True, encoding='unicode')
    # position 0 is the main page; unknown ids are treated the same way
    position = page_ids.index(page_id) if page_id in page_ids else 0
    if not position:
        return head_html
    title_end = f': {page_texts[position]}</title>'
    return head_html.replace('</title>', title_end)
def breadcrumbs(page_id):
    """
    Build the breadcrumb trail for the page `page_id`.

    The result is a `<ul class="breadcrumbs">` listing every ancestor of the
    page as a link (outermost first), followed by the page's own title as
    plain text. Returns '' for the main page (first entry of `page_ids`) and
    for ids that are not in `page_ids`.

    Ancestors are found by repeatedly stripping the last '_'-separated
    component of the id. Two kinds of ids start from a fixed parent instead:
    ids starting with "b-" (books) start at "l_b", and ids whose first four
    characters are digits (articles) start at "l_a_<those characters>".

    Raises ValueError if an ancestor id is missing from `page_ids`.
    """
    try:
        index = page_ids.index(page_id)
    except ValueError:
        return ''
    if index == 0:  # main page, no need for breadcrumbs
        return ''

    # If book or article, take shortcut
    if page_id.startswith("b-"):  # this is a book
        parent_id = "l_b"
    elif page_id[:4].isdigit():  # this is an article
        parent_id = f"l_a_{page_id[:4]}"
    else:
        parent_id = page_id.rpartition('_')[0]

    # Collected innermost-first while climbing, reversed when joined.
    crumbs = [f'<li>{page_texts[index]}</li>']
    while parent_id:
        parent_index = page_ids.index(parent_id)
        crumbs.append(
            # give class to link to prevent auto addition of ids from
            # cleanup_links script; note the space after the class attribute:
            # HTML requires whitespace between attributes
            '<li><a class="breadcrumb-link" '
            f'href="{parent_id}.html">'
            f'{page_texts[parent_index]}</a></li>'
        )
        parent_id = parent_id.rpartition('_')[0]

    trail = ''.join(reversed(crumbs))
    return f'<ul class="breadcrumbs">{trail}</ul>'
def link_previous(page_id):
    """
    Return an HTML link to the page preceding `page_id` in table-of-contents
    order.

    Returns None when `page_id` is unknown, is the first page, or is a
    depth-1 page (its id contains no '_').
    """
    if page_id not in page_ids:
        return None
    position = page_ids.index(page_id)
    # the first page and depth-1 pages have no previous page
    if position == 0 or '_' not in page_id:
        return None
    prev_id = page_ids[position - 1]
    dotted = prev_id.replace("_", ".")
    return (
        f'<a href="{prev_id}.html">{page_texts[position - 1]}'
        f'&emsp;<small>[{dotted}]</small></a>'
    )
def link_next(page_id):
    """
    Return an HTML link to the page following `page_id` in table-of-contents
    order, or None when `page_id` is unknown or is the last page.
    """
    if page_id not in page_ids:
        return None
    following = page_ids.index(page_id) + 1
    if following == len(page_ids):  # last, no next
        return None
    next_id = page_ids[following]
    dotted = next_id.replace("_", ".")
    return (
        f'<a href="{next_id}.html">{page_texts[following]}'
        f'&emsp;<small>[{dotted}]</small></a>'
    )
def link_up(page_id):
    """
    Return an HTML link to the parent of `page_id`, i.e. the page whose id is
    `page_id` with its last '_'-separated component removed.

    Returns None when `page_id` is unknown or contains no '_'.
    Raises ValueError when the parent id is not in `page_ids`.
    """
    if page_id not in page_ids:
        return None
    parent_id, separator, _ = page_id.rpartition('_')
    if not separator:  # no '_' in the id: nothing above this page
        return None
    up_index = page_ids.index(parent_id)
    up_id = page_ids[up_index]
    dotted = up_id.replace("_", ".")
    return (
        f'<a href="{up_id}.html">{page_texts[up_index]}'
        f'&emsp;<small>[{dotted}]</small></a>'
    )
def navigation_links(location):
    """
    Return a `<ul class="navigation-links">` holding the Prev / Next / Up
    links available for the page `location`, in that order.

    Returns '' when none of the three links exists.
    """
    candidates = (
        ('Prev', link_previous(location)),
        ('Next', link_next(location)),
        ('Up', link_up(location)),
    )
    items = [
        f'<li>{label}:&nbsp;{link}</li>'
        for label, link in candidates
        if link
    ]
    if not items:
        return ''
    return '<ul class="navigation-links">' + ''.join(items) + '</ul>'
def search_box():
    """
    Return the HTML of the search form: a DuckDuckGo query form whose hidden
    `sites` field is set to integrability.org.
    """
    form_lines = (
        '',
        '<form style="float: right; padding-right: 0;" method="get" id="search" action="https://duckduckgo.com/" target="_blank">',
        '<input type="hidden" name="sites" value="integrability.org"/>',
        '<input class="search" type="text" name="q" maxlength="300" placeholder="Search"/>',
        '<input type="submit" value="Search" style="visibility: hidden; width: 0;" /></form>',
        '',
    )
    # the empty first and last entries give the leading and trailing newline
    return '\n'.join(form_lines)
def list_to_details_recursive(el):
    """
    Convert a (nested) table-of-contents list into collapsible HTML.

    `el` must have one of these shapes:
      * a single `a` child: the link is returned as is;
      * a single `ul` child: a details/summary block titled
        'Table of contents' (followed by the search box) is returned;
      * an `a` followed by a `ul`: a details/summary block is returned with
        the link as its summary.
    In the last two cases every `li` of the `ul` is converted recursively.

    Raises ValueError when `el` or one of the nested lists has another shape.
    """
    n_children = len(el)

    # checks
    if n_children not in (1, 2):
        raise ValueError(f'el must have either 1 or 2 children, found {len(el)}')
    first = el[0]
    if first.tag not in ['a', 'ul']:
        raise ValueError(f'el[0] must be an a or ul tag, but found a {el[0].tag}')
    if n_children == 2 and (first.tag != 'a' or el[1].tag != 'ul'):
        raise ValueError(f'for len(el) == 2, el[1] must be ul, but found {el[1].tag}')

    # single link, output as is
    if n_children == 1 and first.tag == 'a':
        return tostring(first, pretty_print=True, encoding='unicode')

    # otherwise build a details/summary
    if n_children == 1:
        summary_text = 'Table of contents' + search_box()
        ul = first
    else:
        summary_text = tostring(first, pretty_print=True, encoding='unicode')
        ul = el[1]

    parts = [
        '\n<details>',
        f'\n<summary>\n{summary_text}',
        '\n</summary>\n<ul>\n',
    ]
    for li in ul:
        if li.tag != 'li':
            raise ValueError('child of ul should be li')
        parts.append('<li>\n')
        parts.append(list_to_details_recursive(li))
        parts.append('\n</li>\n')
    parts.append('\n</ul>\n</details>')
    return ''.join(parts)
# The whole table of contents as nested details/summary blocks, all closed;
# toc_at_location() derives the per-page variant from this string.
collapsed_toc = (
    '\n<nav id="collapsed-table-of-contents">'
    + list_to_details_recursive(el_ttoc)
    + '\n</nav>\n'
)
def toc_at_location(collapsed_toc, location):
    """
    Return `collapsed_toc` adapted to the page `location` (a filename
    without extension).

    Unless `location` is 'index', the details blocks of all ancestors of the
    page are opened and the entry of the page itself is marked with the
    class `toc-currentpage`. In every case all links get the class `toc-a`.
    """
    def closed_entry(target):
        return f'<details>\n<summary>\n<a href="#{target}"'

    def opened_entry(target, css_class):
        return (f'<details open="">\n<summary class="{css_class}">'
                f'\n<a href="#{target}"')

    marked = collapsed_toc
    if location != 'index':
        # open all ancestors, from the direct parent outwards
        ancestor = location.rpartition('_')[0]
        while ancestor:
            marked = marked.replace(closed_entry(ancestor),
                                    opened_entry(ancestor, 'toc-open'))
            ancestor = ancestor.rpartition('_')[0]

        # highlight the current location, whether it is a summary ...
        marked = marked.replace(closed_entry(location),
                                opened_entry(location, 'toc-currentpage'))
        # ... or a plain list item
        marked = marked.replace(
            f'<li>\n<a href="#{location}"',
            f'<li class="toc-currentpage">\n<a href="#{location}"')

        # but close all details which contain deeper levels than location
        marked = marked.replace(
            f'<details open="">\n<summary>\n<a href="#{location}_',
            f'<details>\n<summary>\n<a href="#{location}_')

    return marked.replace('<a href', '<a class="toc-a" href')
def write_file_start(_file, location):
    """
    Write the opening part of the page `location` to `_file`: doctype, head,
    the start of the content div, the title header linking to the homepage,
    the table of contents, the breadcrumbs and the navigation links.
    """
    def opening_chunks():
        # A generator, so that each chunk is only computed right before it
        # is written.
        yield '<!DOCTYPE html>\n<html lang="en">\n'
        yield head_at_location(el_head, location)
        yield ('<div id="content">\n'
               '<header>\n<h1 class="title">\n'
               '<a href="./index.html" class="homepage-link">')
        yield f'{el_title.text}</a>\n</h1>\n</header>'
        yield toc_at_location(collapsed_toc, location)
        yield breadcrumbs(location)
        yield navigation_links(location)

    for chunk in opening_chunks():
        _file.write(chunk)
def write_file_end(_file, location):
    """
    Write the closing part of the page `location` to `_file`: the navigation
    links once more, a rule, the license notice, the postamble, and the tags
    closing the content div and the document.
    """
    emit = _file.write
    emit('\n<br>')
    emit(navigation_links(location))  # repeat, for convenience
    emit('\n<br>' '\n<hr>')
    emit(copyright_string)
    postamble_html = tostring(el_postamble, pretty_print=True, encoding='unicode')
    emit(postamble_html)
    emit('\n</div>\n</html>')
def write_files_recursive(name, el, levelmax=5):
    """
    Recursively extract outlines, writing one HTML file per outline.

    `name` is the output filename without extension; the file is written to
    `{BASEDIR}export/html/{name}.html`.
    `el` is the element to export: either the element with id 'content'
    (treated as level 1) or an element whose class starts with 'outline-N',
    N being its level.
    `levelmax` is the deepest level that still gets a file of its own; an
    element at that level is written out in full, sub-outlines included.

    If `el` has outline children of the next level (and is above `levelmax`),
    only the material preceding the first such child is written, followed by
    a list of links to the children, each of which is exported to its own
    file, named after the id of its heading.

    If `el` is None or is neither the content element nor an outline, a
    message is printed and nothing is written.

    Raises StopIteration if a child outline has no heading of its level or
    the heading contains no link, and KeyError if the heading has no id.
    """
    no_outline_message = (
        f'Element name {name} has no outline class, no files written.')
    # Check for None first: every test below dereferences `el`.
    if el is None:
        print(no_outline_message)
        return

    if el.attrib.get('id') == 'content':
        level = 1
    else:
        css_class = el.attrib.get('class', '')
        level_text = css_class.partition('outline-')[2][:1]
        # Classes such as 'outline-text-2' start with 'outline-' too, but
        # carry no level right after the prefix.
        if not (css_class.startswith('outline-') and level_text.isdigit()):
            print(no_outline_message)
            return
        level = int(level_text)

    next_level_class = f'outline-{level + 1}'

    def is_next_level(node):
        return node.attrib.get('class', '').startswith(next_level_class)

    # Count outline children for establishing output format
    children = [node for node in el.iter() if is_next_level(node)]

    # `with` closes the file even if writing a page fails half-way.
    # NOTE(review): UTF-8 is chosen explicitly so the result does not depend
    # on the locale; this assumes the exported <head> declares UTF-8 (the
    # org-mode default) -- confirm.
    with open(f'{BASEDIR}export/html/{name}.html', 'w',
              encoding='utf-8') as _file:
        write_file_start(_file, name)
        # if no children or if we're on levelmax, write everything
        if not children or level == levelmax:
            _file.write(tostring(el, pretty_print=True, encoding='unicode'))
        # otherwise write anything above the first next-level headline,
        # and then replace the next-level headlines by links to their files
        else:
            if el.text:
                _file.write(el.text)
            for child in el:
                if is_next_level(child):
                    break  # stop once we hit the first next-level headline
                if child.tag != "nav":  # don't write the table-of-contents
                    _file.write(
                        tostring(child, pretty_print=True, encoding='unicode'))
            # now print the list of children
            _file.write(f'<h{level+1}>In this section:</h{level+1}>')
            _file.write('\n<ul class="child-links-list">')
            for child in children:
                child_h = next(child.iter(f'h{level + 1}'))
                child_label = child_h.attrib['id']
                child_text = next(child_h.iter('a')).text
                _file.write(
                    f'\n<li><a href="{child_label}.html">{child_text}</a></li>')
                # pass levelmax on, otherwise a non-default value would only
                # apply to the top-level call
                write_files_recursive(name=child_label, el=child,
                                      levelmax=levelmax)
            _file.write('\n</ul>\n')
        write_file_end(_file, name)
# Rewrite the index.html file. write_files_recursive() calls itself for every
# sub-outline it links to, so this single call produces all pages.
write_files_recursive('index', el_content) # the only call needed