# Bethe_Ansatz/scripts/export_split.py

#! /usr/bin/env python

import os
from lxml.html import parse, etree, tostring

# HTML fragment with the Creative Commons Attribution 4.0 license notice;
# write_file_end() writes it near the bottom of every exported page.
copyright_string = """
<div class="license">
<a rel="license noopener" href="https://creativecommons.org/licenses/by/4.0/"
target="_blank" class="m-2">
<img alt="Creative Commons License" style="border-width:0"
src="https://licensebuttons.net/l/by/4.0/80x15.png"/>
</a>
Except where otherwise noted, all content is licensed under a
<a rel="license noopener" href="https://creativecommons.org/licenses/by/4.0/"
target="_blank">Creative Commons Attribution 4.0 International License</a>.
</div>
"""

# All paths are built from BASEDIR, which is relative to the current
# working directory.
BASEDIR = "./"
# Directory containing the single-page HTML file that gets split.
SRCDIR = f"{BASEDIR}src/org/"

# Parse the source document once; everything below works on this tree.
tree = parse(f"{SRCDIR}index.html")

# Landmark elements of the source document.  Each one stays None when the
# document contains no matching element.
el_head = el_header = el_title = el_nav = None
el_ttoc = el_content = el_postamble = None

# Every element whose class attribute starts with 'outline-2'.
sections_2 = []

# One pass over the whole tree.  The checks are independent on purpose: a
# single element may satisfy several of them, and when several elements
# satisfy the same check the one visited last is kept.
for el in tree.iter():
    tag = el.tag
    el_id = el.attrib.get('id')
    el_class = el.attrib.get('class')

    # Matches by tag name
    if tag == 'head':
        el_head = el
    if tag == 'header':
        el_header = el
    if tag == 'nav':
        el_nav = el

    # Matches by id attribute
    if el_id == 'text-table-of-contents':
        el_ttoc = el
    if el_id == 'content':
        el_content = el
    if el_id == 'postamble':
        el_postamble = el

    # Matches by class attribute
    if el_class == 'title':
        el_title = el
    if el_class is not None and el_class.startswith('outline-2'):
        sections_2.append(el)

# Make sure the output directory exists.  `exist_ok=True` creates it only
# when needed, without the check-then-create race of testing
# os.path.isdir() first.
export_dir = f"{BASEDIR}export/html"
os.makedirs(export_dir, exist_ok=True)


def ordered_pages(el):
    """
    Collect the ids and texts of all pages, in table-of-contents order.

    `el` should be the text-table-of-contents element.  Every `a` element
    found in it (at any depth) is treated as one page.

    Returns a pair of parallel lists `(page_ids, page_texts)`: the `href`
    values with leading '#' characters removed, and the complete link texts.

    Raises KeyError if an `a` element has no `href` attribute.
    """
    page_ids = []
    page_texts = []
    for link in el.iter('a'):
        page_ids.append(link.attrib['href'].lstrip('#'))
        # Join every text node of the anchor: `link.text` alone stops at the
        # first child element and is None when the anchor starts with markup.
        page_texts.append(''.join(link.itertext()))
    return page_ids, page_texts

# Parallel lists of page ids and link texts in table-of-contents order.
# The title, breadcrumb and navigation helpers below read them as globals.
page_ids, page_texts = ordered_pages(el_ttoc)

def head_at_location(el_head, page_id):
    """
    Serialize `el_head` for the page identified by `page_id`.

    For a page listed in `page_ids` other than the first one, the page's
    text is appended to the document title: every `</title>` in the
    serialized head becomes `: <page text></title>`.  For the first page, or
    for an id that is not in `page_ids`, the head is returned unchanged.
    """
    markup = tostring(el_head, pretty_print=True, encoding='unicode')
    # An unknown id is handled like the first page: both keep the plain title.
    position = page_ids.index(page_id) if page_id in page_ids else 0
    if position == 0:
        return markup
    suffix = f': {page_texts[position]}</title>'
    return markup.replace('</title>', suffix)

def breadcrumbs(page_id):
    """
    Build the breadcrumb trail for the page `page_id`.

    Returns a `<ul class="breadcrumbs">` string that lists the page's
    ancestors (outermost first, each as a link) followed by the page's own
    text.  Returns '' for the first page and for ids not in `page_ids`.

    Ancestors are found by repeatedly dropping the last '_'-separated part
    of the id.  Two kinds of id get a fixed starting ancestor instead: ids
    starting with "b-" start from "l_b", and ids whose first four characters
    are digits start from "l_a_<those four characters>".  An ancestor id
    that is not in `page_ids` is left out of the trail.
    """
    try:
        index = page_ids.index(page_id)
    except ValueError:
        return ''
    if index == 0:  # main page, no need for breadcrumbs
        return ''

    # If book or article, take shortcut
    if page_id.startswith("b-"):  # this is a book
        ancestor_id = "l_b"
    elif page_id[:4].isdigit():  # this is an article
        ancestor_id = f"l_a_{page_id[:4]}"
    else:
        ancestor_id = page_id.rpartition('_')[0]

    # Collected innermost-first, reversed before joining.
    crumbs = [f'<li>{page_texts[index]}</li>']
    while ancestor_id:
        try:
            ancestor_index = page_ids.index(ancestor_id)
        except ValueError:
            # No page for this ancestor: skip it instead of aborting the
            # export, and keep climbing.
            pass
        else:
            crumbs.append(
                # give class to link to prevent auto addition of ids from
                # cleanup_links script; note the space separating the two
                # attributes
                '<li><a class="breadcrumb-link" '
                f'href="{ancestor_id}.html">'
                f'{page_texts[ancestor_index]}</a></li>'
            )
        ancestor_id = ancestor_id.rpartition('_')[0]
    crumbs.reverse()
    trail = ''.join(crumbs)
    return f'<ul class="breadcrumbs">{trail}</ul>'

def link_previous(page_id):
    """
    Return an `<a>` link to the page listed just before `page_id`.

    Returns None when `page_id` is not in `page_ids`, when it is the first
    page, or when the id contains no '_' (a top-level page).
    """
    if page_id not in page_ids:
        return None
    position = page_ids.index(page_id)
    # The first page and top-level pages have no previous link.
    if position == 0 or '_' not in page_id:
        return None
    prev_id = page_ids[position - 1]
    prev_text = page_texts[position - 1]
    label = prev_id.replace("_", ".")
    return f'<a href="{prev_id}.html">{prev_text}&emsp;<small>[{label}]</small></a>'

def link_next(page_id):
    """
    Return an `<a>` link to the page listed just after `page_id`.

    Returns None when `page_id` is not in `page_ids` or is the last page.
    """
    if page_id not in page_ids:
        return None
    following = page_ids.index(page_id) + 1
    if following == len(page_ids):  # last, no next
        return None
    next_id = page_ids[following]
    next_text = page_texts[following]
    label = next_id.replace("_", ".")
    return f'<a href="{next_id}.html">{next_text}&emsp;<small>[{label}]</small></a>'

def link_up(page_id):
    """
    Return an `<a>` link to the parent of the page `page_id`.

    The parent id is `page_id` with its last '_'-separated part removed.
    Returns None when `page_id` is not in `page_ids`, when it contains no
    '_', or when the parent id is not in `page_ids`.
    """
    if page_id not in page_ids:
        return None
    if '_' not in page_id:
        return None
    # Strip one '_' from id and point there
    parent_id = page_id.rpartition('_')[0]
    try:
        index_up = page_ids.index(parent_id)
    except ValueError:
        # No page for the parent: behave like the other failed lookups and
        # produce no link instead of aborting the export.
        return None
    label = parent_id.replace("_", ".")
    return f'<a href="{parent_id}.html">{page_texts[index_up]}&emsp;<small>[{label}]</small></a>'

def navigation_links(location):
    """
    Build the `<ul class="navigation-links">` block for `location`.

    The list holds, in this order, the Prev, Next and Up links that exist
    for the page.  Returns '' when none of them exists.
    """
    labelled = (
        ('Prev', link_previous(location)),
        ('Next', link_next(location)),
        ('Up', link_up(location)),
    )
    items = [
        f'<li>{label}:&nbsp;{anchor}</li>'
        for label, anchor in labelled
        if anchor
    ]
    if not items:
        return ''
    return '<ul class="navigation-links">' + ''.join(items) + '</ul>'

def search_box():
    """
    Return the HTML of the search form.

    The form submits the query to DuckDuckGo with the `sites` field set to
    integrability.org, and opens the result in a new tab.
    """
    indent = '    '
    rows = (
        '<form style="float: right; padding-right: 0;" method="get" id="search" action="https://duckduckgo.com/" target="_blank">',
        '<input type="hidden" name="sites" value="integrability.org"/>',
        '<input class="search" type="text" name="q" maxlength="300" placeholder="Search"/>',
        '<input type="submit" value="Search" style="visibility: hidden; width: 0;" /></form>',
    )
    # Leading newline, every row indented, and a final indented empty line.
    return '\n' + ''.join(f'{indent}{row}\n' for row in rows) + indent

def list_to_details_recursive(el):
    """
    Render a table-of-contents node as nested details/summary markup.

    `el` has either one child (an `a` or a `ul`) or two children (an `a`
    followed by a `ul`):

    - a single `a` is serialized as is;
    - a single `ul` becomes a details element whose summary is the text
      'Table of contents' followed by the search box;
    - an `a` plus `ul` becomes a details element whose summary is the
      serialized `a`.

    In the last two cases every `li` of the `ul` is rendered recursively.

    Raises ValueError when `el` or one of its lists has a different shape.
    """
    child_count = len(el)
    if child_count not in (1, 2):
        raise ValueError(f'el must have either 1 or 2 children, found {child_count}')
    first = el[0]
    if first.tag not in ('a', 'ul'):
        raise ValueError(f'el[0] must be an a or ul tag, but found a {first.tag}')
    if child_count == 2 and (first.tag != 'a' or el[1].tag != 'ul'):
        raise ValueError(f'for len(el) == 2, el[1] must be ul, but found {el[1].tag}')

    # A lone link is a leaf: emit it unchanged.
    if child_count == 1 and first.tag == 'a':
        return tostring(first, pretty_print=True, encoding='unicode')

    # Otherwise build a details/summary around the list.
    if child_count == 1:
        summary_text = 'Table of contents' + search_box()
        ul = first
    else:
        summary_text = tostring(first, pretty_print=True, encoding='unicode')
        ul = el[1]

    pieces = [
        '\n<details>',
        f'\n<summary>\n{summary_text}',
        '\n</summary>\n<ul>\n',
    ]
    for li in ul:
        if li.tag != 'li':
            raise ValueError('child of ul should be li')
        pieces.append('<li>\n')
        pieces.append(list_to_details_recursive(li))
        pieces.append('\n</li>\n')
    pieces.append('\n</ul>\n</details>')
    return ''.join(pieces)


# The table of contents rendered once as nested, closed details elements;
# toc_at_location() derives the per-page variant from this string.
collapsed_toc = (
    '\n<nav id="collapsed-table-of-contents">'
    + list_to_details_recursive(el_ttoc)
    + '\n</nav>\n'
)


def toc_at_location(collapsed_toc, location):
    """
    Return `collapsed_toc` with the entry for `location` (a page name)
    marked.

    Unless `location` is 'index', the details elements of all ancestors of
    `location` are opened and given the summary class 'toc-open', and the
    entry of `location` itself gets the class 'toc-currentpage' (on its
    summary if it is a details element, on its `li` if it is a plain link).
    In every case all links receive the class 'toc-a'.
    """
    def closed_entry(target):
        return f'<details>\n<summary>\n<a href="#{target}"'

    def open_entry(target, css_class):
        return (f'<details open="">\n<summary class="{css_class}">'
                f'\n<a href="#{target}"')

    marked = collapsed_toc
    if location != 'index':
        # open all ancestors, nearest first
        ancestor = location.rpartition('_')[0]
        while ancestor:
            marked = marked.replace(closed_entry(ancestor),
                                    open_entry(ancestor, 'toc-open'))
            ancestor = ancestor.rpartition('_')[0]
        # highlight the current location, whether it is a summary ...
        marked = marked.replace(closed_entry(location),
                                open_entry(location, 'toc-currentpage'))
        # ... or a plain list item
        marked = marked.replace(
            f'<li>\n<a href="#{location}"',
            f'<li class="toc-currentpage">\n<a href="#{location}"')
        # but close all details which contain deeper levels than location
        marked = marked.replace(
            f'<details open="">\n<summary>\n<a href="#{location}_',
            f'<details>\n<summary>\n<a href="#{location}_')
    return marked.replace('<a href', '<a class="toc-a" href')

def write_file_start(_file, location):
    """
    Write the opening part of the page `location` to `_file`.

    In order: the doctype and `<html>` tag, the head for this page, the
    opening of the content div with a header linking the title to
    index.html, the table of contents marked for this page, the
    breadcrumbs and the navigation links.
    """
    emit = _file.write
    emit('<!DOCTYPE html>\n<html lang="en">\n')
    emit(head_at_location(el_head, location))
    # Only the title text is reused; the header is rebuilt around a link
    # to the home page.
    emit(
        '<div id="content">\n'
        '<header>\n<h1 class="title">\n'
        '<a href="./index.html" class="homepage-link">'
    )
    emit(f'{el_title.text}</a>\n</h1>\n</header>')
    emit(toc_at_location(collapsed_toc, location))
    for build in (breadcrumbs, navigation_links):
        emit(build(location))

def write_file_end(_file, location):
    """
    Write the closing part of the page `location` to `_file`.

    In order: the navigation links once more, a rule, the license notice,
    the serialized postamble element, and the closing div and html tags.
    """
    emit = _file.write
    emit('\n<br>')
    emit(navigation_links(location))  # repeated at the bottom, for convenience
    emit('\n<br>' '\n<hr>')
    emit(copyright_string)
    emit(tostring(el_postamble, pretty_print=True, encoding='unicode'))
    emit('\n</div>\n</html>')


def write_files_recursive(name, el, levelmax=5):
    """
    Recursively write one HTML page per outline element.

    `name` is the page name; the page is written to
    `{BASEDIR}export/html/{name}.html`.

    `el` is either the element with id "content" (treated as level 1) or an
    element whose class starts with "outline-" followed by its level digit.
    If `el` is None or is neither of these, a message is printed and no
    file is written.

    `levelmax` is the deepest level that is still split.  An element at
    that level, or one without next-level outline descendants, is written
    out whole.  Otherwise the page holds whatever precedes the first
    next-level outline (except `nav` elements), followed by a list of links
    to the child pages; each child page is written by a recursive call that
    uses the same `levelmax`.
    """
    # Reject a missing element before touching its attributes.
    if el is None:
        print(f'Element name {name} has no outline class, no files written.')
        return
    el_class = el.attrib.get('class', '')
    if el.attrib.get('id') == 'content':
        level = 1
    elif el_class.startswith('outline-'):
        level = int(el_class.partition('outline-')[2][0])
    else:
        print(f'Element name {name} has no outline class, no files written.')
        return

    child_class = f'outline-{level + 1}'

    # `with` closes the file even when writing or the recursion fails; the
    # explicit encoding keeps non-ASCII text independent of the locale.
    with open(f'{BASEDIR}export/html/{name}.html', 'w',
              encoding='utf-8') as _file:
        write_file_start(_file, name)

        # Collect next-level outline descendants to decide the output format
        children = [
            child for child in el.iter()
            if child.attrib.get('class', '').startswith(child_class)
        ]

        # if no children or if we're on levelmax, write everything
        if not children or level == levelmax:
            _file.write(tostring(el, pretty_print=True, encoding='unicode'))
        # otherwise write anything above the first next-level headline,
        # and then replace the next-level headlines by links to their files
        else:
            if el.text:
                _file.write(el.text)
            for child in el:
                if child.attrib.get('class', '').startswith(child_class):
                    break  # stop at the first next-level headline
                if child.tag != "nav":  # don't write the table-of-contents
                    _file.write(
                        tostring(child, pretty_print=True, encoding='unicode'))
            # now print the list of children
            _file.write(f'<h{level+1}>In this section:</h{level+1}>')
            _file.write('\n<ul class="child-links-list">')
            for child in children:
                # The child's heading carries the page id; the link text is
                # taken from the first `a` inside that heading.
                child_h = next(child.iter(f'h{level + 1}'))
                child_label = child_h.attrib['id']
                child_text = next(child_h.iter('a')).text
                _file.write(
                    f'\n<li><a href="{child_label}.html">{child_text}</a></li>')
                # Pass levelmax on so a non-default limit applies at every
                # depth, not only at the top-level call.
                write_files_recursive(name=child_label, el=child,
                                      levelmax=levelmax)
            _file.write('\n</ul>\n')

        write_file_end(_file, name)


# Entry point: start from the element with id "content" and write
# index.html; every other page is written by the recursion.
write_files_recursive('index', el_content) # the only call needed