#! /usr/bin/env python
"""Split a single-page HTML export into one HTML page per outline section.

Reads ``src/index.html``, locates its head, title, table of contents, content
and postamble elements, and writes ``export/html/<page>.html`` files.  Every
page gets a collapsible table of contents, breadcrumbs and previous/next/up
links.

NOTE(review): the copy of this script that was reviewed had lost its line
structure and every HTML tag inside its string literals (plus some statements
that sat between tags).  Control flow was restored from the surviving
statements.  All markup below is a reconstruction and is tagged
``RECONSTRUCTED``: confirm tag names, class names and label wording against
the last known-good revision before relying on the generated pages.
"""

import os

from lxml.html import parse, etree, tostring


# RECONSTRUCTED: only the sentence survived; the wrapping element is a guess.
copyright_string = """
<p class="copyright">
Except where otherwise noted, all content is licensed under a Creative Commons Attribution 4.0 International License.
</p>
"""

# NOTE(review): the search-box markup was lost entirely; restore it here.
searchbox_string = """ """

BASEDIR = "./"
SRCDIR = f"{BASEDIR}src/"

tree = parse(f"{SRCDIR}index.html")

el_head = None
el_header = None
el_title = None
el_nav = None
el_ttoc = None
el_content = None
el_postamble = None
sections_2 = []

# Single pass over the document.  The tests are independent of each other, so
# when several elements satisfy the same test the last one in document order
# is the one that is kept.
for el in tree.iter():
    el_class = el.get('class')
    el_id = el.get('id')
    if el.tag == 'head':
        el_head = el
    if el.tag == 'header':
        el_header = el
    if el_class == 'title':
        el_title = el
    if el.tag == 'nav':
        el_nav = el
    if el_id == 'text-table-of-contents':
        el_ttoc = el
    if el_id == 'content':
        el_content = el
    if el_id == 'postamble':
        el_postamble = el
    if el_class is not None and el_class.startswith('outline-2'):
        sections_2.append(el)

# Fail early with a readable message instead of an AttributeError/TypeError
# deep inside one of the writers below.
_required = {
    'head': el_head,
    'title': el_title,
    'text-table-of-contents': el_ttoc,
    'content': el_content,
    'postamble': el_postamble,
}
_missing = [label for label, found in _required.items() if found is None]
if _missing:
    raise SystemExit(
        f"{SRCDIR}index.html lacks required element(s): {', '.join(_missing)}"
    )

export_dir = f"{BASEDIR}export/html"
os.makedirs(export_dir, exist_ok=True)


def ordered_pages(el):
    """
    Creates a list of all pages, able to return previous, next and up links.

    `el` should be the text-table-of-contents element.

    Returns two parallel lists: the page ids (each link's href with leading
    '#' characters removed) and the link texts, both in document order.
    """
    page_ids = []
    page_texts = []
    for anchor in el.iter('a'):
        page_ids.append(anchor.attrib['href'].lstrip('#'))
        page_texts.append(anchor.text)
    return page_ids, page_texts


page_ids, page_texts = ordered_pages(el_ttoc)


def _page_link(index):
    """Return the link markup for the page at position `index` in the TOC."""
    page_id = page_ids[index]
    # RECONSTRUCTED: only the text part ("<title> [<dotted id>]") survived;
    # the anchor element and its href are assumptions.
    return (
        f'<a href="{page_id}.html">'
        f'{page_texts[index]} [{page_id.replace("_", ".")}]'
        '</a>'
    )


def head_at_location(el_head, page_id):
    """
    Serialize the head element, appending ': <page title>' to the document
    title for every page except the first TOC entry and unknown pages.
    """
    output = tostring(el_head, pretty_print=True, encoding='unicode')
    try:
        index = page_ids.index(page_id)
    except ValueError:
        return output
    if index == 0:
        return output
    # RECONSTRUCTED: the search string had been stripped to '' (which would
    # insert the suffix between every character); '</title>' is assumed.
    return output.replace('</title>', f': {page_texts[index]}</title>')


def breadcrumbs(page_id):
    """
    Return breadcrumb markup from the top-level ancestor down to `page_id`.

    Returns '' for the first TOC entry and for ids that are not in the TOC.
    Raises ValueError if a derived ancestor id is missing from the TOC.
    """
    try:
        index = page_ids.index(page_id)
    except ValueError:
        return ''
    if index == 0:
        # main page, no need for breadcrumbs
        return ''
    # RECONSTRUCTED markup (list items).
    breadcrumb_lis = f'\n<li>{page_texts[index]}</li>\n'
    # If book or article, take shortcut
    if page_id.startswith("b-"):
        # this is a book
        page_id_stripped = "l_b"
    elif page_id[:4].isdigit():
        # this is an article
        page_id_stripped = f"l_a_{page_id[:4]}"
    else:
        page_id_stripped = page_ids[index].rpartition('_')[0]
    # Walk up the hierarchy by dropping one '_'-separated component at a time.
    while page_id_stripped:
        index_stripped = page_ids.index(page_id_stripped)
        breadcrumb_lis = (
            # give class to link to prevent auto addition of ids from
            # cleanup_links script
            # RECONSTRUCTED: class name and href are assumptions.
            f'\n<li><a class="breadcrumb-link" href="{page_id_stripped}.html">'
            f'{page_texts[index_stripped]}</a></li>\n'
            + breadcrumb_lis)
        page_id_stripped = page_id_stripped.rpartition('_')[0]
    # RECONSTRUCTED wrapper element.
    return f'\n<ul class="breadcrumbs">\n{breadcrumb_lis}</ul>\n'


def link_previous(page_id):
    """Return a link to the preceding TOC entry, or None if there is none."""
    try:
        index = page_ids.index(page_id)
    except ValueError:
        return None
    if index == 0:
        # first, no previous
        return None
    # If depth is 1, there is no previous
    if page_id.count('_') == 0:
        return None
    return _page_link(index - 1)


def link_next(page_id):
    """Return a link to the following TOC entry, or None if there is none."""
    try:
        index = page_ids.index(page_id)
    except ValueError:
        return None
    if index == len(page_ids) - 1:
        # last, no next
        return None
    return _page_link(index + 1)


def link_up(page_id):
    """
    Return a link to the parent page, or None for top-level or unknown ids.

    Raises ValueError if the derived parent id is missing from the TOC.
    """
    if page_id not in page_ids:
        return None
    # Strip one '_' from id and point there
    if page_id.count('_') > 0:
        index_up = page_ids.index(page_id.rpartition('_')[0])
        return _page_link(index_up)
    return None


def navigation_links(location):
    """Return previous/next/up navigation markup, or '' if no link applies."""
    # The reviewed copy tested `previous`, `nxt` and `up` without ever
    # assigning them; they are taken from the three link_* helpers here.
    previous = link_previous(location)
    nxt = link_next(location)
    up = link_up(location)
    if not (previous or nxt or up):
        return ''
    # RECONSTRUCTED: list markup and the label wording are assumptions.
    labelled = (('Previous', previous), ('Next', nxt), ('Up', up))
    entries = [f'\n<li>{label}: {link}</li>' for label, link in labelled if link]
    return '\n<ul class="navigation-links">' + ''.join(entries) + '\n</ul>\n'


def search_box():
    """Return the search-box markup."""
    return searchbox_string


def list_to_details_recursive(el):
    """
    `el` contains either a single `a` child, or an `a` followed by `ul`
    (or, for the top-level container, a single `ul`).
    In the first case, output as is.
    In the other cases, replace by details/summary.

    Raises ValueError if `el` does not have one of these shapes.
    """
    # checks
    if not (len(el) == 1 or len(el) == 2):
        raise ValueError(f'el must have either 1 or 2 children, found {len(el)}')
    if el[0].tag not in ['a', 'ul']:
        raise ValueError(f'el[0] must be an a or ul tag, but found a {el[0].tag}')
    if len(el) == 2 and not (el[0].tag == 'a' and el[1].tag == 'ul'):
        raise ValueError(f'for len(el) == 2, el[1] must be ul, but found {el[1].tag}')

    # single child, output as is
    if len(el) == 1 and el[0].tag == 'a':
        return tostring(el[0], pretty_print=True, encoding='unicode')

    # build a details/summary
    if len(el) == 1:
        # top-level container: a bare list, no link of its own
        summary_text = 'Table of contents' + search_box()
        ul = el[0]
        details_open = '<details>'
    else:
        summary_text = tostring(el[0], pretty_print=True, encoding='unicode')
        ul = el[1]
        # RECONSTRUCTED: toc_at_location() needs a way to find this element;
        # tagging it with the section id is an assumption about how the
        # original did that.
        section_id = el[0].get('href', '').lstrip('#')
        details_open = f'<details data-section="{section_id}">'

    # RECONSTRUCTED markup; the loop header over `ul` had also been lost.
    pieces = [
        f'\n{details_open}',
        f'\n<summary>\n{summary_text}',
        '\n</summary>',
        '\n<ul>\n',
    ]
    for li in ul:
        pieces.append('\n<li>\n')
        pieces.append(list_to_details_recursive(li))
        pieces.append('\n</li>\n')
    pieces.append('\n</ul>\n</details>\n')
    return ''.join(pieces)


# RECONSTRUCTED: only '\n\n' survived of this expression.
collapsed_toc = (
    '\n<nav class="collapsed-toc">\n'
    + list_to_details_recursive(el_ttoc)
    + '\n</nav>\n'
)


def toc_at_location(collapsed_toc, location):
    """
    Given a collapsed toc and location (filename),
    mark the details hierarchy as open.
    """
    output = collapsed_toc
    if location != 'index':
        # open all ancestors
        prefix = location.rpartition('_')[0]
        while prefix:
            # RECONSTRUCTED: relies on the data-section attribute emitted by
            # list_to_details_recursive().
            output = output.replace(
                f'<details data-section="{prefix}">',
                f'<details data-section="{prefix}" open>',
            )
            prefix = prefix.rpartition('_')[0]
    return output


def write_file_start(_file, location):
    """Write everything that precedes a page's own content to `_file`."""
    # The `def` line of this function was missing from the reviewed copy; the
    # name and parameters are taken from the call in write_files_recursive().
    # RECONSTRUCTED markup throughout.
    _file.write('<!DOCTYPE html>\n<html lang="en">\n')
    _file.write(head_at_location(el_head, location))
    _file.write('<body>\n')
    #_file.write(tostring(el_header, pretty_print=True, encoding='unicode'))
    _file.write('<header>\n')
    _file.write(f'<h1 class="title">{el_title.text}</h1>\n</header>\n')
    _file.write(toc_at_location(collapsed_toc, location))
    _file.write(breadcrumbs(location))
    _file.write(navigation_links(location))


def write_file_end(_file, location):
    """Write everything that follows a page's own content to `_file`."""
    # RECONSTRUCTED markup throughout.
    _file.write('\n<hr>')
    _file.write(navigation_links(location))  # repeat, for convenience
    _file.write('\n<hr>')
    _file.write('\n<footer>\n')
    _file.write(copyright_string)
    _file.write(tostring(el_postamble, pretty_print=True, encoding='unicode'))
    _file.write('\n</footer>\n</body>\n</html>\n')


def _section_page_id(section):
    """Return the TOC page id carried by a direct child of `section`, or None.

    NOTE(review): assumes the heading holding the section's anchor id is a
    direct child of the outline container -- confirm against the export.
    """
    for node in section:
        node_id = node.get('id')
        if node_id is not None and node_id in page_ids:
            return node_id
    return None


def write_files_recursive(name, el, levelmax=5):
    """
    Recursively extract outlines.

    Writes `export/html/<name>.html` for `el`.  If `el` has next-level
    outline sections and `levelmax` has not been reached, the page holds the
    material preceding the first such section followed by a list of links,
    and each section is written to its own file in turn.  Otherwise the whole
    element is written to the page.
    """
    # Test for None before touching attributes (the reviewed copy read
    # el.attrib first, so its None check could never take effect).
    if el is None:
        print(f'Element name {name} has no outline class, no files written.')
        return
    el_class = el.get('class') or ''
    if el.get('id') == 'content':
        level = 1
    elif el_class.startswith('outline-'):
        level = int(el_class.partition('outline-')[2][0])
    else:
        print(f'Element name {name} has no outline class, no files written.')
        return

    next_level_class = 'outline-%s' % str(level + 1)

    def is_next_level(node):
        return (node.get('class') or '').startswith(next_level_class)

    # Count outline children for establishing output format
    children = [node for node in el.iter() if is_next_level(node)]

    # Explicit encoding so the output does not depend on the locale.
    with open(f'{BASEDIR}export/html/{name}.html', 'w', encoding='utf-8') as _file:
        write_file_start(_file, name)

        # if no children or if we're on levelmax, write everything
        if len(children) == 0 or level == levelmax:
            _file.write(tostring(el, pretty_print=True, encoding='unicode'))
        # otherwise write anything above first next-level headline,
        # and then replace the next-level headlines by links to their files
        else:
            if el.text:
                _file.write(el.text)
            for child in el:
                if is_next_level(child):
                    break  # break out once we hit the first next-level headline
                if child.tag != "nav":  # don't write the table-of-contents
                    _file.write(tostring(child, pretty_print=True, encoding='unicode'))
            # now print the list of children
            # RECONSTRUCTED: markup, the loop over `children` and the
            # recursive call were lost; only the label and `{child_text}`
            # survived.
            _file.write('<p>In this section:</p>')
            _file.write('\n<ul>\n')
            for child in children:
                child_id = _section_page_id(child)
                if child_id is None:
                    raise ValueError(
                        f'No table-of-contents id found for a level {level + 1} '
                        f'section under {name}'
                    )
                child_text = page_texts[page_ids.index(child_id)]
                _file.write(
                    f'\n<li><a href="{child_id}.html">{child_text}</a></li>\n'
                )
                write_files_recursive(child_id, child, levelmax)
            _file.write('\n</ul>\n')

        write_file_end(_file, name)


# Rewrite the index.html file:
write_files_recursive('index', el_content)  # the only call needed