#! /usr/bin/env python
# Scan the exported HTML files and index their link targets.
#
# For every ``*.html`` file in ``export/html`` this part of the script records
# the ids of "dedicated links" (``a`` elements carrying an ``id``) and of
# headlines (``h1``..``h6`` elements carrying an ``id``), and prints an error
# for every id that clashes with a filename or is defined more than once
# within the same file.

import os
from lxml.html import parse, etree, tostring

link_svg = """ """

os.chdir('export/html')

# Basenames (without extension) of the exported pages.  The filter must be
# '.html' with the dot: a bare 'html' suffix test would also accept names such
# as 'foo.xhtml', for which rpartition('.html') yields an empty basename.
filenames = [f.rpartition('.html')[0] for f in os.listdir()
             if f.endswith('.html')]

headline_tags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

# Build dictionaries of the dedicated links and headline links (by filename)
dl = {}   # filename -> list of dedicated-link ids
hl = {}   # filename -> list of {'tag', 'id', 'text'} dicts, one per headline
cl = {}
for filename in filenames:
    dl[filename] = []
    hl[filename] = []
    # hl[filename] holds dicts, so testing a bare id against it can never
    # match; track the headline ids seen so far in this file separately.
    seen_headline_ids = set()
    tree = parse(f'{filename}.html')
    for el in tree.iter():
        # find all dedicated links, which are of form 'a id="..."'
        # (they are the only links with an id)
        if el.tag == 'a' and 'id' in el.attrib:
            link_id = el.attrib['id']
            # raise flag if id coincides with a filename:
            if link_id in filenames:
                print("** Error: dedicated link name clashes with "
                      f"headline CUSTOM_ID {link_id} **")
            # raise flag if this key already exists:
            if link_id in dl[filename]:
                print(f"** Error: multiply-defined label {link_id} **")
            else:
                # add this dedicated link to our dictionary
                dl[filename].append(link_id)
        # find the headline links, which are h1..h6 elements with an id
        if el.tag in headline_tags and 'id' in el.attrib:
            headline_id = el.attrib['id']
            if headline_id in seen_headline_ids:
                print(f"** Error: multiply-defined headline {headline_id} **")
            else:
                seen_headline_ids.add(headline_id)
                hl[filename].append({
                    'tag': el.tag,
                    'id': headline_id,
                    'text': el[0].text  # el[0] is the a tag (only child)
                })
        # find all the child section links inside `