""" Table of Contents Extension for Python-Markdown =============================================== See for documentation. Original code Copyright 2008 [Jack Miller](https://codezen.org/) All changes Copyright 2008-2014 The Python Markdown Project License: [BSD](https://opensource.org/licenses/bsd-license.php) """ from . import Extension from ..treeprocessors import Treeprocessor from ..util import code_escape, parseBoolValue, AMP_SUBSTITUTE, HTML_PLACEHOLDER_RE, AtomicString from ..treeprocessors import UnescapeTreeprocessor import re import html import unicodedata import xml.etree.ElementTree as etree def slugify(value, separator, unicode=False): """ Slugify a string, to make it URL friendly. """ if not unicode: # Replace Extended Latin characters with ASCII, i.e. `žlutý` => `zluty` value = unicodedata.normalize('NFKD', value) value = value.encode('ascii', 'ignore').decode('ascii') value = re.sub(r'[^\w\s-]', '', value).strip().lower() return re.sub(r'[{}\s]+'.format(separator), separator, value) def slugify_unicode(value, separator): """ Slugify a string, to make it URL friendly while preserving Unicode characters. """ return slugify(value, separator, unicode=True) IDCOUNT_RE = re.compile(r'^(.*)_([0-9]+)$') def unique(id, ids): """ Ensure id is unique in set of ids. Append '_1', '_2'... if not """ while id in ids or not id: m = IDCOUNT_RE.match(id) if m: id = '%s_%d' % (m.group(1), int(m.group(2))+1) else: id = '%s_%d' % (id, 1) ids.add(id) return id def get_name(el): """Get title name.""" text = [] for c in el.itertext(): if isinstance(c, AtomicString): text.append(html.unescape(c)) else: text.append(c) return ''.join(text).strip() def stashedHTML2text(text, md, strip_entities=True): """ Extract raw HTML from stash, reduce to plain text and swap with placeholder. """ def _html_sub(m): """ Substitute raw html with plain text. """ try: raw = md.htmlStash.rawHtmlBlocks[int(m.group(1))] except (IndexError, TypeError): # pragma: no cover return m.group(0) # Strip out tags and/or entities - leaving text res = re.sub(r'(<[^>]+>)', '', raw) if strip_entities: res = re.sub(r'(&[\#a-zA-Z0-9]+;)', '', res) return res return HTML_PLACEHOLDER_RE.sub(_html_sub, text) def unescape(text): """ Unescape escaped text. """ c = UnescapeTreeprocessor() return c.unescape(text) def nest_toc_tokens(toc_list): """Given an unsorted list with errors and skips, return a nested one. [{'level': 1}, {'level': 2}] => [{'level': 1, 'children': [{'level': 2, 'children': []}]}] A wrong list is also converted: [{'level': 2}, {'level': 1}] => [{'level': 2, 'children': []}, {'level': 1, 'children': []}] """ ordered_list = [] if len(toc_list): # Initialize everything by processing the first entry last = toc_list.pop(0) last['children'] = [] levels = [last['level']] ordered_list.append(last) parents = [] # Walk the rest nesting the entries properly while toc_list: t = toc_list.pop(0) current_level = t['level'] t['children'] = [] # Reduce depth if current level < last item's level if current_level < levels[-1]: # Pop last level since we know we are less than it levels.pop() # Pop parents and levels we are less than or equal to to_pop = 0 for p in reversed(parents): if current_level <= p['level']: to_pop += 1 else: # pragma: no cover break if to_pop: levels = levels[:-to_pop] parents = parents[:-to_pop] # Note current level as last levels.append(current_level) # Level is the same, so append to # the current parent (if available) if current_level == levels[-1]: (parents[-1]['children'] if parents else ordered_list).append(t) # Current level is > last item's level, # So make last item a parent and append current as child else: last['children'].append(t) parents.append(last) levels.append(current_level) last = t return ordered_list class TocTreeprocessor(Treeprocessor): def __init__(self, md, config): super().__init__(md) self.marker = config["marker"] self.title = config["title"] self.base_level = int(config["baselevel"]) - 1 self.slugify = config["slugify"] self.sep = config["separator"] self.toc_class = config["toc_class"] self.use_anchors = parseBoolValue(config["anchorlink"]) self.anchorlink_class = config["anchorlink_class"] self.use_permalinks = parseBoolValue(config["permalink"], False) if self.use_permalinks is None: self.use_permalinks = config["permalink"] self.permalink_class = config["permalink_class"] self.permalink_title = config["permalink_title"] self.header_rgx = re.compile("[Hh][123456]") if isinstance(config["toc_depth"], str) and '-' in config["toc_depth"]: self.toc_top, self.toc_bottom = [int(x) for x in config["toc_depth"].split('-')] else: self.toc_top = 1 self.toc_bottom = int(config["toc_depth"]) def iterparent(self, node): ''' Iterator wrapper to get allowed parent and child all at once. ''' # We do not allow the marker inside a header as that # would causes an endless loop of placing a new TOC # inside previously generated TOC. for child in node: if not self.header_rgx.match(child.tag) and child.tag not in ['pre', 'code']: yield node, child yield from self.iterparent(child) def replace_marker(self, root, elem): ''' Replace marker with elem. ''' for (p, c) in self.iterparent(root): text = ''.join(c.itertext()).strip() if not text: continue # To keep the output from screwing up the # validation by putting a `
` inside of a `

` # we actually replace the `

` in its entirety. # The `

` element may contain more than a single text content # (`nl2br` can introduce a `
`). In this situation, `c.text` returns # the very first content, ignore children contents or tail content. # `len(c) == 0` is here to ensure there is only text in the `

`. if c.text and c.text.strip() == self.marker and len(c) == 0: for i in range(len(p)): if p[i] == c: p[i] = elem break def set_level(self, elem): ''' Adjust header level according to base level. ''' level = int(elem.tag[-1]) + self.base_level if level > 6: level = 6 elem.tag = 'h%d' % level def add_anchor(self, c, elem_id): anchor = etree.Element("a") anchor.text = c.text anchor.attrib["href"] = "#" + elem_id anchor.attrib["class"] = self.anchorlink_class c.text = "" for elem in c: anchor.append(elem) while len(c): c.remove(c[0]) c.append(anchor) def add_permalink(self, c, elem_id): permalink = etree.Element("a") permalink.text = ("%spara;" % AMP_SUBSTITUTE if self.use_permalinks is True else self.use_permalinks) permalink.attrib["href"] = "#" + elem_id permalink.attrib["class"] = self.permalink_class if self.permalink_title: permalink.attrib["title"] = self.permalink_title c.append(permalink) def build_toc_div(self, toc_list): """ Return a string div given a toc list. """ div = etree.Element("div") div.attrib["class"] = self.toc_class # Add title to the div if self.title: header = etree.SubElement(div, "span") header.attrib["class"] = "toctitle" header.text = self.title def build_etree_ul(toc_list, parent): ul = etree.SubElement(parent, "ul") for item in toc_list: # List item link, to be inserted into the toc div li = etree.SubElement(ul, "li") link = etree.SubElement(li, "a") link.text = item.get('name', '') link.attrib["href"] = '#' + item.get('id', '') if item['children']: build_etree_ul(item['children'], li) return ul build_etree_ul(toc_list, div) if 'prettify' in self.md.treeprocessors: self.md.treeprocessors['prettify'].run(div) return div def run(self, doc): # Get a list of id attributes used_ids = set() for el in doc.iter(): if "id" in el.attrib: used_ids.add(el.attrib["id"]) toc_tokens = [] for el in doc.iter(): if isinstance(el.tag, str) and self.header_rgx.match(el.tag): self.set_level(el) text = get_name(el) # Do not override pre-existing ids if "id" not in el.attrib: innertext = unescape(stashedHTML2text(text, self.md)) el.attrib["id"] = unique(self.slugify(innertext, self.sep), used_ids) if int(el.tag[-1]) >= self.toc_top and int(el.tag[-1]) <= self.toc_bottom: toc_tokens.append({ 'level': int(el.tag[-1]), 'id': el.attrib["id"], 'name': unescape(stashedHTML2text( code_escape(el.attrib.get('data-toc-label', text)), self.md, strip_entities=False )) }) # Remove the data-toc-label attribute as it is no longer needed if 'data-toc-label' in el.attrib: del el.attrib['data-toc-label'] if self.use_anchors: self.add_anchor(el, el.attrib["id"]) if self.use_permalinks not in [False, None]: self.add_permalink(el, el.attrib["id"]) toc_tokens = nest_toc_tokens(toc_tokens) div = self.build_toc_div(toc_tokens) if self.marker: self.replace_marker(doc, div) # serialize and attach to markdown instance. toc = self.md.serializer(div) for pp in self.md.postprocessors: toc = pp.run(toc) self.md.toc_tokens = toc_tokens self.md.toc = toc class TocExtension(Extension): TreeProcessorClass = TocTreeprocessor def __init__(self, **kwargs): self.config = { "marker": ['[TOC]', 'Text to find and replace with Table of Contents - ' 'Set to an empty string to disable. Defaults to "[TOC]"'], "title": ["", "Title to insert into TOC

- " "Defaults to an empty string"], "toc_class": ['toc', 'CSS class(es) used for the link. ' 'Defaults to "toclink"'], "anchorlink": [False, "True if header should be a self link - " "Defaults to False"], "anchorlink_class": ['toclink', 'CSS class(es) used for the link. ' 'Defaults to "toclink"'], "permalink": [0, "True or link text if a Sphinx-style permalink should " "be added - Defaults to False"], "permalink_class": ['headerlink', 'CSS class(es) used for the link. ' 'Defaults to "headerlink"'], "permalink_title": ["Permanent link", "Title attribute of the permalink - " "Defaults to 'Permanent link'"], "baselevel": ['1', 'Base level for headers.'], "slugify": [slugify, "Function to generate anchors based on header text - " "Defaults to the headerid ext's slugify function."], 'separator': ['-', 'Word separator. Defaults to "-".'], "toc_depth": [6, 'Define the range of section levels to include in' 'the Table of Contents. A single integer (b) defines' 'the bottom section level (

..) only.' 'A string consisting of two digits separated by a hyphen' 'in between ("2-5"), define the top (t) and the' 'bottom (b) (..). Defaults to `6` (bottom).'], } super().__init__(**kwargs) def extendMarkdown(self, md): md.registerExtension(self) self.md = md self.reset() tocext = self.TreeProcessorClass(md, self.getConfigs()) md.treeprocessors.register(tocext, 'toc', 5) def reset(self): self.md.toc = '' self.md.toc_tokens = [] def makeExtension(**kwargs): # pragma: no cover return TocExtension(**kwargs)