#   WikiNote3
#   Copyright © 2020  Lee Yingtong Li (RunasSudo)
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.

import markdown
import markdown.extensions.admonition
import markdown.extensions.md_in_html
import markdown.extensions.footnotes
import markdown.extensions.attr_list

import re
import xml.etree.ElementTree as ET

# Registries mapping a directive/role name to the class implementing it.
# They are populated at the bottom of this module from markup_custom(2).
directives = {}
roles = {}


class WNMarkdown(markdown.Markdown):
    """Markdown converter with the WikiNote-specific processors registered.

    Conversion is split into ``parse`` (source text -> element tree) and
    ``serialise`` (element tree -> output string); ``convert`` chains the two.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.registerExtensions([FootnoteExtension(), 'toc', 'tables', 'md_in_html'], {})
        self.meta = {}

        # Override default Markdown processors
        self.preprocessors.register(NormalizeWhitespace(self), 'normalize_whitespace', 30)
        self.parser.blockprocessors.register(HashHeaderProcessor(self.parser), 'hashheader', 70)
        self.parser.blockprocessors.register(ListIndentProcessor(self.parser), 'indent', 90)
        self.parser.blockprocessors.register(UListProcessor(self.parser), 'ulist', 30)
        self.treeprocessors.register(AttrListTreeprocessor(self), 'attr_list', 8)

        # Our own processors
        self.parser.blockprocessors.register(DirectiveProcessor(self.parser), 'directive', 95)
        self.parser.blockprocessors.register(AdmonitionProcessor(self.parser), 'admonition', 105)
        # NOTE(review): the two inline processors below are constructed with
        # self.parser rather than self, so their ``md`` attribute is the block
        # parser. Left as-is because custom roles may rely on it - confirm.
        self.inlinePatterns.register(BlueProcessor(self.parser), 'blue_em', 65)
        self.inlinePatterns.register(RoleProcessor(self.parser), 'role', 500)
        self.treeprocessors.register(WrapSectionProcessor(self), 'wrap_sections', 100)

    # Override
    def reset(self):
        """Reset converter state, clear ``self.meta`` and return ``self``."""
        super().reset()
        self.meta = {}
        return self

    # Based on Markdown.convert
    def parse(self, source):
        """Run the preprocessors, block parser and tree processors on *source*.

        Returns the root element of the resulting tree, or ``''`` when
        *source* is empty or whitespace-only.
        """
        if not source.strip():
            return ''

        self.lines = source.split('\n')
        for prep in self.preprocessors:
            self.lines = prep.run(self.lines)

        root = self.parser.parseDocument(self.lines).getroot()

        for treeprocessor in self.treeprocessors:
            newRoot = treeprocessor.run(root)
            # A tree processor may return a replacement root, or None to keep it
            if newRoot is not None:
                root = newRoot

        return root

    # Based on Markdown.convert
    def serialise(self, root):
        """Serialise *root* to a string and run the postprocessors over it."""
        output = self.serializer(root)

        if self.stripTopLevelTags:
            try:
                # Skip past the opening '<doc_tag>' (tag name plus '<' and '>')
                start = output.index('<{}>'.format(self.doc_tag)) + len(self.doc_tag) + 2
                end = output.rindex('</{}>'.format(self.doc_tag))
                output = output[start:end].strip()
            except ValueError:
                # No open/close pair found: an empty document serialises to a
                # self-closing tag, which becomes empty output
                if output.strip().endswith('<{} />'.format(self.doc_tag)):
                    output = ''

        for pp in self.postprocessors:
            output = pp.run(output)

        return output.strip()

    # Put it together
    def convert(self, source):
        """Convert *source* to its serialised output (``''`` for empty input)."""
        root = self.parse(source)
        if root == '':
            return ''
        return self.serialise(root)

    def detab(self, text):
        """Strip one indent level from the leading indented lines of *text*.

        A line loses either ``tab_length`` leading spaces or one leading tab;
        blank lines are kept as empty strings. Processing stops at the first
        non-blank line that is not indented.

        Returns a tuple ``(detabbed, rest)`` where *rest* is the unprocessed
        remainder of *text*.
        """
        newtext = []
        lines = text.split('\n')
        for line in lines:
            if line.startswith(' '*self.tab_length):
                newtext.append(line[self.tab_length:])
            elif line.startswith('\t'):
                newtext.append(line[1:])
            elif not line.strip():
                newtext.append('')
            else:
                break
        return '\n'.join(newtext), '\n'.join(lines[len(newtext):])


class HashHeaderProcessor(markdown.blockprocessors.HashHeaderProcessor):
    """Hash-header processor that emits headings one level deeper."""

    # Override to add 1 to level
    def run(self, parent, blocks):
        """Consume the first block and append its heading to *parent*.

        Text before the header line is parsed into *parent* first; text after
        it is pushed back onto *blocks*.
        """
        block = blocks.pop(0)
        m = self.RE.search(block)

        before = block[:m.start()]
        after = block[m.end():]

        if before:
            self.parser.parseBlocks(parent, [before])

        h = ET.SubElement(parent, 'h{}'.format(len(m.group('level')) + 1))  # Add 1 to level
        h.text = m.group('header').strip()

        if after:
            blocks.insert(0, after)


class NormalizeWhitespace(markdown.preprocessors.Preprocessor):
    """Whitespace-normalising preprocessor."""

    # Override to retain tabs
    def run(self, lines):
        """Return *lines* with placeholders removed and newlines normalised."""
        source = '\n'.join(lines)
        # Remove the STX/ETX characters Markdown reserves for placeholders
        source = source.replace(markdown.util.STX, "").replace(markdown.util.ETX, "")
        # Normalise line endings and terminate the document with a blank line
        source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
        # Lines consisting only of spaces become empty lines
        source = re.sub(r'(?<=\n) +\n', '\n', source)
        return source.split('\n')


# Override to inherit markdown attribute
# This is required since Python-Markdown 3.3.3 so we implement this for backwards compatibility
def HTMLExtractorExtra_get_state(self, tag, attrs):
    """Default ``markdown="1"`` on tags lacking the attribute, then delegate.

    The attribute is added unless the enclosing state (top of
    ``self.mdstate``) is ``'off'``. The original ``get_state`` is then called.
    """
    if 'markdown' not in attrs:
        parent_state = self.mdstate[-1] if self.mdstate else None
        if parent_state != 'off':
            attrs['markdown'] = '1'
    return HTMLExtractorExtra_get_state.orig(self, tag, attrs)


# Keep a handle on the original method, then monkeypatch our wrapper in
HTMLExtractorExtra_get_state.orig = markdown.extensions.md_in_html.HTMLExtractorExtra.get_state
markdown.extensions.md_in_html.HTMLExtractorExtra.get_state = HTMLExtractorExtra_get_state


class DirectiveProcessor(markdown.blockprocessors.BlockProcessor):
    """Block processor for ``.. name:: arg`` directives.

    The directive class is looked up by name in the module-level
    ``directives`` registry.
    """

    # NOTE(review): the two leading dots are unescaped and so match any two
    # characters - presumably '\.\.' was intended; confirm before tightening.
    RE = re.compile(r'^.. +(?P<name>[a-zA-Z0-9_-]+?)::(?: +(?P<arg>.*?))?(?:\n|$)')

    def test(self, parent, block):
        """Return True if *block* starts with a directive line."""
        return bool(self.RE.search(block))

    def run(self, parent, blocks):
        """Render the directive at the head of *blocks* and append it to *parent*.

        Raises KeyError if the directive name is not in ``directives``.
        """
        block = blocks.pop(0)
        m = self.RE.search(block)

        # Get directive content
        if '\n' in block:
            content = block[block.index('\n') + 1:]
        else:
            content = ''

        # Following blocks that start with a tab also belong to the directive
        for b in blocks[:]:
            if b.startswith('\t'):
                blocks.pop(0)
                # NOTE(review): blocks are concatenated without a separator - confirm intended
                content += b
            else:
                break

        content, theRest = self.parser.md.detab(content)

        directive = directives[m.group('name')](self.parser.md, arg=m.group('arg') or '', content=content)
        el = directive.render()
        el.directive = directive
        parent.append(el)

        # Anything that was not indented goes back to be parsed normally
        if theRest:
            blocks.insert(0, theRest)


class RoleProcessor(markdown.inlinepatterns.InlineProcessor):
    """Inline processor for ``:name:`content``` roles.

    The role class is looked up by name in the module-level ``roles`` registry.
    """

    def __init__(self, md):
        super().__init__(r':(?P<name>[^:]+?):`(?P<content>[^`]+?)`', md)

    def handleMatch(self, m, data):
        """Render the matched role; raises KeyError for an unknown role name."""
        role = roles[m.group('name')](self.md, m.group('content'))
        el = role.render()
        el.role = role
        return el, m.start(0), m.end(0)


class BlueProcessor(markdown.inlinepatterns.InlineProcessor):
    """Inline processor turning ``!!text!!`` into ``<span class="blue">``."""

    def __init__(self, md):
        super().__init__(r'!!(.+?)!!', md)

    def handleMatch(self, m, data):
        """Return a span element holding the text between the ``!!`` markers."""
        el = ET.Element('span')
        el.text = m.group(1)
        el.set('class', 'blue')
        return el, m.start(0), m.end(0)


class WrapSectionProcessor(markdown.treeprocessors.Treeprocessor):
    """Tree processor wrapping the content between headings in ``<section>``."""

    def run(self, root):
        """Move runs of non-h1/h2 children of *root* into ``<section>`` elements.

        The h1/h2 elements themselves stay direct children of *root*. The tree
        is modified in place and nothing is returned.
        """
        section = ET.Element('section')

        # Iterate over a snapshot, since root is mutated inside the loop
        for child in list(root):
            if child.tag in ('h1', 'h2'):
                if len(section) > 0:
                    # Close the section collected so far, just before this heading
                    root.insert(list(root).index(child), section)
                    section = ET.Element('section')
            else:
                section.append(child)
                root.remove(child)

        # Content after the last heading
        if len(section) > 0:
            root.append(section)


# Adapted from Python-Markdown
# Allow tabs
class AdmonitionProcessor(markdown.extensions.admonition.AdmonitionProcessor):
    """Admonition processor that also accepts tab-indented continuation blocks."""

    def test(self, parent, block):
        """Return a truthy value if *block* starts or continues an admonition."""
        sibling = self.lastChild(parent)
        return self.RE.search(block) or \
            ((block.startswith(' ' * self.tab_length) or block.startswith('\t')) and sibling is not None and
                sibling.get('class', '').find(self.CLASSNAME) != -1)

    def detab(self, text):
        """Delegate to the tab-aware ``detab`` on the Markdown instance."""
        return self.parser.md.detab(text)


# Adapted from Python-Markdown
# Allow tabs
class ListIndentProcessor(markdown.blockprocessors.ListIndentProcessor):
    """List-indent processor that treats one tab as one indent level."""

    def __init__(self, parser):
        super().__init__(parser)
        # Allow tabs
        self.INDENT_RE = re.compile(r'^(([ ]{%s}|\t)+)' % self.tab_length)

    def test(self, parent, block):
        """Return a truthy value if *block* is indented content of a list item."""
        # Allow tabs
        return (block.startswith(' '*self.tab_length) or block.startswith('\t')) and \
            not self.parser.state.isstate('detabbed') and \
            (parent.tag in self.ITEM_TYPES or
                (len(parent) and parent[-1] is not None and
                    (parent[-1].tag in self.LIST_TYPES)))

    def get_level(self, parent, block):
        """Return ``(level, parent)`` for the list nesting of *block*.

        Descends through the last children of *parent* while they are lists or
        list items and the indent of *block* exceeds the level reached.
        """
        m = self.INDENT_RE.match(block)
        if m:
            # Allow tabs
            if m.group(1).startswith('\t'):
                # Each character of the matched indent counts as one level
                indent_level = len(m.group(1))
            else:
                indent_level = len(m.group(1))/self.tab_length
        else:
            indent_level = 0

        if self.parser.state.isstate('list'):
            # We're in a tight-list - so we already are at correct parent
            level = 1
        else:
            # We're in a loose-list - so we need to find parent
            level = 0

        while indent_level > level:
            child = self.lastChild(parent)
            if (child is not None and
                    (child.tag in self.LIST_TYPES or child.tag in self.ITEM_TYPES)):
                # Only list containers count as a nesting level
                if child.tag in self.LIST_TYPES:
                    level += 1
                parent = child
            else:
                # No more child levels; stop here
                break

        return level, parent

    def looseDetab(self, text, level=1):
        """Remove up to *level* space-indents, then one tab, from each line."""
        lines = text.split('\n')
        for i in range(len(lines)):
            if lines[i].startswith(' '*self.tab_length*level):
                lines[i] = lines[i][self.tab_length*level:]
            # Checked independently of the space case, as in the original
            if lines[i].startswith('\t'):
                lines[i] = lines[i][1:]
        return '\n'.join(lines)


class OListProcessor(markdown.blockprocessors.OListProcessor):
    """Ordered-list processor that accepts tab-indented nested items."""

    def __init__(self, parser):
        super().__init__(parser)
        # Allow tabs
        self.INDENT_RE = re.compile(r'^(?:[ ]{%d,%d}|\t)((\d+\.)|[*+-])[ ]+.*' % (self.tab_length, self.tab_length * 2 - 1))

    def run(self, parent, blocks):
        """Parse the list block at the head of *blocks* into *parent*."""
        items = self.get_items(blocks.pop(0))
        sibling = self.lastChild(parent)

        if sibling is not None and sibling.tag in self.SIBLING_TAGS:
            # Previous block was a list: continue it as a loose list
            lst = sibling

            # Wrap the previous item's bare text in a <p>
            if lst[-1].text:
                p = ET.Element('p')
                p.text = lst[-1].text
                lst[-1].text = ''
                lst[-1].insert(0, p)

            # Likewise wrap any tail text of the previous item's last child
            lch = self.lastChild(lst[-1])
            if lch is not None and lch.tail:
                p = ET.SubElement(lst[-1], 'p')
                p.text = lch.tail.lstrip()
                lch.tail = ''

            # Parse the first item of this block in the 'looselist' state
            li = ET.SubElement(lst, 'li')
            self.parser.state.set('looselist')
            firstitem = items.pop(0)
            self.parser.parseBlocks(li, [firstitem])
            self.parser.state.reset()
        elif parent.tag in ['ol', 'ul']:
            # Parent is already a list: append items to it
            lst = parent
        else:
            # Start a new list
            lst = ET.SubElement(parent, self.TAG)
            if not self.LAZY_OL and self.STARTSWITH != '1':
                lst.attrib['start'] = self.STARTSWITH

        self.parser.state.set('list')
        for item in items:
            # Allow tabs
            if item.startswith(' '*self.tab_length) or item.startswith('\t'):
                # Indented item: belongs inside the last <li>
                self.parser.parseBlocks(lst[-1], [item])
            else:
                li = ET.SubElement(lst, 'li')
                self.parser.parseBlocks(li, [item])
        self.parser.state.reset()

    def get_items(self, block):
        """Split *block* into a list of item strings, one per list item."""
        items = []
        for line in block.split('\n'):
            m = self.CHILD_RE.match(line)
            if m:
                # New item; for ordered lists, record the first item's number
                if not items and self.TAG == 'ol':
                    INTEGER_RE = re.compile(r'(\d+)')
                    self.STARTSWITH = INTEGER_RE.match(m.group(1)).group()
                items.append(m.group(3))
            elif self.INDENT_RE.match(line):
                # Allow tabs
                if items[-1].startswith(' '*self.tab_length) or items[-1].startswith('\t'):
                    # Previous entry is already an indented item: extend it
                    items[-1] = '{}\n{}'.format(items[-1], line)
                else:
                    items.append(line)
            else:
                # Continuation line of the previous item
                items[-1] = '{}\n{}'.format(items[-1], line)
        return items


class UListProcessor(OListProcessor):
    """Unordered-list variant of the tab-aware list processor."""

    TAG = 'ul'

    def __init__(self, parser):
        super().__init__(parser)
        self.RE = re.compile(r'^[ ]{0,%d}[*+-][ ]+(.*)' % (self.tab_length - 1))


# Adapted from Python-Markdown
# Fix for tables
class AttrListTreeprocessor(markdown.extensions.attr_list.AttrListTreeprocessor):
    """Attribute-list tree processor with extra handling for tables.

    A table takes its attributes from an attribute list in the first cell of
    its last row, and that row is then removed.
    """

    def run(self, doc):
        """Assign attribute lists found in *doc* to their elements, in place."""
        for elem in doc.iter():
            if self.md.is_block_level(elem.tag):
                # Block level: look for the attribute list in the element's text
                RE = self.BLOCK_RE
                if markdown.extensions.attr_list.isheader(elem) or elem.tag == 'dt':
                    RE = self.HEADER_RE

                if len(elem) and elem.tag == 'li':
                    # List item with children, which may include a nested list
                    pos = None
                    for i, child in enumerate(elem):
                        if child.tag in ['ul', 'ol']:
                            pos = i
                            break

                    if pos is None and elem[-1].tail:
                        # No nested list: use the tail of the last child
                        m = RE.search(elem[-1].tail)
                        if m:
                            self.assign_attrs(elem, m.group(1))
                            elem[-1].tail = elem[-1].tail[:m.start()]
                    elif pos is not None and pos > 0 and elem[pos-1].tail:
                        # Use the tail of the child just before the nested list
                        m = RE.search(elem[pos-1].tail)
                        if m:
                            self.assign_attrs(elem, m.group(1))
                            elem[pos-1].tail = elem[pos-1].tail[:m.start()]
                    elif elem.text:
                        # Fall back to the item's own text
                        m = RE.search(elem.text)
                        if m:
                            self.assign_attrs(elem, m.group(1))
                            elem.text = elem.text[:m.start()]
                elif len(elem) and elem.tag == 'table' and len(elem[-1]) and len(elem[-1][-1]) and elem[-1][-1][0].text:
                    # SPECIAL CASE table, use last row
                    RE = self.INLINE_RE
                    m = RE.search(elem[-1][-1][0].text)  # tbody -> tr -> td
                    if m:
                        self.assign_attrs(elem, m.group(1))
                        # Remove last row
                        elem[-1].remove(elem[-1][-1])  # tbody -> tr
                elif len(elem) and elem[-1].tail:
                    # Has children: use the tail of the last child
                    m = RE.search(elem[-1].tail)
                    if m:
                        self.assign_attrs(elem, m.group(1))
                        elem[-1].tail = elem[-1].tail[:m.start()]
                        if markdown.extensions.attr_list.isheader(elem):
                            # Clean up trailing #s
                            elem[-1].tail = elem[-1].tail.rstrip('#').rstrip()
                elif elem.text:
                    # No children: use the element's text
                    m = RE.search(elem.text)
                    if not m and elem.tag == 'td':
                        m = re.search(self.BASE_RE, elem.text)
                    if m:
                        self.assign_attrs(elem, m.group(1))
                        elem.text = elem.text[:m.start()]
                        if markdown.extensions.attr_list.isheader(elem):
                            # Clean up trailing #s
                            elem.text = elem.text.rstrip('#').rstrip()
            else:
                # Inline: look for the attribute list at the start of the tail
                if elem.tail:
                    m = self.INLINE_RE.match(elem.tail)
                    if m:
                        self.assign_attrs(elem, m.group(1))
                        elem.tail = elem.tail[m.end():]


# Footnotes
class FootnoteExtension(markdown.extensions.footnotes.FootnoteExtension):
    """Footnote extension with custom inline references and footnote list."""

    # Override
    def extendMarkdown(self, md):
        """Register the footnote block, inline and tree processors on *md*."""
        md.registerExtension(self)
        self.parser = md.parser
        self.md = md

        md.parser.blockprocessors.register(markdown.extensions.footnotes.FootnoteBlockProcessor(self), 'footnote', 17)

        FOOTNOTE_RE = r'\[\^([^\]]*)\]'  # blah blah [^1] blah
        md.inlinePatterns.register(FootnoteInlineProcessor(FOOTNOTE_RE, self), 'footnote', 175)

        md.treeprocessors.register(markdown.extensions.footnotes.FootnoteTreeprocessor(self), 'footnote', 50)

    # Override to omit backlinks and reformat
    def makeFootnotesDiv(self, root):
        """Build the ``<div class="footnote">`` holding all footnotes.

        Returns None when there are no footnotes.
        """
        if not self.footnotes:
            return None

        div = ET.Element("div")
        div.set('class', 'footnote')
        ol = ET.SubElement(div, "ol")

        # Scratch element that receives each footnote's parsed content
        surrogate_parent = ET.Element("div")

        for fn_id in self.footnotes:
            li = ET.SubElement(ol, "li")
            li.set("id", self.makeFootnoteId(fn_id))

            fn_src = self.footnotes[fn_id]
            # NOTE(review): these replacement templates look like they may have
            # lost markup; as written the first prefixes each URL with an
            # escaped '[URL]' and the second is an identity substitution.
            fn_src = re.sub(r'(https?://[^ ]+)', r'\[URL\]\1', fn_src)
            fn_src = re.sub(r'doi: ([^ ]+)', r'doi: \1', fn_src)

            self.parser.parseChunk(surrogate_parent, fn_src)
            # Move the parsed elements out of the scratch parent into the <li>
            for el in list(surrogate_parent):
                li.append(el)
                surrogate_parent.remove(el)

        return div


class FootnoteInlineProcessor(markdown.extensions.footnotes.FootnoteInlineProcessor):
    """Inline footnote references of the form ``[^id]``, ``[^id; extra]`` or ``[^id,]``."""

    # Override to handle commas and semicolons
    def handleMatch(self, m, data):
        """Return the ``<sup>`` reference for a known footnote id.

        Text after a ``;`` is shown after the footnote number, and a trailing
        ``,`` in the marker is rendered after that. Returns
        ``(None, None, None)`` when the id is not a defined footnote.
        """
        marker = m.group(1)
        fn_id = marker.rstrip(',').split(';')[0]

        if fn_id in self.footnotes.footnotes.keys():
            sup = ET.Element("sup")
            sup.set('class', 'footnote-ref')
            a = ET.SubElement(sup, "a")
            sup.set('id', self.footnotes.makeFootnoteRefId(fn_id, found=True))
            a.set('href', '#' + self.footnotes.makeFootnoteId(fn_id))
            # Displayed number is the 1-based position of the footnote definition
            a.text = str(list(self.footnotes.footnotes.keys()).index(fn_id) + 1)

            a.tail = ''
            if ';' in marker:
                a.tail += ' ' + marker.rstrip(',').split(';')[1]
            if marker.endswith(','):
                # Append, so that text after ';' is not discarded
                a.tail += ','

            return sup, m.start(0), m.end(0)
        else:
            return None, None, None


# Custom directives and roles
from . import markup_custom
directives.update(markup_custom.directives)
roles.update(markup_custom.roles)

# markup_custom2 is optional
try:
    from . import markup_custom2
    directives.update(markup_custom2.directives)
    roles.update(markup_custom2.roles)
except ImportError:
    pass