This repository has been archived on 2024-11-09. You can view files and clone it, but cannot push or open issues or pull requests.
WikiNote3/wikinote/markup.py

457 lines
15 KiB
Python
Raw Normal View History

2020-06-20 16:25:02 +10:00
# WikiNote3
# Copyright © 2020 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import markdown
import markdown.extensions.admonition, markdown.extensions.extra, markdown.extensions.footnotes, markdown.extensions.attr_list
2020-06-20 16:25:02 +10:00
import re
import xml.etree.ElementTree as ET
# Registry of directive factories, keyed by the name written in a
# ``.. name::`` line. DirectiveProcessor looks the name up here and calls the
# factory; the registry is filled in at the bottom of this module.
directives = {}
# Registry of role factories, keyed by the name written in a
# ``:name:`content``` span. RoleProcessor looks the name up here and calls the
# factory; the registry is filled in at the bottom of this module.
roles = {}
class WNMarkdown(markdown.Markdown):
    """Markdown converter wired up with WikiNote's processors.

    On construction it registers the footnote, ``toc`` and ``tables``
    extensions, enables Markdown-in-HTML, replaces several stock
    Python-Markdown processors with the tab-aware variants defined in this
    module, and adds the WikiNote-specific directive/role/section processors.

    Conversion is split into :meth:`parse` (source text -> element tree) and
    :meth:`serialise` (element tree -> HTML string) so callers can work on the
    tree in between; :meth:`convert` simply chains the two.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.registerExtensions([FootnoteExtension(), 'toc', 'tables'], {})
        self.meta = {}

        block_registry = self.parser.blockprocessors

        # Markdown in HTML
        self.preprocessors['html_block'].markdown_in_raw = True
        block_registry.register(
            markdown.extensions.extra.MarkdownInHtmlProcessor(self.parser),
            'markdown_block',
            105,
        )
        block_registry.tag_counter = -1
        block_registry.contain_span_tags = re.compile(
            r'^(p|h[1-6]|li|dd|dt|td|th|legend|address)$', re.IGNORECASE
        )

        # Override default Markdown processors
        # NOTE(review): the OListProcessor subclass defined in this module is
        # only used as the base of UListProcessor; nothing here registers it
        # as 'olist'. Confirm that ordered lists are meant to keep the stock
        # processor.
        self.preprocessors.register(NormalizeWhitespace(self), 'normalize_whitespace', 30)
        block_registry.register(HashHeaderProcessor(self.parser), 'hashheader', 70)
        block_registry.register(ListIndentProcessor(self.parser), 'indent', 90)
        block_registry.register(UListProcessor(self.parser), 'ulist', 30)
        self.treeprocessors.register(AttrListTreeprocessor(self), 'attr_list', 8)

        # Our own processors
        block_registry.register(DirectiveProcessor(self.parser), 'directive', 95)
        block_registry.register(AdmonitionProcessor(self.parser), 'admonition', 105)
        # NOTE(review): both inline processors take an ``md`` argument but are
        # handed the block parser, so RoleProcessor passes the parser (not this
        # Markdown instance) on to role factories. Confirm that is intended.
        self.inlinePatterns.register(BlueProcessor(self.parser), 'blue_em', 65)
        self.inlinePatterns.register(RoleProcessor(self.parser), 'role', 500)
        self.treeprocessors.register(WrapSectionProcessor(self), 'wrap_sections', 100)

    # Override
    def reset(self):
        """Reset converter state, clear ``self.meta`` and return ``self``."""
        super().reset()
        self.meta = {}
        return self

    # Based on Markdown.convert
    def parse(self, source):
        """Run preprocessors, the block parser and tree processors on *source*.

        Returns the root element of the parsed tree, or the empty string when
        *source* contains nothing but whitespace.
        """
        if not source.strip():
            return ''

        self.lines = source.split('\n')
        for preprocessor in self.preprocessors:
            self.lines = preprocessor.run(self.lines)

        tree_root = self.parser.parseDocument(self.lines).getroot()

        # A tree processor may return a replacement root, or None to keep
        # the current one.
        for processor in self.treeprocessors:
            replacement = processor.run(tree_root)
            if replacement is not None:
                tree_root = replacement

        return tree_root

    # Based on Markdown.convert
    def serialise(self, root):
        """Serialise the element tree *root* to HTML and run postprocessors."""
        html = self.serializer(root)

        if self.stripTopLevelTags:
            open_tag = '<{}>'.format(self.doc_tag)
            close_tag = '</{}>'.format(self.doc_tag)
            try:
                start = html.index(open_tag) + len(open_tag)
                end = html.rindex(close_tag)
            except ValueError:
                # No wrapper pair found: an empty document serialises to a
                # self-closing wrapper, which becomes the empty string.
                if html.strip().endswith('<{} />'.format(self.doc_tag)):
                    html = ''
            else:
                html = html[start:end].strip()

        for postprocessor in self.postprocessors:
            html = postprocessor.run(html)

        return html.strip()

    # Put it together
    def convert(self, source):
        """Convert Markdown *source* to an HTML string."""
        root = self.parse(source)
        return '' if root == '' else self.serialise(root)

    def detab(self, text):
        """Remove one level of indentation (``tab_length`` spaces or one tab).

        Lines are consumed from the top for as long as they are indented or
        blank. Returns a ``(detabbed, remainder)`` pair of strings, where
        *remainder* starts at the first non-blank, non-indented line.
        """
        all_lines = text.split('\n')
        space_indent = ' ' * self.tab_length
        stripped = []

        for current in all_lines:
            if current.startswith(space_indent):
                stripped.append(current[self.tab_length:])
            elif current.startswith('\t'):
                stripped.append(current[1:])
            elif current.strip():
                # First real, unindented line ends the indented region
                break
            else:
                stripped.append('')

        return '\n'.join(stripped), '\n'.join(all_lines[len(stripped):])
class HashHeaderProcessor(markdown.blockprocessors.HashHeaderProcessor):
    """Hash-header processor that renders every header one level deeper.

    ``# Title`` becomes ``<h2>``, ``## Title`` becomes ``<h3>`` and so on.
    The shifted level is capped at ``MAX_LEVEL`` because HTML defines no
    heading element beyond ``<h6>``; without the cap a six-hash header would
    be emitted as a non-existent ``<h7>`` element.
    """

    # Deepest heading element that exists in HTML
    MAX_LEVEL = 6

    # Override to add 1 to level
    def run(self, parent, blocks):
        """Consume the first block and append its header element to *parent*.

        Text in the block before the header line is parsed into *parent*
        first; text after the header line is pushed back onto *blocks* for
        the parser to handle next. The inherited ``test`` is relied upon to
        guarantee that the block contains a header.
        """
        block = blocks.pop(0)
        m = self.RE.search(block)

        before = block[:m.start()]
        after = block[m.end():]

        if before:
            self.parser.parseBlocks(parent, [before])

        # Add 1 to level, but never go past <h6>
        level = min(len(m.group('level')) + 1, self.MAX_LEVEL)
        h = ET.SubElement(parent, 'h{}'.format(level))
        h.text = m.group('header').strip()

        if after:
            blocks.insert(0, after)
class NormalizeWhitespace(markdown.preprocessors.Preprocessor):
    """Whitespace normaliser that leaves tab characters in place."""

    # Override to retain tabs
    def run(self, lines):
        """Return *lines* cleaned up and re-split on newlines.

        Removes the STX/ETX placeholder characters, converts ``\\r\\n`` and
        ``\\r`` line endings to ``\\n``, appends two trailing newlines and
        blanks out lines that consist solely of spaces.
        """
        text = '\n'.join(lines)
        for placeholder in (markdown.util.STX, markdown.util.ETX):
            text = text.replace(placeholder, "")
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        text += "\n\n"
        text = re.sub(r'(?<=\n) +\n', '\n', text)
        return text.split('\n')
class DirectiveProcessor(markdown.blockprocessors.BlockProcessor):
    """Block processor for reStructuredText-style directives.

    A directive starts with a line of the form ``.. name:: optional argument``
    and owns the indented content that follows it. The name is looked up in
    the module-level ``directives`` registry; the registered factory is called
    with the Markdown instance, the argument and the detabbed content, and the
    element returned by its ``render()`` is appended to the tree.
    """

    # The two leading dots are escaped so that only a literal ``..`` opens a
    # directive (unescaped, any two characters would match).
    RE = re.compile(r'^\.\. +(?P<name>[a-zA-Z0-9_-]+?)::(?: +(?P<arg>.*?))?(?:\n|$)')

    def test(self, parent, block):
        """Return True when *block* begins with a directive line."""
        return bool(self.RE.search(block))

    def run(self, parent, blocks):
        """Render the directive at the head of *blocks* into *parent*.

        Raises:
            KeyError: if the directive name is not in ``directives``.
        """
        block = blocks.pop(0)
        m = self.RE.search(block)

        # Get directive content: everything after the directive line...
        _, _, first_part = block.partition('\n')
        parts = [first_part] if first_part else []

        # ...plus any immediately following tab-indented blocks
        while blocks and blocks[0].startswith('\t'):
            parts.append(blocks.pop(0))

        # Blocks arrive already split on blank lines (Python-Markdown's
        # parseChunk splits on '\n\n'), so the separator must be restored when
        # stitching them together; otherwise the last line of one paragraph is
        # fused with the first line of the next.
        content = '\n\n'.join(parts)

        content, theRest = self.parser.md.detab(content)

        directive = directives[m.group('name')](self.parser.md, arg=m.group('arg') or '', content=content)
        el = directive.render()
        el.directive = directive
        parent.append(el)

        # Whatever was not indented does not belong to the directive
        if theRest:
            blocks.insert(0, theRest)
class RoleProcessor(markdown.inlinepatterns.InlineProcessor):
    """Inline processor for reStructuredText-style roles: ``:name:`content```.

    The name is looked up in the module-level ``roles`` registry; the
    registered factory is called with ``self.md`` and the content, and the
    element returned by its ``render()`` replaces the matched text.
    """

    def __init__(self, md):
        super().__init__(r':(?P<name>[^:]+?):`(?P<content>[^`]+?)`', md)

    def handleMatch(self, m, data):
        """Return ``(element, start, end)`` for a known role.

        The name group accepts any run of non-colon characters, so ordinary
        prose such as ``foo: bar:`code``` also matches the pattern. When the
        name is not a registered role the match is rejected by returning
        ``(None, None, None)`` -- the same convention FootnoteInlineProcessor
        uses for unknown footnotes -- instead of raising KeyError and aborting
        the whole conversion.
        """
        role_factory = roles.get(m.group('name'))
        if role_factory is None:
            return None, None, None

        role = role_factory(self.md, m.group('content'))
        el = role.render()
        el.role = role
        return el, m.start(0), m.end(0)
class BlueProcessor(markdown.inlinepatterns.InlineProcessor):
    """Inline processor turning ``!!text!!`` into ``<span class="blue">``."""

    def __init__(self, md):
        super().__init__(r'!!(.+?)!!', md)

    def handleMatch(self, m, data):
        """Return the span element and the extent of the text it replaces."""
        span = ET.Element('span')
        span.text = m.group(1)
        span.set('class', 'blue')
        start, end = m.span(0)
        return span, start, end
class WrapSectionProcessor(markdown.treeprocessors.Treeprocessor):
    """Tree processor that groups body content into ``<section>`` elements."""

    def run(self, root):
        """Wrap each run of non-heading children of *root* in a ``<section>``.

        ``h1`` and ``h2`` children stay direct children of *root* and act as
        boundaries: the children between two boundaries (or before the first /
        after the last one) are moved, in order, into a new ``<section>`` that
        takes their place. Empty sections are never emitted. The tree is
        modified in place and nothing is returned.
        """
        regrouped = []
        pending = ET.Element('section')

        for node in list(root):
            if node.tag not in ('h1', 'h2'):
                pending.append(node)
                continue

            # Heading: flush what has been collected, then keep the heading
            if len(pending) > 0:
                regrouped.append(pending)
                pending = ET.Element('section')
            regrouped.append(node)

        if len(pending) > 0:
            regrouped.append(pending)

        # Swap the children in one go; root's own text/attributes are untouched
        root[:] = regrouped
# Adapted from Python-Markdown
# Allow tabs
class AdmonitionProcessor(markdown.extensions.admonition.AdmonitionProcessor):
    """Admonition processor that accepts tab indentation as well as spaces."""

    def test(self, parent, block):
        """Return a truthy value when *block* opens or continues an admonition.

        A block opens an admonition when the inherited ``RE`` matches it. It
        continues one when it is indented (``tab_length`` spaces or a tab) and
        the last child of *parent* carries the admonition class name.
        """
        previous = self.lastChild(parent)
        is_indented = block.startswith((' ' * self.tab_length, '\t'))
        continues_admonition = (
            is_indented
            and previous is not None
            and self.CLASSNAME in previous.get('class', '')
        )
        return self.RE.search(block) or continues_admonition

    def detab(self, text):
        """Delegate to the tab-aware ``detab`` of the Markdown instance."""
        return self.parser.md.detab(text)
# Adapted from Python-Markdown
# Allow tabs
class ListIndentProcessor(markdown.blockprocessors.ListIndentProcessor):
    """List-indent processor that treats a tab like ``tab_length`` spaces.

    One *indent unit* is either ``tab_length`` spaces or a single tab, and the
    two may be mixed on the same line.
    """

    def __init__(self, parser):
        super().__init__(parser)
        # Allow tabs
        self.INDENT_RE = re.compile(r'^(([ ]{%s}|\t)+)' % self.tab_length)

    def test(self, parent, block):
        """Return a truthy value when *block* is indented list content.

        That is: the block starts with an indent unit, the parser is not
        already in the 'detabbed' state, and *parent* is either a list item or
        has a list as its last child.
        """
        # Allow tabs
        return (
            block.startswith((' ' * self.tab_length, '\t'))
            and not self.parser.state.isstate('detabbed')
            and (
                parent.tag in self.ITEM_TYPES
                or (
                    len(parent)
                    and parent[-1] is not None
                    and parent[-1].tag in self.LIST_TYPES
                )
            )
        )

    def get_level(self, parent, block):
        """Return ``(level, parent)`` for the list nesting that *block* targets.

        The indent depth of *block* is measured in indent units, then the tree
        is descended through the last list / list item children of *parent*
        until that depth is reached or no deeper list exists.
        """
        m = self.INDENT_RE.match(block)
        if m:
            # Allow tabs: count units rather than characters so that mixed
            # indentation (e.g. a tab followed by four spaces) is measured
            # correctly. INDENT_RE only matches spaces in whole groups of
            # tab_length, so the integer division is exact.
            indent = m.group(1)
            indent_level = indent.count('\t') + indent.count(' ') // self.tab_length
        else:
            indent_level = 0

        if self.parser.state.isstate('list'):
            level = 1
        else:
            level = 0

        while indent_level > level:
            child = self.lastChild(parent)
            if (child is not None and (child.tag in self.LIST_TYPES or child.tag in self.ITEM_TYPES)):
                if child.tag in self.LIST_TYPES:
                    level += 1
                parent = child
            else:
                break

        return level, parent

    def looseDetab(self, text, level=1):
        """Strip *level* indent units from every line that has that many.

        Lines indented by fewer than *level* units are left untouched, as are
        all lines when *level* is 0. Tabs honour *level* just like spaces do,
        and a line never loses more than *level* units.
        """
        indent_re = re.compile(r'^(?:[ ]{%d}|\t){%d}' % (self.tab_length, level))
        return '\n'.join(
            indent_re.sub('', line, count=1) for line in text.split('\n')
        )
class OListProcessor(markdown.blockprocessors.OListProcessor):
    """List processor that recognises tab-indented nested items.

    Serves as the base class of UListProcessor in this module.
    """

    def __init__(self, parser):
        super().__init__(parser)
        # Allow tabs
        self.INDENT_RE = re.compile(r'^(?:[ ]{%d,%d}|\t)((\d+\.)|[*+-])[ ]+.*' % (self.tab_length, self.tab_length * 2 - 1))

    def _is_indented(self, text):
        """Return True if *text* starts with ``tab_length`` spaces or a tab."""
        return text.startswith((' ' * self.tab_length, '\t'))

    def run(self, parent, blocks):
        """Turn the first block into list items attached to the right list.

        The items go into the preceding sibling list if there is one (which
        makes that list "loose"), into *parent* if it is itself a list, or
        into a newly created list otherwise.
        """
        items = self.get_items(blocks.pop(0))
        previous = self.lastChild(parent)

        if previous is not None and previous.tag in self.SIBLING_TAGS:
            # Continue the previous list
            target = previous
            last_item = target[-1]

            # Bare text of the previous item moves into a leading <p>
            if last_item.text:
                wrapper = ET.Element('p')
                wrapper.text = last_item.text
                last_item.text = ''
                last_item.insert(0, wrapper)

            # Text trailing the previous item's last child gets its own <p>
            trailing = self.lastChild(last_item)
            if trailing is not None and trailing.tail:
                tail_para = ET.SubElement(last_item, 'p')
                tail_para.text = trailing.tail.lstrip()
                trailing.tail = ''

            # The first new item is parsed in the 'looselist' state
            new_item = ET.SubElement(target, 'li')
            self.parser.state.set('looselist')
            self.parser.parseBlocks(new_item, [items.pop(0)])
            self.parser.state.reset()
        elif parent.tag in ['ol', 'ul']:
            # Already inside a list (nested parse)
            target = parent
        else:
            # Start a new list
            target = ET.SubElement(parent, self.TAG)
            if not self.LAZY_OL and self.STARTSWITH != '1':
                target.attrib['start'] = self.STARTSWITH

        self.parser.state.set('list')
        for item in items:
            # Allow tabs: indented items belong inside the latest <li>
            if self._is_indented(item):
                container = target[-1]
            else:
                container = ET.SubElement(target, 'li')
            self.parser.parseBlocks(container, [item])
        self.parser.state.reset()

    def get_items(self, block):
        """Split *block* into a list of item strings.

        A line matching ``CHILD_RE`` starts a new item (marker removed). A
        line matching ``INDENT_RE`` starts a new, still-indented item unless
        the current item is itself indented. Every other line is appended to
        the current item. For ordered lists the number of the first item is
        recorded in ``STARTSWITH``.
        """
        items = []
        for line in block.split('\n'):
            marker = self.CHILD_RE.match(line)
            if marker:
                if not items and self.TAG == 'ol':
                    self.STARTSWITH = re.match(r'(\d+)', marker.group(1)).group()
                items.append(marker.group(3))
                continue

            # Allow tabs
            if self.INDENT_RE.match(line) and not self._is_indented(items[-1]):
                items.append(line)
            else:
                items[-1] = '\n'.join((items[-1], line))
        return items
class UListProcessor(OListProcessor):
    """Unordered-list counterpart of this module's OListProcessor."""

    TAG = 'ul'

    def __init__(self, parser):
        super().__init__(parser)
        # A bullet may be preceded by at most tab_length - 1 spaces
        max_lead = self.tab_length - 1
        self.RE = re.compile(r'^[ ]{0,%d}[*+-][ ]+(.*)' % max_lead)
2020-06-20 16:25:02 +10:00
# Adapted from Python-Markdown
# Fix for tables
class AttrListTreeprocessor(markdown.extensions.attr_list.AttrListTreeprocessor):
    """attr_list tree processor with extra handling for tables and cells.

    Walks every element of the document. Where the relevant text (or tail)
    matches one of the inherited patterns (``BLOCK_RE``, ``HEADER_RE``,
    ``INLINE_RE`` or ``BASE_RE``), the captured group is handed to
    ``assign_attrs`` for that element and the matched text is removed.
    """

    def run(self, doc):
        """Apply attribute lists found in *doc*, modifying the tree in place."""
        for elem in doc.iter():
            if self.md.is_block_level(elem.tag):
                # Block level: check for attrs on last line of text
                RE = self.BLOCK_RE
                if markdown.extensions.attr_list.isheader(elem) or elem.tag == 'dt':
                    # header or def-term: check for attrs at end of line
                    RE = self.HEADER_RE
                if len(elem) and elem.tag == 'li':
                    # List item with children: look for the first nested list
                    pos = None
                    for i, child in enumerate(elem):
                        if child.tag in ['ul', 'ol']:
                            pos = i
                            break
                    if pos is None and elem[-1].tail:
                        # No nested list: use the tail of the last child
                        m = RE.search(elem[-1].tail)
                        if m:
                            self.assign_attrs(elem, m.group(1))
                            elem[-1].tail = elem[-1].tail[:m.start()]
                    elif pos is not None and pos > 0 and elem[pos-1].tail:
                        # Nested list: use the tail of the child just before it
                        m = RE.search(elem[pos-1].tail)
                        if m:
                            self.assign_attrs(elem, m.group(1))
                            elem[pos-1].tail = elem[pos-1].tail[:m.start()]
                    elif elem.text:
                        # Otherwise fall back to the item's own text
                        m = RE.search(elem.text)
                        if m:
                            self.assign_attrs(elem, m.group(1))
                            elem.text = elem.text[:m.start()]
                elif len(elem) and elem.tag == 'table' and len(elem[-1]) and len(elem[-1][-1]) and elem[-1][-1][0].text:
                    # SPECIAL CASE table, use last row
                    RE = self.INLINE_RE
                    m = RE.search(elem[-1][-1][0].text) # tbody -> tr -> td
                    if m:
                        self.assign_attrs(elem, m.group(1))
                        # Remove last row
                        elem[-1].remove(elem[-1][-1]) # tbody -> tr
                elif len(elem) and elem[-1].tail:
                    # Other block with children: use the tail of the last child
                    m = RE.search(elem[-1].tail)
                    if m:
                        self.assign_attrs(elem, m.group(1))
                        elem[-1].tail = elem[-1].tail[:m.start()]
                        if markdown.extensions.attr_list.isheader(elem):
                            # clean up trailing #s
                            elem[-1].tail = elem[-1].tail.rstrip('#').rstrip()
                elif elem.text:
                    # Block without children: use its text
                    m = RE.search(elem.text)
                    if not m and elem.tag == 'td':
                        # Table cells get a second attempt with BASE_RE
                        m = re.search(self.BASE_RE, elem.text)
                    if m:
                        self.assign_attrs(elem, m.group(1))
                        elem.text = elem.text[:m.start()]
                        if markdown.extensions.attr_list.isheader(elem):
                            # clean up trailing #s
                            elem.text = elem.text.rstrip('#').rstrip()
            else:
                # Inline: the marker must sit at the very start of the tail
                if elem.tail:
                    m = self.INLINE_RE.match(elem.tail)
                    if m:
                        self.assign_attrs(elem, m.group(1))
                        elem.tail = elem.tail[m.end():]
# Footnotes
class FootnoteExtension(markdown.extensions.footnotes.FootnoteExtension):
    """Footnote extension with WikiNote's inline processor and footnote list."""

    # Override
    def extendMarkdown(self, md):
        """Register this extension and its three processors with *md*."""
        md.registerExtension(self)
        self.parser = md.parser
        self.md = md

        upstream = markdown.extensions.footnotes

        md.preprocessors.register(upstream.FootnotePreprocessor(self), 'footnote', 15)

        # blah blah [^1] blah
        reference_pattern = r'\[\^([^\]]*)\]'
        md.inlinePatterns.register(FootnoteInlineProcessor(reference_pattern, self), 'footnote', 175)

        md.treeprocessors.register(upstream.FootnoteTreeprocessor(self), 'footnote', 50)

    # Override to omit backlinks and reformat
    def makeFootnotesDiv(self, root):
        """Build the ``<div class="footnote">`` listing all footnotes.

        Returns None when no footnotes are defined. Before each footnote is
        parsed, ``http(s)://`` URLs and ``doi: ...`` references in its source
        are rewritten into anchor markup.
        """
        if not self.footnotes:
            return None

        rewrites = (
            (r'(https?://[^ ]+)', r'<a href="\1" class="smart-url"><span>\[URL\]</span><span class="url">\1</span></a>'),
            (r'doi: ([^ ]+)', r'<a href="https://doi-org.ezproxy.lib.monash.edu.au/\1">doi: <span class="url">\1</span></a>'),
        )

        container = ET.Element("div")
        container.set('class', 'footnote')
        listing = ET.SubElement(container, "ol")

        # Footnotes are parsed into a throwaway parent, then moved to their <li>
        scratch = ET.Element("div")

        for key, source in self.footnotes.items():
            entry = ET.SubElement(listing, "li")
            entry.set("id", self.makeFootnoteId(key))

            for pattern, replacement in rewrites:
                source = re.sub(pattern, replacement, source)

            self.parser.parseChunk(scratch, source)

            parsed = list(scratch)
            entry.extend(parsed)
            del scratch[:]

        return container
class FootnoteInlineProcessor(markdown.extensions.footnotes.FootnoteInlineProcessor):
    """Footnote reference processor that tolerates a trailing comma."""

    # Override to handle commas
    def handleMatch(self, m, data):
        """Return the ``<sup>`` reference element for a known footnote.

        Trailing commas are stripped from the captured label before it is
        looked up; if the label had one, a single comma is placed after the
        link. The link text is the 1-based position of the footnote among the
        defined footnotes. Unknown labels reject the match with
        ``(None, None, None)``.
        """
        raw_label = m.group(1)
        label = raw_label.rstrip(',')
        known = self.footnotes.footnotes

        if label not in known:
            return None, None, None

        sup = ET.Element("sup")
        sup.set('class', 'footnote-ref')
        link = ET.SubElement(sup, "a")
        sup.set('id', self.footnotes.makeFootnoteRefId(label, found=True))
        link.set('href', '#' + self.footnotes.makeFootnoteId(label))
        link.text = str(list(known).index(label) + 1)

        if raw_label.endswith(','):
            link.tail = ','

        return sup, m.start(0), m.end(0)
# Custom directives and roles
# Imported at the end of the module, after `directives` and `roles` exist.
from . import markup_custom

directives.update(markup_custom.directives)
roles.update(markup_custom.roles)

# markup_custom2 is optional: a failing import is ignored
try:
    from . import markup_custom2
except ImportError:
    pass
else:
    directives.update(markup_custom2.directives)
    roles.update(markup_custom2.roles)