WikiNote3/wikinote/markup.py

#   WikiNote3
#   Copyright © 2020  Lee Yingtong Li (RunasSudo)
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.

import markdown
import markdown.extensions.extra, markdown.extensions.footnotes, markdown.extensions.attr_list

import re
import xml.etree.ElementTree as ET

from .mdx_urlize import UrlizeExtension

# Registry of directive implementations, keyed by directive name (looked up by DirectiveProcessor)
directives = {}
# Registry of role implementations, keyed by role name (looked up by RoleProcessor)
roles = {}

class WNMarkdown(markdown.Markdown):
	"""
	Markdown converter preconfigured with WikiNote's extensions and processors.

	Conversion is split into parse() (source text -> element tree) and
	serialise() (element tree -> HTML string); convert() chains the two.
	"""

	def __init__(self, *args, **kwargs):
		"""Initialise the base converter, then register WikiNote's extensions and processors."""
		super().__init__(*args, **kwargs)

		self.registerExtensions([FootnoteExtension(), UrlizeExtension(), 'toc', 'tables'], {})

		self.meta = {}

		# Markdown in HTML
		self.preprocessors['html_block'].markdown_in_raw = True
		self.parser.blockprocessors.register(markdown.extensions.extra.MarkdownInHtmlProcessor(self.parser), 'markdown_block', 105)
		self.parser.blockprocessors.tag_counter = -1
		self.parser.blockprocessors.contain_span_tags = re.compile(r'^(p|h[1-6]|li|dd|dt|td|th|legend|address)$', re.IGNORECASE)

		# Override default Markdown processors
		self.preprocessors.register(NormalizeWhitespace(self), 'normalize_whitespace', 30)
		self.parser.blockprocessors.register(HashHeaderProcessor(self.parser), 'hashheader', 70)
		self.treeprocessors.register(AttrListTreeprocessor(self), 'attr_list', 8)

		# Our own processors
		self.parser.blockprocessors.register(DirectiveProcessor(self.parser), 'directive', 95)
		self.parser.blockprocessors.register(AdmonitionProcessor(self.parser), 'admonition', 105)
		# NOTE(review): the two inline processors below are handed self.parser rather than self,
		# so the `md` they store (and RoleProcessor passes on to roles) is the block parser - confirm this is intended
		self.inlinePatterns.register(BlueProcessor(self.parser), 'blue_em', 65)
		self.inlinePatterns.register(RoleProcessor(self.parser), 'role', 500)
		self.treeprocessors.register(WrapSectionProcessor(self), 'wrap_sections', 100)

	# Override
	def reset(self):
		"""Reset the converter state, including the `meta` dictionary, and return self."""
		super().reset()
		self.meta = {}
		return self

	# Based on Markdown.convert
	def parse(self, source):
		"""
		Parse Markdown source text into an element tree.

		Runs every preprocessor over the source lines, parses the document, then
		runs every treeprocessor (a treeprocessor may replace the root by returning a new one).

		Returns the root element, or the empty string if `source` is blank.
		"""
		if not source.strip():
			return ''
		self.lines = source.split('\n')
		for prep in self.preprocessors:
			self.lines = prep.run(self.lines)

		root = self.parser.parseDocument(self.lines).getroot()

		for treeprocessor in self.treeprocessors:
			newRoot = treeprocessor.run(root)
			if newRoot is not None:
				root = newRoot

		return root

	# Based on Markdown.convert
	def serialise(self, root):
		"""
		Serialise an element tree, as returned by parse(), to an output string.

		The wrapping document tag is stripped if `stripTopLevelTags` is set, and
		every postprocessor is then run over the output.
		"""
		output = self.serializer(root)

		if self.stripTopLevelTags:
			try:
				# Skip past the opening `<doc_tag>`: tag name plus the two angle brackets
				start = output.index('<{}>'.format(self.doc_tag)) + len(self.doc_tag) + 2
				end = output.rindex('</{}>'.format(self.doc_tag))
				output = output[start:end].strip()
			except ValueError:
				# No opening/closing pair found: a self-closed document tag means an empty document
				if output.strip().endswith('<{} />'.format(self.doc_tag)):
					output = ''

		for pp in self.postprocessors:
			output = pp.run(output)

		return output.strip()

	# Put it together
	def convert(self, source):
		"""Convert Markdown source text to an output string; blank source yields the empty string."""
		root = self.parse(source)
		if isinstance(root, str):
			# parse() returns '' for blank source, which is not a tree and cannot be serialised
			return root
		return self.serialise(root)

	def detab(self, text):
		"""
		Remove one level of indentation from the leading indented lines of `text`.

		A line is indented if it starts with `tab_length` spaces or with a tab;
		whitespace-only lines are kept as empty lines. Processing stops at the
		first line which is neither.

		Returns a tuple (detabbed leading lines, remaining lines unchanged), each joined by newlines.
		"""
		newtext = []
		lines = text.split('\n')
		for line in lines:
			if line.startswith(' '*self.tab_length):
				newtext.append(line[self.tab_length:])
			elif line.startswith('\t'):
				newtext.append(line[1:])
			elif not line.strip():
				newtext.append('')
			else:
				break
		return '\n'.join(newtext), '\n'.join(lines[len(newtext):])

class HashHeaderProcessor(markdown.blockprocessors.HashHeaderProcessor):
	"""Hash-header processor which renders every header one level deeper, so `#` becomes <h2>."""

	# Override to add 1 to level
	def run(self, parent, blocks):
		"""
		Consume the first block and append the header it contains to `parent`.

		Text before the header is parsed into `parent` first; text after the
		header is pushed back onto `blocks` for later processing.
		"""
		block = blocks.pop(0)
		m = self.RE.search(block)
		before = block[:m.start()]
		after = block[m.end():]
		if before:
			self.parser.parseBlocks(parent, [before])
		# Add 1 to level, capped at 6: HTML has no <h7>, which a six-hash header would otherwise produce
		level = min(len(m.group('level')) + 1, 6)
		h = ET.SubElement(parent, 'h{}'.format(level))
		h.text = m.group('header').strip()
		if after:
			blocks.insert(0, after)

class NormalizeWhitespace(markdown.preprocessors.Preprocessor):
	"""Whitespace-normalising preprocessor which retains tab characters."""

	# Override to retain tabs
	def run(self, lines):
		"""Return the normalised source as a list of lines."""
		text = '\n'.join(lines)

		# Strip any STX/ETX characters from the input
		for marker in (markdown.util.STX, markdown.util.ETX):
			text = text.replace(marker, "")

		# Unify line endings, then terminate the document with a blank line
		text = text.replace("\r\n", "\n")
		text = text.replace("\r", "\n")
		text += "\n\n"

		# Empty out lines which consist solely of spaces (tabs are left alone)
		text = re.sub(r'(?<=\n) +\n', '\n', text)
		return text.split('\n')

class DirectiveProcessor(markdown.blockprocessors.BlockProcessor):
	"""
	Block processor for directives of the form `.. name:: arg`.

	The directive's content is the indented text following the directive line.
	The implementation is looked up by name in the module-level `directives` registry.
	"""

	# The leading dots are escaped so that only a literal `..` introduces a directive
	RE = re.compile(r'^\.\. +(?P<name>[a-zA-Z0-9_-]+?)::(?: +(?P<arg>.*?))?(?:\n|$)')

	def test(self, parent, block):
		"""Return True if `block` starts with a directive line naming a registered directive."""
		m = self.RE.search(block)
		# Unknown names are left to the other block processors instead of raising KeyError in run()
		return m is not None and m.group('name') in directives

	def run(self, parent, blocks):
		"""
		Consume the directive at the head of `blocks` and append its rendered element to `parent`.

		Any unindented text trailing the directive content is pushed back onto `blocks`.
		"""
		block = blocks.pop(0)
		m = self.RE.search(block)

		# Get directive content: first, whatever follows the directive line within the same block
		newline = block.find('\n')
		content = block[newline + 1:] if newline != -1 else ''
		content, theRest = self.parser.md.detab(content)

		# Then absorb the directly following tab-indented blocks, stopping at the first block
		# which is not indented, or as soon as unindented text has been encountered
		if not theRest:
			while blocks and blocks[0].startswith('\t'):
				more, theRest = self.parser.md.detab(blocks.pop(0))
				# Blocks were separated by a blank line in the source, so restore it when joining
				content = '{}\n\n{}'.format(content, more) if content else more
				if theRest:
					break

		directive = directives[m.group('name')](self.parser.md, arg=m.group('arg') or '', content=content)
		el = directive.render()
		el.directive = directive
		parent.append(el)

		if theRest:
			blocks.insert(0, theRest)

class RoleProcessor(markdown.inlinepatterns.InlineProcessor):
	"""
	Inline processor for roles of the form :name:`content`.

	The implementation is looked up by name in the module-level `roles` registry.
	"""

	def __init__(self, md):
		"""Create the processor; `md` is passed through to the base class along with the role pattern."""
		super().__init__(r':(?P<name>[^:]+?):`(?P<content>[^`]+?)`', md)

	def handleMatch(self, m, data):
		"""
		Render the matched role.

		Returns (element, start, end) for a registered role, or (None, None, None)
		if the name is not registered so that the text is left untouched.
		"""
		role_class = roles.get(m.group('name'))
		if role_class is None:
			# Ordinary text can resemble a role (e.g. a:b:`c`); treat it as no match rather than raising KeyError
			return None, None, None

		role = role_class(self.md, m.group('content'))
		el = role.render()
		el.role = role
		return el, m.start(0), m.end(0)

class BlueProcessor(markdown.inlinepatterns.InlineProcessor):
	"""Inline processor turning `!!text!!` into a span with class `blue`."""

	def __init__(self, md):
		"""Create the processor with the `!!...!!` pattern."""
		super().__init__(r'!!(.+?)!!', md)

	def handleMatch(self, m, data):
		"""Return the span for the match together with the start and end offsets of the match."""
		span = ET.Element('span', {'class': 'blue'})
		span.text = m.group(1)
		begin, finish = m.span(0)
		return span, begin, finish

class WrapSectionProcessor(markdown.treeprocessors.Treeprocessor):
	"""Tree processor which wraps the content between h1/h2/h3 headings in <section> elements."""

	def run(self, root):
		"""
		Move each run of non-heading children of `root` into its own <section>.

		The headings themselves remain direct children of `root`; each section is
		placed immediately before the heading which ends it, or at the end of `root`.
		"""
		pending = ET.Element('section')

		# Iterate over a snapshot, since root is modified along the way
		for node in list(root):
			if node.tag not in ('h1', 'h2', 'h3'):
				# Ordinary content: transfer it into the section being built
				pending.append(node)
				root.remove(node)
				continue

			if len(pending) == 0:
				# Nothing collected since the previous heading
				continue

			# A heading closes the current section, which goes directly before it
			position = list(root).index(node)
			root.insert(position, pending)
			pending = ET.Element('section')

		# Content after the final heading
		if len(pending) > 0:
			root.append(pending)

# Adapted from Python-Markdown
# Allow tabs
class AdmonitionProcessor(markdown.blockprocessors.BlockProcessor):
	"""
	Block processor for admonitions, opened by a `!!! type "Optional title"` line
	and followed by content indented with spaces or a tab.
	"""

	CLASSNAME = 'admonition'
	CLASSNAME_TITLE = 'admonition-title'
	RE = re.compile(r'(?:^|\n)!!! ?([\w\-]+(?: +[\w\-]+)*)(?: +"(.*?)")? *(?:\n|$)')
	RE_SPACES = re.compile('  +|\t+')

	def test(self, parent, block):
		"""
		Accept a block which opens an admonition, or an indented block directly
		following an element whose class contains the admonition class name.
		"""
		previous = self.lastChild(parent)

		opening = self.RE.search(block)
		if opening:
			return opening

		# Otherwise this can only be a continuation of a preceding admonition
		if not block.startswith((' ' * self.tab_length, '\t')):
			return False
		if previous is None:
			return False
		return self.CLASSNAME in previous.get('class', '')

	def run(self, parent, blocks):
		"""Consume the first block, either opening a new admonition div or extending the preceding one."""
		previous = self.lastChild(parent)
		block = blocks.pop(0)
		opening = self.RE.search(block)

		# Drop the opening line, if there is one, before removing the indentation
		body = block[opening.end():] if opening else block
		body, leftover = self.parser.md.detab(body)

		if not opening:
			# Continuation block: keep filling the preceding admonition
			container = previous
		else:
			klass, title = self.get_class_and_title(opening)
			container = ET.SubElement(parent, 'div')
			container.set('class', '{} {}'.format(self.CLASSNAME, klass))
			if title:
				heading = ET.SubElement(container, 'p')
				heading.text = title
				heading.set('class', self.CLASSNAME_TITLE)

		self.parser.parseChunk(container, body)

		if leftover:
			# Unindented lines followed the indented content: hand them back
			# as the next block to be processed
			blocks.insert(0, leftover)

	def get_class_and_title(self, match):
		"""
		Return (class string, title) for an opening-line match.

		The title is None when it was explicitly given as an empty string.
		"""
		title = match.group(2)
		# Lower-case the type(s) and collapse runs of whitespace between them
		klass = self.RE_SPACES.sub(' ', match.group(1).lower())

		if title is None:
			# No title given: fall back to the capitalised first class,
			# e.g. `!!! note` is titled `Note`
			title = klass.split(' ', 1)[0].capitalize()
		elif title == '':
			# Explicitly blank title, e.g. `!!! warning ""`: render no title paragraph
			title = None

		return klass, title

# Adapted from Python-Markdown
# Fix for tables
class AttrListTreeprocessor(markdown.treeprocessors.Treeprocessor):
	"""
	Tree processor which finds attribute lists of the form `{: ... }` in the
	text of elements, applies them to the element as attributes, and removes
	them from the text. Tables and table cells get special handling.
	"""

	# An attribute list: braces, optional leading colon, no newline or closing brace inside
	BASE_RE = r'\{\:?([^\}\n]*)\}'
	# Attribute list at the end of the text, preceded by at least one space
	HEADER_RE = re.compile(r'[ ]+%s[ ]*$' % BASE_RE)
	# Attribute list alone on the last line of the text
	BLOCK_RE = re.compile(r'\n[ ]*%s[ ]*$' % BASE_RE)
	# Attribute list at the very start of the text
	INLINE_RE = re.compile(r'^%s' % BASE_RE)
	# Runs of characters which sanitize_name() replaces in attribute names
	NAME_RE = re.compile(r'[^A-Z_a-z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02ff'
						r'\u0370-\u037d\u037f-\u1fff\u200c-\u200d'
						r'\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff'
						r'\uf900-\ufdcf\ufdf0-\ufffd'
						r'\:\-\.0-9\u00b7\u0300-\u036f\u203f-\u2040]+')

	def run(self, doc):
		"""
		Walk every element of `doc`, applying and stripping attribute lists.

		Block-level elements are checked at the end of their text (or the tail of
		the relevant child); other elements are checked at the start of their tail.
		The tree is modified in place and nothing is returned.
		"""
		for elem in doc.iter():
			if self.md.is_block_level(elem.tag):
				# Block level: check for attrs on last line of text
				RE = self.BLOCK_RE
				if markdown.extensions.attr_list.isheader(elem) or elem.tag == 'dt':
					# header or def-term: check for attrs at end of line
					RE = self.HEADER_RE
				if len(elem) and elem.tag == 'li':
					# special case list items. children may include a ul or ol.
					pos = None
					# find the ul or ol position
					for i, child in enumerate(elem):
						if child.tag in ['ul', 'ol']:
							pos = i
							break
					if pos is None and elem[-1].tail:
						# use tail of last child. no ul or ol.
						m = RE.search(elem[-1].tail)
						if m:
							self.assign_attrs(elem, m.group(1))
							elem[-1].tail = elem[-1].tail[:m.start()]
					elif pos is not None and pos > 0 and elem[pos-1].tail:
						# use tail of last child before ul or ol
						m = RE.search(elem[pos-1].tail)
						if m:
							self.assign_attrs(elem, m.group(1))
							elem[pos-1].tail = elem[pos-1].tail[:m.start()]
					elif elem.text:
						# use text. ul is first child.
						m = RE.search(elem.text)
						if m:
							self.assign_attrs(elem, m.group(1))
							elem.text = elem.text[:m.start()]
				elif len(elem) and elem.tag == 'table' and len(elem[-1]) and len(elem[-1][-1]) and elem[-1][-1][0].text:
					# SPECIAL CASE table, use last row
					# The attribute list must start the text of the first cell of that row
					RE = self.INLINE_RE
					m = RE.search(elem[-1][-1][0].text) # tbody -> tr -> td
					if m:
						self.assign_attrs(elem, m.group(1))
						# Remove last row
						elem[-1].remove(elem[-1][-1]) # tbody -> tr
				elif len(elem) and elem[-1].tail:
					# has children. Get from tail of last child
					m = RE.search(elem[-1].tail)
					if m:
						self.assign_attrs(elem, m.group(1))
						elem[-1].tail = elem[-1].tail[:m.start()]
						if markdown.extensions.attr_list.isheader(elem):
							# clean up trailing #s
							elem[-1].tail = elem[-1].tail.rstrip('#').rstrip()
				elif elem.text:
					# no children. Get from text.
					m = RE.search(elem.text)
					if not m and elem.tag == 'td':
						# Table cells: fall back to an attribute list anywhere in the text;
						# everything from the start of the list onwards is then cut from the text
						m = re.search(self.BASE_RE, elem.text)
					if m:
						self.assign_attrs(elem, m.group(1))
						elem.text = elem.text[:m.start()]
						if markdown.extensions.attr_list.isheader(elem):
							# clean up trailing #s
							elem.text = elem.text.rstrip('#').rstrip()
			else:
				# inline: check for attrs at start of tail
				if elem.tail:
					m = self.INLINE_RE.match(elem.tail)
					if m:
						self.assign_attrs(elem, m.group(1))
						elem.tail = elem.tail[m.end():]

	def assign_attrs(self, elem, attrs):
		"""
		Assign attrs to element.

		`attrs` is the text between the braces of an attribute list. A key of `.`
		appends the value to the element's class; any other key is set as an
		attribute under its sanitized name.
		"""
		for k, v in markdown.extensions.attr_list.get_attrs(attrs):
			if k == '.':
				# add to class
				cls = elem.get('class')
				if cls:
					elem.set('class', '{} {}'.format(cls, v))
				else:
					elem.set('class', v)
			else:
				# assign attr k with v
				elem.set(self.sanitize_name(k), v)

	def sanitize_name(self, name):
		"""
		Sanitize name as 'an XML Name, minus the ":"'.
		See https://www.w3.org/TR/REC-xml-names/#NT-NCName

		Each run of characters matched by NAME_RE is replaced by a single underscore.
		"""
		return self.NAME_RE.sub('_', name)

# Footnotes

class FootnoteExtension(markdown.extensions.footnotes.FootnoteExtension):
	"""Footnote extension using WikiNote's inline processor and rendering footnotes without backlinks."""

	# Override
	def extendMarkdown(self, md):
		"""Register this extension and its footnote processors with `md`."""
		fn = markdown.extensions.footnotes

		md.registerExtension(self)
		self.parser = md.parser
		self.md = md

		# Matches a footnote reference, e.g. blah blah [^1] blah
		reference_pattern = r'\[\^([^\]]*)\]'

		md.preprocessors.register(fn.FootnotePreprocessor(self), 'footnote', 15)
		md.inlinePatterns.register(FootnoteInlineProcessor(reference_pattern, self), 'footnote', 175)
		md.treeprocessors.register(fn.FootnoteTreeprocessor(self), 'footnote', 50)

	# Override to omit backlinks
	def makeFootnotesDiv(self, root):
		"""
		Build the footnotes div: an ordered list with one item per footnote.

		Returns None if there are no footnotes.
		"""
		if not self.footnotes:
			return None

		container = ET.Element("div")
		container.set('class', 'footnote')
		listing = ET.SubElement(container, "ol")

		# Each footnote is parsed into this scratch element, then its children are moved into the list item
		scratch = ET.Element("div")

		for key, text in self.footnotes.items():
			item = ET.SubElement(listing, "li")
			item.set("id", self.makeFootnoteId(key))
			self.parser.parseChunk(scratch, text)
			for node in list(scratch):
				item.append(node)
				scratch.remove(node)

		return container

class FootnoteInlineProcessor(markdown.extensions.footnotes.FootnoteInlineProcessor):
	"""Footnote reference processor which tolerates trailing commas inside the reference."""

	# Override to handle commas
	def handleMatch(self, m, data):
		"""
		Render a footnote reference as a <sup> containing a link to the footnote.

		Trailing commas are not part of the footnote id; if any were present, a
		single comma is emitted after the link. Returns (None, None, None) if the
		id does not belong to a known footnote.
		"""
		raw = m.group(1)
		key = raw.rstrip(',')
		known = self.footnotes.footnotes

		if key not in known:
			return None, None, None

		sup = ET.Element("sup")
		sup.set('class', 'footnote-ref')
		link = ET.SubElement(sup, "a")
		sup.set('id', self.footnotes.makeFootnoteRefId(key, found=True))
		link.set('href', '#' + self.footnotes.makeFootnoteId(key))
		# The link text is the 1-based position of the footnote among the known footnotes
		link.text = str(list(known).index(key) + 1)
		if raw.endswith(','):
			link.tail = ','

		return sup, m.start(0), m.end(0)

# Custom directives and roles
# NOTE(review): imported at the bottom of the module, presumably because markup_custom
# itself imports from this module - confirm before moving these imports to the top
from . import markup_custom
directives.update(markup_custom.directives)
roles.update(markup_custom.roles)

# markup_custom2 is optional and is skipped if it cannot be imported. It is applied
# second, so its entries replace same-named entries from markup_custom.
try:
	from . import markup_custom2
	directives.update(markup_custom2.directives)
	roles.update(markup_custom2.roles)
except ImportError:
	pass