Allow lists indented with tabs

Clean up custom Markdown overrides
2020-08-06 22:33:32 +10:00 · 2020-08-06 22:33:32 +10:00 · a32795fbe6
commit a32795fbe6
parent 70f02e24ac
1 changed file with 120 additions and 92 deletions
--- a/wikinote/markup.py
+++ b/wikinote/markup.py
@@ -15,7 +15,7 @@
 #   along with this program.  If not, see <https://www.gnu.org/licenses/>.

 import markdown
-import markdown.extensions.extra, markdown.extensions.footnotes, markdown.extensions.attr_list
+import markdown.extensions.admonition, markdown.extensions.extra, markdown.extensions.footnotes, markdown.extensions.attr_list

 import re
 import xml.etree.ElementTree as ET
@@ -40,6 +40,8 @@ class WNMarkdown(markdown.Markdown):
 		# Override default Markdown processors
 		self.preprocessors.register(NormalizeWhitespace(self), 'normalize_whitespace', 30)
 		self.parser.blockprocessors.register(HashHeaderProcessor(self.parser), 'hashheader', 70)
+		self.parser.blockprocessors.register(ListIndentProcessor(self.parser), 'indent', 90)
+		self.parser.blockprocessors.register(UListProcessor(self.parser), 'ulist', 30)
 		self.treeprocessors.register(AttrListTreeprocessor(self), 'attr_list', 8)
 		
 		# Our own processors
@@ -153,6 +155,8 @@ class DirectiveProcessor(markdown.blockprocessors.BlockProcessor):
 			if b.startswith('\t'):
 				blocks.pop(0)
 				content += b
+			else:
+				break
 		
 		content, theRest = self.parser.md.detab(content)
 		
@@ -202,104 +206,154 @@ class WrapSectionProcessor(markdown.treeprocessors.Treeprocessor):

 # Adapted from Python-Markdown
 # Allow tabs
-class AdmonitionProcessor(markdown.blockprocessors.BlockProcessor):
-	CLASSNAME = 'admonition'
-	CLASSNAME_TITLE = 'admonition-title'
-	RE = re.compile(r'(?:^|\n)!!! ?([\w\-]+(?: +[\w\-]+)*)(?: +"(.*?)")? *(?:\n|$)')
-	RE_SPACES = re.compile('  +|\t+')
-	
+class AdmonitionProcessor(markdown.extensions.admonition.AdmonitionProcessor):
 	def test(self, parent, block):
 		sibling = self.lastChild(parent)
 		return self.RE.search(block) or \
 			((block.startswith(' ' * self.tab_length) or block.startswith('\t')) and sibling is not None and
 			sibling.get('class', '').find(self.CLASSNAME) != -1)
 	
-	def run(self, parent, blocks):
-		sibling = self.lastChild(parent)
-		block = blocks.pop(0)
-		m = self.RE.search(block)
+	def detab(self, text):
+		return self.parser.md.detab(text)

+# Adapted from Python-Markdown
+# Allow tabs
+class ListIndentProcessor(markdown.blockprocessors.ListIndentProcessor):
+	def __init__(self, parser):
+		super().__init__(parser)
+		# Allow tabs
+		self.INDENT_RE = re.compile(r'^(([ ]{%s}|\t)+)' % self.tab_length)
+	
+	def test(self, parent, block):
+		# Allow tabs
+		return (block.startswith(' '*self.tab_length) or block.startswith('\t')) and not self.parser.state.isstate('detabbed') and (parent.tag in self.ITEM_TYPES or (len(parent) and parent[-1] is not None and (parent[-1].tag in self.LIST_TYPES)))
+	
+	def get_level(self, parent, block):
+		m = self.INDENT_RE.match(block)
 		if m:
-			block = block[m.end():]  # removes the first line
-		
-		block, theRest = self.parser.md.detab(block)
-		
-		if m:
-			klass, title = self.get_class_and_title(m)
-			div = ET.SubElement(parent, 'div')
-			div.set('class', '{} {}'.format(self.CLASSNAME, klass))
-			if title:
-				p = ET.SubElement(div, 'p')
-				p.text = title
-				p.set('class', self.CLASSNAME_TITLE)
+			# Allow tabs
+			if m.group(1).startswith('\t'):
+				indent_level = len(m.group(1))
 			else:
-			div = sibling
+				indent_level = len(m.group(1))/self.tab_length
+		else:
+			indent_level = 0
+		if self.parser.state.isstate('list'):
+			level = 1
+		else:
+			level = 0
+		while indent_level > level:
+			child = self.lastChild(parent)
+			if (child is not None and (child.tag in self.LIST_TYPES or child.tag in self.ITEM_TYPES)):
+				if child.tag in self.LIST_TYPES:
+					level += 1
+				parent = child
+			else:
+				break
+		return level, parent
 	
-		self.parser.parseChunk(div, block)
+	def looseDetab(self, text, level=1):
+		lines = text.split('\n')
+		for i in range(len(lines)):
+			if lines[i].startswith(' '*self.tab_length*level):
+				lines[i] = lines[i][self.tab_length*level:]
+			if lines[i].startswith('\t'):
+				lines[i] = lines[i][1:]
+		return '\n'.join(lines)

-		if theRest:
-			# This block contained unindented line(s) after the first indented
-			# line. Insert these lines as the first block of the master blocks
-			# list for future processing.
-			blocks.insert(0, theRest)
+class OListProcessor(markdown.blockprocessors.OListProcessor):
+	def __init__(self, parser):
+		super().__init__(parser)
+		# Allow tabs
+		self.INDENT_RE = re.compile(r'^(?:[ ]{%d,%d}|\t)((\d+\.)|[*+-])[ ]+.*' % (self.tab_length, self.tab_length * 2 - 1))
 	
-	def get_class_and_title(self, match):
-		klass, title = match.group(1).lower(), match.group(2)
-		klass = self.RE_SPACES.sub(' ', klass)
-		if title is None:
-			# no title was provided, use the capitalized classname as title
-			# e.g.: `!!! note` will render
-			# `<p class="admonition-title">Note</p>`
-			title = klass.split(' ', 1)[0].capitalize()
-		elif title == '':
-			# an explicit blank title should not be rendered
-			# e.g.: `!!! warning ""` will *not* render `p` with a title
-			title = None
-		return klass, title
+	def run(self, parent, blocks):
+		items = self.get_items(blocks.pop(0))
+		sibling = self.lastChild(parent)
+		if sibling is not None and sibling.tag in self.SIBLING_TAGS:
+			lst = sibling
+			if lst[-1].text:
+				p = ET.Element('p')
+				p.text = lst[-1].text
+				lst[-1].text = ''
+				lst[-1].insert(0, p)
+			lch = self.lastChild(lst[-1])
+			if lch is not None and lch.tail:
+				p = ET.SubElement(lst[-1], 'p')
+				p.text = lch.tail.lstrip()
+				lch.tail = ''
+			li = ET.SubElement(lst, 'li')
+			self.parser.state.set('looselist')
+			firstitem = items.pop(0)
+			self.parser.parseBlocks(li, [firstitem])
+			self.parser.state.reset()
+		elif parent.tag in ['ol', 'ul']:
+			lst = parent
+		else:
+			lst = ET.SubElement(parent, self.TAG)
+			if not self.LAZY_OL and self.STARTSWITH != '1':
+				lst.attrib['start'] = self.STARTSWITH
+		self.parser.state.set('list')
+		for item in items:
+			# Allow tabs
+			if item.startswith(' '*self.tab_length) or item.startswith('\t'):
+				self.parser.parseBlocks(lst[-1], [item])
+			else:
+				li = ET.SubElement(lst, 'li')
+				self.parser.parseBlocks(li, [item])
+		self.parser.state.reset()
+	
+	def get_items(self, block):
+		items = []
+		for line in block.split('\n'):
+			m = self.CHILD_RE.match(line)
+			if m:
+				if not items and self.TAG == 'ol':
+					INTEGER_RE = re.compile(r'(\d+)')
+					self.STARTSWITH = INTEGER_RE.match(m.group(1)).group()
+				items.append(m.group(3))
+			elif self.INDENT_RE.match(line):
+				# Allow tabs
+				if items[-1].startswith(' '*self.tab_length) or items[-1].startswith('\t'):
+					items[-1] = '{}\n{}'.format(items[-1], line)
+				else:
+					items.append(line)
+			else:
+				items[-1] = '{}\n{}'.format(items[-1], line)
+		return items
+
+class UListProcessor(OListProcessor):
+	TAG = 'ul'
+	def __init__(self, parser):
+		super().__init__(parser)
+		self.RE = re.compile(r'^[ ]{0,%d}[*+-][ ]+(.*)' % (self.tab_length - 1))

 # Adapted from Python-Markdown
 # Fix for tables
-class AttrListTreeprocessor(markdown.treeprocessors.Treeprocessor):
-	BASE_RE = r'\{\:?([^\}\n]*)\}'
-	HEADER_RE = re.compile(r'[ ]+%s[ ]*$' % BASE_RE)
-	BLOCK_RE = re.compile(r'\n[ ]*%s[ ]*$' % BASE_RE)
-	INLINE_RE = re.compile(r'^%s' % BASE_RE)
-	NAME_RE = re.compile(r'[^A-Z_a-z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02ff'
-						r'\u0370-\u037d\u037f-\u1fff\u200c-\u200d'
-						r'\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff'
-						r'\uf900-\ufdcf\ufdf0-\ufffd'
-						r'\:\-\.0-9\u00b7\u0300-\u036f\u203f-\u2040]+')
-	
+class AttrListTreeprocessor(markdown.extensions.attr_list.AttrListTreeprocessor):
 	def run(self, doc):
 		for elem in doc.iter():
 			if self.md.is_block_level(elem.tag):
-				# Block level: check for attrs on last line of text
 				RE = self.BLOCK_RE
 				if markdown.extensions.attr_list.isheader(elem) or elem.tag == 'dt':
-					# header or def-term: check for attrs at end of line
 					RE = self.HEADER_RE
 				if len(elem) and elem.tag == 'li':
-					# special case list items. children may include a ul or ol.
 					pos = None
-					# find the ul or ol position
 					for i, child in enumerate(elem):
 						if child.tag in ['ul', 'ol']:
 							pos = i
 							break
 					if pos is None and elem[-1].tail:
-						# use tail of last child. no ul or ol.
 						m = RE.search(elem[-1].tail)
 						if m:
 							self.assign_attrs(elem, m.group(1))
 							elem[-1].tail = elem[-1].tail[:m.start()]
 					elif pos is not None and pos > 0 and elem[pos-1].tail:
-						# use tail of last child before ul or ol
 						m = RE.search(elem[pos-1].tail)
 						if m:
 							self.assign_attrs(elem, m.group(1))
 							elem[pos-1].tail = elem[pos-1].tail[:m.start()]
 					elif elem.text:
-						# use text. ul is first child.
 						m = RE.search(elem.text)
 						if m:
 							self.assign_attrs(elem, m.group(1))
@@ -313,16 +367,13 @@ class AttrListTreeprocessor(markdown.treeprocessors.Treeprocessor):
 						# Remove last row
 						elem[-1].remove(elem[-1][-1]) # tbody -> tr
 				elif len(elem) and elem[-1].tail:
-					# has children. Get from tail of last child
 					m = RE.search(elem[-1].tail)
 					if m:
 						self.assign_attrs(elem, m.group(1))
 						elem[-1].tail = elem[-1].tail[:m.start()]
 						if markdown.extensions.attr_list.isheader(elem):
-							# clean up trailing #s
 							elem[-1].tail = elem[-1].tail.rstrip('#').rstrip()
 				elif elem.text:
-					# no children. Get from text.
 					m = RE.search(elem.text)
 					if not m and elem.tag == 'td':
 						m = re.search(self.BASE_RE, elem.text)
@@ -330,37 +381,14 @@ class AttrListTreeprocessor(markdown.treeprocessors.Treeprocessor):
 						self.assign_attrs(elem, m.group(1))
 						elem.text = elem.text[:m.start()]
 						if markdown.extensions.attr_list.isheader(elem):
-							# clean up trailing #s
 							elem.text = elem.text.rstrip('#').rstrip()
 			else:
-				# inline: check for attrs at start of tail
 				if elem.tail:
 					m = self.INLINE_RE.match(elem.tail)
 					if m:
 						self.assign_attrs(elem, m.group(1))
 						elem.tail = elem.tail[m.end():]

-	def assign_attrs(self, elem, attrs):
-		""" Assign attrs to element. """
-		for k, v in markdown.extensions.attr_list.get_attrs(attrs):
-			if k == '.':
-				# add to class
-				cls = elem.get('class')
-				if cls:
-					elem.set('class', '{} {}'.format(cls, v))
-				else:
-					elem.set('class', v)
-			else:
-				# assign attr k with v
-				elem.set(self.sanitize_name(k), v)
-	
-	def sanitize_name(self, name):
-		"""
-		Sanitize name as 'an XML Name, minus the ":"'.
-		See https://www.w3.org/TR/REC-xml-names/#NT-NCName
-		"""
-		return self.NAME_RE.sub('_', name)
-
 # Footnotes

 class FootnoteExtension(markdown.extensions.footnotes.FootnoteExtension):