# HG changeset patch
# User Paul Boddie <paul@boddie.org.uk>
# Date 1333315339 -7200
# Node ID 192a2a30aff4ba96271dad8a7d8307ca342757a5
# Parent  55ee6c0d29528eff771acffa8bc1e04ef0d83213
Made the section extraction slightly more strict when recognising the markers.
Added extraction of other block types such as lists, headings and blockquotes.

diff -r 55ee6c0d2952 -r 192a2a30aff4 parser.py
--- a/parser.py	Sat Mar 31 18:01:47 2012 +0200
+++ b/parser.py	Sun Apr 01 23:22:19 2012 +0200
@@ -1,17 +1,22 @@
 #!/usr/bin/env python
 
+"""
+Confluence Wiki syntax parsing.
+
+ 1. Wiki pages are first split up into regions.
+ 2. Then, within these regions, the text is split into blocks.
+    1. First, lists are identified.
+    2. Additionally, other block-like elements are identified.
+ 3. Each block is then parsed.
+"""
+
 import re
 
 # Section extraction.
 
-sections_regexp_str = r"(?<!{)(?P<section>{(?P<type>[^{}]+)}.*?{(?P=type)})"
+sections_regexp_str = r"(?<!{)(?P<section>{(?P<type>[^{}\n]+)}.*?{(?P=type)})"  # a {type} marker may not contain a newline
 sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)
 
-# Section inspection.
-
-section_regexp_str = r"{(?P<sectiontype>.*?)}(?P<section>.*){(?P=sectiontype)}"
-section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)
-
 def get_regions(s):
 
     """
@@ -29,9 +34,14 @@
     regions.append((None, s[last:]))
     return regions
 
+# Section inspection: "{type}body{type}", where the opening {type} marker cannot span lines.
+
+section_regexp_str = r"{(?P<sectiontype>[^\n]*?)}(?P<section>.*){(?P=sectiontype)}"
+section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)
+
 def get_section_details(s):
 
-    "Return the details of a section in the form (type, text)."
+    "Return the details of a section 's' in the form (type, text)."
 
     match = section_regexp.match(s)
     if match:
@@ -39,16 +49,108 @@
     else:
         return None, s
 
+# List extraction: a line starting with *, # or - and any following lines sharing that marker.
+
+list_regexp_str = r"^(?P<listtype>[*#-])[*#-]*.*\n((?P=listtype).*(?:\n|$))*"
+list_regexp = re.compile(list_regexp_str, re.MULTILINE)  # NOTE(review): the first list line must end in "\n"
+
+def get_lists(s):
+
+    """
+    Split 's' into (type, text) pairs: ("list", text) per list, (None, text) around each.
+    """
+
+    last = 0  # end offset of the previous list match
+    blocks = []
+    for match in list_regexp.finditer(s):
+        start, end = match.span()
+        blocks.append((None, s[last:start]))  # text before this list; may be empty
+        blocks.append(("list", s[start:end]))
+        last = end
+    blocks.append((None, s[last:]))  # remaining text after the final list; may be empty
+    return blocks
+
+# Block extraction: blocks are separated by runs of blank (whitespace-only) lines.
+
+block_regexp_str = r"^(?:\s*\n)+"
+block_regexp = re.compile(block_regexp_str, re.MULTILINE)
+
+def get_basic_blocks(s):
+
+    """
+    Return a list of block strings from 's', splitting the text on runs of
+    whitespace-only lines and dropping any pieces that are only whitespace.
+    """
+
+    return [b for b in block_regexp.split(s) if b.strip()]
+
+# Block inspection: lines of the form "hN. text" or "bq. text" become typed blocks.
+
+blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"
+blocktext_regexp = re.compile(blocktext_regexp_str, re.MULTILINE)
+
+def get_blocks(s):
+
+    """
+    Return a list of (type, text) blocks from 's': ("list", text) for lists,
+    ("hN" or "bq", text) for marked lines, and (None, text) for other text.
+    """
+
+    blocks = []
+
+    for blocktype, blocktext in get_lists(s):
+
+        # Collect list blocks unchanged, keeping their full text.
+
+        if blocktype is not None:
+            blocks.append((blocktype, blocktext))
+
+        # Split non-list text on blank lines and look for subblocks in each piece.
+
+        else:
+            for block in get_basic_blocks(blocktext):
+                last = 0  # offset just past the previous subblock match
+                for match in blocktext_regexp.finditer(block):
+                    start, end = match.span()
+
+                    # Add preceding non-block text, unless it is only whitespace.
+
+                    preceding = block[last:start]
+                    if preceding.strip():
+                        blocks.append((None, preceding))
+
+                    # Add the subblock as (type, text); the "type. " prefix is not kept in text.
+
+                    blocks.append((match.group("type"), match.group("text")))
+                    last = end
+
+                # Add trailing non-block text, unless it is only whitespace.
+
+                trailing = block[last:]
+                if trailing.strip():
+                    blocks.append((None, trailing))
+
+    return blocks
+
+listitem_regexp_str = r"^([*#-])+\s*(.*)$"  # one list item: marker run, optional whitespace, item text
+listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)  # NOTE(review): group 1 keeps only the last marker character
+
 if __name__ == "__main__":
     import sys
 
     s = sys.stdin.read()
 
     for type, text in get_regions(s):
-        print "Region type:", type
-        print "Region:"
-        print text
-        print
+        if type is None:  # untyped region: split it into blocks and show each one
+            for blocktype, blocktext in get_blocks(text):
+                print "Block type:", blocktype
+                print blocktext
+                print
+        else:  # region with a type: show it whole
+            print "Region type:", type
+            print text
+            print
+
         print "-" * 60
 
 # vim: tabstop=4 expandtab shiftwidth=4