Made the section extraction slightly more strict when recognising the markers. Added extraction of other block types such as lists, headings and blockquotes.

     1.1 --- a/parser.py	Sat Mar 31 18:01:47 2012 +0200
     1.2 +++ b/parser.py	Sun Apr 01 23:22:19 2012 +0200
     1.3 @@ -1,17 +1,22 @@
     1.4  #!/usr/bin/env python
     1.5  
     1.6 +"""
     1.7 +Confluence Wiki syntax parsing.
     1.8 +
     1.9 + 1. Wiki pages are first split up into regions.
    1.10 + 2. Then, within these regions, the text is split into blocks.
    1.11 +    1. First, lists are identified.
    1.12 +    2. Additionally, other block-like elements are identified.
    1.13 + 3. Each block is then parsed.
    1.14 +"""
    1.15 +
    1.16  import re
    1.17  
    1.18  # Section extraction.
    1.19  
    1.20 -sections_regexp_str = r"(?<!{)(?P<section>{(?P<type>[^{}]+)}.*?{(?P=type)})"
    1.21 +sections_regexp_str = r"(?<!{)(?P<section>{(?P<type>[^{}\n]+)}.*?{(?P=type)})"
    1.22  sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)
    1.23  
    1.24 -# Section inspection.
    1.25 -
    1.26 -section_regexp_str = r"{(?P<sectiontype>.*?)}(?P<section>.*){(?P=sectiontype)}"
    1.27 -section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)
    1.28 -
    1.29  def get_regions(s):
    1.30  
    1.31      """
    1.32 @@ -29,9 +34,14 @@
    1.33      regions.append((None, s[last:]))
    1.34      return regions
    1.35  
    1.36 +# Section inspection.
    1.37 +
    1.38 +section_regexp_str = r"{(?P<sectiontype>[^\n]*?)}(?P<section>.*){(?P=sectiontype)}"
    1.39 +section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)
    1.40 +
    1.41  def get_section_details(s):
    1.42  
    1.43 -    "Return the details of a section in the form (type, text)."
    1.44 +    "Return the details of a section 's' in the form (type, text)."
    1.45  
    1.46      match = section_regexp.match(s)
    1.47      if match:
    1.48 @@ -39,16 +49,108 @@
    1.49      else:
    1.50          return None, s
    1.51  
    1.52 +# List extraction.
    1.53 +
    1.54 +list_regexp_str = r"^(?P<listtype>[*#-])[*#-]*.*\n((?P=listtype).*(?:\n|$))*"
    1.55 +list_regexp = re.compile(list_regexp_str, re.MULTILINE)
    1.56 +
    1.57 +def get_lists(s):
    1.58 +
    1.59 +    """
    1.60 +    Extract lists from the given string 's'.
    1.61 +    """
    1.62 +
    1.63 +    last = 0
    1.64 +    blocks = []
    1.65 +    for match in list_regexp.finditer(s):
    1.66 +        start, end = match.span()
    1.67 +        blocks.append((None, s[last:start]))
    1.68 +        blocks.append(("list", s[start:end]))
    1.69 +        last = end
    1.70 +    blocks.append((None, s[last:]))
    1.71 +    return blocks
    1.72 +
    1.73 +# Block extraction.
    1.74 +
    1.75 +block_regexp_str = r"^(?:\s*\n)+"
    1.76 +block_regexp = re.compile(block_regexp_str, re.MULTILINE)
    1.77 +
    1.78 +def get_basic_blocks(s):
    1.79 +
    1.80 +    """
    1.81 +    Return blocks from the given string 's' by splitting the text on blank lines
    1.82 +    and eliminating those lines.
    1.83 +    """
    1.84 +
    1.85 +    return [b for b in block_regexp.split(s) if b.strip()]
    1.86 +
    1.87 +# Block inspection.
    1.88 +
    1.89 +blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"
    1.90 +blocktext_regexp = re.compile(blocktext_regexp_str, re.MULTILINE)
    1.91 +
    1.92 +def get_blocks(s):
    1.93 +
    1.94 +    """
    1.95 +    Return blocks from the given string 's', inspecting the basic blocks and
    1.96 +    generating additional block-level text where appropriate.
    1.97 +    """
    1.98 +
    1.99 +    blocks = []
   1.100 +
   1.101 +    for blocktype, blocktext in get_lists(s):
   1.102 +
   1.103 +        # Collect list blocks.
   1.104 +
   1.105 +        if blocktype is not None:
   1.106 +            blocks.append((blocktype, blocktext))
   1.107 +
   1.108 +        # Attempt to find new subblocks in other regions.
   1.109 +
   1.110 +        else:
   1.111 +            for block in get_basic_blocks(blocktext):
   1.112 +                last = 0
   1.113 +                for match in blocktext_regexp.finditer(block):
   1.114 +                    start, end = match.span()
   1.115 +
   1.116 +                    # Add preceding non-block text.
   1.117 +
   1.118 +                    preceding = block[last:start]
   1.119 +                    if preceding.strip():
   1.120 +                        blocks.append((None, preceding))
   1.121 +
   1.122 +                    # Add the subblock.
   1.123 +
   1.124 +                    blocks.append((match.group("type"), match.group("text")))
   1.125 +                    last = end
   1.126 +
   1.127 +                # Add trailing non-block text.
   1.128 +
   1.129 +                trailing = block[last:]
   1.130 +                if trailing.strip():
   1.131 +                    blocks.append((None, trailing))
   1.132 +
   1.133 +    return blocks
   1.134 +
   1.135 +listitem_regexp_str = r"^([*#-])+\s*(.*)$"
   1.136 +listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)
   1.137 +
   1.138  if __name__ == "__main__":
   1.139      import sys
   1.140  
   1.141      s = sys.stdin.read()
   1.142  
   1.143      for type, text in get_regions(s):
   1.144 -        print "Region type:", type
   1.145 -        print "Region:"
   1.146 -        print text
   1.147 -        print
   1.148 +        if type is None:
   1.149 +            for blocktype, blocktext in get_blocks(text):
   1.150 +                print "Block type:", blocktype
   1.151 +                print blocktext
   1.152 +                print
   1.153 +        else:
   1.154 +            print "Region type:", type
   1.155 +            print text
   1.156 +            print
   1.157 +
   1.158          print "-" * 60
   1.159  
   1.160  # vim: tabstop=4 expandtab shiftwidth=4
2012-04-01	Paul Boddie	[raw | files | shortlog | changelog | graph]	Made the section extraction slightly more strict when recognising the markers. Added extraction of other block types such as lists, headings and blockquotes.
			parser.py (file)