# HG changeset patch # User Paul Boddie # Date 1333315339 -7200 # Node ID 192a2a30aff4ba96271dad8a7d8307ca342757a5 # Parent 55ee6c0d29528eff771acffa8bc1e04ef0d83213 Made the section extraction slightly more strict when recognising the markers. Added extraction of other block types such as lists, headings and blockquotes. diff -r 55ee6c0d2952 -r 192a2a30aff4 parser.py --- a/parser.py Sat Mar 31 18:01:47 2012 +0200 +++ b/parser.py Sun Apr 01 23:22:19 2012 +0200 @@ -1,17 +1,22 @@ #!/usr/bin/env python +""" +Confluence Wiki syntax parsing. + + 1. Wiki pages are first split up into regions. + 2. Then, within these regions, the text is split into blocks. + 1. First, lists are identified. + 2. Additionally, other block-like elements are identified. + 3. Each block is then parsed. +""" + import re # Section extraction. -sections_regexp_str = r"(?{(?P[^{}]+)}.*?{(?P=type)})" +sections_regexp_str = r"(?{(?P[^{}\n]+)}.*?{(?P=type)})" sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE) -# Section inspection. - -section_regexp_str = r"{(?P.*?)}(?P
.*){(?P=sectiontype)}" -section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE) - def get_regions(s): """ @@ -29,9 +34,14 @@ regions.append((None, s[last:])) return regions +# Section inspection. + +section_regexp_str = r"{(?P[^\n]*?)}(?P
.*){(?P=sectiontype)}" +section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE) + def get_section_details(s): - "Return the details of a section in the form (type, text)." + "Return the details of a section 's' in the form (type, text)." match = section_regexp.match(s) if match: @@ -39,16 +49,108 @@ else: return None, s +# List extraction. + +list_regexp_str = r"^(?P[*#-])[*#-]*.*\n((?P=listtype).*(?:\n|$))*" +list_regexp = re.compile(list_regexp_str, re.MULTILINE) + +def get_lists(s): + + """ + Extract lists from the given string 's'. + """ + + last = 0 + blocks = [] + for match in list_regexp.finditer(s): + start, end = match.span() + blocks.append((None, s[last:start])) + blocks.append(("list", s[start:end])) + last = end + blocks.append((None, s[last:])) + return blocks + +# Block extraction. + +block_regexp_str = r"^(?:\s*\n)+" +block_regexp = re.compile(block_regexp_str, re.MULTILINE) + +def get_basic_blocks(s): + + """ + Return blocks from the given string 's' by splitting the text on blank lines + and eliminating those lines. + """ + + return [b for b in block_regexp.split(s) if b.strip()] + +# Block inspection. + +blocktext_regexp_str = r"^(?Ph\d|bq)\.\s+(?P.*)$" +blocktext_regexp = re.compile(blocktext_regexp_str, re.MULTILINE) + +def get_blocks(s): + + """ + Return blocks from the given string 's', inspecting the basic blocks and + generating additional block-level text where appropriate. + """ + + blocks = [] + + for blocktype, blocktext in get_lists(s): + + # Collect list blocks. + + if blocktype is not None: + blocks.append((blocktype, blocktext)) + + # Attempt to find new subblocks in other regions. + + else: + for block in get_basic_blocks(blocktext): + last = 0 + for match in blocktext_regexp.finditer(block): + start, end = match.span() + + # Add preceding non-block text. + + preceding = block[last:start] + if preceding.strip(): + blocks.append((None, preceding)) + + # Add the subblock. + + blocks.append((match.group("type"), match.group("text"))) + last = end + + # Add trailing non-block text. + + trailing = block[last:] + if trailing.strip(): + blocks.append((None, trailing)) + + return blocks + +listitem_regexp_str = r"^([*#-])+\s*(.*)$" +listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE) + if __name__ == "__main__": import sys s = sys.stdin.read() for type, text in get_regions(s): - print "Region type:", type - print "Region:" - print text - print + if type is None: + for blocktype, blocktext in get_blocks(text): + print "Block type:", blocktype + print blocktext + print + else: + print "Region type:", type + print text + print + print "-" * 60 # vim: tabstop=4 expandtab shiftwidth=4