1.1 --- a/parser.py Sat Mar 31 18:01:47 2012 +0200
1.2 +++ b/parser.py Sun Apr 01 23:22:19 2012 +0200
1.3 @@ -1,17 +1,22 @@
1.4 #!/usr/bin/env python
1.5
1.6 +"""
1.7 +Confluence Wiki syntax parsing.
1.8 +
1.9 + 1. Wiki pages are first split up into regions.
1.10 + 2. Then, within these regions, the text is split into blocks.
1.11 +    1. First, lists are identified.
1.12 +    2. Additionally, other block-like elements are identified.
1.13 + 3. Each block is then parsed.
1.14 +"""
1.15 +
1.16 import re
1.17
1.18 # Section extraction.
1.19
1.20 -sections_regexp_str = r"(?<!{)(?P<section>{(?P<type>[^{}]+)}.*?{(?P=type)})"
1.21 +sections_regexp_str = r"(?<!{)(?P<section>{(?P<type>[^{}\n]+)}.*?{(?P=type)})"
1.22 sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)
1.23
1.24 -# Section inspection.
1.25 -
1.26 -section_regexp_str = r"{(?P<sectiontype>.*?)}(?P<section>.*){(?P=sectiontype)}"
1.27 -section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)
1.28 -
1.29 def get_regions(s):
1.30
1.31 """
1.32 @@ -29,9 +34,14 @@
1.33 regions.append((None, s[last:]))
1.34 return regions
1.35
1.36 +# Section inspection: split a "{type}...{type}" section into its type and the enclosed text.
1.37 +
1.38 +section_regexp_str = r"{(?P<sectiontype>[^\n]*?)}(?P<section>.*){(?P=sectiontype)}"
1.39 +section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)
1.40 +
1.41 def get_section_details(s):
1.42
1.43 - "Return the details of a section in the form (type, text)."
1.44 + "Return the details of a section 's' in the form (type, text)."
1.45
1.46 match = section_regexp.match(s)
1.47 if match:
1.48 @@ -39,16 +49,108 @@
1.49 else:
1.50 return None, s
1.51
1.52 +# List extraction: a list is a run of consecutive lines that all start with the same marker (*, # or -).
1.53 +# Every item line, including the first, may end at a newline or at the very end of the text.
1.54 +list_regexp_str = r"^(?P<listtype>[*#-])[*#-]*.*(?:\n|$)((?P=listtype).*(?:\n|$))*"
1.55 +list_regexp = re.compile(list_regexp_str, re.MULTILINE)
1.56 +
1.57 +def get_lists(s):
1.58 +
1.59 +    """Split the string 's' into a list of (type, text) tuples in document
1.60 +    order: ("list", text) for each list found and (None, text) for the text
1.61 +    before, between and after the lists (which may be an empty string)."""
1.62 +
1.63 +    last = 0
1.64 +    blocks = []
1.65 +    for match in list_regexp.finditer(s):
1.66 +        start, end = match.span()
1.67 +        blocks.append((None, s[last:start]))  # text preceding this list
1.68 +        blocks.append(("list", s[start:end]))
1.69 +        last = end
1.70 +    blocks.append((None, s[last:]))  # text following the final list
1.71 +    return blocks
1.72 +
1.73 +# Block extraction: blocks are separated by runs of blank (whitespace-only) lines.
1.74 +
1.75 +block_regexp_str = r"^(?:\s*\n)+"
1.76 +block_regexp = re.compile(block_regexp_str, re.MULTILINE)
1.77 +
1.78 +def get_basic_blocks(s):
1.79 +
1.80 + """
1.81 + Return a list of blocks from the given string 's' by splitting the text on
1.82 + blank lines, dropping those lines and any blocks that are only whitespace.
1.83 + """
1.84 +
1.85 + return [b for b in block_regexp.split(s) if b.strip()]
1.86 +
1.87 +# Block inspection: "hN. text" heading lines and "bq. text" block quote lines.
1.88 +# Only spaces/tabs may follow the dot, so a match can never run on into the following line.
1.89 +blocktext_regexp_str = r"^(?P<type>h\d|bq)\.[ \t]+(?P<text>.*)$"
1.90 +blocktext_regexp = re.compile(blocktext_regexp_str, re.MULTILINE)
1.91 +
1.92 +def get_blocks(s):
1.93 +
1.94 +    """
1.95 +    Return the string 's' as a list of (type, text) tuples: "list" for lists,
1.96 +    "hN" or "bq" for heading and quote lines, and None for all other text.
1.97 +    """
1.98 +
1.99 +    blocks = []
1.100 +
1.101 +    for blocktype, blocktext in get_lists(s):
1.102 +
1.103 +        # Collect list blocks.
1.104 +
1.105 +        if blocktype is not None:
1.106 +            blocks.append((blocktype, blocktext))
1.107 +
1.108 +        # Attempt to find new subblocks in other regions.
1.109 +
1.110 +        else:
1.111 +            for block in get_basic_blocks(blocktext):
1.112 +                last = 0
1.113 +                for match in blocktext_regexp.finditer(block):
1.114 +                    start, end = match.span()
1.115 +
1.116 +                    # Add preceding non-block text.
1.117 +
1.118 +                    preceding = block[last:start]
1.119 +                    if preceding.strip():
1.120 +                        blocks.append((None, preceding))
1.121 +
1.122 +                    # Add the subblock.
1.123 +
1.124 +                    blocks.append((match.group("type"), match.group("text")))
1.125 +                    last = end
1.126 +
1.127 +                # Add trailing non-block text.
1.128 +
1.129 +                trailing = block[last:]
1.130 +                if trailing.strip():
1.131 +                    blocks.append((None, trailing))
1.132 +
1.133 +    return blocks
1.134 +
1.135 +listitem_regexp_str = r"^([*#-])+\s*(.*)$"  # NOTE(review): group 1 holds only the last marker character of the run; group 2 is the item text
1.136 +listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)
1.137 +
1.138 if __name__ == "__main__":
1.139 import sys
1.140
1.141 s = sys.stdin.read()
1.142
1.143 for type, text in get_regions(s):
1.144 - print "Region type:", type
1.145 - print "Region:"
1.146 - print text
1.147 - print
1.148 + if type is None:  # text outside any {section}: break it into blocks and print each one
1.149 + for blocktype, blocktext in get_blocks(text):
1.150 + print "Block type:", blocktype
1.151 + print blocktext
1.152 + print
1.153 + else:  # a {section} region: print its type and its text as-is
1.154 + print "Region type:", type
1.155 + print text
1.156 + print
1.157 +
1.158 print "-" * 60
1.159
1.160 # vim: tabstop=4 expandtab shiftwidth=4