paul@6 | 1 | #!/usr/bin/env python |
paul@6 | 2 | |
paul@7 | 3 | """ |
paul@7 | 4 | Confluence Wiki syntax parsing. |
paul@7 | 5 | |
paul@7 | 6 | 1. Wiki pages are first split up into regions. |
paul@7 | 7 | 2. Then, within these regions, the text is split into blocks. |
paul@7 | 8 | 1. First, lists are identified. |
paul@7 | 9 | 2. Additionally, other block-like elements are identified. |
paul@7 | 10 | 3. Each block is then parsed. |
paul@7 | 11 | """ |
paul@7 | 12 | |
paul@6 | 13 | import re |
paul@6 | 14 | |
paul@6 | 15 | # Section extraction. |
paul@6 | 16 | |
# A section is "{type}...{type}": the same brace-delimited marker opens and
# closes it. The negative lookbehind rejects an opening brace that is itself
# preceded by another brace, and the lazy ".*?" stops at the nearest closing
# marker of the same type.
sections_regexp_str = r"(?<!{)(?P<section>{(?P<type>[^{}\n]+)}.*?{(?P=type)})"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Split 's' into a list of regions, each a tuple of the form (type, text).

    Text found outside any section is reported with a type of None, and such
    entries are emitted even when the text is empty. Each matched section is
    handed to get_section_details, whose result is used as the region entry.
    """

    regions = []
    position = 0

    for found in sections_regexp.finditer(s):

        # Plain text between the previous section (or the start) and this one.

        regions.append((None, s[position:found.start()]))

        # The section itself, delimiters included, is inspected separately.

        regions.append(get_section_details(found.group()))
        position = found.end()

    # Whatever follows the final section.

    regions.append((None, s[position:]))
    return regions
paul@6 | 36 | |
paul@7 | 37 | # Section inspection. |
paul@7 | 38 | |
# Matches a complete section: an opening "{type}" marker, the enclosed text,
# and a closing marker identical to the opening one. The greedy ".*" makes the
# enclosed text run up to the last such closing marker in the string.
section_regexp_str = r"{(?P<sectiontype>[^\n]*?)}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Inspect the section text 's' and return a tuple of the form (type, text).

    When 's' starts with a matching pair of section markers, the type is the
    marker content and the text is what lies between the markers. Otherwise,
    the result is (None, s) with 's' unchanged.
    """

    found = section_regexp.match(s)

    if found is None:
        return None, s

    return found.group("sectiontype", "section")
paul@6 | 51 | |
paul@7 | 52 | # List extraction. |
paul@7 | 53 | |
# A list is a run of consecutive lines that all start with the same marker
# character ("*", "#" or "-"). Every line, including the first, may end either
# with a newline or at the end of the text, so that a list whose only item is
# the unterminated final line of the input is still recognised.
list_regexp_str = r"^(?P<listtype>[*#-])[*#-]*.*(?:\n|$)((?P=listtype).*(?:\n|$))*"
list_regexp = re.compile(list_regexp_str, re.MULTILINE)

def get_lists(s):

    """
    Extract lists from the given string 's'.

    Return a list of tuples of the form (type, text) covering all of 's' in
    order. List regions have the type "list"; the text around them has the
    type None and is emitted even when it is empty, so that concatenating the
    text of all entries reproduces 's'.
    """

    last = 0
    blocks = []

    for match in list_regexp.finditer(s):
        start, end = match.span()

        # Text preceding this list, followed by the list itself.

        blocks.append((None, s[last:start]))
        blocks.append(("list", s[start:end]))
        last = end

    # Text following the final list.

    blocks.append((None, s[last:]))
    return blocks
paul@7 | 72 | |
paul@7 | 73 | # Block extraction. |
paul@7 | 74 | |
# One or more consecutive lines containing nothing but whitespace.
block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Split the given string 's' into blocks at runs of blank lines, discarding
    the blank lines themselves along with any piece that contains only
    whitespace.
    """

    blocks = []

    for piece in block_regexp.split(s):
        if piece.strip():
            blocks.append(piece)

    return blocks
paul@7 | 86 | |
paul@7 | 87 | # Block inspection. |
paul@7 | 88 | |
# A line introducing block-level text: a heading ("h" plus a digit) or a
# block quote ("bq"), then a full stop, whitespace and the text itself.
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"
blocktext_regexp = re.compile(blocktext_regexp_str, re.MULTILINE)

def get_blocks(s):

    """
    Divide the given string 's' into blocks, returning a list of tuples of the
    form (type, text).

    Lists found by get_lists are passed through as they are. The remaining
    text is split into basic blocks, and each of those is searched for heading
    and block quote lines, which are reported with their own type and with the
    marker removed. Text around such lines is reported with a type of None
    unless it consists only of whitespace.
    """

    result = []

    for kind, text in get_lists(s):

        # Lists are already complete blocks.

        if kind is not None:
            result.append((kind, text))
            continue

        # Everything else is examined one basic block at a time.

        for chunk in get_basic_blocks(text):
            position = 0

            for found in blocktext_regexp.finditer(chunk):

                # Ordinary text appearing before the marked line.

                before = chunk[position:found.start()]
                if before.strip():
                    result.append((None, before))

                # The marked line, as (type, text).

                result.append(found.group("type", "text"))
                position = found.end()

            # Ordinary text appearing after the last marked line.

            after = chunk[position:]
            if after.strip():
                result.append((None, after))

    return result
paul@7 | 134 | |
# List item inspection.
#
# Matches a single list item line: one or more marker characters, optional
# whitespace, then the item text (group 2). Because the marker group is
# repeated, group 1 holds only the last marker character of the line.
listitem_regexp_str = r"^([*#-])+\s*(.*)$"
listitem_regexp = re.compile(listitem_regexp_str, flags=re.MULTILINE)
paul@7 | 137 | |
if __name__ == "__main__":
    import sys

    # Read a complete wiki page from standard input and show how it is
    # divided into regions and blocks.

    s = sys.stdin.read()

    # Each print call below takes exactly one parenthesised argument so that
    # the output is the same whether the file is run with a print statement
    # (Python 2) or a print function (Python 3), and so that the module can be
    # compiled by either.

    for region_type, text in get_regions(s):

        # Untyped regions are ordinary wiki text and are split into blocks.

        if region_type is None:
            for blocktype, blocktext in get_blocks(text):
                print("Block type: %s" % (blocktype,))
                print(blocktext)
                print("")

        # Typed regions (sections) are shown as they are.

        else:
            print("Region type: %s" % (region_type,))
            print(text)
            print("")

        # Separator written after each region.
        # NOTE(review): the separator is assumed to belong inside the region
        # loop; confirm against the original indentation.

        print("-" * 60)
paul@6 | 155 | |
paul@6 | 156 | # vim: tabstop=4 expandtab shiftwidth=4 |