paul@6 | 1 | #!/usr/bin/env python |
paul@6 | 2 | |
paul@7 | 3 | """ |
paul@7 | 4 | Confluence Wiki syntax parsing. |
paul@7 | 5 | |
paul@8 | 6 | Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk> |
paul@8 | 7 | |
paul@8 | 8 | This software is free software; you can redistribute it and/or |
paul@8 | 9 | modify it under the terms of the GNU General Public License as |
paul@8 | 10 | published by the Free Software Foundation; either version 2 of |
paul@8 | 11 | the License, or (at your option) any later version. |
paul@8 | 12 | |
paul@8 | 13 | This software is distributed in the hope that it will be useful, |
paul@8 | 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
paul@8 | 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
paul@8 | 16 | GNU General Public License for more details. |
paul@8 | 17 | |
paul@8 | 18 | You should have received a copy of the GNU General Public |
paul@8 | 19 | License along with this library; see the file LICENCE.txt |
paul@8 | 20 | If not, write to the Free Software Foundation, Inc., |
paul@8 | 21 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA |
paul@8 | 22 | |
paul@8 | 23 | -------- |
paul@8 | 24 | |
paul@8 | 25 | The basic procedure is as follows: |
paul@8 | 26 | |
paul@7 | 27 | 1. Wiki pages are first split up into regions. |
paul@7 | 28 | 2. Then, within these regions, the text is split into blocks. |
paul@7 | 29 | 1. First, lists are identified. |
paul@7 | 30 | 2. Additionally, other block-like elements are identified. |
paul@7 | 31 | 3. Each block is then parsed. |
paul@7 | 32 | """ |
paul@7 | 33 | |
paul@6 | 34 | import re |
paul@6 | 35 | |
paul@6 | 36 | # Section extraction. |
paul@6 | 37 | |
sections_regexp_str = r"(?<!{)(?P<section>{(?P<type>[^{}\n]+)}.*?{(?P=type)})"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Split 's' into a list of regions, each described by a (type, text) tuple.

    Text matched by the sections pattern ("{type}...{type}") is handed to
    get_section_details to obtain its tuple. The text before, between and after
    such sections is emitted with a type of None, even when that text is empty.
    """

    regions = []
    position = 0

    for found in sections_regexp.finditer(s):

        # Plain text leading up to this section.

        regions.append((None, s[position:found.start()]))

        # The section itself, including its delimiters.

        regions.append(get_section_details(found.group()))
        position = found.end()

    # Whatever follows the final section (or all of 's' if there were none).

    regions.append((None, s[position:]))
    return regions
paul@6 | 57 | |
paul@7 | 58 | # Section inspection. |
paul@7 | 59 | |
section_regexp_str = r"{(?P<sectiontype>[^\n]*?)}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Break the delimited section 's' into a (type, text) tuple.

    The type is the name found inside the opening "{...}" marker and the text is
    everything between that marker and the last matching closing marker. If 's'
    does not start with such a section, (None, s) is returned instead.
    """

    found = section_regexp.match(s)

    if found is None:
        return None, s

    # Requesting several groups at once yields them as a tuple.

    return found.group("sectiontype", "section")
paul@6 | 72 | |
paul@7 | 73 | # List extraction. |
paul@7 | 74 | |
# A list starts on a line beginning with a list marker ("*", "#" or "-") and
# continues over every directly following line that begins with the same marker.
# Each line, including the first, may be terminated either by a newline or by
# the end of the string, so that a list on the final, unterminated line of the
# text is still recognised.

list_regexp_str = r"^(?P<listtype>[*#-])[*#-]*.*(?:\n|$)((?P=listtype).*(?:\n|$))*"
list_regexp = re.compile(list_regexp_str, re.MULTILINE)

def get_lists(s):

    """
    Extract lists from the given string 's'.

    Return a list of (type, text) tuples covering all of 's' in order. Each run
    of list lines is returned with the type "list" and its text unchanged. The
    text before, between and after the lists is returned with a type of None,
    even when that text is empty.
    """

    last = 0
    blocks = []

    for match in list_regexp.finditer(s):
        start, end = match.span()

        # Non-list text preceding this list.

        blocks.append((None, s[last:start]))
        blocks.append(("list", s[start:end]))
        last = end

    # Non-list text following the final list (or all of 's' without lists).

    blocks.append((None, s[last:]))
    return blocks
paul@7 | 93 | |
paul@7 | 94 | # Block extraction. |
paul@7 | 95 | |
block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Divide 's' into blocks wherever one or more blank lines occur, discarding
    the blank lines themselves along with any block consisting only of
    whitespace.
    """

    blocks = []

    for piece in block_regexp.split(s):
        if piece.strip():
            blocks.append(piece)

    return blocks
paul@7 | 107 | |
paul@7 | 108 | # Block inspection. |
paul@7 | 109 | |
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"
blocktext_regexp = re.compile(blocktext_regexp_str, re.MULTILINE)

def get_blocks(s):

    """
    Produce a list of (type, text) tuples describing the blocks found in 's'.

    Lists identified by get_lists are passed through with their own type. All
    other text is split into basic blocks and each of these is searched for
    lines of the form "h<digit>. text" or "bq. text", which become blocks having
    the prefix as their type. Remaining text that is not just whitespace is
    emitted with a type of None.
    """

    result = []

    for kind, content in get_lists(s):

        # Lists are already complete blocks.

        if kind is not None:
            result.append((kind, content))
            continue

        # Everything else is examined one basic block at a time.

        for chunk in get_basic_blocks(content):
            position = 0

            for found in blocktext_regexp.finditer(chunk):

                # Ordinary text in front of the prefixed line.

                before = chunk[position:found.start()]
                if before.strip():
                    result.append((None, before))

                # The prefixed line itself, without its prefix.

                result.append((found.group("type"), found.group("text")))
                position = found.end()

            # Ordinary text after the last prefixed line.

            remainder = chunk[position:]
            if remainder.strip():
                result.append((None, remainder))

    return result
paul@7 | 155 | |
# List item inspection.
#
# Matches a single line that begins with one or more list marker characters
# ("*", "#" or "-"), followed by optional whitespace and the item text.
# Group 1 is a repeated group and therefore holds only the final marker
# character of the prefix; group 2 holds the remainder of the line.
#
# NOTE(review): nothing in the visible part of this file uses this pattern, and
# capturing only the last marker discards the length of the prefix - confirm
# whether "([*#-]+)" was intended before relying on group 1.

listitem_regexp_str = r"^([*#-])+\s*(.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)
paul@7 | 158 | |
if __name__ == "__main__":
    import sys

    # Test driver: read wiki text from standard input and write a summary of
    # the regions, and of the blocks within untyped regions, to standard output.
    #
    # Output is written using sys.stdout.write instead of the print statement
    # so that the module can be run by both Python 2 and Python 3; the text
    # produced is the same as that of the equivalent print statements.

    write = sys.stdout.write
    s = sys.stdin.read()

    # The loop variable is not called "type" in order to avoid hiding the
    # built-in of that name.

    for region_type, text in get_regions(s):
        if region_type is None:
            for blocktype, blocktext in get_blocks(text):
                write("Block type: %s\n" % (blocktype,))
                write("%s\n" % (blocktext,))
                write("\n")
        else:
            write("Region type: %s\n" % (region_type,))
            write("%s\n" % (text,))
            write("\n")

        # NOTE(review): the separator is assumed to follow every region;
        # confirm against the intended layout of the output.

        write("%s\n" % ("-" * 60))
paul@6 | 176 | |
paul@6 | 177 | # vim: tabstop=4 expandtab shiftwidth=4 |