paul@6 | 1 | #!/usr/bin/env python |
paul@6 | 2 | |
paul@7 | 3 | """ |
paul@7 | 4 | Confluence Wiki syntax parsing. |
paul@7 | 5 | |
paul@8 | 6 | Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk> |
paul@8 | 7 | |
paul@8 | 8 | This software is free software; you can redistribute it and/or |
paul@8 | 9 | modify it under the terms of the GNU General Public License as |
paul@8 | 10 | published by the Free Software Foundation; either version 2 of |
paul@8 | 11 | the License, or (at your option) any later version. |
paul@8 | 12 | |
paul@8 | 13 | This software is distributed in the hope that it will be useful, |
paul@8 | 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
paul@8 | 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
paul@8 | 16 | GNU General Public License for more details. |
paul@8 | 17 | |
paul@8 | 18 | You should have received a copy of the GNU General Public |
paul@8 | 19 | License along with this library; see the file LICENCE.txt |
paul@8 | 20 | If not, write to the Free Software Foundation, Inc., |
paul@8 | 21 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA |
paul@8 | 22 | |
paul@8 | 23 | -------- |
paul@8 | 24 | |
paul@8 | 25 | The basic procedure is as follows: |
paul@8 | 26 | |
paul@7 | 27 | 1. Wiki pages are first split up into regions. |
paul@7 | 28 | 2. Then, within these regions, the text is split into blocks. |
paul@7 | 29 | 1. First, lists are identified. |
paul@7 | 30 | 2. Additionally, other block-like elements are identified. |
paul@7 | 31 | 3. Each block is then parsed. |
paul@7 | 32 | """ |
paul@7 | 33 | |
paul@6 | 34 | import re |
paul@6 | 35 | |
paul@6 | 36 | # Section extraction. |
paul@6 | 37 | |
# A section is "{type}...{type}": the same brace-delimited marker opens and
# closes it. The lookbehind refuses an opening brace that directly follows
# another "{".
sections_regexp_str = r"(?<!{)(?P<section>{(?P<type>[^{}\n]+)}.*?{(?P=type)})"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Split the string 's' into a list of regions, each a (type, text) tuple.

    Text outside any section is reported with a type of None. Each section is
    reported using whatever get_section_details returns for it. A plain region
    is emitted before every section and once more after the last one, so plain
    regions may be empty strings.
    """

    regions = []
    cursor = 0

    for found in sections_regexp.finditer(s):
        begin, finish = found.start(), found.end()

        # Plain text leading up to the section, then the section itself.

        regions += [
            (None, s[cursor:begin]),
            get_section_details(s[begin:finish]),
            ]
        cursor = finish

    # Whatever remains after the final section is plain text.

    regions.append((None, s[cursor:]))
    return regions
paul@6 | 57 | |
paul@7 | 58 | # Section inspection. |
paul@7 | 59 | |
# The closing marker must repeat the opening one; the body is matched greedily
# so it extends to the last such closing marker in the string.
section_regexp_str = r"{(?P<sectiontype>[^\n]*?)}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Break the section text 's' into a (type, text) tuple, where the type is the
    name found between the braces of the opening marker and the text is
    everything between the opening and closing markers.

    If 's' does not start with a recognisable section, (None, s) is returned.
    """

    found = section_regexp.match(s)

    if found is None:
        return None, s

    return found.group("sectiontype", "section")
paul@6 | 72 | |
paul@14 | 73 | # Heading, table and list extraction. |
paul@7 | 74 | |
# A list is a run of consecutive lines starting with the same marker character.
# Every line, including the first, may be terminated by a newline or by the end
# of the input, so that a final list line lacking a newline is still found.
list_regexp_str = r"^(?P<listtype>[*#-])[*#-]*.*(?:\n|$)((?P=listtype).*(?:\n|$))*"

# A table is a run of rows, each starting and ending with "|" or "||".
table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"

# Headings ("h1." etc.) and blockquotes ("bq.") occupy a single line.
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, blockquotes, tables and lists from the given string 's'.

    Return a list of (type, text) tuples in document order. The type is "list",
    "table", or the heading/blockquote prefix (such as "h1" or "bq"); text
    between recognised elements is reported with a type of None and may be an
    empty string. For headings and blockquotes the text excludes the prefix,
    whereas lists and tables are returned as the complete matched text.
    """

    last = 0
    blocks = []

    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        # Only headings and blockquotes define the "text" group. Test for None
        # rather than truth so that an empty heading yields empty text instead
        # of the raw match including its prefix.

        text = match.group("text")
        if text is None:
            text = s[start:end]

        blocks.append((None, s[last:start]))
        blocks.append((matchtype, text))
        last = end

    blocks.append((None, s[last:]))
    return blocks
paul@7 | 104 | |
paul@7 | 105 | # Block extraction. |
paul@7 | 106 | |
# One or more whitespace-only lines, anchored at the start of a line.
block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Divide the string 's' into blocks wherever blank lines occur, returning the
    blocks as a list. The blank lines themselves are dropped, as is any piece
    consisting only of whitespace.
    """

    kept = []

    for piece in block_regexp.split(s):
        if not piece.strip():
            continue
        kept.append(piece)

    return kept
paul@7 | 118 | |
paul@7 | 119 | # Block inspection. |
paul@7 | 120 | |
def get_blocks(s):

    """
    Return a list of (type, text) blocks for the string 's'.

    Elements recognised by get_block_elements are kept with their reported
    type. The remaining text is passed through get_basic_blocks and each
    resulting piece is added with a type of None.
    """

    collected = []

    for kind, content in get_block_elements(s):

        # Unrecognised text is broken up further on blank lines.

        if kind is None:
            collected.extend((None, piece) for piece in get_basic_blocks(content))

        # Headings, lists and tables are passed through untouched.

        else:
            collected.append((kind, content))

    return collected
paul@7 | 144 | |
paul@14 | 145 | # List item inspection. |
paul@14 | 146 | |
# The marker group spans the complete run of marker characters: repeating a
# single-character group would retain only the final character and discard the
# nesting depth. The whitespace after the marker excludes newlines so that an
# empty item cannot absorb the following line.
listitem_regexp_str = r"^(?P<marker>[*#-]+)[^\S\n]*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    """
    Return a list of (marker, text) tuples for the given list 'text'.

    Each line starting with one or more of the characters "*", "#" and "-"
    produces one tuple: the marker is the complete leading run of those
    characters (so "**" denotes a second-level item) and the text is the rest
    of the line with the whitespace after the marker removed. Lines not
    starting with a marker character are ignored.
    """

    return [
        match.group("marker", "text")
        for match in listitem_regexp.finditer(text)
        ]
paul@14 | 160 | |
paul@14 | 161 | # Table row inspection. |
paul@14 | 162 | |
# Links and images are matched ahead of cell separators so that a "|" inside
# "[text|target]" or "!image|options!" is not mistaken for a column boundary.
# The opening bracket is escaped instead of being written as the set "[[]",
# which newer versions of the re module warn about as a possible nested set.
link_regexp_str = r"\[(?P<linktext>.*?)\]"
image_regexp_str = r"!(?P<imagetext>.*?)!"
cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"
content_regexp = re.compile(
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    "|"
    "(" + cellsep_regexp_str + ")"
    )

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text'.

    Each line of 'text' is considered separately. The cellsep is the first
    separator ("|" or "||") found on the line and the columns are the strings
    found between separators. Anything before the first separator or after the
    last one is discarded, and lines without any separator produce no row.
    """

    rows = []

    for line in text.split("\n"):
        cellsep = None
        columns = [""]
        last = 0

        for match in content_regexp.finditer(line):
            start, end = match.span()

            # Plain text preceding this match belongs to the current column.

            columns[-1] += line[last:start]

            celltype = match.group("celltype")

            # A separator starts a new column; the first one seen on the line
            # determines the kind of row.

            if celltype:
                if cellsep is None:
                    cellsep = celltype
                columns.append("")

            # Links and images are copied verbatim into the current column.

            else:
                columns[-1] += line[start:end]

            last = end

        columns[-1] += line[last:]

        # The first and last entries hold the text outside the outermost
        # separators and are not part of the row.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows
paul@14 | 203 | |
paul@14 | 204 | # General parsing and translation. |
paul@14 | 205 | |
# Output templates for headings and blockquotes, keyed by the Confluence
# prefix. A heading of level N is wrapped in N "=" characters on either side.
blocktypes = dict(
    ("h%d" % level, "%s %%s %s" % ("=" * level, "=" * level))
    for level in range(1, 7)
    )
blocktypes["bq"] = "{{{%s}}}"
paul@11 | 215 | |
# The output bullet used for each kind of list marker character.
markers = dict([
    ("*", "*"),
    ("#", "1."),
    ("-", "*"),
    ])

def translate_marker(marker):

    """
    Return the Moin form of the given list 'marker': one space of indentation
    for each character in the marker, followed by the bullet corresponding to
    the marker's last character.
    """

    indent = " " * len(marker)
    bullet = markers[marker[-1]]
    return indent + bullet
paul@14 | 227 | |
# The separator written between output cells for each input separator.
cellseps = {"|" : "||", "||" : "||"}

# Text placed on both sides of a cell's content for each input separator.
cellextra = {"|" : "", "||" : "'''"}

def translate_cellsep(cellsep):

    "Return the Moin separator corresponding to the given 'cellsep'."

    moinsep = cellseps[cellsep]
    return moinsep

def translate_cell(cellsep, text):

    """
    Return the cell 'text' surrounded on both sides by the extra markup
    associated with 'cellsep'.
    """

    wrapping = cellextra[cellsep]
    return wrapping + text + wrapping
paul@14 | 249 | |
def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The 'out' object need only provide a write method accepting strings. Text
    outside sections is split into blocks, each translated and followed by a
    blank line: headings and blockquotes use the blocktypes templates, lists
    and tables are translated item by item and row by row, and any other block
    is written with trailing whitespace removed. Sections are written between
    "{{{" and "}}}".
    """

    write = out.write

    for regiontype, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if regiontype is None:
            for blocktype, blocktext in get_blocks(text):

                # Translate headings and blockquotes.

                if blocktype in blocktypes:
                    write(blocktypes[blocktype] % blocktext + "\n")

                # Translate list items.

                elif blocktype == "list":
                    for listmarker, listitem in get_list_items(blocktext):
                        write("%s %s\n" % (translate_marker(listmarker), listitem))

                # Translate table items.

                elif blocktype == "table":
                    for cellsep, columns in get_table_rows(blocktext):
                        moinsep = translate_cellsep(cellsep)
                        cells = [translate_cell(cellsep, column) for column in columns]
                        write(moinsep + moinsep.join(cells) + moinsep + "\n")

                # Handle anonymous blocks.

                else:
                    write(blocktext.rstrip() + "\n")

                # Separate blocks with a blank line.

                write("\n")

        # Handle sections.

        else:

            # Keep the spacing that printing the three parts as successive
            # items would give: a space always follows "{{{", and a space
            # precedes "}}}" unless the text ends in whitespace other than a
            # plain space (typically a newline).

            gap = " "
            if text[-1:].isspace() and text[-1:] != " ":
                gap = ""

            write("{{{ " + text + gap + "}}}\n")
            write("\n")
paul@11 | 293 | |
if __name__ == "__main__":
    import sys

    # Act as a filter: translate standard input to standard output.

    parse(sys.stdin.read(), sys.stdout)
paul@6 | 299 | |
paul@6 | 300 | # vim: tabstop=4 expandtab shiftwidth=4 |