#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA

--------

The basic procedure is as follows:

 1. Wiki pages are first split up into regions.
 2. Then, within these regions, the text is split into blocks.
    1. First, lists are identified.
    2. Additionally, other block-like elements are identified.
 3. Each block is then parsed.
"""

import re

# Prefixes before ":" in a link target that denote a real URL scheme rather
# than a wiki space name.

URL_SCHEMES = ("http", "https", "ftp", "mailto")

# Section extraction.

# A section is "{type}" or "{type:options}", followed by arbitrary text (which
# may span lines) up to the next "{type}" with the same type. The lookbehind
# stops the second brace of "{{" from being taken as a section opener.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text).

    Text outside any section is given the type None. Text matched as a section
    is passed to get_section_details, which supplies the tuple for that region.
    A plain region is always emitted before each section and after the last
    one, so its text may be an empty string.
    """

    last = 0
    regions = []
    for match in sections_regexp.finditer(s):
        start, end = match.span()

        # Text between the previous section (or the start) and this one.

        regions.append((None, s[last:start]))
        regions.append(get_section_details(s[start:end]))
        last = end

    # Whatever follows the final section.

    regions.append((None, s[last:]))
    return regions

# Section inspection.
# A whole section: "{type}" or "{type:options}", the section text, and the
# closing "{type}".

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text).

    For text matching the section syntax, the type is itself a tuple of the
    form (sectiontype, options), where the options are None if none were given,
    and the text is the content between the opening and closing markers.
    Otherwise, the type is None and the text is 's' unchanged.
    """

    match = section_regexp.match(s)
    if match:
        return (match.group("sectiontype"), match.group("options")), match.group("section")
    else:
        return None, s

# Heading, table and list extraction.

# The first list line may be terminated by a newline or by the end of the
# input, as is the case for the continuation lines, so that a list item on the
# final, unterminated line is still recognised.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*(?:\n|$)(\s*(?P=listtype).*(?:\n|$))*"
table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's'.

    Return a list of (type, text) tuples in document order. The type is "list",
    "table", or the matched prefix of a heading or blockquote (such as "h1" or
    "bq"). Text between such elements is given the type None, and such an entry
    is always emitted before each element and after the last one, so its text
    may be an empty string.

    For headings and blockquotes, the text is the content after the prefix; for
    lists and tables it is the complete matched text.
    """

    last = 0
    blocks = []
    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        # Only one of the three alternatives can have matched.

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        blocks.append((None, s[last:start]))

        # The "text" group is only set (and non-empty) for headings and
        # blockquotes with content.

        blocks.append((matchtype, match.group("text") or s[start:end]))
        last = end

    blocks.append((None, s[last:]))
    return blocks

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.
    """

    return [b for b in block_regexp.split(s) if b.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.

    The result is a list of (type, text) tuples: headings, blockquotes, lists
    and tables keep the type given by get_block_elements, and the remaining text
    is split on blank lines into blocks having the type None.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            for block in get_basic_blocks(blocktext):
                blocks.append((None, block))

    return blocks

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    """
    Return a list of (marker, text) tuples for the given list 'text'.

    The marker includes any spaces preceding the list characters, which are
    taken into account by translate_marker when computing the indentation.
    """

    return [
        (match.group("marker"), match.group("text"))
        for match in listitem_regexp.finditer(text)
        ]

# Table row inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"

# Written with escapes instead of "[[]" because Python 3.7 and later issue a
# "possible nested set" FutureWarning for a set starting with "[". The pattern
# matches the same text.

link_regexp_str = r"\[(?P<linktext>.*?)\]"
image_regexp_str = r"!(?P<imagetext>.*?)!"
cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    )

# Inline content is tried before the cell separator so that a "|" inside a link
# or image is not mistaken for a column boundary.

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def translate_content_match(match):

    """
    Translate the content described by the given 'match', returning a string.

    The 'match' is expected to come from content_regexp or table_content_regexp.
    Monospaced text, links and images are rewritten; any other match, including
    one having empty content, is returned unchanged.
    """

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    elif match.group("linktext"):

        # Split into at most three fields (label, target, title). Any further
        # "|" characters are kept in the title, where unpacking an unlimited
        # split would raise ValueError.

        parts = match.group("linktext").split("|", 2)

        # NOTE: Proper detection of external links required.

        if len(parts) == 1:
            label, target = None, parts[0]
        elif len(parts) == 2:
            label, target = parts
        else:
            label, target, title = parts

        target = target.strip()

        if ":" in target:
            prefix = ""
            space, rest = target.split(":", 1)

            # A prefix that is not a URL scheme is treated as a space name and
            # the target becomes a path within it.

            if space not in URL_SCHEMES:
                target = "%s/%s" % (space, rest)

        # Anchors within the current page.

        elif target.startswith("#"):
            prefix = ""

        # Attachments: the caret is the source marker for an attachment and is
        # expressed by the prefix in the translation, so it is not retained in
        # the name.

        elif target.startswith("^"):
            prefix = "attachment:"
            target = target[1:]

        # Other pages are referenced relative to the current page.

        else:
            prefix = "../"

        if len(parts) == 1:
            return "[[%s%s]]" % (prefix, target)
        elif len(parts) == 2:
            return "[[%s%s|%s]]" % (prefix, target, label)
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

    elif match.group("imagetext"):
        parts = match.group("imagetext").split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

    else:
        return match.group()

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text'.

    Each line of 'text' providing at least one cell separator yields a row. The
    cellsep is the first separator found on the line ("|" or "||") and the
    columns are the untranslated texts found between separators; text before
    the first separator and after the last one is discarded. Lines without a
    separator are ignored.
    """

    rows = []

    for line in text.split("\n"):
        cellsep = None
        columns = [""]
        last = 0
        for match in table_content_regexp.finditer(line):
            start, end = match.span()
            columns[-1] += line[last:start]

            # A separator starts a new column; the first one found decides the
            # separator reported for the row.

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")

            # Inline content is kept verbatim for later translation.

            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += line[last:]

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated.

    Within "code" and "noformat" sections, inline markup is left untouched.
    """

    parts = []

    last = 0
    for match in content_regexp.finditer(text):
        start, end = match.span()
        parts.append(text[last:start])

        # Handle unformatted sections.

        if sectiontype in ("code", "noformat"):
            parts.append(match.group())
        else:
            parts.append(translate_content_match(match))

        last = end

    parts.append(text[last:])
    return "".join(parts)

# Translation helpers.

# Templates for headings and blockquotes, indexed by block type.

blocktypes = {
    "h1" : "= %s =",
    "h2" : "== %s ==",
    "h3" : "=== %s ===",
    "h4" : "==== %s ====",
    "h5" : "===== %s =====",
    "h6" : "====== %s ======",
    "bq" : "{{{%s}}}",
    }

# Replacements for the final character of a list marker.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    """
    Translate the given 'marker' to a suitable Moin representation.

    The result is indented by one space for each character of 'marker', with the
    kind of list item being chosen using the final character. A KeyError is
    raised if that character is not found in the markers mapping.
    """

    return " " * len(marker) + markers[marker[-1]]

# Replacements for cell separators.

cellseps = {
    "|" : "||",
    "||" : "||",
    }

# Markup placed around the content of a cell, indexed by cell separator.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    """
    Using 'cellsep', translate the cell 'text', surrounding the translated
    content with any markup registered for 'cellsep' in the cellextra mapping.
    """

    return cellextra[cellsep] + translate_content(text) + cellextra[cellsep]

# Section types having a direct translation. An empty value causes the section
# to be written as a plain "{{{" region.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }

# General parsing.

def _print_leaves_soft_space(text):

    """
    Return whether a Python 2 print statement ending with a comma would write a
    separating space after 'text' before the next printed item. It does so
    unless 'text' ends with a whitespace character other than a space.
    """

    return not text or text[-1] not in "\t\n\x0b\x0c\r"

def _write_block(out, blocktype, blocktext):

    """
    Write to 'out' the translation of a block of the given 'blocktype' having
    the given 'blocktext', followed by a blank line.
    """

    # Translate headings and blockquotes.

    if blocktype in blocktypes:
        out.write(blocktypes[blocktype] % blocktext + "\n")

    # Translate list items.

    elif blocktype == "list":
        for listmarker, listitem in get_list_items(blocktext):
            out.write("%s %s\n" % (translate_marker(listmarker), translate_content(listitem)))

    # Translate table items.

    elif blocktype == "table":
        for cellsep, columns in get_table_rows(blocktext):
            moinsep = translate_cellsep(cellsep)
            cells = [translate_cell(cellsep, column) for column in columns]
            out.write(moinsep + moinsep.join(cells) + moinsep + "\n")

    # Handle anonymous blocks.

    else:
        out.write(translate_content(blocktext.rstrip()) + "\n")

    out.write("\n")

def _write_section(out, sectiontype, options, text):

    """
    Write to 'out' the translation of a section of the given 'sectiontype'
    having the given 'options' (or None) and 'text', followed by a blank line.
    """

    # Direct translations of sections.

    mointype = sectiontypes.get(sectiontype)
    if mointype:
        out.write("{{{#!%s\n" % mointype)
        if options:
            out.write("## %s\n" % options)

    # Unknown and unformatted sections: the space is the one the print
    # statement inserted between the opening marker and the content.

    else:
        out.write("{{{ ")

    content = translate_content(text, sectiontype)
    out.write(content)

    # Keep the separating space that print placed before the closing marker.

    if _print_leaves_soft_space(content):
        out.write(" ")

    out.write("}}}\n\n")

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The 'out' object only needs to provide a write method accepting strings.
    The output is produced using that method instead of the print statement so
    that the function also works with Python 3; the text written is the same as
    that produced by the print statements previously used.
    """

    for regiontype, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if regiontype is None:
            for blocktype, blocktext in get_blocks(text):
                _write_block(out, blocktype, blocktext)

        # Handle sections.

        else:
            sectiontype, options = regiontype
            _write_section(out, sectiontype, options, text)

if __name__ == "__main__":
    import sys

    s = sys.stdin.read()
    parse(s, sys.stdout)

# vim: tabstop=4 expandtab shiftwidth=4