#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

--------

The basic procedure is as follows:

 1. Wiki pages are first split up into regions.
 2. Then, within these regions, the text is split into blocks.
    1. First, lists are identified.
    2. Additionally, other block-like elements are identified.
 3. Each block is then parsed.
"""

import re

URL_SCHEMES = ("http", "https", "ftp", "mailto")

# Section extraction.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Split the string 's' into a list of regions, each being a (type, text)
    tuple. Text found outside any section is given a type of None; each span
    matched by the sections regular expression is described by whatever
    get_section_details returns for it.
    """

    regions = []
    position = 0

    for section in sections_regexp.finditer(s):

        # The text between the previous section and this one is untyped.

        regions.append((None, s[position:section.start()]))

        # The matched span itself is inspected to obtain its details.

        regions.append(get_section_details(section.group()))
        position = section.end()

    # Whatever follows the final section is untyped as well.

    regions.append((None, s[position:]))
    return regions

# Section inspection.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text), where the
    type is itself a (sectiontype, options) tuple and the text is the content
    found between the opening and closing section markers. If 's' does not
    have the form of a section, (None, s) is returned instead.
    """

    match = section_regexp.match(s)
    if match:
        return (match.group("sectiontype"), match.group("options")), match.group("section")
    else:
        return None, s

# Heading, table and list extraction.

# NOTE: The first list line may be terminated by a newline or by the end of the
# NOTE: text, just like the following lines, so that a single-line list at the
# NOTE: very end of the input is still recognised as a list.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*(?:\n|$)(\s*(?P=listtype).*(?:\n|$))*"
table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples. Lists have the type "list", tables the type
    "table", and headings and blockquotes use their markup prefix (such as "h1"
    or "bq") as the type. The text surrounding these elements is included in
    the result with a type of None.
    """

    last = 0
    blocks = []

    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        # Only one of the alternatives can have matched.

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        blocks.append((None, s[last:start]))

        # Headings and blockquotes provide their text without the prefix; lists
        # and tables are passed on verbatim for later inspection.

        blocks.append((matchtype, match.group("text") or s[start:end]))
        last = end

    blocks.append((None, s[last:]))
    return blocks

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.
    """

    return [b for b in block_regexp.split(s) if b.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate. Each block is a
    (type, text) tuple, with anonymous blocks having a type of None.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            for block in get_basic_blocks(blocktext):
                blocks.append((None, block))

    return blocks

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [
        (match.group("marker"), match.group("text"))
        for match in listitem_regexp.finditer(text)
        ]

# Table row inspection.

# NOTE: The opening bracket of a link is escaped instead of being written as a
# NOTE: one-character set, since "[[]" causes the re module to warn about a
# NOTE: possible nested set in recent Python versions.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"\[(?P<linktext>.*?)\]"
image_regexp_str = r"!(?P<imagetext>.*?)!"
cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    )

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def translate_content_match(match):

    """
    Translate the content described by the given 'match', returning a string.
    Monospaced text, links and images are rewritten in Moin syntax; anything
    else (including such markup with no content) is returned unchanged.
    """

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    elif match.group("linktext"):
        parts = match.group("linktext").split("|")

        # NOTE: Proper detection of external links required.

        if len(parts) == 1:
            label, target, title = None, parts[0], None
        elif len(parts) == 2:
            (label, target), title = parts, None
        else:

            # Links have at most a label, a target and a title: any further
            # parts are ignored instead of causing an unpacking error.

            label, target, title = parts[:3]

        target = target.strip()

        # Look for namespace links and rewrite them.

        if ":" in target:
            prefix = ""
            space, rest = target.split(":", 1)
            if space not in URL_SCHEMES:
                target = "%s/%s" % (space, rest)

        # Detect anchors.

        elif target.startswith("#"):
            prefix = ""

        # Detect attachments, removing the Confluence attachment marker so that
        # only the filename follows the Moin attachment prefix.

        elif target.startswith("^"):
            prefix = "attachment:"
            target = target[1:]

        # Link to other pages within a space.

        else:
            prefix = "../"

        # Make the link tidier by making a label if none was given.

        if not label:
            label = target

        if not label and not title:
            return "[[%s%s]]" % (prefix, target)
        elif not title:
            return "[[%s%s|%s]]" % (prefix, target, label)
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

    elif match.group("imagetext"):
        parts = match.group("imagetext").split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

    else:
        return match.group()

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text', where
    the cell separator is the first one found on each line and the columns are
    the untranslated texts found between the separators. Lines without any
    separator are omitted.
    """

    rows = []

    for line in text.split("\n"):
        cellsep = None
        columns = [""]
        last = 0

        for match in table_content_regexp.finditer(line):
            start, end = match.span()
            columns[-1] += line[last:start]

            # Separators start a new column.

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")

            # Other content is kept intact so that separator characters inside
            # links and images do not split cells.

            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += line[last:]

        # Discard the text before the first and after the last separator.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated.
    """

    parts = []

    last = 0
    for match in content_regexp.finditer(text):
        start, end = match.span()
        parts.append(text[last:start])

        # Handle unformatted sections.

        if sectiontype in ("code", "noformat"):
            parts.append(match.group())
        else:
            parts.append(translate_content_match(match))

        last = end

    parts.append(text[last:])
    return "".join(parts)

# Translation helpers.

blocktypes = {
    "h1" : "= %s =",
    "h2" : "== %s ==",
    "h3" : "=== %s ===",
    "h4" : "==== %s ====",
    "h5" : "===== %s =====",
    "h6" : "====== %s ======",
    "bq" : "{{{%s}}}",
    }

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    """
    Translate the given 'marker' to a suitable Moin representation: the length
    of the marker provides the indentation and its final character the kind of
    list item.
    """

    return " " * len(marker) + markers[marker[-1]]

cellseps = {
    "|" : "||",
    "||" : "||",
    }

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    "Using 'cellsep', translate the cell 'text'."

    return cellextra[cellsep] + translate_content(text) + cellextra[cellsep]

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }

def translate_block(blocktype, blocktext):

    """
    Return the translation of the block having the given 'blocktype' and
    'blocktext' as a string in which every line is terminated by a newline.
    Headings and blockquotes, lists and tables are translated according to
    their type; any other block is treated as anonymous text.
    """

    # Translate headings and blockquotes.

    if blocktype in blocktypes:
        lines = [blocktypes[blocktype] % blocktext]

    # Translate list items.

    elif blocktype == "list":
        lines = [
            "%s %s" % (translate_marker(listmarker), translate_content(listitem))
            for listmarker, listitem in get_list_items(blocktext)
            ]

    # Translate table items.

    elif blocktype == "table":
        lines = []
        for cellsep, columns in get_table_rows(blocktext):
            moinsep = translate_cellsep(cellsep)
            cells = [translate_cell(cellsep, column) for column in columns]
            lines.append(moinsep + moinsep.join(cells) + moinsep)

    # Handle anonymous blocks.

    else:
        lines = [translate_content(blocktext.rstrip())]

    return "".join([line + "\n" for line in lines])

def translate_section(sectiontype, options, text):

    """
    Return the translation of a section of the given 'sectiontype' having the
    given 'options' (which may be None) and 'text', as a string terminated by a
    newline.
    """

    # Direct translations of sections.

    mointype = sectiontypes.get(sectiontype)

    if mointype:
        opening = "{{{#!%s\n" % mointype
        if options:
            opening += "## %s\n" % options
    else:
        opening = "{{{ "

    body = translate_content(text, sectiontype)

    # The closing marker is separated from the content by a space unless the
    # content already ends with whitespace other than a space. This reproduces
    # the output of the print statements which previously wrote sections.

    if body and body[-1] in "\t\n\v\f\r":
        closing = "}}}\n"
    else:
        closing = " }}}\n"

    return opening + body + closing

# General parsing.

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out', which
    must provide a 'write' method accepting strings. Each translated block or
    section is followed by a blank line.
    """

    for regiontype, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if regiontype is None:
            for blocktype, blocktext in get_blocks(text):
                out.write(translate_block(blocktype, blocktext))
                out.write("\n")

        # Handle sections.

        else:
            sectiontype, options = regiontype
            out.write(translate_section(sectiontype, options, text))
            out.write("\n")

if __name__ == "__main__":
    import sys

    s = sys.stdin.read()
    parse(s, sys.stdout)

# vim: tabstop=4 expandtab shiftwidth=4