1 #!/usr/bin/env python 2 3 """ 4 Confluence Wiki syntax parsing. 5 6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk> 7 8 This software is free software; you can redistribute it and/or 9 modify it under the terms of the GNU General Public License as 10 published by the Free Software Foundation; either version 2 of 11 the License, or (at your option) any later version. 12 13 This software is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public 19 License along with this library; see the file LICENCE.txt 20 If not, write to the Free Software Foundation, Inc., 21 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 22 23 -------- 24 25 The basic procedure is as follows: 26 27 1. Wiki pages are first split up into regions. 28 2. Then, within these regions, the text is split into blocks. 29 1. First, lists are identified. 30 2. Additionally, other block-like elements are identified. 31 3. Each block is then parsed. 32 """ 33 34 from common import * 35 import re 36 import sys 37 import codecs 38 39 # Section extraction. 40 41 sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}" 42 sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE) 43 44 def get_regions(s): 45 46 """ 47 Return a list of regions from 's'. Each region is specified using a tuple of 48 the form (type, text). 49 """ 50 51 last = 0 52 regions = [] 53 for match in sections_regexp.finditer(s): 54 start, end = match.span() 55 regions.append((None, s[last:start])) 56 regions.append(get_section_details(s[start:end])) 57 last = end 58 regions.append((None, s[last:])) 59 return regions 60 61 # Section inspection. 
section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text). For a
    recognised section, the type is a (sectiontype, options) tuple, with
    options being None where no ":options" suffix was given; otherwise the
    type is None and the text is returned unchanged.
    """

    match = section_regexp.match(s)
    if match:
        return (match.group("sectiontype"), match.group("options")), match.group("section")
    else:
        return None, s

# Heading, table and list extraction.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples. Regions between recognised elements are
    returned with a type of None.
    """

    last = 0
    blocks = []

    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        # Classify the element according to which alternative participated in
        # the match. (An explicit chain replaces the fragile "and/or"
        # conditional idiom.)

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        # Plain text preceding the element.

        blocks.append((None, s[last:start]))

        # Headings and blockquotes provide explicit text via the "text" group;
        # lists and tables contribute the entire matched region.

        blocks.append((matchtype, match.group("text") or s[start:end]))
        last = end

    blocks.append((None, s[last:]))
    return blocks

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank
    lines and eliminating those lines (and any whitespace-only fragments).
    """

    return [b for b in block_regexp.split(s) if b.strip()]

# Block inspection.
def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Headings, lists and tables are used directly.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Other regions are divided into basic blocks.

        else:
            blocks += [(None, subblock) for subblock in get_basic_blocks(blocktext)]

    return blocks

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [(match.group("marker"), match.group("text"))
            for match in listitem_regexp.finditer(text)]

# Content inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.
173 174 italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})" 175 bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})" 176 del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})" 177 underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})" 178 sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})" 179 180 content_regexp_str = ( 181 "(" + monospace_regexp_str + ")" 182 "|" 183 "(" + link_regexp_str + ")" 184 "|" 185 "(" + image_regexp_str + ")" 186 "|" 187 "(" + italic_regexp_str + ")" 188 "|" 189 "(" + bold_regexp_str + ")" 190 "|" 191 "(" + del_regexp_str + ")" 192 "|" 193 "(" + underline_regexp_str + ")" 194 "|" 195 "(" + sub_regexp_str + ")" 196 ) 197 198 # Table row inspection. 199 200 cellsep_regexp_str = r"(?P<celltype>[|]{1,2})" 201 202 table_content_regexp_str = ( 203 content_regexp_str + 204 "|" 205 "(" + cellsep_regexp_str + ")" 206 ) 207 208 content_regexp = re.compile(content_regexp_str) 209 table_content_regexp = re.compile(table_content_regexp_str) 210 211 # Notation conversion. 212 213 notation_mapping = [ 214 (r"\!", "!"), 215 (r"\-", "-"), 216 (r"\\""\n", "<<BR>>"), 217 (r"\\ ", "<<BR>>"), 218 (r"\~", "~"), 219 ] 220 221 preformatted_notation_mapping = [ 222 (r"\!", "!"), 223 (r"\-", "-"), 224 (r"\\""\n", "\n"), 225 (r"\\ ", "\n"), 226 (r"\~", "~"), 227 ] 228 229 # Translation helpers. 230 231 markers = { 232 "*" : "*", 233 "#" : "1.", 234 "-" : "*", 235 } 236 237 def translate_marker(marker): 238 239 "Translate the given 'marker' to a suitable Moin representation." 240 241 return " " * len(marker) + markers[marker[-1]] 242 243 cellseps = { 244 "|" : "\n|| ", 245 "||" : "\n|| ", 246 } 247 248 cellextra = { 249 "|" : "", 250 "||" : "'''", 251 } 252 253 def translate_cellsep(cellsep): 254 255 "Translate the given 'cellsep' to a suitable Moin representation." 
256 257 return cellseps[cellsep] 258 259 def translate_cell(cellsep, text): 260 261 "Using 'cellsep', translate the cell 'text'." 262 263 return cellextra[cellsep] + parse_text(text).strip() + cellextra[cellsep] 264 265 def translate_content_match(match): 266 267 "Translate the content described by the given 'match', returning a string." 268 269 if match.group("monotext"): 270 return "{{{%s}}}" % match.group("monotext") 271 272 elif match.group("linktext"): 273 parts = match.group("linktext").split("|") 274 275 # NOTE: Proper detection of external links required. 276 277 if len(parts) == 1: 278 label, target, title = None, parts[0], None 279 elif len(parts) == 2: 280 (label, target), title = parts, None 281 else: 282 label, target, title = parts 283 284 target = target.strip() 285 286 # Look for namespace links and rewrite them. 287 288 if target.find(":") != -1: 289 prefix = "" 290 space, rest = target.split(":", 1) 291 if space not in URL_SCHEMES: 292 target = "%s/%s" % (space, rest) 293 294 # Detect anchors. 295 296 elif target.startswith("#"): 297 prefix = "" 298 299 # Detect attachments. 300 301 elif target.startswith("^"): 302 prefix = "attachment:" 303 304 # Link to other pages within a space. 305 306 else: 307 prefix = "../" 308 309 # Make the link tidier by making a target if none was given. 310 311 if not label: 312 label = target 313 314 if not label and not title: 315 return "[[%s%s]]" % (prefix, target) 316 elif not title: 317 return "[[%s%s|%s]]" % (prefix, target, label) 318 else: 319 return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title) 320 321 elif match.group("imagetext"): 322 parts = match.group("imagetext").split("|") 323 324 # NOTE: Proper detection of external links required. 325 326 if parts[0].startswith("http"): 327 prefix = "" 328 else: 329 prefix = "attachment:" 330 331 # NOTE: Proper options conversion required. 
332 333 if len(parts) == 1: 334 return "{{%s%s}}" % (prefix, parts[0]) 335 else: 336 return "{{%s%s|%s}}" % (prefix, parts[0], parts[1]) 337 338 elif match.group("italictext"): 339 return "''%s''" % translate_content(match.group("italictext")) 340 341 elif match.group("boldtext"): 342 return "'''%s'''" % translate_content(match.group("boldtext")) 343 344 elif match.group("deltext"): 345 return "--(%s)--" % translate_content(match.group("deltext")) 346 347 elif match.group("underlinetext"): 348 return "__%s__" % translate_content(match.group("underlinetext")) 349 350 elif match.group("subtext"): 351 return ",,%s,," % translate_content(match.group("subtext")) 352 353 else: 354 return translate_text(match.group()) 355 356 def translate_text(s, preformatted=False): 357 358 "Translate the plain text string 's', converting notation." 359 360 for before, after in preformatted and preformatted_notation_mapping or notation_mapping: 361 s = s.replace(before, after) 362 return s 363 364 def translate_content(text, sectiontype=None): 365 366 """ 367 Return a translation of the given 'text'. If the optional 'sectiontype' is 368 specified, the translation may be modified to a form appropriate to the 369 section being translated. 370 """ 371 372 parts = [] 373 preformatted = sectiontype in preformatted_sectiontypes 374 375 last = 0 376 for match in content_regexp.finditer(text): 377 start, end = match.span() 378 parts.append(translate_text(text[last:start], preformatted)) 379 380 # Handle unformatted sections. 381 382 if sectiontype in ("code", "noformat"): 383 parts.append(match.group()) 384 else: 385 parts.append(translate_content_match(match)) 386 387 last = end 388 389 parts.append(translate_text(text[last:], preformatted)) 390 return "".join(parts) 391 392 def translate_block(blocktype, blocktext): 393 394 "Translate the block with the given 'blocktype' and 'blocktext'." 395 396 parts = [] 397 398 # Translate headings and blockquotes. 
399 400 if blocktypes.has_key(blocktype): 401 parts.append(blocktypes[blocktype] % blocktext) 402 403 # Translate list items. 404 405 elif blocktype == "list": 406 for listmarker, listitem in get_list_items(blocktext): 407 parts.append("%s %s" % (translate_marker(listmarker), translate_content(listitem))) 408 409 # Translate table items. 410 411 elif blocktype == "table": 412 parts.append("{{{#!table") 413 first = True 414 for cellsep, columns in get_table_rows(blocktext): 415 if not first: 416 parts.append("==") 417 else: 418 first = False 419 moinsep = translate_cellsep(cellsep) 420 parts.append(moinsep.join([translate_cell(cellsep, column) for column in columns])) 421 parts.append("}}}") 422 423 # Handle anonymous blocks. 424 425 else: 426 parts.append(translate_content(blocktext)) 427 428 return "\n".join(parts) 429 430 def get_table_rows(text): 431 432 "Return a list of (cellsep, columns) tuples for the given table 'text'." 433 434 rows = [] 435 436 for row in text.split("|\n"): 437 if not row: 438 break 439 440 row += "|" 441 cellsep = None 442 columns = [""] 443 last = 0 444 for match in table_content_regexp.finditer(row): 445 start, end = match.span() 446 columns[-1] += row[last:start] 447 448 if match.group("celltype"): 449 if cellsep is None: 450 cellsep = match.group("celltype") 451 columns.append("") 452 else: 453 columns[-1] += match.group() 454 455 last = end 456 457 columns[-1] += row[last:] 458 459 if cellsep: 460 rows.append((cellsep, columns[1:-1])) 461 462 return rows 463 464 sectiontypes = { 465 "code" : "", 466 "noformat" : "", 467 "quote" : "", 468 "info" : "#!wiki important\n", 469 "note" : "#!wiki caution\n", 470 "tip" : "#!wiki tip\n", 471 "warning" : "#!wiki warning\n", 472 } 473 474 preformatted_sectiontypes = (None, "noformat") 475 476 macrotypes = { 477 "anchor" : "<<Anchor(%s)>>", 478 "color" : "<<Color(%s)>>", 479 } 480 481 # General parsing. 
def parse_text(s):

    """
    Parse the content in the string 's', returning the translation. Spacing is
    tracked so that consecutive blocks and sections are separated by blank
    lines where required.
    """

    parts = []

    # Control spacing between blocks and other blocks or sections.

    preceded_by_block = False

    # "regiontype" avoids shadowing the builtin "type".

    for regiontype, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if regiontype is None:
            if preceded_by_block:
                parts.append("\n")

            first = True
            for blocktype, blocktext in get_blocks(text):
                if not first:
                    parts.append("\n")
                else:
                    first = False
                parts.append("%s" % translate_block(blocktype, blocktext))

            # Only non-empty regions affect subsequent spacing.

            if not first:
                preceded_by_block = True

        # Handle sections.

        else:
            sectiontype, options = regiontype

            # Direct translations of sections.
            # (Membership tests replace dict.has_key, removed in Python 3.)

            if sectiontype in sectiontypes:
                if preceded_by_block:
                    parts.append("\n")
                mointype = sectiontypes[sectiontype]

                parts.append("{{{%s" % (mointype or ""))
                text = text.strip()

                # Sections containing newlines must have a separate header line.

                if options or "\n" in text:
                    parts.append("\n")

                if options:
                    parts.append("## %s\n" % options)
                parts.append(translate_content(text, sectiontype))
                parts.append("%s}}}\n" % ("\n" if mointype else ""))

                preceded_by_block = True

            # Translations of macros (which can look like sections).

            elif sectiontype in macrotypes:
                parts.append(macrotypes[sectiontype] % translate_content(text, sectiontype))
                preceded_by_block = False

            # Unrecognised sections.

            else:
                parts.append("{{{")

                # Sections containing newlines must have a separate header line.

                if "\n" in text and not text.startswith("\n"):
                    parts.append("\n")

                parts.append(translate_content(text, sectiontype))
                parts.append("}}}")
                preceded_by_block = False

    return "".join(parts)

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    out.write(parse_text(s))

if __name__ == "__main__":
    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4