#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

--------

The basic procedure is as follows:

1. Wiki pages are first split up into regions.
2. Then, within these regions, the text is split into blocks.
   1. First, lists are identified.
   2. Additionally, other block-like elements are identified.
3. Each block is then parsed.
"""

from common import *
import re
import sys
import codecs

# Section extraction.

# A section opens with {type} or {type:options} (where the brace is not part
# of a doubled brace) and runs to the matching closing {type} marker.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text): plain text between sections is given a type of None,
    whereas sections are described using get_section_details.
    """

    regions = []
    pos = 0

    # Alternate between the plain text preceding each section and the section
    # itself.

    for match in sections_regexp.finditer(s):
        regions.append((None, s[pos:match.start()]))
        regions.append(get_section_details(s[match.start():match.end()]))
        pos = match.end()

    # Any trailing text also forms a plain region.

    regions.append((None, s[pos:]))
    return regions

# Section inspection.
# A single section: {type} or {type:options}, the section body, and the
# closing {type} marker.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    "Return the details of a section 's' in the form (type, text)."

    found = section_regexp.match(s)
    if not found:
        return None, s
    return (found.group("sectiontype"), found.group("options")), found.group("section")

# Heading, table and list extraction.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples where intervening plain regions have a type of
    None.
    """

    blocks = []
    pos = 0

    for match in blockelement_regexp.finditer(s):

        # Classify the match according to which alternative captured.

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        blocks.append((None, s[pos:match.start()]))
        blocks.append((matchtype, match.group("text") or match.group()))
        pos = match.end()

    blocks.append((None, s[pos:]))
    return blocks

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.
    """

    return [block for block in block_regexp.split(s) if block.strip()]

# Block inspection.
def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Headings, lists and tables pass through unchanged; anonymous regions
        # are split further on blank lines.

        if blocktype is None:
            blocks.extend([(None, basic) for basic in get_basic_blocks(blocktext)])
        else:
            blocks.append((blocktype, blocktext))

    return blocks

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [(found.group("marker"), found.group("text"))
            for found in listitem_regexp.finditer(text)]

# Content inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.
# Emphasis markers appear either unbracketed (in which case they must not be
# adjacent to word characters) or in the explicit brace-quoted form such as
# {*}.

italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    "|"
    "(" + italic_regexp_str + ")"
    "|"
    "(" + bold_regexp_str + ")"
    "|"
    "(" + del_regexp_str + ")"
    "|"
    "(" + underline_regexp_str + ")"
    "|"
    "(" + sub_regexp_str + ")"
    )

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

# Notation conversion: simple textual replacements applied to plain text.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\" "\n", "<<BR>>"),     # forced line break at the end of a line
    (r"\\ ", "<<BR>>"),         # forced line break within a line
    (r"\~", "~"),
    ]

# Translation helpers.

# Mapping from Confluence list marker characters to Moin markers.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    "Translate the given 'marker' to a suitable Moin representation."

    # Indentation depth follows the length of the Confluence marker run.

    return " " * len(marker) + markers[marker[-1]]

# Cell separator and decoration details by Confluence separator type: "||"
# introduces a header cell which is emboldened in the Moin output.

cellseps = {
    "|" : "\n|| ",
    "||" : "\n|| ",
    }

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    "Using 'cellsep', translate the cell 'text'."

    return cellextra[cellsep] + parse_text(text).strip() + cellextra[cellsep]

def translate_content_match(match):

    "Translate the content described by the given 'match', returning a string."

    # Monospaced text.

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    # Links of the form [label|target|title], label and title being optional.

    elif match.group("linktext"):
        parts = match.group("linktext").split("|")

        # NOTE: Proper detection of external links required.

        if len(parts) == 1:
            label, target, title = None, parts[0], None
        elif len(parts) == 2:
            (label, target), title = parts, None
        else:
            label, target, title = parts

        target = target.strip()

        # Look for namespace links and rewrite them.

        if target.find(":") != -1:
            prefix = ""
            space, rest = target.split(":", 1)
            if space not in URL_SCHEMES:
                target = "%s/%s" % (space, rest)

        # Detect anchors.

        elif target.startswith("#"):
            prefix = ""

        # Detect attachments.

        elif target.startswith("^"):
            prefix = "attachment:"

        # Link to other pages within a space.

        else:
            prefix = "../"

        # Emit the tidy form where neither label nor title was given. This
        # test must precede the substitution of the target as the label,
        # since otherwise the tidy form would never be produced.

        if not label and not title:
            return "[[%s%s]]" % (prefix, target)

        # Make a label from the target if none was given.

        if not label:
            label = target

        if not title:
            return "[[%s%s|%s]]" % (prefix, target, label)
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

    # Images of the form !target|options!.

    elif match.group("imagetext"):
        parts = match.group("imagetext").split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

    # Emphasis, with the enclosed text translated recursively.

    elif match.group("italictext"):
        return "''%s''" % translate_content(match.group("italictext"))

    elif match.group("boldtext"):
        return "'''%s'''" % translate_content(match.group("boldtext"))

    elif match.group("deltext"):
        return "--(%s)--" % translate_content(match.group("deltext"))

    elif match.group("underlinetext"):
        return "__%s__" % translate_content(match.group("underlinetext"))

    elif match.group("subtext"):
        return ",,%s,," % translate_content(match.group("subtext"))

    # Plain text only needs notation conversion.

    else:
        return translate_text(match.group())

def translate_text(s):

    "Translate the plain text string 's', converting notation."

    for before, after in notation_mapping:
        s = s.replace(before, after)
    return s

def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated.
    """

    parts = []

    last = 0
    for match in content_regexp.finditer(text):
        start, end = match.span()
        parts.append(translate_text(text[last:start]))

        # Handle unformatted sections: markup is passed through untranslated.

        if sectiontype in ("code", "noformat"):
            parts.append(match.group())
        else:
            parts.append(translate_content_match(match))

        last = end

    parts.append(translate_text(text[last:]))
    return "".join(parts)

def translate_block(blocktype, blocktext):

    "Translate the block with the given 'blocktype' and 'blocktext'."

    parts = []

    # Translate headings and blockquotes.
    # (has_key is replaced with the in operator for Python 3 compatibility.)

    if blocktype in blocktypes:
        parts.append(blocktypes[blocktype] % blocktext)

    # Translate list items.

    elif blocktype == "list":
        for listmarker, listitem in get_list_items(blocktext):
            parts.append("%s %s" % (translate_marker(listmarker), translate_content(listitem)))

    # Translate table items.

    elif blocktype == "table":
        parts.append("{{{#!table")
        first = True
        for cellsep, columns in get_table_rows(blocktext):
            if not first:
                parts.append("==")
            else:
                first = False
            moinsep = translate_cellsep(cellsep)
            parts.append(moinsep.join([translate_cell(cellsep, column) for column in columns]))
        parts.append("}}}")

    # Handle anonymous blocks.

    else:
        parts.append(translate_content(blocktext))

    return "\n".join(parts)

def get_table_rows(text):

    "Return a list of (cellsep, columns) tuples for the given table 'text'."

    rows = []

    for row in text.split("|\n"):
        if not row:
            break

        # Restore the separator removed by the split above.

        row += "|"
        cellsep = None
        columns = [""]
        last = 0

        for match in table_content_regexp.finditer(row):
            start, end = match.span()
            columns[-1] += row[last:start]

            # Cell separators start new cells; other content is retained
            # verbatim for later translation.

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")
            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += row[last:]

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

# Section types mapped to Moin parser/format headers. Empty values indicate
# plain verbatim regions.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "#!wiki important\n",
    "note" : "#!wiki caution\n",
    "tip" : "#!wiki tip\n",
    "warning" : "#!wiki warning\n",
    }

# Macros which employ section-like syntax.

macrotypes = {
    "anchor" : "<<Anchor(%s)>>",
    "color" : "<<Color(%s)>>",
    }

# General parsing.

def parse_text(s):

    "Parse the content in the string 's', returning the translation."

    parts = []

    # Control spacing between blocks and other blocks or sections.

    preceded_by_block = False

    for regiontype, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if regiontype is None:
            if preceded_by_block:
                parts.append("\n")

            first = True
            for blocktype, blocktext in get_blocks(text):
                if not first:
                    parts.append("\n")
                else:
                    first = False
                parts.append("%s" % translate_block(blocktype, blocktext))

            if not first:
                preceded_by_block = True

        # Handle sections.

        else:
            sectiontype, options = regiontype

            # Direct translations of sections.

            if sectiontype in sectiontypes:
                if preceded_by_block:
                    parts.append("\n")
                mointype = sectiontypes[sectiontype]

                parts.append("{{{%s" % (mointype or ""))
                text = text.strip()

                # Sections containing newlines must have a separate header line.

                if options or text.find("\n") != -1:
                    parts.append("\n")

                if options:
                    parts.append("## %s\n" % options)
                parts.append(translate_content(text, sectiontype))
                parts.append("%s}}}\n" % (mointype and "\n" or ""))

                preceded_by_block = True

            # Translations of macros (which can look like sections).

            elif sectiontype in macrotypes:
                parts.append(macrotypes[sectiontype] % translate_content(text, sectiontype))
                preceded_by_block = False

            # Unrecognised sections.

            else:
                parts.append("{{{")

                # Sections containing newlines must have a separate header line.

                if text.find("\n") != -1 and not text.startswith("\n"):
                    parts.append("\n")

                parts.append(translate_content(text, sectiontype))
                parts.append("}}}")
                preceded_by_block = False

    return "".join(parts)

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    out.write(parse_text(s))

if __name__ == "__main__":
    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4