#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA

--------

The basic procedure is as follows:

 1. Wiki pages are first split up into regions.
 2. Then, within these regions, the text is split into blocks.
    1. First, lists are identified.
    2. Additionally, other block-like elements are identified.
 3. Each block is then parsed.
"""

import re

# Section extraction.

sections_regexp_str = r"(?<!{){(?P<type>[^{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Split 's' into a list of regions, preserving the order of the text.

    Text lying outside any "{name}...{name}" section is recorded as a
    (None, text) tuple, and this includes the possibly empty text before,
    between and after sections. Each section found is recorded using the
    result of get_section_details for the matched text.
    """

    regions = []
    position = 0

    for found in sections_regexp.finditer(s):

        # The plain text preceding this section, then the section itself.

        regions.append((None, s[position:found.start()]))
        regions.append(get_section_details(found.group()))
        position = found.end()

    # Whatever remains after the final section is plain text.

    regions.append((None, s[position:]))
    return regions

# Section inspection.
section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form ((type, options), text),
    where 'options' is None if the opening tag has no ":options" part.

    If 's' does not match the section syntax, (None, s) is returned so that
    the text can be handled as an ordinary region.
    """

    match = section_regexp.match(s)
    if match is None:
        return None, s
    return (match.group("sectiontype"), match.group("options")), match.group("section")

# Heading, table and list extraction.

# NOTE: The first line of a list may be terminated by the end of the text as
# NOTE: well as by a newline, just like the continuation lines, so that a
# NOTE: single-item list without a trailing newline is still recognised.

list_regexp_str = r"^(?P<listtype>[*#-])[*#-]*.*(?:\n|$)((?P=listtype).*(?:\n|$))*"
table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples in document order.

    Text between recognised elements has a type of None. Lists have the type
    "list" and tables the type "table", with the whole matched text provided.
    Headings and blockquotes have the matched prefix (such as "h1" or "bq") as
    their type, with the text following the prefix provided; if that text is
    empty, the whole matched text is provided instead.
    """

    last = 0
    blocks = []

    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        # Exactly one alternative matched: see which named group was set.

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        blocks.append((None, s[last:start]))
        blocks.append((matchtype, match.group("text") or s[start:end]))
        last = end

    blocks.append((None, s[last:]))
    return blocks

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.
    """

    return [b for b in block_regexp.split(s) if b.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.

    The result is a list of (type, text) tuples: headings, lists and tables are
    passed through from get_block_elements, and the remaining text is divided
    into paragraphs having a type of None.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            blocks.extend([(None, block) for block in get_basic_blocks(blocktext)])

    return blocks

# List item inspection.

# NOTE: The repetition must be inside the group: a repeated group only retains
# NOTE: its last iteration, which would reduce every marker to one character
# NOTE: and discard the nesting depth.

listitem_regexp_str = r"^(?P<marker>[*#-]+)\s*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    """
    Return a list of (marker, text) tuples for the given list 'text', where
    each marker is the complete run of list characters starting a line (for
    example "**" for a second-level bullet).
    """

    return [
        (match.group("marker"), match.group("text"))
        for match in listitem_regexp.finditer(text)
        ]

# Table row inspection.

# NOTE: The opening bracket is escaped rather than written as a one-character
# NOTE: set in order to avoid the "possible nested set" warning issued by
# NOTE: recent versions of the re module; the strings matched are the same.

link_regexp_str = r"\[(?P<linktext>.*?)\]"
image_regexp_str = r"!(?P<imagetext>.*?)!"
cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

content_regexp_str = (
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    )

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def translate_content_match(match):

    """
    Translate the content described by the given 'match', returning a string.

    Link text is split on "|" and interpreted as "target", "label|target" or
    "label|target|title". Image text is split in the same way. Matches having
    empty link or image text are returned unchanged.
    """

    linktext = match.group("linktext")
    imagetext = match.group("imagetext")

    if linktext:
        parts = linktext.split("|")

        # The target is the only part present or, otherwise, the second part.
        # The prefix must be chosen by inspecting the target, not the label.

        if len(parts) == 1:
            target = parts[0]
        else:
            target = parts[1]

        # NOTE: Proper detection of external links required.

        if target.startswith("http"):
            prefix = ""
        elif target.startswith("#"):
            prefix = ""
        elif target.startswith("^"):

            # The caret only marks the target as an attachment and is not part
            # of the attachment's name.

            prefix = "attachment:"
            target = target[1:]
        else:
            prefix = "../"

        if len(parts) == 1:
            return "[[%s%s]]" % (prefix, target)
        elif len(parts) == 2:
            return "[[%s%s|%s]]" % (prefix, target, parts[0])
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, parts[0], parts[2])

    elif imagetext:
        parts = imagetext.split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.
        # NOTE(review): When a second part is present, it is emitted in the
        # NOTE(review): target position and the first part after it, as is done
        # NOTE(review): for links; confirm that this is intended for images.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[1], parts[0])

    else:
        return match.group()

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text', with
    one tuple per line containing at least one cell separator.

    The 'cellsep' is the first separator found on the line ("|" or "||") and
    'columns' holds the untranslated text found between separators. Separators
    appearing inside links or images do not divide cells.
    """

    rows = []

    for line in text.split("\n"):
        cellsep = None
        columns = [""]
        last = 0

        for match in table_content_regexp.finditer(line):
            start, end = match.span()
            columns[-1] += line[last:start]

            if match.group("celltype"):

                # A separator: remember the first one and start a new column.

                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")
            else:

                # A link or image: keep it intact within the current column.

                columns[-1] += match.group()

            last = end

        columns[-1] += line[last:]

        # Discard the text before the first and after the last separator.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

def translate_content(text):

    """
    Return a translation of the given 'text', replacing each link and image
    with the result of translate_content_match and leaving other text as it is.
    """

    return content_regexp.sub(translate_content_match, text)

# Translation helpers.

blocktypes = {
    "h1" : "= %s =",
    "h2" : "== %s ==",
    "h3" : "=== %s ===",
    "h4" : "==== %s ====",
    "h5" : "===== %s =====",
    "h6" : "====== %s ======",
    "bq" : "{{{%s}}}",
    }

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    """
    Translate the given 'marker' to a suitable Moin representation: one space
    of indentation for each character in the marker followed by the Moin
    marker corresponding to the last character.
    """

    return " " * len(marker) + markers[marker[-1]]

cellseps = {
    "|" : "||",
    "||" : "||",
    }

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    """
    Using 'cellsep', translate the cell 'text', wrapping the translated text
    in the extra markup defined for that kind of separator.
    """

    return cellextra[cellsep] + translate_content(text) + cellextra[cellsep]

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }

# General parsing.

def _leaves_softspace(text):

    """
    Return whether a Python 2 print statement ending with a comma would have
    put a space between 'text' and the next item printed. This is the case
    unless 'text' ends with a whitespace character other than a space.
    """

    return not text or not text[-1].isspace() or text[-1] == " "

def _write_blocks(text, out):

    "Write a translation of the blocks found in the plain 'text' to 'out'."

    for blocktype, blocktext in get_blocks(text):

        # Translate headings and blockquotes.

        if blocktype in blocktypes:
            out.write(blocktypes[blocktype] % blocktext + "\n")

        # Translate list items.

        elif blocktype == "list":
            for listmarker, listitem in get_list_items(blocktext):
                out.write("%s %s\n" % (translate_marker(listmarker), translate_content(listitem)))

        # Translate table items.

        elif blocktype == "table":
            for cellsep, columns in get_table_rows(blocktext):
                moinsep = translate_cellsep(cellsep)
                cells = [translate_cell(cellsep, column) for column in columns]
                out.write(moinsep + moinsep.join(cells) + moinsep + "\n")

        # Handle anonymous blocks.

        else:
            out.write(translate_content(blocktext.rstrip()) + "\n")

        # Separate each block from the next using a blank line.

        out.write("\n")

def _write_section(sectiontype, options, text, out):

    """
    Write a translation of the section having the given 'sectiontype',
    'options' and 'text' to 'out'.
    """

    content = translate_content(text)

    # Direct translations of sections.

    mointype = sectiontypes.get(sectiontype)

    if mointype:
        out.write("{{{#!%s\n" % mointype)
        if options:
            out.write("## %s\n" % options)
        opening = ""

    # Other sections, including unrecognised ones, become plain "{{{" regions.
    # The space after the opening reproduces what the original print statement
    # with a trailing comma emitted before the next item.

    else:
        opening = "{{{ "

    # Similarly, a space precedes the closing marker unless the content ends
    # with a newline or other non-space whitespace.

    if _leaves_softspace(content):
        closing = " }}}\n"
    else:
        closing = "}}}\n"

    out.write(opening + content + closing)
    out.write("\n")

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    Only the write method of 'out' is used, making this function usable with
    both Python 2 and Python 3 while producing the same text as the original
    print statements did.
    """

    for region_type, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if region_type is None:
            _write_blocks(text, out)

        # Handle sections.

        else:
            sectiontype, options = region_type
            _write_section(sectiontype, options, text, out)

if __name__ == "__main__":
    import sys

    s = sys.stdin.read()
    parse(s, sys.stdout)

# vim: tabstop=4 expandtab shiftwidth=4