#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA

--------

The basic procedure is as follows:

 1. Wiki pages are first split up into regions.
 2. Then, within these regions, the text is split into blocks.
    1. First, lists are identified.
    2. Additionally, other block-like elements are identified.
 3. Each block is then parsed.
"""

import re

# Section extraction.

sections_regexp_str = r"(?<!{)(?P<section>{(?P<type>[^{}\n]+)}.*?{(?P=type)})"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Split 's' into an ordered list of (type, text) regions.

    Text lying outside any "{name}...{name}" section is reported with a type
    of None; each section found is reported using whatever
    get_section_details returns for the matched text. Text before, between
    and after sections is always included, even when it is empty.
    """

    regions = []
    position = 0

    for found in sections_regexp.finditer(s):

        # The plain text preceding this section, then the section itself.

        regions += [
            (None, s[position:found.start()]),
            get_section_details(found.group(0)),
        ]
        position = found.end()

    # Whatever remains after the final section (or the whole string).

    regions.append((None, s[position:]))
    return regions

# Section inspection.
section_regexp_str = r"{(?P<sectiontype>[^\n]*?)}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text), where the
    type is the name found between the opening braces and the text is the
    content enclosed by the opening and closing markers. If 's' does not start
    with a section, (None, s) is returned instead.
    """

    match = section_regexp.match(s)
    if match:
        return match.group("sectiontype"), match.group("section")
    else:
        return None, s

# Heading, table and list extraction.

# NOTE: The first list line may be terminated by the end of the text as well as
# NOTE: by a newline, so that a single-item list ending the text is recognised.

list_regexp_str = r"^(?P<listtype>[*#-])[*#-]*.*(?:\n|$)((?P=listtype).*(?:\n|$))*"
table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, blockquotes, tables and lists from the given string 's',
    returning a list of (type, text) tuples.

    Lists have the type "list" and tables the type "table", both with the
    complete matched text. Headings and blockquotes have the type given by
    their prefix ("h1", "bq", ...) with the text following the prefix, or the
    complete matched text if that is empty. The text before, between and after
    these elements is given the type None and may be empty.
    """

    last = 0
    blocks = []
    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        # Only one of the alternatives can have matched, so the named group
        # that participated identifies the kind of element.

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        blocks.append((None, s[last:start]))
        blocks.append((matchtype, match.group("text") or s[start:end]))
        last = end
    blocks.append((None, s[last:]))
    return blocks

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines. Blocks consisting only of whitespace are
    discarded.
    """

    return [b for b in block_regexp.split(s) if b.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.

    The result is a list of (type, text) tuples: headings, blockquotes, lists
    and tables keep the type assigned by get_block_elements, whereas the
    remaining text is split on blank lines into blocks of type None.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            for block in get_basic_blocks(blocktext):
                blocks.append((None, block))

    return blocks

# List item inspection.

# NOTE: The repetition belongs inside the marker group: repeating the group
# NOTE: itself would retain only the final marker character and thereby lose
# NOTE: the nesting depth of the item.

listitem_regexp_str = r"^(?P<marker>[*#-]+)\s*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    """
    Return a list of (marker, text) tuples for the given list 'text'. Each
    marker is the complete run of marker characters starting a line (such as
    "*" or "**"), so that its length indicates the nesting depth of the item.
    """

    return [
        (match.group("marker"), match.group("text"))
        for match in listitem_regexp.finditer(text)
        ]

# Table row inspection.

# NOTE: Links and images are matched before cell separators so that a "|"
# NOTE: inside them is not taken to be a cell boundary.

link_regexp_str = r"\[(?P<linktext>.*?)\]"
image_regexp_str = r"!(?P<imagetext>.*?)!"
cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"
content_regexp = re.compile(
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    "|"
    "(" + cellsep_regexp_str + ")"
    )

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text'. The
    cellsep is the first separator found in a row ("|" or "||") and the columns
    are the texts found between the separators of that row. Lines without any
    separator are ignored.
    """

    rows = []

    for line in text.split("\n"):
        cellsep = None
        columns = [""]
        last = 0
        for match in content_regexp.finditer(line):
            start, end = match.span()
            columns[-1] += line[last:start]

            # A separator starts a new column; the first one seen determines
            # the separator reported for the row.

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")

            # Links and images are retained as part of the current column.

            else:
                columns[-1] += line[start:end]

            last = end

        columns[-1] += line[last:]

        # Discard the text before the first and after the last separator.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

# General parsing and translation.
# Moin templates for headings and blockquotes, each taking the block text.

blocktypes = {
    "h1" : "= %s =",
    "h2" : "== %s ==",
    "h3" : "=== %s ===",
    "h4" : "==== %s ====",
    "h5" : "===== %s =====",
    "h6" : "====== %s ======",
    "bq" : "{{{%s}}}",
    }

# Moin list markers for each Confluence list marker character.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    """
    Translate the given 'marker' to a suitable Moin representation: one space
    of indentation for each character in the marker followed by the Moin marker
    corresponding to the final character.
    """

    return " " * len(marker) + markers[marker[-1]]

# Moin cell separators for each Confluence cell separator.

cellseps = {
    "|" : "||",
    "||" : "||",
    }

# Markup placed around cell text: header cells ("||") are emphasised.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    """
    Using 'cellsep', translate the cell 'text', wrapping it in the markup
    associated with that kind of separator.
    """

    return cellextra[cellsep] + text + cellextra[cellsep]

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'. Only
    the write method of 'out' is used, and every translated block or section is
    followed by a blank line.
    """

    for regiontype, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if regiontype is None:
            for blocktype, blocktext in get_blocks(text):

                # Translate headings and blockquotes.

                if blocktype in blocktypes:
                    out.write((blocktypes[blocktype] % blocktext) + "\n")

                # Translate list items.

                elif blocktype == "list":
                    for listmarker, listitem in get_list_items(blocktext):
                        out.write("%s %s\n" % (translate_marker(listmarker), listitem))

                # Translate table items.

                elif blocktype == "table":
                    for cellsep, columns in get_table_rows(blocktext):
                        moinsep = translate_cellsep(cellsep)
                        cells = [translate_cell(cellsep, column) for column in columns]
                        out.write(moinsep + moinsep.join(cells) + moinsep + "\n")

                # Handle anonymous blocks.

                else:
                    out.write(blocktext.rstrip() + "\n")

                out.write("\n")

        # Handle sections. The text is written exactly as found so that no
        # padding is introduced into what is supposed to be verbatim content.

        else:
            out.write("{{{%s}}}\n" % text)
            out.write("\n")

if __name__ == "__main__":
    import sys

    s = sys.stdin.read()
    parse(s, sys.stdout)

# vim: tabstop=4 expandtab shiftwidth=4