ConfluenceConverter (annotate wikiparser.py in 371d25b0f062)

ConfluenceConverter

Annotated wikiparser.py

41:371d25b0f062

2013-03-02

Paul Boddie

Added XHTML table support; fixed Wiki markup list recognition, avoiding bold formatting conflicts; added tests of XHTML tables and Wiki markup lists. Added UTF-8 output support to the test programs.

paul@6	1	#!/usr/bin/env python
paul@6	2
paul@7	3	"""
paul@7	4	Confluence Wiki syntax parsing.
paul@7	5
paul@34	6	Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
paul@8	7
paul@8	8	This software is free software; you can redistribute it and/or
paul@8	9	modify it under the terms of the GNU General Public License as
paul@8	10	published by the Free Software Foundation; either version 2 of
paul@8	11	the License, or (at your option) any later version.
paul@8	12
paul@8	13	This software is distributed in the hope that it will be useful,
paul@8	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@8	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
paul@8	16	GNU General Public License for more details.
paul@8	17
paul@8	18	You should have received a copy of the GNU General Public
paul@8	19	License along with this library; see the file LICENCE.txt
paul@8	20	If not, write to the Free Software Foundation, Inc.,
paul@8	21	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
paul@8	22
paul@8	23	--------
paul@8	24
paul@8	25	The basic procedure is as follows:
paul@8	26
paul@7	27	1. Wiki pages are first split up into regions.
paul@7	28	2. Then, within these regions, the text is split into blocks.
paul@7	29	1. First, lists are identified.
paul@7	30	2. Additionally, other block-like elements are identified.
paul@7	31	3. Each block is then parsed.
paul@7	32	"""
paul@7	33
paul@35	34	from common import *
paul@6	35	import re
paul@25	36	import sys
paul@41	37	import codecs
paul@19	38
# Section extraction.

# A section has the form "{type[:options]}...{type}". The type may not contain
# the inline formatting characters (-, _, *, +), braces, newlines or colons, so
# bracketed formatting markers such as "{*}" and monospace text "{{...}}" are
# never mistaken for section delimiters.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is a two-element tuple.

    Text found outside any section is returned as (None, text). Text matched
    as a section is handed to get_section_details, whose result is used as the
    region. The list always ends with a (None, text) entry for the text
    following the last section, even if that text is empty.
    """

    last = 0
    regions = []
    for match in sections_regexp.finditer(s):
        start, end = match.span()

        # Text between the previous section (or the start) and this section.

        regions.append((None, s[last:start]))
        regions.append(get_section_details(s[start:end]))
        last = end

    regions.append((None, s[last:]))
    return regions

# Section inspection.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's'.

    Where 's' starts with "{type[:options]}" and contains a matching closing
    "{type}", the result is ((sectiontype, options), text), with options being
    None if none were given. Otherwise, the result is (None, s).
    """

    match = section_regexp.match(s)
    if match:
        return (match.group("sectiontype"), match.group("options")), match.group("section")
    else:
        return None, s

# Heading, table and list extraction.

# List markers must be followed by whitespace so that bold text at the start of
# a line ("*bold* text") is not treated as a list item.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's'.

    Return a list of (blocktype, text) tuples in document order. Recognised
    elements have a blocktype of "list", "table" or the matched heading or
    blockquote prefix (such as "h1" or "bq"); the text between them is
    returned with a blocktype of None. For headings and blockquotes, the text
    is the content following the prefix; for lists and tables, it is the
    complete matched text.
    """

    last = 0
    blocks = []
    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        # Only one of the alternatives can have matched.

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        blocks.append((None, s[last:start]))
        blocks.append((matchtype, match.group("text") or s[start:end]))
        last = end

    blocks.append((None, s[last:]))
    return blocks

# Block extraction.

# One or more consecutive lines containing only whitespace.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines. Blocks consisting only of whitespace are
    discarded.
    """

    return [b for b in block_regexp.split(s) if b.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.

    The result is a list of (blocktype, text) tuples: headings, lists and
    tables keep the type given by get_block_elements, whereas the remaining
    text is split on blank lines into blocks having a type of None.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            blocks.extend([(None, block) for block in get_basic_blocks(blocktext)])

    return blocks

# List item inspection.

# The marker includes any leading spaces together with the run of list
# characters (-, * or #) introducing the item.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    """
    Return a list of (marker, text) tuples for the given list 'text', with one
    tuple for each line that starts with a list marker.
    """

    return [
        (match.group("marker"), match.group("text"))
        for match in listitem_regexp.finditer(text)
        ]

# Content inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"

# The brackets are escaped instead of using "[[]", which newer versions of the
# re module report as a possible nested set.

link_regexp_str = r"\[(?P<linktext>.*?)\]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.

italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

# The alternatives are tried in the order given here.

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    "|"
    "(" + italic_regexp_str + ")"
    "|"
    "(" + bold_regexp_str + ")"
    "|"
    "(" + del_regexp_str + ")"
    "|"
    "(" + underline_regexp_str + ")"
    "|"
    "(" + sub_regexp_str + ")"
    )

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Within table rows, cell separators are recognised only where no inline
# content pattern matches first, so that "|" inside links and images is not
# treated as a separator.

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

# Notation conversion.

# Pairs of (Confluence notation, replacement text), applied in order using
# plain string replacement. A double backslash at the end of a line becomes a
# line break macro; a double backslash followed by a space becomes a space.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\""\n", "<<BR>> "),
    (r"\\ ", " "),
    ]

# Translation helpers.

# Mapping from the final character of a list marker to the item prefix used
# in the translated output.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    """
    Translate the given 'marker' to a suitable Moin representation.

    The result is indented by one space for each character in 'marker', and
    the kind of list item is chosen by the last character of 'marker'. A
    KeyError is raised if that character is not one of "*", "#" or "-".
    """

    return " " * len(marker) + markers[marker[-1]]

# Both kinds of cell separator are written as "||" in the output.

cellseps = {
    "|" : "||",
    "||" : "||",
    }

# Text placed on both sides of a cell's content: cells in rows using "||" are
# wrapped in bold markup.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    """
    Translate the given 'cellsep' to a suitable Moin representation. A
    KeyError is raised if 'cellsep' is neither "|" nor "||".
    """

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    """
    Using 'cellsep', translate the cell 'text'.

    The text is parsed as complete Wiki content and the result is wrapped in
    the extra markup defined for 'cellsep'.
    """

    extra = cellextra[cellsep]
    return extra + parse_text(text) + extra

def _translate_link(linktext):

    "Return the Moin link for the Confluence link content 'linktext'."

    # The content is "target", "label|target" or "label|target|title". The
    # split is limited so that any further "|" characters remain in the title
    # instead of causing an unpacking error.

    parts = linktext.split("|", 2)

    # NOTE: Proper detection of external links required.

    if len(parts) == 1:
        label, target, title = None, parts[0], None
    elif len(parts) == 2:
        (label, target), title = parts, None
    else:
        label, target, title = parts

    target = target.strip()

    # Look for namespace links and rewrite them.
    # URL_SCHEMES is not defined in this module; presumably it is provided by
    # "from common import *".

    if ":" in target:
        prefix = ""
        space, rest = target.split(":", 1)
        if space not in URL_SCHEMES:
            target = "%s/%s" % (space, rest)

    # Detect anchors.

    elif target.startswith("#"):
        prefix = ""

    # Detect attachments.
    # NOTE(review): The leading "^" is kept in the target - confirm that this
    # NOTE(review): is the intended output.

    elif target.startswith("^"):
        prefix = "attachment:"

    # Link to other pages within a space.

    else:
        prefix = "../"

    # Make the link tidier by making a label if none was given.

    if not label:
        label = target

    if not label and not title:
        return "[[%s%s]]" % (prefix, target)
    elif not title:
        return "[[%s%s|%s]]" % (prefix, target, label)
    else:
        return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

def _translate_image(imagetext):

    "Return the Moin image markup for the Confluence image content 'imagetext'."

    parts = imagetext.split("|")

    # NOTE: Proper detection of external links required.

    if parts[0].startswith("http"):
        prefix = ""
    else:
        prefix = "attachment:"

    # NOTE: Proper options conversion required.

    if len(parts) == 1:
        return "{{%s%s}}" % (prefix, parts[0])
    else:
        return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

def translate_content_match(match):

    """
    Translate the content described by the given 'match', returning a string.

    The 'match' must provide the named groups of the content patterns. The
    first non-empty group, tested in the order monotext, linktext, imagetext,
    italictext, boldtext, deltext, underlinetext and subtext, selects the
    translation. Where none of these groups is set, the matched text is
    translated as plain text.
    """

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    elif match.group("linktext"):
        return _translate_link(match.group("linktext"))

    elif match.group("imagetext"):
        return _translate_image(match.group("imagetext"))

    # Formatted text may itself contain further inline content.

    elif match.group("italictext"):
        return "''%s''" % translate_content(match.group("italictext"))

    elif match.group("boldtext"):
        return "'''%s'''" % translate_content(match.group("boldtext"))

    elif match.group("deltext"):
        return "--(%s)--" % translate_content(match.group("deltext"))

    elif match.group("underlinetext"):
        return "__%s__" % translate_content(match.group("underlinetext"))

    elif match.group("subtext"):
        return ",,%s,," % translate_content(match.group("subtext"))

    else:
        return translate_text(match.group())

def translate_text(s):

    """
    Translate the plain text string 's', converting notation.

    Each (before, after) pair in notation_mapping is applied in turn as a
    plain string replacement, and the resulting string is returned.
    """

    for before, after in notation_mapping:
        s = s.replace(before, after)
    return s

def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated.

    Text between inline content matches is translated as plain text. Within
    "code" and "noformat" sections, the matched inline content is copied
    unchanged instead of being translated.
    """

    parts = []

    last = 0
    for match in content_regexp.finditer(text):
        start, end = match.span()
        parts.append(translate_text(text[last:start]))

        # Handle unformatted sections.

        if sectiontype in ("code", "noformat"):
            parts.append(match.group())
        else:
            parts.append(translate_content_match(match))

        last = end

    parts.append(translate_text(text[last:]))
    return "".join(parts)

def translate_block(blocktype, blocktext):

    """
    Translate the block with the given 'blocktype' and 'blocktext', returning
    the translated lines joined by newlines and terminated by a newline.
    """

    parts = []

    # Translate headings and blockquotes.
    # blocktypes is not defined in this module; presumably it is provided by
    # "from common import *". A membership test is used because has_key is
    # not available on dictionaries in Python 3.

    if blocktype in blocktypes:
        parts.append(blocktypes[blocktype] % blocktext)

    # Translate list items.

    elif blocktype == "list":
        for listmarker, listitem in get_list_items(blocktext):
            parts.append("%s %s" % (translate_marker(listmarker), translate_content(listitem)))

    # Translate table items.

    elif blocktype == "table":
        parts.append("{{{#!table")
        first = True
        for cellsep, columns in get_table_rows(blocktext):

            # Rows after the first are introduced by a row separator.

            if not first:
                parts.append("==")
            else:
                first = False

            moinsep = translate_cellsep(cellsep)
            parts.append(moinsep.join([translate_cell(cellsep, column) for column in columns]))
        parts.append("}}}")

    # Handle anonymous blocks.

    else:
        parts.append(translate_content(blocktext).rstrip())

    return "\n".join(parts) + "\n"

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text'.

    Rows are separated by a "|" followed by a newline. For each row, cellsep is
    the first cell separator found ("|" or "||") and columns is the list of
    cell texts between the first and last separators. Processing stops at the
    first empty row, and rows without any separator are omitted.
    """

    rows = []

    for row in text.split("|\n"):
        if not row:
            break

        # Restore the separator character removed by the split.

        row += "|"
        cellsep = None
        columns = [""]
        last = 0
        for match in table_content_regexp.finditer(row):
            start, end = match.span()
            columns[-1] += row[last:start]

            # A cell separator starts a new column; the first one found
            # determines the separator type for the row.

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")

            # Inline content is kept within the current column, so that "|"
            # inside links and images does not split the cell.

            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += row[last:]

        # Discard the text before the first and after the last separator.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

# Mapping from section types to the text following "{{{#!" when opening the
# translated section. Types mapped to an empty string are opened using a plain
# "{{{" instead.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }

# General parsing.

def parse_text(s):

    """
    Parse the content in the string 's', returning the translation.

    Text outside sections is split into blocks, each of which is translated
    and followed by a blank line. Sections are written within "{{{" and "}}}",
    using a "#!" processor line where the section type has a direct
    translation.
    """

    parts = []

    for regiontype, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if regiontype is None:
            for blocktype, blocktext in get_blocks(text):
                parts.append("%s\n" % translate_block(blocktype, blocktext))

        # Handle sections.

        else:
            sectiontype, options = regiontype

            # Direct translations of sections.

            mointype = sectiontypes.get(sectiontype)
            if mointype:
                parts.append("{{{#!%s\n" % mointype)

                # Section options are preserved as a comment line.

                if options:
                    parts.append("## %s\n" % options)
            else:
                parts.append("{{{")

            parts.append(translate_content(text, sectiontype))
            parts.append("}}}\n")

    return "".join(parts)

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out', which
    must provide a write method accepting the translated string.
    """

    out.write(parse_text(s))

if __name__ == "__main__":

    # Read and write UTF-8 using the underlying byte streams where the
    # standard streams provide them, falling back to the streams themselves
    # otherwise. The input is decoded so that non-ASCII text can be encoded
    # again by the output writer.

    stdin = getattr(sys.stdin, "buffer", sys.stdin)
    stdout = getattr(sys.stdout, "buffer", sys.stdout)

    s = codecs.getreader("utf-8")(stdin).read()
    out = codecs.getwriter("utf-8")(stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4