ConfluenceConverter (annotate wikiparser.py in aa8a26c5de70)

ConfluenceConverter

Annotated wikiparser.py

37:1eb319255ae9

36:aa8a26c5de70

2013-02-23

Paul Boddie

Added translation of various text styles. Fixed recognition of lists at the end of page regions.

paul@6	1	#!/usr/bin/env python
paul@6	2
paul@7	3	"""
paul@7	4	Confluence Wiki syntax parsing.
paul@7	5
paul@34	6	Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
paul@8	7
paul@8	8	This software is free software; you can redistribute it and/or
paul@8	9	modify it under the terms of the GNU General Public License as
paul@8	10	published by the Free Software Foundation; either version 2 of
paul@8	11	the License, or (at your option) any later version.
paul@8	12
paul@8	13	This software is distributed in the hope that it will be useful,
paul@8	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@8	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
paul@8	16	GNU General Public License for more details.
paul@8	17
paul@8	18	You should have received a copy of the GNU General Public
paul@8	19	License along with this library; see the file LICENCE.txt
paul@8	20	If not, write to the Free Software Foundation, Inc.,
paul@8	21	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
paul@8	22
paul@8	23	--------
paul@8	24
paul@8	25	The basic procedure is as follows:
paul@8	26
paul@7	27	1. Wiki pages are first split up into regions.
paul@7	28	2. Then, within these regions, the text is split into blocks.
paul@7	29	    1. First, lists are identified.
paul@7	30	    2. Additionally, other block-like elements are identified.
paul@7	31	3. Each block is then parsed.
paul@7	32	"""
paul@7	33
paul@35	34	from common import *
paul@6	35	import re
paul@25	36	import sys
paul@19	37
# Section extraction.

# A section is a macro-style region written as {type}...{type} or as
# {type:options}...{type}. The type may not contain the style characters
# "-", "*", "_" and "+", so that bracketed text styles such as {*}bold{*} are
# left to the content translation instead of being mistaken for sections.
# The section body is matched lazily so that each section ends at the first
# closing marker of the same type.

sections_regexp_str = r"(?<!{){(?P<type>[^-*_+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text).

    Text found outside any section is given a type of None. Each section found
    is described by whatever get_section_details returns for the section's
    complete text, including its opening and closing markers.

    A (None, text) region is always emitted before each section and after the
    final one, even where the text involved is empty.
    """

    last = 0
    regions = []

    for match in sections_regexp.finditer(s):
        start, end = match.span()

        # Keep the plain text preceding this section.

        regions.append((None, s[last:start]))

        # Inspect the section itself.

        regions.append(get_section_details(s[start:end]))
        last = end

    # Keep the plain text following the last section.

    regions.append((None, s[last:]))
    return regions
paul@6	59
# Section inspection.

# The section type and the options are matched lazily so that they stop at
# the first ":" or "}" respectively, whereas the section body is matched
# greedily up to the last closing marker in the text being inspected.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text).

    Where 's' starts with a section, the type is itself a tuple of the form
    (sectiontype, options), with options being None if no options were given,
    and the text is the content found between the section markers.

    Where 's' does not start with a section, the type is None and the text is
    's' unchanged.
    """

    match = section_regexp.match(s)

    if not match:
        return None, s

    return (match.group("sectiontype"), match.group("options")), match.group("section")
paul@6	74
# Heading, table and list extraction.

# A list starts with a line whose first non-space character is a list marker
# and continues over following lines starting with the same kind of marker.
# It ends at a newline or at the end of the text, so that lists at the very
# end of a region are still recognised.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*(\n\s*(?P=listtype).*?)*(?:\n|$)"

# A table is a run of lines each made of cells delimited by "|" or "||".

table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"

# Headings and blockquotes are single lines such as "h1. Title" or "bq. Text".

blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's'.

    Return a list of (type, text) tuples. Lists have the type "list" and tables
    the type "table", both with the complete matched text. Headings and
    blockquotes have the matched type (such as "h1" or "bq") with only the
    text following the type prefix; should that text be empty, the complete
    matched text is given instead.

    Text found between such elements is given a type of None. Such an entry
    is always emitted before each element and after the final one, even where
    the text involved is empty.
    """

    last = 0
    blocks = []

    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        # Exactly one of the alternatives matched: identify it using the
        # group that only that alternative defines.

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        blocks.append((None, s[last:start]))

        # Only headings and blockquotes define the text group.

        blocks.append((matchtype, match.group("text") or s[start:end]))
        last = end

    blocks.append((None, s[last:]))
    return blocks
paul@7	106
# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Split the given string 's' wherever one or more blank lines occur,
    returning the pieces of text found between them. The blank lines
    themselves, along with any pieces containing only whitespace, are
    discarded.
    """

    blocks = []

    for candidate in block_regexp.split(s):
        if candidate.strip():
            blocks.append(candidate)

    return blocks
paul@7	120
paul@7	121	# Block inspection.
paul@7	122
def get_blocks(s):

    """
    Return a list of (type, text) blocks from the given string 's'.

    Headings, lists and tables identified by get_block_elements are passed
    through unchanged. The untyped text found between them is divided up using
    get_basic_blocks, with each resulting piece given a type of None.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Untyped text may contain several paragraphs: split it up.

        if blocktype is None:
            blocks.extend([(None, block) for block in get_basic_blocks(blocktext)])

        # Heading, list and table blocks are kept as they are.

        else:
            blocks.append((blocktype, blocktext))

    return blocks
paul@7	146
# List item inspection.

# Each item occupies one line: optional leading spaces and one or more marker
# characters, then optional whitespace, then the text of the item.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    """
    Return a list of (marker, text) tuples for the given list 'text'.

    The marker includes any spaces preceding the marker characters. Lines not
    starting with a marker are ignored.
    """

    return [
        (match.group("marker"), match.group("text"))
        for match in listitem_regexp.finditer(text)
        ]
paul@14	162
# Content inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"

# The brackets are escaped instead of being written as "[[]" and "]" since the
# former makes newer versions of the re module warn about a possible nested
# set. The text matched is the same.

link_regexp_str = r"\[(?P<linktext>.*?)\]"
image_regexp_str = r"!(?P<imagetext>.*?)!"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters, so that text such as "well-known" or "snake_case" is not styled.
# The bracketed markers, such as {*}, may appear anywhere.

italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

# The alternatives are tried in the order given, making monospaced text, links
# and images take precedence over the text styles.

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    "|"
    "(" + italic_regexp_str + ")"
    "|"
    "(" + bold_regexp_str + ")"
    "|"
    "(" + del_regexp_str + ")"
    "|"
    "(" + underline_regexp_str + ")"
    "|"
    "(" + sub_regexp_str + ")"
    )

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Within table rows, cell separators are sought only after the content
# patterns, so that a "|" inside a link or image is not taken to be a
# separator.

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)
paul@16	209
def translate_content_match(match):

    """
    Translate the content described by the given 'match', returning a string.

    The 'match' is expected to provide the named groups of the content
    patterns: monotext, linktext, imagetext, italictext, boldtext, deltext,
    underlinetext and subtext. The first of these having a non-empty value
    decides the translation. Where none of them has a value, the matched text
    is returned unchanged.
    """

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    elif match.group("linktext"):

        # Links have the form [target], [label|target] or [label|target|title].
        # At most three parts are produced so that any further "|" characters
        # remain in the title instead of making the unpacking below fail.

        parts = match.group("linktext").split("|", 2)

        # NOTE: Proper detection of external links required.

        if len(parts) == 1:
            label, target, title = None, parts[0], None
        elif len(parts) == 2:
            (label, target), title = parts, None
        else:
            label, target, title = parts

        target = target.strip()

        # Look for namespace links and rewrite them.
        # URL_SCHEMES is not defined in this module and is presumably provided
        # by the import from common.

        if ":" in target:
            prefix = ""
            space, rest = target.split(":", 1)
            if space not in URL_SCHEMES:
                target = "%s/%s" % (space, rest)

        # Detect anchors.

        elif target.startswith("#"):
            prefix = ""

        # Detect attachments.

        elif target.startswith("^"):
            prefix = "attachment:"

        # Link to other pages within a space.

        else:
            prefix = "../"

        # Make the link tidier by making a label if none was given.

        if not label:
            label = target

        # With the label defaulting to the target, the plain form is only
        # produced where the target itself is empty.

        if not label and not title:
            return "[[%s%s]]" % (prefix, target)
        elif not title:
            return "[[%s%s|%s]]" % (prefix, target, label)
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

    elif match.group("imagetext"):
        parts = match.group("imagetext").split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.
        # Only the first option part is passed on.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

    # Styled text may itself contain further content to be translated.

    elif match.group("italictext"):
        return "''%s''" % translate_content(match.group("italictext"))

    elif match.group("boldtext"):
        return "'''%s'''" % translate_content(match.group("boldtext"))

    elif match.group("deltext"):
        return "--(%s)--" % translate_content(match.group("deltext"))

    elif match.group("underlinetext"):
        return "__%s__" % translate_content(match.group("underlinetext"))

    elif match.group("subtext"):
        return ",,%s,," % translate_content(match.group("subtext"))

    else:
        return match.group()
paul@16	300
def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text', with each piece of recognised
    content replaced by its translation and all other text left untouched.

    If the optional 'sectiontype' is "code" or "noformat", recognised content
    is reproduced exactly as it was found instead of being translated.
    """

    # Handle unformatted sections by keeping the matched text.

    if sectiontype in ("code", "noformat"):
        def replacement(match):
            return match.group()

    # Otherwise, translate each match.

    else:
        replacement = translate_content_match

    return content_regexp.sub(replacement, text)
paul@36	327
def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text'.

    Each line of 'text' containing at least one cell separator yields one
    tuple, where cellsep is the first separator found in the line and columns
    holds the text of the cells found between the separators. Text before the
    first separator and after the last one is not included.
    """

    rows = []

    for line in text.split("\n"):
        first_separator = None
        cells = [""]
        position = 0

        for match in table_content_regexp.finditer(line):
            begin, finish = match.span()

            # Text preceding the match belongs to the current cell.

            cells[-1] += line[position:begin]
            separator = match.group("celltype")

            # Other content, such as links, is kept whole so that any "|"
            # within it does not divide the cell.

            if not separator:
                cells[-1] += match.group()

            # A separator begins a new cell, with the first one found
            # describing the row.

            else:
                if first_separator is None:
                    first_separator = separator
                cells.append("")

            position = finish

        cells[-1] += line[position:]

        # Discard the text outside the outermost separators.

        if first_separator:
            rows.append((first_separator, cells[1:-1]))

    return rows
paul@14	357
# Translation helpers.

# Moin list markers for each Confluence list marker character.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    """
    Translate the given 'marker' to a suitable Moin representation.

    The result is indented by one space for each character in 'marker',
    followed by the Moin marker corresponding to the final character of
    'marker'. A KeyError is raised if that character is not a known marker.
    """

    return " " * len(marker) + markers[marker[-1]]
paul@14	371
# Moin cell separators for each Confluence cell separator.

cellseps = {
    "|" : "||",
    "||" : "||",
    }

# Markup placed around the content of cells introduced by each separator:
# cells using "||" have their content made bold.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    """
    Translate the given 'cellsep' to a suitable Moin representation. A KeyError
    is raised if 'cellsep' is neither "|" nor "||".
    """

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    """
    Using 'cellsep', translate the cell 'text', returning the translated
    content surrounded by any markup associated with 'cellsep'.
    """

    extra = cellextra[cellsep]
    return extra + translate_content(text) + extra
paul@14	393
# Moin section types for each Confluence section type. An empty value, like
# an unknown section type, produces a plain preformatted region.

sectiontypes = {
    "code"      : "",
    "noformat"  : "",
    "quote"     : "",
    "info"      : "wiki important",
    "note"      : "wiki caution",
    "tip"       : "wiki tip",
    "warning"   : "wiki warning",
    }

# General parsing.

def _needs_soft_space(text):

    """
    Return whether the Python 2 print statement would write a separating space
    after 'text' has been written using a trailing comma: this is done unless
    'text' ends with a whitespace character other than a space.
    """

    return not text or text[-1] == " " or not text[-1].isspace()

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    Only the write method of 'out' is used, making this function usable with
    any Python version, and the text written is the same as that produced by
    the print statements previously employed.
    """

    for regiontype, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if regiontype is None:
            for blocktype, blocktext in get_blocks(text):

                # Translate headings and blockquotes.
                # NOTE: blocktypes is not defined in this module and is
                # NOTE: presumably provided by the import from common.

                if blocktype in blocktypes:
                    out.write(blocktypes[blocktype] % blocktext + "\n")

                # Translate list items.

                elif blocktype == "list":
                    for listmarker, listitem in get_list_items(blocktext):
                        out.write("%s %s\n" % (translate_marker(listmarker), translate_content(listitem)))

                # Translate table items.

                elif blocktype == "table":
                    for cellsep, columns in get_table_rows(blocktext):
                        moinsep = translate_cellsep(cellsep)
                        cells = [translate_cell(cellsep, column) for column in columns]
                        out.write(moinsep + moinsep.join(cells) + moinsep + "\n")

                # Handle anonymous blocks.

                else:
                    out.write(translate_content(blocktext.rstrip()) + "\n")

                # Separate each block from the next using a blank line.

                out.write("\n")

        # Handle sections.

        else:
            sectiontype, options = regiontype

            # Direct translations of sections.

            mointype = sectiontypes.get(sectiontype)

            if mointype:
                out.write("{{{#!%s\n" % mointype)
                if options:
                    out.write("## %s\n" % options)

            # Plain regions have their content follow the opening marker on
            # the same line, separated from it by a space.

            else:
                out.write("{{{ ")

            content = translate_content(text, sectiontype)
            out.write(content)

            # Separate the closing marker from the content unless the content
            # already ends with a line break or similar whitespace.

            if _needs_soft_space(content):
                out.write(" ")

            out.write("}}}\n\n")
paul@11	459
if __name__ == "__main__":

    # Translate the wiki text supplied on standard input, writing the result
    # to standard output.

    parse(sys.stdin.read(), sys.stdout)
paul@6	463
paul@6	464	# vim: tabstop=4 expandtab shiftwidth=4