1.1 --- a/moinformat/serialisers/common.py Tue Jul 31 17:38:45 2018 +0200
1.2 +++ b/moinformat/serialisers/common.py Tue Jul 31 17:39:28 2018 +0200
1.3 @@ -19,6 +19,8 @@
1.4 this program. If not, see <http://www.gnu.org/licenses/>.
1.5 """
1.6
1.7 +from urllib import quote_plus
1.8 +
1.9 class Serialiser:
1.10
1.11 "General serialisation support."
1.12 @@ -98,4 +100,61 @@
1.13
1.14 return s.replace("&", "&").replace("<", "<").replace(">", ">")
1.15
1.16 +def make_id(s):
1.17 +
1.18 +    "Return an identifier for 's' suitable for XML element identification."
1.19 +
1.20 +    # NOTE: This reproduces the Moin algorithm for compatibility.
1.21 +    # NOTE: There may well be improvements possible, possibly by replacing plus
1.22 +    # NOTE: with something less cumbersome, even though plus may be unusual in
1.23 +    # NOTE: things like headings, anyway.
1.24 +
1.25 +    # The desired output is the following pattern:
1.26 +
1.27 +    # [A-Za-z][-_:.A-Za-z0-9]*
1.28 +
1.29 +    # The Python UTF-7 encoder preserves symbols and it encodes + as +- with an
1.30 +    # output range as follows (in addition to A-Za-z0-9):
1.31 +
1.32 +    # -_:.%+ !"#$&\'()*,/;<=>?@[]^`{|}
1.33 +
1.34 +    # The quote_plus function converts space to plus, preserves -_:. and encodes
1.35 +    # all other symbols (including original occurrences of plus and percent) and
1.36 +    # non-alphanumeric (ASCII) characters using percent encoding.
1.37 +
1.38 +    # With colons preserved, the resulting output is in the following range
1.39 +    # (in addition to A-Za-z0-9):
1.40 +
1.41 +    # -_:.%+
1.42 +
1.43 +    # Percent will only occur as an encoding prefix. Plus will only occur as a
1.44 +    # replacement for space.
1.45 +
1.46 +    # Combining quote_plus and UTF-7 gives the following range (in addition to
1.47 +    # A-Za-z0-9):
1.48 +
1.49 +    # -_:.%+
1.50 +
1.51 +    # Examples:
1.52 +
1.53 +    #          UTF-7    quote_plus    replace percent and plus
1.54 +    # :     -> :     -> :          -> :
1.55 +    # -     -> -     -> -          -> -
1.56 +    # .     -> .     -> .          -> .
1.57 +    # %     -> %     -> %25        -> .25
1.58 +    # +     -> +-    -> %2B-       -> .2B-
1.59 +    # _     -> _     -> _          -> _
1.60 +    # space -> space -> +          -> _
1.61 +
1.62 +    # See: RFC2152 - UTF-7 A Mail-Safe Transformation Format of Unicode
1.63 +
1.64 +    quoted = quote_plus(s.encode("utf-7"), ":").replace("%", ".").replace("+", "_")
1.65 +
1.66 +    # Ensure that the identifier starts with an alphabetical character. Slicing
1.67 +    # is used so that an empty result yields "A" instead of an IndexError.
1.68 +
1.69 +    if not quoted[:1].isalpha():
1.70 +        return "A%s" % quoted
1.71 +    return quoted
1.72 +
1.73 # vim: tabstop=4 expandtab shiftwidth=4