Lichen (annotate lib/__builtins__/unicode.py in be8a103fdd9b)

Lichen

Annotated lib/__builtins__/unicode.py

997:be8a103fdd9b

13 months ago

Paul Boddie

Merged changes from the trailing-data branch.

well-defined-instances

paul@390	1	#!/usr/bin/env python
paul@390	2
paul@390	3	"""
paul@390	4	Unicode objects.
paul@390	5
paul@934	6	Copyright (C) 2015, 2016, 2017, 2021 Paul Boddie <paul@boddie.org.uk>
paul@390	7
paul@390	8	This program is free software; you can redistribute it and/or modify it under
paul@390	9	the terms of the GNU General Public License as published by the Free Software
paul@390	10	Foundation; either version 3 of the License, or (at your option) any later
paul@390	11	version.
paul@390	12
paul@390	13	This program is distributed in the hope that it will be useful, but WITHOUT
paul@390	14	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@390	15	FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
paul@390	16	details.
paul@390	17
paul@390	18	You should have received a copy of the GNU General Public License along with
paul@390	19	this program. If not, see <http://www.gnu.org/licenses/>.
paul@390	20	"""
paul@390	21
paul@390	22	from __builtins__.str import basestring
paul@431	23	from __builtins__.types import check_int
paul@390	24	from posix.iconv import Converter
paul@534	25	from native import str_add, unicode_len, unicode_ord, unicode_substr, \
paul@431	26	isinstance as _isinstance
paul@390	27
class unicode(basestring):

    """
    A character string representation based on UTF-8.

    Instances record the original encoding of the text in 'encoding' (which
    may be None) and cache the character count in 'length' once __len__ has
    computed it.
    """

    def __init__(self, s, encoding=None, original=None):

        """
        Initialise the string from 's', employing any indicated 'encoding'
        for the provided string data.

        If 'original' is indicated, this may be used to override the original
        encoding. This is useful when the string data is already in UTF-8
        format, but where the original encoding needs to be communicated. In
        that case the data is stored as given, without conversion.
        """

        # The character count is computed lazily by __len__.

        self.length = None

        # Initialise using another Unicode object.

        if _isinstance(s, unicode):
            get_using(basestring.__init__, self)(s)
            self.encoding = s.encoding

        # Initialise using suitable string data but with an explicit original
        # encoding.

        elif original:
            get_using(basestring.__init__, self)(s)
            self.encoding = original

        # Initialise using string data having either UTF-8 or another encoding,
        # converting to UTF-8 and retaining the encoding details as the original
        # encoding.

        else:
            # Obtain a string representation.

            s = s.__str__()

            # Convert the string to UTF-8. Even if the stated encoding is UTF-8, it
            # needs to be validated.

            to_utf8 = Converter(encoding or "UTF-8", "UTF-8")

            # Always release the converter, even if conversion fails.

            try:
                to_utf8.feed(s)
                get_using(basestring.__init__, self)(str(to_utf8))
            finally:
                to_utf8.close()

            self.encoding = encoding

    def _binary_op(self, op, other, sizes=False):

        """
        Perform 'op' on this object and 'other' if appropriate.

        NotImplemented is returned if 'other' is not a string. If 'other' is
        not a Unicode object, this object is first encoded using encode(). If
        'sizes' is true, the sizes of both operands are also passed to 'op'.
        """

        # Reject non-strings.

        if not _isinstance(other, basestring):
            return NotImplemented

        # Combining text with bytes.

        if not _isinstance(other, unicode):
            s = self.encode()
        else:
            s = self

        if sizes:
            return op(s.__data__, other.__data__, s.__size__, other.__size__)
        else:
            return op(s.__data__, other.__data__)

    def _binary_op_rev(self, op, other, sizes=False):

        """
        Perform 'op' on 'other' and this object if appropriate.

        This mirrors _binary_op but supplies 'other' as the first operand.
        """

        # Reject non-strings.

        if not _isinstance(other, basestring):
            return NotImplemented

        # Combining text with bytes.

        if not _isinstance(other, unicode):
            s = self.encode()
        else:
            s = self

        if sizes:
            return op(other.__data__, s.__data__, other.__size__, s.__size__)
        else:
            return op(other.__data__, s.__data__)

    def _convert(self, result, other):

        """
        Convert 'result' to a Unicode object if 'other' already is, giving it
        this object's encoding as its original encoding. Otherwise, return
        'result' unchanged.
        """

        if _isinstance(other, unicode):
            return unicode(result, None, self.encoding)
        else:
            return result

    def _quote_value(self, b, n):

        """
        Append to 'b' the quoted form of 'n': a "\\U" escape with 8 digits
        for values above 0xffff, otherwise a "\\u" escape with 4 digits.
        """

        # NOTE(review): negative values are presumably signed byte values being
        # mapped into the range 0..255 - confirm against callers.

        if n < 0:
            n += 256

        if n > 0xffff:
            b.append("\\U")
            digits = 8
        else:
            b.append("\\u")
            digits = 4

        # NOTE(review): the second argument to hex is presumably an empty
        # prefix replacing the usual "0x" - confirm against the hex builtin.

        x = hex(n, "")
        i = len(x)

        # Pad with leading zeros up to the required number of digits.

        while i < digits:
            b.append("0")
            i += 1

        b.append(x)

    # Operator methods.

    def __iadd__(self, other):

        "Return a string combining this string with 'other'."

        return self._convert(self._binary_op(str_add, other, True), other)

    __add__ = __iadd__

    def __radd__(self, other):

        "Return a string combining 'other' with this string."

        return self._convert(self._binary_op_rev(str_add, other, True), other)

    def __len__(self):

        "Return the length of this string in characters."

        # Compute the length only once, caching it for later calls.

        if self.length is None:
            self.length = unicode_len(self.__data__, self.__size__)

        return self.length

    def __ord__(self):

        """
        Return the value of the string, if only a single character. Raise
        ValueError for strings of any other length.
        """

        if self.__len__() == 1:
            return unicode_ord(self.__data__, self.__size__)
        else:
            raise ValueError(self)

    def encode(self, encoding=None):

        """
        Encode the string to the given 'encoding' or any original encoding if
        omitted. If neither is available, this object itself is returned.
        """

        encoding = encoding or self.encoding
        if not encoding:
            return self

        from_utf8 = Converter("UTF-8", encoding)

        # Always release the converter, even if conversion fails.

        try:
            from_utf8.feed(self)
            return str(from_utf8)

        finally:
            from_utf8.close()

    def join(self, l):

        """
        Join the elements in 'l' with this string.

        The result is wrapped as a Unicode object only if this string has an
        encoding and none of the elements in 'l' is a Unicode object.
        Otherwise, the plain string built from the buffer is returned.
        """

        # Empty strings just cause the list elements to be concatenated.

        nonempty = self.__bool__()

        # Non-empty strings join the elements together in a buffer.

        b = buffer()
        first = True
        encoding = self.encoding

        for s in l:

            # Insert the separator before every element except the first.

            if first:
                first = False
            elif nonempty:
                b.append(self)

            # Any Unicode element prevents the result from being wrapped below.

            if _isinstance(s, unicode):
                encoding = None

            b.append(s)

        s = str(b)
        if encoding:
            s = unicode(s, None, encoding)
        return s

    # Special implementation methods.

    def __get_single_item__(self, index):

        """
        Return the item at the normalised (positive) 'index' as a Unicode
        object having this string's encoding.
        """

        self._check_index(index)
        return unicode(unicode_substr(self.__data__, self.__size__, index, index + 1, 1), None, self.encoding)

    def __get_multiple_items__(self, start, end, step):

        """
        Return items from 'start' until (but excluding) 'end', at 'step'
        intervals. Raise ValueError if 'step' is zero.
        """

        # NOTE(review): this empty result is a plain string, not a Unicode
        # object, and 'step' is not validated in this case - confirm that both
        # are intended.

        if start == end:
            return ""

        check_int(step)

        if step == 0:
            raise ValueError(step)

        l = get_using(basestring.__get_multiple_items__, self)(start, end, step)
        return unicode("".join(l), None, self.encoding)
paul@390	264
paul@390	265	# vim: tabstop=4 expandtab shiftwidth=4

Lichen

Annotated lib/__builtins__/unicode.py

Annotated lib/__builtins__/unicode.py