#!/usr/bin/env python

"""
Unicode objects.

Copyright (C) 2015, 2016, 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

from __builtins__.str import basestring
from __builtins__.types import check_int
from posix.iconv import Converter
from native import str_add, unicode_len, unicode_ord, unicode_substr, \
                   isinstance as _isinstance

class utf8string(basestring):

    "A character string representation based on UTF-8."

    def __init__(self, other=None, encoding=None):

        """
        Initialise the string, perhaps from 'other', with any original
        'encoding' indicated.

        The character length is not computed here: 'length' starts as None and
        is filled in by the first call to __len__.
        """

        get_using(basestring.__init__, self)(other)
        self.encoding = encoding
        self.length = None

    def _binary_op(self, op, other):

        """
        Perform 'op' on this object and 'other' if appropriate.

        Return NotImplemented if 'other' is not a string at all. Otherwise,
        return the result of applying 'op' to the data of the two operands,
        with this object as the left operand.
        """

        # Reject non-strings.

        if not _isinstance(other, basestring):
            return NotImplemented

        # Combining text with bytes: this string is first encoded (see the
        # encode method) so that the operation acts on the encoded form.

        elif not _isinstance(other, utf8string):
            s = self.encode()
            return op(s.__data__, other.__data__)

        # Otherwise, perform the operation on the operands' data.

        else:
            return op(self.__data__, other.__data__)

    def _binary_op_rev(self, op, other):

        """
        Perform 'op' on 'other' and this object if appropriate.

        This mirrors _binary_op but with 'other' as the left operand and this
        object as the right operand.
        """

        # Reject non-strings.

        if not _isinstance(other, basestring):
            return NotImplemented

        # Combining text with bytes: this string is first encoded (see the
        # encode method) so that the operation acts on the encoded form.

        elif not _isinstance(other, utf8string):
            s = self.encode()
            return op(other.__data__, s.__data__)

        # Otherwise, perform the operation on the operands' data.

        else:
            return op(other.__data__, self.__data__)

    def _convert(self, result, other):

        """
        Convert 'result' to a Unicode object if 'other' already is.

        A converted result carries this object's encoding. Any other 'result'
        is returned unchanged.
        """

        if _isinstance(other, utf8string):
            return utf8string(result, self.encoding)
        else:
            return result

    def _quote_value(self, b, n):

        """
        Append to 'b' the quoted form of 'n'.

        The escape prefix and the number of digits depend on the magnitude of
        'n': "\\U" with 8 digits above 0xffff, "\\u" with 4 digits above 0xff,
        and "\\x" with 2 digits otherwise.
        """

        # Negative values are shifted up by 256.
        # NOTE(review): presumably these are signed byte values - confirm
        # NOTE(review): against the callers.

        if n < 0:
            n += 256

        if n > 0xffff:
            b.append("\\U")
            digits = 8
        elif n > 0xff:
            b.append("\\u")
            digits = 4
        else:
            b.append("\\x")
            digits = 2

        # NOTE(review): the second argument to hex is presumably the prefix,
        # NOTE(review): given as empty to obtain bare digits - confirm.

        x = hex(n, "")
        i = len(x)

        # Pad with leading zeros up to the required number of digits.

        while i < digits:
            b.append("0")
            i += 1

        b.append(x)

    # Operator methods.

    def __iadd__(self, other):

        """
        Return a string combining this string with 'other'.

        The result is a Unicode object if 'other' is one. NotImplemented is
        returned if 'other' is not a string.
        """

        return self._convert(self._binary_op(str_add, other), other)

    __add__ = __iadd__

    def __radd__(self, other):

        """
        Return a string combining 'other' with this string, 'other' being the
        left operand.
        """

        return self._convert(self._binary_op_rev(str_add, other), other)

    def __len__(self):

        "Return the length of this string in characters."

        # Compute the length on first use and retain it for later calls.

        if self.length is None:
            self.length = unicode_len(self.__data__)

        return self.length

    def __ord__(self):

        """
        Return the value of the string, if only a single character.

        Raise ValueError, with this string as its argument, for a string of
        any other length.
        """

        if self.__len__() == 1:
            return unicode_ord(self.__data__)
        else:
            # Use the call form of raise, as done elsewhere in this module.

            raise ValueError(self)

    def encode(self, encoding=None):

        """
        Encode the string to the given 'encoding' or any original encoding if
        omitted.

        If neither an 'encoding' nor an original encoding is available, this
        object itself is returned.
        """

        encoding = encoding or self.encoding
        if not encoding:
            return self

        from_utf8 = Converter("UTF-8", encoding)

        # Always close the converter, even if the conversion fails.

        try:
            from_utf8.feed(self)
            return str(from_utf8)

        finally:
            from_utf8.close()

    def join(self, l):

        """
        Join the elements in 'l' with this string.

        The result is wrapped as a Unicode object carrying this string's
        encoding only if that encoding is set and no element of 'l' is itself
        a Unicode object. Otherwise, the plain string obtained from the buffer
        is returned.
        NOTE(review): the reason for discarding the encoding when a Unicode
        element is seen is not evident from this module - confirm the intent.
        """

        # Empty strings just cause the list elements to be concatenated.

        nonempty = self.__bool__()

        # Non-empty strings join the elements together in a buffer.

        b = buffer()
        first = True
        encoding = self.encoding

        for s in l:

            # Add the separator before every element except the first.

            if first:
                first = False
            elif nonempty:
                b.append(self)

            if _isinstance(s, utf8string):
                encoding = None

            b.append(s)

        s = str(b)
        if encoding:
            s = utf8string(s)
            s.encoding = encoding
        return s

    # Special implementation methods.

    def __get_single_item__(self, index):

        """
        Return the item at the normalised (positive) 'index' as a Unicode
        object having this string's encoding.
        """

        self._check_index(index)
        return utf8string(unicode_substr(self.__data__, index, index + 1, 1), self.encoding)

    def __get_multiple_items__(self, start, end, step):

        """
        Return items from 'start' until (but excluding) 'end', at 'step'
        intervals.

        Raise ValueError if 'step' is zero. Where 'start' equals 'end', an
        empty string literal is returned before 'step' is examined.
        """

        if start == end:
            return ""

        check_int(step)

        if step == 0:
            raise ValueError(step)

        # Obtain the selected items from the base class and combine them.

        l = get_using(basestring.__get_multiple_items__, self)(start, end, step)
        return utf8string("".join(l), self.encoding)

def unicode(s, encoding):

    """
    Convert 's' to a Unicode object, interpreting 's' as using 'encoding'.

    An existing Unicode object is returned unchanged. Anything else is
    converted to a string and then to UTF-8, the result recording 'encoding'
    as its original encoding.
    """

    if isinstance(s, utf8string):
        return s

    # Obtain a string representation.

    s = s.__str__()

    # Convert the string to UTF-8. Even if the stated encoding is UTF-8, it
    # needs to be validated.

    to_utf8 = Converter(encoding, "UTF-8")

    # Always close the converter, even if the conversion fails.

    try:
        to_utf8.feed(s)
        return utf8string(str(to_utf8), encoding)

    finally:
        to_utf8.close()

# vim: tabstop=4 expandtab shiftwidth=4