# HG changeset patch # User Paul Boddie # Date 1624998288 -7200 # Node ID 7dec147995715d416b51021c69b5d6900698d3b6 # Parent b47da767f70404e0c2b0186b7a9403f8ba5f8d62# Parent 2989aab1b4f76a7641659133cce26f0e0e05a5ac Merged changes from the default branch. diff -r b47da767f704 -r 7dec14799571 common.py --- a/common.py Sun Jun 27 22:14:51 2021 +0200 +++ b/common.py Tue Jun 29 22:24:48 2021 +0200 @@ -3,8 +3,7 @@ """ Common functions. -Copyright (C) 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, - 2017, 2018, 2019 Paul Boddie +Copyright (C) 2007-2019, 2021 Paul Boddie This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1597,8 +1596,6 @@ if name == "string": modname = "str" - elif name == "utf8string": - modname = "unicode" elif name == "NoneType": modname = "none" else: @@ -1612,8 +1609,6 @@ if name == "str": return "string" - elif name == "unicode": - return "utf8string" else: return name diff -r b47da767f704 -r 7dec14799571 docs/wiki/History --- a/docs/wiki/History Sun Jun 27 22:14:51 2021 +0200 +++ b/docs/wiki/History Tue Jun 29 22:24:48 2021 +0200 @@ -71,7 +71,7 @@ == Current Work == It was with such realisations that a new project was effectively born. 
-Tentatively called "!PythonLight" but renamed to "Lichen" as the code matured, +Tentatively called "PythonLight" but renamed to "Lichen" as the code matured, the objectives now involved a simpler processing framework that merely attempted to catalogue structure members, to determine the origins of such members, and to record data flow within namespaces in order to determine diff -r b47da767f704 -r 7dec14799571 docs/wiki/Toolchain --- a/docs/wiki/Toolchain Sun Jun 27 22:14:51 2021 +0200 +++ b/docs/wiki/Toolchain Tue Jun 29 22:24:48 2021 +0200 @@ -73,12 +73,12 @@ which the `parser` module effectively is (as would the `ast` module also be if it were used here), with it typically being implemented as an extension module in a non-Python language (in C for CPython, in Java for Jython, and so on). -Fortunately, the !PyPy project implemented their own parsing module, -`pyparser`, that is intended to be used within the !PyPy environment together -with their own `ast` equivalent, but it has been possible to rework `pyparser` -to produce representations that are compatible with the `compiler` package, -itself being modified in various ways to achieve compatibility (and also to -provide various other conveniences). +Fortunately, the [[http://pypy.org/|PyPy]] project implemented their own +parsing module, `pyparser`, that is intended to be used within the PyPy +environment together with their own `ast` equivalent, but it has been possible +to rework `pyparser` to produce representations that are compatible with the +`compiler` package, itself being modified in various ways to achieve +compatibility (and also to provide various other conveniences). == Program Analysis == diff -r b47da767f704 -r 7dec14799571 generator.py --- a/generator.py Sun Jun 27 22:14:51 2021 +0200 +++ b/generator.py Tue Jun 29 22:24:48 2021 +0200 @@ -3,7 +3,7 @@ """ Generate C code from object layouts and other deduced information. 
-Copyright (C) 2015, 2016, 2017, 2018, 2019 Paul Boddie +Copyright (C) 2015-2019, 2021 Paul Boddie This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -51,7 +51,7 @@ string_type = "__builtins__.str.string" tuple_type = "__builtins__.tuple.tuple" type_type = "__builtins__.core.type" - unicode_type = "__builtins__.unicode.utf8string" + unicode_type = "__builtins__.unicode.unicode" none_value = "__builtins__.none.None" @@ -1265,14 +1265,14 @@ # Special-case the integer type. + # Here, the __builtins__.int.new_int function is called with the + # initialiser's parameter. + if path == self.int_type: print >>f_code, """\ -__attr %s(__attr __self, __attr number_or_string) +__attr %s(__attr __self, __attr number_or_string, __attr base) { - if (!__BOOL(__fn_native_int_is_int(__self, number_or_string))) - __raise_value_error(number_or_string); - - return number_or_string; + return __fn___builtins___int_new_int(__NULL, number_or_string, base); } """ % ( encode_instantiator_pointer(path), diff -r b47da767f704 -r 7dec14799571 lib/__builtins__/__init__.py --- a/lib/__builtins__/__init__.py Sun Jun 27 22:14:51 2021 +0200 +++ b/lib/__builtins__/__init__.py Tue Jun 29 22:24:48 2021 +0200 @@ -3,7 +3,7 @@ """ Simple built-in classes and functions. -Copyright (C) 2015, 2016, 2017, 2019 Paul Boddie +Copyright (C) 2015, 2016, 2017, 2019, 2021 Paul Boddie This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -70,7 +70,7 @@ from __builtins__.set import frozenset, set from __builtins__.str import basestring, str, string from __builtins__.tuple import tuple -from __builtins__.unicode import unicode, utf8string +from __builtins__.unicode import unicode # Functions. 
diff -r b47da767f704 -r 7dec14799571 lib/__builtins__/character.py --- a/lib/__builtins__/character.py Sun Jun 27 22:14:51 2021 +0200 +++ b/lib/__builtins__/character.py Tue Jun 29 22:24:48 2021 +0200 @@ -103,7 +103,7 @@ check_int(i) if 0 <= i <= 2097151: - return utf8string(unicode_unichr(i)) + return unicode(unicode_unichr(i)) else: raise ValueError, i diff -r b47da767f704 -r 7dec14799571 lib/__builtins__/int.py --- a/lib/__builtins__/int.py Sun Jun 27 22:14:51 2021 +0200 +++ b/lib/__builtins__/int.py Tue Jun 29 22:24:48 2021 +0200 @@ -19,22 +19,55 @@ this program. If not, see <http://www.gnu.org/licenses/>. """ -from __builtins__.unicode import utf8string +from __builtins__.str import basestring +from __builtins__.unicode import unicode from native import get_maxint, get_minint, is_int, \ int_add, int_and, int_div, int_eq, int_ge, int_gt, \ int_lshift, int_le, int_lt, int_mod, int_mul, int_ne, \ int_neg, int_not, int_or, int_pow, int_rshift, int_str, \ int_sub, int_xor +def new_int(number_or_string, base=10): + + "Initialise the integer with the given 'number_or_string'." + + if is_int(number_or_string): + return number_or_string + elif isinstance(number_or_string, basestring): + return str_to_int(number_or_string, base) + else: + raise TypeError + +def str_to_int(value, base=10): + + "Decode the string 'value' using the given 'base'." + + # NOTE: Add support for lower and upper in the string classes. + + #value = value.lower() + len_value = len(value) + digits = "0123456789abcdefghijklmnopqrstuvwxyz" + + result = 0 + i = 0 + + while i < len_value: + c = value[i] + d = digits.index(c) + result = result * base + d + i += 1 + + return result + class int: "An integer abstraction." - def __init__(self, number_or_string=None): + def __init__(self, number_or_string=None, base=10): "Initialise the integer with the given 'number_or_string'." - # Implemented in the translator. + # Implemented by new_int above, invoked specially by the translator.
pass @@ -245,7 +278,7 @@ "Return a string representation." - return utf8string(int_str(self)) + return unicode(int_str(self)) __repr__ = __str__ diff -r b47da767f704 -r 7dec14799571 lib/__builtins__/stream.py --- a/lib/__builtins__/stream.py Sun Jun 27 22:14:51 2021 +0200 +++ b/lib/__builtins__/stream.py Tue Jun 29 22:24:48 2021 +0200 @@ -144,7 +144,7 @@ # Encode text as bytes if necessary. When the encoding is not set, any # original encoding of the text will be applied. - if _isinstance(s, utf8string): + if _isinstance(s, unicode): s = s.encode(self.encoding) fwrite(self.__data__, s) diff -r b47da767f704 -r 7dec14799571 lib/__builtins__/unicode.py --- a/lib/__builtins__/unicode.py Sun Jun 27 22:14:51 2021 +0200 +++ b/lib/__builtins__/unicode.py Tue Jun 29 22:24:48 2021 +0200 @@ -3,7 +3,7 @@ """ Unicode objects. -Copyright (C) 2015, 2016, 2017 Paul Boddie +Copyright (C) 2015, 2016, 2017, 2021 Paul Boddie This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -25,21 +25,58 @@ from native import str_add, unicode_len, unicode_ord, unicode_substr, \ isinstance as _isinstance -class utf8string(basestring): +class unicode(basestring): "A character string representation based on UTF-8." - def __init__(self, other=None, encoding=None): + def __init__(self, s, encoding=None, original=None): """ - Initialise the string, perhaps from 'other', with any original - 'encoding' indicated. + Initialise the string from 's', employing any indicated 'encoding' + for the provided string data. + + If 'original' is indicated, this may be used to override the original + encoding. This is useful when the string data is already in UTF-8 + format, but where the original encoding needs to be communicated. """ - get_using(basestring.__init__, self)(other) - self.encoding = encoding self.length = None + # Initialise using another Unicode object.
+ + if _isinstance(s, unicode): + get_using(basestring.__init__, self)(s) + self.encoding = s.encoding + + # Initialise using suitable string data but with an explicit original + # encoding. + + elif original: + get_using(basestring.__init__, self)(s) + self.encoding = original + + # Initialise using string data having either UTF-8 or another encoding, + # converting to UTF-8 and retaining the encoding details as the original + # encoding. + + else: + # Obtain a string representation. + + s = s.__str__() + + # Convert the string to UTF-8. Even if the stated encoding is UTF-8, it + # needs to be validated. + + to_utf8 = Converter(encoding or "UTF-8", "UTF-8") + + try: + to_utf8.feed(s) + get_using(basestring.__init__, self)(str(to_utf8)) + finally: + to_utf8.close() + + self.encoding = encoding + def _binary_op(self, op, other, sizes=False): "Perform 'op' on this object and 'other' if appropriate." @@ -51,7 +88,7 @@ # Combining text with bytes. - if not _isinstance(other, utf8string): + if not _isinstance(other, unicode): s = self.encode() else: s = self @@ -72,7 +109,7 @@ # Combining text with bytes. - if not _isinstance(other, utf8string): + if not _isinstance(other, unicode): s = self.encode() else: s = self @@ -86,8 +123,8 @@ "Convert 'result' to a Unicode object if 'other' already is." - if _isinstance(other, utf8string): - return utf8string(result, self.encoding) + if _isinstance(other, unicode): + return unicode(result, None, self.encoding) else: return result @@ -188,15 +225,14 @@ elif nonempty: b.append(self) - if _isinstance(s, utf8string): + if _isinstance(s, unicode): encoding = None b.append(s) s = str(b) if encoding: - s = utf8string(s) - s.encoding = encoding + s = unicode(s, None, encoding) return s # Special implementation methods. @@ -204,9 +240,9 @@ def __get_single_item__(self, index): "Return the item at the normalised (positive) 'index'." 
- + self._check_index(index) - return utf8string(unicode_substr(self.__data__, self.__size__, index, index + 1, 1), self.encoding) + return unicode(unicode_substr(self.__data__, self.__size__, index, index + 1, 1), None, self.encoding) def __get_multiple_items__(self, start, end, step): @@ -224,29 +260,6 @@ raise ValueError(step) l = get_using(basestring.__get_multiple_items__, self)(start, end, step) - return utf8string("".join(l), self.encoding) - -def unicode(s, encoding): - - "Convert 's' to a Unicode object, interpreting 's' as using 'encoding'." - - if isinstance(s, utf8string): - return s - - # Obtain a string representation. - - s = s.__str__() - - # Convert the string to UTF-8. Even if the stated encoding is UTF-8, it - # needs to be validated. - - to_utf8 = Converter(encoding, "UTF-8") - - try: - to_utf8.feed(s) - return utf8string(str(to_utf8), encoding) - - finally: - to_utf8.close() + return unicode("".join(l), None, self.encoding) # vim: tabstop=4 expandtab shiftwidth=4 diff -r b47da767f704 -r 7dec14799571 tests/int.py --- a/tests/int.py Sun Jun 27 22:14:51 2021 +0200 +++ b/tests/int.py Tue Jun 29 22:24:48 2021 +0200 @@ -11,3 +11,20 @@ a = int("a") # should raise an exception except ValueError, exc: print 'int("a") failed:', exc.value + +try: + a = int("!") # should raise an exception +except ValueError, exc: + print 'int("!") failed:', exc.value + +a = int("a", 16) +b = int("123") +print a # 10 +print b, i, b == i # 123, 123, True +print b, j, b == j # 123, 123, True + +a_is_int = isinstance(a, int) +j_is_int = isinstance(j, int) + +print a_is_int # True +print j_is_int # True diff -r b47da767f704 -r 7dec14799571 tests/unicode.py --- a/tests/unicode.py Sun Jun 27 22:14:51 2021 +0200 +++ b/tests/unicode.py Tue Jun 29 22:24:48 2021 +0200 @@ -48,7 +48,7 @@ s7 = r"\346\370\345" print "Untranslated values:" print s7 # \346\370\345 -print s7.__class__ # __builtins__.unicode.utf8string +print s7.__class__ # __builtins__.unicode.unicode print len(s7) # 12 # 
Obtain text and print it. @@ -58,7 +58,7 @@ u = unicode(b"æøå", "ISO-8859-15") print "Unicode values:" print u # æøå -print u.__class__ # __builtins__.unicode.utf8string +print u.__class__ # __builtins__.unicode.unicode print u.encode("ISO-8859-15") # æøå print u.encoding # ISO-8859-15 print len(u) # 3 @@ -68,7 +68,7 @@ u2 = u"æøå" print "Unicode values:" print u2 # æøå -print u2.__class__ # __builtins__.unicode.utf8string +print u2.__class__ # __builtins__.unicode.unicode print u2.encode("ISO-8859-15") # æøå print u2.encoding # ISO-8859-15 print len(u2) # 3 @@ -78,7 +78,7 @@ u3 = "æøå" print "Unicode values:" print u3 # æøå -print u3.__class__ # __builtins__.unicode.utf8string +print u3.__class__ # __builtins__.unicode.unicode print u3.encode("ISO-8859-15") # æøå print u3.encoding # ISO-8859-15 print len(u3) # 3 @@ -88,7 +88,7 @@ u4 = unicode("æøå", "ISO-8859-15") print "Unicode values:" print u4 # æøå -print u4.__class__ # __builtins__.unicode.utf8string +print u4.__class__ # __builtins__.unicode.unicode print u4.encode("ISO-8859-15") # æøå print u4.encoding # ISO-8859-15 print len(u4) # 3 @@ -163,7 +163,7 @@ uu2 = u + u2 print "Unicode values:" print uu2 # æøåæøå -print uu2.__class__ # __builtins__.unicode.utf8string +print uu2.__class__ # __builtins__.unicode.unicode print uu2.encoding # ISO-8859-15 print len(uu2) # 6 @@ -195,7 +195,7 @@ # Test character access. u0 = u[0] -print u0.__class__ # __builtins__.unicode.utf8string +print u0.__class__ # __builtins__.unicode.unicode print u0.encoding # ISO-8859-15 print u0 # æ print u[-1] # å