# HG changeset patch
# User Paul Boddie <paul@boddie.org.uk>
# Date 1624998288 -7200
# Node ID 7dec147995715d416b51021c69b5d6900698d3b6
# Parent  b47da767f70404e0c2b0186b7a9403f8ba5f8d62# Parent  2989aab1b4f76a7641659133cce26f0e0e05a5ac
Merged changes from the default branch.

diff -r b47da767f704 -r 7dec14799571 common.py
--- a/common.py	Sun Jun 27 22:14:51 2021 +0200
+++ b/common.py	Tue Jun 29 22:24:48 2021 +0200
@@ -3,8 +3,7 @@
 """
 Common functions.
 
-Copyright (C) 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
-              2017, 2018, 2019 Paul Boddie <paul@boddie.org.uk>
+Copyright (C) 2007-2019, 2021 Paul Boddie <paul@boddie.org.uk>
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -1597,8 +1596,6 @@
 
     if name == "string":
         modname = "str"
-    elif name == "utf8string":
-        modname = "unicode"
     elif name == "NoneType":
         modname = "none"
     else:
@@ -1612,8 +1609,6 @@
 
     if name == "str":
         return "string"
-    elif name == "unicode":
-        return "utf8string"
     else:
         return name
 
diff -r b47da767f704 -r 7dec14799571 docs/wiki/History
--- a/docs/wiki/History	Sun Jun 27 22:14:51 2021 +0200
+++ b/docs/wiki/History	Tue Jun 29 22:24:48 2021 +0200
@@ -71,7 +71,7 @@
 == Current Work ==
 
 It was with such realisations that a new project was effectively born.
-Tentatively called "!PythonLight" but renamed to "Lichen" as the code matured,
+Tentatively called "PythonLight" but renamed to "Lichen" as the code matured,
 the objectives now involved a simpler processing framework that merely
 attempted to catalogue structure members, to determine the origins of such
 members, and to record data flow within namespaces in order to determine
diff -r b47da767f704 -r 7dec14799571 docs/wiki/Toolchain
--- a/docs/wiki/Toolchain	Sun Jun 27 22:14:51 2021 +0200
+++ b/docs/wiki/Toolchain	Tue Jun 29 22:24:48 2021 +0200
@@ -73,12 +73,12 @@
 which the `parser` module effectively is (as would the `ast` module also be if
 it were used here), with it typically being implemented as an extension module
 in a non-Python language (in C for CPython, in Java for Jython, and so on).
-Fortunately, the !PyPy project implemented their own parsing module,
-`pyparser`, that is intended to be used within the !PyPy environment together
-with their own `ast` equivalent, but it has been possible to rework `pyparser`
-to produce representations that are compatible with the `compiler` package,
-itself being modified in various ways to achieve compatibility (and also to
-provide various other conveniences).
+Fortunately, the [[http://pypy.org/|PyPy]] project implemented their own
+parsing module, `pyparser`, that is intended to be used within the PyPy
+environment together with their own `ast` equivalent, but it has been possible
+to rework `pyparser` to produce representations that are compatible with the
+`compiler` package, itself being modified in various ways to achieve
+compatibility (and also to provide various other conveniences).
 
 == Program Analysis ==
 
diff -r b47da767f704 -r 7dec14799571 generator.py
--- a/generator.py	Sun Jun 27 22:14:51 2021 +0200
+++ b/generator.py	Tue Jun 29 22:24:48 2021 +0200
@@ -3,7 +3,7 @@
 """
 Generate C code from object layouts and other deduced information.
 
-Copyright (C) 2015, 2016, 2017, 2018, 2019 Paul Boddie <paul@boddie.org.uk>
+Copyright (C) 2015-2019, 2021 Paul Boddie <paul@boddie.org.uk>
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -51,7 +51,7 @@
     string_type = "__builtins__.str.string"
     tuple_type = "__builtins__.tuple.tuple"
     type_type = "__builtins__.core.type"
-    unicode_type = "__builtins__.unicode.utf8string"
+    unicode_type = "__builtins__.unicode.unicode"
 
     none_value = "__builtins__.none.None"
 
@@ -1265,14 +1265,14 @@
 
         # Special-case the integer type.
 
+        # Here, the __builtins__.int.new_int function is called with the
+        # initialiser's parameters (the number or string, plus the base).
+
         if path == self.int_type:
             print >>f_code, """\
-__attr %s(__attr __self, __attr number_or_string)
+__attr %s(__attr __self, __attr number_or_string, __attr base)
 {
-    if (!__BOOL(__fn_native_int_is_int(__self, number_or_string)))
-        __raise_value_error(number_or_string);
-
-    return number_or_string;
+    return __fn___builtins___int_new_int(__NULL, number_or_string, base);
 }
 """ % (
                 encode_instantiator_pointer(path),
diff -r b47da767f704 -r 7dec14799571 lib/__builtins__/__init__.py
--- a/lib/__builtins__/__init__.py	Sun Jun 27 22:14:51 2021 +0200
+++ b/lib/__builtins__/__init__.py	Tue Jun 29 22:24:48 2021 +0200
@@ -3,7 +3,7 @@
 """
 Simple built-in classes and functions.
 
-Copyright (C) 2015, 2016, 2017, 2019 Paul Boddie <paul@boddie.org.uk>
+Copyright (C) 2015, 2016, 2017, 2019, 2021 Paul Boddie <paul@boddie.org.uk>
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -70,7 +70,7 @@
 from __builtins__.set import frozenset, set
 from __builtins__.str import basestring, str, string
 from __builtins__.tuple import tuple
-from __builtins__.unicode import unicode, utf8string
+from __builtins__.unicode import unicode
 
 # Functions.
 
diff -r b47da767f704 -r 7dec14799571 lib/__builtins__/character.py
--- a/lib/__builtins__/character.py	Sun Jun 27 22:14:51 2021 +0200
+++ b/lib/__builtins__/character.py	Tue Jun 29 22:24:48 2021 +0200
@@ -103,7 +103,7 @@
     check_int(i)
 
     if 0 <= i <= 2097151:
-        return utf8string(unicode_unichr(i))
+        return unicode(unicode_unichr(i))
     else:
         raise ValueError, i
 
diff -r b47da767f704 -r 7dec14799571 lib/__builtins__/int.py
--- a/lib/__builtins__/int.py	Sun Jun 27 22:14:51 2021 +0200
+++ b/lib/__builtins__/int.py	Tue Jun 29 22:24:48 2021 +0200
@@ -19,22 +19,73 @@
 this program.  If not, see <http://www.gnu.org/licenses/>.
 """
 
-from __builtins__.unicode import utf8string
+from __builtins__.str import basestring
+from __builtins__.unicode import unicode
 from native import get_maxint, get_minint, is_int, \
                    int_add, int_and, int_div, int_eq, int_ge, int_gt, \
                    int_lshift, int_le, int_lt, int_mod, int_mul, int_ne, \
                    int_neg, int_not, int_or, int_pow, int_rshift, int_str, \
                    int_sub, int_xor
 
+def new_int(number_or_string, base=10):
+
+    """
+    Return an integer for 'number_or_string': an integer is returned as it is,
+    whereas a string is decoded using the given 'base'. Any other kind of
+    object causes a TypeError to be raised.
+    """
+
+    if is_int(number_or_string):
+        return number_or_string
+    elif isinstance(number_or_string, basestring):
+        return str_to_int(number_or_string, base)
+    else:
+        raise TypeError
+
+def str_to_int(value, base=10):
+
+    """
+    Decode the string 'value' using the given 'base', raising a ValueError if
+    'value' is empty or contains a digit not permitted by 'base'.
+    """
+
+    # NOTE: Add support for lower and upper in the string classes.
+
+    #value = value.lower()
+    len_value = len(value)
+    digits = "0123456789abcdefghijklmnopqrstuvwxyz"
+
+    # An empty string does not describe any number.
+
+    if len_value == 0:
+        raise ValueError(value)
+
+    result = 0
+    i = 0
+
+    while i < len_value:
+        c = value[i]
+        d = digits.index(c)
+
+        # Reject digits that are not valid in the given base.
+
+        if d >= base:
+            raise ValueError(value)
+
+        result = result * base + d
+        i += 1
+
+    return result
+
 class int:
 
     "An integer abstraction."
 
-    def __init__(self, number_or_string=None):
+    def __init__(self, number_or_string=None, base=10):
 
         "Initialise the integer with the given 'number_or_string'."
 
-        # Implemented in the translator.
+        # Implemented by new_int above, invoked specially by the translator.
 
         pass
 
@@ -245,7 +296,7 @@
 
         "Return a string representation."
 
-        return utf8string(int_str(self))
+        return unicode(int_str(self))
 
     __repr__ = __str__
 
diff -r b47da767f704 -r 7dec14799571 lib/__builtins__/stream.py
--- a/lib/__builtins__/stream.py	Sun Jun 27 22:14:51 2021 +0200
+++ b/lib/__builtins__/stream.py	Tue Jun 29 22:24:48 2021 +0200
@@ -144,7 +144,7 @@
         # Encode text as bytes if necessary. When the encoding is not set, any
         # original encoding of the text will be applied.
 
-        if _isinstance(s, utf8string):
+        if _isinstance(s, unicode):
             s = s.encode(self.encoding)
 
         fwrite(self.__data__, s)
diff -r b47da767f704 -r 7dec14799571 lib/__builtins__/unicode.py
--- a/lib/__builtins__/unicode.py	Sun Jun 27 22:14:51 2021 +0200
+++ b/lib/__builtins__/unicode.py	Tue Jun 29 22:24:48 2021 +0200
@@ -3,7 +3,7 @@
 """
 Unicode objects.
 
-Copyright (C) 2015, 2016, 2017 Paul Boddie <paul@boddie.org.uk>
+Copyright (C) 2015, 2016, 2017, 2021 Paul Boddie <paul@boddie.org.uk>
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -25,21 +25,58 @@
 from native import str_add, unicode_len, unicode_ord, unicode_substr, \
                    isinstance as _isinstance
 
-class utf8string(basestring):
+class unicode(basestring):
 
     "A character string representation based on UTF-8."
 
-    def __init__(self, other=None, encoding=None):
+    def __init__(self, s, encoding=None, original=None):
 
         """
-        Initialise the string, perhaps from 'other', with any original
-        'encoding' indicated.
+        Initialise the string from 's', employing any indicated 'encoding'
+        for the provided string data.
+
+        If 'original' is indicated, this may be used to override the original
+        encoding. This is useful when the string data is already in UTF-8
+        format, but where the original encoding needs to be communicated.
         """
 
-        get_using(basestring.__init__, self)(other)
-        self.encoding = encoding
         self.length = None
 
+        # Initialise using another Unicode object.
+
+        if _isinstance(s, unicode):
+            get_using(basestring.__init__, self)(s)
+            self.encoding = s.encoding
+
+        # Initialise using suitable string data but with an explicit original
+        # encoding.
+
+        elif original:
+            get_using(basestring.__init__, self)(s)
+            self.encoding = original
+
+        # Initialise using string data having either UTF-8 or another encoding,
+        # converting to UTF-8 and retaining the encoding details as the original
+        # encoding.
+
+        else:
+            # Obtain a string representation.
+
+            s = s.__str__()
+
+            # Convert the string to UTF-8. Even if the stated encoding is UTF-8, it
+            # needs to be validated.
+
+            to_utf8 = Converter(encoding or "UTF-8", "UTF-8")
+
+            try:
+                to_utf8.feed(s)
+                get_using(basestring.__init__, self)(str(to_utf8))
+            finally:
+                to_utf8.close()
+
+            self.encoding = encoding
+
     def _binary_op(self, op, other, sizes=False):
 
         "Perform 'op' on this object and 'other' if appropriate."
@@ -51,7 +88,7 @@
 
         # Combining text with bytes.
 
-        if not _isinstance(other, utf8string):
+        if not _isinstance(other, unicode):
             s = self.encode()
         else:
             s = self
@@ -72,7 +109,7 @@
 
         # Combining text with bytes.
 
-        if not _isinstance(other, utf8string):
+        if not _isinstance(other, unicode):
             s = self.encode()
         else:
             s = self
@@ -86,8 +123,8 @@
 
         "Convert 'result' to a Unicode object if 'other' already is."
 
-        if _isinstance(other, utf8string):
-            return utf8string(result, self.encoding)
+        if _isinstance(other, unicode):
+            return unicode(result, None, self.encoding)
         else:
             return result
 
@@ -188,15 +225,14 @@
             elif nonempty:
                 b.append(self)
 
-            if _isinstance(s, utf8string):
+            if _isinstance(s, unicode):
                 encoding = None
 
             b.append(s)
 
         s = str(b)
         if encoding:
-            s = utf8string(s)
-            s.encoding = encoding
+            s = unicode(s, None, encoding)
         return s
 
     # Special implementation methods.
@@ -204,9 +240,9 @@
     def __get_single_item__(self, index):
     
         "Return the item at the normalised (positive) 'index'."
-    
+ 
         self._check_index(index)
-        return utf8string(unicode_substr(self.__data__, self.__size__, index, index + 1, 1), self.encoding)
+        return unicode(unicode_substr(self.__data__, self.__size__, index, index + 1, 1), None, self.encoding)
 
     def __get_multiple_items__(self, start, end, step):
 
@@ -224,29 +260,6 @@
             raise ValueError(step)
 
         l = get_using(basestring.__get_multiple_items__, self)(start, end, step)
-        return utf8string("".join(l), self.encoding)
-
-def unicode(s, encoding):
-
-    "Convert 's' to a Unicode object, interpreting 's' as using 'encoding'."
-
-    if isinstance(s, utf8string):
-        return s
-
-    # Obtain a string representation.
-
-    s = s.__str__()
-
-    # Convert the string to UTF-8. Even if the stated encoding is UTF-8, it
-    # needs to be validated.
-
-    to_utf8 = Converter(encoding, "UTF-8")
-
-    try:
-        to_utf8.feed(s)
-        return utf8string(str(to_utf8), encoding)
-
-    finally:
-        to_utf8.close()
+        return unicode("".join(l), None, self.encoding)
 
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r b47da767f704 -r 7dec14799571 tests/int.py
--- a/tests/int.py	Sun Jun 27 22:14:51 2021 +0200
+++ b/tests/int.py	Tue Jun 29 22:24:48 2021 +0200
@@ -11,3 +11,20 @@
     a = int("a")        # should raise an exception
 except ValueError, exc:
     print 'int("a") failed:', exc.value
+
+try:
+    a = int("!")        # should raise an exception
+except ValueError, exc:
+    print 'int("!") failed:', exc.value
+
+a = int("a", 16)
+b = int("123")
+print a                 # 10
+print b, i, b == i      # 123, 123, True
+print b, j, b == j      # 123, 123, True
+
+a_is_int = isinstance(a, int)
+j_is_int = isinstance(j, int)
+
+print a_is_int          # True
+print j_is_int          # True
diff -r b47da767f704 -r 7dec14799571 tests/unicode.py
--- a/tests/unicode.py	Sun Jun 27 22:14:51 2021 +0200
+++ b/tests/unicode.py	Tue Jun 29 22:24:48 2021 +0200
@@ -48,7 +48,7 @@
 s7 = r"\346\370\345"
 print "Untranslated values:"
 print s7                            # \346\370\345
-print s7.__class__                  # __builtins__.unicode.utf8string
+print s7.__class__                  # __builtins__.unicode.unicode
 print len(s7)                       # 12
 
 # Obtain text and print it.
@@ -58,7 +58,7 @@
 u = unicode(b"æøå", "ISO-8859-15")
 print "Unicode values:"
 print u                             # æøå
-print u.__class__                   # __builtins__.unicode.utf8string
+print u.__class__                   # __builtins__.unicode.unicode
 print u.encode("ISO-8859-15")       # æøå
 print u.encoding                    # ISO-8859-15
 print len(u)                        # 3
@@ -68,7 +68,7 @@
 u2 = u"æøå"
 print "Unicode values:"
 print u2                            # æøå
-print u2.__class__                  # __builtins__.unicode.utf8string
+print u2.__class__                  # __builtins__.unicode.unicode
 print u2.encode("ISO-8859-15")      # æøå
 print u2.encoding                   # ISO-8859-15
 print len(u2)                       # 3
@@ -78,7 +78,7 @@
 u3 = "æøå"
 print "Unicode values:"
 print u3                            # æøå
-print u3.__class__                  # __builtins__.unicode.utf8string
+print u3.__class__                  # __builtins__.unicode.unicode
 print u3.encode("ISO-8859-15")      # æøå
 print u3.encoding                   # ISO-8859-15
 print len(u3)                       # 3
@@ -88,7 +88,7 @@
 u4 = unicode("æøå", "ISO-8859-15")
 print "Unicode values:"
 print u4                            # æøå
-print u4.__class__                  # __builtins__.unicode.utf8string
+print u4.__class__                  # __builtins__.unicode.unicode
 print u4.encode("ISO-8859-15")      # æøå
 print u4.encoding                   # ISO-8859-15
 print len(u4)                       # 3
@@ -163,7 +163,7 @@
 uu2 = u + u2
 print "Unicode values:"
 print uu2                           # æøåæøå
-print uu2.__class__                 # __builtins__.unicode.utf8string
+print uu2.__class__                 # __builtins__.unicode.unicode
 print uu2.encoding                  # ISO-8859-15
 print len(uu2)                      # 6
 
@@ -195,7 +195,7 @@
 # Test character access.
 
 u0 = u[0]
-print u0.__class__                  # __builtins__.unicode.utf8string
+print u0.__class__                  # __builtins__.unicode.unicode
 print u0.encoding                   # ISO-8859-15
 print u0                            # æ
 print u[-1]                         # å