# HG changeset patch
# User Paul Boddie
# Date 1481499654 -3600
# Node ID 86cd43476c54e9d8b74c836cf18a6b3a1926afd6
# Parent  38d6eb5db365930ecf41add39942fa88fcb9164a
Added support for character set conversions using the iconv API.

diff -r 38d6eb5db365 -r 86cd43476c54 lib/native/__init__.py
--- a/lib/native/__init__.py	Mon Dec 12 00:40:23 2016 +0100
+++ b/lib/native/__init__.py	Mon Dec 12 00:40:54 2016 +0100
@@ -29,6 +29,8 @@
 
 from native.introspection import object_getattr, isinstance, issubclass
 
+from native.iconv import iconv, iconv_close, iconv_open
+
 from native.io import fclose, fopen, fdopen, close, read, write, fread, fwrite
 
 from native.limits import get_maxint, get_minint
diff -r 38d6eb5db365 -r 86cd43476c54 lib/native/iconv.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/native/iconv.py	Mon Dec 12 00:40:54 2016 +0100
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+
+"""
+Native library functions for character set conversion.
+
+None of these are actually defined here. Instead, native implementations are
+substituted when each program is built. It is, however, important to declare
+non-core exceptions used by the native functions because they need to be
+identified as being needed by the program.
+
+Copyright (C) 2016 Paul Boddie
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public License along with
+this program. If not, see <http://www.gnu.org/licenses/>.
+"""
+
+def iconv_open(tocode, fromcode):
+    OSError
+
+def iconv_close(cd):
+    OSError
+
+def iconv(cd, instr, state):
+    OSError
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r 38d6eb5db365 -r 86cd43476c54 lib/posix/iconv.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/posix/iconv.py	Mon Dec 12 00:40:54 2016 +0100
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+
+"""
+POSIX character set conversion functions.
+
+Copyright (C) 2016 Paul Boddie
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public License along with
+this program. If not, see <http://www.gnu.org/licenses/>.
+"""
+
+from __builtins__.types import check_int, check_string
+from native import iconv_close, iconv_open, iconv
+
+class ConverterError(Exception):
+
+    "An error indicating a failure involving a character set converter."
+
+    pass
+
+# Error numbers: assumed to match the errno values reported via OSError.
+
+E2BIG = 7
+EINVAL = 22
+EILSEQ = 84
+
+class Converter:
+
+    "A character set converter."
+
+    def __init__(self, from_encoding, to_encoding):
+
+        "Initialise conversion between 'from_encoding' and 'to_encoding'."
+
+        check_string(from_encoding)
+        check_string(to_encoding)
+        self.__data__ = iconv_open(to_encoding, from_encoding)
+
+    def close(self):
+
+        "Close this converter, doing nothing if it has already been closed."
+
+        if self.__data__ is not None:
+            iconv_close(self.__data__)
+            self.__data__ = None
+
+    def convert(self, s):
+
+        """
+        Convert 's' between the converter's encodings, returning the converted
+        string. A ConverterError is raised if the converter has been closed.
+        """
+
+        if self.__data__ is None:
+            raise ConverterError
+
+        check_string(s)
+
+        # Record the start position and the number of bytes still to convert.
+
+        result = []
+        state = [0, len(s)]
+
+        while True:
+
+            # Obtain converted text and update the state.
+
+            out = iconv(self.__data__, s, state)
+            result.append(out)
+
+            # Test for the end of the conversion.
+
+            start, remaining = state
+            if not remaining:
+                return "".join(result)
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r 38d6eb5db365 -r 86cd43476c54 templates/native/iconv.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/templates/native/iconv.c	Mon Dec 12 00:40:54 2016 +0100
@@ -0,0 +1,166 @@
+/* Native functions for character set conversion.
+
+Copyright (C) 2016 Paul Boddie
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public License along with
+this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <iconv.h> /* iconv, iconv_close, iconv_open */
+#include <string.h> /* memcpy */
+#include <errno.h> /* errno */
+#include "native/common.h"
+#include "types.h"
+#include "exceptions.h"
+#include "ops.h"
+#include "progconsts.h"
+#include "progops.h"
+#include "progtypes.h"
+#include "main.h"
+
+/* Bounds on the output buffer, which is allocated on the stack. */
+
+static const size_t OUTBUFSIZE_MIN = 16;
+static const size_t OUTBUFSIZE_MAX = 4096;
+
+/* Character set conversion. */
+
+/* Convert text from the given string using the given descriptor, starting at
+   the position and covering the number of bytes recorded in the given state
+   list. The converted text is returned as a new string, with the state being
+   updated to describe the input still to be converted. An OSError is raised
+   for invalid or incomplete input and for other failures. */
+
+__attr __fn_native_iconv_iconv(__attr __args[])
+{
+    __attr * const cd = &__args[1];
+    __attr * const instr = &__args[2];
+    __attr * const state = &__args[3];
+    /* cd interpreted as iconv_t */
+    iconv_t c = (iconv_t) cd->datavalue;
+    /* instr.__data__ interpreted as string */
+    char *inbuf = __load_via_object(instr->value, __pos___data__).strvalue;
+    /* state.__data__ interpreted as list */
+    __fragment *f = __load_via_object(state->value, __pos___data__).seqvalue;
+
+    /* Obtain the start position from the state. */
+
+    int start = __load_via_object(f->attrs[0].value, __pos___data__).intvalue;
+    int remaining = __load_via_object(f->attrs[1].value, __pos___data__).intvalue;
+
+    /* Allocate a string for the output buffer using the remaining input size
+       as a guide, but keep it within bounds because it resides on the stack.
+       Input that does not fit is left for a later call, as described by the
+       updated state. */
+
+    size_t inbytesleft = remaining;
+    size_t outbufsize = inbytesleft < OUTBUFSIZE_MIN ? OUTBUFSIZE_MIN :
+                        inbytesleft > OUTBUFSIZE_MAX ? OUTBUFSIZE_MAX :
+                        inbytesleft;
+    size_t outbytesleft = outbufsize;
+
+    char buf[outbufsize];
+    char *outbuf = buf, *outbufstart = outbuf, *resultbuf;
+    size_t result, outbytestotal;
+
+    /* Convert from the start point. */
+
+    inbuf += start;
+
+    errno = 0;
+    result = iconv(c, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
+
+    /* Return any string. An exhausted output buffer (E2BIG) is not an error:
+       the text converted so far is returned and the state indicates where the
+       next call must resume. */
+
+    if ((result != (size_t) -1) || (errno == E2BIG))
+    {
+        outbytestotal = outbufsize - outbytesleft;
+        resultbuf = __ALLOCATE(outbytestotal + 1, sizeof(char));
+        memcpy(resultbuf, outbufstart, outbytestotal);
+
+        /* Mutate the state to indicate the next input buffer position. */
+
+        f->attrs[0] = __new_int(start + remaining - inbytesleft);
+        f->attrs[1] = __new_int(inbytesleft);
+        return __new_str(resultbuf, outbytestotal);
+    }
+
+    /* Invalid sequence (EILSEQ) or incomplete sequence (EINVAL): raise an
+       error providing the unconverted input as its argument. */
+
+    if ((errno == EILSEQ) || (errno == EINVAL))
+    {
+        resultbuf = __ALLOCATE(inbytesleft + 1, sizeof(char));
+        memcpy(resultbuf, inbuf, inbytesleft);
+        __raise_os_error(__new_int(errno), __new_str(resultbuf, inbytesleft));
+    }
+
+    /* General failure. */
+
+    else
+        __raise_os_error(__new_int(errno), __builtins___none_None);
+
+    /* Should never be reached: included to satisfy the compiler. */
+
+    return __builtins___none_None;
+}
+
+/* Release the resources associated with a descriptor. */
+
+__attr __fn_native_iconv_iconv_close(__attr __args[])
+{
+    __attr * const cd = &__args[1];
+    /* cd interpreted as iconv_t */
+    iconv_t c = (iconv_t) cd->datavalue;
+
+    errno = 0;
+
+    if (iconv_close(c) == -1)
+        __raise_os_error(__new_int(errno), __builtins___none_None);
+
+    return __builtins___none_None;
+}
+
+/* Open a descriptor for conversion from one encoding to another. */
+
+__attr __fn_native_iconv_iconv_open(__attr __args[])
+{
+    __attr * const tocode = &__args[1];
+    __attr * const fromcode = &__args[2];
+    /* tocode.__data__ interpreted as string */
+    char *t = __load_via_object(tocode->value, __pos___data__).strvalue;
+    /* fromcode.__data__ interpreted as string */
+    char *f = __load_via_object(fromcode->value, __pos___data__).strvalue;
+    iconv_t result;
+    __attr attr;
+
+    errno = 0;
+    result = iconv_open(t, f);
+
+    if (result == (iconv_t) -1)
+        __raise_os_error(__new_int(errno), __builtins___none_None);
+
+    /* Return the descriptor as an opaque value. */
+
+    attr.context = 0;
+    attr.datavalue = (void *) result;
+    return attr;
+}
+
+/* Module initialisation. */
+
+void __main_native_iconv()
+{
+}
diff -r 38d6eb5db365 -r 86cd43476c54 templates/native/iconv.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/templates/native/iconv.h	Mon Dec 12 00:40:54 2016 +0100
@@ -0,0 +1,34 @@
+/* Native functions for character set conversion.
+
+Copyright (C) 2016 Paul Boddie
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+details.
+
+You should have received a copy of the GNU General Public License along with
+this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __NATIVE_ICONV_H__
+#define __NATIVE_ICONV_H__
+
+#include "types.h"
+
+/* Character set conversion. */
+
+__attr __fn_native_iconv_iconv(__attr __args[]);
+__attr __fn_native_iconv_iconv_close(__attr __args[]);
+__attr __fn_native_iconv_iconv_open(__attr __args[]);
+
+/* Module initialisation. */
+
+void __main_native_iconv();
+
+#endif /* __NATIVE_ICONV_H__ */
diff -r 38d6eb5db365 -r 86cd43476c54 tests/iconv.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/iconv.py	Mon Dec 12 00:40:54 2016 +0100
@@ -0,0 +1,36 @@
+# -*- coding: ISO-8859-1 -*-
+
+from posix.iconv import Converter, EINVAL, EILSEQ
+
+to_utf8 = Converter("ISO-8859-1", "UTF-8") # from ISO-8859-1 to UTF-8
+to_utf16 = Converter("ISO-8859-1", "UTF-16") # from ISO-8859-1 to UTF-16
+from_utf8 = Converter("UTF-8", "ISO-8859-1") # from UTF-8 to ISO-8859-1
+from_utf16 = Converter("UTF-16", "ISO-8859-1") # from UTF-16 to ISO-8859-1
+
+try: # convert text to each encoding and back again
+    iso = "æøå"
+    print iso # æøå
+    utf = to_utf8.convert(iso)
+    print utf # æøå
+    print from_utf8.convert(utf) # æøå
+    utf = to_utf16.convert(iso)
+    print utf # ...
+    try: # report any conversion failure instead of propagating it
+        print from_utf16.convert(utf) # æøå
+    except OSError, exc:
+        if exc.value == EINVAL:
+            print "Incomplete input", exc.arg
+        elif exc.value == EILSEQ:
+            print "Invalid input", exc.arg
+        else:
+            print exc.value, exc.arg
+finally: # always close the converters
+    to_utf8.close()
+    to_utf16.close()
+    from_utf8.close()
+    from_utf16.close()
+
+try: # unrecognised encoding names are expected to cause an OSError
+    Converter("horses", "giraffes")
+except OSError, exc:
+    print 'Converter("horses", "giraffes"): not valid encodings; error is', exc.value