Lichen

Annotated templates/native/unicode.c

610:2c1ae8f292a9
2017-02-22 Paul Boddie Fixed the testing of attribute usage. method-wrapper-for-context
paul@403 1
/* Native functions for Unicode operations.
paul@403 2
paul@607 3
Copyright (C) 2016, 2017 Paul Boddie <paul@boddie.org.uk>
paul@403 4
paul@403 5
This program is free software; you can redistribute it and/or modify it under
paul@403 6
the terms of the GNU General Public License as published by the Free Software
paul@403 7
Foundation; either version 3 of the License, or (at your option) any later
paul@403 8
version.
paul@403 9
paul@403 10
This program is distributed in the hope that it will be useful, but WITHOUT
paul@403 11
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@403 12
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
paul@403 13
details.
paul@403 14
paul@403 15
You should have received a copy of the GNU General Public License along with
paul@403 16
this program.  If not, see <http://www.gnu.org/licenses/>.
paul@403 17
*/
paul@403 18
paul@403 19
#include "native/common.h"
paul@403 20
#include "types.h"
paul@403 21
#include "exceptions.h"
paul@403 22
#include "ops.h"
paul@403 23
#include "progconsts.h"
paul@403 24
#include "progops.h"
paul@403 25
#include "progtypes.h"
paul@403 26
#include "main.h"
paul@403 27
paul@431 28
/* Return 1 if the byte c starts a UTF-8 encoded character and 0 otherwise.
   A byte starts a character when its top bit is clear (a single-byte,
   ASCII-range value, including NUL) or when its top two bits are both set
   (the lead byte of a multi-byte sequence). Bytes of the form 10xxxxxx are
   continuation bytes and yield 0. */

static inline int boundary(char c)
{
    /* Work on the raw byte value so the tests do not depend on whether plain
       char is signed. */

    const unsigned int byte = (unsigned char) c;

    if (byte < 0x80)
        return 1;

    return (byte & 0xc0) == 0xc0;
}
paul@431 32
paul@534 33
/* Return the payload bits carried by a byte that starts a UTF-8 character:
   the byte itself for 0xxxxxxx, the low 3 bits for 11110xxx, the low 4 bits
   for 1110xxxx and the low 5 bits for 110xxxxx. Any other byte (continuation
   bytes and 11111xxx) yields 0.

   Note that 0 is also produced for NUL and for lead bytes whose payload bits
   are all zero (0xc0, 0xe0, 0xf0), so a zero result on its own does not
   identify a continuation byte; use boundary() for that test. */

static inline int boundary_value(char c)
{
    const int byte = (unsigned char) c;

    /* Single-byte character. */

    if (byte < 0x80)
        return byte;

    /* Lead bytes of four-, three- and two-byte sequences. */

    if ((byte & 0xf8) == 0xf0)
        return byte & 0x07;

    if ((byte & 0xf0) == 0xe0)
        return byte & 0x0f;

    if ((byte & 0xe0) == 0xc0)
        return byte & 0x1f;

    return 0;
}
paul@534 41
paul@431 42
/* Return the byte offset of the first character boundary in s that lies
   strictly after bytestart, considering only the first size bytes of s.
   If no such boundary exists the result is size; if bytestart is already at
   or beyond size it is returned unchanged.

   Only bytes at offsets below size are examined, so s need not carry a
   terminator at s[size]. */

static unsigned int nextpos(const char *s, unsigned int size, unsigned int bytestart)
{
    unsigned int i;

    if (bytestart >= size)
        return bytestart;

    /* Skip the continuation bytes following the current character. */

    for (i = bytestart + 1; i < size; i++)
        if (boundary(s[i]))
            break;

    return i;
}
paul@431 55
paul@431 56
/* Return the byte offset of the nearest character boundary in s that lies
   strictly before bytestart. The search stops at offset 0, which is returned
   when no earlier boundary is found or when bytestart is already 0. */

static unsigned int prevpos(const char *s, unsigned int bytestart)
{
    unsigned int pos = bytestart;

    while (pos > 0)
    {
        pos--;

        if (boundary(s[pos]))
            return pos;
    }

    return pos;
}
paul@431 69
paul@403 70
/* Unicode operations. */
paul@403 71
paul@403 72
/* Return, as a new integer, the number of characters encoded in the byte
   string given by __args[1], whose length in bytes is given by __args[2].
   Characters are counted by counting the bytes that start a character (see
   boundary()). A size of zero or less gives a count of zero. */

__attr __fn_native_unicode_unicode_len(__attr __args[])
{
    __attr * const _data = &__args[1];
    __attr * const _size = &__args[2];
    /* _data interpreted as string */
    char *s = _data->strvalue;
    /* _size interpreted as int */
    int size = _size->intvalue;
    int i, c = 0;

    /* The index is signed like size so that a negative size cannot be
       converted into a very large unsigned loop bound. */

    for (i = 0; i < size; i++)
    {
        if (boundary(s[i]))
            c++;
    }

    /* Return the new integer. */

    return __new_int(c);
}
paul@403 89
paul@534 90
/* Return, as a new integer, the code point of the first character encoded in
   the byte string given by __args[1], whose length in bytes is given by
   __args[2]. Decoding stops at the start of the second character or at the
   end of the data, whichever comes first. An empty string gives zero.

   No validation of the encoding is performed: a leading continuation byte
   simply contributes its low six bits. */

__attr __fn_native_unicode_unicode_ord(__attr __args[])
{
    __attr * const _data = &__args[1];
    __attr * const _size = &__args[2];
    /* _data interpreted as string */
    char *s = _data->strvalue;
    /* _size interpreted as int */
    int size = _size->intvalue;
    unsigned int c = 0;
    int i;

    for (i = 0; i < size; i++)
    {
        /* Boundaries are detected with boundary() rather than by testing the
           extracted value, because lead bytes such as 0xe0 and 0xf0 carry a
           payload of zero and would otherwise be mistaken for continuation
           bytes. */

        if (boundary(s[i]))
        {
            /* Boundary with characters read: stop reading. */

            if (i)
                break;

            /* Boundary: initialise with the extracted value. */

            c = boundary_value(s[i]);
        }

        /* Not a boundary: shift and combine with the continuation value. */

        else
            c = (c << 6) | (s[i] & 0x3f);
    }

    /* Return the new integer. */

    return __new_int(c);
}
paul@534 125
paul@431 126
/* Return a new string made of selected characters of a UTF-8 byte string.

   __args[1] holds the string data and __args[2] its length in bytes.
   __args[3], __args[4] and __args[5] are objects whose __data__ attributes
   are read as the integer start, end and step, all measured in characters.
   Characters are taken at start, start + step, ... up to but excluding end;
   a negative step walks backwards through the string.

   A selection with no characters, or a step of zero, yields an empty string.
   NOTE(review): start and end are presumably normalised to the bounds of the
   string by the caller; positions beyond the data contribute no bytes here. */

__attr __fn_native_unicode_unicode_substr(__attr __args[])
{
    __attr * const _data = &__args[1];
    __attr * const _size = &__args[2];
    __attr * const start = &__args[3];
    __attr * const end = &__args[4];
    __attr * const step = &__args[5];
    /* _data interpreted as string */
    char *s = _data->strvalue, *sub;
    /* _size interpreted as int */
    int ss = _size->intvalue;
    /* start.__data__ interpreted as int */
    int istart = __load_via_object(start->value, __pos___data__).intvalue;
    /* end.__data__ interpreted as int */
    int iend = __load_via_object(end->value, __pos___data__).intvalue;
    /* step.__data__ interpreted as int */
    int istep = __load_via_object(step->value, __pos___data__).intvalue;

    /* Calculate the number of characters, treating a zero step or a
       non-positive count as an empty selection so that neither a division by
       zero nor a wrapped, huge count can occur. */

    size_t nchar = 0;

    if (istep != 0)
    {
        int count = ((iend - istart - (istep > 0 ? 1 : -1)) / istep) + 1;

        if (count > 0)
            nchar = (size_t) count;
    }

    /* Byte offset of each selected character. The array always has at least
       one element because a zero-length variable length array is undefined. */

    unsigned int indexes[nchar ? nchar : 1];

    unsigned int i, to, from, limit, lastbyte = 0;
    int c, d, resultsize = 0;

    /* Get the first byte position. */

    for (c = 0; c < istart; c++)
        lastbyte = nextpos(s, ss, lastbyte);

    /* Get each subsequent byte position. */

    for (i = 0; i < nchar; i++)
    {
        indexes[i] = lastbyte;

        /* Add the character size to the result size. */

        resultsize += nextpos(s, ss, lastbyte) - lastbyte;

        /* Move |istep| characters forwards or backwards. */

        if (istep > 0)
        {
            for (d = 0; d < istep; d++)
                lastbyte = nextpos(s, ss, lastbyte);
        }
        else
        {
            for (d = 0; d > istep; d--)
                lastbyte = prevpos(s, lastbyte);
        }
    }

    /* Reserve space for a new string. */

    sub = (char *) __ALLOCATE(resultsize + 1, sizeof(char));

    /* Copy exactly the bytes counted in resultsize for each character, so the
       copy can overrun neither the source data nor the new string. The final
       byte is not written here.
       NOTE(review): presumably __ALLOCATE zero-fills, leaving the result
       terminated - confirm. */

    for (i = 0, to = 0; i < nchar; i++)
    {
        from = indexes[i];
        limit = nextpos(s, ss, from);

        while (from < limit)
            sub[to++] = s[from++];
    }

    return __new_str(sub, resultsize);
}
paul@431 204
paul@607 205
/* Return a new string holding the UTF-8 encoding of the integer code point
   given by __args[1]. Values below 0x80 are encoded in one byte, below 0x800
   in two, below 0x10000 in three and all others in four.

   NOTE(review): the value is not range-checked here; negative values and
   values above 0x10ffff are encoded as-is and do not produce valid UTF-8 -
   presumably the caller validates the argument. */

__attr __fn_native_unicode_unicode_unichr(__attr __args[])
{
    __attr * const value = &__args[1];
    /* value interpreted as int */
    int i = value->intvalue;
    unsigned int resultsize;
    char *s;

    if (i < 0x80) resultsize = 1;
    else if (i < 0x800) resultsize = 2;
    else if (i < 0x10000) resultsize = 3;
    else resultsize = 4;

    /* Reserve space for a new string. */

    s = (char *) __ALLOCATE(resultsize + 1, sizeof(char));

    /* Populate the string: a lead byte carrying the highest bits, followed by
       continuation bytes (10xxxxxx) carrying six bits each. Hexadecimal
       constants are used because binary literals are not part of ISO C before
       C23. */

    switch (resultsize)
    {
        case 1:
            s[0] = (char) i;
            break;

        case 2:
            s[0] = (char) (0xc0 | (i >> 6));
            s[1] = (char) (0x80 | (i & 0x3f));
            break;

        case 3:
            s[0] = (char) (0xe0 | (i >> 12));
            s[1] = (char) (0x80 | ((i >> 6) & 0x3f));
            s[2] = (char) (0x80 | (i & 0x3f));
            break;

        default:
            s[0] = (char) (0xf0 | (i >> 18));
            s[1] = (char) (0x80 | ((i >> 12) & 0x3f));
            s[2] = (char) (0x80 | ((i >> 6) & 0x3f));
            s[3] = (char) (0x80 | (i & 0x3f));
            break;
    }

    return __new_str(s, resultsize);
}
paul@607 246
paul@403 247
/* Module initialisation. */
paul@403 248
paul@403 249
/* Module initialisation hook for the native unicode module. It performs no
   work and takes no arguments. */

void __main_native_unicode(void)
{
    /* Nothing to initialise. */
}