paul@403 | 1 | /* Native functions for Unicode operations. |
paul@403 | 2 | |
paul@607 | 3 | Copyright (C) 2016, 2017 Paul Boddie <paul@boddie.org.uk> |
paul@403 | 4 | |
paul@403 | 5 | This program is free software; you can redistribute it and/or modify it under |
paul@403 | 6 | the terms of the GNU General Public License as published by the Free Software |
paul@403 | 7 | Foundation; either version 3 of the License, or (at your option) any later |
paul@403 | 8 | version. |
paul@403 | 9 | |
paul@403 | 10 | This program is distributed in the hope that it will be useful, but WITHOUT |
paul@403 | 11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
paul@403 | 12 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
paul@403 | 13 | details. |
paul@403 | 14 | |
paul@403 | 15 | You should have received a copy of the GNU General Public License along with |
paul@403 | 16 | this program. If not, see <http://www.gnu.org/licenses/>. |
paul@403 | 17 | */ |
paul@403 | 18 | |
paul@403 | 19 | #include "native/common.h" |
paul@403 | 20 | #include "types.h" |
paul@403 | 21 | #include "exceptions.h" |
paul@403 | 22 | #include "ops.h" |
paul@403 | 23 | #include "progconsts.h" |
paul@403 | 24 | #include "progops.h" |
paul@403 | 25 | #include "progtypes.h" |
paul@403 | 26 | #include "main.h" |
paul@403 | 27 | |
/* Return 1 if the byte c starts a UTF-8 encoded character and 0 otherwise.

   A byte starts a character when its top two bits are anything other than
   "10": either the top bit is clear (a single-byte value) or the top two bits
   are both set (the lead byte of a multi-byte sequence). */

static inline int boundary(char c)
{
    int top = c & 0xc0;

    /* Only continuation bytes carry the 10xxxxxx pattern. */

    return top != 0x80;
}
paul@431 | 32 | |
/* Return the code point bits carried by a byte that starts a character.

   Single-byte values are returned unchanged; lead bytes of two, three and four
   byte sequences yield their low five, four and three bits respectively. Any
   other byte (a continuation byte, or 0xf8 and above) yields zero. Note that
   zero is also a legitimate payload for some lead bytes, so the result cannot
   be used on its own to decide whether a byte is a boundary. */

static inline int boundary_value(char c)
{
    unsigned int byte = (unsigned char) c;

    if (byte < 0x80)
        return (int) byte;

    if (byte >= 0xc0 && byte < 0xe0)
        return (int) (byte & 0x1f);

    if (byte >= 0xe0 && byte < 0xf0)
        return (int) (byte & 0x0f);

    if (byte >= 0xf0 && byte < 0xf8)
        return (int) (byte & 0x07);

    return 0;
}
paul@534 | 41 | |
/* Return the byte position of the character following the one at bytestart.

   s is the string data, size its length in bytes and bytestart the position
   of the current character. The result is the position of the next boundary
   byte after bytestart, or size if there is no further character. When
   bytestart is already at or beyond size, it is returned unchanged.

   Only bytes at positions below size are examined, so the result does not
   depend on what follows the string data. */

static unsigned int nextpos(char *s, unsigned int size, unsigned int bytestart)
{
    unsigned int i = bytestart;

    if (i < size)
    {
        /* Step past the current byte and any continuation bytes after it. */

        do
        {
            i++;
        } while ((i < size) && !boundary(s[i]));
    }

    return i;
}
paul@431 | 55 | |
/* Return the byte position of the character preceding the one at bytestart.

   s is the string data and bytestart the position to search backwards from.
   The result is the position of the nearest boundary byte before bytestart,
   or zero if the start of the data is reached first. */

static unsigned int prevpos(char *s, unsigned int bytestart)
{
    unsigned int pos;

    /* Walk backwards one byte at a time until a boundary byte is found. */

    for (pos = bytestart; pos > 0; )
    {
        if (boundary(s[--pos]))
            break;
    }

    return pos;
}
paul@431 | 69 | |
paul@403 | 70 | /* Unicode operations. */ |
paul@403 | 71 | |
paul@403 | 72 | __attr __fn_native_unicode_unicode_len(__attr __args[]) |
paul@403 | 73 | { |
paul@403 | 74 | __attr * const _data = &__args[1]; |
paul@583 | 75 | __attr * const _size = &__args[2]; |
paul@403 | 76 | /* _data interpreted as string */ |
paul@403 | 77 | char *s = _data->strvalue; |
paul@583 | 78 | /* _size interpreted as int */ |
paul@583 | 79 | int size = _size->intvalue; |
paul@431 | 80 | unsigned int i, c = 0; |
paul@403 | 81 | |
paul@583 | 82 | for (i = 0; i < size; i++) |
paul@431 | 83 | if (boundary(s[i])) |
paul@403 | 84 | c++; |
paul@403 | 85 | |
paul@403 | 86 | /* Return the new integer. */ |
paul@403 | 87 | return __new_int(c); |
paul@403 | 88 | } |
paul@403 | 89 | |
paul@534 | 90 | __attr __fn_native_unicode_unicode_ord(__attr __args[]) |
paul@534 | 91 | { |
paul@534 | 92 | __attr * const _data = &__args[1]; |
paul@583 | 93 | __attr * const _size = &__args[2]; |
paul@534 | 94 | /* _data interpreted as string */ |
paul@534 | 95 | char *s = _data->strvalue; |
paul@583 | 96 | /* _size interpreted as int */ |
paul@583 | 97 | int size = _size->intvalue; |
paul@534 | 98 | unsigned int i, c = 0, v; |
paul@534 | 99 | |
paul@583 | 100 | for (i = 0; i < size; i++) |
paul@534 | 101 | { |
paul@534 | 102 | /* Evaluate the current character as a boundary. */ |
paul@534 | 103 | |
paul@534 | 104 | v = boundary_value(s[i]); |
paul@534 | 105 | |
paul@534 | 106 | /* Boundary with characters read: stop reading. */ |
paul@534 | 107 | |
paul@534 | 108 | if (v && i) |
paul@534 | 109 | break; |
paul@534 | 110 | |
paul@534 | 111 | /* Boundary: initialise with the extracted value. */ |
paul@534 | 112 | |
paul@534 | 113 | else if (v) |
paul@534 | 114 | c = v; |
paul@534 | 115 | |
paul@534 | 116 | /* Not a boundary: shift and combine with the continuation value. */ |
paul@534 | 117 | |
paul@534 | 118 | else |
paul@534 | 119 | c = (c << 6) | (s[i] & 0x3f); |
paul@534 | 120 | } |
paul@534 | 121 | |
paul@534 | 122 | /* Return the new integer. */ |
paul@534 | 123 | return __new_int(c); |
paul@534 | 124 | } |
paul@534 | 125 | |
paul@431 | 126 | __attr __fn_native_unicode_unicode_substr(__attr __args[]) |
paul@431 | 127 | { |
paul@431 | 128 | __attr * const _data = &__args[1]; |
paul@583 | 129 | __attr * const _size = &__args[2]; |
paul@583 | 130 | __attr * const start = &__args[3]; |
paul@583 | 131 | __attr * const end = &__args[4]; |
paul@583 | 132 | __attr * const step = &__args[5]; |
paul@431 | 133 | /* _data interpreted as string */ |
paul@431 | 134 | char *s = _data->strvalue, *sub; |
paul@583 | 135 | /* _size interpreted as int */ |
paul@583 | 136 | int ss = _size->intvalue; |
paul@431 | 137 | /* start.__data__ interpreted as int */ |
paul@431 | 138 | int istart = __load_via_object(start->value, __pos___data__).intvalue; |
paul@431 | 139 | /* end.__data__ interpreted as int */ |
paul@431 | 140 | int iend = __load_via_object(end->value, __pos___data__).intvalue; |
paul@431 | 141 | /* step.__data__ interpreted as int */ |
paul@431 | 142 | int istep = __load_via_object(step->value, __pos___data__).intvalue; |
paul@431 | 143 | |
paul@431 | 144 | /* Calculate the number of characters. */ |
paul@431 | 145 | size_t nchar = ((iend - istart - (istep > 0 ? 1 : -1)) / istep) + 1; |
paul@431 | 146 | unsigned int indexes[nchar]; |
paul@431 | 147 | |
paul@431 | 148 | unsigned int c, d, i, to, from, lastbyte = 0; |
paul@583 | 149 | int resultsize = 0; |
paul@431 | 150 | |
paul@431 | 151 | /* Find the indexes of the characters. */ |
paul@431 | 152 | if (istep > 0) |
paul@431 | 153 | { |
paul@431 | 154 | /* Get the first byte position. */ |
paul@431 | 155 | for (c = 0; c < istart; c++) |
paul@569 | 156 | lastbyte = nextpos(s, ss, lastbyte); |
paul@431 | 157 | |
paul@431 | 158 | /* Get each subsequent byte position. */ |
paul@431 | 159 | for (c = istart, i = 0; i < nchar; c += istep, i++) |
paul@431 | 160 | { |
paul@431 | 161 | indexes[i] = lastbyte; |
paul@431 | 162 | |
paul@431 | 163 | /* Add the character size to the result size. */ |
paul@569 | 164 | resultsize += nextpos(s, ss, lastbyte) - lastbyte; |
paul@431 | 165 | |
paul@431 | 166 | for (d = c; d < c + istep; d++) |
paul@569 | 167 | lastbyte = nextpos(s, ss, lastbyte); |
paul@431 | 168 | } |
paul@431 | 169 | } |
paul@431 | 170 | else |
paul@431 | 171 | { |
paul@431 | 172 | /* Get the first byte position. */ |
paul@431 | 173 | for (c = 0; c < istart; c++) |
paul@569 | 174 | lastbyte = nextpos(s, ss, lastbyte); |
paul@431 | 175 | |
paul@431 | 176 | /* Get each subsequent byte position. */ |
paul@431 | 177 | for (c = istart, i = 0; i < nchar; c += istep, i++) |
paul@431 | 178 | { |
paul@431 | 179 | indexes[i] = lastbyte; |
paul@431 | 180 | |
paul@431 | 181 | /* Add the character size to the result size. */ |
paul@569 | 182 | resultsize += nextpos(s, ss, lastbyte) - lastbyte; |
paul@431 | 183 | |
paul@431 | 184 | for (d = c; d > c + istep; d--) |
paul@431 | 185 | lastbyte = prevpos(s, lastbyte); |
paul@431 | 186 | } |
paul@431 | 187 | } |
paul@431 | 188 | |
paul@431 | 189 | /* Reserve space for a new string. */ |
paul@431 | 190 | sub = (char *) __ALLOCATE(resultsize + 1, sizeof(char)); |
paul@431 | 191 | |
paul@431 | 192 | /* Does not null terminate but final byte should be zero. */ |
paul@431 | 193 | for (i = 0, to = 0; i < nchar; i++) |
paul@431 | 194 | { |
paul@431 | 195 | from = indexes[i]; |
paul@431 | 196 | do |
paul@431 | 197 | { |
paul@431 | 198 | sub[to++] = s[from++]; |
paul@431 | 199 | } while (!boundary(s[from])); |
paul@431 | 200 | } |
paul@431 | 201 | |
paul@431 | 202 | return __new_str(sub, resultsize); |
paul@431 | 203 | } |
paul@431 | 204 | |
paul@607 | 205 | __attr __fn_native_unicode_unicode_unichr(__attr __args[]) |
paul@607 | 206 | { |
paul@607 | 207 | __attr * const value = &__args[1]; |
paul@607 | 208 | /* value interpreted as int */ |
paul@607 | 209 | int i = value->intvalue; |
paul@607 | 210 | unsigned int resultsize; |
paul@607 | 211 | char *s; |
paul@607 | 212 | |
paul@607 | 213 | if (i < 128) resultsize = 1; |
paul@607 | 214 | else if (i < 2048) resultsize = 2; |
paul@607 | 215 | else if (i < 65536) resultsize = 3; |
paul@607 | 216 | else resultsize = 4; |
paul@607 | 217 | |
paul@607 | 218 | /* Reserve space for a new string. */ |
paul@607 | 219 | |
paul@607 | 220 | s = (char *) __ALLOCATE(resultsize + 1, sizeof(char)); |
paul@607 | 221 | |
paul@607 | 222 | /* Populate the string. */ |
paul@607 | 223 | |
paul@607 | 224 | if (i < 128) s[0] = (char) i; |
paul@607 | 225 | else if (i < 2048) |
paul@607 | 226 | { |
paul@607 | 227 | s[0] = 0b11000000 | (i >> 6); |
paul@607 | 228 | s[1] = 0b10000000 | (i & 0b00111111); |
paul@607 | 229 | } |
paul@607 | 230 | else if (i < 65536) |
paul@607 | 231 | { |
paul@607 | 232 | s[0] = 0b11100000 | (i >> 12); |
paul@607 | 233 | s[1] = 0b10000000 | ((i >> 6) & 0b00111111); |
paul@607 | 234 | s[2] = 0b10000000 | (i & 0b00111111); |
paul@607 | 235 | } |
paul@607 | 236 | else |
paul@607 | 237 | { |
paul@607 | 238 | s[0] = 0b11110000 | (i >> 18); |
paul@607 | 239 | s[1] = 0b10000000 | ((i >> 12) & 0b00111111); |
paul@607 | 240 | s[2] = 0b10000000 | ((i >> 6) & 0b00111111); |
paul@607 | 241 | s[3] = 0b10000000 | (i & 0b00111111); |
paul@607 | 242 | } |
paul@607 | 243 | |
paul@607 | 244 | return __new_str(s, resultsize); |
paul@607 | 245 | } |
paul@607 | 246 | |
paul@403 | 247 | /* Module initialisation. */ |
paul@403 | 248 | |
void __main_native_unicode()
{
    /* The functions in this file need no set-up, so there is nothing to do
       here. */
}