1.1 --- a/templates/native/unicode.c Mon Dec 19 00:26:49 2016 +0100
1.2 +++ b/templates/native/unicode.c Fri Jan 06 22:23:52 2017 +0100
1.3 @@ -16,7 +16,6 @@
1.4 this program. If not, see <http://www.gnu.org/licenses/>.
1.5 */
1.6
1.7 -#include <string.h> /* strcmp, memcpy */
1.8 #include "native/common.h"
1.9 #include "types.h"
1.10 #include "exceptions.h"
1.11 @@ -26,6 +25,39 @@
1.12 #include "progtypes.h"
1.13 #include "main.h"
1.14
1.15 +static inline int boundary(char c) /* Non-zero when c starts a character: any byte whose top two bits are not 10 (the UTF-8 continuation form). */
1.16 +{
1.17 + return ((c & 0xc0) == 0xc0) || !(c & 0x80); /* top two bits both set, or top bit clear */
1.18 +}
1.19 +
1.20 +static unsigned int nextpos(char *s, unsigned int size, unsigned int bytestart) /* Scan forward in s from bytestart and return the index of the next boundary byte, or size if the loop runs out first. */
1.21 +{
1.22 + unsigned int i = bytestart;
1.23 +
1.24 + while (i < size)
1.25 + {
1.26 + i++; /* advance before testing, so the byte at bytestart itself is never examined */
1.27 + if (boundary(s[i])) /* NOTE(review): i can equal size here, so s[size] is read; presumably a zero byte follows the data - confirm */
1.28 + break;
1.29 + }
1.30 +
1.31 + return i; /* equals bytestart when bytestart >= size, since the loop body never runs */
1.32 +}
1.33 +
1.34 +static unsigned int prevpos(char *s, unsigned int bytestart) /* Scan backward in s from bytestart and return the index of the nearest earlier boundary byte, or 0 if none is found. */
1.35 +{
1.36 + unsigned int i = bytestart;
1.37 +
1.38 + while (i > 0)
1.39 + {
1.40 + i--; /* retreat before testing, so the byte at bytestart itself is never examined */
1.41 + if (boundary(s[i]))
1.42 + break;
1.43 + }
1.44 +
1.45 + return i; /* also 0 when bytestart is 0, since the loop body never runs */
1.46 +}
1.47 +
1.48 /* Unicode operations. */
1.49
1.50 __attr __fn_native_unicode_unicode_len(__attr __args[])
1.51 @@ -33,16 +65,92 @@
1.52 __attr * const _data = &__args[1];
1.53 /* _data interpreted as string */
1.54 char *s = _data->strvalue;
1.55 - int i, c = 0;
1.56 + unsigned int i, c = 0;
1.57
1.58 for (i = 0; i < _data->size; i++)
1.59 - if (((s[i] & 0xc0) == 0xc0) || !(s[i] & 0x80))
1.60 + if (boundary(s[i]))
1.61 c++;
1.62
1.63 /* Return the new integer. */
1.64 return __new_int(c);
1.65 }
1.66
1.67 +__attr __fn_native_unicode_unicode_substr(__attr __args[]) /* Build a new string from the characters of __args[1] selected by the start (__args[2]), end (__args[3]) and step (__args[4]) character indexes. */
1.68 +{
1.69 + __attr * const _data = &__args[1];
1.70 + __attr * const start = &__args[2];
1.71 + __attr * const end = &__args[3];
1.72 + __attr * const step = &__args[4];
1.73 + /* _data interpreted as string */
1.74 + char *s = _data->strvalue, *sub;
1.75 + /* start.__data__ interpreted as int */
1.76 + int istart = __load_via_object(start->value, __pos___data__).intvalue;
1.77 + /* end.__data__ interpreted as int */
1.78 + int iend = __load_via_object(end->value, __pos___data__).intvalue;
1.79 + /* step.__data__ interpreted as int */
1.80 + int istep = __load_via_object(step->value, __pos___data__).intvalue;
1.81 +
1.82 + /* Calculate the number of characters: the count of indexes istart, istart + istep, ... lying strictly before iend in the direction of istep. */
1.83 + size_t nchar = ((iend - istart - (istep > 0 ? 1 : -1)) / istep) + 1; /* NOTE(review): divides by istep and feeds a VLA size; presumably the caller guarantees a non-zero step and a non-empty range - confirm */
1.84 + unsigned int indexes[nchar]; /* byte offset in s of each selected character */
1.85 +
1.86 + unsigned int c, d, i, to, from, lastbyte = 0;
1.87 + size_t resultsize = 0;
1.88 +
1.89 + /* Find the indexes of the characters. */
1.90 + if (istep > 0)
1.91 + {
1.92 + /* Get the first byte position by stepping forward istart characters from byte 0. */
1.93 + for (c = 0; c < istart; c++) /* NOTE(review): unsigned c is compared with signed istart; assumes istart is non-negative - confirm */
1.94 + lastbyte = nextpos(s, _data->size, lastbyte);
1.95 +
1.96 + /* Get each subsequent byte position. */
1.97 + for (c = istart, i = 0; i < nchar; c += istep, i++)
1.98 + {
1.99 + indexes[i] = lastbyte;
1.100 +
1.101 + /* Add the character size to the result size. */
1.102 + resultsize += nextpos(s, _data->size, lastbyte) - lastbyte;
1.103 +
1.104 + for (d = c; d < c + istep; d++) /* move forward istep characters to the next selected one */
1.105 + lastbyte = nextpos(s, _data->size, lastbyte);
1.106 + }
1.107 + }
1.108 + else
1.109 + {
1.110 + /* Get the first byte position by stepping forward istart characters from byte 0. */
1.111 + for (c = 0; c < istart; c++) /* NOTE(review): unsigned c is compared with signed istart; assumes istart is non-negative - confirm */
1.112 + lastbyte = nextpos(s, _data->size, lastbyte);
1.113 +
1.114 + /* Get each subsequent byte position. */
1.115 + for (c = istart, i = 0; i < nchar; c += istep, i++)
1.116 + {
1.117 + indexes[i] = lastbyte;
1.118 +
1.119 + /* Add the character size to the result size. */
1.120 + resultsize += nextpos(s, _data->size, lastbyte) - lastbyte;
1.121 +
1.122 + for (d = c; d > c + istep; d--) /* move backward to the next selected character; NOTE(review): istep is negative and c, d are unsigned, so the bound relies on unsigned wraparound - confirm intended */
1.123 + lastbyte = prevpos(s, lastbyte);
1.124 + }
1.125 + }
1.126 +
1.127 + /* Reserve space for a new string: the summed character sizes plus one extra byte. */
1.128 + sub = (char *) __ALLOCATE(resultsize + 1, sizeof(char));
1.129 +
1.130 + /* Does not null terminate but final byte should be zero. */
1.131 + for (i = 0, to = 0; i < nchar; i++)
1.132 + {
1.133 + from = indexes[i];
1.134 + do /* copy the first byte of the character unconditionally, then every following non-boundary byte */
1.135 + {
1.136 + sub[to++] = s[from++];
1.137 + } while (!boundary(s[from])); /* NOTE(review): can read s[_data->size]; presumably a zero byte follows the data and stops the copy - confirm */
1.138 + }
1.139 +
1.140 + return __new_str(sub, resultsize); /* resultsize is the byte length accumulated while locating the characters */
1.141 +}
1.142 +
1.143 /* Module initialisation. */
1.144
1.145 void __main_native_unicode()