Re: [scc-dev] [PATCH 3/3] libc: fix wchar unicode handling from lhr_at

From: <lhr_at_disroot.org>
Date: Wed, 5 Mar 2025 03:08:18 +0000

> struct mbtest {
> char *s;
> int nbytes;
> wchar_t res;
> };
>
> that would even unify positive and negative tests in one loop as they
> only become different entries in a single array (something similar for
> the wc tests).

You could reuse the same array for both tests: the wc tests would use `res'
as input and compare the result with `s'. I'd rather not keep `nbytes',
since that would store the same information twice -- once in the actual
length of the string, and once in `nbytes', which would have to be updated
manually. Better to use strlen().

Also, it just occurred to me: now that mbrtowc() can return (size_t)-2 for an
incomplete sequence, mbtowc() should check for that, since according to the
standard it cannot return -2 (it must return -1 if not passed a complete,
valid multibyte sequence).

diff --git a/src/libc/stdlib/mbtowc.c b/src/libc/stdlib/mbtowc.c
index a9e5749f..73f6db3e 100644
--- a/src/libc/stdlib/mbtowc.c
+++ b/src/libc/stdlib/mbtowc.c
@@ -6,5 +6,6 @@
int
mbtowc(wchar_t *restrict pwc, const char *restrict s, size_t n)
{
- return mbrtowc(pwc, s, n, NULL);
+ int ret = mbrtowc(pwc, s, n, NULL);
+ return ret == -2 ? -1 : ret;
}
diff --git a/tests/libc/execute/0038-wchar.c b/tests/libc/execute/0038-wchar.c
index a7438030..6bfe5133 100644
--- a/tests/libc/execute/0038-wchar.c
+++ b/tests/libc/execute/0038-wchar.c
@@ -3,6 +3,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <wchar.h>

/*
output:
@@ -11,34 +12,56 @@ done
end:
*/

+#define NELEM(x) (sizeof(x)/sizeof((x)[0]))
+
+static struct {
+ wchar_t wc;
+ char const *mb;
+} test[] = {
+ { L'!', "!" },
+ { 0xA1, "\xC2\xA1" },
+ { 0x2014, "\xE2\x80\x94" },
+ { 0x1F4A9, "\xF0\x9F\x92\xA9" }
+};
+
void
-tests_positive()
+test_mbtowc()
{
- const char *const mb[] = { "!", "\u00A1", "\u2014", "\U0001F4A9" };
- const wchar_t wc[] = { L'!', L'\u00A1', L'\u2014', L'\U0001F4A9' };
         int i, ret;

- for (i = 0; i < 4; i++) {
+ for (i = 0; i < NELEM(test); i++) {
                 wchar_t utf32 = 0;
- ret = mbtowc(&utf32, mb[i], strlen(mb[i]));
+ ret = mbtowc(&utf32, test[i].mb, strlen(test[i].mb));
                 assert(ret == i + 1);
- assert(utf32 == wc[i]);
+ assert(utf32 == test[i].wc);
         }
+}
+
+void
+test_wctomb()
+{
+ int i, ret;

- for (i = 0; i < 4; i++) {
+ for (i = 0; i < NELEM(test); i++) {
                 char utf8[5] = "";
- ret = wctomb(utf8, wc[i]);
+ ret = wctomb(utf8, test[i].wc);
                 assert(ret == i + 1);
                 utf8[ret] = '\0';
- assert(!strcmp(mb[i], utf8));
+ assert(!strcmp(test[i].mb, utf8));
         }
}

void
-tests_negative()
+bogus_mbtowc()
{
- char badutf8[] = { 0xF8 /* 0b11111000 */, 0x80 | 1, 0x80 | 2, 0x80 | 3, 0x80 | 4, 0x80 | 5, '\0' };
+ char badutf8[] = {
+ 0xF8 /* 0b11111000 */,
+ 0x80 | 1, 0x80 | 2, 0x80 | 3,
+ 0x80 | 4, 0x80 | 5,
+ '\0'
+ };
         char overlong[] = { 0xC0, 0x80 | 'a' };
+ char incomplete[] = { 0xE2, 0x80 };
         int ret;

         ret = mbtowc(NULL, badutf8, sizeof badutf8);
@@ -48,14 +71,20 @@ tests_negative()
         ret = mbtowc(NULL, overlong, sizeof overlong);
         assert(ret == -1);
         assert(errno == EILSEQ);
+
+ errno = 0;
+ ret = mbrtowc(NULL, incomplete, sizeof incomplete, NULL);
+ assert(ret == -2);
+ assert(errno == 0);
}

int
main()
{
         puts("testing");
- tests_positive();
- tests_negative();
+ test_mbtowc();
+ test_wctomb();
+ bogus_mbtowc();
         puts("done");
         return 0;
}

--
To unsubscribe send a mail to scc-dev+unsubscribe_at_simple-cc.org

Received on Wed 05 Mar 2025 - 04:08:18 CET

This archive was generated by hypermail 2.3.0 : Wed 05 Mar 2025 - 04:10:02 CET