[scc-dev] [PATCH 3/3] libc: fix wchar unicode handling

From: remph <lhr_at_disroot.org>
Date: Thu, 20 Feb 2025 19:32:21 +0000

* mbrtowc: validate input, handle 4-byte UTF-8 code points, set errno
* wcrtomb: if a UTF-8 sequence has N bytes, the leading byte has the first
  N bits set (with ASCII characters a special case), not the first N-1 bits
* _validutf8: negate condition
* Add test
---
 src/libc/wchar/_validutf8.c       |  2 +-
 src/libc/wchar/mbrtowc.c          | 24 +++++++++---
 src/libc/wchar/wcrtomb.c          |  7 +++-
 tests/libc/execute/.gitignore     |  1 +
 tests/libc/execute/0038-wchar.c   | 61 +++++++++++++++++++++++++++++++
 tests/libc/execute/libc-tests.lst |  1 +
 6 files changed, 89 insertions(+), 7 deletions(-)
 create mode 100644 tests/libc/execute/0038-wchar.c
diff --git a/src/libc/wchar/_validutf8.c b/src/libc/wchar/_validutf8.c
index 45b12fdc..48bc8066 100644
--- a/src/libc/wchar/_validutf8.c
+++ b/src/libc/wchar/_validutf8.c
@@ -23,7 +23,7 @@ _validutf8(wchar_t wc, int *nbytes)
 	};
 	struct range *bp;
 
-	for (bp = ranges; bp->begin <= wc && bp->end > wc; ++bp)
+	for (bp = ranges; !(bp->begin <= wc && bp->end > wc); ++bp)
 		;
 	*nbytes = bp->nbytes;
 
diff --git a/src/libc/wchar/mbrtowc.c b/src/libc/wchar/mbrtowc.c
index 6f825f8b..e1b217f0 100644
--- a/src/libc/wchar/mbrtowc.c
+++ b/src/libc/wchar/mbrtowc.c
@@ -1,3 +1,5 @@
+#include <errno.h>
+#include <stddef.h>
 #include <wchar.h>
 
 #include "../libc.h"
@@ -5,10 +7,11 @@
 #undef mbrtowc
 
 size_t
-mbrtowc(wchar_t *restrict pwc, const char *restrict s, size_t n,
+mbrtowc(wchar_t *restrict pwc, const char *restrict s, size_t n_,
         mbstate_t *restrict ps)
 {
-	unsigned char *t = (unsigned char *) s;
+	const unsigned char *t = (const unsigned char *) s;
+	ptrdiff_t n = n_;
 	unsigned long wc;
 	unsigned c;
 	int i, len, maxlen;
@@ -16,24 +19,35 @@ mbrtowc(wchar_t *restrict pwc, const char *restrict s, size_t n,
 	if (s == NULL)
 		return 0;
 
+	if (!(*t < 0x80 || (*t >= 0xC2 && *t <= 0xF4))) {
+		errno = EILSEQ;
+		return -1;
+	}
+
 	wc = c = *t++;
 	for (len = 0; n > 0 && c & 0x80; --n, ++len)
 		c <<= 1;
-	if (n == 0 || len == 1 || len == 8)
+	if (n < 0 || len == 1 || len == 8) {
+		errno = EILSEQ;
 		return -1;
+	}
 	if (len == 0)
 		goto return_code;
 
 	wc = (c & 0xFF) >> len;
 	for (i = 0; i < len-1; i++) {
-		if (((c = *t++) & 0xC0) != 0x80)
+		if (((c = *t++) & 0xC0) != 0x80) {
+			errno = EILSEQ;
 			return -1;
+		}
 		wc <<= 6;
 		wc |= c & 0x3F;
 	}
 
-	if (!_validutf8(wc, &maxlen) || len != maxlen)
+	if (!_validutf8(wc, &maxlen) || len != maxlen) {
+		errno = EILSEQ;
 		return -1;
+	}
 
 return_code:
 	if (pwc)
diff --git a/src/libc/wchar/wcrtomb.c b/src/libc/wchar/wcrtomb.c
index b302266f..e23a25c3 100644
--- a/src/libc/wchar/wcrtomb.c
+++ b/src/libc/wchar/wcrtomb.c
@@ -14,13 +14,18 @@ wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict ps)
 	if (!s)
 		return 1;
 
+	if (wc < 0x80) {
+		*s = (char)wc;
+		return 1;
+	}
+
 	if (!_validutf8(wc, &n)) {
 		errno = EILSEQ;
 		return -1;
 	}
 
 	n--;
-	*s = 0;
+	*s = 0x80;
 	for (i = 0; i < n; i++) {
 		*s >>= 1;
 		*s |= 0x80;
diff --git a/tests/libc/execute/.gitignore b/tests/libc/execute/.gitignore
index 042ba515..f191dcee 100644
--- a/tests/libc/execute/.gitignore
+++ b/tests/libc/execute/.gitignore
@@ -35,4 +35,5 @@
 0035-setlocale
 0036-localeconv
 0037-malloc
+0038-wchar
 test.log
diff --git a/tests/libc/execute/0038-wchar.c b/tests/libc/execute/0038-wchar.c
new file mode 100644
index 00000000..a7438030
--- /dev/null
+++ b/tests/libc/execute/0038-wchar.c
@@ -0,0 +1,61 @@
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+output:
+testing
+done
+end:
+*/
+
+void
+tests_positive()	/* valid characters must convert in both directions */
+{
+	const char *const mb[] = { "!", "\u00A1", "\u2014", "\U0001F4A9" };	/* entry i is asserted below to occupy i + 1 bytes */
+	const wchar_t wc[] = { L'!', L'\u00A1', L'\u2014', L'\U0001F4A9' };	/* the same characters as wide constants */
+	int i, ret;
+
+	for (i = 0; i < 4; i++) {	/* multibyte -> wide */
+		wchar_t utf32 = 0;
+		ret = mbtowc(&utf32, mb[i], strlen(mb[i]));
+		assert(ret == i + 1);
+		assert(utf32 == wc[i]);
+	}
+
+	for (i = 0; i < 4; i++) {	/* wide -> multibyte */
+		char utf8[5] = "";
+		ret = wctomb(utf8, wc[i]);
+		assert(ret == i + 1);
+		utf8[ret] = '\0';	/* terminate so strcmp() can compare */
+		assert(!strcmp(mb[i], utf8));
+	}
+}
+
+void
+tests_negative()	/* malformed input must make mbtowc() return -1 with errno == EILSEQ */
+{
+	char badutf8[] = { 0xF8 /* 0b11111000 */, 0x80 | 1, 0x80 | 2, 0x80 | 3, 0x80 | 4, 0x80 | 5, '\0' };	/* lead byte with five high bits set, followed by five 10xxxxxx bytes */
+	char overlong[] = { 0xC0, 0x80 | 'a' };	/* NOTE(review): with ASCII 'a', 0x80 | 'a' is 0xE1, which lacks the 10xxxxxx continuation form; array has no terminating NUL */
+	int ret;
+
+	ret = mbtowc(NULL, badutf8, sizeof badutf8);
+	assert(ret == -1);
+	assert(errno == EILSEQ);
+
+	ret = mbtowc(NULL, overlong, sizeof overlong);
+	assert(ret == -1);
+	assert(errno == EILSEQ);	/* NOTE(review): errno is not cleared after the first check, so a stale EILSEQ would also satisfy this */
+}
+
+int
+main()	/* brackets the assert-based checks with the two lines listed under "output:" above */
+{
+	puts("testing");
+	tests_positive();
+	tests_negative();
+	puts("done");
+	return 0;
+}
diff --git a/tests/libc/execute/libc-tests.lst b/tests/libc/execute/libc-tests.lst
index 9e0e1dd9..46e61910 100644
--- a/tests/libc/execute/libc-tests.lst
+++ b/tests/libc/execute/libc-tests.lst
@@ -34,3 +34,4 @@
 0035-setlocale
 0036-localeconv
 0037-malloc [TODO]
+0038-wchar
-- 
2.48.1
--
To unsubscribe send a mail to scc-dev+unsubscribe_at_simple-cc.org
Received on Thu 20 Feb 2025 - 20:32:21 CET

This archive was generated by hypermail 2.3.0 : Thu 20 Feb 2025 - 20:40:01 CET