[scc-dev] [PATCH 2/3] libc/wchar: Fix unicode handling

From: Roberto E. Vargas Caballero <k0ga_at_shike2.net>
Date: Mon, 10 Mar 2025 11:54:05 +0100

From: "Roberto E. Vargas Caballero" <k0ga_at_shike2.com>

* mbrtowc: validate input, handle 4-byte UTF-8 code points, set errno
* wcrtomb: if a UTF-8 sequence has N bytes, the leading byte has the first
  N bits set (with ASCII characters a special case), not the first N-1 bits
* _validutf8: negate condition
---
 src/libc/stdlib/mbtowc.c    |  3 ++-
 src/libc/wchar/_validutf8.c |  2 +-
 src/libc/wchar/mbrtowc.c    | 21 +++++++++++++--------
 src/libc/wchar/wcrtomb.c    |  9 +++++++--
 4 files changed, 23 insertions(+), 12 deletions(-)
diff --git a/src/libc/stdlib/mbtowc.c b/src/libc/stdlib/mbtowc.c
index a9e5749f..75dd1f2c 100644
--- a/src/libc/stdlib/mbtowc.c
+++ b/src/libc/stdlib/mbtowc.c
@@ -6,5 +6,6 @@
 int
 mbtowc(wchar_t *restrict pwc, const char *restrict s, size_t n)
 {
-	return mbrtowc(pwc, s, n, NULL);
+	int  ret = mbrtowc(pwc, s, n, NULL);
+	return ret == -2 ? -1 : ret;
 }
diff --git a/src/libc/wchar/_validutf8.c b/src/libc/wchar/_validutf8.c
index 45b12fdc..c41afe26 100644
--- a/src/libc/wchar/_validutf8.c
+++ b/src/libc/wchar/_validutf8.c
@@ -23,7 +23,7 @@ _validutf8(wchar_t wc, int *nbytes)
 	};
 	struct range *bp;
 
-	for (bp = ranges; bp->begin <= wc && bp->end > wc; ++bp)
+	for (bp = ranges; bp->begin > wc || bp->end <= wc; ++bp)
 		;
 	*nbytes = bp->nbytes;
 
diff --git a/src/libc/wchar/mbrtowc.c b/src/libc/wchar/mbrtowc.c
index 6f825f8b..2cf27900 100644
--- a/src/libc/wchar/mbrtowc.c
+++ b/src/libc/wchar/mbrtowc.c
@@ -1,3 +1,4 @@
+#include <errno.h>
 #include <wchar.h>
 
 #include "../libc.h"
@@ -8,37 +9,41 @@ size_t
 mbrtowc(wchar_t *restrict pwc, const char *restrict s, size_t n,
         mbstate_t *restrict ps)
 {
-	unsigned char *t = (unsigned char *) s;
+	const unsigned char *t = (const unsigned char *) s;
 	unsigned long wc;
 	unsigned c;
 	int i, len, maxlen;
 
-	if (s == NULL)
+	if (s == NULL || *s == '\0')
 		return 0;
 
 	wc = c = *t++;
 	for (len = 0; n > 0 && c & 0x80; --n, ++len)
 		c <<= 1;
-	if (n == 0 || len == 1 || len == 8)
-		return -1;
+	if (n == 0 && c & 0x80)
+		return -2;
+	if (len == 1 || len == 8)
+		goto return_error;
 	if (len == 0)
 		goto return_code;
 
 	wc = (c & 0xFF) >> len;
 	for (i = 0; i < len-1; i++) {
 		if (((c = *t++) & 0xC0) != 0x80)
-			return -1;
+			goto return_error;
 		wc <<= 6;
 		wc |= c & 0x3F;
 	}
 
 	if (!_validutf8(wc, &maxlen) || len != maxlen)
-		return -1;
+		goto return_error;
 
 return_code:
 	if (pwc)
 		*pwc = wc;
-	if (*s == '\0')
-		return 0;
 	return t - (unsigned char *) s;
+
+return_error:
+	errno = EILSEQ;
+	return -1;
 }
diff --git a/src/libc/wchar/wcrtomb.c b/src/libc/wchar/wcrtomb.c
index b302266f..4c2a3014 100644
--- a/src/libc/wchar/wcrtomb.c
+++ b/src/libc/wchar/wcrtomb.c
@@ -14,13 +14,18 @@ wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict ps)
 	if (!s)
 		return 1;
 
+	if (c < 0x80) {
+		*s = wc;
+		return 1;
+	}
+
 	if (!_validutf8(wc, &n)) {
 		errno = EILSEQ;
 		return -1;
 	}
-
 	n--;
-	*s = 0;
+
+	*s = 0x80;
 	for (i = 0; i < n; i++) {
 		*s >>= 1;
 		*s |= 0x80;
-- 
2.46.1
--
To unsubscribe send a mail to scc-dev+unsubscribe_at_simple-cc.org
Received on Mon 10 Mar 2025 - 11:54:05 CET

This archive was generated by hypermail 2.3.0 : Mon 10 Mar 2025 - 12:00:01 CET