diff mbox series

[v2,6/8] expand: Support multi-byte characters during field splitting

Message ID ffd890700a18b9beaad65e26876b1d7932b0d018.1714276539.git.herbert@gondor.apana.org.au (mailing list archive)
State Changes Requested
Delegated to: Herbert Xu
Headers show
Series Add multi-byte support | expand

Commit Message

Herbert Xu April 28, 2024, 3:57 a.m. UTC
When IFS contains multi-byte characters, each of them is now treated
as a single separator during field splitting.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 src/expand.c | 201 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 140 insertions(+), 61 deletions(-)
diff mbox series

Patch

diff --git a/src/expand.c b/src/expand.c
index 0e85025..dd2b71e 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -54,6 +54,7 @@ 
 #include <sys/stat.h>
 #include <unistd.h>
 #include <wchar.h>
+#include <wctype.h>
 
 /*
  * Routines to expand arguments to commands.  We have to deal with
@@ -164,6 +165,30 @@  esclen(const char *start, const char *p) {
 	return esc;
 }
 
+static __attribute__((noinline)) unsigned mbnext(const char *p) /* measure the encoded character starting at p */
+{
+	unsigned start = 0; /* bytes to skip before the character's own bytes */
+	unsigned end = 0; /* read cursor at first; on return, a length counted from start */
+	unsigned ml; /* value of the length byte that follows CTLMBCHAR */
+	int c;
+
+	c = p[end++];
+
+	switch (c) { /* any other byte leaves start = 0 and end = 1 */
+	case CTLMBCHAR:
+		if (p[end] == CTLESC) /* NOTE(review): the code this replaces compared against (char)CTLESC -- confirm the uncast compare is equivalent where char is unsigned */
+			end++;
+		ml = (unsigned char)p[end++];
+		start = end; /* offset of the first byte after the length byte */
+		end = ml + 2; /* ml bytes plus two more; scanleft advances by start + end (meaning of the two trailing bytes is not visible here -- TODO confirm) */
+		break;
+	case CTLESC:
+		start++; /* skip the escape byte; end stays 1 for the single byte after it */
+		break;
+	}
+
+	return start | end << 8; /* low 8 bits: start; bits above: end (callers unpack with mb & 0xff and mb >> 8) */
+}
 
 static inline const char *getpwhome(const char *name)
 {
@@ -552,6 +577,7 @@  static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend,
 	loc2 = rmesc;
 	do {
 		const char *s = loc2;
+		unsigned mb;
 		unsigned ml;
 		int match;
 
@@ -568,19 +594,9 @@  static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend,
 		if (!c)
 			break;
 
-		if (*loc != (char)CTLMBCHAR) {
-			if (*loc == (char)CTLESC)
-				loc++;
-			loc++;
-			loc2++;
-			continue;
-		}
-
-		if (*++loc == (char)CTLESC)
-			loc++;
-
-		ml = (unsigned char)*loc;
-		loc += ml + 3;
+		mb = mbnext(loc);
+		loc += (mb & 0xff) + (mb >> 8);
+		ml = (mb >> 8) > 3 ? (mb >> 8) - 2 : 1;
 		loc2 += ml;
 	} while (1);
 	return 0;
@@ -930,18 +946,22 @@  static size_t strtodest(const char *p, int flags)
 STATIC ssize_t
 varvalue(char *name, int varflags, int flags, int quoted)
 {
+	int subtype = varflags & VSTYPE;
+	const char *seps;
+	ssize_t len = 0;
+	unsigned seplen;
+	size_t start;
+	int discard;
+	char sepc;
+	char **ap;
+	int sep;
 	int num;
 	char *p;
 	int i;
-	int sep;
-	char sepc;
-	char **ap;
-	int subtype = varflags & VSTYPE;
-	int discard = (subtype == VSPLUS || subtype == VSLENGTH) |
-		      (flags & EXP_DISCARD);
-	ssize_t len = 0;
-	size_t start;
-	char c;
+	int c;
+
+	discard = (subtype == VSPLUS || subtype == VSLENGTH) |
+		  (flags & EXP_DISCARD);
 
 	if (!subtype) {
 		if (discard)
@@ -1004,15 +1024,27 @@  numvar:
 		sep &= ~quoted;
 		sep |= ifsset() ? (unsigned char)(c & ifsval()[0]) : ' ';
 param:
-		sepc = sep;
 		if (!(ap = shellparam.p))
 			return -1;
+		sepc = sep;
+		seps = &sepc;
+		seplen = 1;
+		if (sepc < 0) {
+			mbstate_t mbs = {};
+			size_t ml;
+
+			ml = mbrlen(ifsval(), strlen(ifsval()), &mbs);
+			if (ml != -1 && ml != -2 && ml > 1) {
+				seps = ifsval();
+				seplen = ml;
+			}
+		}
 		while ((p = *ap++)) {
 			len += strtodest(p, flags);
 
 			if (*ap && sep) {
 				len++;
-				memtodest(&sepc, 1, flags | EXP_KEEPNUL);
+				memtodest(seps, seplen, flags | EXP_KEEPNUL);
 			}
 		}
 		break;
@@ -1074,7 +1106,54 @@  recordregion(int start, int end, int nulonly)
 	ifslastp->nulonly = nulonly;
 }
 
+static __attribute__((noinline)) unsigned ifsisifs( /* test the character at p against the separators in ifs */
+	const char *p, unsigned ml, const char *ifs, size_t ifslen) /* ml: byte length compared for multi-byte matches; ifslen: bytes of ifs to scan */
+{
+	bool isdefifs = false;
+	size_t slen = ifslen; /* bytes of ifs still to examine */
+	const char *s = ifs; /* current position in ifs */
+	wchar_t c = *p; /* first byte at p, widened */
+	bool isifs;
 
+	isifs = !c; /* a NUL byte at p counts as a separator */
+	if (isifs) {
+		p = ifs;
+		c = *p; /* classification below then uses the first byte of ifs */
+		slen = 0; /* skip the scan loop entirely */
+	}
+
+	while (slen) { /* walk ifs one character at a time */
+		mbstate_t mbst = {}; /* fresh conversion state for each IFS character */
+		size_t ifsml; /* byte length of the IFS character at s, as reported by mbrtowc */
+		wchar_t c2; /* decoded IFS character */
+
+		if ((signed char)*s > 0 || /* positive byte: compare as a single byte without decoding */
+		    (ifsml = mbrtowc(&c2, s, slen, &mbst),
+		     ifsml == -2 || ifsml == -1 || ifsml < 2)) { /* mbrtowc failure (-1/-2) or fewer than 2 bytes: also a single-byte compare */
+			if (c == *s) {
+				isifs = true;
+				break;
+			}
+			s++;
+			slen--;
+			continue;
+		}
+
+		if (ifsml == ml && !memcmp(p, s, ifsml)) { /* multi-byte IFS character: match needs equal length and equal bytes */
+			isifs = true;
+			c = c2; /* classify using the decoded wide character */
+			break;
+		}
+
+		s += ifsml;
+		slen -= ifsml;
+	}
+
+	if (isifs)
+		isdefifs = iswspace(c); /* only separators that iswspace() accepts set the second flag */
+
+	return isifs | isdefifs << 1; /* bit 0: is a separator; bit 1: separator is whitespace per iswspace() */
+}
 
 /*
  * Break the argument string into pieces based upon IFS and add the
@@ -1086,16 +1165,16 @@  recordregion(int start, int end, int nulonly)
 void
 ifsbreakup(char *string, int maxargs, struct arglist *arglist)
 {
+	const char *ifs, *realifs;
 	struct ifsregion *ifsp;
 	struct strlist *sp;
+	char *r = NULL;
+	size_t ifslen;
 	char *start;
+	int nulonly;
+	int ifsspc;
 	char *p;
 	char *q;
-	char *r = NULL;
-	const char *ifs, *realifs;
-	int ifsspc;
-	int nulonly;
-
 
 	start = string;
 	if (ifslastp != NULL) {
@@ -1110,21 +1189,27 @@  ifsbreakup(char *string, int maxargs, struct arglist *arglist)
 			afternul = nulonly;
 			nulonly = ifsp->nulonly;
 			ifs = nulonly ? nullstr : realifs;
+			ifslen = strlen(ifs);
 			ifsspc = 0;
 			while (p < string + ifsp->endoff) {
-				int c;
-				bool isifs;
+				unsigned ifschar;
+				unsigned sisifs;
 				bool isdefifs;
+				unsigned ml;
+				bool isifs;
 
 				q = p;
-				c = *p++;
-				if (c == (char)CTLESC)
-					c = *p++;
 
-				isifs = strchr(ifs, c);
-				isdefifs = false;
-				if (isifs)
-					isdefifs = strchr(defifs, c);
+				ifschar = mbnext(p);
+				p += ifschar & 0xff;
+				ml = (ifschar >> 8) > 3 ?
+				     (ifschar >> 8) - 2 : 0;
+
+				sisifs = ifsisifs(p, ml, ifs, ifslen);
+				p += ifschar >> 8;
+
+				isifs = sisifs & 1;
+				isdefifs = sisifs >> 1;
 
 				/* If only reading one more argument:
 				 * If we have exactly one field,
@@ -1380,32 +1465,24 @@  static void expmeta_rmescapes(char *enddir, char *name)
 	preglob(strcpy(enddir, name), RMESCAPE_EMETA);
 }
 
-static unsigned mbcharlen(char *p)
-{
-	int esc = 0;
-
-	if (*++p == (char)CTLESC)
-		esc++;
-
-	return esc + 3 + (unsigned char)p[esc];
-}
-
 static int skipesc(char *p)
 {
+	unsigned short mb;
 	int esc = 0;
 
-	if (p[esc] == (char)CTLMBCHAR)
-		return esc + mbcharlen(p);
+	mb = mbnext(p);
+	if ((mb >> 8) > 3)
+		return (mb & 0xff) + (mb >> 8) - 1;
 
-	if (*p == (char)CTLESC)
-		esc++;
+	esc = mb & 0xff;
 
 	if (p[esc] == '\\' && p[esc + 1]) {
 		esc++;
-		if (p[esc] == (char)CTLMBCHAR)
-			return esc + mbcharlen(p + esc);
-		if (p[esc] == (char)CTLESC)
-			esc++;
+		mb = mbnext(p + esc);
+		if ((mb >> 8) > 3)
+			return esc + (mb & 0xff) + (mb >> 8) - 1;
+
+		esc += mb & 0xff;
 	}
 
 	return esc;
@@ -1813,6 +1890,7 @@  _rmescapes(char *str, int flag)
 	inquotes = 0;
 	notescaped = globbing;
 	while (*p) {
+		unsigned mb;
 		unsigned ml;
 
 		if (*p == (char)CTLQUOTEMARK) {
@@ -1845,13 +1923,14 @@  add_escape:
 		}
 		notescaped = globbing;
 
-		if (*p != (char)CTLMBCHAR)
+		mb = mbnext(p);
+		ml = mb >> 8;
+
+		if (ml <= 3)
 			goto copy;
 
-		if (*++p == (char)CTLESC)
-			p++;
-
-		ml = (unsigned char)*p++;
+		ml -= 2;
+		p += mb & 0xff;
 		q = mempcpy(q, p, ml);
 		p += ml + 2;
 		continue;