From patchwork Sat Apr 27 08:41:00 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Herbert Xu X-Patchwork-Id: 13645619 X-Patchwork-Delegate: herbert@gondor.apana.org.au Received: from abb.hmeau.com (abb.hmeau.com [144.6.53.87]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id C00494AECA for ; Sat, 27 Apr 2024 11:07:17 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=144.6.53.87 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714216039; cv=none; b=uAPTSuoD9IXaRtsIoSE90OD1+CE9SfVpgBTxvD75VLGVg0Bmz8zoqZnaS6W+QCdjKxaankYdjzIuIaRNYIiAf/L03uZzPTK46TMPEzFBNiH8naoF3xMMjP5sqT0WFwXTAZzALcy25uoMPe+MavMPqnkPkw233AdI0FahCqe8wHM= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1714216039; c=relaxed/simple; bh=+AzRcGwyC7KLieni1Ij1thKLA+FYTOMpheL9m2xuzsI=; h=Message-Id:In-Reply-To:References:From:Date:Subject:To; b=kqZ/lXxDxX8J/DF/aotKmtHo/lAlGzwa33pk+KbDEoS/eXyBa+vuptRfeGSXj0+asAk4+ecM1DsWiH+UKzbe/eD9BNWk1mHUM5Ts3CGTbl2oF4CYWu58IliAeqYHH27BXLrzCJMiCDxfJbSkLWzDEgm1fhGx6xZLDOalnY4gPmc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=gondor.apana.org.au; spf=pass smtp.mailfrom=gondor.apana.org.au; arc=none smtp.client-ip=144.6.53.87 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=quarantine dis=none) header.from=gondor.apana.org.au Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=gondor.apana.org.au Received: from loth.rohan.me.apana.org.au ([192.168.167.2]) by formenos.hmeau.com with smtp (Exim 4.96 #2 (Debian)) id 1s0fti-0079pK-0y; Sat, 27 Apr 2024 19:07:15 +0800 Received: by loth.rohan.me.apana.org.au (sSMTP sendmail emulation); Sat, 27 Apr 2024 19:07:32 +0800 Message-Id: In-Reply-To: References: From: Herbert Xu Date: Sat, 27 Apr 2024 16:41:00 +0800 Subject: [PATCH 8/8] parser: Add support for multi-byte characters To: DASH Mailing List Precedence: bulk X-Mailing-List: dash@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: Add the requisite markers for multi-byte characters so that the expansion code can recognise them. Also allow wide blank characters to terminate words. Signed-off-by: Herbert Xu --- src/expand.c | 19 ++++++++ src/parser.c | 127 +++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 121 insertions(+), 25 deletions(-) diff --git a/src/expand.c b/src/expand.c index 679bbb8..7c3f350 100644 --- a/src/expand.c +++ b/src/expand.c @@ -265,6 +265,7 @@ static char *argstr(char *p, int flag) CTLESC, CTLVAR, CTLBACKQ, + CTLMBCHAR, CTLARI, CTLENDARI, 0 @@ -289,6 +290,8 @@ tilde: start: startloc = expdest - (char *)stackblock(); for (;;) { + unsigned ml; + unsigned mb; int end; length += strcspn(p + length, reject); @@ -351,6 +354,22 @@ addquote: startloc++; } break; + case CTLMBCHAR: + c = (signed char)*p--; + mb = mbnext(p); + ml = (mb >> 8) - 2; + if (flag & QUOTES_ESC) { + length = (mb >> 8) + (mb & 0xff); + if (c == (char)CTLESC) + startloc += length; + break; + } + if (c == CTLESC) + startloc += ml; + p += mb & 0xff; + expdest = stnputs(p, ml, expdest); + p += mb >> 8; + break; case CTLESC: startloc++; length++; diff --git a/src/parser.c b/src/parser.c index 27611f0..c23cc9b 100644 --- a/src/parser.c +++ b/src/parser.c @@ -36,7 +36,11 @@ #include #endif +#include +#include #include +#include +#include #include "shell.h" #include "parser.h" @@ -876,7 +880,53 @@ static void synstack_pop(struct synstack **stack) *stack = (*stack)->next; } +static unsigned getmbc(int c, char *out, int mode) +{ + char *const start = out; + mbstate_t mbst = {}; + unsigned ml = 0; + size_t ml2; + wchar_t wc; + char *mbc; + if (likely(c >= 0)) + return 0; + + mbc = (mode & 3) < 2 ? out + 2 + (mode == 1) : out; + mbc[ml] = c; + while ((ml2 = mbrtowc(&wc, mbc + ml++, 1, &mbst)) == -2) { + if (ml >= MB_LEN_MAX) + break; + c = pgetc(); + if (c == PEOF) + break; + mbc[ml] = c; + } + + if (ml2 == 1 && ml > 1) { + if (mode == 4 && iswblank(wc)) + return 1; + + if ((mode & 3) < 2) { + USTPUTC(CTLMBCHAR, out); + if (mode == 1) + USTPUTC(CTLESC, out); + USTPUTC(ml, out); + } + STADJUST(ml, out); + if ((mode & 3) < 2) { + USTPUTC(ml, out); + USTPUTC(CTLMBCHAR, out); + } + + return out - start; + } + + if (ml > 1) + pungetn(ml - 1); + + return 0; +} /* * If eofmark is NULL, read a word or a redirection symbol. If eofmark @@ -929,12 +979,27 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs) } #endif CHECKEND(); /* set c to PEOF if at end of here document */ - for (;;) { /* until end of line or end of word */ - CHECKSTRSPACE(4, out); /* permit 4 calls to USTPUTC */ + /* Until end of line or end of word */ + for (;; c = pgetc_top(synstack)) { + int fieldsplitting; + unsigned ml; + + /* Permit max(MB_LEN_MAX, 23) calls to USTPUTC. */ + CHECKSTRSPACE((MB_LEN_MAX > 16 ? MB_LEN_MAX : 16) + 7, + out); + fieldsplitting = synstack->syntax == BASESYNTAX && + !synstack->varnest ? 4 : 0; + ml = getmbc(c, out, fieldsplitting); + if (ml == 1) { + c = pgetc(); + break; + } + out += ml; + if (ml) + continue; switch(synstack->syntax[c]) { case CNL: /* '\n' */ - if (synstack->syntax == BASESYNTAX && - !synstack->varnest) + if (fieldsplitting) goto endword; /* exit outer loop */ USTPUTC(c, out); nlprompt(); @@ -956,26 +1021,33 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs) USTPUTC(CTLESC, out); USTPUTC('\\', out); pungetc(); - } else { - if ( - synstack->dblquote && - c != '\\' && c != '`' && - c != '$' && ( - c != '"' || - (eofmark != NULL && - !synstack->varnest) - ) && ( - c != '}' || - !synstack->varnest - ) - ) { - USTPUTC(CTLESC, out); - USTPUTC('\\', out); - } - USTPUTC(CTLESC, out); - USTPUTC(c, out); - quotef++; + break; } + + if ( + synstack->dblquote && + c != '\\' && c != '`' && + c != '$' && ( + c != '"' || + (eofmark != NULL && + !synstack->varnest) + ) && ( + c != '}' || + !synstack->varnest + ) + ) { + USTPUTC(CTLESC, out); + USTPUTC('\\', out); + } + quotef++; + + ml = getmbc(c, out, 1); + out += ml; + if (ml) + break; + + USTPUTC(CTLESC, out); + USTPUTC(c, out); break; case CSQUOTE: synstack->syntax = SQSYNTAX; @@ -1053,11 +1125,10 @@ toggledq: case CEOF: goto endword; /* exit outer loop */ default: - if (synstack->varnest == 0) + if (fieldsplitting) goto endword; /* exit outer loop */ USTPUTC(c, out); } - c = pgetc_top(synstack); } } endword: @@ -1384,6 +1455,7 @@ parsebackq: { size_t psavelen; size_t savelen; union node *n; + unsigned ml; char *pstr; char *str; @@ -1415,6 +1487,11 @@ parsebackq: { if (pc != '\\' && pc != '`' && pc != '$' && (!synstack->dblquote || pc != '"')) STPUTC('\\', pout); + CHECKSTRSPACE(MB_LEN_MAX, pout); + ml = getmbc(pc, pout, 2); + pout += ml; + if (ml) + continue; break; case PEOF: