From patchwork Thu Mar 8 16:14:02 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Herbert Xu X-Patchwork-Id: 10268379 X-Patchwork-Delegate: herbert@gondor.apana.org.au Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork.web.codeaurora.org (Postfix) with ESMTP id CFF186016D for ; Thu, 8 Mar 2018 16:14:25 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id BAB6E28DAB for ; Thu, 8 Mar 2018 16:14:25 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id AEBEA298B3; Thu, 8 Mar 2018 16:14:25 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-6.9 required=2.0 tests=BAYES_00,RCVD_IN_DNSWL_HI autolearn=ham version=3.3.1 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id A73F4292F7 for ; Thu, 8 Mar 2018 16:14:24 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1756094AbeCHQOW (ORCPT ); Thu, 8 Mar 2018 11:14:22 -0500 Received: from orcrist.hmeau.com ([104.223.48.154]:35714 "EHLO deadmen.hmeau.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752059AbeCHQOT (ORCPT ); Thu, 8 Mar 2018 11:14:19 -0500 Received: from gondobar.mordor.me.apana.org.au ([192.168.128.4] helo=gondobar) by deadmen.hmeau.com with esmtp (Exim 4.84_2 #2 (Debian)) id 1etyBA-0000sc-7A; Fri, 09 Mar 2018 00:14:04 +0800 Received: from herbert by gondobar with local (Exim 4.84_2) (envelope-from ) id 1etyB8-0002ZO-DE; Fri, 09 Mar 2018 00:14:02 +0800 Date: Fri, 9 Mar 2018 00:14:02 +0800 From: Herbert Xu To: Harald van Dijk Cc: Martijn Dekker , Denys Vlasenko , dash@vger.kernel.org Subject: [PATCH v3] parser: Add syntax stack for recursive parsing 
Message-ID: <20180308161402.GA9757@gondor.apana.org.au> References: <7dac7df9-4093-095e-dd71-2d7383edd8c3@inlv.org> <041881f9-9084-4083-345a-8f85792b48ef@gigawatt.nl> <20180307162944.GA4960@gondor.apana.org.au> <066e53c4-ad05-35bb-2da2-a377ce8f4629@gigawatt.nl> <20180308075549.GB8211@gondor.apana.org.au> <160da770-3475-2b8f-2f33-817bc1948219@gigawatt.nl> MIME-Version: 1.0 Content-Disposition: inline In-Reply-To: <160da770-3475-2b8f-2f33-817bc1948219@gigawatt.nl> User-Agent: Mutt/1.5.23 (2014-03-12) Sender: dash-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: dash@vger.kernel.org X-Virus-Scanned: ClamAV using ClamSMTP On Thu, Mar 08, 2018 at 12:36:04PM +0100, Harald van Dijk wrote: > > The first line of this part of my patch is about something else: > > x=\"; cat <<EOF > ${x#"\""} > EOF > > This shouldn't print anything. Right. So here is a new patch with that change thrown in. ---8<--- Without a stack of syntaxes we cannot correctly parse these two cases together: "${a#'$$'}" "${a#"${b-'$$'}"}" A recursive parser also helps in some other corner cases such as nested arithmetic expansion with parentheses. This patch adds a syntax stack allocated from the stack using alloca. As a side-effect this allows us to remove the naked backslashes for patterns within double-quotes, which means that EXP_QPAT also has to go. This patch also removes any backslashes that precede right braces when they are present within a parameter expansion context, and backslashes that precede double quotes within inner double quotes inside a parameter expansion in a here-document context. The idea of a recursive parser is based on a patch by Harald van Dijk. Signed-off-by: Herbert Xu diff --git a/src/expand.c b/src/expand.c index 2a50830..903e250 100644 --- a/src/expand.c +++ b/src/expand.c @@ -83,7 +83,7 @@ #define RMESCAPE_HEAP 0x10 /* Malloc strings instead of stalloc */ /* Add CTLESC when necessary. 
*/ -#define QUOTES_ESC (EXP_FULL | EXP_CASE | EXP_QPAT) +#define QUOTES_ESC (EXP_FULL | EXP_CASE) /* Do not skip NUL characters. */ #define QUOTES_KEEPNUL EXP_TILDE @@ -333,16 +333,6 @@ addquote: case CTLESC: startloc++; length++; - - /* - * Quoted parameter expansion pattern: remove quote - * unless inside inner quotes or we have a literal - * backslash. - */ - if (((flag | inquotes) & (EXP_QPAT | EXP_QUOTED)) == - EXP_QPAT && *p != '\\') - break; - goto addquote; case CTLVAR: p = evalvar(p, flag | inquotes); @@ -651,8 +641,7 @@ subevalvar(char *p, char *str, int strloc, int subtype, int startloc, int varfla char *(*scan)(char *, char *, char *, char *, int , int); argstr(p, EXP_TILDE | (subtype != VSASSIGN && subtype != VSQUESTION ? - (flag & (EXP_QUOTED | EXP_QPAT) ? - EXP_QPAT : EXP_CASE) : 0)); + EXP_CASE : 0)); STPUTC('\0', expdest); argbackq = saveargbackq; startp = stackblock() + startloc; @@ -1644,7 +1633,6 @@ char * _rmescapes(char *str, int flag) { char *p, *q, *r; - unsigned inquotes; int notescaped; int globbing; @@ -1674,24 +1662,23 @@ _rmescapes(char *str, int flag) q = mempcpy(q, str, len); } } - inquotes = 0; globbing = flag & RMESCAPE_GLOB; notescaped = globbing; while (*p) { if (*p == (char)CTLQUOTEMARK) { - inquotes = ~inquotes; p++; notescaped = globbing; continue; } + if (*p == '\\') { + /* naked back slash */ + notescaped = 0; + goto copy; + } if (*p == (char)CTLESC) { p++; if (notescaped) *q++ = '\\'; - } else if (*p == '\\' && !inquotes) { - /* naked back slash */ - notescaped = 0; - goto copy; } notescaped = globbing; copy: diff --git a/src/expand.h b/src/expand.h index 26dc5b4..90f5328 100644 --- a/src/expand.h +++ b/src/expand.h @@ -55,7 +55,6 @@ struct arglist { #define EXP_VARTILDE 0x4 /* expand tildes in an assignment */ #define EXP_REDIR 0x8 /* file glob for a redirection (1 match only) */ #define EXP_CASE 0x10 /* keeps quotes around for CASE pattern */ -#define EXP_QPAT 0x20 /* pattern in quoted parameter expansion */ #define 
EXP_VARTILDE2 0x40 /* expand tildes after colons only */ #define EXP_WORD 0x80 /* expand word in parameter expansion */ #define EXP_QUOTED 0x100 /* expand word in double quotes */ diff --git a/src/parser.c b/src/parser.c index 382658e..3aeb9f6 100644 --- a/src/parser.c +++ b/src/parser.c @@ -80,6 +80,18 @@ struct heredoc { int striptabs; /* if set, strip leading tabs */ }; +struct synstack { + const char *syntax; + struct synstack *prev; + struct synstack *next; + int innerdq; + int varpushed; + int dblquote; + int varnest; /* levels of variables expansion */ + int parenlevel; /* levels of parens in arithmetic */ + int dqvarnest; /* levels of variables expansion within double quotes */ +}; + struct heredoc *heredoclist; /* list of here documents to read */ @@ -847,6 +859,21 @@ static int pgetc_eatbnl(void) return c; } +static void synstack_push(struct synstack **stack, struct synstack *next, + const char *syntax) +{ + memset(next, 0, sizeof(*next)); + next->syntax = syntax; + next->next = *stack; + (*stack)->prev = next; + *stack = next; +} + +static void synstack_pop(struct synstack **stack) +{ + *stack = (*stack)->next; +} + /* @@ -876,24 +903,15 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs) size_t len; struct nodelist *bqlist; int quotef; - int dblquote; - int varnest; /* levels of variables expansion */ - int arinest; /* levels of arithmetic expansion */ - int parenlevel; /* levels of parens in arithmetic */ - int dqvarnest; /* levels of variables expansion within double quotes */ int oldstyle; - /* syntax before arithmetic */ - char const *uninitialized_var(prevsyntax); + /* syntax stack */ + struct synstack synbase = { .syntax = syntax }; + struct synstack *synstack = &synbase; - dblquote = 0; if (syntax == DQSYNTAX) - dblquote = 1; + synstack->dblquote = 1; quotef = 0; bqlist = NULL; - varnest = 0; - arinest = 0; - parenlevel = 0; - dqvarnest = 0; STARTSTACKSTR(out); loop: { /* for each line, until end of word */ @@ -901,7 
+919,7 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs) if (c == '\034' && doprompt && attyset() && ! equal(termval(), "emacs")) { attyline(); - if (syntax == BASESYNTAX) + if (synstack->syntax == BASESYNTAX) return readtoken(); c = pgetc(); goto loop; @@ -910,9 +928,9 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs) CHECKEND(); /* set c to PEOF if at end of here document */ for (;;) { /* until end of line or end of word */ CHECKSTRSPACE(4, out); /* permit 4 calls to USTPUTC */ - switch(syntax[c]) { + switch(synstack->syntax[c]) { case CNL: /* '\n' */ - if (syntax == BASESYNTAX) + if (synstack->syntax == BASESYNTAX) goto endword; /* exit outer loop */ USTPUTC(c, out); nlprompt(); @@ -922,7 +940,7 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs) USTPUTC(c, out); break; case CCTL: - if (eofmark == NULL || dblquote) + if (eofmark == NULL || synstack->dblquote) USTPUTC(CTLESC, out); USTPUTC(c, out); break; @@ -937,13 +955,18 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs) nlprompt(); } else { if ( - dblquote && + synstack->dblquote && c != '\\' && c != '`' && c != '$' && ( c != '"' || - eofmark != NULL + (eofmark != NULL && + !synstack->varnest) + ) && ( + c != '}' || + !synstack->varnest ) ) { + USTPUTC(CTLESC, out); USTPUTC('\\', out); } USTPUTC(CTLESC, out); @@ -952,55 +975,64 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs) } break; case CSQUOTE: - syntax = SQSYNTAX; + synstack->syntax = SQSYNTAX; quotemark: if (eofmark == NULL) { USTPUTC(CTLQUOTEMARK, out); } break; case CDQUOTE: - syntax = DQSYNTAX; - dblquote = 1; + synstack->syntax = DQSYNTAX; + synstack->dblquote = 1; +toggledq: + if (synstack->varnest) + synstack->innerdq ^= 1; goto quotemark; case CENDQUOTE: - if (eofmark && !varnest) + if (eofmark && !synstack->varnest) { USTPUTC(c, out); - else { - if (dqvarnest == 0) { - syntax = BASESYNTAX; - dblquote = 0; - } - 
quotef++; - goto quotemark; + break; } - break; + + if (synstack->dqvarnest == 0) { + synstack->syntax = BASESYNTAX; + synstack->dblquote = 0; + } + + quotef++; + + if (c == '"') + goto toggledq; + + goto quotemark; case CVAR: /* '$' */ PARSESUB(); /* parse substitution */ break; case CENDVAR: /* '}' */ - if (varnest > 0) { - varnest--; - if (dqvarnest > 0) { - dqvarnest--; - } + if (!synstack->innerdq && + synstack->varnest > 0) { + if (!--synstack->varnest && + synstack->varpushed) + synstack_pop(&synstack); + else if (synstack->dqvarnest > 0) + synstack->dqvarnest--; USTPUTC(CTLENDVAR, out); } else { USTPUTC(c, out); } break; case CLP: /* '(' in arithmetic */ - parenlevel++; + synstack->parenlevel++; USTPUTC(c, out); break; case CRP: /* ')' in arithmetic */ - if (parenlevel > 0) { + if (synstack->parenlevel > 0) { USTPUTC(c, out); - --parenlevel; + --synstack->parenlevel; } else { if (pgetc() == ')') { USTPUTC(CTLENDARI, out); - if (!--arinest) - syntax = prevsyntax; + synstack_pop(&synstack); } else { /* * unbalanced parens @@ -1019,7 +1051,7 @@ quotemark: case CIGN: break; default: - if (varnest == 0) + if (synstack->varnest == 0) goto endword; /* exit outer loop */ if (c != PEOA) { USTPUTC(c, out); @@ -1029,11 +1061,11 @@ quotemark: } } endword: - if (syntax == ARISYNTAX) + if (synstack->syntax == ARISYNTAX) synerror("Missing '))'"); - if (syntax != BASESYNTAX && eofmark == NULL) + if (synstack->syntax != BASESYNTAX && eofmark == NULL) synerror("Unterminated quoted string"); - if (varnest != 0) { + if (synstack->varnest != 0) { /* { */ synerror("Missing '}'"); } @@ -1210,6 +1242,8 @@ parsesub: { PARSEBACKQNEW(); } } else { + const char *newsyn = synstack->syntax; + USTPUTC(CTLVAR, out); typeloc = out - (char *)stackblock(); STADJUST(1, out); @@ -1260,6 +1294,8 @@ varname: } if (subtype == 0) { + int cc = c; + switch (c) { case ':': subtype = VSNUL; @@ -1273,27 +1309,40 @@ varname: break; case '%': case '#': - { - int cc = c; - subtype = c == '#' ? 
VSTRIMLEFT : - VSTRIMRIGHT; - c = pgetc_eatbnl(); - if (c == cc) - subtype++; - else - pungetc(); - break; - } + subtype = c == '#' ? VSTRIMLEFT : + VSTRIMRIGHT; + c = pgetc_eatbnl(); + if (c == cc) + subtype++; + else + pungetc(); + + newsyn = BASESYNTAX; + break; } } else { badsub: pungetc(); } + + if (newsyn == ARISYNTAX && subtype > VSNORMAL) + newsyn = DQSYNTAX; + + if (newsyn != synstack->syntax) { + synstack_push(&synstack, + synstack->prev ?: + alloca(sizeof(*synstack)), + newsyn); + + synstack->varpushed++; + synstack->dblquote = newsyn != BASESYNTAX; + } + *((char *)stackblock() + typeloc) = subtype; if (subtype != VSNORMAL) { - varnest++; - if (dblquote) - dqvarnest++; + synstack->varnest++; + if (synstack->dblquote) + synstack->dqvarnest++; } STPUTC('=', out); } @@ -1352,7 +1401,7 @@ parsebackq: { continue; } if (pc != '\\' && pc != '`' && pc != '$' - && (!dblquote || pc != '"')) + && (!synstack->dblquote || pc != '"')) STPUTC('\\', pout); if (pc > PEOA) { break; @@ -1428,10 +1477,10 @@ done: */ parsearith: { - if (++arinest == 1) { - prevsyntax = syntax; - syntax = ARISYNTAX; - } + synstack_push(&synstack, + synstack->prev ?: alloca(sizeof(*synstack)), + ARISYNTAX); + synstack->dblquote = 1; USTPUTC(CTLARI, out); goto parsearith_return; }