@@ -54,6 +54,7 @@
#include <sys/stat.h>
#include <unistd.h>
#include <wchar.h>
+#include <wctype.h>
/*
* Routines to expand arguments to commands. We have to deal with
@@ -101,6 +102,14 @@ struct ifsregion {
int nulonly; /* search for nul bytes only */
};
+/*
+ * Field-splitting state shared between ifsbreakup() and
+ * ifsbreakup_slow() while a string is being broken up.
+ */
+struct ifs_state {
+ /* IFS characters to match in the current region; ifsbreakup() points
+  * this at nullstr for nul-only regions and at the active IFS otherwise */
+ const char *ifs;
+ /* start of the field currently being collected */
+ char *start;
+ /* start of the trailing characters to remove, or NULL if none */
+ char *r;
+ /* copy of ifsbreakup()'s maxargs; counted down when positive, and
+  * zero means only one more argument is being read */
+ int maxargs;
+ /* set from the "is IFS white space" result of the last IFS character
+  * seen; cleared by any other character */
+ int ifsspc;
+};
+
/* output of current string */
static char *expdest;
/* list of back quote expressions */
@@ -112,6 +121,11 @@ static struct ifsregion *ifslastp;
/* holds expanded arg list */
static struct arglist exparg;
+/*
+ * Cached views of IFS, rebuilt by changeifs().
+ */
+/* ifsmap[c] is 1 for every byte c < 0x80 that occurs in IFS, including
+ * the terminating NUL; ifsbreakup() uses it to skip non-IFS bytes fast */
+static char ifsmap[128];
+/* the IFS string in effect (defifs when IFS is unset) */
+static const char *ncifs;
+/* byte length of the first IFS character: 0 for an empty IFS, 1 for a
+ * single-byte or undecodable first character */
+static size_t ifsmb0len;
+/* wide-character copy of IFS; NULL unless IFS contains a byte >= 0x80 */
+static wchar_t *wcifs;
+
static char *argstr(char *p, int flag);
static char *exptilde(char *startp, int flag);
static char *expari(char *start, int flag);
@@ -119,7 +133,7 @@ STATIC void expbackq(union node *, int);
STATIC char *evalvar(char *, int);
static size_t strtodest(const char *p, int flags);
static size_t memtodest(const char *p, size_t len, int flags);
-STATIC ssize_t varvalue(char *, int, int, int);
+STATIC ssize_t varvalue(char *, int, unsigned);
STATIC void expandmeta(struct strlist *);
static void addglob(const glob64_t *);
STATIC void expmeta(char *, unsigned, unsigned);
@@ -157,6 +171,30 @@ esclen(const char *start, const char *p) {
return esc;
}
+/*
+ * Decode the framing of the next (possibly escaped or multi-byte)
+ * character at p.
+ *
+ * The result packs two byte counts:
+ *   bits 0-7: number of leading control bytes to skip to reach the
+ *             character data (0 for a plain byte, 1 for CTLESC);
+ *   bits 8+ : number of bytes to advance from the character data to
+ *             the next character.  This is 1 for a plain or CTLESC-
+ *             escaped byte.  For a CTLMBCHAR sequence it is the stored
+ *             length plus 2, i.e. the character data plus two trailing
+ *             bytes (presumably the closing length/CTLMBCHAR pair --
+ *             confirm against the code that emits CTLMBCHAR).
+ *
+ * Callers advance past the whole character by adding both fields.
+ */
+static __attribute__((noinline)) unsigned mbnext(const char *p)
+{
+	unsigned skip = 0;
+	unsigned span = 1;
+	int first = (signed char)p[0];
+
+	if (__builtin_expect(first == CTLESC, 0)) {
+		/* one escape byte precedes the character */
+		skip = 1;
+	} else if (__builtin_expect(first == CTLMBCHAR, 0)) {
+		unsigned pos = 1;
+
+		/* the length byte may itself be escaped */
+		if ((signed char)p[pos] == CTLESC)
+			pos++;
+
+		span = (unsigned char)p[pos] + 2;
+		skip = pos + 1;
+	}
+
+	return skip | span << 8;
+}
static inline const char *getpwhome(const char *name)
{
@@ -545,6 +583,7 @@ static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend,
loc2 = rmesc;
do {
const char *s = loc2;
+ unsigned mb;
unsigned ml;
int match;
@@ -561,19 +600,9 @@ static char *scanleft(char *startp, char *endp, char *rmesc, char *rmescend,
if (!c)
break;
- if (*loc != (char)CTLMBCHAR) {
- if (*loc == (char)CTLESC)
- loc++;
- loc++;
- loc2++;
- continue;
- }
-
- if (*++loc == (char)CTLESC)
- loc++;
-
- ml = (unsigned char)*loc;
- loc += ml + 3;
+ mb = mbnext(loc);
+ loc += (mb & 0xff) + (mb >> 8);
+ ml = (mb >> 8) > 3 ? (mb >> 8) - 2 : 1;
loc2 += ml;
} while (1);
return 0;
@@ -753,7 +782,7 @@ evalvar(char *p, int flag)
}
again:
- varlen = varvalue(var, varflags, flag | mbchar, quoted);
+ varlen = varvalue(var, varflags, flag | mbchar);
if (varflags & VSNUL)
varlen--;
@@ -970,23 +999,23 @@ static size_t strtodest(const char *p, int flags)
* Add the value of a specialized variable to the stack string.
*/
-STATIC ssize_t
-varvalue(char *name, int varflags, int flags, int quoted)
+static ssize_t varvalue(char *name, int varflags, unsigned flags)
{
+ int subtype = varflags & VSTYPE;
+ unsigned long seplen;
+ const char *seps;
+ ssize_t len = 0;
+ size_t start;
+ int discard;
+ char **ap;
int num;
char *p;
int i;
- int sep;
- char sepc;
- char **ap;
- int subtype = varflags & VSTYPE;
- int discard = (subtype == VSPLUS || subtype == VSLENGTH) |
- (flags & EXP_DISCARD);
- ssize_t len = 0;
- size_t start;
- char c;
- if (!subtype) {
+ discard = (subtype == VSPLUS || subtype == VSLENGTH) |
+ (flags & EXP_DISCARD);
+
+ if (unlikely(!subtype)) {
if (discard)
return -1;
@@ -994,7 +1023,8 @@ varvalue(char *name, int varflags, int flags, int quoted)
}
flags &= discard ? ~QUOTES_ESC : ~0;
- sep = (flags & EXP_FULL) << CHAR_BIT;
+ seps = nullstr;
+ seplen = flags & EXP_FULL;
start = expdest - (char *)stackblock();
switch (*name) {
@@ -1025,13 +1055,14 @@ numvar:
expdest = p;
break;
case '@':
- if (quoted && sep)
+ if ((flags & (EXP_QUOTED | EXP_FULL)) ==
+ (EXP_QUOTED | EXP_FULL))
goto param;
/* fall through */
case '*':
- /* We will set c to 0 or ~0 depending on whether
+ /* We will set seplen to 0 or !0 depending on whether
* we're doing field splitting. We won't do field
- * splitting if either we're quoted or sep is zero.
+ * splitting if either we're quoted or seplen is zero.
*
* Instead of testing (quoted || !sep) the following
* trick optimises away any branches by using the
@@ -1043,20 +1074,22 @@ numvar:
#if EXP_QUOTED >> CHAR_BIT != EXP_FULL
#error The following two lines expect EXP_QUOTED == EXP_FULL << CHAR_BIT
#endif
- c = !((quoted | ~sep) & EXP_QUOTED) - 1;
- sep &= ~quoted;
- sep |= ifsset() ? (unsigned char)(c & ifsval()[0]) : ' ';
+ seplen &= ~(flags >> CHAR_BIT);
+ if (!seplen)
+ seps = ncifs;
+ seplen = ((seplen - 1) & (ifsmb0len - 1)) + 1;
param:
- sepc = sep;
if (!(ap = shellparam.p))
return -1;
- while ((p = *ap++)) {
+ if (!(p = *ap))
+ break;
+ for (;;) {
len += strtodest(p, flags);
- if (*ap && sep) {
- len++;
- memtodest(&sepc, 1, flags | EXP_KEEPNUL);
- }
+ if (!(p = *++ap))
+ break;
+
+ len += memtodest(seps, seplen, flags | EXP_KEEPNUL);
}
break;
case '0':
@@ -1117,7 +1150,126 @@ recordregion(int start, int end, int nulonly)
ifslastp->nulonly = nulonly;
}
+/*
+ * Classify the character at p against IFS.
+ *
+ * p   points at the character data (control bytes already skipped).
+ * ml  is the multi-byte length of the character, or 0 for a single byte.
+ * ifs is the IFS string for the current region.
+ *
+ * Returns a two-bit value: bit 1 is set when the character is an IFS
+ * character, bit 0 when it is additionally white space per iswspace().
+ * A NUL byte matches the string terminator and is therefore reported as
+ * an IFS character; its white-space test is done on the first IFS
+ * character instead.
+ */
+static unsigned ifsisifs(const char *p, unsigned ml, const char *ifs)
+{
+	wchar_t first = 0;
+	wchar_t ch = *p;
+	bool member;
+
+	if (likely(ifs[0]) && unlikely(wcifs)) {
+		/* IFS holds non-ASCII characters: compare wide characters */
+		if (ch & 0x80) {
+			mbstate_t st = {};
+			wchar_t decoded;
+
+			/* an undecodable sequence is never an IFS character */
+			if (mbrtowc(&decoded, p, ml, &st) != ml)
+				return 0;
+			ch = decoded;
+		}
+
+		member = wcschr(wcifs, ch);
+		first = wcifs[0];
+	} else if (likely(!ml)) {
+		/* single byte against a byte-only IFS */
+		member = strchr(ifs, ch);
+		first = ifs[0];
+	} else {
+		/* multi-byte character, but IFS has none */
+		return 0;
+	}
+
+	if (!member)
+		return 0;
+
+	return 2 | (iswspace(ch ? ch : first) != 0);
+}
+
+/*
+ * Process exactly one (possibly escaped or multi-byte) character of the
+ * string being split and update the field-splitting state.
+ *
+ * ifst    splitting state shared with ifsbreakup()
+ * arglist list that completed fields are appended to
+ * nulonly non-zero when the current or the previous region is nul-only,
+ *         in which case an IFS character does not update ifst->ifsspc
+ * p       start of the character to examine
+ *
+ * Returns a pointer just past the character that was consumed.
+ */
+static char *ifsbreakup_slow(struct ifs_state *ifst, struct arglist *arglist,
+			     int nulonly, char *p)
+{
+	struct strlist *sp;
+	unsigned ifschar;
+	unsigned sisifs;
+	bool isdefifs;
+	unsigned ml;
+	bool isifs;
+	char *q;
+
+	/* q keeps the start of the character, control bytes included */
+	q = p;
+
+	ifschar = mbnext(p);
+	p += ifschar & 0xff;
+	ml = (ifschar >> 8) > 3 ?
+	     (ifschar >> 8) - 2 : 0;
+
+	sisifs = ifsisifs(p, ml, ifst->ifs);
+	p += ifschar >> 8;
+
+	isifs = sisifs >> 1;
+	isdefifs = sisifs & 1;
+
+	/* If only reading one more argument:
+	 * If we have exactly one field,
+	 * read that field without its terminator.
+	 * If we have more than one field,
+	 * read all fields including their terminators,
+	 * except for trailing IFS whitespace.
+	 *
+	 * This means that if we have only IFS
+	 * characters left, and at most one
+	 * of them is non-whitespace, we stop
+	 * reading here.
+	 * Otherwise, we read all the remaining
+	 * characters except for trailing
+	 * IFS whitespace.
+	 *
+	 * In any case, r indicates the start
+	 * of the characters to remove, or NULL
+	 * if no characters should be removed.
+	 */
+	if (!ifst->maxargs) {
+		if (isdefifs) {
+			if (!ifst->r)
+				ifst->r = q;
+			return p;
+		}
+
+		if (!(isifs && ifst->ifsspc))
+			ifst->r = NULL;
+	} else if (ifst->ifsspc) {
+		/* A field was just ended by IFS whitespace: absorb further
+		 * whitespace and at most one non-whitespace IFS character
+		 * into the same delimiter. */
+		if (isifs)
+			q = p;
+
+		ifst->start = q;
+
+		if (isdefifs)
+			return p;
+	} else if (isifs) {
+		int ifsspc = ifst->ifsspc;
+
+		if (!nulonly) {
+			ifsspc = isdefifs;
+			ifst->ifsspc = ifsspc;
+		}
+
+		/* Ignore IFS whitespace at start */
+		if (q == ifst->start && ifsspc) {
+			ifst->start = p;
+			/* No field has been delimited yet, so a following
+			 * non-whitespace IFS character must still produce
+			 * an (empty) field instead of being absorbed. */
+			ifst->ifsspc = 0;
+			return p;
+		}
+		if (ifst->maxargs > 0 && !--ifst->maxargs) {
+			ifst->r = q;
+			return p;
+		}
+		/* terminate the field and append it to the list */
+		*q = '\0';
+		sp = (struct strlist *)stalloc(sizeof *sp);
+		sp->text = ifst->start;
+		*arglist->lastp = sp;
+		arglist->lastp = &sp->next;
+		ifst->start = p;
+		return p;
+	}
+
+	ifst->ifsspc = 0;
+	return p;
+}
/*
* Break the argument string into pieces based upon IFS and add the
@@ -1130,21 +1282,19 @@ void
ifsbreakup(char *string, int maxargs, struct arglist *arglist)
{
struct ifsregion *ifsp;
+ struct ifs_state ifst;
+ const char *realifs;
struct strlist *sp;
- char *start;
- char *p;
- char *q;
- char *r = NULL;
- const char *ifs, *realifs;
- int ifsspc;
int nulonly;
+ char *p;
-
- start = string;
+ ifst.r = NULL;
+ ifst.start = string;
+ ifst.maxargs = maxargs;
if (ifslastp != NULL) {
- ifsspc = 0;
+ ifst.ifsspc = 0;
nulonly = 0;
- realifs = ifsset() ? ifsval() : defifs;
+ realifs = ncifs;
ifsp = &ifsfirst;
do {
int afternul;
@@ -1152,106 +1302,60 @@ ifsbreakup(char *string, int maxargs, struct arglist *arglist)
p = string + ifsp->begoff;
afternul = nulonly;
nulonly = ifsp->nulonly;
- ifs = nulonly ? nullstr : realifs;
- ifsspc = 0;
- while (p < string + ifsp->endoff) {
- int c;
- bool isifs;
- bool isdefifs;
+ ifst.ifs = nulonly ? nullstr : realifs;
+ ifst.ifsspc = 0;
+ for (;;) {
+ char *p0 = p;
- q = p;
- c = *p++;
- if (c == (char)CTLESC)
- c = *p++;
+ while (string + ifsp->endoff - p >= 8) {
+ union {
+ uint64_t qw;
+ unsigned char b[8];
+ } x;
- isifs = strchr(ifs, c);
- isdefifs = false;
- if (isifs)
- isdefifs = strchr(defifs, c);
+ x.qw = *(uint64_t *)p;
- /* If only reading one more argument:
- * If we have exactly one field,
- * read that field without its terminator.
- * If we have more than one field,
- * read all fields including their terminators,
- * except for trailing IFS whitespace.
- *
- * This means that if we have only IFS
- * characters left, and at most one
- * of them is non-whitespace, we stop
- * reading here.
- * Otherwise, we read all the remaining
- * characters except for trailing
- * IFS whitespace.
- *
- * In any case, r indicates the start
- * of the characters to remove, or NULL
- * if no characters should be removed.
- */
- if (!maxargs) {
- if (isdefifs) {
- if (!r)
- r = q;
- continue;
- }
-
- if (!(isifs && ifsspc))
- r = NULL;
-
- ifsspc = 0;
- continue;
+ if ((x.qw & 0x8080808080808080))
+ break;
+ if (ifsmap[x.b[0]] |
+ ifsmap[x.b[1]] |
+ ifsmap[x.b[2]] |
+ ifsmap[x.b[3]] |
+ ifsmap[x.b[4]] |
+ ifsmap[x.b[5]] |
+ ifsmap[x.b[6]] |
+ ifsmap[x.b[7]])
+ break;
+ p += 8;
}
- if (ifsspc) {
- if (isifs)
- q = p;
-
- start = q;
-
- if (isdefifs)
- continue;
-
- isifs = false;
+ if (p != p0) {
+ if (!ifst.maxargs)
+ ifst.r = NULL;
+ else if (ifst.ifsspc)
+ ifst.start = p0;
+ ifst.ifsspc = 0;
}
- if (isifs) {
- if (!(afternul || nulonly))
- ifsspc = isdefifs;
- /* Ignore IFS whitespace at start */
- if (q == start && ifsspc) {
- start = p;
- ifsspc = 0;
- continue;
- }
- if (maxargs > 0 && !--maxargs) {
- r = q;
- continue;
- }
- *q = '\0';
- sp = (struct strlist *)stalloc(sizeof *sp);
- sp->text = start;
- *arglist->lastp = sp;
- arglist->lastp = &sp->next;
- start = p;
- continue;
- }
+ if (p >= string + ifsp->endoff)
+ break;
- ifsspc = 0;
+ p = ifsbreakup_slow(&ifst, arglist,
+ afternul | nulonly, p);
}
} while ((ifsp = ifsp->next) != NULL);
if (nulonly)
goto add;
+ if (ifst.r)
+ *ifst.r = '\0';
}
- if (r)
- *r = '\0';
-
- if (!*start)
+ if (!*ifst.start)
return;
add:
sp = (struct strlist *)stalloc(sizeof *sp);
- sp->text = start;
+ sp->text = ifst.start;
*arglist->lastp = sp;
arglist->lastp = &sp->next;
}
@@ -1277,7 +1381,56 @@ out:
ifslastp = NULL;
}
+/*
+ * Variable-change hook for IFS: rebuild the cached lookup data used by
+ * field splitting.
+ *
+ * ifs is the new value of IFS; it is replaced by defifs when IFS is
+ * unset.  On return:
+ *   ncifs     points at the IFS string in effect,
+ *   ifsmap    flags every byte < 0x80 present in IFS (NUL included),
+ *   ifsmb0len is the byte length of the first IFS character,
+ *   wcifs     is a freshly allocated wide-character copy of IFS, or
+ *             NULL when IFS has no byte >= 0x80.
+ * The previous wcifs buffer is freed.
+ */
+void changeifs(const char *ifs)
+{
+	mbstate_t mbs = {};
+	wchar_t *nwcifs;
+	unsigned mb = 0;
+	size_t len = 0;
+	const char *p;
+	size_t ml;
+
+	if (!ifsset())
+		ifs = defifs;
+	ncifs = ifs;
+
+	memset(ifsmap, 0, sizeof(ifsmap));
+
+	/* One pass: mark ASCII bytes, note any high-bit byte, count length.
+	 * The terminating NUL is marked too before the loop exits. */
+	for (p = ifs;; p++) {
+		unsigned c = (unsigned char)*p;
+
+		mb |= c >> 7;
+		if (!(c >> 7))
+			ifsmap[c] = 1;
+
+		if (c == 0)
+			break;
+
+		len++;
+	}
+
+	nwcifs = NULL;
+
+	ifsmb0len = !!len;
+
+	if (!mb)
+		goto out;
+
+	ml = mbrlen(ifs, len, &mbs);
+	/* treat an invalid or incomplete first character as one byte */
+	if (ml == (size_t)-2 || ml == (size_t)-1)
+		ml = 1;
+	ifsmb0len = ml;
+
+	nwcifs = ckmalloc((len + 1) * sizeof(*wcifs));
+	memset(nwcifs, 0, (len + 1) * sizeof(*wcifs));
+
+	/* mbrlen() may have left the state undefined (invalid sequence) or
+	 * mid-character (incomplete sequence); restart from the initial
+	 * state before converting the whole string from its beginning. */
+	memset(&mbs, 0, sizeof(mbs));
+	p = ifs;
+	mbsrtowcs(nwcifs, &p, len + 1, &mbs);
+
+out:
+	ckfree(wcifs);
+	wcifs = nwcifs;
+}
/*
* Expand shell metacharacters. At this point, the only control characters
@@ -1420,31 +1573,25 @@ static void expmeta_rmescapes(char *enddir, char *name)
preglob(strcpy(enddir, name), RMESCAPE_EMETA);
}
-static unsigned mbcharlen(char *p)
+static int skipesc(char *p)
{
+ unsigned short mb;
int esc = 0;
- if (*++p == (char)CTLESC)
- esc++;
+ mb = mbnext(p);
+ if ((mb >> 8) > 3)
+ return (mb & 0xff) + (mb >> 8) - 1;
- return esc + 3 + (unsigned char)p[esc];
-}
+ esc = mb & 0xff;
-static size_t skipesc(char *p)
-{
- size_t esc = 0;
-
- if (p[esc] == (char)CTLMBCHAR)
- esc += mbcharlen(p);
- else if (p[esc] == (char)CTLESC)
- esc++;
- else if (p[esc] == '\\' && p[esc + 1]) {
+ if (!esc && p[esc] == '\\' && p[esc + 1]) {
while (p[++esc] == (char)CTLQUOTEMARK)
;
- if (p[esc] == (char)CTLMBCHAR)
- esc += mbcharlen(p + esc);
- else if (p[esc] == (char)CTLESC)
- esc++;
+ mb = mbnext(p + esc);
+ esc += mb & 0xff;
+
+ if ((mb >> 8) > 3)
+ esc += (mb >> 8) - 1;
}
return esc;
@@ -1845,6 +1992,7 @@ _rmescapes(char *str, int flag)
inquotes = 0;
notescaped = globbing;
while (*p) {
+ unsigned mb;
unsigned ml;
int newnesc = globbing;
@@ -1862,10 +2010,11 @@ _rmescapes(char *str, int flag)
goto setnesc;
}
} else if (*p == (char)CTLMBCHAR) {
- if (*++p == (char)CTLESC)
- p++;
+ mb = mbnext(p);
+ ml = mb >> 8;
- ml = (unsigned char)*p++;
+ ml -= 2;
+ p += mb & 0xff;
q = mempcpy(q, p, ml);
p += ml + 2;
goto setnesc;
@@ -75,6 +75,7 @@ void removerecordregions(int);
void ifsbreakup(char *, int, struct arglist *);
void ifsfree(void);
void restore_handler_expandarg(struct jmploc *savehandler, int err);
+void changeifs(const char *);
/* From arith.y */
intmax_t arith(const char *);
@@ -86,7 +86,7 @@ struct var varinit[] = {
#if ATTY
{ 0, VSTRFIXED|VTEXTFIXED|VUNSET, "ATTY\0", 0 },
#endif
- { 0, VSTRFIXED|VTEXTFIXED, defifsvar, 0 },
+ { 0, VSTRFIXED|VTEXTFIXED, defifsvar, changeifs },
{ 0, VSTRFIXED|VTEXTFIXED|VUNSET, "MAIL\0", changemail },
{ 0, VSTRFIXED|VTEXTFIXED|VUNSET, "MAILPATH\0", changemail },
{ 0, VSTRFIXED|VTEXTFIXED, defpathvar, changepath },
@@ -267,9 +267,6 @@ struct var *setvareq(char *s, int flags)
n);
}
- if (vp->func && (flags & VNOFUNC) == 0)
- (*vp->func)(varnull(s));
-
if ((vp->flags & (VTEXTFIXED|VSTACK)) == 0)
ckfree(vp->text);
@@ -301,6 +298,9 @@ out_free:
vp->text = s;
vp->flags = flags;
+ if (vp->func && (flags & VNOFUNC) == 0)
+ (*vp->func)(varnull(s));
+
out:
return vp;
}
@@ -531,12 +531,12 @@ poplocalvars(void)
vp->flags &= ~(VSTRFIXED|VREADONLY);
unsetvar(vp->text);
} else {
- if (vp->func)
- (*vp->func)(varnull(lvp->text));
if ((vp->flags & (VTEXTFIXED|VSTACK)) == 0)
ckfree(vp->text);
vp->flags = lvp->flags;
vp->text = lvp->text;
+ if (vp->func)
+ (*vp->func)(varnull(vp->text));
}
ckfree(lvp);
}
When multi-byte characters are used in IFS, they will be used for field splitting. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> --- src/expand.c | 455 ++++++++++++++++++++++++++++++++++----------------- src/expand.h | 1 + src/var.c | 12 +- 3 files changed, 309 insertions(+), 159 deletions(-)