• R/O
  • SSH
  • HTTPS

yash: 提交


Commit MetaInfo

修訂4087 (tree)
時間2020-09-26 12:23:47
作者magicant

Log Message

Faster brace expansion (#40708)

Previously, the time complexity of the brace expansion algorithm for an
input string of the form "{{{{{{...{{{" was O(n2) where n is the length
of the input string. It was because, for each '{', the algorithm tries
to search all the remaining characters for a matching '}'.

In the new algorithm, we first scan the entire input string in O(n) time
and then generates the final results using the scan result. It should
work faster in case the expansion results in the same string as the
input.

Change Summary

差異

--- yash/trunk/expand.c (revision 4086)
+++ yash/trunk/expand.c (revision 4087)
@@ -103,6 +103,15 @@
103103 static void subst_length_each(void **slist)
104104 __attribute__((nonnull));
105105
106+/* data used in brace expansion */
107+struct brace_expand_T {
108+ const wchar_t *word; /* the word to expand */
109+ const char *cc; /* the corresponding charcategory_T string */
110+ void *const *graph; /* see the comment in the `expand_brace` function */
111+ plist_T *valuelist; /* the list to add the results (words) */
112+ plist_T *cclist; /* the list to add the results (charcategory_T) */
113+};
114+
106115 static void expand_brace_each(
107116 void *const *restrict values, void *const *restrict ccs,
108117 plist_T *restrict valuelist, plist_T *restrict cclist)
@@ -111,9 +120,13 @@
111120 wchar_t *restrict word, char *restrict cc,
112121 plist_T *restrict valuelist, plist_T *restrict cclist)
113122 __attribute__((nonnull));
123+static void generate_brace_expand_results(
124+ const struct brace_expand_T *restrict e, size_t ci,
125+ xwcsbuf_T *restrict valuebuf, xstrbuf_T *restrict ccbuf)
126+ __attribute__((nonnull));
114127 static bool try_expand_brace_sequence(
115- wchar_t *word, char *restrict cc, wchar_t *startc,
116- plist_T *restrict valuelist, plist_T *restrict cclist)
128+ const struct brace_expand_T *restrict e, size_t ci,
129+ xwcsbuf_T *restrict valuebuf, xstrbuf_T *restrict ccbuf)
117130 __attribute__((nonnull));
118131 static bool has_leading_zero(const wchar_t *restrict s, bool *restrict sign)
119132 __attribute__((nonnull));
@@ -1110,96 +1123,179 @@
11101123 {
11111124 #define idx(p) ((size_t) ((wchar_t *) (p) - word))
11121125
1113- size_t ci = 0;
1126+ /* quick check */
1127+ const wchar_t *c;
1128+ if ((c = wcschr(word, L'{')) == NULL || (c = wcschr(c + 1, L'}')) == NULL) {
1129+no_expansion:
1130+ pl_add(valuelist, word);
1131+ pl_add(cclist, cc);
1132+ return;
1133+ }
11141134
1115-start:
1116-
1117- /* find '{' */
1118- do {
1119- wchar_t *c = wcschr(&word[ci], L'{');
1120- if (c == NULL) {
1121- /* no L'{', no expansion */
1122- pl_add(valuelist, word);
1123- pl_add(cclist, cc);
1124- return;
1135+ /* First, we create a `graph' by scanning all the characters in the `word'.
1136+ * The graph is a list of pointers that has the same length as the word,
1137+ * which means each element in the graph corresponds to the character at the
1138+ * same index in the word. For each L'{' and L',' in the word, the
1139+ * corresponding graph element will be a pointer to the next matching L','
1140+ * or L'}' in the word. For L'}' that has one or more matching L','s, the
1141+ * corresponding element is a pointer to the L'}' itself. For other
1142+ * characters, the graph element will be NULL. */
1143+ /* To find matching L','s and L'}'s correctly, we keep track of nested
1144+ * braces by using a `stack'. Every time we find a new L'{', a pointer to
1145+ * the L'{' is pushed into the stack *twice*. The first one keeps pointing
1146+ * to the L'{' while the second is updated to point to the last found L','
1147+ * during the scan. The two pointers are popped out of the stack when a
1148+ * matching L'}' is found. */
1149+ bool pairfound = false;
1150+ plist_T graph;
1151+ plist_T stack;
1152+ pl_initwithmax(&graph, idx(c) + wcslen(c) /* = wcslen(word) */);
1153+ pl_init(&stack);
1154+ while (word[graph.length] != L'\0') {
1155+ if (cc[graph.length] == CC_LITERAL) {
1156+ switch (word[graph.length]) {
1157+ case L'{':
1158+ pl_add(&stack, &word[graph.length]);
1159+ pl_add(&stack, &word[graph.length]);
1160+ break;
1161+ case L',':
1162+ if (stack.length > 0) {
1163+ size_t ci = idx(stack.contents[stack.length - 1]);
1164+ assert(ci < graph.length);
1165+ graph.contents[ci] = &word[graph.length];
1166+ stack.contents[stack.length - 1] = &word[graph.length];
1167+ }
1168+ break;
1169+ case L'}':
1170+ if (stack.length > 0) {
1171+ size_t ci = idx(stack.contents[stack.length - 1]);
1172+ assert(ci < graph.length);
1173+ graph.contents[ci] = &word[graph.length];
1174+ assert(stack.length % 2 == 0);
1175+ pl_truncate(&stack, stack.length - 2);
1176+ pairfound = true;
1177+ if (word[ci] == L',') {
1178+ pl_add(&graph, &word[graph.length]);
1179+ continue;
1180+ }
1181+ }
1182+ break;
1183+ }
11251184 }
1126- ci = idx(c);
1127- } while (cc[ci++] != CC_LITERAL);
1185+ pl_add(&graph, NULL);
1186+ }
11281187
1129- if (try_expand_brace_sequence(word, cc, &word[ci], valuelist, cclist)) {
1130- return;
1188+ /* If no pairs of braces were found, we don't need to expand anything. */
1189+ if (!pairfound) {
1190+ pl_destroy(&stack);
1191+ pl_destroy(&graph);
1192+ goto no_expansion;
11311193 }
11321194
1133- plist_T splitpoints;
1134- unsigned nest;
1135-
1136- /* collect pointers to characters where the word is split */
1137- /* The pointers point to the character just after L'{', L',' or L'}'. */
1138- pl_init(&splitpoints);
1139- pl_add(&splitpoints, &word[ci]);
1140- nest = 0;
1141- for (; word[ci] != L'\0'; ci++) {
1142- if (cc[ci] != CC_LITERAL)
1143- continue;
1144- switch (word[ci]) {
1145- case L'{':
1146- nest++;
1147- break;
1148- case L',':
1149- if (nest == 0)
1150- pl_add(&splitpoints, &word[ci + 1]);
1151- break;
1152- case L'}':
1153- if (nest > 0) {
1154- nest--;
1155- break;
1156- } else if (splitpoints.length == 1) {
1157- /* no comma between { and } */
1158- goto restart;
1159- } else {
1160- pl_add(&splitpoints, &word[ci + 1]);
1161- goto done;
1162- }
1195+ /* After scanning the whole `word', if we have any elements left in the
1196+ * stack, they are L'{'s that have no matching L'}'s. They cannot be
1197+ * expanded, so let's remove them from the graph. */
1198+ while (stack.length > 0) {
1199+ assert(stack.length % 2 == 0);
1200+ size_t ci = idx(stack.contents[stack.length - 2]);
1201+ const wchar_t *cnext;
1202+ while ((cnext = graph.contents[ci]) != NULL) {
1203+ graph.contents[ci] = NULL;
1204+ ci = idx(cnext);
11631205 }
1206+ pl_truncate(&stack, stack.length - 2);
11641207 }
1165-restart:
1166- /* if there is no L',' or L'}' corresponding to L'{',
1167- * find the next L'{' and try again */
1168- ci = idx(splitpoints.contents[0]);
1169- pl_destroy(&splitpoints);
1170- goto start;
1208+ pl_destroy(&stack);
11711209
1172-done:;
1173- size_t lastelemindex = splitpoints.length - 1;
1174- size_t headlen = idx(splitpoints.contents[0]) - 1;
1175- size_t taillen = wcslen(splitpoints.contents[lastelemindex]);
1176- size_t totallen = idx(splitpoints.contents[lastelemindex]) + taillen;
1177- for (size_t i = 0; i < lastelemindex; i++) {
1178- xwcsbuf_T buf;
1179- xstrbuf_T cbuf;
1180- wb_initwithmax(&buf, totallen);
1181- sb_initwithmax(&cbuf, totallen);
1210+ /* Now start expansion! */
1211+ struct brace_expand_T e = {
1212+ .word = word,
1213+ .cc = cc,
1214+ .graph = graph.contents,
1215+ .valuelist = valuelist,
1216+ .cclist = cclist,
1217+ };
1218+ xwcsbuf_T valuebuf;
1219+ xstrbuf_T ccbuf;
1220+ wb_init(&valuebuf);
1221+ sb_init(&ccbuf);
1222+ generate_brace_expand_results(&e, 0, &valuebuf, &ccbuf);
1223+ pl_destroy(&graph);
1224+ free(word);
1225+ free(cc);
1226+#undef idx
1227+}
11821228
1183- wb_ncat_force(&buf, word, headlen);
1184- sb_ncat_force(&cbuf, cc, headlen);
1229+/* Generates results of brace expansion.
1230+ * Part of `e->word' that has been processed before calling this function
1231+ * may have been added to `valuebuf' and `ccbuf'.
1232+ * This function modifies `valuebuf' and `ccbuf' in place to construct the
1233+ * results, and finally destroys them.
1234+ * The results are added to `e->valuelist' and `e->cclist'. */
1235+void generate_brace_expand_results(
1236+ const struct brace_expand_T *restrict e, size_t ci,
1237+ xwcsbuf_T *restrict valuebuf, xstrbuf_T *restrict ccbuf)
1238+{
1239+start:
1240+ /* add normal characters up to the next delimiter */
1241+ while (e->word[ci] != L'\0' && e->graph[ci] == NULL) {
1242+normal:
1243+ wb_wccat(valuebuf, e->word[ci]);
1244+ sb_ccat(ccbuf, e->cc[ci]);
1245+ ci++;
1246+ }
11851247
1186- size_t len = (wchar_t *) splitpoints.contents[i + 1] -
1187- (wchar_t *) splitpoints.contents[i ] - 1;
1188- ci = idx(splitpoints.contents[i]);
1189- wb_ncat_force(&buf, &word[ci], len);
1190- sb_ncat_force(&cbuf, &cc[ci], len);
1248+ switch (e->word[ci]) {
1249+ case L'\0':
1250+ /* No more characters: we're done! */
1251+ pl_add(e->valuelist, wb_towcs(valuebuf));
1252+ pl_add(e->cclist, sb_tostr(ccbuf));
1253+ return;
1254+ case L',':
1255+ /* skip up to next L'}' and go on */
1256+ do {
1257+ const wchar_t *nextdelimiter = e->graph[ci];
1258+ ci = nextdelimiter - e->word;
1259+ } while (e->word[ci] == L',');
1260+ assert(e->word[ci] == L'}');
1261+ /* falls thru! */
1262+ case L'}':
1263+ /* skip this L'}' and go on */
1264+ ci++;
1265+ goto start;
1266+ }
1267+ assert(e->word[ci] == L'{');
11911268
1192- ci = idx(splitpoints.contents[lastelemindex]);
1193- wb_ncat_force(&buf, &word[ci], taillen);
1194- sb_ncat_force(&cbuf, &cc[ci], taillen);
1195- assert(buf.length == cbuf.length);
1269+ const wchar_t *nextdelimiter = e->graph[ci];
1270+ if (*nextdelimiter == L'}') {
1271+ /* No commas between the braces: try numeric brace expansion */
1272+ if (try_expand_brace_sequence(e, ci, valuebuf, ccbuf))
1273+ return;
11961274
1197- /* expand the remaining portion recursively */
1198- expand_brace(wb_towcs(&buf), sb_tostr(&cbuf), valuelist, cclist);
1275+ /* No numeric brace expansion happened.
1276+ * Treat this brace as normal and go on. */
1277+ goto normal;
11991278 }
1200- pl_destroy(&splitpoints);
1201- free(word);
1202- free(cc);
1279+ assert(*nextdelimiter == L',');
1280+
1281+ /* Now generate the results (except the last one) */
1282+ while (*nextdelimiter == L',') {
1283+ xwcsbuf_T valuebuf2;
1284+ xstrbuf_T ccbuf2;
1285+ wb_initwithmax(&valuebuf2, valuebuf->maxlength);
1286+ wb_ncat_force(&valuebuf2, valuebuf->contents, valuebuf->length);
1287+ sb_initwithmax(&ccbuf2, ccbuf->maxlength);
1288+ sb_ncat_force(&ccbuf2, ccbuf->contents, ccbuf->length);
1289+ ci++;
1290+ generate_brace_expand_results(e, ci, &valuebuf2, &ccbuf2);
1291+ ci = nextdelimiter - e->word;
1292+ nextdelimiter = e->graph[ci];
1293+ }
1294+ assert(*nextdelimiter == L'}');
1295+
1296+ /* Generate the last one */
1297+ ci++;
1298+ goto start; // generate_brace_expand_results(e, ci, valuebuf, ccbuf);
12031299 }
12041300
12051301 /* Tries numeric brace expansion like "{01..05}".
@@ -1209,41 +1305,43 @@
12091305 * `startc' is a pointer to the character right after L'{' in `word'.
12101306 */
12111307 bool try_expand_brace_sequence(
1212- wchar_t *const word, char *restrict const cc, wchar_t *const startc,
1213- plist_T *restrict valuelist, plist_T *restrict cclist)
1308+ const struct brace_expand_T *restrict e, size_t ci,
1309+ xwcsbuf_T *restrict valuebuf, xstrbuf_T *restrict ccbuf)
12141310 {
1215- long start, end, delta, value;
1216- wchar_t *dotp, *dotbracep, *bracep, *c;
1217- int startlen, endlen, len, wordlen;
1218- bool sign = false;
1311+ assert(e->word[ci] == L'{');
1312+ ci++;
12191313
1220- assert(startc[-1] == L'{');
1221- c = startc;
1314+ size_t starti = ci;
12221315
12231316 /* parse the starting point */
1224- dotp = wcschr(c, L'.');
1317+ const wchar_t *c = &e->word[ci];
1318+ const wchar_t *dotp = wcschr(c, L'.');
12251319 if (dotp == NULL || c == dotp || dotp[1] != L'.')
12261320 return false;
1227- startlen = has_leading_zero(c, &sign) ? (dotp - c) : 0;
1321+ bool sign = false;
1322+ int startlen = has_leading_zero(c, &sign) ? (dotp - c) : 0;
12281323 errno = 0;
1229- start = wcstol(c, &c, 10);
1230- if (errno != 0 || c != dotp)
1324+ wchar_t *cp;
1325+ long start = wcstol(c, &cp, 10);
1326+ if (errno != 0 || cp != dotp)
12311327 return false;
12321328
12331329 c = dotp + 2;
12341330
12351331 /* parse the ending point */
1236- dotbracep = wcspbrk(c, L".}");
1332+ const wchar_t *dotbracep = wcspbrk(c, L".}");
12371333 if (dotbracep == NULL || c == dotbracep ||
12381334 (dotbracep[0] == L'.' && dotbracep[1] != L'.'))
12391335 return false;
1240- endlen = has_leading_zero(c, &sign) ? (dotbracep - c) : 0;
1336+ int endlen = has_leading_zero(c, &sign) ? (dotbracep - c) : 0;
12411337 errno = 0;
1242- end = wcstol(c, &c, 10);
1243- if (errno != 0 || c != dotbracep)
1338+ long end = wcstol(c, &cp, 10);
1339+ if (errno != 0 || cp != dotbracep)
12441340 return false;
12451341
12461342 /* parse the delta */
1343+ long delta;
1344+ const wchar_t *bracep;
12471345 if (dotbracep[0] == L'.') {
12481346 assert(dotbracep[1] == L'.');
12491347 c = dotbracep + 2;
@@ -1251,8 +1349,8 @@
12511349 if (bracep == NULL || c == bracep)
12521350 return false;
12531351 errno = 0;
1254- delta = wcstol(c, &c, 10);
1255- if (delta == 0 || errno != 0 || c != bracep)
1352+ delta = wcstol(c, &cp, 10);
1353+ if (delta == 0 || errno != 0 || cp != bracep)
12561354 return false;
12571355 } else {
12581356 assert(dotbracep[0] == L'}');
@@ -1264,37 +1362,33 @@
12641362 }
12651363
12661364 /* validate charcategory_T */
1267- if (cc[idx(bracep)] != CC_LITERAL)
1365+ size_t bracei = bracep - e->word;
1366+ if (e->cc[bracei] != CC_LITERAL)
12681367 return false;
1269- for (size_t ci = idx(startc); ci < idx(bracep); ci++)
1270- if (cc[ci] & CC_QUOTED)
1368+ for (ci = starti; ci < bracei; ci++)
1369+ if (e->cc[ci] & CC_QUOTED)
12711370 return false;
12721371
12731372 /* expand the sequence */
1274- value = start;
1275- len = (startlen > endlen) ? startlen : endlen;
1276- wordlen = idx(bracep + 1) + wcslen(bracep + 1); // = wcslen(word);
1373+ long value = start;
1374+ int len = (startlen > endlen) ? startlen : endlen;
1375+ ci = bracep - e->word + 1;
12771376 do {
1278- xwcsbuf_T buf;
1279- xstrbuf_T cbuf;
1280- wb_initwithmax(&buf, wordlen);
1281- sb_initwithmax(&cbuf, wordlen);
1377+ xwcsbuf_T valuebuf2;
1378+ xstrbuf_T ccbuf2;
1379+ wb_initwithmax(&valuebuf2, valuebuf->maxlength);
1380+ wb_ncat_force(&valuebuf2, valuebuf->contents, valuebuf->length);
1381+ sb_initwithmax(&ccbuf2, ccbuf->maxlength);
1382+ sb_ncat_force(&ccbuf2, ccbuf->contents, ccbuf->length);
12821383
1283- size_t slen = idx(startc - 1);
1284- wb_ncat_force(&buf, word, slen);
1285- sb_ncat_force(&cbuf, cc, slen);
1286-
1287- int plen = wb_wprintf(&buf, sign ? L"%0+*ld" : L"%0*ld", len, value);
1384+ /* format the number */
1385+ int plen =
1386+ wb_wprintf(&valuebuf2, sign ? L"%0+*ld" : L"%0*ld", len, value);
12881387 if (plen >= 0)
1289- sb_ccat_repeat(&cbuf, CC_HARD_EXPANSION, plen);
1388+ sb_ccat_repeat(&ccbuf2, CC_HARD_EXPANSION, plen);
12901389
1291- slen = idx(bracep + 1);
1292- wb_ncat_force(&buf, bracep + 1, wordlen - slen);
1293- sb_ncat_force(&cbuf, cc + slen, wordlen - slen);
1294- assert(buf.length == cbuf.length);
1295-
12961390 /* expand the remaining portion recursively */
1297- expand_brace(wb_towcs(&buf), sb_tostr(&cbuf), valuelist, cclist);
1391+ generate_brace_expand_results(e, ci, &valuebuf2, &ccbuf2);
12981392
12991393 if (delta >= 0) {
13001394 if (LONG_MAX - delta < value)
@@ -1305,10 +1399,8 @@
13051399 }
13061400 value += delta;
13071401 } while (delta >= 0 ? value <= end : value >= end);
1308- free(word);
1309- free(cc);
1402+
13101403 return true;
1311-#undef idx
13121404 }
13131405
13141406 /* Checks if the specified numeral starts with a L'0'.
Show on old repository browser