- Sync inetcomm, inseng, jscript to Wine-1.1.43.
[reactos.git] / reactos / dll / win32 / jscript / regexp.c
1 /*
2 * Copyright 2008 Jacek Caban for CodeWeavers
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
17 */
18
19 /*
20 * Code in this file is based on files:
21 * js/src/jsregexp.h
22 * js/src/jsregexp.c
23 * from Mozilla project, released under LGPL 2.1 or later.
24 *
25 * The Original Code is Mozilla Communicator client code, released
26 * March 31, 1998.
27 *
28 * The Initial Developer of the Original Code is
29 * Netscape Communications Corporation.
30 * Portions created by the Initial Developer are Copyright (C) 1998
31 * the Initial Developer. All Rights Reserved.
32 */
33
34 #include <assert.h>
35 #include <math.h>
36
37 #include "jscript.h"
38
39 #include "wine/debug.h"
40
41 WINE_DEFAULT_DEBUG_CHANNEL(jscript);
42
43 #define JSREG_FOLD 0x01 /* fold uppercase to lowercase */
44 #define JSREG_GLOB 0x02 /* global exec, creates array of matches */
45 #define JSREG_MULTILINE 0x04 /* treat ^ and $ as begin and end of line */
46 #define JSREG_STICKY 0x08 /* only match starting at lastIndex */
47
48 typedef BYTE JSPackedBool;
49 typedef BYTE jsbytecode;
50
51 /*
52 * This struct holds a bitmap representation of a class from a regexp.
53 * There's a list of these referenced by the classList field in the JSRegExp
54 * struct below. The initial state has startIndex set to the offset in the
55 * original regexp source of the beginning of the class contents. The first
56 * use of the class converts the source representation into a bitmap.
57 *
58 */
59 typedef struct RECharSet {
60 JSPackedBool converted;
61 JSPackedBool sense;
62 WORD length;
63 union {
64 BYTE *bits;
65 struct {
66 size_t startIndex;
67 size_t length;
68 } src;
69 } u;
70 } RECharSet;
71
72 typedef struct {
73 WORD flags; /* flags, see jsapi.h's JSREG_* defines */
74 size_t parenCount; /* number of parenthesized submatches */
75 size_t classCount; /* count [...] bitmaps */
76 RECharSet *classList; /* list of [...] bitmaps */
77 BSTR source; /* locked source string, sans // */
78 jsbytecode program[1]; /* regular expression bytecode */
79 } JSRegExp;
80
81 typedef struct {
82 DispatchEx dispex;
83
84 JSRegExp *jsregexp;
85 BSTR str;
86 INT last_index;
87 VARIANT last_index_var;
88 } RegExpInstance;
89
90 static const WCHAR sourceW[] = {'s','o','u','r','c','e',0};
91 static const WCHAR globalW[] = {'g','l','o','b','a','l',0};
92 static const WCHAR ignoreCaseW[] = {'i','g','n','o','r','e','C','a','s','e',0};
93 static const WCHAR multilineW[] = {'m','u','l','t','i','l','i','n','e',0};
94 static const WCHAR lastIndexW[] = {'l','a','s','t','I','n','d','e','x',0};
95 static const WCHAR toStringW[] = {'t','o','S','t','r','i','n','g',0};
96 static const WCHAR execW[] = {'e','x','e','c',0};
97 static const WCHAR testW[] = {'t','e','s','t',0};
98
99 static const WCHAR emptyW[] = {0};
100
101 /* FIXME: Better error handling */
102 #define ReportRegExpError(a,b,c)
103 #define ReportRegExpErrorHelper(a,b,c,d)
104 #define JS_ReportErrorNumber(a,b,c,d)
105 #define JS_ReportErrorFlagsAndNumber(a,b,c,d,e,f)
106 #define js_ReportOutOfScriptQuota(a)
107 #define JS_ReportOutOfMemory(a)
108 #define JS_COUNT_OPERATION(a,b)
109
110 #define JSMSG_MIN_TOO_BIG 47
111 #define JSMSG_MAX_TOO_BIG 48
112 #define JSMSG_OUT_OF_ORDER 49
113 #define JSMSG_OUT_OF_MEMORY 137
114
115 #define LINE_SEPARATOR 0x2028
116 #define PARA_SEPARATOR 0x2029
117
118 #define RE_IS_LETTER(c) (((c >= 'A') && (c <= 'Z')) || \
119 ((c >= 'a') && (c <= 'z')) )
120 #define RE_IS_LINE_TERM(c) ((c == '\n') || (c == '\r') || \
121 (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
122
123 #define JS_ISWORD(c) ((c) < 128 && (isalnum(c) || (c) == '_'))
124
125 #define JS7_ISDEC(c) ((((unsigned)(c)) - '0') <= 9)
126 #define JS7_UNDEC(c) ((c) - '0')
127
128 typedef enum REOp {
129 REOP_EMPTY,
130 REOP_BOL,
131 REOP_EOL,
132 REOP_WBDRY,
133 REOP_WNONBDRY,
134 REOP_DOT,
135 REOP_DIGIT,
136 REOP_NONDIGIT,
137 REOP_ALNUM,
138 REOP_NONALNUM,
139 REOP_SPACE,
140 REOP_NONSPACE,
141 REOP_BACKREF,
142 REOP_FLAT,
143 REOP_FLAT1,
144 REOP_FLATi,
145 REOP_FLAT1i,
146 REOP_UCFLAT1,
147 REOP_UCFLAT1i,
148 REOP_UCFLAT,
149 REOP_UCFLATi,
150 REOP_CLASS,
151 REOP_NCLASS,
152 REOP_ALT,
153 REOP_QUANT,
154 REOP_STAR,
155 REOP_PLUS,
156 REOP_OPT,
157 REOP_LPAREN,
158 REOP_RPAREN,
159 REOP_JUMP,
160 REOP_DOTSTAR,
161 REOP_LPARENNON,
162 REOP_ASSERT,
163 REOP_ASSERT_NOT,
164 REOP_ASSERTTEST,
165 REOP_ASSERTNOTTEST,
166 REOP_MINIMALSTAR,
167 REOP_MINIMALPLUS,
168 REOP_MINIMALOPT,
169 REOP_MINIMALQUANT,
170 REOP_ENDCHILD,
171 REOP_REPEAT,
172 REOP_MINIMALREPEAT,
173 REOP_ALTPREREQ,
174 REOP_ALTPREREQ2,
175 REOP_ENDALT,
176 REOP_CONCAT,
177 REOP_END,
178 REOP_LIMIT /* META: no operator >= to this */
179 } REOp;
180
181 #define REOP_IS_SIMPLE(op) ((op) <= REOP_NCLASS)
182
183 static const char *reop_names[] = {
184 "empty",
185 "bol",
186 "eol",
187 "wbdry",
188 "wnonbdry",
189 "dot",
190 "digit",
191 "nondigit",
192 "alnum",
193 "nonalnum",
194 "space",
195 "nonspace",
196 "backref",
197 "flat",
198 "flat1",
199 "flati",
200 "flat1i",
201 "ucflat1",
202 "ucflat1i",
203 "ucflat",
204 "ucflati",
205 "class",
206 "nclass",
207 "alt",
208 "quant",
209 "star",
210 "plus",
211 "opt",
212 "lparen",
213 "rparen",
214 "jump",
215 "dotstar",
216 "lparennon",
217 "assert",
218 "assert_not",
219 "asserttest",
220 "assertnottest",
221 "minimalstar",
222 "minimalplus",
223 "minimalopt",
224 "minimalquant",
225 "endchild",
226 "repeat",
227 "minimalrepeat",
228 "altprereq",
229 "alrprereq2",
230 "endalt",
231 "concat",
232 "end",
233 NULL
234 };
235
236 typedef struct RECapture {
237 ptrdiff_t index; /* start of contents, -1 for empty */
238 size_t length; /* length of capture */
239 } RECapture;
240
241 typedef struct REMatchState {
242 const WCHAR *cp;
243 RECapture parens[1]; /* first of 're->parenCount' captures,
244 allocated at end of this struct */
245 } REMatchState;
246
247 typedef struct REProgState {
248 jsbytecode *continue_pc; /* current continuation data */
249 jsbytecode continue_op;
250 ptrdiff_t index; /* progress in text */
251 size_t parenSoFar; /* highest indexed paren started */
252 union {
253 struct {
254 UINT min; /* current quantifier limits */
255 UINT max;
256 } quantifier;
257 struct {
258 size_t top; /* backtrack stack state */
259 size_t sz;
260 } assertion;
261 } u;
262 } REProgState;
263
264 typedef struct REBackTrackData {
265 size_t sz; /* size of previous stack entry */
266 jsbytecode *backtrack_pc; /* where to backtrack to */
267 jsbytecode backtrack_op;
268 const WCHAR *cp; /* index in text of match at backtrack */
269 size_t parenIndex; /* start index of saved paren contents */
270 size_t parenCount; /* # of saved paren contents */
271 size_t saveStateStackTop; /* number of parent states */
272 /* saved parent states follow */
273 /* saved paren contents follow */
274 } REBackTrackData;
275
276 #define INITIAL_STATESTACK 100
277 #define INITIAL_BACKTRACK 8000
278
279 typedef struct REGlobalData {
280 script_ctx_t *cx;
281 JSRegExp *regexp; /* the RE in execution */
282 BOOL ok; /* runtime error (out_of_memory only?) */
283 size_t start; /* offset to start at */
284 ptrdiff_t skipped; /* chars skipped anchoring this r.e. */
285 const WCHAR *cpbegin; /* text base address */
286 const WCHAR *cpend; /* text limit address */
287
288 REProgState *stateStack; /* stack of state of current parents */
289 size_t stateStackTop;
290 size_t stateStackLimit;
291
292 REBackTrackData *backTrackStack;/* stack of matched-so-far positions */
293 REBackTrackData *backTrackSP;
294 size_t backTrackStackSize;
295 size_t cursz; /* size of current stack entry */
296 size_t backTrackCount; /* how many times we've backtracked */
297 size_t backTrackLimit; /* upper limit on backtrack states */
298
299 jsheap_t *pool; /* It's faster to use one malloc'd pool
300 than to malloc/free the three items
301 that are allocated from this pool */
302 } REGlobalData;
303
304 typedef struct RENode RENode;
305 struct RENode {
306 REOp op; /* r.e. op bytecode */
307 RENode *next; /* next in concatenation order */
308 void *kid; /* first operand */
309 union {
310 void *kid2; /* second operand */
311 INT num; /* could be a number */
312 size_t parenIndex; /* or a parenthesis index */
313 struct { /* or a quantifier range */
314 UINT min;
315 UINT max;
316 JSPackedBool greedy;
317 } range;
318 struct { /* or a character class */
319 size_t startIndex;
320 size_t kidlen; /* length of string at kid, in jschars */
321 size_t index; /* index into class list */
322 WORD bmsize; /* bitmap size, based on max char code */
323 JSPackedBool sense;
324 } ucclass;
325 struct { /* or a literal sequence */
326 WCHAR chr; /* of one character */
327 size_t length; /* or many (via the kid) */
328 } flat;
329 struct {
330 RENode *kid2; /* second operand from ALT */
331 WCHAR ch1; /* match char for ALTPREREQ */
332 WCHAR ch2; /* ditto, or class index for ALTPREREQ2 */
333 } altprereq;
334 } u;
335 };
336
337 #define CLASS_CACHE_SIZE 4
338
339 typedef struct CompilerState {
340 script_ctx_t *context;
341 const WCHAR *cpbegin;
342 const WCHAR *cpend;
343 const WCHAR *cp;
344 size_t parenCount;
345 size_t classCount; /* number of [] encountered */
346 size_t treeDepth; /* maximum depth of parse tree */
347 size_t progLength; /* estimated bytecode length */
348 RENode *result;
349 size_t classBitmapsMem; /* memory to hold all class bitmaps */
350 struct {
351 const WCHAR *start; /* small cache of class strings */
352 size_t length; /* since they're often the same */
353 size_t index;
354 } classCache[CLASS_CACHE_SIZE];
355 WORD flags;
356 } CompilerState;
357
358 typedef struct EmitStateStackEntry {
359 jsbytecode *altHead; /* start of REOP_ALT* opcode */
360 jsbytecode *nextAltFixup; /* fixup pointer to next-alt offset */
361 jsbytecode *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */
362 jsbytecode *endTermFixup; /* fixup ptr. to REOPT_ALTPREREQ* offset */
363 RENode *continueNode; /* original REOP_ALT* node being stacked */
364 jsbytecode continueOp; /* REOP_JUMP or REOP_ENDALT continuation */
365 JSPackedBool jumpToJumpFlag; /* true if we've patched jump-to-jump to
366 avoid 16-bit unsigned offset overflow */
367 } EmitStateStackEntry;
368
369 /*
370 * Immediate operand sizes and getter/setters. Unlike the ones in jsopcode.h,
371 * the getters and setters take the pc of the offset, not of the opcode before
372 * the offset.
373 */
374 #define ARG_LEN 2
375 #define GET_ARG(pc) ((WORD)(((pc)[0] << 8) | (pc)[1]))
376 #define SET_ARG(pc, arg) ((pc)[0] = (jsbytecode) ((arg) >> 8), \
377 (pc)[1] = (jsbytecode) (arg))
378
379 #define OFFSET_LEN ARG_LEN
380 #define OFFSET_MAX ((1 << (ARG_LEN * 8)) - 1)
381 #define GET_OFFSET(pc) GET_ARG(pc)
382
383 static BOOL ParseRegExp(CompilerState*);
384
385 /*
386 * Maximum supported tree depth is maximum size of EmitStateStackEntry stack.
387 * For sanity, we limit it to 2^24 bytes.
388 */
389 #define TREE_DEPTH_MAX ((1 << 24) / sizeof(EmitStateStackEntry))
390
391 /*
392 * The maximum memory that can be allocated for class bitmaps.
393 * For sanity, we limit it to 2^24 bytes.
394 */
395 #define CLASS_BITMAPS_MEM_LIMIT (1 << 24)
396
397 /*
398 * Functions to get size and write/read bytecode that represent small indexes
399 * compactly.
400 * Each byte in the code represent 7-bit chunk of the index. 8th bit when set
401 * indicates that the following byte brings more bits to the index. Otherwise
402 * this is the last byte in the index bytecode representing highest index bits.
403 */
404 static size_t
405 GetCompactIndexWidth(size_t index)
406 {
407 size_t width;
408
409 for (width = 1; (index >>= 7) != 0; ++width) { }
410 return width;
411 }
412
413 static inline jsbytecode *
414 WriteCompactIndex(jsbytecode *pc, size_t index)
415 {
416 size_t next;
417
418 while ((next = index >> 7) != 0) {
419 *pc++ = (jsbytecode)(index | 0x80);
420 index = next;
421 }
422 *pc++ = (jsbytecode)index;
423 return pc;
424 }
425
426 static inline jsbytecode *
427 ReadCompactIndex(jsbytecode *pc, size_t *result)
428 {
429 size_t nextByte;
430
431 nextByte = *pc++;
432 if ((nextByte & 0x80) == 0) {
433 /*
434 * Short-circuit the most common case when compact index <= 127.
435 */
436 *result = nextByte;
437 } else {
438 size_t shift = 7;
439 *result = 0x7F & nextByte;
440 do {
441 nextByte = *pc++;
442 *result |= (nextByte & 0x7F) << shift;
443 shift += 7;
444 } while ((nextByte & 0x80) != 0);
445 }
446 return pc;
447 }
448
449 /* Construct and initialize an RENode, returning NULL for out-of-memory */
450 static RENode *
451 NewRENode(CompilerState *state, REOp op)
452 {
453 RENode *ren;
454
455 ren = jsheap_alloc(&state->context->tmp_heap, sizeof(*ren));
456 if (!ren) {
457 /* js_ReportOutOfScriptQuota(cx); */
458 return NULL;
459 }
460 ren->op = op;
461 ren->next = NULL;
462 ren->kid = NULL;
463 return ren;
464 }
465
466 /*
467 * Validates and converts hex ascii value.
468 */
469 static BOOL
470 isASCIIHexDigit(WCHAR c, UINT *digit)
471 {
472 UINT cv = c;
473
474 if (cv < '0')
475 return FALSE;
476 if (cv <= '9') {
477 *digit = cv - '0';
478 return TRUE;
479 }
480 cv |= 0x20;
481 if (cv >= 'a' && cv <= 'f') {
482 *digit = cv - 'a' + 10;
483 return TRUE;
484 }
485 return FALSE;
486 }
487
488 typedef struct {
489 REOp op;
490 const WCHAR *errPos;
491 size_t parenIndex;
492 } REOpData;
493
494 #define JUMP_OFFSET_HI(off) ((jsbytecode)((off) >> 8))
495 #define JUMP_OFFSET_LO(off) ((jsbytecode)(off))
496
497 static BOOL
498 SetForwardJumpOffset(jsbytecode *jump, jsbytecode *target)
499 {
500 ptrdiff_t offset = target - jump;
501
502 /* Check that target really points forward. */
503 assert(offset >= 2);
504 if ((size_t)offset > OFFSET_MAX)
505 return FALSE;
506
507 jump[0] = JUMP_OFFSET_HI(offset);
508 jump[1] = JUMP_OFFSET_LO(offset);
509 return TRUE;
510 }
511
512 /*
513 * Generate bytecode for the tree rooted at t using an explicit stack instead
514 * of recursion.
515 */
516 static jsbytecode *
517 EmitREBytecode(CompilerState *state, JSRegExp *re, size_t treeDepth,
518 jsbytecode *pc, RENode *t)
519 {
520 EmitStateStackEntry *emitStateSP, *emitStateStack;
521 RECharSet *charSet;
522 REOp op;
523
524 if (treeDepth == 0) {
525 emitStateStack = NULL;
526 } else {
527 emitStateStack = heap_alloc(sizeof(EmitStateStackEntry) * treeDepth);
528 if (!emitStateStack)
529 return NULL;
530 }
531 emitStateSP = emitStateStack;
532 op = t->op;
533 assert(op < REOP_LIMIT);
534
535 for (;;) {
536 *pc++ = op;
537 switch (op) {
538 case REOP_EMPTY:
539 --pc;
540 break;
541
542 case REOP_ALTPREREQ2:
543 case REOP_ALTPREREQ:
544 assert(emitStateSP);
545 emitStateSP->altHead = pc - 1;
546 emitStateSP->endTermFixup = pc;
547 pc += OFFSET_LEN;
548 SET_ARG(pc, t->u.altprereq.ch1);
549 pc += ARG_LEN;
550 SET_ARG(pc, t->u.altprereq.ch2);
551 pc += ARG_LEN;
552
553 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
554 pc += OFFSET_LEN;
555
556 emitStateSP->continueNode = t;
557 emitStateSP->continueOp = REOP_JUMP;
558 emitStateSP->jumpToJumpFlag = FALSE;
559 ++emitStateSP;
560 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
561 t = t->kid;
562 op = t->op;
563 assert(op < REOP_LIMIT);
564 continue;
565
566 case REOP_JUMP:
567 emitStateSP->nextTermFixup = pc; /* offset to following term */
568 pc += OFFSET_LEN;
569 if (!SetForwardJumpOffset(emitStateSP->nextAltFixup, pc))
570 goto jump_too_big;
571 emitStateSP->continueOp = REOP_ENDALT;
572 ++emitStateSP;
573 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
574 t = t->u.kid2;
575 op = t->op;
576 assert(op < REOP_LIMIT);
577 continue;
578
579 case REOP_ENDALT:
580 /*
581 * If we already patched emitStateSP->nextTermFixup to jump to
582 * a nearer jump, to avoid 16-bit immediate offset overflow, we
583 * are done here.
584 */
585 if (emitStateSP->jumpToJumpFlag)
586 break;
587
588 /*
589 * Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT.
590 * REOP_ENDALT is executed only on successful match of the last
591 * alternate in a group.
592 */
593 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
594 goto jump_too_big;
595 if (t->op != REOP_ALT) {
596 if (!SetForwardJumpOffset(emitStateSP->endTermFixup, pc))
597 goto jump_too_big;
598 }
599
600 /*
601 * If the program is bigger than the REOP_JUMP offset range, then
602 * we must check for alternates before this one that are part of
603 * the same group, and fix up their jump offsets to target jumps
604 * close enough to fit in a 16-bit unsigned offset immediate.
605 */
606 if ((size_t)(pc - re->program) > OFFSET_MAX &&
607 emitStateSP > emitStateStack) {
608 EmitStateStackEntry *esp, *esp2;
609 jsbytecode *alt, *jump;
610 ptrdiff_t span, header;
611
612 esp2 = emitStateSP;
613 alt = esp2->altHead;
614 for (esp = esp2 - 1; esp >= emitStateStack; --esp) {
615 if (esp->continueOp == REOP_ENDALT &&
616 !esp->jumpToJumpFlag &&
617 esp->nextTermFixup + OFFSET_LEN == alt &&
618 (size_t)(pc - ((esp->continueNode->op != REOP_ALT)
619 ? esp->endTermFixup
620 : esp->nextTermFixup)) > OFFSET_MAX) {
621 alt = esp->altHead;
622 jump = esp->nextTermFixup;
623
624 /*
625 * The span must be 1 less than the distance from
626 * jump offset to jump offset, so we actually jump
627 * to a REOP_JUMP bytecode, not to its offset!
628 */
629 for (;;) {
630 assert(jump < esp2->nextTermFixup);
631 span = esp2->nextTermFixup - jump - 1;
632 if ((size_t)span <= OFFSET_MAX)
633 break;
634 do {
635 if (--esp2 == esp)
636 goto jump_too_big;
637 } while (esp2->continueOp != REOP_ENDALT);
638 }
639
640 jump[0] = JUMP_OFFSET_HI(span);
641 jump[1] = JUMP_OFFSET_LO(span);
642
643 if (esp->continueNode->op != REOP_ALT) {
644 /*
645 * We must patch the offset at esp->endTermFixup
646 * as well, for the REOP_ALTPREREQ{,2} opcodes.
647 * If we're unlucky and endTermFixup is more than
648 * OFFSET_MAX bytes from its target, we cheat by
649 * jumping 6 bytes to the jump whose offset is at
650 * esp->nextTermFixup, which has the same target.
651 */
652 jump = esp->endTermFixup;
653 header = esp->nextTermFixup - jump;
654 span += header;
655 if ((size_t)span > OFFSET_MAX)
656 span = header;
657
658 jump[0] = JUMP_OFFSET_HI(span);
659 jump[1] = JUMP_OFFSET_LO(span);
660 }
661
662 esp->jumpToJumpFlag = TRUE;
663 }
664 }
665 }
666 break;
667
668 case REOP_ALT:
669 assert(emitStateSP);
670 emitStateSP->altHead = pc - 1;
671 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
672 pc += OFFSET_LEN;
673 emitStateSP->continueNode = t;
674 emitStateSP->continueOp = REOP_JUMP;
675 emitStateSP->jumpToJumpFlag = FALSE;
676 ++emitStateSP;
677 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
678 t = t->kid;
679 op = t->op;
680 assert(op < REOP_LIMIT);
681 continue;
682
683 case REOP_FLAT:
684 /*
685 * Coalesce FLATs if possible and if it would not increase bytecode
686 * beyond preallocated limit. The latter happens only when bytecode
687 * size for coalesced string with offset p and length 2 exceeds 6
688 * bytes preallocated for 2 single char nodes, i.e. when
689 * 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or
690 * GetCompactIndexWidth(p) > 4.
691 * Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more
692 * nodes strictly decreases bytecode size, the check has to be
693 * done only for the first coalescing.
694 */
695 if (t->kid &&
696 GetCompactIndexWidth((WCHAR*)t->kid - state->cpbegin) <= 4)
697 {
698 while (t->next &&
699 t->next->op == REOP_FLAT &&
700 (WCHAR*)t->kid + t->u.flat.length ==
701 t->next->kid) {
702 t->u.flat.length += t->next->u.flat.length;
703 t->next = t->next->next;
704 }
705 }
706 if (t->kid && t->u.flat.length > 1) {
707 pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLATi : REOP_FLAT;
708 pc = WriteCompactIndex(pc, (WCHAR*)t->kid - state->cpbegin);
709 pc = WriteCompactIndex(pc, t->u.flat.length);
710 } else if (t->u.flat.chr < 256) {
711 pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLAT1i : REOP_FLAT1;
712 *pc++ = (jsbytecode) t->u.flat.chr;
713 } else {
714 pc[-1] = (state->flags & JSREG_FOLD)
715 ? REOP_UCFLAT1i
716 : REOP_UCFLAT1;
717 SET_ARG(pc, t->u.flat.chr);
718 pc += ARG_LEN;
719 }
720 break;
721
722 case REOP_LPAREN:
723 assert(emitStateSP);
724 pc = WriteCompactIndex(pc, t->u.parenIndex);
725 emitStateSP->continueNode = t;
726 emitStateSP->continueOp = REOP_RPAREN;
727 ++emitStateSP;
728 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
729 t = t->kid;
730 op = t->op;
731 continue;
732
733 case REOP_RPAREN:
734 pc = WriteCompactIndex(pc, t->u.parenIndex);
735 break;
736
737 case REOP_BACKREF:
738 pc = WriteCompactIndex(pc, t->u.parenIndex);
739 break;
740
741 case REOP_ASSERT:
742 assert(emitStateSP);
743 emitStateSP->nextTermFixup = pc;
744 pc += OFFSET_LEN;
745 emitStateSP->continueNode = t;
746 emitStateSP->continueOp = REOP_ASSERTTEST;
747 ++emitStateSP;
748 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
749 t = t->kid;
750 op = t->op;
751 continue;
752
753 case REOP_ASSERTTEST:
754 case REOP_ASSERTNOTTEST:
755 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
756 goto jump_too_big;
757 break;
758
759 case REOP_ASSERT_NOT:
760 assert(emitStateSP);
761 emitStateSP->nextTermFixup = pc;
762 pc += OFFSET_LEN;
763 emitStateSP->continueNode = t;
764 emitStateSP->continueOp = REOP_ASSERTNOTTEST;
765 ++emitStateSP;
766 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
767 t = t->kid;
768 op = t->op;
769 continue;
770
771 case REOP_QUANT:
772 assert(emitStateSP);
773 if (t->u.range.min == 0 && t->u.range.max == (UINT)-1) {
774 pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR;
775 } else if (t->u.range.min == 0 && t->u.range.max == 1) {
776 pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT;
777 } else if (t->u.range.min == 1 && t->u.range.max == (UINT) -1) {
778 pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS;
779 } else {
780 if (!t->u.range.greedy)
781 pc[-1] = REOP_MINIMALQUANT;
782 pc = WriteCompactIndex(pc, t->u.range.min);
783 /*
784 * Write max + 1 to avoid using size_t(max) + 1 bytes
785 * for (UINT)-1 sentinel.
786 */
787 pc = WriteCompactIndex(pc, t->u.range.max + 1);
788 }
789 emitStateSP->nextTermFixup = pc;
790 pc += OFFSET_LEN;
791 emitStateSP->continueNode = t;
792 emitStateSP->continueOp = REOP_ENDCHILD;
793 ++emitStateSP;
794 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
795 t = t->kid;
796 op = t->op;
797 continue;
798
799 case REOP_ENDCHILD:
800 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
801 goto jump_too_big;
802 break;
803
804 case REOP_CLASS:
805 if (!t->u.ucclass.sense)
806 pc[-1] = REOP_NCLASS;
807 pc = WriteCompactIndex(pc, t->u.ucclass.index);
808 charSet = &re->classList[t->u.ucclass.index];
809 charSet->converted = FALSE;
810 charSet->length = t->u.ucclass.bmsize;
811 charSet->u.src.startIndex = t->u.ucclass.startIndex;
812 charSet->u.src.length = t->u.ucclass.kidlen;
813 charSet->sense = t->u.ucclass.sense;
814 break;
815
816 default:
817 break;
818 }
819
820 t = t->next;
821 if (t) {
822 op = t->op;
823 } else {
824 if (emitStateSP == emitStateStack)
825 break;
826 --emitStateSP;
827 t = emitStateSP->continueNode;
828 op = (REOp) emitStateSP->continueOp;
829 }
830 }
831
832 cleanup:
833 heap_free(emitStateStack);
834 return pc;
835
836 jump_too_big:
837 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
838 pc = NULL;
839 goto cleanup;
840 }
841
842 /*
843 * Process the op against the two top operands, reducing them to a single
844 * operand in the penultimate slot. Update progLength and treeDepth.
845 */
846 static BOOL
847 ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack,
848 INT operandSP)
849 {
850 RENode *result;
851
852 switch (opData->op) {
853 case REOP_ALT:
854 result = NewRENode(state, REOP_ALT);
855 if (!result)
856 return FALSE;
857 result->kid = operandStack[operandSP - 2];
858 result->u.kid2 = operandStack[operandSP - 1];
859 operandStack[operandSP - 2] = result;
860
861 if (state->treeDepth == TREE_DEPTH_MAX) {
862 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
863 return FALSE;
864 }
865 ++state->treeDepth;
866
867 /*
868 * Look at both alternates to see if there's a FLAT or a CLASS at
869 * the start of each. If so, use a prerequisite match.
870 */
871 if (((RENode *) result->kid)->op == REOP_FLAT &&
872 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
873 (state->flags & JSREG_FOLD) == 0) {
874 result->op = REOP_ALTPREREQ;
875 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
876 result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr;
877 /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
878 JUMP, <end> ... ENDALT */
879 state->progLength += 13;
880 }
881 else
882 if (((RENode *) result->kid)->op == REOP_CLASS &&
883 ((RENode *) result->kid)->u.ucclass.index < 256 &&
884 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
885 (state->flags & JSREG_FOLD) == 0) {
886 result->op = REOP_ALTPREREQ2;
887 result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr;
888 result->u.altprereq.ch2 = ((RENode *) result->kid)->u.ucclass.index;
889 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
890 JUMP, <end> ... ENDALT */
891 state->progLength += 13;
892 }
893 else
894 if (((RENode *) result->kid)->op == REOP_FLAT &&
895 ((RENode *) result->u.kid2)->op == REOP_CLASS &&
896 ((RENode *) result->u.kid2)->u.ucclass.index < 256 &&
897 (state->flags & JSREG_FOLD) == 0) {
898 result->op = REOP_ALTPREREQ2;
899 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
900 result->u.altprereq.ch2 =
901 ((RENode *) result->u.kid2)->u.ucclass.index;
902 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
903 JUMP, <end> ... ENDALT */
904 state->progLength += 13;
905 }
906 else {
907 /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
908 state->progLength += 7;
909 }
910 break;
911
912 case REOP_CONCAT:
913 result = operandStack[operandSP - 2];
914 while (result->next)
915 result = result->next;
916 result->next = operandStack[operandSP - 1];
917 break;
918
919 case REOP_ASSERT:
920 case REOP_ASSERT_NOT:
921 case REOP_LPARENNON:
922 case REOP_LPAREN:
923 /* These should have been processed by a close paren. */
924 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_MISSING_PAREN,
925 opData->errPos);
926 return FALSE;
927
928 default:;
929 }
930 return TRUE;
931 }
932
933 /*
934 * Hack two bits in CompilerState.flags, for use within FindParenCount to flag
935 * its being on the stack, and to propagate errors to its callers.
936 */
937 #define JSREG_FIND_PAREN_COUNT 0x8000
938 #define JSREG_FIND_PAREN_ERROR 0x4000
939
940 /*
941 * Magic return value from FindParenCount and GetDecimalValue, to indicate
942 * overflow beyond GetDecimalValue's max parameter, or a computed maximum if
943 * its findMax parameter is non-null.
944 */
945 #define OVERFLOW_VALUE ((UINT)-1)
946
947 static UINT
948 FindParenCount(CompilerState *state)
949 {
950 CompilerState temp;
951 int i;
952
953 if (state->flags & JSREG_FIND_PAREN_COUNT)
954 return OVERFLOW_VALUE;
955
956 /*
957 * Copy state into temp, flag it so we never report an invalid backref,
958 * and reset its members to parse the entire regexp. This is obviously
959 * suboptimal, but GetDecimalValue calls us only if a backref appears to
960 * refer to a forward parenthetical, which is rare.
961 */
962 temp = *state;
963 temp.flags |= JSREG_FIND_PAREN_COUNT;
964 temp.cp = temp.cpbegin;
965 temp.parenCount = 0;
966 temp.classCount = 0;
967 temp.progLength = 0;
968 temp.treeDepth = 0;
969 temp.classBitmapsMem = 0;
970 for (i = 0; i < CLASS_CACHE_SIZE; i++)
971 temp.classCache[i].start = NULL;
972
973 if (!ParseRegExp(&temp)) {
974 state->flags |= JSREG_FIND_PAREN_ERROR;
975 return OVERFLOW_VALUE;
976 }
977 return temp.parenCount;
978 }
979
980 /*
981 * Extract and return a decimal value at state->cp. The initial character c
982 * has already been read. Return OVERFLOW_VALUE if the result exceeds max.
983 * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
984 * state->flags to discover whether an error occurred under findMax.
985 */
986 static UINT
987 GetDecimalValue(WCHAR c, UINT max, UINT (*findMax)(CompilerState *state),
988 CompilerState *state)
989 {
990 UINT value = JS7_UNDEC(c);
991 BOOL overflow = (value > max && (!findMax || value > findMax(state)));
992
993 /* The following restriction allows simpler overflow checks. */
994 assert(max <= ((UINT)-1 - 9) / 10);
995 while (state->cp < state->cpend) {
996 c = *state->cp;
997 if (!JS7_ISDEC(c))
998 break;
999 value = 10 * value + JS7_UNDEC(c);
1000 if (!overflow && value > max && (!findMax || value > findMax(state)))
1001 overflow = TRUE;
1002 ++state->cp;
1003 }
1004 return overflow ? OVERFLOW_VALUE : value;
1005 }
1006
1007 /*
1008 * Calculate the total size of the bitmap required for a class expression.
1009 */
1010 static BOOL
1011 CalculateBitmapSize(CompilerState *state, RENode *target, const WCHAR *src,
1012 const WCHAR *end)
1013 {
1014 UINT max = 0;
1015 BOOL inRange = FALSE;
1016 WCHAR c, rangeStart = 0;
1017 UINT n, digit, nDigits, i;
1018
1019 target->u.ucclass.bmsize = 0;
1020 target->u.ucclass.sense = TRUE;
1021
1022 if (src == end)
1023 return TRUE;
1024
1025 if (*src == '^') {
1026 ++src;
1027 target->u.ucclass.sense = FALSE;
1028 }
1029
1030 while (src != end) {
1031 BOOL canStartRange = TRUE;
1032 UINT localMax = 0;
1033
1034 switch (*src) {
1035 case '\\':
1036 ++src;
1037 c = *src++;
1038 switch (c) {
1039 case 'b':
1040 localMax = 0x8;
1041 break;
1042 case 'f':
1043 localMax = 0xC;
1044 break;
1045 case 'n':
1046 localMax = 0xA;
1047 break;
1048 case 'r':
1049 localMax = 0xD;
1050 break;
1051 case 't':
1052 localMax = 0x9;
1053 break;
1054 case 'v':
1055 localMax = 0xB;
1056 break;
1057 case 'c':
1058 if (src < end && RE_IS_LETTER(*src)) {
1059 localMax = (UINT) (*src++) & 0x1F;
1060 } else {
1061 --src;
1062 localMax = '\\';
1063 }
1064 break;
1065 case 'x':
1066 nDigits = 2;
1067 goto lexHex;
1068 case 'u':
1069 nDigits = 4;
1070 lexHex:
1071 n = 0;
1072 for (i = 0; (i < nDigits) && (src < end); i++) {
1073 c = *src++;
1074 if (!isASCIIHexDigit(c, &digit)) {
1075 /*
1076 * Back off to accepting the original
1077 *'\' as a literal.
1078 */
1079 src -= i + 1;
1080 n = '\\';
1081 break;
1082 }
1083 n = (n << 4) | digit;
1084 }
1085 localMax = n;
1086 break;
1087 case 'd':
1088 canStartRange = FALSE;
1089 if (inRange) {
1090 JS_ReportErrorNumber(state->context,
1091 js_GetErrorMessage, NULL,
1092 JSMSG_BAD_CLASS_RANGE);
1093 return FALSE;
1094 }
1095 localMax = '9';
1096 break;
1097 case 'D':
1098 case 's':
1099 case 'S':
1100 case 'w':
1101 case 'W':
1102 canStartRange = FALSE;
1103 if (inRange) {
1104 JS_ReportErrorNumber(state->context,
1105 js_GetErrorMessage, NULL,
1106 JSMSG_BAD_CLASS_RANGE);
1107 return FALSE;
1108 }
1109 max = 65535;
1110
1111 /*
1112 * If this is the start of a range, ensure that it's less than
1113 * the end.
1114 */
1115 localMax = 0;
1116 break;
1117 case '0':
1118 case '1':
1119 case '2':
1120 case '3':
1121 case '4':
1122 case '5':
1123 case '6':
1124 case '7':
1125 /*
1126 * This is a non-ECMA extension - decimal escapes (in this
1127 * case, octal!) are supposed to be an error inside class
1128 * ranges, but supported here for backwards compatibility.
1129 *
1130 */
1131 n = JS7_UNDEC(c);
1132 c = *src;
1133 if ('0' <= c && c <= '7') {
1134 src++;
1135 n = 8 * n + JS7_UNDEC(c);
1136 c = *src;
1137 if ('0' <= c && c <= '7') {
1138 src++;
1139 i = 8 * n + JS7_UNDEC(c);
1140 if (i <= 0377)
1141 n = i;
1142 else
1143 src--;
1144 }
1145 }
1146 localMax = n;
1147 break;
1148
1149 default:
1150 localMax = c;
1151 break;
1152 }
1153 break;
1154 default:
1155 localMax = *src++;
1156 break;
1157 }
1158
1159 if (inRange) {
1160 /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
1161 if (rangeStart > localMax) {
1162 JS_ReportErrorNumber(state->context,
1163 js_GetErrorMessage, NULL,
1164 JSMSG_BAD_CLASS_RANGE);
1165 return FALSE;
1166 }
1167 inRange = FALSE;
1168 } else {
1169 if (canStartRange && src < end - 1) {
1170 if (*src == '-') {
1171 ++src;
1172 inRange = TRUE;
1173 rangeStart = (WCHAR)localMax;
1174 continue;
1175 }
1176 }
1177 if (state->flags & JSREG_FOLD)
1178 rangeStart = localMax; /* one run of the uc/dc loop below */
1179 }
1180
1181 if (state->flags & JSREG_FOLD) {
1182 WCHAR maxch = localMax;
1183
1184 for (i = rangeStart; i <= localMax; i++) {
1185 WCHAR uch, dch;
1186
1187 uch = toupperW(i);
1188 dch = tolowerW(i);
1189 if(maxch < uch)
1190 maxch = uch;
1191 if(maxch < dch)
1192 maxch = dch;
1193 }
1194 localMax = maxch;
1195 }
1196
1197 if (localMax > max)
1198 max = localMax;
1199 }
1200 target->u.ucclass.bmsize = max;
1201 return TRUE;
1202 }
1203
1204 static INT
1205 ParseMinMaxQuantifier(CompilerState *state, BOOL ignoreValues)
1206 {
1207 UINT min, max;
1208 WCHAR c;
1209 const WCHAR *errp = state->cp++;
1210
1211 c = *state->cp;
1212 if (JS7_ISDEC(c)) {
1213 ++state->cp;
1214 min = GetDecimalValue(c, 0xFFFF, NULL, state);
1215 c = *state->cp;
1216
1217 if (!ignoreValues && min == OVERFLOW_VALUE)
1218 return JSMSG_MIN_TOO_BIG;
1219
1220 if (c == ',') {
1221 c = *++state->cp;
1222 if (JS7_ISDEC(c)) {
1223 ++state->cp;
1224 max = GetDecimalValue(c, 0xFFFF, NULL, state);
1225 c = *state->cp;
1226 if (!ignoreValues && max == OVERFLOW_VALUE)
1227 return JSMSG_MAX_TOO_BIG;
1228 if (!ignoreValues && min > max)
1229 return JSMSG_OUT_OF_ORDER;
1230 } else {
1231 max = (UINT)-1;
1232 }
1233 } else {
1234 max = min;
1235 }
1236 if (c == '}') {
1237 state->result = NewRENode(state, REOP_QUANT);
1238 if (!state->result)
1239 return JSMSG_OUT_OF_MEMORY;
1240 state->result->u.range.min = min;
1241 state->result->u.range.max = max;
1242 /*
1243 * QUANT, <min>, <max>, <next> ... <ENDCHILD>
1244 * where <max> is written as compact(max+1) to make
1245 * (UINT)-1 sentinel to occupy 1 byte, not width_of(max)+1.
1246 */
1247 state->progLength += (1 + GetCompactIndexWidth(min)
1248 + GetCompactIndexWidth(max + 1)
1249 +3);
1250 return 0;
1251 }
1252 }
1253
1254 state->cp = errp;
1255 return -1;
1256 }
1257
1258 static BOOL
1259 ParseQuantifier(CompilerState *state)
1260 {
1261 RENode *term;
1262 term = state->result;
1263 if (state->cp < state->cpend) {
1264 switch (*state->cp) {
1265 case '+':
1266 state->result = NewRENode(state, REOP_QUANT);
1267 if (!state->result)
1268 return FALSE;
1269 state->result->u.range.min = 1;
1270 state->result->u.range.max = (UINT)-1;
1271 /* <PLUS>, <next> ... <ENDCHILD> */
1272 state->progLength += 4;
1273 goto quantifier;
1274 case '*':
1275 state->result = NewRENode(state, REOP_QUANT);
1276 if (!state->result)
1277 return FALSE;
1278 state->result->u.range.min = 0;
1279 state->result->u.range.max = (UINT)-1;
1280 /* <STAR>, <next> ... <ENDCHILD> */
1281 state->progLength += 4;
1282 goto quantifier;
1283 case '?':
1284 state->result = NewRENode(state, REOP_QUANT);
1285 if (!state->result)
1286 return FALSE;
1287 state->result->u.range.min = 0;
1288 state->result->u.range.max = 1;
1289 /* <OPT>, <next> ... <ENDCHILD> */
1290 state->progLength += 4;
1291 goto quantifier;
1292 case '{': /* balance '}' */
1293 {
1294 INT err;
1295
1296 err = ParseMinMaxQuantifier(state, FALSE);
1297 if (err == 0)
1298 goto quantifier;
1299 if (err == -1)
1300 return TRUE;
1301
1302 ReportRegExpErrorHelper(state, JSREPORT_ERROR, err, errp);
1303 return FALSE;
1304 }
1305 default:;
1306 }
1307 }
1308 return TRUE;
1309
1310 quantifier:
1311 if (state->treeDepth == TREE_DEPTH_MAX) {
1312 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1313 return FALSE;
1314 }
1315
1316 ++state->treeDepth;
1317 ++state->cp;
1318 state->result->kid = term;
1319 if (state->cp < state->cpend && *state->cp == '?') {
1320 ++state->cp;
1321 state->result->u.range.greedy = FALSE;
1322 } else {
1323 state->result->u.range.greedy = TRUE;
1324 }
1325 return TRUE;
1326 }
1327
1328 /*
1329 * item: assertion An item is either an assertion or
1330 * quantatom a quantified atom.
1331 *
1332 * assertion: '^' Assertions match beginning of string
1333 * (or line if the class static property
1334 * RegExp.multiline is true).
1335 * '$' End of string (or line if the class
1336 * static property RegExp.multiline is
1337 * true).
1338 * '\b' Word boundary (between \w and \W).
1339 * '\B' Word non-boundary.
1340 *
1341 * quantatom: atom An unquantified atom.
1342 * quantatom '{' n ',' m '}'
1343 * Atom must occur between n and m times.
1344 * quantatom '{' n ',' '}' Atom must occur at least n times.
1345 * quantatom '{' n '}' Atom must occur exactly n times.
1346 * quantatom '*' Zero or more times (same as {0,}).
1347 * quantatom '+' One or more times (same as {1,}).
1348 * quantatom '?' Zero or one time (same as {0,1}).
1349 *
1350 * any of which can be optionally followed by '?' for ungreedy
1351 *
1352 * atom: '(' regexp ')' A parenthesized regexp (what matched
1353 * can be addressed using a backreference,
1354 * see '\' n below).
1355 * '.' Matches any char except '\n'.
1356 * '[' classlist ']' A character class.
1357 * '[' '^' classlist ']' A negated character class.
1358 * '\f' Form Feed.
1359 * '\n' Newline (Line Feed).
1360 * '\r' Carriage Return.
1361 * '\t' Horizontal Tab.
1362 * '\v' Vertical Tab.
1363 * '\d' A digit (same as [0-9]).
1364 * '\D' A non-digit.
1365 * '\w' A word character, [0-9a-z_A-Z].
1366 * '\W' A non-word character.
1367 * '\s' A whitespace character, [ \b\f\n\r\t\v].
1368 * '\S' A non-whitespace character.
1369 * '\' n A backreference to the nth (n decimal
1370 * and positive) parenthesized expression.
1371 * '\' octal An octal escape sequence (octal must be
1372 * two or three digits long, unless it is
1373 * 0 for the null character).
1374 * '\x' hex A hex escape (hex must be two digits).
1375 * '\u' unicode A unicode escape (must be four digits).
1376 * '\c' ctrl A control character, ctrl is a letter.
1377 * '\' literalatomchar Any character except one of the above
1378 * that follow '\' in an atom.
1379 * otheratomchar Any character not first among the other
1380 * atom right-hand sides.
1381 */
1382 static BOOL
1383 ParseTerm(CompilerState *state)
1384 {
1385 WCHAR c = *state->cp++;
1386 UINT nDigits;
1387 UINT num, tmp, n, i;
1388 const WCHAR *termStart;
1389
1390 switch (c) {
1391 /* assertions and atoms */
1392 case '^':
1393 state->result = NewRENode(state, REOP_BOL);
1394 if (!state->result)
1395 return FALSE;
1396 state->progLength++;
1397 return TRUE;
1398 case '$':
1399 state->result = NewRENode(state, REOP_EOL);
1400 if (!state->result)
1401 return FALSE;
1402 state->progLength++;
1403 return TRUE;
1404 case '\\':
1405 if (state->cp >= state->cpend) {
1406 /* a trailing '\' is an error */
1407 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_TRAILING_SLASH);
1408 return FALSE;
1409 }
1410 c = *state->cp++;
1411 switch (c) {
1412 /* assertion escapes */
1413 case 'b' :
1414 state->result = NewRENode(state, REOP_WBDRY);
1415 if (!state->result)
1416 return FALSE;
1417 state->progLength++;
1418 return TRUE;
1419 case 'B':
1420 state->result = NewRENode(state, REOP_WNONBDRY);
1421 if (!state->result)
1422 return FALSE;
1423 state->progLength++;
1424 return TRUE;
1425 /* Decimal escape */
1426 case '0':
1427 /* Give a strict warning. See also the note below. */
1428 WARN("non-octal digit in an escape sequence that doesn't match a back-reference\n");
1429 doOctal:
1430 num = 0;
1431 while (state->cp < state->cpend) {
1432 c = *state->cp;
1433 if (c < '0' || '7' < c)
1434 break;
1435 state->cp++;
1436 tmp = 8 * num + (UINT)JS7_UNDEC(c);
1437 if (tmp > 0377)
1438 break;
1439 num = tmp;
1440 }
1441 c = (WCHAR)num;
1442 doFlat:
1443 state->result = NewRENode(state, REOP_FLAT);
1444 if (!state->result)
1445 return FALSE;
1446 state->result->u.flat.chr = c;
1447 state->result->u.flat.length = 1;
1448 state->progLength += 3;
1449 break;
1450 case '1':
1451 case '2':
1452 case '3':
1453 case '4':
1454 case '5':
1455 case '6':
1456 case '7':
1457 case '8':
1458 case '9':
1459 termStart = state->cp - 1;
1460 num = GetDecimalValue(c, state->parenCount, FindParenCount, state);
1461 if (state->flags & JSREG_FIND_PAREN_ERROR)
1462 return FALSE;
1463 if (num == OVERFLOW_VALUE) {
1464 /* Give a strict mode warning. */
1465 WARN("back-reference exceeds number of capturing parentheses\n");
1466
1467 /*
1468 * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax
1469 * error here. However, for compatibility with IE, we treat the
1470 * whole backref as flat if the first character in it is not a
1471 * valid octal character, and as an octal escape otherwise.
1472 */
1473 state->cp = termStart;
1474 if (c >= '8') {
1475 /* Treat this as flat. termStart - 1 is the \. */
1476 c = '\\';
1477 goto asFlat;
1478 }
1479
1480 /* Treat this as an octal escape. */
1481 goto doOctal;
1482 }
1483 assert(1 <= num && num <= 0x10000);
1484 state->result = NewRENode(state, REOP_BACKREF);
1485 if (!state->result)
1486 return FALSE;
1487 state->result->u.parenIndex = num - 1;
1488 state->progLength
1489 += 1 + GetCompactIndexWidth(state->result->u.parenIndex);
1490 break;
1491 /* Control escape */
1492 case 'f':
1493 c = 0xC;
1494 goto doFlat;
1495 case 'n':
1496 c = 0xA;
1497 goto doFlat;
1498 case 'r':
1499 c = 0xD;
1500 goto doFlat;
1501 case 't':
1502 c = 0x9;
1503 goto doFlat;
1504 case 'v':
1505 c = 0xB;
1506 goto doFlat;
1507 /* Control letter */
1508 case 'c':
1509 if (state->cp < state->cpend && RE_IS_LETTER(*state->cp)) {
1510 c = (WCHAR) (*state->cp++ & 0x1F);
1511 } else {
1512 /* back off to accepting the original '\' as a literal */
1513 --state->cp;
1514 c = '\\';
1515 }
1516 goto doFlat;
1517 /* HexEscapeSequence */
1518 case 'x':
1519 nDigits = 2;
1520 goto lexHex;
1521 /* UnicodeEscapeSequence */
1522 case 'u':
1523 nDigits = 4;
1524 lexHex:
1525 n = 0;
1526 for (i = 0; i < nDigits && state->cp < state->cpend; i++) {
1527 UINT digit;
1528 c = *state->cp++;
1529 if (!isASCIIHexDigit(c, &digit)) {
1530 /*
1531 * Back off to accepting the original 'u' or 'x' as a
1532 * literal.
1533 */
1534 state->cp -= i + 2;
1535 n = *state->cp++;
1536 break;
1537 }
1538 n = (n << 4) | digit;
1539 }
1540 c = (WCHAR) n;
1541 goto doFlat;
1542 /* Character class escapes */
1543 case 'd':
1544 state->result = NewRENode(state, REOP_DIGIT);
1545 doSimple:
1546 if (!state->result)
1547 return FALSE;
1548 state->progLength++;
1549 break;
1550 case 'D':
1551 state->result = NewRENode(state, REOP_NONDIGIT);
1552 goto doSimple;
1553 case 's':
1554 state->result = NewRENode(state, REOP_SPACE);
1555 goto doSimple;
1556 case 'S':
1557 state->result = NewRENode(state, REOP_NONSPACE);
1558 goto doSimple;
1559 case 'w':
1560 state->result = NewRENode(state, REOP_ALNUM);
1561 goto doSimple;
1562 case 'W':
1563 state->result = NewRENode(state, REOP_NONALNUM);
1564 goto doSimple;
1565 /* IdentityEscape */
1566 default:
1567 state->result = NewRENode(state, REOP_FLAT);
1568 if (!state->result)
1569 return FALSE;
1570 state->result->u.flat.chr = c;
1571 state->result->u.flat.length = 1;
1572 state->result->kid = (void *) (state->cp - 1);
1573 state->progLength += 3;
1574 break;
1575 }
1576 break;
1577 case '[':
1578 state->result = NewRENode(state, REOP_CLASS);
1579 if (!state->result)
1580 return FALSE;
1581 termStart = state->cp;
1582 state->result->u.ucclass.startIndex = termStart - state->cpbegin;
1583 for (;;) {
1584 if (state->cp == state->cpend) {
1585 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1586 JSMSG_UNTERM_CLASS, termStart);
1587
1588 return FALSE;
1589 }
1590 if (*state->cp == '\\') {
1591 state->cp++;
1592 if (state->cp != state->cpend)
1593 state->cp++;
1594 continue;
1595 }
1596 if (*state->cp == ']') {
1597 state->result->u.ucclass.kidlen = state->cp - termStart;
1598 break;
1599 }
1600 state->cp++;
1601 }
1602 for (i = 0; i < CLASS_CACHE_SIZE; i++) {
1603 if (!state->classCache[i].start) {
1604 state->classCache[i].start = termStart;
1605 state->classCache[i].length = state->result->u.ucclass.kidlen;
1606 state->classCache[i].index = state->classCount;
1607 break;
1608 }
1609 if (state->classCache[i].length ==
1610 state->result->u.ucclass.kidlen) {
1611 for (n = 0; ; n++) {
1612 if (n == state->classCache[i].length) {
1613 state->result->u.ucclass.index
1614 = state->classCache[i].index;
1615 goto claim;
1616 }
1617 if (state->classCache[i].start[n] != termStart[n])
1618 break;
1619 }
1620 }
1621 }
1622 state->result->u.ucclass.index = state->classCount++;
1623
1624 claim:
1625 /*
1626 * Call CalculateBitmapSize now as we want any errors it finds
1627 * to be reported during the parse phase, not at execution.
1628 */
1629 if (!CalculateBitmapSize(state, state->result, termStart, state->cp++))
1630 return FALSE;
1631 /*
1632 * Update classBitmapsMem with number of bytes to hold bmsize bits,
1633 * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
1634 * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
1635 */
1636 n = (state->result->u.ucclass.bmsize >> 3) + 1;
1637 if (n > CLASS_BITMAPS_MEM_LIMIT - state->classBitmapsMem) {
1638 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1639 return FALSE;
1640 }
1641 state->classBitmapsMem += n;
1642 /* CLASS, <index> */
1643 state->progLength
1644 += 1 + GetCompactIndexWidth(state->result->u.ucclass.index);
1645 break;
1646
1647 case '.':
1648 state->result = NewRENode(state, REOP_DOT);
1649 goto doSimple;
1650
1651 case '{':
1652 {
1653 const WCHAR *errp = state->cp--;
1654 INT err;
1655
1656 err = ParseMinMaxQuantifier(state, TRUE);
1657 state->cp = errp;
1658
1659 if (err < 0)
1660 goto asFlat;
1661
1662 /* FALL THROUGH */
1663 }
1664 case '*':
1665 case '+':
1666 case '?':
1667 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1668 JSMSG_BAD_QUANTIFIER, state->cp - 1);
1669 return FALSE;
1670 default:
1671 asFlat:
1672 state->result = NewRENode(state, REOP_FLAT);
1673 if (!state->result)
1674 return FALSE;
1675 state->result->u.flat.chr = c;
1676 state->result->u.flat.length = 1;
1677 state->result->kid = (void *) (state->cp - 1);
1678 state->progLength += 3;
1679 break;
1680 }
1681 return ParseQuantifier(state);
1682 }
1683
1684 /*
1685 * Top-down regular expression grammar, based closely on Perl4.
1686 *
1687 * regexp: altern A regular expression is one or more
1688 * altern '|' regexp alternatives separated by vertical bar.
1689 */
1690 #define INITIAL_STACK_SIZE 128
1691
1692 static BOOL
1693 ParseRegExp(CompilerState *state)
1694 {
1695 size_t parenIndex;
1696 RENode *operand;
1697 REOpData *operatorStack;
1698 RENode **operandStack;
1699 REOp op;
1700 INT i;
1701 BOOL result = FALSE;
1702
1703 INT operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE;
1704 INT operandSP = 0, operandStackSize = INITIAL_STACK_SIZE;
1705
1706 /* Watch out for empty regexp */
1707 if (state->cp == state->cpend) {
1708 state->result = NewRENode(state, REOP_EMPTY);
1709 return (state->result != NULL);
1710 }
1711
1712 operatorStack = heap_alloc(sizeof(REOpData) * operatorStackSize);
1713 if (!operatorStack)
1714 return FALSE;
1715
1716 operandStack = heap_alloc(sizeof(RENode *) * operandStackSize);
1717 if (!operandStack)
1718 goto out;
1719
1720 for (;;) {
1721 parenIndex = state->parenCount;
1722 if (state->cp == state->cpend) {
1723 /*
1724 * If we are at the end of the regexp and we're short one or more
1725 * operands, the regexp must have the form /x|/ or some such, with
1726 * left parentheses making us short more than one operand.
1727 */
1728 if (operatorSP >= operandSP) {
1729 operand = NewRENode(state, REOP_EMPTY);
1730 if (!operand)
1731 goto out;
1732 goto pushOperand;
1733 }
1734 } else {
1735 switch (*state->cp) {
1736 case '(':
1737 ++state->cp;
1738 if (state->cp + 1 < state->cpend &&
1739 *state->cp == '?' &&
1740 (state->cp[1] == '=' ||
1741 state->cp[1] == '!' ||
1742 state->cp[1] == ':')) {
1743 switch (state->cp[1]) {
1744 case '=':
1745 op = REOP_ASSERT;
1746 /* ASSERT, <next>, ... ASSERTTEST */
1747 state->progLength += 4;
1748 break;
1749 case '!':
1750 op = REOP_ASSERT_NOT;
1751 /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
1752 state->progLength += 4;
1753 break;
1754 default:
1755 op = REOP_LPARENNON;
1756 break;
1757 }
1758 state->cp += 2;
1759 } else {
1760 op = REOP_LPAREN;
1761 /* LPAREN, <index>, ... RPAREN, <index> */
1762 state->progLength
1763 += 2 * (1 + GetCompactIndexWidth(parenIndex));
1764 state->parenCount++;
1765 if (state->parenCount == 65535) {
1766 ReportRegExpError(state, JSREPORT_ERROR,
1767 JSMSG_TOO_MANY_PARENS);
1768 goto out;
1769 }
1770 }
1771 goto pushOperator;
1772
1773 case ')':
1774 /*
1775 * If there's no stacked open parenthesis, throw syntax error.
1776 */
1777 for (i = operatorSP - 1; ; i--) {
1778 if (i < 0) {
1779 ReportRegExpError(state, JSREPORT_ERROR,
1780 JSMSG_UNMATCHED_RIGHT_PAREN);
1781 goto out;
1782 }
1783 if (operatorStack[i].op == REOP_ASSERT ||
1784 operatorStack[i].op == REOP_ASSERT_NOT ||
1785 operatorStack[i].op == REOP_LPARENNON ||
1786 operatorStack[i].op == REOP_LPAREN) {
1787 break;
1788 }
1789 }
1790 /* FALL THROUGH */
1791
1792 case '|':
1793 /* Expected an operand before these, so make an empty one */
1794 operand = NewRENode(state, REOP_EMPTY);
1795 if (!operand)
1796 goto out;
1797 goto pushOperand;
1798
1799 default:
1800 if (!ParseTerm(state))
1801 goto out;
1802 operand = state->result;
1803 pushOperand:
1804 if (operandSP == operandStackSize) {
1805 RENode **tmp;
1806 operandStackSize += operandStackSize;
1807 tmp = heap_realloc(operandStack, sizeof(RENode *) * operandStackSize);
1808 if (!tmp)
1809 goto out;
1810 operandStack = tmp;
1811 }
1812 operandStack[operandSP++] = operand;
1813 break;
1814 }
1815 }
1816
1817 /* At the end; process remaining operators. */
1818 restartOperator:
1819 if (state->cp == state->cpend) {
1820 while (operatorSP) {
1821 --operatorSP;
1822 if (!ProcessOp(state, &operatorStack[operatorSP],
1823 operandStack, operandSP))
1824 goto out;
1825 --operandSP;
1826 }
1827 assert(operandSP == 1);
1828 state->result = operandStack[0];
1829 result = TRUE;
1830 goto out;
1831 }
1832
1833 switch (*state->cp) {
1834 case '|':
1835 /* Process any stacked 'concat' operators */
1836 ++state->cp;
1837 while (operatorSP &&
1838 operatorStack[operatorSP - 1].op == REOP_CONCAT) {
1839 --operatorSP;
1840 if (!ProcessOp(state, &operatorStack[operatorSP],
1841 operandStack, operandSP)) {
1842 goto out;
1843 }
1844 --operandSP;
1845 }
1846 op = REOP_ALT;
1847 goto pushOperator;
1848
1849 case ')':
1850 /*
1851 * If there's no stacked open parenthesis, throw syntax error.
1852 */
1853 for (i = operatorSP - 1; ; i--) {
1854 if (i < 0) {
1855 ReportRegExpError(state, JSREPORT_ERROR,
1856 JSMSG_UNMATCHED_RIGHT_PAREN);
1857 goto out;
1858 }
1859 if (operatorStack[i].op == REOP_ASSERT ||
1860 operatorStack[i].op == REOP_ASSERT_NOT ||
1861 operatorStack[i].op == REOP_LPARENNON ||
1862 operatorStack[i].op == REOP_LPAREN) {
1863 break;
1864 }
1865 }
1866 ++state->cp;
1867
1868 /* Process everything on the stack until the open parenthesis. */
1869 for (;;) {
1870 assert(operatorSP);
1871 --operatorSP;
1872 switch (operatorStack[operatorSP].op) {
1873 case REOP_ASSERT:
1874 case REOP_ASSERT_NOT:
1875 case REOP_LPAREN:
1876 operand = NewRENode(state, operatorStack[operatorSP].op);
1877 if (!operand)
1878 goto out;
1879 operand->u.parenIndex =
1880 operatorStack[operatorSP].parenIndex;
1881 assert(operandSP);
1882 operand->kid = operandStack[operandSP - 1];
1883 operandStack[operandSP - 1] = operand;
1884 if (state->treeDepth == TREE_DEPTH_MAX) {
1885 ReportRegExpError(state, JSREPORT_ERROR,
1886 JSMSG_REGEXP_TOO_COMPLEX);
1887 goto out;
1888 }
1889 ++state->treeDepth;
1890 /* FALL THROUGH */
1891
1892 case REOP_LPARENNON:
1893 state->result = operandStack[operandSP - 1];
1894 if (!ParseQuantifier(state))
1895 goto out;
1896 operandStack[operandSP - 1] = state->result;
1897 goto restartOperator;
1898 default:
1899 if (!ProcessOp(state, &operatorStack[operatorSP],
1900 operandStack, operandSP))
1901 goto out;
1902 --operandSP;
1903 break;
1904 }
1905 }
1906 break;
1907
1908 case '{':
1909 {
1910 const WCHAR *errp = state->cp;
1911
1912 if (ParseMinMaxQuantifier(state, TRUE) < 0) {
1913 /*
1914 * This didn't even scan correctly as a quantifier, so we should
1915 * treat it as flat.
1916 */
1917 op = REOP_CONCAT;
1918 goto pushOperator;
1919 }
1920
1921 state->cp = errp;
1922 /* FALL THROUGH */
1923 }
1924
1925 case '+':
1926 case '*':
1927 case '?':
1928 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_BAD_QUANTIFIER,
1929 state->cp);
1930 result = FALSE;
1931 goto out;
1932
1933 default:
1934 /* Anything else is the start of the next term. */
1935 op = REOP_CONCAT;
1936 pushOperator:
1937 if (operatorSP == operatorStackSize) {
1938 REOpData *tmp;
1939 operatorStackSize += operatorStackSize;
1940 tmp = heap_realloc(operatorStack, sizeof(REOpData) * operatorStackSize);
1941 if (!tmp)
1942 goto out;
1943 operatorStack = tmp;
1944 }
1945 operatorStack[operatorSP].op = op;
1946 operatorStack[operatorSP].errPos = state->cp;
1947 operatorStack[operatorSP++].parenIndex = parenIndex;
1948 break;
1949 }
1950 }
1951 out:
1952 heap_free(operatorStack);
1953 heap_free(operandStack);
1954 return result;
1955 }
1956
1957 /*
1958 * Save the current state of the match - the position in the input
1959 * text as well as the position in the bytecode. The state of any
1960 * parent expressions is also saved (preceding state).
1961 * Contents of parenCount parentheses from parenIndex are also saved.
1962 */
1963 static REBackTrackData *
1964 PushBackTrackState(REGlobalData *gData, REOp op,
1965 jsbytecode *target, REMatchState *x, const WCHAR *cp,
1966 size_t parenIndex, size_t parenCount)
1967 {
1968 size_t i;
1969 REBackTrackData *result =
1970 (REBackTrackData *) ((char *)gData->backTrackSP + gData->cursz);
1971
1972 size_t sz = sizeof(REBackTrackData) +
1973 gData->stateStackTop * sizeof(REProgState) +
1974 parenCount * sizeof(RECapture);
1975
1976 ptrdiff_t btsize = gData->backTrackStackSize;
1977 ptrdiff_t btincr = ((char *)result + sz) -
1978 ((char *)gData->backTrackStack + btsize);
1979
1980 TRACE("\tBT_Push: %lu,%lu\n", (unsigned long) parenIndex, (unsigned long) parenCount);
1981
1982 JS_COUNT_OPERATION(gData->cx, JSOW_JUMP * (1 + parenCount));
1983 if (btincr > 0) {
1984 ptrdiff_t offset = (char *)result - (char *)gData->backTrackStack;
1985
1986 JS_COUNT_OPERATION(gData->cx, JSOW_ALLOCATION);
1987 btincr = ((btincr+btsize-1)/btsize)*btsize;
1988 gData->backTrackStack = jsheap_grow(gData->pool, gData->backTrackStack, btsize, btincr);
1989 if (!gData->backTrackStack) {
1990 js_ReportOutOfScriptQuota(gData->cx);
1991 gData->ok = FALSE;
1992 return NULL;
1993 }
1994 gData->backTrackStackSize = btsize + btincr;
1995 result = (REBackTrackData *) ((char *)gData->backTrackStack + offset);
1996 }
1997 gData->backTrackSP = result;
1998 result->sz = gData->cursz;
1999 gData->cursz = sz;
2000
2001 result->backtrack_op = op;
2002 result->backtrack_pc = target;
2003 result->cp = cp;
2004 result->parenCount = parenCount;
2005 result->parenIndex = parenIndex;
2006
2007 result->saveStateStackTop = gData->stateStackTop;
2008 assert(gData->stateStackTop);
2009 memcpy(result + 1, gData->stateStack,
2010 sizeof(REProgState) * result->saveStateStackTop);
2011
2012 if (parenCount != 0) {
2013 memcpy((char *)(result + 1) +
2014 sizeof(REProgState) * result->saveStateStackTop,
2015 &x->parens[parenIndex],
2016 sizeof(RECapture) * parenCount);
2017 for (i = 0; i != parenCount; i++)
2018 x->parens[parenIndex + i].index = -1;
2019 }
2020
2021 return result;
2022 }
2023
2024 static inline REMatchState *
2025 FlatNIMatcher(REGlobalData *gData, REMatchState *x, WCHAR *matchChars,
2026 size_t length)
2027 {
2028 size_t i;
2029 assert(gData->cpend >= x->cp);
2030 if (length > (size_t)(gData->cpend - x->cp))
2031 return NULL;
2032 for (i = 0; i != length; i++) {
2033 if (toupperW(matchChars[i]) != toupperW(x->cp[i]))
2034 return NULL;
2035 }
2036 x->cp += length;
2037 return x;
2038 }
2039
2040 /*
2041 * 1. Evaluate DecimalEscape to obtain an EscapeValue E.
2042 * 2. If E is not a character then go to step 6.
2043 * 3. Let ch be E's character.
2044 * 4. Let A be a one-element RECharSet containing the character ch.
2045 * 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
2046 * 6. E must be an integer. Let n be that integer.
2047 * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
2048 * 8. Return an internal Matcher closure that takes two arguments, a State x
2049 * and a Continuation c, and performs the following:
2050 * 1. Let cap be x's captures internal array.
2051 * 2. Let s be cap[n].
2052 * 3. If s is undefined, then call c(x) and return its result.
2053 * 4. Let e be x's endIndex.
2054 * 5. Let len be s's length.
2055 * 6. Let f be e+len.
2056 * 7. If f>InputLength, return failure.
2057 * 8. If there exists an integer i between 0 (inclusive) and len (exclusive)
2058 * such that Canonicalize(s[i]) is not the same character as
2059 * Canonicalize(Input [e+i]), then return failure.
2060 * 9. Let y be the State (f, cap).
2061 * 10. Call c(y) and return its result.
2062 */
2063 static REMatchState *
2064 BackrefMatcher(REGlobalData *gData, REMatchState *x, size_t parenIndex)
2065 {
2066 size_t len, i;
2067 const WCHAR *parenContent;
2068 RECapture *cap = &x->parens[parenIndex];
2069
2070 if (cap->index == -1)
2071 return x;
2072
2073 len = cap->length;
2074 if (x->cp + len > gData->cpend)
2075 return NULL;
2076
2077 parenContent = &gData->cpbegin[cap->index];
2078 if (gData->regexp->flags & JSREG_FOLD) {
2079 for (i = 0; i < len; i++) {
2080 if (toupperW(parenContent[i]) != toupperW(x->cp[i]))
2081 return NULL;
2082 }
2083 } else {
2084 for (i = 0; i < len; i++) {
2085 if (parenContent[i] != x->cp[i])
2086 return NULL;
2087 }
2088 }
2089 x->cp += len;
2090 return x;
2091 }
2092
2093 /* Add a single character to the RECharSet */
2094 static void
2095 AddCharacterToCharSet(RECharSet *cs, WCHAR c)
2096 {
2097 UINT byteIndex = (UINT)(c >> 3);
2098 assert(c <= cs->length);
2099 cs->u.bits[byteIndex] |= 1 << (c & 0x7);
2100 }
2101
2102
2103 /* Add a character range, c1 to c2 (inclusive) to the RECharSet */
2104 static void
2105 AddCharacterRangeToCharSet(RECharSet *cs, UINT c1, UINT c2)
2106 {
2107 UINT i;
2108
2109 UINT byteIndex1 = c1 >> 3;
2110 UINT byteIndex2 = c2 >> 3;
2111
2112 assert(c2 <= cs->length && c1 <= c2);
2113
2114 c1 &= 0x7;
2115 c2 &= 0x7;
2116
2117 if (byteIndex1 == byteIndex2) {
2118 cs->u.bits[byteIndex1] |= ((BYTE)0xFF >> (7 - (c2 - c1))) << c1;
2119 } else {
2120 cs->u.bits[byteIndex1] |= 0xFF << c1;
2121 for (i = byteIndex1 + 1; i < byteIndex2; i++)
2122 cs->u.bits[i] = 0xFF;
2123 cs->u.bits[byteIndex2] |= (BYTE)0xFF >> (7 - c2);
2124 }
2125 }
2126
2127 /* Compile the source of the class into a RECharSet */
2128 static BOOL
2129 ProcessCharSet(REGlobalData *gData, RECharSet *charSet)
2130 {
2131 const WCHAR *src, *end;
2132 BOOL inRange = FALSE;
2133 WCHAR rangeStart = 0;
2134 UINT byteLength, n;
2135 WCHAR c, thisCh;
2136 INT nDigits, i;
2137
2138 assert(!charSet->converted);
2139 /*
2140 * Assert that startIndex and length points to chars inside [] inside
2141 * source string.
2142 */
2143 assert(1 <= charSet->u.src.startIndex);
2144 assert(charSet->u.src.startIndex
2145 < SysStringLen(gData->regexp->source));
2146 assert(charSet->u.src.length <= SysStringLen(gData->regexp->source)
2147 - 1 - charSet->u.src.startIndex);
2148
2149 charSet->converted = TRUE;
2150 src = gData->regexp->source + charSet->u.src.startIndex;
2151
2152 end = src + charSet->u.src.length;
2153
2154 assert(src[-1] == '[' && end[0] == ']');
2155
2156 byteLength = (charSet->length >> 3) + 1;
2157 charSet->u.bits = heap_alloc(byteLength);
2158 if (!charSet->u.bits) {
2159 JS_ReportOutOfMemory(gData->cx);
2160 gData->ok = FALSE;
2161 return FALSE;
2162 }
2163 memset(charSet->u.bits, 0, byteLength);
2164
2165 if (src == end)
2166 return TRUE;
2167
2168 if (*src == '^') {
2169 assert(charSet->sense == FALSE);
2170 ++src;
2171 } else {
2172 assert(charSet->sense == TRUE);
2173 }
2174
2175 while (src != end) {
2176 switch (*src) {
2177 case '\\':
2178 ++src;
2179 c = *src++;
2180 switch (c) {
2181 case 'b':
2182 thisCh = 0x8;
2183 break;
2184 case 'f':
2185 thisCh = 0xC;
2186 break;
2187 case 'n':
2188 thisCh = 0xA;
2189 break;
2190 case 'r':
2191 thisCh = 0xD;
2192 break;
2193 case 't':
2194 thisCh = 0x9;
2195 break;
2196 case 'v':
2197 thisCh = 0xB;
2198 break;
2199 case 'c':
2200 if (src < end && JS_ISWORD(*src)) {
2201 thisCh = (WCHAR)(*src++ & 0x1F);
2202 } else {
2203 --src;
2204 thisCh = '\\';
2205 }
2206 break;
2207 case 'x':
2208 nDigits = 2;
2209 goto lexHex;
2210 case 'u':
2211 nDigits = 4;
2212 lexHex:
2213 n = 0;
2214 for (i = 0; (i < nDigits) && (src < end); i++) {
2215 UINT digit;
2216 c = *src++;
2217 if (!isASCIIHexDigit(c, &digit)) {
2218 /*
2219 * Back off to accepting the original '\'
2220 * as a literal
2221 */
2222 src -= i + 1;
2223 n = '\\';
2224 break;
2225 }
2226 n = (n << 4) | digit;
2227 }
2228 thisCh = (WCHAR)n;
2229 break;
2230 case '0':
2231 case '1':
2232 case '2':
2233 case '3':
2234 case '4':
2235 case '5':
2236 case '6':
2237 case '7':
2238 /*
2239 * This is a non-ECMA extension - decimal escapes (in this
2240 * case, octal!) are supposed to be an error inside class
2241 * ranges, but supported here for backwards compatibility.
2242 */
2243 n = JS7_UNDEC(c);
2244 c = *src;
2245 if ('0' <= c && c <= '7') {
2246 src++;
2247 n = 8 * n + JS7_UNDEC(c);
2248 c = *src;
2249 if ('0' <= c && c <= '7') {
2250 src++;
2251 i = 8 * n + JS7_UNDEC(c);
2252 if (i <= 0377)
2253 n = i;
2254 else
2255 src--;
2256 }
2257 }
2258 thisCh = (WCHAR)n;
2259 break;
2260
2261 case 'd':
2262 AddCharacterRangeToCharSet(charSet, '0', '9');
2263 continue; /* don't need range processing */
2264 case 'D':
2265 AddCharacterRangeToCharSet(charSet, 0, '0' - 1);
2266 AddCharacterRangeToCharSet(charSet,
2267 (WCHAR)('9' + 1),
2268 (WCHAR)charSet->length);
2269 continue;
2270 case 's':
2271 for (i = (INT)charSet->length; i >= 0; i--)
2272 if (isspaceW(i))
2273 AddCharacterToCharSet(charSet, (WCHAR)i);
2274 continue;
2275 case 'S':
2276 for (i = (INT)charSet->length; i >= 0; i--)
2277 if (!isspaceW(i))
2278 AddCharacterToCharSet(charSet, (WCHAR)i);
2279 continue;
2280 case 'w':
2281 for (i = (INT)charSet->length; i >= 0; i--)
2282 if (JS_ISWORD(i))
2283 AddCharacterToCharSet(charSet, (WCHAR)i);
2284 continue;
2285 case 'W':
2286 for (i = (INT)charSet->length; i >= 0; i--)
2287 if (!JS_ISWORD(i))
2288 AddCharacterToCharSet(charSet, (WCHAR)i);
2289 continue;
2290 default:
2291 thisCh = c;
2292 break;
2293
2294 }
2295 break;
2296
2297 default:
2298 thisCh = *src++;
2299 break;
2300
2301 }
2302 if (inRange) {
2303 if (gData->regexp->flags & JSREG_FOLD) {
2304 int i;
2305
2306 assert(rangeStart <= thisCh);
2307 for (i = rangeStart; i <= thisCh; i++) {
2308 WCHAR uch, dch;
2309
2310 AddCharacterToCharSet(charSet, i);
2311 uch = toupperW(i);
2312 dch = tolowerW(i);
2313 if (i != uch)
2314 AddCharacterToCharSet(charSet, uch);
2315 if (i != dch)
2316 AddCharacterToCharSet(charSet, dch);
2317 }
2318 } else {
2319 AddCharacterRangeToCharSet(charSet, rangeStart, thisCh);
2320 }
2321 inRange = FALSE;
2322 } else {
2323 if (gData->regexp->flags & JSREG_FOLD) {
2324 AddCharacterToCharSet(charSet, toupperW(thisCh));
2325 AddCharacterToCharSet(charSet, tolowerW(thisCh));
2326 } else {
2327 AddCharacterToCharSet(charSet, thisCh);
2328 }
2329 if (src < end - 1) {
2330 if (*src == '-') {
2331 ++src;
2332 inRange = TRUE;
2333 rangeStart = thisCh;
2334 }
2335 }
2336 }
2337 }
2338 return TRUE;
2339 }
2340
2341 static BOOL
2342 ReallocStateStack(REGlobalData *gData)
2343 {
2344 size_t limit = gData->stateStackLimit;
2345 size_t sz = sizeof(REProgState) * limit;
2346
2347 gData->stateStack = jsheap_grow(gData->pool, gData->stateStack, sz, sz);
2348 if (!gData->stateStack) {
2349 js_ReportOutOfScriptQuota(gData->cx);
2350 gData->ok = FALSE;
2351 return FALSE;
2352 }
2353 gData->stateStackLimit = limit + limit;
2354 return TRUE;
2355 }
2356
2357 #define PUSH_STATE_STACK(data) \
2358 do { \
2359 ++(data)->stateStackTop; \
2360 if ((data)->stateStackTop == (data)->stateStackLimit && \
2361 !ReallocStateStack((data))) { \
2362 return NULL; \
2363 } \
2364 }while(0)
2365
2366 /*
2367 * Apply the current op against the given input to see if it's going to match
2368 * or fail. Return false if we don't get a match, true if we do. If updatecp is
2369 * true, then update the current state's cp. Always update startpc to the next
2370 * op.
2371 */
2372 static inline REMatchState *
2373 SimpleMatch(REGlobalData *gData, REMatchState *x, REOp op,
2374 jsbytecode **startpc, BOOL updatecp)
2375 {
2376 REMatchState *result = NULL;
2377 WCHAR matchCh;
2378 size_t parenIndex;
2379 size_t offset, length, index;
2380 jsbytecode *pc = *startpc; /* pc has already been incremented past op */
2381 WCHAR *source;
2382 const WCHAR *startcp = x->cp;
2383 WCHAR ch;
2384 RECharSet *charSet;
2385
2386 const char *opname = reop_names[op];
2387 TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2388 (int)gData->stateStackTop * 2, "", opname);
2389
2390 switch (op) {
2391 case REOP_EMPTY:
2392 result = x;
2393 break;
2394 case REOP_BOL:
2395 if (x->cp != gData->cpbegin) {
2396 if (/*!gData->cx->regExpStatics.multiline && FIXME !!! */
2397 !(gData->regexp->flags & JSREG_MULTILINE)) {
2398 break;
2399 }
2400 if (!RE_IS_LINE_TERM(x->cp[-1]))
2401 break;
2402 }
2403 result = x;
2404 break;
2405 case REOP_EOL:
2406 if (x->cp != gData->cpend) {
2407 if (/*!gData->cx->regExpStatics.multiline &&*/
2408 !(gData->regexp->flags & JSREG_MULTILINE)) {
2409 break;
2410 }
2411 if (!RE_IS_LINE_TERM(*x->cp))
2412 break;
2413 }
2414 result = x;
2415 break;
2416 case REOP_WBDRY:
2417 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2418 !(x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2419 result = x;
2420 }
2421 break;
2422 case REOP_WNONBDRY:
2423 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2424 (x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2425 result = x;
2426 }
2427 break;
2428 case REOP_DOT:
2429 if (x->cp != gData->cpend && !RE_IS_LINE_TERM(*x->cp)) {
2430 result = x;
2431 result->cp++;
2432 }
2433 break;
2434 case REOP_DIGIT:
2435 if (x->cp != gData->cpend && JS7_ISDEC(*x->cp)) {
2436 result = x;
2437 result->cp++;
2438 }
2439 break;
2440 case REOP_NONDIGIT:
2441 if (x->cp != gData->cpend && !JS7_ISDEC(*x->cp)) {
2442 result = x;
2443 result->cp++;
2444 }
2445 break;
2446 case REOP_ALNUM:
2447 if (x->cp != gData->cpend && JS_ISWORD(*x->cp)) {
2448 result = x;
2449 result->cp++;
2450 }
2451 break;
2452 case REOP_NONALNUM:
2453 if (x->cp != gData->cpend && !JS_ISWORD(*x->cp)) {
2454 result = x;
2455 result->cp++;
2456 }
2457 break;
2458 case REOP_SPACE:
2459 if (x->cp != gData->cpend && isspaceW(*x->cp)) {
2460 result = x;
2461 result->cp++;
2462 }
2463 break;
2464 case REOP_NONSPACE:
2465 if (x->cp != gData->cpend && !isspaceW(*x->cp)) {
2466 result = x;
2467 result->cp++;
2468 }
2469 break;
2470 case REOP_BACKREF:
2471 pc = ReadCompactIndex(pc, &parenIndex);
2472 assert(parenIndex < gData->regexp->parenCount);
2473 result = BackrefMatcher(gData, x, parenIndex);
2474 break;
2475 case REOP_FLAT:
2476 pc = ReadCompactIndex(pc, &offset);
2477 assert(offset < SysStringLen(gData->regexp->source));
2478 pc = ReadCompactIndex(pc, &length);
2479 assert(1 <= length);
2480 assert(length <= SysStringLen(gData->regexp->source) - offset);
2481 if (length <= (size_t)(gData->cpend - x->cp)) {
2482 source = gData->regexp->source + offset;
2483 TRACE("%s\n", debugstr_wn(source, length));
2484 for (index = 0; index != length; index++) {
2485 if (source[index] != x->cp[index])
2486 return NULL;
2487 }
2488 x->cp += length;
2489 result = x;
2490 }
2491 break;
2492 case REOP_FLAT1:
2493 matchCh = *pc++;
2494 TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2495 if (x->cp != gData->cpend && *x->cp == matchCh) {
2496 result = x;
2497 result->cp++;
2498 }
2499 break;
2500 case REOP_FLATi:
2501 pc = ReadCompactIndex(pc, &offset);
2502 assert(offset < SysStringLen(gData->regexp->source));
2503 pc = ReadCompactIndex(pc, &length);
2504 assert(1 <= length);
2505 assert(length <= SysStringLen(gData->regexp->source) - offset);
2506 source = gData->regexp->source;
2507 result = FlatNIMatcher(gData, x, source + offset, length);
2508 break;
2509 case REOP_FLAT1i:
2510 matchCh = *pc++;
2511 if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2512 result = x;
2513 result->cp++;
2514 }
2515 break;
2516 case REOP_UCFLAT1:
2517 matchCh = GET_ARG(pc);
2518 TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2519 pc += ARG_LEN;
2520 if (x->cp != gData->cpend && *x->cp == matchCh) {
2521 result = x;
2522 result->cp++;
2523 }
2524 break;
2525 case REOP_UCFLAT1i:
2526 matchCh = GET_ARG(pc);
2527 pc += ARG_LEN;
2528 if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2529 result = x;
2530 result->cp++;
2531 }
2532 break;
2533 case REOP_CLASS:
2534 pc = ReadCompactIndex(pc, &index);
2535 assert(index < gData->regexp->classCount);
2536 if (x->cp != gData->cpend) {
2537 charSet = &gData->regexp->classList[index];
2538 assert(charSet->converted);
2539 ch = *x->cp;
2540 index = ch >> 3;
2541 if (charSet->length != 0 &&
2542 ch <= charSet->length &&
2543 (charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2544 result = x;
2545 result->cp++;
2546 }
2547 }
2548 break;
2549 case REOP_NCLASS:
2550 pc = ReadCompactIndex(pc, &index);
2551 assert(index < gData->regexp->classCount);
2552 if (x->cp != gData->cpend) {
2553 charSet = &gData->regexp->classList[index];
2554 assert(charSet->converted);
2555 ch = *x->cp;
2556 index = ch >> 3;
2557 if (charSet->length == 0 ||
2558 ch > charSet->length ||
2559 !(charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2560 result = x;
2561 result->cp++;
2562 }
2563 }
2564 break;
2565
2566 default:
2567 assert(FALSE);
2568 }
2569 if (result) {
2570 if (!updatecp)
2571 x->cp = startcp;
2572 *startpc = pc;
2573 TRACE(" *\n");
2574 return result;
2575 }
2576 x->cp = startcp;
2577 return NULL;
2578 }
2579
2580 static inline REMatchState *
2581 ExecuteREBytecode(REGlobalData *gData, REMatchState *x)
2582 {
2583 REMatchState *result = NULL;
2584 REBackTrackData *backTrackData;
2585 jsbytecode *nextpc, *testpc;
2586 REOp nextop;
2587 RECapture *cap;
2588 REProgState *curState;
2589 const WCHAR *startcp;
2590 size_t parenIndex, k;
2591 size_t parenSoFar = 0;
2592
2593 WCHAR matchCh1, matchCh2;
2594 RECharSet *charSet;
2595
2596 BOOL anchor;
2597 jsbytecode *pc = gData->regexp->program;
2598 REOp op = (REOp) *pc++;
2599
2600 /*
2601 * If the first node is a simple match, step the index into the string
2602 * until that match is made, or fail if it can't be found at all.
2603 */
2604 if (REOP_IS_SIMPLE(op) && !(gData->regexp->flags & JSREG_STICKY)) {
2605 anchor = FALSE;
2606 while (x->cp <= gData->cpend) {
2607 nextpc = pc; /* reset back to start each time */
2608 result = SimpleMatch(gData, x, op, &nextpc, TRUE);
2609 if (result) {
2610 anchor = TRUE;
2611 x = result;
2612 pc = nextpc; /* accept skip to next opcode */
2613 op = (REOp) *pc++;
2614 assert(op < REOP_LIMIT);
2615 break;
2616 }
2617 gData->skipped++;
2618 x->cp++;
2619 }
2620 if (!anchor)
2621 goto bad;
2622 }
2623
2624 for (;;) {
2625 const char *opname = reop_names[op];
2626 TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2627 (int)gData->stateStackTop * 2, "", opname);
2628
2629 if (REOP_IS_SIMPLE(op)) {
2630 result = SimpleMatch(gData, x, op, &pc, TRUE);
2631 } else {
2632 curState = &gData->stateStack[gData->stateStackTop];
2633 switch (op) {
2634 case REOP_END:
2635 goto good;
2636 case REOP_ALTPREREQ2:
2637 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2638 pc += ARG_LEN;
2639 matchCh2 = GET_ARG(pc);
2640 pc += ARG_LEN;
2641 k = GET_ARG(pc);
2642 pc += ARG_LEN;
2643
2644 if (x->cp != gData->cpend) {
2645 if (*x->cp == matchCh2)
2646 goto doAlt;
2647
2648 charSet = &gData->regexp->classList[k];
2649 if (!charSet->converted && !ProcessCharSet(gData, charSet))
2650 goto bad;
2651 matchCh1 = *x->cp;
2652 k = matchCh1 >> 3;
2653 if ((charSet->length == 0 ||
2654 matchCh1 > charSet->length ||
2655 !(charSet->u.bits[k] & (1 << (matchCh1 & 0x7)))) ^
2656 charSet->sense) {
2657 goto doAlt;
2658 }
2659 }
2660 result = NULL;
2661 break;
2662
2663 case REOP_ALTPREREQ:
2664 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2665 pc += ARG_LEN;
2666 matchCh1 = GET_ARG(pc);
2667 pc += ARG_LEN;
2668 matchCh2 = GET_ARG(pc);
2669 pc += ARG_LEN;
2670 if (x->cp == gData->cpend ||
2671 (*x->cp != matchCh1 && *x->cp != matchCh2)) {
2672 result = NULL;
2673 break;
2674 }
2675 /* else false thru... */
2676
2677 case REOP_ALT:
2678 doAlt:
2679 nextpc = pc + GET_OFFSET(pc); /* start of next alternate */
2680 pc += ARG_LEN; /* start of this alternate */
2681 curState->parenSoFar = parenSoFar;
2682 PUSH_STATE_STACK(gData);
2683 op = (REOp) *pc++;
2684 startcp = x->cp;
2685 if (REOP_IS_SIMPLE(op)) {
2686 if (!SimpleMatch(gData, x, op, &pc, TRUE)) {
2687 op = (REOp) *nextpc++;
2688 pc = nextpc;
2689 continue;
2690 }
2691 result = x;
2692 op = (REOp) *pc++;
2693 }
2694 nextop = (REOp) *nextpc++;
2695 if (!PushBackTrackState(gData, nextop, nextpc, x, startcp, 0, 0))
2696 goto bad;
2697 continue;
2698
2699 /*
2700 * Occurs at (successful) end of REOP_ALT,
2701 */
2702 case REOP_JUMP:
2703 /*
2704 * If we have not gotten a result here, it is because of an
2705 * empty match. Do the same thing REOP_EMPTY would do.
2706 */
2707 if (!result)
2708 result = x;
2709
2710 --gData->stateStackTop;
2711 pc += GET_OFFSET(pc);
2712 op = (REOp) *pc++;
2713 continue;
2714
2715 /*
2716 * Occurs at last (successful) end of REOP_ALT,
2717 */
2718 case REOP_ENDALT:
2719 /*
2720 * If we have not gotten a result here, it is because of an
2721 * empty match. Do the same thing REOP_EMPTY would do.
2722 */
2723 if (!result)
2724 result = x;
2725
2726 --gData->stateStackTop;
2727 op = (REOp) *pc++;
2728 continue;
2729
2730 case REOP_LPAREN:
2731 pc = ReadCompactIndex(pc, &parenIndex);
2732 TRACE("[ %lu ]\n", (unsigned long) parenIndex);
2733 assert(parenIndex < gData->regexp->parenCount);
2734 if (parenIndex + 1 > parenSoFar)
2735 parenSoFar = parenIndex + 1;
2736 x->parens[parenIndex].index = x->cp - gData->cpbegin;
2737 x->parens[parenIndex].length = 0;
2738 op = (REOp) *pc++;
2739 continue;
2740
2741 case REOP_RPAREN:
2742 {
2743 ptrdiff_t delta;
2744
2745 pc = ReadCompactIndex(pc, &parenIndex);
2746 assert(parenIndex < gData->regexp->parenCount);
2747 cap = &x->parens[parenIndex];
2748 delta = x->cp - (gData->cpbegin + cap->index);
2749 cap->length = (delta < 0) ? 0 : (size_t) delta;
2750 op = (REOp) *pc++;
2751
2752 if (!result)
2753 result = x;
2754 continue;
2755 }
2756 case REOP_ASSERT:
2757 nextpc = pc + GET_OFFSET(pc); /* start of term after ASSERT */
2758 pc += ARG_LEN; /* start of ASSERT child */
2759 op = (REOp) *pc++;
2760 testpc = pc;
2761 if (REOP_IS_SIMPLE(op) &&
2762 !SimpleMatch(gData, x, op, &testpc, FALSE)) {
2763 result = NULL;
2764 break;
2765 }
2766 curState->u.assertion.top =
2767 (char *)gData->backTrackSP - (char *)gData->backTrackStack;
2768 curState->u.assertion.sz = gData->cursz;
2769 curState->index = x->cp - gData->cpbegin;
2770 curState->parenSoFar = parenSoFar;
2771 PUSH_STATE_STACK(gData);
2772 if (!PushBackTrackState(gData, REOP_ASSERTTEST,
2773 nextpc, x, x->cp, 0, 0)) {
2774 goto bad;
2775 }
2776 continue;
2777
2778 case REOP_ASSERT_NOT:
2779 nextpc = pc + GET_OFFSET(pc);
2780 pc += ARG_LEN;
2781 op = (REOp) *pc++;
2782 testpc = pc;
2783 if (REOP_IS_SIMPLE(op) /* Note - fail to fail! */ &&
2784 SimpleMatch(gData, x, op, &testpc, FALSE) &&
2785 *testpc == REOP_ASSERTNOTTEST) {
2786 result = NULL;
2787 break;
2788 }
2789 curState->u.assertion.top
2790 = (char *)gData->backTrackSP -
2791 (char *)gData->backTrackStack;
2792 curState->u.assertion.sz = gData->cursz;
2793 curState->index = x->cp - gData->cpbegin;
2794 curState->parenSoFar = parenSoFar;
2795 PUSH_STATE_STACK(gData);
2796 if (!PushBackTrackState(gData, REOP_ASSERTNOTTEST,
2797 nextpc, x, x->cp, 0, 0)) {
2798 goto bad;
2799 }
2800 continue;
2801
2802 case REOP_ASSERTTEST:
2803 --gData->stateStackTop;
2804 --curState;
2805 x->cp = gData->cpbegin + curState->index;
2806 gData->backTrackSP =
2807 (REBackTrackData *) ((char *)gData->backTrackStack +
2808 curState->u.assertion.top);
2809 gData->cursz = curState->u.assertion.sz;
2810 if (result)
2811 result = x;
2812 break;
2813
2814 case REOP_ASSERTNOTTEST:
2815 --gData->stateStackTop;
2816 --curState;
2817 x->cp = gData->cpbegin + curState->index;
2818 gData->backTrackSP =
2819 (REBackTrackData *) ((char *)gData->backTrackStack +
2820 curState->u.assertion.top);
2821 gData->cursz = curState->u.assertion.sz;
2822 result = (!result) ? x : NULL;
2823 break;
2824 case REOP_STAR:
2825 curState->u.quantifier.min = 0;
2826 curState->u.quantifier.max = (UINT)-1;
2827 goto quantcommon;
2828 case REOP_PLUS:
2829 curState->u.quantifier.min = 1;
2830 curState->u.quantifier.max = (UINT)-1;
2831 goto quantcommon;
2832 case REOP_OPT:
2833 curState->u.quantifier.min = 0;
2834 curState->u.quantifier.max = 1;
2835 goto quantcommon;
2836 case REOP_QUANT:
2837 pc = ReadCompactIndex(pc, &k);
2838 curState->u.quantifier.min = k;
2839 pc = ReadCompactIndex(pc, &k);
2840 /* max is k - 1 to use one byte for (UINT)-1 sentinel. */
2841 curState->u.quantifier.max = k - 1;
2842 assert(curState->u.quantifier.min <= curState->u.quantifier.max);
2843 quantcommon:
2844 if (curState->u.quantifier.max == 0) {
2845 pc = pc + GET_OFFSET(pc);
2846 op = (REOp) *pc++;
2847 result = x;
2848 continue;
2849 }
2850 /* Step over <next> */
2851 nextpc = pc + ARG_LEN;
2852 op = (REOp) *nextpc++;
2853 startcp = x->cp;
2854 if (REOP_IS_SIMPLE(op)) {
2855 if (!SimpleMatch(gData, x, op, &nextpc, TRUE)) {
2856 if (curState->u.quantifier.min == 0)
2857 result = x;
2858 else
2859 result = NULL;
2860 pc = pc + GET_OFFSET(pc);
2861 break;
2862 }
2863 op = (REOp) *nextpc++;
2864 result = x;
2865 }
2866 curState->index = startcp - gData->cpbegin;
2867 curState->continue_op = REOP_REPEAT;
2868 curState->continue_pc = pc;
2869 curState->parenSoFar = parenSoFar;
2870 PUSH_STATE_STACK(gData);
2871 if (curState->u.quantifier.min == 0 &&
2872 !PushBackTrackState(gData, REOP_REPEAT, pc, x, startcp,
2873 0, 0)) {
2874 goto bad;
2875 }
2876 pc = nextpc;
2877 continue;
2878
2879 case REOP_ENDCHILD: /* marks the end of a quantifier child */
2880 pc = curState[-1].continue_pc;
2881 op = (REOp) curState[-1].continue_op;
2882
2883 if (!result)
2884 result = x;
2885 continue;
2886
2887 case REOP_REPEAT:
2888 --curState;
2889 do {
2890 --gData->stateStackTop;
2891 if (!result) {
2892 /* Failed, see if we have enough children. */
2893 if (curState->u.quantifier.min == 0)
2894 goto repeatDone;
2895 goto break_switch;
2896 }
2897 if (curState->u.quantifier.min == 0 &&
2898 x->cp == gData->cpbegin + curState->index) {
2899 /* matched an empty string, that'll get us nowhere */
2900 result = NULL;
2901 goto break_switch;
2902 }
2903 if (curState->u.quantifier.min != 0)
2904 curState->u.quantifier.min--;
2905 if (curState->u.quantifier.max != (UINT) -1)
2906 curState->u.quantifier.max--;
2907 if (curState->u.quantifier.max == 0)
2908 goto repeatDone;
2909 nextpc = pc + ARG_LEN;
2910 nextop = (REOp) *nextpc;
2911 startcp = x->cp;
2912 if (REOP_IS_SIMPLE(nextop)) {
2913 nextpc++;
2914 if (!SimpleMatch(gData, x, nextop, &nextpc, TRUE)) {
2915 if (curState->u.quantifier.min == 0)
2916 goto repeatDone;
2917 result = NULL;
2918 goto break_switch;
2919 }
2920 result = x;
2921 }
2922 curState->index = startcp - gData->cpbegin;
2923 PUSH_STATE_STACK(gData);
2924 if (curState->u.quantifier.min == 0 &&
2925 !PushBackTrackState(gData, REOP_REPEAT,
2926 pc, x, startcp,
2927 curState->parenSoFar,
2928 parenSoFar -
2929 curState->parenSoFar)) {
2930 goto bad;
2931 }
2932 } while (*nextpc == REOP_ENDCHILD);
2933 pc = nextpc;
2934 op = (REOp) *pc++;
2935 parenSoFar = curState->parenSoFar;
2936 continue;
2937
2938 repeatDone:
2939 result = x;
2940 pc += GET_OFFSET(pc);
2941 goto break_switch;
2942
2943 case REOP_MINIMALSTAR:
2944 curState->u.quantifier.min = 0;
2945 curState->u.quantifier.max = (UINT)-1;
2946 goto minimalquantcommon;
2947 case REOP_MINIMALPLUS:
2948 curState->u.quantifier.min = 1;
2949 curState->u.quantifier.max = (UINT)-1;
2950 goto minimalquantcommon;
2951 case REOP_MINIMALOPT:
2952 curState->u.quantifier.min = 0;
2953 curState->u.quantifier.max = 1;
2954 goto minimalquantcommon;
2955 case REOP_MINIMALQUANT:
2956 pc = ReadCompactIndex(pc, &k);
2957 curState->u.quantifier.min = k;
2958 pc = ReadCompactIndex(pc, &k);
2959 /* See REOP_QUANT comments about k - 1. */
2960 curState->u.quantifier.max = k - 1;
2961 assert(curState->u.quantifier.min
2962 <= curState->u.quantifier.max);
2963 minimalquantcommon:
2964 curState->index = x->cp - gData->cpbegin;
2965 curState->parenSoFar = parenSoFar;
2966 PUSH_STATE_STACK(gData);
2967 if (curState->u.quantifier.min != 0) {
2968 curState->continue_op = REOP_MINIMALREPEAT;
2969 curState->continue_pc = pc;
2970 /* step over <next> */
2971 pc += OFFSET_LEN;
2972 op = (REOp) *pc++;
2973 } else {
2974 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2975 pc, x, x->cp, 0, 0)) {
2976 goto bad;
2977 }
2978 --gData->stateStackTop;
2979 pc = pc + GET_OFFSET(pc);
2980 op = (REOp) *pc++;
2981 }
2982 continue;
2983
2984 case REOP_MINIMALREPEAT:
2985 --gData->stateStackTop;
2986 --curState;
2987
2988 TRACE("{%d,%d}\n", curState->u.quantifier.min, curState->u.quantifier.max);
2989 #define PREPARE_REPEAT() \
2990 do { \
2991 curState->index = x->cp - gData->cpbegin; \
2992 curState->continue_op = REOP_MINIMALREPEAT; \
2993 curState->continue_pc = pc; \
2994 pc += ARG_LEN; \
2995 for (k = curState->parenSoFar; k < parenSoFar; k++) \
2996 x->parens[k].index = -1; \
2997 PUSH_STATE_STACK(gData); \
2998 op = (REOp) *pc++; \
2999 assert(op < REOP_LIMIT); \
3000 }while(0)
3001
3002 if (!result) {
3003 TRACE(" -\n");
3004 /*
3005 * Non-greedy failure - try to consume another child.
3006 */
3007 if (curState->u.quantifier.max == (UINT) -1 ||
3008 curState->u.quantifier.max > 0) {
3009 PREPARE_REPEAT();
3010 continue;
3011 }
3012 /* Don't need to adjust pc since we're going to pop. */
3013 break;
3014 }
3015 if (curState->u.quantifier.min == 0 &&
3016 x->cp == gData->cpbegin + curState->index) {
3017 /* Matched an empty string, that'll get us nowhere. */
3018 result = NULL;
3019 break;
3020 }
3021 if (curState->u.quantifier.min != 0)
3022 curState->u.quantifier.min--;
3023 if (curState->u.quantifier.max != (UINT) -1)
3024 curState->u.quantifier.max--;
3025 if (curState->u.quantifier.min != 0) {
3026 PREPARE_REPEAT();
3027 continue;
3028 }
3029 curState->index = x->cp - gData->cpbegin;
3030 curState->parenSoFar = parenSoFar;
3031 PUSH_STATE_STACK(gData);
3032 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
3033 pc, x, x->cp,
3034 curState->parenSoFar,
3035 parenSoFar - curState->parenSoFar)) {
3036 goto bad;
3037 }
3038 --gData->stateStackTop;
3039 pc = pc + GET_OFFSET(pc);
3040 op = (REOp) *pc++;
3041 assert(op < REOP_LIMIT);
3042 continue;
3043 default:
3044 assert(FALSE);
3045 result = NULL;
3046 }
3047 break_switch:;
3048 }
3049
3050 /*
3051 * If the match failed and there's a backtrack option, take it.
3052 * Otherwise this is a complete and utter failure.
3053 */
3054 if (!result) {
3055 if (gData->cursz == 0)
3056 return NULL;
3057
3058 /* Potentially detect explosive regex here. */
3059 gData->backTrackCount++;
3060 if (gData->backTrackLimit &&
3061 gData->backTrackCount >= gData->backTrackLimit) {
3062 JS_ReportErrorNumber(gData->cx, js_GetErrorMessage, NULL,
3063 JSMSG_REGEXP_TOO_COMPLEX);
3064 gData->ok = FALSE;
3065 return NULL;
3066 }
3067
3068 backTrackData = gData->backTrackSP;
3069 gData->cursz = backTrackData->sz;
3070 gData->backTrackSP =
3071 (REBackTrackData *) ((char *)backTrackData - backTrackData->sz);
3072 x->cp = backTrackData->cp;
3073 pc = backTrackData->backtrack_pc;
3074 op = (REOp) backTrackData->backtrack_op;
3075 assert(op < REOP_LIMIT);
3076 gData->stateStackTop = backTrackData->saveStateStackTop;
3077 assert(gData->stateStackTop);
3078
3079 memcpy(gData->stateStack, backTrackData + 1,
3080 sizeof(REProgState) * backTrackData->saveStateStackTop);
3081 curState = &gData->stateStack[gData->stateStackTop - 1];
3082
3083 if (backTrackData->parenCount) {
3084 memcpy(&x->parens[backTrackData->parenIndex],
3085 (char *)(backTrackData + 1) +
3086 sizeof(REProgState) * backTrackData->saveStateStackTop,
3087 sizeof(RECapture) * backTrackData->parenCount);
3088 parenSoFar = backTrackData->parenIndex + backTrackData->parenCount;
3089 } else {
3090 for (k = curState->parenSoFar; k < parenSoFar; k++)
3091 x->parens[k].index = -1;
3092 parenSoFar = curState->parenSoFar;
3093 }
3094
3095 TRACE("\tBT_Pop: %ld,%ld\n",
3096 (unsigned long) backTrackData->parenIndex,
3097 (unsigned long) backTrackData->parenCount);