724e8de1017f22691a83c1d2f7b345b7db1e5038
[reactos.git] / reactos / dll / win32 / vbscript / regexp.c
1 /*
2 * Copyright 2008 Jacek Caban for CodeWeavers
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
17 */
18
19 /*
20 * Code in this file is based on files:
21 * js/src/jsregexp.h
22 * js/src/jsregexp.c
23 * from Mozilla project, released under LGPL 2.1 or later.
24 *
25 * The Original Code is Mozilla Communicator client code, released
26 * March 31, 1998.
27 *
28 * The Initial Developer of the Original Code is
29 * Netscape Communications Corporation.
30 * Portions created by the Initial Developer are Copyright (C) 1998
31 * the Initial Developer. All Rights Reserved.
32 */
33
34 #include "vbscript.h"
35
36 /* FIXME: Better error handling */
37 #define ReportRegExpError(a,b,c)
38 #define ReportRegExpErrorHelper(a,b,c,d)
39 #define JS_ReportErrorNumber(a,b,c,d)
40 #define JS_ReportErrorFlagsAndNumber(a,b,c,d,e,f)
41 #define js_ReportOutOfScriptQuota(a)
42 #define JS_ReportOutOfMemory(a)
43 #define JS_COUNT_OPERATION(a,b)
44
45
46 typedef BYTE JSPackedBool;
47
48 /*
49 * This struct holds a bitmap representation of a class from a regexp.
50 * There's a list of these referenced by the classList field in the regexp_t
51 * struct below. The initial state has startIndex set to the offset in the
52 * original regexp source of the beginning of the class contents. The first
53 * use of the class converts the source representation into a bitmap.
54 *
55 */
56 typedef struct RECharSet {
57 JSPackedBool converted;
58 JSPackedBool sense;
59 WORD length;
60 union {
61 BYTE *bits;
62 struct {
63 size_t startIndex;
64 size_t length;
65 } src;
66 } u;
67 } RECharSet;
68
69 #define JSMSG_MIN_TOO_BIG 47
70 #define JSMSG_MAX_TOO_BIG 48
71 #define JSMSG_OUT_OF_ORDER 49
72 #define JSMSG_OUT_OF_MEMORY 137
73
74 #define LINE_SEPARATOR 0x2028
75 #define PARA_SEPARATOR 0x2029
76
77 #define RE_IS_LETTER(c) (((c >= 'A') && (c <= 'Z')) || \
78 ((c >= 'a') && (c <= 'z')) )
79 #define RE_IS_LINE_TERM(c) ((c == '\n') || (c == '\r') || \
80 (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
81
82 #define JS_ISWORD(c) ((c) < 128 && (isalnum(c) || (c) == '_'))
83
84 #define JS7_ISDEC(c) ((((unsigned)(c)) - '0') <= 9)
85 #define JS7_UNDEC(c) ((c) - '0')
86
87 typedef enum REOp {
88 REOP_EMPTY,
89 REOP_BOL,
90 REOP_EOL,
91 REOP_WBDRY,
92 REOP_WNONBDRY,
93 REOP_DOT,
94 REOP_DIGIT,
95 REOP_NONDIGIT,
96 REOP_ALNUM,
97 REOP_NONALNUM,
98 REOP_SPACE,
99 REOP_NONSPACE,
100 REOP_BACKREF,
101 REOP_FLAT,
102 REOP_FLAT1,
103 REOP_FLATi,
104 REOP_FLAT1i,
105 REOP_UCFLAT1,
106 REOP_UCFLAT1i,
107 REOP_UCFLAT,
108 REOP_UCFLATi,
109 REOP_CLASS,
110 REOP_NCLASS,
111 REOP_ALT,
112 REOP_QUANT,
113 REOP_STAR,
114 REOP_PLUS,
115 REOP_OPT,
116 REOP_LPAREN,
117 REOP_RPAREN,
118 REOP_JUMP,
119 REOP_DOTSTAR,
120 REOP_LPARENNON,
121 REOP_ASSERT,
122 REOP_ASSERT_NOT,
123 REOP_ASSERTTEST,
124 REOP_ASSERTNOTTEST,
125 REOP_MINIMALSTAR,
126 REOP_MINIMALPLUS,
127 REOP_MINIMALOPT,
128 REOP_MINIMALQUANT,
129 REOP_ENDCHILD,
130 REOP_REPEAT,
131 REOP_MINIMALREPEAT,
132 REOP_ALTPREREQ,
133 REOP_ALTPREREQ2,
134 REOP_ENDALT,
135 REOP_CONCAT,
136 REOP_END,
137 REOP_LIMIT /* META: no operator >= to this */
138 } REOp;
139
140 #define REOP_IS_SIMPLE(op) ((op) <= REOP_NCLASS)
141
142 static const char *reop_names[] = {
143 "empty",
144 "bol",
145 "eol",
146 "wbdry",
147 "wnonbdry",
148 "dot",
149 "digit",
150 "nondigit",
151 "alnum",
152 "nonalnum",
153 "space",
154 "nonspace",
155 "backref",
156 "flat",
157 "flat1",
158 "flati",
159 "flat1i",
160 "ucflat1",
161 "ucflat1i",
162 "ucflat",
163 "ucflati",
164 "class",
165 "nclass",
166 "alt",
167 "quant",
168 "star",
169 "plus",
170 "opt",
171 "lparen",
172 "rparen",
173 "jump",
174 "dotstar",
175 "lparennon",
176 "assert",
177 "assert_not",
178 "asserttest",
179 "assertnottest",
180 "minimalstar",
181 "minimalplus",
182 "minimalopt",
183 "minimalquant",
184 "endchild",
185 "repeat",
186 "minimalrepeat",
187 "altprereq",
188 "alrprereq2",
189 "endalt",
190 "concat",
191 "end",
192 NULL
193 };
194
195 typedef struct REProgState {
196 jsbytecode *continue_pc; /* current continuation data */
197 jsbytecode continue_op;
198 ptrdiff_t index; /* progress in text */
199 size_t parenSoFar; /* highest indexed paren started */
200 union {
201 struct {
202 UINT min; /* current quantifier limits */
203 UINT max;
204 } quantifier;
205 struct {
206 size_t top; /* backtrack stack state */
207 size_t sz;
208 } assertion;
209 } u;
210 } REProgState;
211
212 typedef struct REBackTrackData {
213 size_t sz; /* size of previous stack entry */
214 jsbytecode *backtrack_pc; /* where to backtrack to */
215 jsbytecode backtrack_op;
216 const WCHAR *cp; /* index in text of match at backtrack */
217 size_t parenIndex; /* start index of saved paren contents */
218 size_t parenCount; /* # of saved paren contents */
219 size_t saveStateStackTop; /* number of parent states */
220 /* saved parent states follow */
221 /* saved paren contents follow */
222 } REBackTrackData;
223
224 #define INITIAL_STATESTACK 100
225 #define INITIAL_BACKTRACK 8000
226
227 typedef struct REGlobalData {
228 void *cx;
229 regexp_t *regexp; /* the RE in execution */
230 BOOL ok; /* runtime error (out_of_memory only?) */
231 size_t start; /* offset to start at */
232 ptrdiff_t skipped; /* chars skipped anchoring this r.e. */
233 const WCHAR *cpbegin; /* text base address */
234 const WCHAR *cpend; /* text limit address */
235
236 REProgState *stateStack; /* stack of state of current parents */
237 size_t stateStackTop;
238 size_t stateStackLimit;
239
240 REBackTrackData *backTrackStack;/* stack of matched-so-far positions */
241 REBackTrackData *backTrackSP;
242 size_t backTrackStackSize;
243 size_t cursz; /* size of current stack entry */
244 size_t backTrackCount; /* how many times we've backtracked */
245 size_t backTrackLimit; /* upper limit on backtrack states */
246
247 heap_pool_t *pool; /* It's faster to use one malloc'd pool
248 than to malloc/free the three items
249 that are allocated from this pool */
250 } REGlobalData;
251
252 typedef struct RENode RENode;
253 struct RENode {
254 REOp op; /* r.e. op bytecode */
255 RENode *next; /* next in concatenation order */
256 void *kid; /* first operand */
257 union {
258 void *kid2; /* second operand */
259 INT num; /* could be a number */
260 size_t parenIndex; /* or a parenthesis index */
261 struct { /* or a quantifier range */
262 UINT min;
263 UINT max;
264 JSPackedBool greedy;
265 } range;
266 struct { /* or a character class */
267 size_t startIndex;
268 size_t kidlen; /* length of string at kid, in jschars */
269 size_t index; /* index into class list */
270 WORD bmsize; /* bitmap size, based on max char code */
271 JSPackedBool sense;
272 } ucclass;
273 struct { /* or a literal sequence */
274 WCHAR chr; /* of one character */
275 size_t length; /* or many (via the kid) */
276 } flat;
277 struct {
278 RENode *kid2; /* second operand from ALT */
279 WCHAR ch1; /* match char for ALTPREREQ */
280 WCHAR ch2; /* ditto, or class index for ALTPREREQ2 */
281 } altprereq;
282 } u;
283 };
284
285 #define CLASS_CACHE_SIZE 4
286
287 typedef struct CompilerState {
288 void *context;
289 const WCHAR *cpbegin;
290 const WCHAR *cpend;
291 const WCHAR *cp;
292 size_t parenCount;
293 size_t classCount; /* number of [] encountered */
294 size_t treeDepth; /* maximum depth of parse tree */
295 size_t progLength; /* estimated bytecode length */
296 RENode *result;
297 size_t classBitmapsMem; /* memory to hold all class bitmaps */
298 struct {
299 const WCHAR *start; /* small cache of class strings */
300 size_t length; /* since they're often the same */
301 size_t index;
302 } classCache[CLASS_CACHE_SIZE];
303 WORD flags;
304
305 heap_pool_t *pool; /* It's faster to use one malloc'd pool
306 than to malloc/free */
307 } CompilerState;
308
309 typedef struct EmitStateStackEntry {
310 jsbytecode *altHead; /* start of REOP_ALT* opcode */
311 jsbytecode *nextAltFixup; /* fixup pointer to next-alt offset */
312 jsbytecode *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */
313 jsbytecode *endTermFixup; /* fixup ptr. to REOPT_ALTPREREQ* offset */
314 RENode *continueNode; /* original REOP_ALT* node being stacked */
315 jsbytecode continueOp; /* REOP_JUMP or REOP_ENDALT continuation */
316 JSPackedBool jumpToJumpFlag; /* true if we've patched jump-to-jump to
317 avoid 16-bit unsigned offset overflow */
318 } EmitStateStackEntry;
319
320 /*
321 * Immediate operand sizes and getter/setters. Unlike the ones in jsopcode.h,
322 * the getters and setters take the pc of the offset, not of the opcode before
323 * the offset.
324 */
325 #define ARG_LEN 2
326 #define GET_ARG(pc) ((WORD)(((pc)[0] << 8) | (pc)[1]))
327 #define SET_ARG(pc, arg) ((pc)[0] = (jsbytecode) ((arg) >> 8), \
328 (pc)[1] = (jsbytecode) (arg))
329
330 #define OFFSET_LEN ARG_LEN
331 #define OFFSET_MAX ((1 << (ARG_LEN * 8)) - 1)
332 #define GET_OFFSET(pc) GET_ARG(pc)
333
334 static BOOL ParseRegExp(CompilerState*);
335
336 /*
337 * Maximum supported tree depth is maximum size of EmitStateStackEntry stack.
338 * For sanity, we limit it to 2^24 bytes.
339 */
340 #define TREE_DEPTH_MAX ((1 << 24) / sizeof(EmitStateStackEntry))
341
342 /*
343 * The maximum memory that can be allocated for class bitmaps.
344 * For sanity, we limit it to 2^24 bytes.
345 */
346 #define CLASS_BITMAPS_MEM_LIMIT (1 << 24)
347
348 /*
349 * Functions to get size and write/read bytecode that represent small indexes
350 * compactly.
351 * Each byte in the code represent 7-bit chunk of the index. 8th bit when set
352 * indicates that the following byte brings more bits to the index. Otherwise
353 * this is the last byte in the index bytecode representing highest index bits.
354 */
355 static size_t
356 GetCompactIndexWidth(size_t index)
357 {
358 size_t width;
359
360 for (width = 1; (index >>= 7) != 0; ++width) { }
361 return width;
362 }
363
364 static inline jsbytecode *
365 WriteCompactIndex(jsbytecode *pc, size_t index)
366 {
367 size_t next;
368
369 while ((next = index >> 7) != 0) {
370 *pc++ = (jsbytecode)(index | 0x80);
371 index = next;
372 }
373 *pc++ = (jsbytecode)index;
374 return pc;
375 }
376
377 static inline jsbytecode *
378 ReadCompactIndex(jsbytecode *pc, size_t *result)
379 {
380 size_t nextByte;
381
382 nextByte = *pc++;
383 if ((nextByte & 0x80) == 0) {
384 /*
385 * Short-circuit the most common case when compact index <= 127.
386 */
387 *result = nextByte;
388 } else {
389 size_t shift = 7;
390 *result = 0x7F & nextByte;
391 do {
392 nextByte = *pc++;
393 *result |= (nextByte & 0x7F) << shift;
394 shift += 7;
395 } while ((nextByte & 0x80) != 0);
396 }
397 return pc;
398 }
399
400 /* Construct and initialize an RENode, returning NULL for out-of-memory */
401 static RENode *
402 NewRENode(CompilerState *state, REOp op)
403 {
404 RENode *ren;
405
406 ren = heap_pool_alloc(state->pool, sizeof(*ren));
407 if (!ren) {
408 /* js_ReportOutOfScriptQuota(cx); */
409 return NULL;
410 }
411 ren->op = op;
412 ren->next = NULL;
413 ren->kid = NULL;
414 return ren;
415 }
416
417 /*
418 * Validates and converts hex ascii value.
419 */
420 static BOOL
421 isASCIIHexDigit(WCHAR c, UINT *digit)
422 {
423 UINT cv = c;
424
425 if (cv < '0')
426 return FALSE;
427 if (cv <= '9') {
428 *digit = cv - '0';
429 return TRUE;
430 }
431 cv |= 0x20;
432 if (cv >= 'a' && cv <= 'f') {
433 *digit = cv - 'a' + 10;
434 return TRUE;
435 }
436 return FALSE;
437 }
438
439 typedef struct {
440 REOp op;
441 const WCHAR *errPos;
442 size_t parenIndex;
443 } REOpData;
444
445 #define JUMP_OFFSET_HI(off) ((jsbytecode)((off) >> 8))
446 #define JUMP_OFFSET_LO(off) ((jsbytecode)(off))
447
448 static BOOL
449 SetForwardJumpOffset(jsbytecode *jump, jsbytecode *target)
450 {
451 ptrdiff_t offset = target - jump;
452
453 /* Check that target really points forward. */
454 assert(offset >= 2);
455 if ((size_t)offset > OFFSET_MAX)
456 return FALSE;
457
458 jump[0] = JUMP_OFFSET_HI(offset);
459 jump[1] = JUMP_OFFSET_LO(offset);
460 return TRUE;
461 }
462
463 /*
464 * Generate bytecode for the tree rooted at t using an explicit stack instead
465 * of recursion.
466 */
467 static jsbytecode *
468 EmitREBytecode(CompilerState *state, regexp_t *re, size_t treeDepth,
469 jsbytecode *pc, RENode *t)
470 {
471 EmitStateStackEntry *emitStateSP, *emitStateStack;
472 RECharSet *charSet;
473 REOp op;
474
475 if (treeDepth == 0) {
476 emitStateStack = NULL;
477 } else {
478 emitStateStack = heap_alloc(sizeof(EmitStateStackEntry) * treeDepth);
479 if (!emitStateStack)
480 return NULL;
481 }
482 emitStateSP = emitStateStack;
483 op = t->op;
484 assert(op < REOP_LIMIT);
485
486 for (;;) {
487 *pc++ = op;
488 switch (op) {
489 case REOP_EMPTY:
490 --pc;
491 break;
492
493 case REOP_ALTPREREQ2:
494 case REOP_ALTPREREQ:
495 assert(emitStateSP);
496 emitStateSP->altHead = pc - 1;
497 emitStateSP->endTermFixup = pc;
498 pc += OFFSET_LEN;
499 SET_ARG(pc, t->u.altprereq.ch1);
500 pc += ARG_LEN;
501 SET_ARG(pc, t->u.altprereq.ch2);
502 pc += ARG_LEN;
503
504 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
505 pc += OFFSET_LEN;
506
507 emitStateSP->continueNode = t;
508 emitStateSP->continueOp = REOP_JUMP;
509 emitStateSP->jumpToJumpFlag = FALSE;
510 ++emitStateSP;
511 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
512 t = t->kid;
513 op = t->op;
514 assert(op < REOP_LIMIT);
515 continue;
516
517 case REOP_JUMP:
518 emitStateSP->nextTermFixup = pc; /* offset to following term */
519 pc += OFFSET_LEN;
520 if (!SetForwardJumpOffset(emitStateSP->nextAltFixup, pc))
521 goto jump_too_big;
522 emitStateSP->continueOp = REOP_ENDALT;
523 ++emitStateSP;
524 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
525 t = t->u.kid2;
526 op = t->op;
527 assert(op < REOP_LIMIT);
528 continue;
529
530 case REOP_ENDALT:
531 /*
532 * If we already patched emitStateSP->nextTermFixup to jump to
533 * a nearer jump, to avoid 16-bit immediate offset overflow, we
534 * are done here.
535 */
536 if (emitStateSP->jumpToJumpFlag)
537 break;
538
539 /*
540 * Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT.
541 * REOP_ENDALT is executed only on successful match of the last
542 * alternate in a group.
543 */
544 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
545 goto jump_too_big;
546 if (t->op != REOP_ALT) {
547 if (!SetForwardJumpOffset(emitStateSP->endTermFixup, pc))
548 goto jump_too_big;
549 }
550
551 /*
552 * If the program is bigger than the REOP_JUMP offset range, then
553 * we must check for alternates before this one that are part of
554 * the same group, and fix up their jump offsets to target jumps
555 * close enough to fit in a 16-bit unsigned offset immediate.
556 */
557 if ((size_t)(pc - re->program) > OFFSET_MAX &&
558 emitStateSP > emitStateStack) {
559 EmitStateStackEntry *esp, *esp2;
560 jsbytecode *alt, *jump;
561 ptrdiff_t span, header;
562
563 esp2 = emitStateSP;
564 alt = esp2->altHead;
565 for (esp = esp2 - 1; esp >= emitStateStack; --esp) {
566 if (esp->continueOp == REOP_ENDALT &&
567 !esp->jumpToJumpFlag &&
568 esp->nextTermFixup + OFFSET_LEN == alt &&
569 (size_t)(pc - ((esp->continueNode->op != REOP_ALT)
570 ? esp->endTermFixup
571 : esp->nextTermFixup)) > OFFSET_MAX) {
572 alt = esp->altHead;
573 jump = esp->nextTermFixup;
574
575 /*
576 * The span must be 1 less than the distance from
577 * jump offset to jump offset, so we actually jump
578 * to a REOP_JUMP bytecode, not to its offset!
579 */
580 for (;;) {
581 assert(jump < esp2->nextTermFixup);
582 span = esp2->nextTermFixup - jump - 1;
583 if ((size_t)span <= OFFSET_MAX)
584 break;
585 do {
586 if (--esp2 == esp)
587 goto jump_too_big;
588 } while (esp2->continueOp != REOP_ENDALT);
589 }
590
591 jump[0] = JUMP_OFFSET_HI(span);
592 jump[1] = JUMP_OFFSET_LO(span);
593
594 if (esp->continueNode->op != REOP_ALT) {
595 /*
596 * We must patch the offset at esp->endTermFixup
597 * as well, for the REOP_ALTPREREQ{,2} opcodes.
598 * If we're unlucky and endTermFixup is more than
599 * OFFSET_MAX bytes from its target, we cheat by
600 * jumping 6 bytes to the jump whose offset is at
601 * esp->nextTermFixup, which has the same target.
602 */
603 jump = esp->endTermFixup;
604 header = esp->nextTermFixup - jump;
605 span += header;
606 if ((size_t)span > OFFSET_MAX)
607 span = header;
608
609 jump[0] = JUMP_OFFSET_HI(span);
610 jump[1] = JUMP_OFFSET_LO(span);
611 }
612
613 esp->jumpToJumpFlag = TRUE;
614 }
615 }
616 }
617 break;
618
619 case REOP_ALT:
620 assert(emitStateSP);
621 emitStateSP->altHead = pc - 1;
622 emitStateSP->nextAltFixup = pc; /* offset to next alternate */
623 pc += OFFSET_LEN;
624 emitStateSP->continueNode = t;
625 emitStateSP->continueOp = REOP_JUMP;
626 emitStateSP->jumpToJumpFlag = FALSE;
627 ++emitStateSP;
628 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
629 t = t->kid;
630 op = t->op;
631 assert(op < REOP_LIMIT);
632 continue;
633
634 case REOP_FLAT:
635 /*
636 * Coalesce FLATs if possible and if it would not increase bytecode
637 * beyond preallocated limit. The latter happens only when bytecode
638 * size for coalesced string with offset p and length 2 exceeds 6
639 * bytes preallocated for 2 single char nodes, i.e. when
640 * 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or
641 * GetCompactIndexWidth(p) > 4.
642 * Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more
643 * nodes strictly decreases bytecode size, the check has to be
644 * done only for the first coalescing.
645 */
646 if (t->kid &&
647 GetCompactIndexWidth((WCHAR*)t->kid - state->cpbegin) <= 4)
648 {
649 while (t->next &&
650 t->next->op == REOP_FLAT &&
651 (WCHAR*)t->kid + t->u.flat.length ==
652 t->next->kid) {
653 t->u.flat.length += t->next->u.flat.length;
654 t->next = t->next->next;
655 }
656 }
657 if (t->kid && t->u.flat.length > 1) {
658 pc[-1] = (state->flags & REG_FOLD) ? REOP_FLATi : REOP_FLAT;
659 pc = WriteCompactIndex(pc, (WCHAR*)t->kid - state->cpbegin);
660 pc = WriteCompactIndex(pc, t->u.flat.length);
661 } else if (t->u.flat.chr < 256) {
662 pc[-1] = (state->flags & REG_FOLD) ? REOP_FLAT1i : REOP_FLAT1;
663 *pc++ = (jsbytecode) t->u.flat.chr;
664 } else {
665 pc[-1] = (state->flags & REG_FOLD)
666 ? REOP_UCFLAT1i
667 : REOP_UCFLAT1;
668 SET_ARG(pc, t->u.flat.chr);
669 pc += ARG_LEN;
670 }
671 break;
672
673 case REOP_LPAREN:
674 assert(emitStateSP);
675 pc = WriteCompactIndex(pc, t->u.parenIndex);
676 emitStateSP->continueNode = t;
677 emitStateSP->continueOp = REOP_RPAREN;
678 ++emitStateSP;
679 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
680 t = t->kid;
681 op = t->op;
682 continue;
683
684 case REOP_RPAREN:
685 pc = WriteCompactIndex(pc, t->u.parenIndex);
686 break;
687
688 case REOP_BACKREF:
689 pc = WriteCompactIndex(pc, t->u.parenIndex);
690 break;
691
692 case REOP_ASSERT:
693 assert(emitStateSP);
694 emitStateSP->nextTermFixup = pc;
695 pc += OFFSET_LEN;
696 emitStateSP->continueNode = t;
697 emitStateSP->continueOp = REOP_ASSERTTEST;
698 ++emitStateSP;
699 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
700 t = t->kid;
701 op = t->op;
702 continue;
703
704 case REOP_ASSERTTEST:
705 case REOP_ASSERTNOTTEST:
706 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
707 goto jump_too_big;
708 break;
709
710 case REOP_ASSERT_NOT:
711 assert(emitStateSP);
712 emitStateSP->nextTermFixup = pc;
713 pc += OFFSET_LEN;
714 emitStateSP->continueNode = t;
715 emitStateSP->continueOp = REOP_ASSERTNOTTEST;
716 ++emitStateSP;
717 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
718 t = t->kid;
719 op = t->op;
720 continue;
721
722 case REOP_QUANT:
723 assert(emitStateSP);
724 if (t->u.range.min == 0 && t->u.range.max == (UINT)-1) {
725 pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR;
726 } else if (t->u.range.min == 0 && t->u.range.max == 1) {
727 pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT;
728 } else if (t->u.range.min == 1 && t->u.range.max == (UINT) -1) {
729 pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS;
730 } else {
731 if (!t->u.range.greedy)
732 pc[-1] = REOP_MINIMALQUANT;
733 pc = WriteCompactIndex(pc, t->u.range.min);
734 /*
735 * Write max + 1 to avoid using size_t(max) + 1 bytes
736 * for (UINT)-1 sentinel.
737 */
738 pc = WriteCompactIndex(pc, t->u.range.max + 1);
739 }
740 emitStateSP->nextTermFixup = pc;
741 pc += OFFSET_LEN;
742 emitStateSP->continueNode = t;
743 emitStateSP->continueOp = REOP_ENDCHILD;
744 ++emitStateSP;
745 assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
746 t = t->kid;
747 op = t->op;
748 continue;
749
750 case REOP_ENDCHILD:
751 if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
752 goto jump_too_big;
753 break;
754
755 case REOP_CLASS:
756 if (!t->u.ucclass.sense)
757 pc[-1] = REOP_NCLASS;
758 pc = WriteCompactIndex(pc, t->u.ucclass.index);
759 charSet = &re->classList[t->u.ucclass.index];
760 charSet->converted = FALSE;
761 charSet->length = t->u.ucclass.bmsize;
762 charSet->u.src.startIndex = t->u.ucclass.startIndex;
763 charSet->u.src.length = t->u.ucclass.kidlen;
764 charSet->sense = t->u.ucclass.sense;
765 break;
766
767 default:
768 break;
769 }
770
771 t = t->next;
772 if (t) {
773 op = t->op;
774 } else {
775 if (emitStateSP == emitStateStack)
776 break;
777 --emitStateSP;
778 t = emitStateSP->continueNode;
779 op = (REOp) emitStateSP->continueOp;
780 }
781 }
782
783 cleanup:
784 heap_free(emitStateStack);
785 return pc;
786
787 jump_too_big:
788 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
789 pc = NULL;
790 goto cleanup;
791 }
792
793 /*
794 * Process the op against the two top operands, reducing them to a single
795 * operand in the penultimate slot. Update progLength and treeDepth.
796 */
797 static BOOL
798 ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack,
799 INT operandSP)
800 {
801 RENode *result;
802
803 switch (opData->op) {
804 case REOP_ALT:
805 result = NewRENode(state, REOP_ALT);
806 if (!result)
807 return FALSE;
808 result->kid = operandStack[operandSP - 2];
809 result->u.kid2 = operandStack[operandSP - 1];
810 operandStack[operandSP - 2] = result;
811
812 if (state->treeDepth == TREE_DEPTH_MAX) {
813 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
814 return FALSE;
815 }
816 ++state->treeDepth;
817
818 /*
819 * Look at both alternates to see if there's a FLAT or a CLASS at
820 * the start of each. If so, use a prerequisite match.
821 */
822 if (((RENode *) result->kid)->op == REOP_FLAT &&
823 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
824 (state->flags & REG_FOLD) == 0) {
825 result->op = REOP_ALTPREREQ;
826 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
827 result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr;
828 /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
829 JUMP, <end> ... ENDALT */
830 state->progLength += 13;
831 }
832 else
833 if (((RENode *) result->kid)->op == REOP_CLASS &&
834 ((RENode *) result->kid)->u.ucclass.index < 256 &&
835 ((RENode *) result->u.kid2)->op == REOP_FLAT &&
836 (state->flags & REG_FOLD) == 0) {
837 result->op = REOP_ALTPREREQ2;
838 result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr;
839 result->u.altprereq.ch2 = ((RENode *) result->kid)->u.ucclass.index;
840 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
841 JUMP, <end> ... ENDALT */
842 state->progLength += 13;
843 }
844 else
845 if (((RENode *) result->kid)->op == REOP_FLAT &&
846 ((RENode *) result->u.kid2)->op == REOP_CLASS &&
847 ((RENode *) result->u.kid2)->u.ucclass.index < 256 &&
848 (state->flags & REG_FOLD) == 0) {
849 result->op = REOP_ALTPREREQ2;
850 result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
851 result->u.altprereq.ch2 =
852 ((RENode *) result->u.kid2)->u.ucclass.index;
853 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
854 JUMP, <end> ... ENDALT */
855 state->progLength += 13;
856 }
857 else {
858 /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
859 state->progLength += 7;
860 }
861 break;
862
863 case REOP_CONCAT:
864 result = operandStack[operandSP - 2];
865 while (result->next)
866 result = result->next;
867 result->next = operandStack[operandSP - 1];
868 break;
869
870 case REOP_ASSERT:
871 case REOP_ASSERT_NOT:
872 case REOP_LPARENNON:
873 case REOP_LPAREN:
874 /* These should have been processed by a close paren. */
875 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_MISSING_PAREN,
876 opData->errPos);
877 return FALSE;
878
879 default:;
880 }
881 return TRUE;
882 }
883
884 /*
885 * Hack two bits in CompilerState.flags, for use within FindParenCount to flag
886 * its being on the stack, and to propagate errors to its callers.
887 */
888 #define JSREG_FIND_PAREN_COUNT 0x8000
889 #define JSREG_FIND_PAREN_ERROR 0x4000
890
891 /*
892 * Magic return value from FindParenCount and GetDecimalValue, to indicate
893 * overflow beyond GetDecimalValue's max parameter, or a computed maximum if
894 * its findMax parameter is non-null.
895 */
896 #define OVERFLOW_VALUE ((UINT)-1)
897
898 static UINT
899 FindParenCount(CompilerState *state)
900 {
901 CompilerState temp;
902 int i;
903
904 if (state->flags & JSREG_FIND_PAREN_COUNT)
905 return OVERFLOW_VALUE;
906
907 /*
908 * Copy state into temp, flag it so we never report an invalid backref,
909 * and reset its members to parse the entire regexp. This is obviously
910 * suboptimal, but GetDecimalValue calls us only if a backref appears to
911 * refer to a forward parenthetical, which is rare.
912 */
913 temp = *state;
914 temp.flags |= JSREG_FIND_PAREN_COUNT;
915 temp.cp = temp.cpbegin;
916 temp.parenCount = 0;
917 temp.classCount = 0;
918 temp.progLength = 0;
919 temp.treeDepth = 0;
920 temp.classBitmapsMem = 0;
921 for (i = 0; i < CLASS_CACHE_SIZE; i++)
922 temp.classCache[i].start = NULL;
923
924 if (!ParseRegExp(&temp)) {
925 state->flags |= JSREG_FIND_PAREN_ERROR;
926 return OVERFLOW_VALUE;
927 }
928 return temp.parenCount;
929 }
930
931 /*
932 * Extract and return a decimal value at state->cp. The initial character c
933 * has already been read. Return OVERFLOW_VALUE if the result exceeds max.
934 * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
935 * state->flags to discover whether an error occurred under findMax.
936 */
937 static UINT
938 GetDecimalValue(WCHAR c, UINT max, UINT (*findMax)(CompilerState *state),
939 CompilerState *state)
940 {
941 UINT value = JS7_UNDEC(c);
942 BOOL overflow = (value > max && (!findMax || value > findMax(state)));
943
944 /* The following restriction allows simpler overflow checks. */
945 assert(max <= ((UINT)-1 - 9) / 10);
946 while (state->cp < state->cpend) {
947 c = *state->cp;
948 if (!JS7_ISDEC(c))
949 break;
950 value = 10 * value + JS7_UNDEC(c);
951 if (!overflow && value > max && (!findMax || value > findMax(state)))
952 overflow = TRUE;
953 ++state->cp;
954 }
955 return overflow ? OVERFLOW_VALUE : value;
956 }
957
958 /*
959 * Calculate the total size of the bitmap required for a class expression.
960 */
961 static BOOL
962 CalculateBitmapSize(CompilerState *state, RENode *target, const WCHAR *src,
963 const WCHAR *end)
964 {
965 UINT max = 0;
966 BOOL inRange = FALSE;
967 WCHAR c, rangeStart = 0;
968 UINT n, digit, nDigits, i;
969
970 target->u.ucclass.bmsize = 0;
971 target->u.ucclass.sense = TRUE;
972
973 if (src == end)
974 return TRUE;
975
976 if (*src == '^') {
977 ++src;
978 target->u.ucclass.sense = FALSE;
979 }
980
981 while (src != end) {
982 BOOL canStartRange = TRUE;
983 UINT localMax = 0;
984
985 switch (*src) {
986 case '\\':
987 ++src;
988 c = *src++;
989 switch (c) {
990 case 'b':
991 localMax = 0x8;
992 break;
993 case 'f':
994 localMax = 0xC;
995 break;
996 case 'n':
997 localMax = 0xA;
998 break;
999 case 'r':
1000 localMax = 0xD;
1001 break;
1002 case 't':
1003 localMax = 0x9;
1004 break;
1005 case 'v':
1006 localMax = 0xB;
1007 break;
1008 case 'c':
1009 if (src < end && RE_IS_LETTER(*src)) {
1010 localMax = (UINT) (*src++) & 0x1F;
1011 } else {
1012 --src;
1013 localMax = '\\';
1014 }
1015 break;
1016 case 'x':
1017 nDigits = 2;
1018 goto lexHex;
1019 case 'u':
1020 nDigits = 4;
1021 lexHex:
1022 n = 0;
1023 for (i = 0; (i < nDigits) && (src < end); i++) {
1024 c = *src++;
1025 if (!isASCIIHexDigit(c, &digit)) {
1026 /*
1027 * Back off to accepting the original
1028 *'\' as a literal.
1029 */
1030 src -= i + 1;
1031 n = '\\';
1032 break;
1033 }
1034 n = (n << 4) | digit;
1035 }
1036 localMax = n;
1037 break;
1038 case 'd':
1039 canStartRange = FALSE;
1040 if (inRange) {
1041 JS_ReportErrorNumber(state->context,
1042 js_GetErrorMessage, NULL,
1043 JSMSG_BAD_CLASS_RANGE);
1044 return FALSE;
1045 }
1046 localMax = '9';
1047 break;
1048 case 'D':
1049 case 's':
1050 case 'S':
1051 case 'w':
1052 case 'W':
1053 canStartRange = FALSE;
1054 if (inRange) {
1055 JS_ReportErrorNumber(state->context,
1056 js_GetErrorMessage, NULL,
1057 JSMSG_BAD_CLASS_RANGE);
1058 return FALSE;
1059 }
1060 max = 65535;
1061
1062 /*
1063 * If this is the start of a range, ensure that it's less than
1064 * the end.
1065 */
1066 localMax = 0;
1067 break;
1068 case '0':
1069 case '1':
1070 case '2':
1071 case '3':
1072 case '4':
1073 case '5':
1074 case '6':
1075 case '7':
1076 /*
1077 * This is a non-ECMA extension - decimal escapes (in this
1078 * case, octal!) are supposed to be an error inside class
1079 * ranges, but supported here for backwards compatibility.
1080 *
1081 */
1082 n = JS7_UNDEC(c);
1083 c = *src;
1084 if ('0' <= c && c <= '7') {
1085 src++;
1086 n = 8 * n + JS7_UNDEC(c);
1087 c = *src;
1088 if ('0' <= c && c <= '7') {
1089 src++;
1090 i = 8 * n + JS7_UNDEC(c);
1091 if (i <= 0377)
1092 n = i;
1093 else
1094 src--;
1095 }
1096 }
1097 localMax = n;
1098 break;
1099
1100 default:
1101 localMax = c;
1102 break;
1103 }
1104 break;
1105 default:
1106 localMax = *src++;
1107 break;
1108 }
1109
1110 if (inRange) {
1111 /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
1112 if (rangeStart > localMax) {
1113 JS_ReportErrorNumber(state->context,
1114 js_GetErrorMessage, NULL,
1115 JSMSG_BAD_CLASS_RANGE);
1116 return FALSE;
1117 }
1118 inRange = FALSE;
1119 } else {
1120 if (canStartRange && src < end - 1) {
1121 if (*src == '-') {
1122 ++src;
1123 inRange = TRUE;
1124 rangeStart = (WCHAR)localMax;
1125 continue;
1126 }
1127 }
1128 if (state->flags & REG_FOLD)
1129 rangeStart = localMax; /* one run of the uc/dc loop below */
1130 }
1131
1132 if (state->flags & REG_FOLD) {
1133 WCHAR maxch = localMax;
1134
1135 for (i = rangeStart; i <= localMax; i++) {
1136 WCHAR uch, dch;
1137
1138 uch = toupperW(i);
1139 dch = tolowerW(i);
1140 if(maxch < uch)
1141 maxch = uch;
1142 if(maxch < dch)
1143 maxch = dch;
1144 }
1145 localMax = maxch;
1146 }
1147
1148 if (localMax > max)
1149 max = localMax;
1150 }
1151 target->u.ucclass.bmsize = max;
1152 return TRUE;
1153 }
1154
1155 static INT
1156 ParseMinMaxQuantifier(CompilerState *state, BOOL ignoreValues)
1157 {
1158 UINT min, max;
1159 WCHAR c;
1160 const WCHAR *errp = state->cp++;
1161
1162 c = *state->cp;
1163 if (JS7_ISDEC(c)) {
1164 ++state->cp;
1165 min = GetDecimalValue(c, 0xFFFF, NULL, state);
1166 c = *state->cp;
1167
1168 if (!ignoreValues && min == OVERFLOW_VALUE)
1169 return JSMSG_MIN_TOO_BIG;
1170
1171 if (c == ',') {
1172 c = *++state->cp;
1173 if (JS7_ISDEC(c)) {
1174 ++state->cp;
1175 max = GetDecimalValue(c, 0xFFFF, NULL, state);
1176 c = *state->cp;
1177 if (!ignoreValues && max == OVERFLOW_VALUE)
1178 return JSMSG_MAX_TOO_BIG;
1179 if (!ignoreValues && min > max)
1180 return JSMSG_OUT_OF_ORDER;
1181 } else {
1182 max = (UINT)-1;
1183 }
1184 } else {
1185 max = min;
1186 }
1187 if (c == '}') {
1188 state->result = NewRENode(state, REOP_QUANT);
1189 if (!state->result)
1190 return JSMSG_OUT_OF_MEMORY;
1191 state->result->u.range.min = min;
1192 state->result->u.range.max = max;
1193 /*
1194 * QUANT, <min>, <max>, <next> ... <ENDCHILD>
1195 * where <max> is written as compact(max+1) to make
1196 * (UINT)-1 sentinel to occupy 1 byte, not width_of(max)+1.
1197 */
1198 state->progLength += (1 + GetCompactIndexWidth(min)
1199 + GetCompactIndexWidth(max + 1)
1200 +3);
1201 return 0;
1202 }
1203 }
1204
1205 state->cp = errp;
1206 return -1;
1207 }
1208
1209 static BOOL
1210 ParseQuantifier(CompilerState *state)
1211 {
1212 RENode *term;
1213 term = state->result;
1214 if (state->cp < state->cpend) {
1215 switch (*state->cp) {
1216 case '+':
1217 state->result = NewRENode(state, REOP_QUANT);
1218 if (!state->result)
1219 return FALSE;
1220 state->result->u.range.min = 1;
1221 state->result->u.range.max = (UINT)-1;
1222 /* <PLUS>, <next> ... <ENDCHILD> */
1223 state->progLength += 4;
1224 goto quantifier;
1225 case '*':
1226 state->result = NewRENode(state, REOP_QUANT);
1227 if (!state->result)
1228 return FALSE;
1229 state->result->u.range.min = 0;
1230 state->result->u.range.max = (UINT)-1;
1231 /* <STAR>, <next> ... <ENDCHILD> */
1232 state->progLength += 4;
1233 goto quantifier;
1234 case '?':
1235 state->result = NewRENode(state, REOP_QUANT);
1236 if (!state->result)
1237 return FALSE;
1238 state->result->u.range.min = 0;
1239 state->result->u.range.max = 1;
1240 /* <OPT>, <next> ... <ENDCHILD> */
1241 state->progLength += 4;
1242 goto quantifier;
1243 case '{': /* balance '}' */
1244 {
1245 INT err;
1246
1247 err = ParseMinMaxQuantifier(state, FALSE);
1248 if (err == 0)
1249 goto quantifier;
1250 if (err == -1)
1251 return TRUE;
1252
1253 ReportRegExpErrorHelper(state, JSREPORT_ERROR, err, errp);
1254 return FALSE;
1255 }
1256 default:;
1257 }
1258 }
1259 return TRUE;
1260
1261 quantifier:
1262 if (state->treeDepth == TREE_DEPTH_MAX) {
1263 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1264 return FALSE;
1265 }
1266
1267 ++state->treeDepth;
1268 ++state->cp;
1269 state->result->kid = term;
1270 if (state->cp < state->cpend && *state->cp == '?') {
1271 ++state->cp;
1272 state->result->u.range.greedy = FALSE;
1273 } else {
1274 state->result->u.range.greedy = TRUE;
1275 }
1276 return TRUE;
1277 }
1278
1279 /*
1280 * item: assertion An item is either an assertion or
1281 * quantatom a quantified atom.
1282 *
1283 * assertion: '^' Assertions match beginning of string
1284 * (or line if the class static property
1285 * RegExp.multiline is true).
1286 * '$' End of string (or line if the class
1287 * static property RegExp.multiline is
1288 * true).
1289 * '\b' Word boundary (between \w and \W).
1290 * '\B' Word non-boundary.
1291 *
1292 * quantatom: atom An unquantified atom.
1293 * quantatom '{' n ',' m '}'
1294 * Atom must occur between n and m times.
1295 * quantatom '{' n ',' '}' Atom must occur at least n times.
1296 * quantatom '{' n '}' Atom must occur exactly n times.
1297 * quantatom '*' Zero or more times (same as {0,}).
1298 * quantatom '+' One or more times (same as {1,}).
1299 * quantatom '?' Zero or one time (same as {0,1}).
1300 *
1301 * any of which can be optionally followed by '?' for ungreedy
1302 *
1303 * atom: '(' regexp ')' A parenthesized regexp (what matched
1304 * can be addressed using a backreference,
1305 * see '\' n below).
1306 * '.' Matches any char except '\n'.
1307 * '[' classlist ']' A character class.
1308 * '[' '^' classlist ']' A negated character class.
1309 * '\f' Form Feed.
1310 * '\n' Newline (Line Feed).
1311 * '\r' Carriage Return.
1312 * '\t' Horizontal Tab.
1313 * '\v' Vertical Tab.
1314 * '\d' A digit (same as [0-9]).
1315 * '\D' A non-digit.
1316 * '\w' A word character, [0-9a-z_A-Z].
1317 * '\W' A non-word character.
1318 * '\s' A whitespace character, [ \b\f\n\r\t\v].
1319 * '\S' A non-whitespace character.
1320 * '\' n A backreference to the nth (n decimal
1321 * and positive) parenthesized expression.
1322 * '\' octal An octal escape sequence (octal must be
1323 * two or three digits long, unless it is
1324 * 0 for the null character).
1325 * '\x' hex A hex escape (hex must be two digits).
1326 * '\u' unicode A unicode escape (must be four digits).
1327 * '\c' ctrl A control character, ctrl is a letter.
1328 * '\' literalatomchar Any character except one of the above
1329 * that follow '\' in an atom.
1330 * otheratomchar Any character not first among the other
1331 * atom right-hand sides.
1332 */
1333 static BOOL
1334 ParseTerm(CompilerState *state)
1335 {
1336 WCHAR c = *state->cp++;
1337 UINT nDigits;
1338 UINT num, tmp, n, i;
1339 const WCHAR *termStart;
1340
1341 switch (c) {
1342 /* assertions and atoms */
1343 case '^':
1344 state->result = NewRENode(state, REOP_BOL);
1345 if (!state->result)
1346 return FALSE;
1347 state->progLength++;
1348 return TRUE;
1349 case '$':
1350 state->result = NewRENode(state, REOP_EOL);
1351 if (!state->result)
1352 return FALSE;
1353 state->progLength++;
1354 return TRUE;
1355 case '\\':
1356 if (state->cp >= state->cpend) {
1357 /* a trailing '\' is an error */
1358 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_TRAILING_SLASH);
1359 return FALSE;
1360 }
1361 c = *state->cp++;
1362 switch (c) {
1363 /* assertion escapes */
1364 case 'b' :
1365 state->result = NewRENode(state, REOP_WBDRY);
1366 if (!state->result)
1367 return FALSE;
1368 state->progLength++;
1369 return TRUE;
1370 case 'B':
1371 state->result = NewRENode(state, REOP_WNONBDRY);
1372 if (!state->result)
1373 return FALSE;
1374 state->progLength++;
1375 return TRUE;
1376 /* Decimal escape */
1377 case '0':
1378 /* Give a strict warning. See also the note below. */
1379 WARN("non-octal digit in an escape sequence that doesn't match a back-reference\n");
1380 doOctal:
1381 num = 0;
1382 while (state->cp < state->cpend) {
1383 c = *state->cp;
1384 if (c < '0' || '7' < c)
1385 break;
1386 state->cp++;
1387 tmp = 8 * num + (UINT)JS7_UNDEC(c);
1388 if (tmp > 0377)
1389 break;
1390 num = tmp;
1391 }
1392 c = (WCHAR)num;
1393 doFlat:
1394 state->result = NewRENode(state, REOP_FLAT);
1395 if (!state->result)
1396 return FALSE;
1397 state->result->u.flat.chr = c;
1398 state->result->u.flat.length = 1;
1399 state->progLength += 3;
1400 break;
1401 case '1':
1402 case '2':
1403 case '3':
1404 case '4':
1405 case '5':
1406 case '6':
1407 case '7':
1408 case '8':
1409 case '9':
1410 termStart = state->cp - 1;
1411 num = GetDecimalValue(c, state->parenCount, FindParenCount, state);
1412 if (state->flags & JSREG_FIND_PAREN_ERROR)
1413 return FALSE;
1414 if (num == OVERFLOW_VALUE) {
1415 /* Give a strict mode warning. */
1416 WARN("back-reference exceeds number of capturing parentheses\n");
1417
1418 /*
1419 * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax
1420 * error here. However, for compatibility with IE, we treat the
1421 * whole backref as flat if the first character in it is not a
1422 * valid octal character, and as an octal escape otherwise.
1423 */
1424 state->cp = termStart;
1425 if (c >= '8') {
1426 /* Treat this as flat. termStart - 1 is the \. */
1427 c = '\\';
1428 goto asFlat;
1429 }
1430
1431 /* Treat this as an octal escape. */
1432 goto doOctal;
1433 }
1434 assert(1 <= num && num <= 0x10000);
1435 state->result = NewRENode(state, REOP_BACKREF);
1436 if (!state->result)
1437 return FALSE;
1438 state->result->u.parenIndex = num - 1;
1439 state->progLength
1440 += 1 + GetCompactIndexWidth(state->result->u.parenIndex);
1441 break;
1442 /* Control escape */
1443 case 'f':
1444 c = 0xC;
1445 goto doFlat;
1446 case 'n':
1447 c = 0xA;
1448 goto doFlat;
1449 case 'r':
1450 c = 0xD;
1451 goto doFlat;
1452 case 't':
1453 c = 0x9;
1454 goto doFlat;
1455 case 'v':
1456 c = 0xB;
1457 goto doFlat;
1458 /* Control letter */
1459 case 'c':
1460 if (state->cp < state->cpend && RE_IS_LETTER(*state->cp)) {
1461 c = (WCHAR) (*state->cp++ & 0x1F);
1462 } else {
1463 /* back off to accepting the original '\' as a literal */
1464 --state->cp;
1465 c = '\\';
1466 }
1467 goto doFlat;
1468 /* HexEscapeSequence */
1469 case 'x':
1470 nDigits = 2;
1471 goto lexHex;
1472 /* UnicodeEscapeSequence */
1473 case 'u':
1474 nDigits = 4;
1475 lexHex:
1476 n = 0;
1477 for (i = 0; i < nDigits && state->cp < state->cpend; i++) {
1478 UINT digit;
1479 c = *state->cp++;
1480 if (!isASCIIHexDigit(c, &digit)) {
1481 /*
1482 * Back off to accepting the original 'u' or 'x' as a
1483 * literal.
1484 */
1485 state->cp -= i + 2;
1486 n = *state->cp++;
1487 break;
1488 }
1489 n = (n << 4) | digit;
1490 }
1491 c = (WCHAR) n;
1492 goto doFlat;
1493 /* Character class escapes */
1494 case 'd':
1495 state->result = NewRENode(state, REOP_DIGIT);
1496 doSimple:
1497 if (!state->result)
1498 return FALSE;
1499 state->progLength++;
1500 break;
1501 case 'D':
1502 state->result = NewRENode(state, REOP_NONDIGIT);
1503 goto doSimple;
1504 case 's':
1505 state->result = NewRENode(state, REOP_SPACE);
1506 goto doSimple;
1507 case 'S':
1508 state->result = NewRENode(state, REOP_NONSPACE);
1509 goto doSimple;
1510 case 'w':
1511 state->result = NewRENode(state, REOP_ALNUM);
1512 goto doSimple;
1513 case 'W':
1514 state->result = NewRENode(state, REOP_NONALNUM);
1515 goto doSimple;
1516 /* IdentityEscape */
1517 default:
1518 state->result = NewRENode(state, REOP_FLAT);
1519 if (!state->result)
1520 return FALSE;
1521 state->result->u.flat.chr = c;
1522 state->result->u.flat.length = 1;
1523 state->result->kid = (void *) (state->cp - 1);
1524 state->progLength += 3;
1525 break;
1526 }
1527 break;
1528 case '[':
1529 state->result = NewRENode(state, REOP_CLASS);
1530 if (!state->result)
1531 return FALSE;
1532 termStart = state->cp;
1533 state->result->u.ucclass.startIndex = termStart - state->cpbegin;
1534 for (;;) {
1535 if (state->cp == state->cpend) {
1536 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1537 JSMSG_UNTERM_CLASS, termStart);
1538
1539 return FALSE;
1540 }
1541 if (*state->cp == '\\') {
1542 state->cp++;
1543 if (state->cp != state->cpend)
1544 state->cp++;
1545 continue;
1546 }
1547 if (*state->cp == ']') {
1548 state->result->u.ucclass.kidlen = state->cp - termStart;
1549 break;
1550 }
1551 state->cp++;
1552 }
1553 for (i = 0; i < CLASS_CACHE_SIZE; i++) {
1554 if (!state->classCache[i].start) {
1555 state->classCache[i].start = termStart;
1556 state->classCache[i].length = state->result->u.ucclass.kidlen;
1557 state->classCache[i].index = state->classCount;
1558 break;
1559 }
1560 if (state->classCache[i].length ==
1561 state->result->u.ucclass.kidlen) {
1562 for (n = 0; ; n++) {
1563 if (n == state->classCache[i].length) {
1564 state->result->u.ucclass.index
1565 = state->classCache[i].index;
1566 goto claim;
1567 }
1568 if (state->classCache[i].start[n] != termStart[n])
1569 break;
1570 }
1571 }
1572 }
1573 state->result->u.ucclass.index = state->classCount++;
1574
1575 claim:
1576 /*
1577 * Call CalculateBitmapSize now as we want any errors it finds
1578 * to be reported during the parse phase, not at execution.
1579 */
1580 if (!CalculateBitmapSize(state, state->result, termStart, state->cp++))
1581 return FALSE;
1582 /*
1583 * Update classBitmapsMem with number of bytes to hold bmsize bits,
1584 * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
1585 * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
1586 */
1587 n = (state->result->u.ucclass.bmsize >> 3) + 1;
1588 if (n > CLASS_BITMAPS_MEM_LIMIT - state->classBitmapsMem) {
1589 ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1590 return FALSE;
1591 }
1592 state->classBitmapsMem += n;
1593 /* CLASS, <index> */
1594 state->progLength
1595 += 1 + GetCompactIndexWidth(state->result->u.ucclass.index);
1596 break;
1597
1598 case '.':
1599 state->result = NewRENode(state, REOP_DOT);
1600 goto doSimple;
1601
1602 case '{':
1603 {
1604 const WCHAR *errp = state->cp--;
1605 INT err;
1606
1607 err = ParseMinMaxQuantifier(state, TRUE);
1608 state->cp = errp;
1609
1610 if (err < 0)
1611 goto asFlat;
1612
1613 /* FALL THROUGH */
1614 }
1615 case '*':
1616 case '+':
1617 case '?':
1618 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1619 JSMSG_BAD_QUANTIFIER, state->cp - 1);
1620 return FALSE;
1621 default:
1622 asFlat:
1623 state->result = NewRENode(state, REOP_FLAT);
1624 if (!state->result)
1625 return FALSE;
1626 state->result->u.flat.chr = c;
1627 state->result->u.flat.length = 1;
1628 state->result->kid = (void *) (state->cp - 1);
1629 state->progLength += 3;
1630 break;
1631 }
1632 return ParseQuantifier(state);
1633 }
1634
1635 /*
1636 * Top-down regular expression grammar, based closely on Perl4.
1637 *
1638 * regexp: altern A regular expression is one or more
1639 * altern '|' regexp alternatives separated by vertical bar.
1640 */
1641 #define INITIAL_STACK_SIZE 128
1642
1643 static BOOL
1644 ParseRegExp(CompilerState *state)
1645 {
1646 size_t parenIndex;
1647 RENode *operand;
1648 REOpData *operatorStack;
1649 RENode **operandStack;
1650 REOp op;
1651 INT i;
1652 BOOL result = FALSE;
1653
1654 INT operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE;
1655 INT operandSP = 0, operandStackSize = INITIAL_STACK_SIZE;
1656
1657 /* Watch out for empty regexp */
1658 if (state->cp == state->cpend) {
1659 state->result = NewRENode(state, REOP_EMPTY);
1660 return (state->result != NULL);
1661 }
1662
1663 operatorStack = heap_alloc(sizeof(REOpData) * operatorStackSize);
1664 if (!operatorStack)
1665 return FALSE;
1666
1667 operandStack = heap_alloc(sizeof(RENode *) * operandStackSize);
1668 if (!operandStack)
1669 goto out;
1670
1671 for (;;) {
1672 parenIndex = state->parenCount;
1673 if (state->cp == state->cpend) {
1674 /*
1675 * If we are at the end of the regexp and we're short one or more
1676 * operands, the regexp must have the form /x|/ or some such, with
1677 * left parentheses making us short more than one operand.
1678 */
1679 if (operatorSP >= operandSP) {
1680 operand = NewRENode(state, REOP_EMPTY);
1681 if (!operand)
1682 goto out;
1683 goto pushOperand;
1684 }
1685 } else {
1686 switch (*state->cp) {
1687 case '(':
1688 ++state->cp;
1689 if (state->cp + 1 < state->cpend &&
1690 *state->cp == '?' &&
1691 (state->cp[1] == '=' ||
1692 state->cp[1] == '!' ||
1693 state->cp[1] == ':')) {
1694 switch (state->cp[1]) {
1695 case '=':
1696 op = REOP_ASSERT;
1697 /* ASSERT, <next>, ... ASSERTTEST */
1698 state->progLength += 4;
1699 break;
1700 case '!':
1701 op = REOP_ASSERT_NOT;
1702 /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
1703 state->progLength += 4;
1704 break;
1705 default:
1706 op = REOP_LPARENNON;
1707 break;
1708 }
1709 state->cp += 2;
1710 } else {
1711 op = REOP_LPAREN;
1712 /* LPAREN, <index>, ... RPAREN, <index> */
1713 state->progLength
1714 += 2 * (1 + GetCompactIndexWidth(parenIndex));
1715 state->parenCount++;
1716 if (state->parenCount == 65535) {
1717 ReportRegExpError(state, JSREPORT_ERROR,
1718 JSMSG_TOO_MANY_PARENS);
1719 goto out;
1720 }
1721 }
1722 goto pushOperator;
1723
1724 case ')':
1725 /*
1726 * If there's no stacked open parenthesis, throw syntax error.
1727 */
1728 for (i = operatorSP - 1; ; i--) {
1729 if (i < 0) {
1730 ReportRegExpError(state, JSREPORT_ERROR,
1731 JSMSG_UNMATCHED_RIGHT_PAREN);
1732 goto out;
1733 }
1734 if (operatorStack[i].op == REOP_ASSERT ||
1735 operatorStack[i].op == REOP_ASSERT_NOT ||
1736 operatorStack[i].op == REOP_LPARENNON ||
1737 operatorStack[i].op == REOP_LPAREN) {
1738 break;
1739 }
1740 }
1741 /* FALL THROUGH */
1742
1743 case '|':
1744 /* Expected an operand before these, so make an empty one */
1745 operand = NewRENode(state, REOP_EMPTY);
1746 if (!operand)
1747 goto out;
1748 goto pushOperand;
1749
1750 default:
1751 if (!ParseTerm(state))
1752 goto out;
1753 operand = state->result;
1754 pushOperand:
1755 if (operandSP == operandStackSize) {
1756 RENode **tmp;
1757 operandStackSize += operandStackSize;
1758 tmp = heap_realloc(operandStack, sizeof(RENode *) * operandStackSize);
1759 if (!tmp)
1760 goto out;
1761 operandStack = tmp;
1762 }
1763 operandStack[operandSP++] = operand;
1764 break;
1765 }
1766 }
1767
1768 /* At the end; process remaining operators. */
1769 restartOperator:
1770 if (state->cp == state->cpend) {
1771 while (operatorSP) {
1772 --operatorSP;
1773 if (!ProcessOp(state, &operatorStack[operatorSP],
1774 operandStack, operandSP))
1775 goto out;
1776 --operandSP;
1777 }
1778 assert(operandSP == 1);
1779 state->result = operandStack[0];
1780 result = TRUE;
1781 goto out;
1782 }
1783
1784 switch (*state->cp) {
1785 case '|':
1786 /* Process any stacked 'concat' operators */
1787 ++state->cp;
1788 while (operatorSP &&
1789 operatorStack[operatorSP - 1].op == REOP_CONCAT) {
1790 --operatorSP;
1791 if (!ProcessOp(state, &operatorStack[operatorSP],
1792 operandStack, operandSP)) {
1793 goto out;
1794 }
1795 --operandSP;
1796 }
1797 op = REOP_ALT;
1798 goto pushOperator;
1799
1800 case ')':
1801 /*
1802 * If there's no stacked open parenthesis, throw syntax error.
1803 */
1804 for (i = operatorSP - 1; ; i--) {
1805 if (i < 0) {
1806 ReportRegExpError(state, JSREPORT_ERROR,
1807 JSMSG_UNMATCHED_RIGHT_PAREN);
1808 goto out;
1809 }
1810 if (operatorStack[i].op == REOP_ASSERT ||
1811 operatorStack[i].op == REOP_ASSERT_NOT ||
1812 operatorStack[i].op == REOP_LPARENNON ||
1813 operatorStack[i].op == REOP_LPAREN) {
1814 break;
1815 }
1816 }
1817 ++state->cp;
1818
1819 /* Process everything on the stack until the open parenthesis. */
1820 for (;;) {
1821 assert(operatorSP);
1822 --operatorSP;
1823 switch (operatorStack[operatorSP].op) {
1824 case REOP_ASSERT:
1825 case REOP_ASSERT_NOT:
1826 case REOP_LPAREN:
1827 operand = NewRENode(state, operatorStack[operatorSP].op);
1828 if (!operand)
1829 goto out;
1830 operand->u.parenIndex =
1831 operatorStack[operatorSP].parenIndex;
1832 assert(operandSP);
1833 operand->kid = operandStack[operandSP - 1];
1834 operandStack[operandSP - 1] = operand;
1835 if (state->treeDepth == TREE_DEPTH_MAX) {
1836 ReportRegExpError(state, JSREPORT_ERROR,
1837 JSMSG_REGEXP_TOO_COMPLEX);
1838 goto out;
1839 }
1840 ++state->treeDepth;
1841 /* FALL THROUGH */
1842
1843 case REOP_LPARENNON:
1844 state->result = operandStack[operandSP - 1];
1845 if (!ParseQuantifier(state))
1846 goto out;
1847 operandStack[operandSP - 1] = state->result;
1848 goto restartOperator;
1849 default:
1850 if (!ProcessOp(state, &operatorStack[operatorSP],
1851 operandStack, operandSP))
1852 goto out;
1853 --operandSP;
1854 break;
1855 }
1856 }
1857 break;
1858
1859 case '{':
1860 {
1861 const WCHAR *errp = state->cp;
1862
1863 if (ParseMinMaxQuantifier(state, TRUE) < 0) {
1864 /*
1865 * This didn't even scan correctly as a quantifier, so we should
1866 * treat it as flat.
1867 */
1868 op = REOP_CONCAT;
1869 goto pushOperator;
1870 }
1871
1872 state->cp = errp;
1873 /* FALL THROUGH */
1874 }
1875
1876 case '+':
1877 case '*':
1878 case '?':
1879 ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_BAD_QUANTIFIER,
1880 state->cp);
1881 result = FALSE;
1882 goto out;
1883
1884 default:
1885 /* Anything else is the start of the next term. */
1886 op = REOP_CONCAT;
1887 pushOperator:
1888 if (operatorSP == operatorStackSize) {
1889 REOpData *tmp;
1890 operatorStackSize += operatorStackSize;
1891 tmp = heap_realloc(operatorStack, sizeof(REOpData) * operatorStackSize);
1892 if (!tmp)
1893 goto out;
1894 operatorStack = tmp;
1895 }
1896 operatorStack[operatorSP].op = op;
1897 operatorStack[operatorSP].errPos = state->cp;
1898 operatorStack[operatorSP++].parenIndex = parenIndex;
1899 break;
1900 }
1901 }
1902 out:
1903 heap_free(operatorStack);
1904 heap_free(operandStack);
1905 return result;
1906 }
1907
1908 /*
1909 * Save the current state of the match - the position in the input
1910 * text as well as the position in the bytecode. The state of any
1911 * parent expressions is also saved (preceding state).
1912 * Contents of parenCount parentheses from parenIndex are also saved.
1913 */
1914 static REBackTrackData *
1915 PushBackTrackState(REGlobalData *gData, REOp op,
1916 jsbytecode *target, match_state_t *x, const WCHAR *cp,
1917 size_t parenIndex, size_t parenCount)
1918 {
1919 size_t i;
1920 REBackTrackData *result =
1921 (REBackTrackData *) ((char *)gData->backTrackSP + gData->cursz);
1922
1923 size_t sz = sizeof(REBackTrackData) +
1924 gData->stateStackTop * sizeof(REProgState) +
1925 parenCount * sizeof(RECapture);
1926
1927 ptrdiff_t btsize = gData->backTrackStackSize;
1928 ptrdiff_t btincr = ((char *)result + sz) -
1929 ((char *)gData->backTrackStack + btsize);
1930
1931 TRACE("\tBT_Push: %lu,%lu\n", (ULONG_PTR)parenIndex, (ULONG_PTR)parenCount);
1932
1933 JS_COUNT_OPERATION(gData->cx, JSOW_JUMP * (1 + parenCount));
1934 if (btincr > 0) {
1935 ptrdiff_t offset = (char *)result - (char *)gData->backTrackStack;
1936
1937 JS_COUNT_OPERATION(gData->cx, JSOW_ALLOCATION);
1938 btincr = ((btincr+btsize-1)/btsize)*btsize;
1939 gData->backTrackStack = heap_pool_grow(gData->pool, gData->backTrackStack, btsize, btincr);
1940 if (!gData->backTrackStack) {
1941 js_ReportOutOfScriptQuota(gData->cx);
1942 gData->ok = FALSE;
1943 return NULL;
1944 }
1945 gData->backTrackStackSize = btsize + btincr;
1946 result = (REBackTrackData *) ((char *)gData->backTrackStack + offset);
1947 }
1948 gData->backTrackSP = result;
1949 result->sz = gData->cursz;
1950 gData->cursz = sz;
1951
1952 result->backtrack_op = op;
1953 result->backtrack_pc = target;
1954 result->cp = cp;
1955 result->parenCount = parenCount;
1956 result->parenIndex = parenIndex;
1957
1958 result->saveStateStackTop = gData->stateStackTop;
1959 assert(gData->stateStackTop);
1960 memcpy(result + 1, gData->stateStack,
1961 sizeof(REProgState) * result->saveStateStackTop);
1962
1963 if (parenCount != 0) {
1964 memcpy((char *)(result + 1) +
1965 sizeof(REProgState) * result->saveStateStackTop,
1966 &x->parens[parenIndex],
1967 sizeof(RECapture) * parenCount);
1968 for (i = 0; i != parenCount; i++)
1969 x->parens[parenIndex + i].index = -1;
1970 }
1971
1972 return result;
1973 }
1974
1975 static inline match_state_t *
1976 FlatNIMatcher(REGlobalData *gData, match_state_t *x, const WCHAR *matchChars,
1977 size_t length)
1978 {
1979 size_t i;
1980 assert(gData->cpend >= x->cp);
1981 if (length > (size_t)(gData->cpend - x->cp))
1982 return NULL;
1983 for (i = 0; i != length; i++) {
1984 if (toupperW(matchChars[i]) != toupperW(x->cp[i]))
1985 return NULL;
1986 }
1987 x->cp += length;
1988 return x;
1989 }
1990
1991 /*
1992 * 1. Evaluate DecimalEscape to obtain an EscapeValue E.
1993 * 2. If E is not a character then go to step 6.
1994 * 3. Let ch be E's character.
1995 * 4. Let A be a one-element RECharSet containing the character ch.
1996 * 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
1997 * 6. E must be an integer. Let n be that integer.
1998 * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
1999 * 8. Return an internal Matcher closure that takes two arguments, a State x
2000 * and a Continuation c, and performs the following:
2001 * 1. Let cap be x's captures internal array.
2002 * 2. Let s be cap[n].
2003 * 3. If s is undefined, then call c(x) and return its result.
2004 * 4. Let e be x's endIndex.
2005 * 5. Let len be s's length.
2006 * 6. Let f be e+len.
2007 * 7. If f>InputLength, return failure.
2008 * 8. If there exists an integer i between 0 (inclusive) and len (exclusive)
2009 * such that Canonicalize(s[i]) is not the same character as
2010 * Canonicalize(Input [e+i]), then return failure.
2011 * 9. Let y be the State (f, cap).
2012 * 10. Call c(y) and return its result.
2013 */
2014 static match_state_t *
2015 BackrefMatcher(REGlobalData *gData, match_state_t *x, size_t parenIndex)
2016 {
2017 size_t len, i;
2018 const WCHAR *parenContent;
2019 RECapture *cap = &x->parens[parenIndex];
2020
2021 if (cap->index == -1)
2022 return x;
2023
2024 len = cap->length;
2025 if (x->cp + len > gData->cpend)
2026 return NULL;
2027
2028 parenContent = &gData->cpbegin[cap->index];
2029 if (gData->regexp->flags & REG_FOLD) {
2030 for (i = 0; i < len; i++) {
2031 if (toupperW(parenContent[i]) != toupperW(x->cp[i]))
2032 return NULL;
2033 }
2034 } else {
2035 for (i = 0; i < len; i++) {
2036 if (parenContent[i] != x->cp[i])
2037 return NULL;
2038 }
2039 }
2040 x->cp += len;
2041 return x;
2042 }
2043
2044 /* Add a single character to the RECharSet */
2045 static void
2046 AddCharacterToCharSet(RECharSet *cs, WCHAR c)
2047 {
2048 UINT byteIndex = (UINT)(c >> 3);
2049 assert(c <= cs->length);
2050 cs->u.bits[byteIndex] |= 1 << (c & 0x7);
2051 }
2052
2053
2054 /* Add a character range, c1 to c2 (inclusive) to the RECharSet */
2055 static void
2056 AddCharacterRangeToCharSet(RECharSet *cs, UINT c1, UINT c2)
2057 {
2058 UINT i;
2059
2060 UINT byteIndex1 = c1 >> 3;
2061 UINT byteIndex2 = c2 >> 3;
2062
2063 assert(c2 <= cs->length && c1 <= c2);
2064
2065 c1 &= 0x7;
2066 c2 &= 0x7;
2067
2068 if (byteIndex1 == byteIndex2) {
2069 cs->u.bits[byteIndex1] |= ((BYTE)0xFF >> (7 - (c2 - c1))) << c1;
2070 } else {
2071 cs->u.bits[byteIndex1] |= 0xFF << c1;
2072 for (i = byteIndex1 + 1; i < byteIndex2; i++)
2073 cs->u.bits[i] = 0xFF;
2074 cs->u.bits[byteIndex2] |= (BYTE)0xFF >> (7 - c2);
2075 }
2076 }
2077
2078 /* Compile the source of the class into a RECharSet */
2079 static BOOL
2080 ProcessCharSet(REGlobalData *gData, RECharSet *charSet)
2081 {
2082 const WCHAR *src, *end;
2083 BOOL inRange = FALSE;
2084 WCHAR rangeStart = 0;
2085 UINT byteLength, n;
2086 WCHAR c, thisCh;
2087 INT nDigits, i;
2088
2089 assert(!charSet->converted);
2090 /*
2091 * Assert that startIndex and length points to chars inside [] inside
2092 * source string.
2093 */
2094 assert(1 <= charSet->u.src.startIndex);
2095 assert(charSet->u.src.startIndex < gData->regexp->source_len);
2096 assert(charSet->u.src.length <= gData->regexp->source_len
2097 - 1 - charSet->u.src.startIndex);
2098
2099 charSet->converted = TRUE;
2100 src = gData->regexp->source + charSet->u.src.startIndex;
2101
2102 end = src + charSet->u.src.length;
2103
2104 assert(src[-1] == '[' && end[0] == ']');
2105
2106 byteLength = (charSet->length >> 3) + 1;
2107 charSet->u.bits = heap_alloc(byteLength);
2108 if (!charSet->u.bits) {
2109 JS_ReportOutOfMemory(gData->cx);
2110 gData->ok = FALSE;
2111 return FALSE;
2112 }
2113 memset(charSet->u.bits, 0, byteLength);
2114
2115 if (src == end)
2116 return TRUE;
2117
2118 if (*src == '^') {
2119 assert(charSet->sense == FALSE);
2120 ++src;
2121 } else {
2122 assert(charSet->sense == TRUE);
2123 }
2124
2125 while (src != end) {
2126 switch (*src) {
2127 case '\\':
2128 ++src;
2129 c = *src++;
2130 switch (c) {
2131 case 'b':
2132 thisCh = 0x8;
2133 break;
2134 case 'f':
2135 thisCh = 0xC;
2136 break;
2137 case 'n':
2138 thisCh = 0xA;
2139 break;
2140 case 'r':
2141 thisCh = 0xD;
2142 break;
2143 case 't':
2144 thisCh = 0x9;
2145 break;
2146 case 'v':
2147 thisCh = 0xB;
2148 break;
2149 case 'c':
2150 if (src < end && JS_ISWORD(*src)) {
2151 thisCh = (WCHAR)(*src++ & 0x1F);
2152 } else {
2153 --src;
2154 thisCh = '\\';
2155 }
2156 break;
2157 case 'x':
2158 nDigits = 2;
2159 goto lexHex;
2160 case 'u':
2161 nDigits = 4;
2162 lexHex:
2163 n = 0;
2164 for (i = 0; (i < nDigits) && (src < end); i++) {
2165 UINT digit;
2166 c = *src++;
2167 if (!isASCIIHexDigit(c, &digit)) {
2168 /*
2169 * Back off to accepting the original '\'
2170 * as a literal
2171 */
2172 src -= i + 1;
2173 n = '\\';
2174 break;
2175 }
2176 n = (n << 4) | digit;
2177 }
2178 thisCh = (WCHAR)n;
2179 break;
2180 case '0':
2181 case '1':
2182 case '2':
2183 case '3':
2184 case '4':
2185 case '5':
2186 case '6':
2187 case '7':
2188 /*
2189 * This is a non-ECMA extension - decimal escapes (in this
2190 * case, octal!) are supposed to be an error inside class
2191 * ranges, but supported here for backwards compatibility.
2192 */
2193 n = JS7_UNDEC(c);
2194 c = *src;
2195 if ('0' <= c && c <= '7') {
2196 src++;
2197 n = 8 * n + JS7_UNDEC(c);
2198 c = *src;
2199 if ('0' <= c && c <= '7') {
2200 src++;
2201 i = 8 * n + JS7_UNDEC(c);
2202 if (i <= 0377)
2203 n = i;
2204 else
2205 src--;
2206 }
2207 }
2208 thisCh = (WCHAR)n;
2209 break;
2210
2211 case 'd':
2212 AddCharacterRangeToCharSet(charSet, '0', '9');
2213 continue; /* don't need range processing */
2214 case 'D':
2215 AddCharacterRangeToCharSet(charSet, 0, '0' - 1);
2216 AddCharacterRangeToCharSet(charSet,
2217 (WCHAR)('9' + 1),
2218 (WCHAR)charSet->length);
2219 continue;
2220 case 's':
2221 for (i = (INT)charSet->length; i >= 0; i--)
2222 if (isspaceW(i))
2223 AddCharacterToCharSet(charSet, (WCHAR)i);
2224 continue;
2225 case 'S':
2226 for (i = (INT)charSet->length; i >= 0; i--)
2227 if (!isspaceW(i))
2228 AddCharacterToCharSet(charSet, (WCHAR)i);
2229 continue;
2230 case 'w':
2231 for (i = (INT)charSet->length; i >= 0; i--)
2232 if (JS_ISWORD(i))
2233 AddCharacterToCharSet(charSet, (WCHAR)i);
2234 continue;
2235 case 'W':
2236 for (i = (INT)charSet->length; i >= 0; i--)
2237 if (!JS_ISWORD(i))
2238 AddCharacterToCharSet(charSet, (WCHAR)i);
2239 continue;
2240 default:
2241 thisCh = c;
2242 break;
2243
2244 }
2245 break;
2246
2247 default:
2248 thisCh = *src++;
2249 break;
2250
2251 }
2252 if (inRange) {
2253 if (gData->regexp->flags & REG_FOLD) {
2254 assert(rangeStart <= thisCh);
2255 for (i = rangeStart; i <= thisCh; i++) {
2256 WCHAR uch, dch;
2257
2258 AddCharacterToCharSet(charSet, i);
2259 uch = toupperW(i);
2260 dch = tolowerW(i);
2261 if (i != uch)
2262 AddCharacterToCharSet(charSet, uch);
2263 if (i != dch)
2264 AddCharacterToCharSet(charSet, dch);
2265 }
2266 } else {
2267 AddCharacterRangeToCharSet(charSet, rangeStart, thisCh);
2268 }
2269 inRange = FALSE;
2270 } else {
2271 if (gData->regexp->flags & REG_FOLD) {
2272 AddCharacterToCharSet(charSet, toupperW(thisCh));
2273 AddCharacterToCharSet(charSet, tolowerW(thisCh));
2274 } else {
2275 AddCharacterToCharSet(charSet, thisCh);
2276 }
2277 if (src < end - 1) {
2278 if (*src == '-') {
2279 ++src;
2280 inRange = TRUE;
2281 rangeStart = thisCh;
2282 }
2283 }
2284 }
2285 }
2286 return TRUE;
2287 }
2288
2289 static BOOL
2290 ReallocStateStack(REGlobalData *gData)
2291 {
2292 size_t limit = gData->stateStackLimit;
2293 size_t sz = sizeof(REProgState) * limit;
2294
2295 gData->stateStack = heap_pool_grow(gData->pool, gData->stateStack, sz, sz);
2296 if (!gData->stateStack) {
2297 js_ReportOutOfScriptQuota(gData->cx);
2298 gData->ok = FALSE;
2299 return FALSE;
2300 }
2301 gData->stateStackLimit = limit + limit;
2302 return TRUE;
2303 }
2304
2305 #define PUSH_STATE_STACK(data) \
2306 do { \
2307 ++(data)->stateStackTop; \
2308 if ((data)->stateStackTop == (data)->stateStackLimit && \
2309 !ReallocStateStack((data))) { \
2310 return NULL; \
2311 } \
2312 }while(0)
2313
2314 /*
2315 * Apply the current op against the given input to see if it's going to match
2316 * or fail. Return false if we don't get a match, true if we do. If updatecp is
2317 * true, then update the current state's cp. Always update startpc to the next
2318 * op.
2319 */
2320 static inline match_state_t *
2321 SimpleMatch(REGlobalData *gData, match_state_t *x, REOp op,
2322 jsbytecode **startpc, BOOL updatecp)
2323 {
2324 match_state_t *result = NULL;
2325 WCHAR matchCh;
2326 size_t parenIndex;
2327 size_t offset, length, index;
2328 jsbytecode *pc = *startpc; /* pc has already been incremented past op */
2329 const WCHAR *source;
2330 const WCHAR *startcp = x->cp;
2331 WCHAR ch;
2332 RECharSet *charSet;
2333
2334 const char *opname = reop_names[op];
2335 TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2336 (int)gData->stateStackTop * 2, "", opname);
2337
2338 switch (op) {
2339 case REOP_EMPTY:
2340 result = x;
2341 break;
2342 case REOP_BOL:
2343 if (x->cp != gData->cpbegin) {
2344 if (/*!gData->cx->regExpStatics.multiline && FIXME !!! */
2345 !(gData->regexp->flags & REG_MULTILINE)) {
2346 break;
2347 }
2348 if (!RE_IS_LINE_TERM(x->cp[-1]))
2349 break;
2350 }
2351 result = x;
2352 break;
2353 case REOP_EOL:
2354 if (x->cp != gData->cpend) {
2355 if (/*!gData->cx->regExpStatics.multiline &&*/
2356 !(gData->regexp->flags & REG_MULTILINE)) {
2357 break;
2358 }
2359 if (!RE_IS_LINE_TERM(*x->cp))
2360 break;
2361 }
2362 result = x;
2363 break;
2364 case REOP_WBDRY:
2365 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2366 !(x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2367 result = x;
2368 }
2369 break;
2370 case REOP_WNONBDRY:
2371 if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2372 (x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2373 result = x;
2374 }
2375 break;
2376 case REOP_DOT:
2377 if (x->cp != gData->cpend && !RE_IS_LINE_TERM(*x->cp)) {
2378 result = x;
2379 result->cp++;
2380 }
2381 break;
2382 case REOP_DIGIT:
2383 if (x->cp != gData->cpend && JS7_ISDEC(*x->cp)) {
2384 result = x;
2385 result->cp++;
2386 }
2387 break;
2388 case REOP_NONDIGIT:
2389 if (x->cp != gData->cpend && !JS7_ISDEC(*x->cp)) {
2390 result = x;
2391 result->cp++;
2392 }
2393 break;
2394 case REOP_ALNUM:
2395 if (x->cp != gData->cpend && JS_ISWORD(*x->cp)) {
2396 result = x;
2397 result->cp++;
2398 }
2399 break;
2400 case REOP_NONALNUM:
2401 if (x->cp != gData->cpend && !JS_ISWORD(*x->cp)) {
2402 result = x;
2403 result->cp++;
2404 }
2405 break;
2406 case REOP_SPACE:
2407 if (x->cp != gData->cpend && isspaceW(*x->cp)) {
2408 result = x;
2409 result->cp++;
2410 }
2411 break;
2412 case REOP_NONSPACE:
2413 if (x->cp != gData->cpend && !isspaceW(*x->cp)) {
2414 result = x;
2415 result->cp++;
2416 }
2417 break;
2418 case REOP_BACKREF:
2419 pc = ReadCompactIndex(pc, &parenIndex);
2420 assert(parenIndex < gData->regexp->parenCount);
2421 result = BackrefMatcher(gData, x, parenIndex);
2422 break;
2423 case REOP_FLAT:
2424 pc = ReadCompactIndex(pc, &offset);
2425 assert(offset < gData->regexp->source_len);
2426 pc = ReadCompactIndex(pc, &length);
2427 assert(1 <= length);
2428 assert(length <= gData->regexp->source_len - offset);
2429 if (length <= (size_t)(gData->cpend - x->cp)) {
2430 source = gData->regexp->source + offset;
2431 TRACE("%s\n", debugstr_wn(source, length));
2432 for (index = 0; index != length; index++) {
2433 if (source[index] != x->cp[index])
2434 return NULL;
2435 }
2436 x->cp += length;
2437 result = x;
2438 }
2439 break;
2440 case REOP_FLAT1:
2441 matchCh = *pc++;
2442 TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2443 if (x->cp != gData->cpend && *x->cp == matchCh) {
2444 result = x;
2445 result->cp++;
2446 }
2447 break;
2448 case REOP_FLATi:
2449 pc = ReadCompactIndex(pc, &offset);
2450 assert(offset < gData->regexp->source_len);
2451 pc = ReadCompactIndex(pc, &length);
2452 assert(1 <= length);
2453 assert(length <= gData->regexp->source_len - offset);
2454 source = gData->regexp->source;
2455 result = FlatNIMatcher(gData, x, source + offset, length);
2456 break;
2457 case REOP_FLAT1i:
2458 matchCh = *pc++;
2459 if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2460 result = x;
2461 result->cp++;
2462 }
2463 break;
2464 case REOP_UCFLAT1:
2465 matchCh = GET_ARG(pc);
2466 TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2467 pc += ARG_LEN;
2468 if (x->cp != gData->cpend && *x->cp == matchCh) {
2469 result = x;
2470 result->cp++;
2471 }
2472 break;
2473 case REOP_UCFLAT1i:
2474 matchCh = GET_ARG(pc);
2475 pc += ARG_LEN;
2476 if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2477 result = x;
2478 result->cp++;
2479 }
2480 break;
2481 case REOP_CLASS:
2482 pc = ReadCompactIndex(pc, &index);
2483 assert(index < gData->regexp->classCount);
2484 if (x->cp != gData->cpend) {
2485 charSet = &gData->regexp->classList[index];
2486 assert(charSet->converted);
2487 ch = *x->cp;
2488 index = ch >> 3;
2489 if (charSet->length != 0 &&
2490 ch <= charSet->length &&
2491 (charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2492 result = x;
2493 result->cp++;
2494 }
2495 }
2496 break;
2497 case REOP_NCLASS:
2498 pc = ReadCompactIndex(pc, &index);
2499 assert(index < gData->regexp->classCount);
2500 if (x->cp != gData->cpend) {
2501 charSet = &gData->regexp->classList[index];
2502 assert(charSet->converted);
2503 ch = *x->cp;
2504 index = ch >> 3;
2505 if (charSet->length == 0 ||
2506 ch > charSet->length ||
2507 !(charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2508 result = x;
2509 result->cp++;
2510 }
2511 }
2512 break;
2513
2514 default:
2515 assert(FALSE);
2516 }
2517 if (result) {
2518 if (!updatecp)
2519 x->cp = startcp;
2520 *startpc = pc;
2521 TRACE(" *\n");
2522 return result;
2523 }
2524 x->cp = startcp;
2525 return NULL;
2526 }
2527
2528 static inline match_state_t *
2529 ExecuteREBytecode(REGlobalData *gData, match_state_t *x)
2530 {
2531 match_state_t *result = NULL;
2532 REBackTrackData *backTrackData;
2533 jsbytecode *nextpc, *testpc;
2534 REOp nextop;
2535 RECapture *cap;
2536 REProgState *curState;
2537 const WCHAR *startcp;
2538 size_t parenIndex, k;
2539 size_t parenSoFar = 0;
2540
2541 WCHAR matchCh1, matchCh2;
2542 RECharSet *charSet;
2543
2544 BOOL anchor;
2545 jsbytecode *pc = gData->regexp->program;
2546 REOp op = (REOp) *pc++;
2547
2548 /*
2549 * If the first node is a simple match, step the index into the string
2550 * until that match is made, or fail if it can't be found at all.
2551 */
2552 if (REOP_IS_SIMPLE(op) && !(gData->regexp->flags & REG_STICKY)) {
2553 anchor = FALSE;
2554 while (x->cp <= gData->cpend) {
2555 nextpc = pc; /* reset back to start each time */
2556 result = SimpleMatch(gData, x, op, &nextpc, TRUE);
2557 if (result) {
2558 anchor = TRUE;
2559 x = result;
2560 pc = nextpc; /* accept skip to next opcode */
2561 op = (REOp) *pc++;
2562 assert(op < REOP_LIMIT);
2563 break;
2564 }
2565 gData->skipped++;
2566 x->cp++;
2567 }
2568 if (!anchor)
2569 goto bad;
2570 }
2571
2572 for (;;) {
2573 const char *opname = reop_names[op];
2574 TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2575 (int)gData->stateStackTop * 2, "", opname);
2576
2577 if (REOP_IS_SIMPLE(op)) {
2578 result = SimpleMatch(gData, x, op, &pc, TRUE);
2579 } else {
2580 curState = &gData->stateStack[gData->stateStackTop];
2581 switch (op) {
2582 case REOP_END:
2583 goto good;
2584 case REOP_ALTPREREQ2:
2585 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2586 pc += ARG_LEN;
2587 matchCh2 = GET_ARG(pc);
2588 pc += ARG_LEN;
2589 k = GET_ARG(pc);
2590 pc += ARG_LEN;
2591
2592 if (x->cp != gData->cpend) {
2593 if (*x->cp == matchCh2)
2594 goto doAlt;
2595
2596 charSet = &gData->regexp->classList[k];
2597 if (!charSet->converted && !ProcessCharSet(gData, charSet))
2598 goto bad;
2599 matchCh1 = *x->cp;
2600 k = matchCh1 >> 3;
2601 if ((charSet->length == 0 ||
2602 matchCh1 > charSet->length ||
2603 !(charSet->u.bits[k] & (1 << (matchCh1 & 0x7)))) ^
2604 charSet->sense) {
2605 goto doAlt;
2606 }
2607 }
2608 result = NULL;
2609 break;
2610
2611 case REOP_ALTPREREQ:
2612 nextpc = pc + GET_OFFSET(pc); /* start of next op */
2613 pc += ARG_LEN;
2614 matchCh1 = GET_ARG(pc);
2615 pc += ARG_LEN;
2616 matchCh2 = GET_ARG(pc);
2617 pc += ARG_LEN;
2618 if (x->cp == gData->cpend ||
2619 (*x->cp != matchCh1 && *x->cp != matchCh2)) {
2620 result = NULL;
2621 break;
2622 }
2623 /* else fall through... */
2624
2625 case REOP_ALT:
2626 doAlt:
2627 nextpc = pc + GET_OFFSET(pc); /* start of next alternate */
2628 pc += ARG_LEN; /* start of this alternate */
2629 curState->parenSoFar = parenSoFar;
2630 PUSH_STATE_STACK(gData);
2631 op = (REOp) *pc++;
2632 startcp = x->cp;
2633 if (REOP_IS_SIMPLE(op)) {
2634 if (!SimpleMatch(gData, x, op, &pc, TRUE)) {
2635 op = (REOp) *nextpc++;
2636 pc = nextpc;
2637 continue;
2638 }
2639 result = x;
2640 op = (REOp) *pc++;
2641 }
2642 nextop = (REOp) *nextpc++;
2643 if (!PushBackTrackState(gData, nextop, nextpc, x, startcp, 0, 0))
2644 goto bad;
2645 continue;
2646
2647 /*
2648 * Occurs at (successful) end of REOP_ALT,
2649 */
2650 case REOP_JUMP:
2651 /*
2652 * If we have not gotten a result here, it is because of an
2653 * empty match. Do the same thing REOP_EMPTY would do.
2654 */
2655 if (!result)
2656 result = x;
2657
2658 --gData->stateStackTop;
2659 pc += GET_OFFSET(pc);
2660 op = (REOp) *pc++;
2661 continue;
2662
2663 /*
2664 * Occurs at last (successful) end of REOP_ALT,
2665 */
2666 case REOP_ENDALT:
2667 /*
2668 * If we have not gotten a result here, it is because of an
2669 * empty match. Do the same thing REOP_EMPTY would do.
2670 */
2671 if (!result)
2672 result = x;
2673
2674 --gData->stateStackTop;
2675 op = (REOp) *pc++;
2676 continue;
2677
2678 case REOP_LPAREN:
2679 pc = ReadCompactIndex(pc, &parenIndex);
2680 TRACE("[ %lu ]\n", (ULONG_PTR)parenIndex);
2681 assert(parenIndex < gData->regexp->parenCount);
2682 if (parenIndex + 1 > parenSoFar)
2683 parenSoFar = parenIndex + 1;
2684 x->parens[parenIndex].index = x->cp - gData->cpbegin;
2685 x->parens[parenIndex].length = 0;
2686 op = (REOp) *pc++;
2687 continue;
2688
2689 case REOP_RPAREN:
2690 {
2691 ptrdiff_t delta;
2692
2693 pc = ReadCompactIndex(pc, &parenIndex);
2694 assert(parenIndex < gData->regexp->parenCount);
2695 cap = &x->parens[parenIndex];
2696 delta = x->cp - (gData->cpbegin + cap->index);
2697 cap->length = (delta < 0) ? 0 : (size_t) delta;
2698 op = (REOp) *pc++;
2699
2700 if (!result)
2701 result = x;
2702 continue;
2703 }
2704 case REOP_ASSERT:
2705 nextpc = pc + GET_OFFSET(pc); /* start of term after ASSERT */
2706 pc += ARG_LEN; /* start of ASSERT child */
2707 op = (REOp) *pc++;
2708 testpc = pc;
2709 if (REOP_IS_SIMPLE(op) &&
2710 !SimpleMatch(gData, x, op, &testpc, FALSE)) {
2711 result = NULL;
2712 break;
2713 }
2714 curState->u.assertion.top =
2715 (char *)gData->backTrackSP - (char *)gData->backTrackStack;
2716 curState->u.assertion.sz = gData->cursz;
2717 curState->index = x->cp - gData->cpbegin;
2718 curState->parenSoFar = parenSoFar;
2719 PUSH_STATE_STACK(gData);
2720 if (!PushBackTrackState(gData, REOP_ASSERTTEST,
2721 nextpc, x, x->cp, 0, 0)) {
2722 goto bad;
2723 }
2724 continue;
2725
2726 case REOP_ASSERT_NOT:
2727 nextpc = pc + GET_OFFSET(pc);
2728 pc += ARG_LEN;
2729 op = (REOp) *pc++;
2730 testpc = pc;
2731 if (REOP_IS_SIMPLE(op) /* Note - fail to fail! */ &&
2732 SimpleMatch(gData, x, op, &testpc, FALSE) &&
2733 *testpc == REOP_ASSERTNOTTEST) {
2734 result = NULL;
2735 break;
2736 }
2737 curState->u.assertion.top
2738 = (char *)gData->backTrackSP -
2739 (char *)gData->backTrackStack;
2740 curState->u.assertion.sz = gData->cursz;
2741 curState->index = x->cp - gData->cpbegin;
2742 curState->parenSoFar = parenSoFar;
2743 PUSH_STATE_STACK(gData);
2744 if (!PushBackTrackState(gData, REOP_ASSERTNOTTEST,
2745 nextpc, x, x->cp, 0, 0)) {
2746 goto bad;
2747 }
2748 continue;
2749
2750 case REOP_ASSERTTEST:
2751 --gData->stateStackTop;
2752 --curState;
2753 x->cp = gData->cpbegin + curState->index;
2754 gData->backTrackSP =
2755 (REBackTrackData *) ((char *)gData->backTrackStack +
2756 curState->u.assertion.top);
2757 gData->cursz = curState->u.assertion.sz;
2758 if (result)
2759 result = x;
2760 break;
2761
2762 case REOP_ASSERTNOTTEST:
2763 --gData->stateStackTop;
2764 --curState;
2765 x->cp = gData->cpbegin + curState->index;
2766 gData->backTrackSP =
2767 (REBackTrackData *) ((char *)gData->backTrackStack +
2768 curState->u.assertion.top);
2769 gData->cursz = curState->u.assertion.sz;
2770 result = (!result) ? x : NULL;
2771 break;
2772 case REOP_STAR:
2773 curState->u.quantifier.min = 0;
2774 curState->u.quantifier.max = (UINT)-1;
2775 goto quantcommon;
2776 case REOP_PLUS:
2777 curState->u.quantifier.min = 1;
2778 curState->u.quantifier.max = (UINT)-1;
2779 goto quantcommon;
2780 case REOP_OPT:
2781 curState->u.quantifier.min = 0;
2782 curState->u.quantifier.max = 1;
2783 goto quantcommon;
2784 case REOP_QUANT:
2785 pc = ReadCompactIndex(pc, &k);
2786 curState->u.quantifier.min = k;
2787 pc = ReadCompactIndex(pc, &k);
2788 /* max is k - 1 to use one byte for (UINT)-1 sentinel. */
2789 curState->u.quantifier.max = k - 1;
2790 assert(curState->u.quantifier.min <= curState->u.quantifier.max);
2791 quantcommon:
2792 if (curState->u.quantifier.max == 0) {
2793 pc = pc + GET_OFFSET(pc);
2794 op = (REOp) *pc++;
2795 result = x;
2796 continue;
2797 }
2798 /* Step over <next> */
2799 nextpc = pc + ARG_LEN;
2800 op = (REOp) *nextpc++;
2801 startcp = x->cp;
2802 if (REOP_IS_SIMPLE(op)) {
2803 if (!SimpleMatch(gData, x, op, &nextpc, TRUE)) {
2804 if (curState->u.quantifier.min == 0)
2805 result = x;
2806 else
2807 result = NULL;
2808 pc = pc + GET_OFFSET(pc);
2809 break;
2810 }
2811 op = (REOp) *nextpc++;
2812 result = x;
2813 }
2814 curState->index = startcp - gData->cpbegin;
2815 curState->continue_op = REOP_REPEAT;
2816 curState->continue_pc = pc;
2817 curState->parenSoFar = parenSoFar;
2818 PUSH_STATE_STACK(gData);
2819 if (curState->u.quantifier.min == 0 &&
2820 !PushBackTrackState(gData, REOP_REPEAT, pc, x, startcp,
2821 0, 0)) {
2822 goto bad;
2823 }
2824 pc = nextpc;
2825 continue;
2826
2827 case REOP_ENDCHILD: /* marks the end of a quantifier child */
2828 pc = curState[-1].continue_pc;
2829 op = (REOp) curState[-1].continue_op;
2830
2831 if (!result)
2832 result = x;
2833 continue;
2834
2835 case REOP_REPEAT:
2836 --curState;
2837 do {
2838 --gData->stateStackTop;
2839 if (!result) {
2840 /* Failed, see if we have enough children. */
2841 if (curState->u.quantifier.min == 0)
2842 goto repeatDone;
2843 goto break_switch;
2844 }
2845 if (curState->u.quantifier.min == 0 &&
2846 x->cp == gData->cpbegin + curState->index) {
2847 /* matched an empty string, that'll get us nowhere */
2848 result = NULL;
2849 goto break_switch;
2850 }
2851 if (curState->u.quantifier.min != 0)
2852 curState->u.quantifier.min--;
2853 if (curState->u.quantifier.max != (UINT) -1)
2854 curState->u.quantifier.max--;
2855 if (curState->u.quantifier.max == 0)
2856 goto repeatDone;
2857 nextpc = pc + ARG_LEN;
2858 nextop = (REOp) *nextpc;
2859 startcp = x->cp;
2860 if (REOP_IS_SIMPLE(nextop)) {
2861 nextpc++;
2862 if (!SimpleMatch(gData, x, nextop, &nextpc, TRUE)) {
2863 if (curState->u.quantifier.min == 0)
2864 goto repeatDone;
2865 result = NULL;
2866 goto break_switch;
2867 }
2868 result = x;
2869 }
2870 curState->index = startcp - gData->cpbegin;
2871 PUSH_STATE_STACK(gData);
2872 if (curState->u.quantifier.min == 0 &&
2873 !PushBackTrackState(gData, REOP_REPEAT,
2874 pc, x, startcp,
2875 curState->parenSoFar,
2876 parenSoFar -
2877 curState->parenSoFar)) {
2878 goto bad;
2879 }
2880 } while (*nextpc == REOP_ENDCHILD);
2881 pc = nextpc;
2882 op = (REOp) *pc++;
2883 parenSoFar = curState->parenSoFar;
2884 continue;
2885
2886 repeatDone:
2887 result = x;
2888 pc += GET_OFFSET(pc);
2889 goto break_switch;
2890
2891 case REOP_MINIMALSTAR:
2892 curState->u.quantifier.min = 0;
2893 curState->u.quantifier.max = (UINT)-1;
2894 goto minimalquantcommon;
2895 case REOP_MINIMALPLUS:
2896 curState->u.quantifier.min = 1;
2897 curState->u.quantifier.max = (UINT)-1;
2898 goto minimalquantcommon;
2899 case REOP_MINIMALOPT:
2900 curState->u.quantifier.min = 0;
2901 curState->u.quantifier.max = 1;
2902 goto minimalquantcommon;
2903 case REOP_MINIMALQUANT:
2904 pc = ReadCompactIndex(pc, &k);
2905 curState->u.quantifier.min = k;
2906 pc = ReadCompactIndex(pc, &k);
2907 /* See REOP_QUANT comments about k - 1. */
2908 curState->u.quantifier.max = k - 1;
2909 assert(curState->u.quantifier.min
2910 <= curState->u.quantifier.max);
2911 minimalquantcommon:
2912 curState->index = x->cp - gData->cpbegin;
2913 curState->parenSoFar = parenSoFar;
2914 PUSH_STATE_STACK(gData);
2915 if (curState->u.quantifier.min != 0) {
2916 curState->continue_op = REOP_MINIMALREPEAT;
2917 curState->continue_pc = pc;
2918 /* step over <next> */
2919 pc += OFFSET_LEN;
2920 op = (REOp) *pc++;
2921 } else {
2922 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2923 pc, x, x->cp, 0, 0)) {
2924 goto bad;
2925 }
2926 --gData->stateStackTop;
2927 pc = pc + GET_OFFSET(pc);
2928 op = (REOp) *pc++;
2929 }
2930 continue;
2931
2932 case REOP_MINIMALREPEAT:
2933 --gData->stateStackTop;
2934 --curState;
2935
2936 TRACE("{%d,%d}\n", curState->u.quantifier.min, curState->u.quantifier.max);
2937 #define PREPARE_REPEAT() \
2938 do { \
2939 curState->index = x->cp - gData->cpbegin; \
2940 curState->continue_op = REOP_MINIMALREPEAT; \
2941 curState->continue_pc = pc; \
2942 pc += ARG_LEN; \
2943 for (k = curState->parenSoFar; k < parenSoFar; k++) \
2944 x->parens[k].index = -1; \
2945 PUSH_STATE_STACK(gData); \
2946 op = (REOp) *pc++; \
2947 assert(op < REOP_LIMIT); \
2948 }while(0)
2949
2950 if (!result) {
2951 TRACE(" -\n");
2952 /*
2953 * Non-greedy failure - try to consume another child.
2954 */
2955 if (curState->u.quantifier.max == (UINT) -1 ||
2956 curState->u.quantifier.max > 0) {
2957 PREPARE_REPEAT();
2958 continue;
2959 }
2960 /* Don't need to adjust pc since we're going to pop. */
2961 break;
2962 }
2963 if (curState->u.quantifier.min == 0 &&
2964 x->cp == gData->cpbegin + curState->index) {
2965 /* Matched an empty string, that'll get us nowhere. */
2966 result = NULL;
2967 break;
2968 }
2969 if (curState->u.quantifier.min != 0)
2970 curState->u.quantifier.min--;
2971 if (curState->u.quantifier.max != (UINT) -1)
2972 curState->u.quantifier.max--;
2973 if (curState->u.quantifier.min != 0) {
2974 PREPARE_REPEAT();
2975 continue;
2976 }
2977 curState->index = x->cp - gData->cpbegin;
2978 curState->parenSoFar = parenSoFar;
2979 PUSH_STATE_STACK(gData);
2980 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2981 pc, x, x->cp,
2982 curState->parenSoFar,
2983 parenSoFar - curState->parenSoFar)) {
2984 goto bad;
2985 }
2986 --gData->stateStackTop;
2987 pc = pc + GET_OFFSET(pc);
2988 op = (REOp) *pc++;
2989 assert(op < REOP_LIMIT);
2990 continue;
2991 default:
2992 assert(FALSE);
2993 result = NULL;
2994 }
2995 break_switch:;
2996 }
2997
2998 /*
2999 * If the match failed and there's a backtrack option, take it.
3000 * Otherwise this is a complete and utter failure.
3001 */
3002 if (!result) {
3003 if (gData->cursz == 0)
3004 return NULL;
3005
3006 /* Potentially detect explosive regex here. */
3007 gData->backTrackCount++;
3008 if (gData->backTrackLimit &&
3009 gData->backTrackCount >= gData->backTrackLimit) {
3010 JS_ReportErrorNumber(gData->cx, js_GetErrorMessage, NULL,
3011 JSMSG_REGEXP_TOO_COMPLEX);
3012 gData->ok = FALSE;
3013 return NULL;
3014 }
3015
3016 backTrackData = gData->backTrackSP;
3017 gData->cursz = backTrackData->sz;
3018 gData->backTrackSP =
3019 (REBackTrackData *) ((char *)backTrackData - backTrackData->sz);
3020 x->cp = backTrackData->cp;
3021 pc = backTrackData->backtrack_pc;
3022 op = (REOp) backTrackData->backtrack_op;
3023 assert(op < REOP_LIMIT);
3024 gData->stateStackTop = backTrackData->saveStateStackTop;
3025 assert(gData->stateStackTop);
3026
3027 memcpy(gData->stateStack, backTrackData + 1,
3028 sizeof(REProgState) * backTrackData->saveStateStackTop);
3029 curState = &gData->stateStack[gData->stateStackTop - 1];
3030
3031 if (backTrackData->parenCount) {
3032 memcpy(&x->parens[backTrackData->parenIndex],
3033 (char *)(backTrackData + 1) +
3034 sizeof(REProgState) * backTrackData->saveStateStackTop,
3035 sizeof(RECapture) * backTrackData->parenCount);
3036 parenSoFar = backTrackData->parenIndex + backTrackData->parenCount;
3037 } else {
3038 for (k = curState->parenSoFar; k < parenSoFar; k++)
3039 x->parens[k].index = -1;
3040 parenSoFar = curState->parenSoFar;
3041 }
3042
3043 TRACE("\tBT_Pop: %ld,%ld\n",
3044 (ULONG_PTR)backTrackData->parenIndex,
3045 (ULONG_PTR)backTrackData->parenCount);
3046 continue;
3047 }
3048 x = result;
3049
3050 /*
3051 * Continue with the expression.
3052 */
3053 op = (REOp)*pc++;
3054 assert(op < REOP_LIMIT);
3055 }
3056
3057 bad:
3058 TRACE("\n");
3059 return NULL;
3060
3061 good:
3062 TRACE("\n");
3063 return x;
3064 }
3065
3066 static match_state_t *MatchRegExp(REGlobalData *gData, match_state_t *x)
3067 {
3068 match_state_t *result;
3069 const WCHAR *cp = x->cp;
3070 const WCHAR *cp2;
3071 UINT j;
3072
3073 /*
3074 * Have to include the position beyond the last character
3075 * in order to detect end-of-input/line condition.
3076 */
3077 for (cp2 = cp; cp2 <= gData->cpend; cp2++) {
3078 gData->skipped = cp2 - cp;
3079 x->cp = cp2;
3080 for (j = 0; j < gData->regexp->parenCount; j++)
3081 x->parens[j].index = -1;
3082 result = ExecuteREBytecode(gData, x);
3083 if (!gData->ok || result || (gData->regexp->flags & REG_STICKY))
3084 return result;
3085 gData->backTrackSP = gData->backTrackStack;
3086 gData->cursz = 0;
3087 gData->stateStackTop = 0;
3088 cp2 = cp + gData->skipped;
3089 }
3090 return NULL;
3091 }
3092
3093 static HRESULT InitMatch(regexp_t *re, void *cx, heap_pool_t *pool, REGlobalData *gData)
3094 {
3095 UINT i;
3096
3097 gData->backTrackStackSize = INITIAL_BACKTRACK;
3098 gData->backTrackStack = heap_pool_alloc(gData->pool, INITIAL_BACKTRACK);
3099 if (!gData->backTrackStack)
3100 goto bad;
3101
3102 gData->backTrackSP = gData->backTrackStack;
3103 gData->cursz = 0;
3104 gData->backTrackCount = 0;
3105 gData->backTrackLimit = 0;
3106
3107 gData->stateStackLimit = INITIAL_STATESTACK;
3108 gData->stateStack = heap_pool_alloc(gData->pool, sizeof(REProgState) * INITIAL_STATESTACK);
3109 if (!gData->stateStack)
3110 goto bad;
3111
3112 gData->stateStackTop = 0;
3113 gData->cx = cx;
3114 gData->pool = pool;
3115 gData->regexp = re;
3116 gData->ok = TRUE;
3117
3118 for (i = 0; i < re->classCount; i++) {
3119 if (!re->classList[i].converted &&
3120 !ProcessCharSet(gData, &re->classList[i])) {
3121 return E_FAIL;
3122 }
3123 }
3124
3125 return S_OK;
3126
3127 bad:
3128 js_ReportOutOfScriptQuota(cx);
3129 gData->ok = FALSE;
3130 return E_OUTOFMEMORY;
3131 }
3132
3133 HRESULT regexp_execute(regexp_t *regexp, void *cx, heap_pool_t *pool,
3134 const WCHAR *str, DWORD str_len, match_state_t *result)
3135 {
3136 match_state_t *res;
3137 REGlobalData gData;
3138 heap_pool_t *mark = heap_pool_mark(pool);
3139 const WCHAR *str_beg = result->cp;
3140 HRESULT hres;
3141
3142 assert(result->cp != NULL);
3143
3144 gData.cpbegin = str;
3145 gData.cpend = str+str_len;
3146 gData.start = result->cp-str;
3147 gData.skipped = 0;
3148 gData.pool = pool;
3149
3150 hres = InitMatch(regexp, cx, pool, &gData);
3151 if(FAILED(hres)) {
3152 WARN("InitMatch failed\n");
3153 heap_pool_clear(mark);
3154 return hres;
3155 }
3156
3157 res = MatchRegExp(&gData, result);
3158 heap_pool_clear(mark);
3159 if(!gData.ok) {
3160 WARN("MatchRegExp failed\n");
3161 return E_FAIL;
3162 }
3163
3164 if(!res) {
3165 result->match_len = 0;
3166 return S_FALSE;
3167 }
3168
3169 result->match_len = (result->cp-str_beg) - gData.skipped;
3170 result->paren_count = regexp->parenCount;
3171 return S_OK;
3172 }
3173
3174 void regexp_destroy(regexp_t *re)
3175 {
3176 if (re->classList) {
3177 UINT i;
3178 for (i = 0; i < re->classCount; i++) {
3179 if (re->classList[i].converted)
3180 heap_free(re->classList[i].u.bits);
3181 re->classList[i].u.bits = NULL;
3182 }
3183 heap_free(re->classList);
3184 }
3185 heap_free(re);
3186 }
3187
3188 regexp_t* regexp_new(void *cx, heap_pool_t *pool, const WCHAR *str,
3189 DWORD str_len, WORD flags, BOOL flat)
3190 {
3191 regexp_t *re;
3192 heap_pool_t *mark;
3193 CompilerState state;
3194 size_t resize;
3195 jsbytecode *endPC;
3196 UINT i;
3197 size_t len;
3198
3199 re = NULL;
3200 mark = heap_pool_mark(pool);
3201 len = str_len;
3202
3203 state.context = cx;
3204 state.pool = pool;
3205 state.cp = str;
3206 if (!state.cp)
3207 goto out;
3208 state.cpbegin = state.cp;
3209 state.cpend = state.cp + len;
3210 state.flags = flags;
3211 state.parenCount = 0;
3212 state.classCount = 0;
3213 state.progLength = 0;
3214 state.treeDepth = 0;
3215 state.classBitmapsMem = 0;
3216 for (i = 0; i < CLASS_CACHE_SIZE; i++)
3217 state.classCache[i].start = NULL;
3218
3219 if (len != 0 && flat) {
3220 state.result = NewRENode(&state, REOP_FLAT);
3221 if (!state.result)
3222 goto out;
3223 state.result->u.flat.chr = *state.cpbegin;
3224 state.result->u.flat.length = len;
3225 state.result->kid = (void *) state.cpbegin;
3226 /* Flat bytecode: REOP_FLAT compact(string_offset) compact(len). */
3227 state.progLength += 1 + GetCompactIndexWidth(0)
3228 + GetCompactIndexWidth(len);
3229 } else {
3230 if (!ParseRegExp(&state))
3231 goto out;
3232 }
3233 resize = offsetof(regexp_t, program) + state.progLength + 1;
3234 re = heap_alloc(resize);
3235 if (!re)
3236 goto out;
3237
3238 assert(state.classBitmapsMem <= CLASS_BITMAPS_MEM_LIMIT);
3239 re->classCount = state.classCount;
3240 if (re->classCount) {
3241 re->classList = heap_alloc(re->classCount * sizeof(RECharSet));
3242 if (!re->classList) {
3243 regexp_destroy(re);
3244 re = NULL;
3245 goto out;
3246 }
3247 for (i = 0; i < re->classCount; i++)
3248 re->classList[i].converted = FALSE;
3249 } else {
3250 re->classList = NULL;
3251 }
3252 endPC = EmitREBytecode(&state, re, state.treeDepth, re->program, state.result);
3253 if (!endPC) {
3254 regexp_destroy(re);
3255 re = NULL;
3256 goto out;
3257 }
3258 *endPC++ = REOP_END;
3259 /*
3260 * Check whether size was overestimated and shrink using realloc.
3261 * This is safe since no pointers to newly parsed regexp or its parts
3262 * besides re exist here.
3263 */
3264 if ((size_t)(endPC - re->program) != state.progLength + 1) {
3265 regexp_t *tmp;
3266 assert((size_t)(endPC - re->program) < state.progLength + 1);
3267 resize = offsetof(regexp_t, program) + (endPC - re->program);
3268 tmp = heap_realloc(re, resize);
3269 if (tmp)
3270 re = tmp;
3271 }
3272
3273 re->flags = flags;
3274 re->parenCount = state.parenCount;
3275 re->source = str;
3276 re->source_len = str_len;
3277
3278 out:
3279 heap_pool_clear(mark);
3280 return re;
3281 }
3282
3283 HRESULT regexp_set_flags(regexp_t **regexp, void *cx, heap_pool_t *pool, WORD flags)
3284 {
3285 if(((*regexp)->flags & REG_FOLD) != (flags & REG_FOLD)) {
3286 regexp_t *new_regexp = regexp_new(cx, pool, (*regexp)->source,
3287 (*regexp)->source_len, flags, FALSE);
3288
3289 if(!new_regexp)
3290 return E_FAIL;
3291
3292 regexp_destroy(*regexp);
3293 *regexp = new_regexp;
3294 }else {
3295 (*regexp)->flags = flags;
3296 }
3297
3298 return S_OK;
3299 }