2 * Copyright 2008 Jacek Caban for CodeWeavers
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
20 * Code in this file is based on files:
23 * from Mozilla project, released under LGPL 2.1 or later.
25 * The Original Code is Mozilla Communicator client code, released
28 * The Initial Developer of the Original Code is
29 * Netscape Communications Corporation.
30 * Portions created by the Initial Developer are Copyright (C) 1998
31 * the Initial Developer. All Rights Reserved.
36 /* FIXME: Better error handling */
37 #define ReportRegExpError(a,b,c)
38 #define ReportRegExpErrorHelper(a,b,c,d)
39 #define JS_ReportErrorNumber(a,b,c,d)
40 #define JS_ReportErrorFlagsAndNumber(a,b,c,d,e,f)
41 #define js_ReportOutOfScriptQuota(a)
42 #define JS_ReportOutOfMemory(a)
43 #define JS_COUNT_OPERATION(a,b)
46 typedef BYTE JSPackedBool
;
49 * This struct holds a bitmap representation of a class from a regexp.
50 * There's a list of these referenced by the classList field in the regexp_t
51 * struct below. The initial state has startIndex set to the offset in the
52 * original regexp source of the beginning of the class contents. The first
53 * use of the class converts the source representation into a bitmap.
56 typedef struct RECharSet
{
57 JSPackedBool converted
;
69 #define JSMSG_MIN_TOO_BIG 47
70 #define JSMSG_MAX_TOO_BIG 48
71 #define JSMSG_OUT_OF_ORDER 49
72 #define JSMSG_OUT_OF_MEMORY 137
74 #define LINE_SEPARATOR 0x2028
75 #define PARA_SEPARATOR 0x2029
77 #define RE_IS_LETTER(c) (((c >= 'A') && (c <= 'Z')) || \
78 ((c >= 'a') && (c <= 'z')) )
79 #define RE_IS_LINE_TERM(c) ((c == '\n') || (c == '\r') || \
80 (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
82 #define JS_ISWORD(c) ((c) < 128 && (isalnum(c) || (c) == '_'))
84 #define JS7_ISDEC(c) ((((unsigned)(c)) - '0') <= 9)
85 #define JS7_UNDEC(c) ((c) - '0')
137 REOP_LIMIT
/* META: no operator >= to this */
140 #define REOP_IS_SIMPLE(op) ((op) <= REOP_NCLASS)
142 static const char *reop_names
[] = {
195 typedef struct REProgState
{
196 jsbytecode
*continue_pc
; /* current continuation data */
197 jsbytecode continue_op
;
198 ptrdiff_t index
; /* progress in text */
199 size_t parenSoFar
; /* highest indexed paren started */
202 UINT min
; /* current quantifier limits */
206 size_t top
; /* backtrack stack state */
212 typedef struct REBackTrackData
{
213 size_t sz
; /* size of previous stack entry */
214 jsbytecode
*backtrack_pc
; /* where to backtrack to */
215 jsbytecode backtrack_op
;
216 const WCHAR
*cp
; /* index in text of match at backtrack */
217 size_t parenIndex
; /* start index of saved paren contents */
218 size_t parenCount
; /* # of saved paren contents */
219 size_t saveStateStackTop
; /* number of parent states */
220 /* saved parent states follow */
221 /* saved paren contents follow */
224 #define INITIAL_STATESTACK 100
225 #define INITIAL_BACKTRACK 8000
227 typedef struct REGlobalData
{
229 regexp_t
*regexp
; /* the RE in execution */
230 BOOL ok
; /* runtime error (out_of_memory only?) */
231 size_t start
; /* offset to start at */
232 ptrdiff_t skipped
; /* chars skipped anchoring this r.e. */
233 const WCHAR
*cpbegin
; /* text base address */
234 const WCHAR
*cpend
; /* text limit address */
236 REProgState
*stateStack
; /* stack of state of current parents */
237 size_t stateStackTop
;
238 size_t stateStackLimit
;
240 REBackTrackData
*backTrackStack
;/* stack of matched-so-far positions */
241 REBackTrackData
*backTrackSP
;
242 size_t backTrackStackSize
;
243 size_t cursz
; /* size of current stack entry */
244 size_t backTrackCount
; /* how many times we've backtracked */
245 size_t backTrackLimit
; /* upper limit on backtrack states */
247 heap_pool_t
*pool
; /* It's faster to use one malloc'd pool
248 than to malloc/free the three items
249 that are allocated from this pool */
252 typedef struct RENode RENode
;
254 REOp op
; /* r.e. op bytecode */
255 RENode
*next
; /* next in concatenation order */
256 void *kid
; /* first operand */
258 void *kid2
; /* second operand */
259 INT num
; /* could be a number */
260 size_t parenIndex
; /* or a parenthesis index */
261 struct { /* or a quantifier range */
266 struct { /* or a character class */
268 size_t kidlen
; /* length of string at kid, in jschars */
269 size_t index
; /* index into class list */
270 WORD bmsize
; /* bitmap size, based on max char code */
273 struct { /* or a literal sequence */
274 WCHAR chr
; /* of one character */
275 size_t length
; /* or many (via the kid) */
278 RENode
*kid2
; /* second operand from ALT */
279 WCHAR ch1
; /* match char for ALTPREREQ */
280 WCHAR ch2
; /* ditto, or class index for ALTPREREQ2 */
285 #define CLASS_CACHE_SIZE 4
287 typedef struct CompilerState
{
289 const WCHAR
*cpbegin
;
293 size_t classCount
; /* number of [] encountered */
294 size_t treeDepth
; /* maximum depth of parse tree */
295 size_t progLength
; /* estimated bytecode length */
297 size_t classBitmapsMem
; /* memory to hold all class bitmaps */
299 const WCHAR
*start
; /* small cache of class strings */
300 size_t length
; /* since they're often the same */
302 } classCache
[CLASS_CACHE_SIZE
];
305 heap_pool_t
*pool
; /* It's faster to use one malloc'd pool
306 than to malloc/free */
309 typedef struct EmitStateStackEntry
{
310 jsbytecode
*altHead
; /* start of REOP_ALT* opcode */
311 jsbytecode
*nextAltFixup
; /* fixup pointer to next-alt offset */
312 jsbytecode
*nextTermFixup
; /* fixup ptr. to REOP_JUMP offset */
313 jsbytecode
*endTermFixup
; /* fixup ptr. to REOPT_ALTPREREQ* offset */
314 RENode
*continueNode
; /* original REOP_ALT* node being stacked */
315 jsbytecode continueOp
; /* REOP_JUMP or REOP_ENDALT continuation */
316 JSPackedBool jumpToJumpFlag
; /* true if we've patched jump-to-jump to
317 avoid 16-bit unsigned offset overflow */
318 } EmitStateStackEntry
;
321 * Immediate operand sizes and getter/setters. Unlike the ones in jsopcode.h,
322 * the getters and setters take the pc of the offset, not of the opcode before
326 #define GET_ARG(pc) ((WORD)(((pc)[0] << 8) | (pc)[1]))
327 #define SET_ARG(pc, arg) ((pc)[0] = (jsbytecode) ((arg) >> 8), \
328 (pc)[1] = (jsbytecode) (arg))
330 #define OFFSET_LEN ARG_LEN
331 #define OFFSET_MAX ((1 << (ARG_LEN * 8)) - 1)
332 #define GET_OFFSET(pc) GET_ARG(pc)
334 static BOOL
ParseRegExp(CompilerState
*);
337 * Maximum supported tree depth is maximum size of EmitStateStackEntry stack.
338 * For sanity, we limit it to 2^24 bytes.
340 #define TREE_DEPTH_MAX ((1 << 24) / sizeof(EmitStateStackEntry))
343 * The maximum memory that can be allocated for class bitmaps.
344 * For sanity, we limit it to 2^24 bytes.
346 #define CLASS_BITMAPS_MEM_LIMIT (1 << 24)
349 * Functions to get size and write/read bytecode that represent small indexes
351 * Each byte in the code represent 7-bit chunk of the index. 8th bit when set
352 * indicates that the following byte brings more bits to the index. Otherwise
353 * this is the last byte in the index bytecode representing highest index bits.
356 GetCompactIndexWidth(size_t index
)
360 for (width
= 1; (index
>>= 7) != 0; ++width
) { }
364 static inline jsbytecode
*
365 WriteCompactIndex(jsbytecode
*pc
, size_t index
)
369 while ((next
= index
>> 7) != 0) {
370 *pc
++ = (jsbytecode
)(index
| 0x80);
373 *pc
++ = (jsbytecode
)index
;
377 static inline jsbytecode
*
378 ReadCompactIndex(jsbytecode
*pc
, size_t *result
)
383 if ((nextByte
& 0x80) == 0) {
385 * Short-circuit the most common case when compact index <= 127.
390 *result
= 0x7F & nextByte
;
393 *result
|= (nextByte
& 0x7F) << shift
;
395 } while ((nextByte
& 0x80) != 0);
400 /* Construct and initialize an RENode, returning NULL for out-of-memory */
402 NewRENode(CompilerState
*state
, REOp op
)
406 ren
= heap_pool_alloc(state
->pool
, sizeof(*ren
));
408 /* js_ReportOutOfScriptQuota(cx); */
418 * Validates and converts hex ascii value.
421 isASCIIHexDigit(WCHAR c
, UINT
*digit
)
432 if (cv
>= 'a' && cv
<= 'f') {
433 *digit
= cv
- 'a' + 10;
445 #define JUMP_OFFSET_HI(off) ((jsbytecode)((off) >> 8))
446 #define JUMP_OFFSET_LO(off) ((jsbytecode)(off))
449 SetForwardJumpOffset(jsbytecode
*jump
, jsbytecode
*target
)
451 ptrdiff_t offset
= target
- jump
;
453 /* Check that target really points forward. */
455 if ((size_t)offset
> OFFSET_MAX
)
458 jump
[0] = JUMP_OFFSET_HI(offset
);
459 jump
[1] = JUMP_OFFSET_LO(offset
);
464 * Generate bytecode for the tree rooted at t using an explicit stack instead
468 EmitREBytecode(CompilerState
*state
, regexp_t
*re
, size_t treeDepth
,
469 jsbytecode
*pc
, RENode
*t
)
471 EmitStateStackEntry
*emitStateSP
, *emitStateStack
;
475 if (treeDepth
== 0) {
476 emitStateStack
= NULL
;
478 emitStateStack
= heap_alloc(sizeof(EmitStateStackEntry
) * treeDepth
);
482 emitStateSP
= emitStateStack
;
484 assert(op
< REOP_LIMIT
);
493 case REOP_ALTPREREQ2
:
496 emitStateSP
->altHead
= pc
- 1;
497 emitStateSP
->endTermFixup
= pc
;
499 SET_ARG(pc
, t
->u
.altprereq
.ch1
);
501 SET_ARG(pc
, t
->u
.altprereq
.ch2
);
504 emitStateSP
->nextAltFixup
= pc
; /* offset to next alternate */
507 emitStateSP
->continueNode
= t
;
508 emitStateSP
->continueOp
= REOP_JUMP
;
509 emitStateSP
->jumpToJumpFlag
= FALSE
;
511 assert((size_t)(emitStateSP
- emitStateStack
) <= treeDepth
);
514 assert(op
< REOP_LIMIT
);
518 emitStateSP
->nextTermFixup
= pc
; /* offset to following term */
520 if (!SetForwardJumpOffset(emitStateSP
->nextAltFixup
, pc
))
522 emitStateSP
->continueOp
= REOP_ENDALT
;
524 assert((size_t)(emitStateSP
- emitStateStack
) <= treeDepth
);
527 assert(op
< REOP_LIMIT
);
532 * If we already patched emitStateSP->nextTermFixup to jump to
533 * a nearer jump, to avoid 16-bit immediate offset overflow, we
536 if (emitStateSP
->jumpToJumpFlag
)
540 * Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT.
541 * REOP_ENDALT is executed only on successful match of the last
542 * alternate in a group.
544 if (!SetForwardJumpOffset(emitStateSP
->nextTermFixup
, pc
))
546 if (t
->op
!= REOP_ALT
) {
547 if (!SetForwardJumpOffset(emitStateSP
->endTermFixup
, pc
))
552 * If the program is bigger than the REOP_JUMP offset range, then
553 * we must check for alternates before this one that are part of
554 * the same group, and fix up their jump offsets to target jumps
555 * close enough to fit in a 16-bit unsigned offset immediate.
557 if ((size_t)(pc
- re
->program
) > OFFSET_MAX
&&
558 emitStateSP
> emitStateStack
) {
559 EmitStateStackEntry
*esp
, *esp2
;
560 jsbytecode
*alt
, *jump
;
561 ptrdiff_t span
, header
;
565 for (esp
= esp2
- 1; esp
>= emitStateStack
; --esp
) {
566 if (esp
->continueOp
== REOP_ENDALT
&&
567 !esp
->jumpToJumpFlag
&&
568 esp
->nextTermFixup
+ OFFSET_LEN
== alt
&&
569 (size_t)(pc
- ((esp
->continueNode
->op
!= REOP_ALT
)
571 : esp
->nextTermFixup
)) > OFFSET_MAX
) {
573 jump
= esp
->nextTermFixup
;
576 * The span must be 1 less than the distance from
577 * jump offset to jump offset, so we actually jump
578 * to a REOP_JUMP bytecode, not to its offset!
581 assert(jump
< esp2
->nextTermFixup
);
582 span
= esp2
->nextTermFixup
- jump
- 1;
583 if ((size_t)span
<= OFFSET_MAX
)
588 } while (esp2
->continueOp
!= REOP_ENDALT
);
591 jump
[0] = JUMP_OFFSET_HI(span
);
592 jump
[1] = JUMP_OFFSET_LO(span
);
594 if (esp
->continueNode
->op
!= REOP_ALT
) {
596 * We must patch the offset at esp->endTermFixup
597 * as well, for the REOP_ALTPREREQ{,2} opcodes.
598 * If we're unlucky and endTermFixup is more than
599 * OFFSET_MAX bytes from its target, we cheat by
600 * jumping 6 bytes to the jump whose offset is at
601 * esp->nextTermFixup, which has the same target.
603 jump
= esp
->endTermFixup
;
604 header
= esp
->nextTermFixup
- jump
;
606 if ((size_t)span
> OFFSET_MAX
)
609 jump
[0] = JUMP_OFFSET_HI(span
);
610 jump
[1] = JUMP_OFFSET_LO(span
);
613 esp
->jumpToJumpFlag
= TRUE
;
621 emitStateSP
->altHead
= pc
- 1;
622 emitStateSP
->nextAltFixup
= pc
; /* offset to next alternate */
624 emitStateSP
->continueNode
= t
;
625 emitStateSP
->continueOp
= REOP_JUMP
;
626 emitStateSP
->jumpToJumpFlag
= FALSE
;
628 assert((size_t)(emitStateSP
- emitStateStack
) <= treeDepth
);
631 assert(op
< REOP_LIMIT
);
636 * Coalesce FLATs if possible and if it would not increase bytecode
637 * beyond preallocated limit. The latter happens only when bytecode
638 * size for coalesced string with offset p and length 2 exceeds 6
639 * bytes preallocated for 2 single char nodes, i.e. when
640 * 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or
641 * GetCompactIndexWidth(p) > 4.
642 * Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more
643 * nodes strictly decreases bytecode size, the check has to be
644 * done only for the first coalescing.
647 GetCompactIndexWidth((WCHAR
*)t
->kid
- state
->cpbegin
) <= 4)
650 t
->next
->op
== REOP_FLAT
&&
651 (WCHAR
*)t
->kid
+ t
->u
.flat
.length
==
653 t
->u
.flat
.length
+= t
->next
->u
.flat
.length
;
654 t
->next
= t
->next
->next
;
657 if (t
->kid
&& t
->u
.flat
.length
> 1) {
658 pc
[-1] = (state
->flags
& REG_FOLD
) ? REOP_FLATi
: REOP_FLAT
;
659 pc
= WriteCompactIndex(pc
, (WCHAR
*)t
->kid
- state
->cpbegin
);
660 pc
= WriteCompactIndex(pc
, t
->u
.flat
.length
);
661 } else if (t
->u
.flat
.chr
< 256) {
662 pc
[-1] = (state
->flags
& REG_FOLD
) ? REOP_FLAT1i
: REOP_FLAT1
;
663 *pc
++ = (jsbytecode
) t
->u
.flat
.chr
;
665 pc
[-1] = (state
->flags
& REG_FOLD
)
668 SET_ARG(pc
, t
->u
.flat
.chr
);
675 pc
= WriteCompactIndex(pc
, t
->u
.parenIndex
);
676 emitStateSP
->continueNode
= t
;
677 emitStateSP
->continueOp
= REOP_RPAREN
;
679 assert((size_t)(emitStateSP
- emitStateStack
) <= treeDepth
);
685 pc
= WriteCompactIndex(pc
, t
->u
.parenIndex
);
689 pc
= WriteCompactIndex(pc
, t
->u
.parenIndex
);
694 emitStateSP
->nextTermFixup
= pc
;
696 emitStateSP
->continueNode
= t
;
697 emitStateSP
->continueOp
= REOP_ASSERTTEST
;
699 assert((size_t)(emitStateSP
- emitStateStack
) <= treeDepth
);
704 case REOP_ASSERTTEST
:
705 case REOP_ASSERTNOTTEST
:
706 if (!SetForwardJumpOffset(emitStateSP
->nextTermFixup
, pc
))
710 case REOP_ASSERT_NOT
:
712 emitStateSP
->nextTermFixup
= pc
;
714 emitStateSP
->continueNode
= t
;
715 emitStateSP
->continueOp
= REOP_ASSERTNOTTEST
;
717 assert((size_t)(emitStateSP
- emitStateStack
) <= treeDepth
);
724 if (t
->u
.range
.min
== 0 && t
->u
.range
.max
== (UINT
)-1) {
725 pc
[-1] = (t
->u
.range
.greedy
) ? REOP_STAR
: REOP_MINIMALSTAR
;
726 } else if (t
->u
.range
.min
== 0 && t
->u
.range
.max
== 1) {
727 pc
[-1] = (t
->u
.range
.greedy
) ? REOP_OPT
: REOP_MINIMALOPT
;
728 } else if (t
->u
.range
.min
== 1 && t
->u
.range
.max
== (UINT
) -1) {
729 pc
[-1] = (t
->u
.range
.greedy
) ? REOP_PLUS
: REOP_MINIMALPLUS
;
731 if (!t
->u
.range
.greedy
)
732 pc
[-1] = REOP_MINIMALQUANT
;
733 pc
= WriteCompactIndex(pc
, t
->u
.range
.min
);
735 * Write max + 1 to avoid using size_t(max) + 1 bytes
736 * for (UINT)-1 sentinel.
738 pc
= WriteCompactIndex(pc
, t
->u
.range
.max
+ 1);
740 emitStateSP
->nextTermFixup
= pc
;
742 emitStateSP
->continueNode
= t
;
743 emitStateSP
->continueOp
= REOP_ENDCHILD
;
745 assert((size_t)(emitStateSP
- emitStateStack
) <= treeDepth
);
751 if (!SetForwardJumpOffset(emitStateSP
->nextTermFixup
, pc
))
756 if (!t
->u
.ucclass
.sense
)
757 pc
[-1] = REOP_NCLASS
;
758 pc
= WriteCompactIndex(pc
, t
->u
.ucclass
.index
);
759 charSet
= &re
->classList
[t
->u
.ucclass
.index
];
760 charSet
->converted
= FALSE
;
761 charSet
->length
= t
->u
.ucclass
.bmsize
;
762 charSet
->u
.src
.startIndex
= t
->u
.ucclass
.startIndex
;
763 charSet
->u
.src
.length
= t
->u
.ucclass
.kidlen
;
764 charSet
->sense
= t
->u
.ucclass
.sense
;
775 if (emitStateSP
== emitStateStack
)
778 t
= emitStateSP
->continueNode
;
779 op
= (REOp
) emitStateSP
->continueOp
;
784 heap_free(emitStateStack
);
788 ReportRegExpError(state
, JSREPORT_ERROR
, JSMSG_REGEXP_TOO_COMPLEX
);
794 * Process the op against the two top operands, reducing them to a single
795 * operand in the penultimate slot. Update progLength and treeDepth.
798 ProcessOp(CompilerState
*state
, REOpData
*opData
, RENode
**operandStack
,
803 switch (opData
->op
) {
805 result
= NewRENode(state
, REOP_ALT
);
808 result
->kid
= operandStack
[operandSP
- 2];
809 result
->u
.kid2
= operandStack
[operandSP
- 1];
810 operandStack
[operandSP
- 2] = result
;
812 if (state
->treeDepth
== TREE_DEPTH_MAX
) {
813 ReportRegExpError(state
, JSREPORT_ERROR
, JSMSG_REGEXP_TOO_COMPLEX
);
819 * Look at both alternates to see if there's a FLAT or a CLASS at
820 * the start of each. If so, use a prerequisite match.
822 if (((RENode
*) result
->kid
)->op
== REOP_FLAT
&&
823 ((RENode
*) result
->u
.kid2
)->op
== REOP_FLAT
&&
824 (state
->flags
& REG_FOLD
) == 0) {
825 result
->op
= REOP_ALTPREREQ
;
826 result
->u
.altprereq
.ch1
= ((RENode
*) result
->kid
)->u
.flat
.chr
;
827 result
->u
.altprereq
.ch2
= ((RENode
*) result
->u
.kid2
)->u
.flat
.chr
;
828 /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
829 JUMP, <end> ... ENDALT */
830 state
->progLength
+= 13;
833 if (((RENode
*) result
->kid
)->op
== REOP_CLASS
&&
834 ((RENode
*) result
->kid
)->u
.ucclass
.index
< 256 &&
835 ((RENode
*) result
->u
.kid2
)->op
== REOP_FLAT
&&
836 (state
->flags
& REG_FOLD
) == 0) {
837 result
->op
= REOP_ALTPREREQ2
;
838 result
->u
.altprereq
.ch1
= ((RENode
*) result
->u
.kid2
)->u
.flat
.chr
;
839 result
->u
.altprereq
.ch2
= ((RENode
*) result
->kid
)->u
.ucclass
.index
;
840 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
841 JUMP, <end> ... ENDALT */
842 state
->progLength
+= 13;
845 if (((RENode
*) result
->kid
)->op
== REOP_FLAT
&&
846 ((RENode
*) result
->u
.kid2
)->op
== REOP_CLASS
&&
847 ((RENode
*) result
->u
.kid2
)->u
.ucclass
.index
< 256 &&
848 (state
->flags
& REG_FOLD
) == 0) {
849 result
->op
= REOP_ALTPREREQ2
;
850 result
->u
.altprereq
.ch1
= ((RENode
*) result
->kid
)->u
.flat
.chr
;
851 result
->u
.altprereq
.ch2
=
852 ((RENode
*) result
->u
.kid2
)->u
.ucclass
.index
;
853 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
854 JUMP, <end> ... ENDALT */
855 state
->progLength
+= 13;
858 /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
859 state
->progLength
+= 7;
864 result
= operandStack
[operandSP
- 2];
866 result
= result
->next
;
867 result
->next
= operandStack
[operandSP
- 1];
871 case REOP_ASSERT_NOT
:
874 /* These should have been processed by a close paren. */
875 ReportRegExpErrorHelper(state
, JSREPORT_ERROR
, JSMSG_MISSING_PAREN
,
885 * Hack two bits in CompilerState.flags, for use within FindParenCount to flag
886 * its being on the stack, and to propagate errors to its callers.
888 #define JSREG_FIND_PAREN_COUNT 0x8000
889 #define JSREG_FIND_PAREN_ERROR 0x4000
892 * Magic return value from FindParenCount and GetDecimalValue, to indicate
893 * overflow beyond GetDecimalValue's max parameter, or a computed maximum if
894 * its findMax parameter is non-null.
896 #define OVERFLOW_VALUE ((UINT)-1)
899 FindParenCount(CompilerState
*state
)
904 if (state
->flags
& JSREG_FIND_PAREN_COUNT
)
905 return OVERFLOW_VALUE
;
908 * Copy state into temp, flag it so we never report an invalid backref,
909 * and reset its members to parse the entire regexp. This is obviously
910 * suboptimal, but GetDecimalValue calls us only if a backref appears to
911 * refer to a forward parenthetical, which is rare.
914 temp
.flags
|= JSREG_FIND_PAREN_COUNT
;
915 temp
.cp
= temp
.cpbegin
;
920 temp
.classBitmapsMem
= 0;
921 for (i
= 0; i
< CLASS_CACHE_SIZE
; i
++)
922 temp
.classCache
[i
].start
= NULL
;
924 if (!ParseRegExp(&temp
)) {
925 state
->flags
|= JSREG_FIND_PAREN_ERROR
;
926 return OVERFLOW_VALUE
;
928 return temp
.parenCount
;
932 * Extract and return a decimal value at state->cp. The initial character c
933 * has already been read. Return OVERFLOW_VALUE if the result exceeds max.
934 * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
935 * state->flags to discover whether an error occurred under findMax.
938 GetDecimalValue(WCHAR c
, UINT max
, UINT (*findMax
)(CompilerState
*state
),
939 CompilerState
*state
)
941 UINT value
= JS7_UNDEC(c
);
942 BOOL overflow
= (value
> max
&& (!findMax
|| value
> findMax(state
)));
944 /* The following restriction allows simpler overflow checks. */
945 assert(max
<= ((UINT
)-1 - 9) / 10);
946 while (state
->cp
< state
->cpend
) {
950 value
= 10 * value
+ JS7_UNDEC(c
);
951 if (!overflow
&& value
> max
&& (!findMax
|| value
> findMax(state
)))
955 return overflow
? OVERFLOW_VALUE
: value
;
959 * Calculate the total size of the bitmap required for a class expression.
962 CalculateBitmapSize(CompilerState
*state
, RENode
*target
, const WCHAR
*src
,
966 BOOL inRange
= FALSE
;
967 WCHAR c
, rangeStart
= 0;
968 UINT n
, digit
, nDigits
, i
;
970 target
->u
.ucclass
.bmsize
= 0;
971 target
->u
.ucclass
.sense
= TRUE
;
978 target
->u
.ucclass
.sense
= FALSE
;
982 BOOL canStartRange
= TRUE
;
1009 if (src
< end
&& RE_IS_LETTER(*src
)) {
1010 localMax
= (UINT
) (*src
++) & 0x1F;
1023 for (i
= 0; (i
< nDigits
) && (src
< end
); i
++) {
1025 if (!isASCIIHexDigit(c
, &digit
)) {
1027 * Back off to accepting the original
1034 n
= (n
<< 4) | digit
;
1039 canStartRange
= FALSE
;
1041 JS_ReportErrorNumber(state
->context
,
1042 js_GetErrorMessage
, NULL
,
1043 JSMSG_BAD_CLASS_RANGE
);
1053 canStartRange
= FALSE
;
1055 JS_ReportErrorNumber(state
->context
,
1056 js_GetErrorMessage
, NULL
,
1057 JSMSG_BAD_CLASS_RANGE
);
1063 * If this is the start of a range, ensure that it's less than
1077 * This is a non-ECMA extension - decimal escapes (in this
1078 * case, octal!) are supposed to be an error inside class
1079 * ranges, but supported here for backwards compatibility.
1084 if ('0' <= c
&& c
<= '7') {
1086 n
= 8 * n
+ JS7_UNDEC(c
);
1088 if ('0' <= c
&& c
<= '7') {
1090 i
= 8 * n
+ JS7_UNDEC(c
);
1111 /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
1112 if (rangeStart
> localMax
) {
1113 JS_ReportErrorNumber(state
->context
,
1114 js_GetErrorMessage
, NULL
,
1115 JSMSG_BAD_CLASS_RANGE
);
1120 if (canStartRange
&& src
< end
- 1) {
1124 rangeStart
= (WCHAR
)localMax
;
1128 if (state
->flags
& REG_FOLD
)
1129 rangeStart
= localMax
; /* one run of the uc/dc loop below */
1132 if (state
->flags
& REG_FOLD
) {
1133 WCHAR maxch
= localMax
;
1135 for (i
= rangeStart
; i
<= localMax
; i
++) {
1151 target
->u
.ucclass
.bmsize
= max
;
1156 ParseMinMaxQuantifier(CompilerState
*state
, BOOL ignoreValues
)
1160 const WCHAR
*errp
= state
->cp
++;
1165 min
= GetDecimalValue(c
, 0xFFFF, NULL
, state
);
1168 if (!ignoreValues
&& min
== OVERFLOW_VALUE
)
1169 return JSMSG_MIN_TOO_BIG
;
1175 max
= GetDecimalValue(c
, 0xFFFF, NULL
, state
);
1177 if (!ignoreValues
&& max
== OVERFLOW_VALUE
)
1178 return JSMSG_MAX_TOO_BIG
;
1179 if (!ignoreValues
&& min
> max
)
1180 return JSMSG_OUT_OF_ORDER
;
1188 state
->result
= NewRENode(state
, REOP_QUANT
);
1190 return JSMSG_OUT_OF_MEMORY
;
1191 state
->result
->u
.range
.min
= min
;
1192 state
->result
->u
.range
.max
= max
;
1194 * QUANT, <min>, <max>, <next> ... <ENDCHILD>
1195 * where <max> is written as compact(max+1) to make
1196 * (UINT)-1 sentinel to occupy 1 byte, not width_of(max)+1.
1198 state
->progLength
+= (1 + GetCompactIndexWidth(min
)
1199 + GetCompactIndexWidth(max
+ 1)
1210 ParseQuantifier(CompilerState
*state
)
1213 term
= state
->result
;
1214 if (state
->cp
< state
->cpend
) {
1215 switch (*state
->cp
) {
1217 state
->result
= NewRENode(state
, REOP_QUANT
);
1220 state
->result
->u
.range
.min
= 1;
1221 state
->result
->u
.range
.max
= (UINT
)-1;
1222 /* <PLUS>, <next> ... <ENDCHILD> */
1223 state
->progLength
+= 4;
1226 state
->result
= NewRENode(state
, REOP_QUANT
);
1229 state
->result
->u
.range
.min
= 0;
1230 state
->result
->u
.range
.max
= (UINT
)-1;
1231 /* <STAR>, <next> ... <ENDCHILD> */
1232 state
->progLength
+= 4;
1235 state
->result
= NewRENode(state
, REOP_QUANT
);
1238 state
->result
->u
.range
.min
= 0;
1239 state
->result
->u
.range
.max
= 1;
1240 /* <OPT>, <next> ... <ENDCHILD> */
1241 state
->progLength
+= 4;
1243 case '{': /* balance '}' */
1247 err
= ParseMinMaxQuantifier(state
, FALSE
);
1253 ReportRegExpErrorHelper(state
, JSREPORT_ERROR
, err
, errp
);
1262 if (state
->treeDepth
== TREE_DEPTH_MAX
) {
1263 ReportRegExpError(state
, JSREPORT_ERROR
, JSMSG_REGEXP_TOO_COMPLEX
);
1269 state
->result
->kid
= term
;
1270 if (state
->cp
< state
->cpend
&& *state
->cp
== '?') {
1272 state
->result
->u
.range
.greedy
= FALSE
;
1274 state
->result
->u
.range
.greedy
= TRUE
;
1280 * item: assertion An item is either an assertion or
1281 * quantatom a quantified atom.
1283 * assertion: '^' Assertions match beginning of string
1284 * (or line if the class static property
1285 * RegExp.multiline is true).
1286 * '$' End of string (or line if the class
1287 * static property RegExp.multiline is
1289 * '\b' Word boundary (between \w and \W).
1290 * '\B' Word non-boundary.
1292 * quantatom: atom An unquantified atom.
1293 * quantatom '{' n ',' m '}'
1294 * Atom must occur between n and m times.
1295 * quantatom '{' n ',' '}' Atom must occur at least n times.
1296 * quantatom '{' n '}' Atom must occur exactly n times.
1297 * quantatom '*' Zero or more times (same as {0,}).
1298 * quantatom '+' One or more times (same as {1,}).
1299 * quantatom '?' Zero or one time (same as {0,1}).
1301 * any of which can be optionally followed by '?' for ungreedy
1303 * atom: '(' regexp ')' A parenthesized regexp (what matched
1304 * can be addressed using a backreference,
1306 * '.' Matches any char except '\n'.
1307 * '[' classlist ']' A character class.
1308 * '[' '^' classlist ']' A negated character class.
1310 * '\n' Newline (Line Feed).
1311 * '\r' Carriage Return.
1312 * '\t' Horizontal Tab.
1313 * '\v' Vertical Tab.
1314 * '\d' A digit (same as [0-9]).
1316 * '\w' A word character, [0-9a-z_A-Z].
1317 * '\W' A non-word character.
1318 * '\s' A whitespace character, [ \b\f\n\r\t\v].
1319 * '\S' A non-whitespace character.
1320 * '\' n A backreference to the nth (n decimal
1321 * and positive) parenthesized expression.
1322 * '\' octal An octal escape sequence (octal must be
1323 * two or three digits long, unless it is
1324 * 0 for the null character).
1325 * '\x' hex A hex escape (hex must be two digits).
1326 * '\u' unicode A unicode escape (must be four digits).
1327 * '\c' ctrl A control character, ctrl is a letter.
1328 * '\' literalatomchar Any character except one of the above
1329 * that follow '\' in an atom.
1330 * otheratomchar Any character not first among the other
1331 * atom right-hand sides.
1334 ParseTerm(CompilerState
*state
)
1336 WCHAR c
= *state
->cp
++;
1338 UINT num
, tmp
, n
, i
;
1339 const WCHAR
*termStart
;
1342 /* assertions and atoms */
1344 state
->result
= NewRENode(state
, REOP_BOL
);
1347 state
->progLength
++;
1350 state
->result
= NewRENode(state
, REOP_EOL
);
1353 state
->progLength
++;
1356 if (state
->cp
>= state
->cpend
) {
1357 /* a trailing '\' is an error */
1358 ReportRegExpError(state
, JSREPORT_ERROR
, JSMSG_TRAILING_SLASH
);
1363 /* assertion escapes */
1365 state
->result
= NewRENode(state
, REOP_WBDRY
);
1368 state
->progLength
++;
1371 state
->result
= NewRENode(state
, REOP_WNONBDRY
);
1374 state
->progLength
++;
1376 /* Decimal escape */
1378 /* Give a strict warning. See also the note below. */
1379 WARN("non-octal digit in an escape sequence that doesn't match a back-reference\n");
1382 while (state
->cp
< state
->cpend
) {
1384 if (c
< '0' || '7' < c
)
1387 tmp
= 8 * num
+ (UINT
)JS7_UNDEC(c
);
1394 state
->result
= NewRENode(state
, REOP_FLAT
);
1397 state
->result
->u
.flat
.chr
= c
;
1398 state
->result
->u
.flat
.length
= 1;
1399 state
->progLength
+= 3;
1410 termStart
= state
->cp
- 1;
1411 num
= GetDecimalValue(c
, state
->parenCount
, FindParenCount
, state
);
1412 if (state
->flags
& JSREG_FIND_PAREN_ERROR
)
1414 if (num
== OVERFLOW_VALUE
) {
1415 /* Give a strict mode warning. */
1416 WARN("back-reference exceeds number of capturing parentheses\n");
1419 * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax
1420 * error here. However, for compatibility with IE, we treat the
1421 * whole backref as flat if the first character in it is not a
1422 * valid octal character, and as an octal escape otherwise.
1424 state
->cp
= termStart
;
1426 /* Treat this as flat. termStart - 1 is the \. */
1431 /* Treat this as an octal escape. */
1434 assert(1 <= num
&& num
<= 0x10000);
1435 state
->result
= NewRENode(state
, REOP_BACKREF
);
1438 state
->result
->u
.parenIndex
= num
- 1;
1440 += 1 + GetCompactIndexWidth(state
->result
->u
.parenIndex
);
1442 /* Control escape */
1458 /* Control letter */
1460 if (state
->cp
< state
->cpend
&& RE_IS_LETTER(*state
->cp
)) {
1461 c
= (WCHAR
) (*state
->cp
++ & 0x1F);
1463 /* back off to accepting the original '\' as a literal */
1468 /* HexEscapeSequence */
1472 /* UnicodeEscapeSequence */
1477 for (i
= 0; i
< nDigits
&& state
->cp
< state
->cpend
; i
++) {
1480 if (!isASCIIHexDigit(c
, &digit
)) {
1482 * Back off to accepting the original 'u' or 'x' as a
1489 n
= (n
<< 4) | digit
;
1493 /* Character class escapes */
1495 state
->result
= NewRENode(state
, REOP_DIGIT
);
1499 state
->progLength
++;
1502 state
->result
= NewRENode(state
, REOP_NONDIGIT
);
1505 state
->result
= NewRENode(state
, REOP_SPACE
);
1508 state
->result
= NewRENode(state
, REOP_NONSPACE
);
1511 state
->result
= NewRENode(state
, REOP_ALNUM
);
1514 state
->result
= NewRENode(state
, REOP_NONALNUM
);
1516 /* IdentityEscape */
1518 state
->result
= NewRENode(state
, REOP_FLAT
);
1521 state
->result
->u
.flat
.chr
= c
;
1522 state
->result
->u
.flat
.length
= 1;
1523 state
->result
->kid
= (void *) (state
->cp
- 1);
1524 state
->progLength
+= 3;
1529 state
->result
= NewRENode(state
, REOP_CLASS
);
1532 termStart
= state
->cp
;
1533 state
->result
->u
.ucclass
.startIndex
= termStart
- state
->cpbegin
;
1535 if (state
->cp
== state
->cpend
) {
1536 ReportRegExpErrorHelper(state
, JSREPORT_ERROR
,
1537 JSMSG_UNTERM_CLASS
, termStart
);
1541 if (*state
->cp
== '\\') {
1543 if (state
->cp
!= state
->cpend
)
1547 if (*state
->cp
== ']') {
1548 state
->result
->u
.ucclass
.kidlen
= state
->cp
- termStart
;
1553 for (i
= 0; i
< CLASS_CACHE_SIZE
; i
++) {
1554 if (!state
->classCache
[i
].start
) {
1555 state
->classCache
[i
].start
= termStart
;
1556 state
->classCache
[i
].length
= state
->result
->u
.ucclass
.kidlen
;
1557 state
->classCache
[i
].index
= state
->classCount
;
1560 if (state
->classCache
[i
].length
==
1561 state
->result
->u
.ucclass
.kidlen
) {
1562 for (n
= 0; ; n
++) {
1563 if (n
== state
->classCache
[i
].length
) {
1564 state
->result
->u
.ucclass
.index
1565 = state
->classCache
[i
].index
;
1568 if (state
->classCache
[i
].start
[n
] != termStart
[n
])
1573 state
->result
->u
.ucclass
.index
= state
->classCount
++;
1577 * Call CalculateBitmapSize now as we want any errors it finds
1578 * to be reported during the parse phase, not at execution.
1580 if (!CalculateBitmapSize(state
, state
->result
, termStart
, state
->cp
++))
1583 * Update classBitmapsMem with number of bytes to hold bmsize bits,
1584 * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
1585 * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
1587 n
= (state
->result
->u
.ucclass
.bmsize
>> 3) + 1;
1588 if (n
> CLASS_BITMAPS_MEM_LIMIT
- state
->classBitmapsMem
) {
1589 ReportRegExpError(state
, JSREPORT_ERROR
, JSMSG_REGEXP_TOO_COMPLEX
);
1592 state
->classBitmapsMem
+= n
;
1593 /* CLASS, <index> */
1595 += 1 + GetCompactIndexWidth(state
->result
->u
.ucclass
.index
);
1599 state
->result
= NewRENode(state
, REOP_DOT
);
1604 const WCHAR
*errp
= state
->cp
--;
1607 err
= ParseMinMaxQuantifier(state
, TRUE
);
1618 ReportRegExpErrorHelper(state
, JSREPORT_ERROR
,
1619 JSMSG_BAD_QUANTIFIER
, state
->cp
- 1);
1623 state
->result
= NewRENode(state
, REOP_FLAT
);
1626 state
->result
->u
.flat
.chr
= c
;
1627 state
->result
->u
.flat
.length
= 1;
1628 state
->result
->kid
= (void *) (state
->cp
- 1);
1629 state
->progLength
+= 3;
1632 return ParseQuantifier(state
);
1636 * Top-down regular expression grammar, based closely on Perl4.
1638 * regexp: altern A regular expression is one or more
1639 * altern '|' regexp alternatives separated by vertical bar.
1641 #define INITIAL_STACK_SIZE 128
1644 ParseRegExp(CompilerState
*state
)
1648 REOpData
*operatorStack
;
1649 RENode
**operandStack
;
1652 BOOL result
= FALSE
;
1654 INT operatorSP
= 0, operatorStackSize
= INITIAL_STACK_SIZE
;
1655 INT operandSP
= 0, operandStackSize
= INITIAL_STACK_SIZE
;
1657 /* Watch out for empty regexp */
1658 if (state
->cp
== state
->cpend
) {
1659 state
->result
= NewRENode(state
, REOP_EMPTY
);
1660 return (state
->result
!= NULL
);
1663 operatorStack
= heap_alloc(sizeof(REOpData
) * operatorStackSize
);
1667 operandStack
= heap_alloc(sizeof(RENode
*) * operandStackSize
);
1672 parenIndex
= state
->parenCount
;
1673 if (state
->cp
== state
->cpend
) {
1675 * If we are at the end of the regexp and we're short one or more
1676 * operands, the regexp must have the form /x|/ or some such, with
1677 * left parentheses making us short more than one operand.
1679 if (operatorSP
>= operandSP
) {
1680 operand
= NewRENode(state
, REOP_EMPTY
);
1686 switch (*state
->cp
) {
1689 if (state
->cp
+ 1 < state
->cpend
&&
1690 *state
->cp
== '?' &&
1691 (state
->cp
[1] == '=' ||
1692 state
->cp
[1] == '!' ||
1693 state
->cp
[1] == ':')) {
1694 switch (state
->cp
[1]) {
1697 /* ASSERT, <next>, ... ASSERTTEST */
1698 state
->progLength
+= 4;
1701 op
= REOP_ASSERT_NOT
;
1702 /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
1703 state
->progLength
+= 4;
1706 op
= REOP_LPARENNON
;
1712 /* LPAREN, <index>, ... RPAREN, <index> */
1714 += 2 * (1 + GetCompactIndexWidth(parenIndex
));
1715 state
->parenCount
++;
1716 if (state
->parenCount
== 65535) {
1717 ReportRegExpError(state
, JSREPORT_ERROR
,
1718 JSMSG_TOO_MANY_PARENS
);
1726 * If there's no stacked open parenthesis, throw syntax error.
1728 for (i
= operatorSP
- 1; ; i
--) {
1730 ReportRegExpError(state
, JSREPORT_ERROR
,
1731 JSMSG_UNMATCHED_RIGHT_PAREN
);
1734 if (operatorStack
[i
].op
== REOP_ASSERT
||
1735 operatorStack
[i
].op
== REOP_ASSERT_NOT
||
1736 operatorStack
[i
].op
== REOP_LPARENNON
||
1737 operatorStack
[i
].op
== REOP_LPAREN
) {
1744 /* Expected an operand before these, so make an empty one */
1745 operand
= NewRENode(state
, REOP_EMPTY
);
1751 if (!ParseTerm(state
))
1753 operand
= state
->result
;
1755 if (operandSP
== operandStackSize
) {
1757 operandStackSize
+= operandStackSize
;
1758 tmp
= heap_realloc(operandStack
, sizeof(RENode
*) * operandStackSize
);
1763 operandStack
[operandSP
++] = operand
;
1768 /* At the end; process remaining operators. */
1770 if (state
->cp
== state
->cpend
) {
1771 while (operatorSP
) {
1773 if (!ProcessOp(state
, &operatorStack
[operatorSP
],
1774 operandStack
, operandSP
))
1778 assert(operandSP
== 1);
1779 state
->result
= operandStack
[0];
1784 switch (*state
->cp
) {
1786 /* Process any stacked 'concat' operators */
1788 while (operatorSP
&&
1789 operatorStack
[operatorSP
- 1].op
== REOP_CONCAT
) {
1791 if (!ProcessOp(state
, &operatorStack
[operatorSP
],
1792 operandStack
, operandSP
)) {
1802 * If there's no stacked open parenthesis, throw syntax error.
1804 for (i
= operatorSP
- 1; ; i
--) {
1806 ReportRegExpError(state
, JSREPORT_ERROR
,
1807 JSMSG_UNMATCHED_RIGHT_PAREN
);
1810 if (operatorStack
[i
].op
== REOP_ASSERT
||
1811 operatorStack
[i
].op
== REOP_ASSERT_NOT
||
1812 operatorStack
[i
].op
== REOP_LPARENNON
||
1813 operatorStack
[i
].op
== REOP_LPAREN
) {
1819 /* Process everything on the stack until the open parenthesis. */
1823 switch (operatorStack
[operatorSP
].op
) {
1825 case REOP_ASSERT_NOT
:
1827 operand
= NewRENode(state
, operatorStack
[operatorSP
].op
);
1830 operand
->u
.parenIndex
=
1831 operatorStack
[operatorSP
].parenIndex
;
1833 operand
->kid
= operandStack
[operandSP
- 1];
1834 operandStack
[operandSP
- 1] = operand
;
1835 if (state
->treeDepth
== TREE_DEPTH_MAX
) {
1836 ReportRegExpError(state
, JSREPORT_ERROR
,
1837 JSMSG_REGEXP_TOO_COMPLEX
);
1843 case REOP_LPARENNON
:
1844 state
->result
= operandStack
[operandSP
- 1];
1845 if (!ParseQuantifier(state
))
1847 operandStack
[operandSP
- 1] = state
->result
;
1848 goto restartOperator
;
1850 if (!ProcessOp(state
, &operatorStack
[operatorSP
],
1851 operandStack
, operandSP
))
1861 const WCHAR
*errp
= state
->cp
;
1863 if (ParseMinMaxQuantifier(state
, TRUE
) < 0) {
1865 * This didn't even scan correctly as a quantifier, so we should
1879 ReportRegExpErrorHelper(state
, JSREPORT_ERROR
, JSMSG_BAD_QUANTIFIER
,
1885 /* Anything else is the start of the next term. */
1888 if (operatorSP
== operatorStackSize
) {
1890 operatorStackSize
+= operatorStackSize
;
1891 tmp
= heap_realloc(operatorStack
, sizeof(REOpData
) * operatorStackSize
);
1894 operatorStack
= tmp
;
1896 operatorStack
[operatorSP
].op
= op
;
1897 operatorStack
[operatorSP
].errPos
= state
->cp
;
1898 operatorStack
[operatorSP
++].parenIndex
= parenIndex
;
1903 heap_free(operatorStack
);
1904 heap_free(operandStack
);
1909 * Save the current state of the match - the position in the input
1910 * text as well as the position in the bytecode. The state of any
1911 * parent expressions is also saved (preceding state).
1912 * Contents of parenCount parentheses from parenIndex are also saved.
1914 static REBackTrackData
*
1915 PushBackTrackState(REGlobalData
*gData
, REOp op
,
1916 jsbytecode
*target
, match_state_t
*x
, const WCHAR
*cp
,
1917 size_t parenIndex
, size_t parenCount
)
1920 REBackTrackData
*result
=
1921 (REBackTrackData
*) ((char *)gData
->backTrackSP
+ gData
->cursz
);
1923 size_t sz
= sizeof(REBackTrackData
) +
1924 gData
->stateStackTop
* sizeof(REProgState
) +
1925 parenCount
* sizeof(RECapture
);
1927 ptrdiff_t btsize
= gData
->backTrackStackSize
;
1928 ptrdiff_t btincr
= ((char *)result
+ sz
) -
1929 ((char *)gData
->backTrackStack
+ btsize
);
1931 TRACE("\tBT_Push: %lu,%lu\n", (ULONG_PTR
)parenIndex
, (ULONG_PTR
)parenCount
);
1933 JS_COUNT_OPERATION(gData
->cx
, JSOW_JUMP
* (1 + parenCount
));
1935 ptrdiff_t offset
= (char *)result
- (char *)gData
->backTrackStack
;
1937 JS_COUNT_OPERATION(gData
->cx
, JSOW_ALLOCATION
);
1938 btincr
= ((btincr
+btsize
-1)/btsize
)*btsize
;
1939 gData
->backTrackStack
= heap_pool_grow(gData
->pool
, gData
->backTrackStack
, btsize
, btincr
);
1940 if (!gData
->backTrackStack
) {
1941 js_ReportOutOfScriptQuota(gData
->cx
);
1945 gData
->backTrackStackSize
= btsize
+ btincr
;
1946 result
= (REBackTrackData
*) ((char *)gData
->backTrackStack
+ offset
);
1948 gData
->backTrackSP
= result
;
1949 result
->sz
= gData
->cursz
;
1952 result
->backtrack_op
= op
;
1953 result
->backtrack_pc
= target
;
1955 result
->parenCount
= parenCount
;
1956 result
->parenIndex
= parenIndex
;
1958 result
->saveStateStackTop
= gData
->stateStackTop
;
1959 assert(gData
->stateStackTop
);
1960 memcpy(result
+ 1, gData
->stateStack
,
1961 sizeof(REProgState
) * result
->saveStateStackTop
);
1963 if (parenCount
!= 0) {
1964 memcpy((char *)(result
+ 1) +
1965 sizeof(REProgState
) * result
->saveStateStackTop
,
1966 &x
->parens
[parenIndex
],
1967 sizeof(RECapture
) * parenCount
);
1968 for (i
= 0; i
!= parenCount
; i
++)
1969 x
->parens
[parenIndex
+ i
].index
= -1;
1975 static inline match_state_t
*
1976 FlatNIMatcher(REGlobalData
*gData
, match_state_t
*x
, const WCHAR
*matchChars
,
1980 assert(gData
->cpend
>= x
->cp
);
1981 if (length
> (size_t)(gData
->cpend
- x
->cp
))
1983 for (i
= 0; i
!= length
; i
++) {
1984 if (toupperW(matchChars
[i
]) != toupperW(x
->cp
[i
]))
1992 * 1. Evaluate DecimalEscape to obtain an EscapeValue E.
1993 * 2. If E is not a character then go to step 6.
1994 * 3. Let ch be E's character.
1995 * 4. Let A be a one-element RECharSet containing the character ch.
1996 * 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
1997 * 6. E must be an integer. Let n be that integer.
1998 * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
1999 * 8. Return an internal Matcher closure that takes two arguments, a State x
2000 * and a Continuation c, and performs the following:
2001 * 1. Let cap be x's captures internal array.
2002 * 2. Let s be cap[n].
2003 * 3. If s is undefined, then call c(x) and return its result.
2004 * 4. Let e be x's endIndex.
2005 * 5. Let len be s's length.
2006 * 6. Let f be e+len.
2007 * 7. If f>InputLength, return failure.
2008 * 8. If there exists an integer i between 0 (inclusive) and len (exclusive)
2009 * such that Canonicalize(s[i]) is not the same character as
2010 * Canonicalize(Input [e+i]), then return failure.
2011 * 9. Let y be the State (f, cap).
2012 * 10. Call c(y) and return its result.
2014 static match_state_t
*
2015 BackrefMatcher(REGlobalData
*gData
, match_state_t
*x
, size_t parenIndex
)
2018 const WCHAR
*parenContent
;
2019 RECapture
*cap
= &x
->parens
[parenIndex
];
2021 if (cap
->index
== -1)
2025 if (x
->cp
+ len
> gData
->cpend
)
2028 parenContent
= &gData
->cpbegin
[cap
->index
];
2029 if (gData
->regexp
->flags
& REG_FOLD
) {
2030 for (i
= 0; i
< len
; i
++) {
2031 if (toupperW(parenContent
[i
]) != toupperW(x
->cp
[i
]))
2035 for (i
= 0; i
< len
; i
++) {
2036 if (parenContent
[i
] != x
->cp
[i
])
2044 /* Add a single character to the RECharSet */
2046 AddCharacterToCharSet(RECharSet
*cs
, WCHAR c
)
2048 UINT byteIndex
= (UINT
)(c
>> 3);
2049 assert(c
<= cs
->length
);
2050 cs
->u
.bits
[byteIndex
] |= 1 << (c
& 0x7);
2054 /* Add a character range, c1 to c2 (inclusive) to the RECharSet */
2056 AddCharacterRangeToCharSet(RECharSet
*cs
, UINT c1
, UINT c2
)
2060 UINT byteIndex1
= c1
>> 3;
2061 UINT byteIndex2
= c2
>> 3;
2063 assert(c2
<= cs
->length
&& c1
<= c2
);
2068 if (byteIndex1
== byteIndex2
) {
2069 cs
->u
.bits
[byteIndex1
] |= ((BYTE
)0xFF >> (7 - (c2
- c1
))) << c1
;
2071 cs
->u
.bits
[byteIndex1
] |= 0xFF << c1
;
2072 for (i
= byteIndex1
+ 1; i
< byteIndex2
; i
++)
2073 cs
->u
.bits
[i
] = 0xFF;
2074 cs
->u
.bits
[byteIndex2
] |= (BYTE
)0xFF >> (7 - c2
);
2078 /* Compile the source of the class into a RECharSet */
2080 ProcessCharSet(REGlobalData
*gData
, RECharSet
*charSet
)
2082 const WCHAR
*src
, *end
;
2083 BOOL inRange
= FALSE
;
2084 WCHAR rangeStart
= 0;
2089 assert(!charSet
->converted
);
2091 * Assert that startIndex and length points to chars inside [] inside
2094 assert(1 <= charSet
->u
.src
.startIndex
);
2095 assert(charSet
->u
.src
.startIndex
< gData
->regexp
->source_len
);
2096 assert(charSet
->u
.src
.length
<= gData
->regexp
->source_len
2097 - 1 - charSet
->u
.src
.startIndex
);
2099 charSet
->converted
= TRUE
;
2100 src
= gData
->regexp
->source
+ charSet
->u
.src
.startIndex
;
2102 end
= src
+ charSet
->u
.src
.length
;
2104 assert(src
[-1] == '[' && end
[0] == ']');
2106 byteLength
= (charSet
->length
>> 3) + 1;
2107 charSet
->u
.bits
= heap_alloc(byteLength
);
2108 if (!charSet
->u
.bits
) {
2109 JS_ReportOutOfMemory(gData
->cx
);
2113 memset(charSet
->u
.bits
, 0, byteLength
);
2119 assert(charSet
->sense
== FALSE
);
2122 assert(charSet
->sense
== TRUE
);
2125 while (src
!= end
) {
2150 if (src
< end
&& JS_ISWORD(*src
)) {
2151 thisCh
= (WCHAR
)(*src
++ & 0x1F);
2164 for (i
= 0; (i
< nDigits
) && (src
< end
); i
++) {
2167 if (!isASCIIHexDigit(c
, &digit
)) {
2169 * Back off to accepting the original '\'
2176 n
= (n
<< 4) | digit
;
2189 * This is a non-ECMA extension - decimal escapes (in this
2190 * case, octal!) are supposed to be an error inside class
2191 * ranges, but supported here for backwards compatibility.
2195 if ('0' <= c
&& c
<= '7') {
2197 n
= 8 * n
+ JS7_UNDEC(c
);
2199 if ('0' <= c
&& c
<= '7') {
2201 i
= 8 * n
+ JS7_UNDEC(c
);
2212 AddCharacterRangeToCharSet(charSet
, '0', '9');
2213 continue; /* don't need range processing */
2215 AddCharacterRangeToCharSet(charSet
, 0, '0' - 1);
2216 AddCharacterRangeToCharSet(charSet
,
2218 (WCHAR
)charSet
->length
);
2221 for (i
= (INT
)charSet
->length
; i
>= 0; i
--)
2223 AddCharacterToCharSet(charSet
, (WCHAR
)i
);
2226 for (i
= (INT
)charSet
->length
; i
>= 0; i
--)
2228 AddCharacterToCharSet(charSet
, (WCHAR
)i
);
2231 for (i
= (INT
)charSet
->length
; i
>= 0; i
--)
2233 AddCharacterToCharSet(charSet
, (WCHAR
)i
);
2236 for (i
= (INT
)charSet
->length
; i
>= 0; i
--)
2238 AddCharacterToCharSet(charSet
, (WCHAR
)i
);
2253 if (gData
->regexp
->flags
& REG_FOLD
) {
2254 assert(rangeStart
<= thisCh
);
2255 for (i
= rangeStart
; i
<= thisCh
; i
++) {
2258 AddCharacterToCharSet(charSet
, i
);
2262 AddCharacterToCharSet(charSet
, uch
);
2264 AddCharacterToCharSet(charSet
, dch
);
2267 AddCharacterRangeToCharSet(charSet
, rangeStart
, thisCh
);
2271 if (gData
->regexp
->flags
& REG_FOLD
) {
2272 AddCharacterToCharSet(charSet
, toupperW(thisCh
));
2273 AddCharacterToCharSet(charSet
, tolowerW(thisCh
));
2275 AddCharacterToCharSet(charSet
, thisCh
);
2277 if (src
< end
- 1) {
2281 rangeStart
= thisCh
;
2290 ReallocStateStack(REGlobalData
*gData
)
2292 size_t limit
= gData
->stateStackLimit
;
2293 size_t sz
= sizeof(REProgState
) * limit
;
2295 gData
->stateStack
= heap_pool_grow(gData
->pool
, gData
->stateStack
, sz
, sz
);
2296 if (!gData
->stateStack
) {
2297 js_ReportOutOfScriptQuota(gData
->cx
);
2301 gData
->stateStackLimit
= limit
+ limit
;
2305 #define PUSH_STATE_STACK(data) \
2307 ++(data)->stateStackTop; \
2308 if ((data)->stateStackTop == (data)->stateStackLimit && \
2309 !ReallocStateStack((data))) { \
2315 * Apply the current op against the given input to see if it's going to match
2316 * or fail. Return false if we don't get a match, true if we do. If updatecp is
2317 * true, then update the current state's cp. Always update startpc to the next
2320 static inline match_state_t
*
2321 SimpleMatch(REGlobalData
*gData
, match_state_t
*x
, REOp op
,
2322 jsbytecode
**startpc
, BOOL updatecp
)
2324 match_state_t
*result
= NULL
;
2327 size_t offset
, length
, index
;
2328 jsbytecode
*pc
= *startpc
; /* pc has already been incremented past op */
2329 const WCHAR
*source
;
2330 const WCHAR
*startcp
= x
->cp
;
2334 const char *opname
= reop_names
[op
];
2335 TRACE("\n%06d: %*s%s\n", (int)(pc
- gData
->regexp
->program
),
2336 (int)gData
->stateStackTop
* 2, "", opname
);
2343 if (x
->cp
!= gData
->cpbegin
) {
2344 if (/*!gData->cx->regExpStatics.multiline && FIXME !!! */
2345 !(gData
->regexp
->flags
& REG_MULTILINE
)) {
2348 if (!RE_IS_LINE_TERM(x
->cp
[-1]))
2354 if (x
->cp
!= gData
->cpend
) {
2355 if (/*!gData->cx->regExpStatics.multiline &&*/
2356 !(gData
->regexp
->flags
& REG_MULTILINE
)) {
2359 if (!RE_IS_LINE_TERM(*x
->cp
))
2365 if ((x
->cp
== gData
->cpbegin
|| !JS_ISWORD(x
->cp
[-1])) ^
2366 !(x
->cp
!= gData
->cpend
&& JS_ISWORD(*x
->cp
))) {
2371 if ((x
->cp
== gData
->cpbegin
|| !JS_ISWORD(x
->cp
[-1])) ^
2372 (x
->cp
!= gData
->cpend
&& JS_ISWORD(*x
->cp
))) {
2377 if (x
->cp
!= gData
->cpend
&& !RE_IS_LINE_TERM(*x
->cp
)) {
2383 if (x
->cp
!= gData
->cpend
&& JS7_ISDEC(*x
->cp
)) {
2389 if (x
->cp
!= gData
->cpend
&& !JS7_ISDEC(*x
->cp
)) {
2395 if (x
->cp
!= gData
->cpend
&& JS_ISWORD(*x
->cp
)) {
2401 if (x
->cp
!= gData
->cpend
&& !JS_ISWORD(*x
->cp
)) {
2407 if (x
->cp
!= gData
->cpend
&& isspaceW(*x
->cp
)) {
2413 if (x
->cp
!= gData
->cpend
&& !isspaceW(*x
->cp
)) {
2419 pc
= ReadCompactIndex(pc
, &parenIndex
);
2420 assert(parenIndex
< gData
->regexp
->parenCount
);
2421 result
= BackrefMatcher(gData
, x
, parenIndex
);
2424 pc
= ReadCompactIndex(pc
, &offset
);
2425 assert(offset
< gData
->regexp
->source_len
);
2426 pc
= ReadCompactIndex(pc
, &length
);
2427 assert(1 <= length
);
2428 assert(length
<= gData
->regexp
->source_len
- offset
);
2429 if (length
<= (size_t)(gData
->cpend
- x
->cp
)) {
2430 source
= gData
->regexp
->source
+ offset
;
2431 TRACE("%s\n", debugstr_wn(source
, length
));
2432 for (index
= 0; index
!= length
; index
++) {
2433 if (source
[index
] != x
->cp
[index
])
2442 TRACE(" '%c' == '%c'\n", (char)matchCh
, (char)*x
->cp
);
2443 if (x
->cp
!= gData
->cpend
&& *x
->cp
== matchCh
) {
2449 pc
= ReadCompactIndex(pc
, &offset
);
2450 assert(offset
< gData
->regexp
->source_len
);
2451 pc
= ReadCompactIndex(pc
, &length
);
2452 assert(1 <= length
);
2453 assert(length
<= gData
->regexp
->source_len
- offset
);
2454 source
= gData
->regexp
->source
;
2455 result
= FlatNIMatcher(gData
, x
, source
+ offset
, length
);
2459 if (x
->cp
!= gData
->cpend
&& toupperW(*x
->cp
) == toupperW(matchCh
)) {
2465 matchCh
= GET_ARG(pc
);
2466 TRACE(" '%c' == '%c'\n", (char)matchCh
, (char)*x
->cp
);
2468 if (x
->cp
!= gData
->cpend
&& *x
->cp
== matchCh
) {
2474 matchCh
= GET_ARG(pc
);
2476 if (x
->cp
!= gData
->cpend
&& toupperW(*x
->cp
) == toupperW(matchCh
)) {
2482 pc
= ReadCompactIndex(pc
, &index
);
2483 assert(index
< gData
->regexp
->classCount
);
2484 if (x
->cp
!= gData
->cpend
) {
2485 charSet
= &gData
->regexp
->classList
[index
];
2486 assert(charSet
->converted
);
2489 if (charSet
->length
!= 0 &&
2490 ch
<= charSet
->length
&&
2491 (charSet
->u
.bits
[index
] & (1 << (ch
& 0x7)))) {
2498 pc
= ReadCompactIndex(pc
, &index
);
2499 assert(index
< gData
->regexp
->classCount
);
2500 if (x
->cp
!= gData
->cpend
) {
2501 charSet
= &gData
->regexp
->classList
[index
];
2502 assert(charSet
->converted
);
2505 if (charSet
->length
== 0 ||
2506 ch
> charSet
->length
||
2507 !(charSet
->u
.bits
[index
] & (1 << (ch
& 0x7)))) {
2528 static inline match_state_t
*
2529 ExecuteREBytecode(REGlobalData
*gData
, match_state_t
*x
)
2531 match_state_t
*result
= NULL
;
2532 REBackTrackData
*backTrackData
;
2533 jsbytecode
*nextpc
, *testpc
;
2536 REProgState
*curState
;
2537 const WCHAR
*startcp
;
2538 size_t parenIndex
, k
;
2539 size_t parenSoFar
= 0;
2541 WCHAR matchCh1
, matchCh2
;
2545 jsbytecode
*pc
= gData
->regexp
->program
;
2546 REOp op
= (REOp
) *pc
++;
2549 * If the first node is a simple match, step the index into the string
2550 * until that match is made, or fail if it can't be found at all.
2552 if (REOP_IS_SIMPLE(op
) && !(gData
->regexp
->flags
& REG_STICKY
)) {
2554 while (x
->cp
<= gData
->cpend
) {
2555 nextpc
= pc
; /* reset back to start each time */
2556 result
= SimpleMatch(gData
, x
, op
, &nextpc
, TRUE
);
2560 pc
= nextpc
; /* accept skip to next opcode */
2562 assert(op
< REOP_LIMIT
);
2573 const char *opname
= reop_names
[op
];
2574 TRACE("\n%06d: %*s%s\n", (int)(pc
- gData
->regexp
->program
),
2575 (int)gData
->stateStackTop
* 2, "", opname
);
2577 if (REOP_IS_SIMPLE(op
)) {
2578 result
= SimpleMatch(gData
, x
, op
, &pc
, TRUE
);
2580 curState
= &gData
->stateStack
[gData
->stateStackTop
];
2584 case REOP_ALTPREREQ2
:
2585 nextpc
= pc
+ GET_OFFSET(pc
); /* start of next op */
2587 matchCh2
= GET_ARG(pc
);
2592 if (x
->cp
!= gData
->cpend
) {
2593 if (*x
->cp
== matchCh2
)
2596 charSet
= &gData
->regexp
->classList
[k
];
2597 if (!charSet
->converted
&& !ProcessCharSet(gData
, charSet
))
2601 if ((charSet
->length
== 0 ||
2602 matchCh1
> charSet
->length
||
2603 !(charSet
->u
.bits
[k
] & (1 << (matchCh1
& 0x7)))) ^
2611 case REOP_ALTPREREQ
:
2612 nextpc
= pc
+ GET_OFFSET(pc
); /* start of next op */
2614 matchCh1
= GET_ARG(pc
);
2616 matchCh2
= GET_ARG(pc
);
2618 if (x
->cp
== gData
->cpend
||
2619 (*x
->cp
!= matchCh1
&& *x
->cp
!= matchCh2
)) {
2623 /* else fall through... */
2627 nextpc
= pc
+ GET_OFFSET(pc
); /* start of next alternate */
2628 pc
+= ARG_LEN
; /* start of this alternate */
2629 curState
->parenSoFar
= parenSoFar
;
2630 PUSH_STATE_STACK(gData
);
2633 if (REOP_IS_SIMPLE(op
)) {
2634 if (!SimpleMatch(gData
, x
, op
, &pc
, TRUE
)) {
2635 op
= (REOp
) *nextpc
++;
2642 nextop
= (REOp
) *nextpc
++;
2643 if (!PushBackTrackState(gData
, nextop
, nextpc
, x
, startcp
, 0, 0))
2648 * Occurs at (successful) end of REOP_ALT,
2652 * If we have not gotten a result here, it is because of an
2653 * empty match. Do the same thing REOP_EMPTY would do.
2658 --gData
->stateStackTop
;
2659 pc
+= GET_OFFSET(pc
);
2664 * Occurs at last (successful) end of REOP_ALT,
2668 * If we have not gotten a result here, it is because of an
2669 * empty match. Do the same thing REOP_EMPTY would do.
2674 --gData
->stateStackTop
;
2679 pc
= ReadCompactIndex(pc
, &parenIndex
);
2680 TRACE("[ %lu ]\n", (ULONG_PTR
)parenIndex
);
2681 assert(parenIndex
< gData
->regexp
->parenCount
);
2682 if (parenIndex
+ 1 > parenSoFar
)
2683 parenSoFar
= parenIndex
+ 1;
2684 x
->parens
[parenIndex
].index
= x
->cp
- gData
->cpbegin
;
2685 x
->parens
[parenIndex
].length
= 0;
2693 pc
= ReadCompactIndex(pc
, &parenIndex
);
2694 assert(parenIndex
< gData
->regexp
->parenCount
);
2695 cap
= &x
->parens
[parenIndex
];
2696 delta
= x
->cp
- (gData
->cpbegin
+ cap
->index
);
2697 cap
->length
= (delta
< 0) ? 0 : (size_t) delta
;
2705 nextpc
= pc
+ GET_OFFSET(pc
); /* start of term after ASSERT */
2706 pc
+= ARG_LEN
; /* start of ASSERT child */
2709 if (REOP_IS_SIMPLE(op
) &&
2710 !SimpleMatch(gData
, x
, op
, &testpc
, FALSE
)) {
2714 curState
->u
.assertion
.top
=
2715 (char *)gData
->backTrackSP
- (char *)gData
->backTrackStack
;
2716 curState
->u
.assertion
.sz
= gData
->cursz
;
2717 curState
->index
= x
->cp
- gData
->cpbegin
;
2718 curState
->parenSoFar
= parenSoFar
;
2719 PUSH_STATE_STACK(gData
);
2720 if (!PushBackTrackState(gData
, REOP_ASSERTTEST
,
2721 nextpc
, x
, x
->cp
, 0, 0)) {
2726 case REOP_ASSERT_NOT
:
2727 nextpc
= pc
+ GET_OFFSET(pc
);
2731 if (REOP_IS_SIMPLE(op
) /* Note - fail to fail! */ &&
2732 SimpleMatch(gData
, x
, op
, &testpc
, FALSE
) &&
2733 *testpc
== REOP_ASSERTNOTTEST
) {
2737 curState
->u
.assertion
.top
2738 = (char *)gData
->backTrackSP
-
2739 (char *)gData
->backTrackStack
;
2740 curState
->u
.assertion
.sz
= gData
->cursz
;
2741 curState
->index
= x
->cp
- gData
->cpbegin
;
2742 curState
->parenSoFar
= parenSoFar
;
2743 PUSH_STATE_STACK(gData
);
2744 if (!PushBackTrackState(gData
, REOP_ASSERTNOTTEST
,
2745 nextpc
, x
, x
->cp
, 0, 0)) {
2750 case REOP_ASSERTTEST
:
2751 --gData
->stateStackTop
;
2753 x
->cp
= gData
->cpbegin
+ curState
->index
;
2754 gData
->backTrackSP
=
2755 (REBackTrackData
*) ((char *)gData
->backTrackStack
+
2756 curState
->u
.assertion
.top
);
2757 gData
->cursz
= curState
->u
.assertion
.sz
;
2762 case REOP_ASSERTNOTTEST
:
2763 --gData
->stateStackTop
;
2765 x
->cp
= gData
->cpbegin
+ curState
->index
;
2766 gData
->backTrackSP
=
2767 (REBackTrackData
*) ((char *)gData
->backTrackStack
+
2768 curState
->u
.assertion
.top
);
2769 gData
->cursz
= curState
->u
.assertion
.sz
;
2770 result
= (!result
) ? x
: NULL
;
2773 curState
->u
.quantifier
.min
= 0;
2774 curState
->u
.quantifier
.max
= (UINT
)-1;
2777 curState
->u
.quantifier
.min
= 1;
2778 curState
->u
.quantifier
.max
= (UINT
)-1;
2781 curState
->u
.quantifier
.min
= 0;
2782 curState
->u
.quantifier
.max
= 1;
2785 pc
= ReadCompactIndex(pc
, &k
);
2786 curState
->u
.quantifier
.min
= k
;
2787 pc
= ReadCompactIndex(pc
, &k
);
2788 /* max is k - 1 to use one byte for (UINT)-1 sentinel. */
2789 curState
->u
.quantifier
.max
= k
- 1;
2790 assert(curState
->u
.quantifier
.min
<= curState
->u
.quantifier
.max
);
2792 if (curState
->u
.quantifier
.max
== 0) {
2793 pc
= pc
+ GET_OFFSET(pc
);
2798 /* Step over <next> */
2799 nextpc
= pc
+ ARG_LEN
;
2800 op
= (REOp
) *nextpc
++;
2802 if (REOP_IS_SIMPLE(op
)) {
2803 if (!SimpleMatch(gData
, x
, op
, &nextpc
, TRUE
)) {
2804 if (curState
->u
.quantifier
.min
== 0)
2808 pc
= pc
+ GET_OFFSET(pc
);
2811 op
= (REOp
) *nextpc
++;
2814 curState
->index
= startcp
- gData
->cpbegin
;
2815 curState
->continue_op
= REOP_REPEAT
;
2816 curState
->continue_pc
= pc
;
2817 curState
->parenSoFar
= parenSoFar
;
2818 PUSH_STATE_STACK(gData
);
2819 if (curState
->u
.quantifier
.min
== 0 &&
2820 !PushBackTrackState(gData
, REOP_REPEAT
, pc
, x
, startcp
,
2827 case REOP_ENDCHILD
: /* marks the end of a quantifier child */
2828 pc
= curState
[-1].continue_pc
;
2829 op
= (REOp
) curState
[-1].continue_op
;
2838 --gData
->stateStackTop
;
2840 /* Failed, see if we have enough children. */
2841 if (curState
->u
.quantifier
.min
== 0)
2845 if (curState
->u
.quantifier
.min
== 0 &&
2846 x
->cp
== gData
->cpbegin
+ curState
->index
) {
2847 /* matched an empty string, that'll get us nowhere */
2851 if (curState
->u
.quantifier
.min
!= 0)
2852 curState
->u
.quantifier
.min
--;
2853 if (curState
->u
.quantifier
.max
!= (UINT
) -1)
2854 curState
->u
.quantifier
.max
--;
2855 if (curState
->u
.quantifier
.max
== 0)
2857 nextpc
= pc
+ ARG_LEN
;
2858 nextop
= (REOp
) *nextpc
;
2860 if (REOP_IS_SIMPLE(nextop
)) {
2862 if (!SimpleMatch(gData
, x
, nextop
, &nextpc
, TRUE
)) {
2863 if (curState
->u
.quantifier
.min
== 0)
2870 curState
->index
= startcp
- gData
->cpbegin
;
2871 PUSH_STATE_STACK(gData
);
2872 if (curState
->u
.quantifier
.min
== 0 &&
2873 !PushBackTrackState(gData
, REOP_REPEAT
,
2875 curState
->parenSoFar
,
2877 curState
->parenSoFar
)) {
2880 } while (*nextpc
== REOP_ENDCHILD
);
2883 parenSoFar
= curState
->parenSoFar
;
2888 pc
+= GET_OFFSET(pc
);
2891 case REOP_MINIMALSTAR
:
2892 curState
->u
.quantifier
.min
= 0;
2893 curState
->u
.quantifier
.max
= (UINT
)-1;
2894 goto minimalquantcommon
;
2895 case REOP_MINIMALPLUS
:
2896 curState
->u
.quantifier
.min
= 1;
2897 curState
->u
.quantifier
.max
= (UINT
)-1;
2898 goto minimalquantcommon
;
2899 case REOP_MINIMALOPT
:
2900 curState
->u
.quantifier
.min
= 0;
2901 curState
->u
.quantifier
.max
= 1;
2902 goto minimalquantcommon
;
2903 case REOP_MINIMALQUANT
:
2904 pc
= ReadCompactIndex(pc
, &k
);
2905 curState
->u
.quantifier
.min
= k
;
2906 pc
= ReadCompactIndex(pc
, &k
);
2907 /* See REOP_QUANT comments about k - 1. */
2908 curState
->u
.quantifier
.max
= k
- 1;
2909 assert(curState
->u
.quantifier
.min
2910 <= curState
->u
.quantifier
.max
);
2912 curState
->index
= x
->cp
- gData
->cpbegin
;
2913 curState
->parenSoFar
= parenSoFar
;
2914 PUSH_STATE_STACK(gData
);
2915 if (curState
->u
.quantifier
.min
!= 0) {
2916 curState
->continue_op
= REOP_MINIMALREPEAT
;
2917 curState
->continue_pc
= pc
;
2918 /* step over <next> */
2922 if (!PushBackTrackState(gData
, REOP_MINIMALREPEAT
,
2923 pc
, x
, x
->cp
, 0, 0)) {
2926 --gData
->stateStackTop
;
2927 pc
= pc
+ GET_OFFSET(pc
);
2932 case REOP_MINIMALREPEAT
:
2933 --gData
->stateStackTop
;
2936 TRACE("{%d,%d}\n", curState
->u
.quantifier
.min
, curState
->u
.quantifier
.max
);
2937 #define PREPARE_REPEAT() \
2939 curState->index = x->cp - gData->cpbegin; \
2940 curState->continue_op = REOP_MINIMALREPEAT; \
2941 curState->continue_pc = pc; \
2943 for (k = curState->parenSoFar; k < parenSoFar; k++) \
2944 x->parens[k].index = -1; \
2945 PUSH_STATE_STACK(gData); \
2946 op = (REOp) *pc++; \
2947 assert(op < REOP_LIMIT); \
2953 * Non-greedy failure - try to consume another child.
2955 if (curState
->u
.quantifier
.max
== (UINT
) -1 ||
2956 curState
->u
.quantifier
.max
> 0) {
2960 /* Don't need to adjust pc since we're going to pop. */
2963 if (curState
->u
.quantifier
.min
== 0 &&
2964 x
->cp
== gData
->cpbegin
+ curState
->index
) {
2965 /* Matched an empty string, that'll get us nowhere. */
2969 if (curState
->u
.quantifier
.min
!= 0)
2970 curState
->u
.quantifier
.min
--;
2971 if (curState
->u
.quantifier
.max
!= (UINT
) -1)
2972 curState
->u
.quantifier
.max
--;
2973 if (curState
->u
.quantifier
.min
!= 0) {
2977 curState
->index
= x
->cp
- gData
->cpbegin
;
2978 curState
->parenSoFar
= parenSoFar
;
2979 PUSH_STATE_STACK(gData
);
2980 if (!PushBackTrackState(gData
, REOP_MINIMALREPEAT
,
2982 curState
->parenSoFar
,
2983 parenSoFar
- curState
->parenSoFar
)) {
2986 --gData
->stateStackTop
;
2987 pc
= pc
+ GET_OFFSET(pc
);
2989 assert(op
< REOP_LIMIT
);
2999 * If the match failed and there's a backtrack option, take it.
3000 * Otherwise this is a complete and utter failure.
3003 if (gData
->cursz
== 0)
3006 /* Potentially detect explosive regex here. */
3007 gData
->backTrackCount
++;
3008 if (gData
->backTrackLimit
&&
3009 gData
->backTrackCount
>= gData
->backTrackLimit
) {
3010 JS_ReportErrorNumber(gData
->cx
, js_GetErrorMessage
, NULL
,
3011 JSMSG_REGEXP_TOO_COMPLEX
);
3016 backTrackData
= gData
->backTrackSP
;
3017 gData
->cursz
= backTrackData
->sz
;
3018 gData
->backTrackSP
=
3019 (REBackTrackData
*) ((char *)backTrackData
- backTrackData
->sz
);
3020 x
->cp
= backTrackData
->cp
;
3021 pc
= backTrackData
->backtrack_pc
;
3022 op
= (REOp
) backTrackData
->backtrack_op
;
3023 assert(op
< REOP_LIMIT
);
3024 gData
->stateStackTop
= backTrackData
->saveStateStackTop
;
3025 assert(gData
->stateStackTop
);
3027 memcpy(gData
->stateStack
, backTrackData
+ 1,
3028 sizeof(REProgState
) * backTrackData
->saveStateStackTop
);
3029 curState
= &gData
->stateStack
[gData
->stateStackTop
- 1];
3031 if (backTrackData
->parenCount
) {
3032 memcpy(&x
->parens
[backTrackData
->parenIndex
],
3033 (char *)(backTrackData
+ 1) +
3034 sizeof(REProgState
) * backTrackData
->saveStateStackTop
,
3035 sizeof(RECapture
) * backTrackData
->parenCount
);
3036 parenSoFar
= backTrackData
->parenIndex
+ backTrackData
->parenCount
;
3038 for (k
= curState
->parenSoFar
; k
< parenSoFar
; k
++)
3039 x
->parens
[k
].index
= -1;
3040 parenSoFar
= curState
->parenSoFar
;
3043 TRACE("\tBT_Pop: %ld,%ld\n",
3044 (ULONG_PTR
)backTrackData
->parenIndex
,
3045 (ULONG_PTR
)backTrackData
->parenCount
);
3051 * Continue with the expression.
3054 assert(op
< REOP_LIMIT
);
3066 static match_state_t
*MatchRegExp(REGlobalData
*gData
, match_state_t
*x
)
3068 match_state_t
*result
;
3069 const WCHAR
*cp
= x
->cp
;
3074 * Have to include the position beyond the last character
3075 * in order to detect end-of-input/line condition.
3077 for (cp2
= cp
; cp2
<= gData
->cpend
; cp2
++) {
3078 gData
->skipped
= cp2
- cp
;
3080 for (j
= 0; j
< gData
->regexp
->parenCount
; j
++)
3081 x
->parens
[j
].index
= -1;
3082 result
= ExecuteREBytecode(gData
, x
);
3083 if (!gData
->ok
|| result
|| (gData
->regexp
->flags
& REG_STICKY
))
3085 gData
->backTrackSP
= gData
->backTrackStack
;
3087 gData
->stateStackTop
= 0;
3088 cp2
= cp
+ gData
->skipped
;
3093 static HRESULT
InitMatch(regexp_t
*re
, void *cx
, heap_pool_t
*pool
, REGlobalData
*gData
)
3097 gData
->backTrackStackSize
= INITIAL_BACKTRACK
;
3098 gData
->backTrackStack
= heap_pool_alloc(gData
->pool
, INITIAL_BACKTRACK
);
3099 if (!gData
->backTrackStack
)
3102 gData
->backTrackSP
= gData
->backTrackStack
;
3104 gData
->backTrackCount
= 0;
3105 gData
->backTrackLimit
= 0;
3107 gData
->stateStackLimit
= INITIAL_STATESTACK
;
3108 gData
->stateStack
= heap_pool_alloc(gData
->pool
, sizeof(REProgState
) * INITIAL_STATESTACK
);
3109 if (!gData
->stateStack
)
3112 gData
->stateStackTop
= 0;
3118 for (i
= 0; i
< re
->classCount
; i
++) {
3119 if (!re
->classList
[i
].converted
&&
3120 !ProcessCharSet(gData
, &re
->classList
[i
])) {
3128 js_ReportOutOfScriptQuota(cx
);
3130 return E_OUTOFMEMORY
;
3133 HRESULT
regexp_execute(regexp_t
*regexp
, void *cx
, heap_pool_t
*pool
,
3134 const WCHAR
*str
, DWORD str_len
, match_state_t
*result
)
3138 heap_pool_t
*mark
= heap_pool_mark(pool
);
3139 const WCHAR
*str_beg
= result
->cp
;
3142 assert(result
->cp
!= NULL
);
3144 gData
.cpbegin
= str
;
3145 gData
.cpend
= str
+str_len
;
3146 gData
.start
= result
->cp
-str
;
3150 hres
= InitMatch(regexp
, cx
, pool
, &gData
);
3152 WARN("InitMatch failed\n");
3153 heap_pool_clear(mark
);
3157 res
= MatchRegExp(&gData
, result
);
3158 heap_pool_clear(mark
);
3160 WARN("MatchRegExp failed\n");
3165 result
->match_len
= 0;
3169 result
->match_len
= (result
->cp
-str_beg
) - gData
.skipped
;
3170 result
->paren_count
= regexp
->parenCount
;
3174 void regexp_destroy(regexp_t
*re
)
3176 if (re
->classList
) {
3178 for (i
= 0; i
< re
->classCount
; i
++) {
3179 if (re
->classList
[i
].converted
)
3180 heap_free(re
->classList
[i
].u
.bits
);
3181 re
->classList
[i
].u
.bits
= NULL
;
3183 heap_free(re
->classList
);
3188 regexp_t
* regexp_new(void *cx
, heap_pool_t
*pool
, const WCHAR
*str
,
3189 DWORD str_len
, WORD flags
, BOOL flat
)
3193 CompilerState state
;
3200 mark
= heap_pool_mark(pool
);
3208 state
.cpbegin
= state
.cp
;
3209 state
.cpend
= state
.cp
+ len
;
3210 state
.flags
= flags
;
3211 state
.parenCount
= 0;
3212 state
.classCount
= 0;
3213 state
.progLength
= 0;
3214 state
.treeDepth
= 0;
3215 state
.classBitmapsMem
= 0;
3216 for (i
= 0; i
< CLASS_CACHE_SIZE
; i
++)
3217 state
.classCache
[i
].start
= NULL
;
3219 if (len
!= 0 && flat
) {
3220 state
.result
= NewRENode(&state
, REOP_FLAT
);
3223 state
.result
->u
.flat
.chr
= *state
.cpbegin
;
3224 state
.result
->u
.flat
.length
= len
;
3225 state
.result
->kid
= (void *) state
.cpbegin
;
3226 /* Flat bytecode: REOP_FLAT compact(string_offset) compact(len). */
3227 state
.progLength
+= 1 + GetCompactIndexWidth(0)
3228 + GetCompactIndexWidth(len
);
3230 if (!ParseRegExp(&state
))
3233 resize
= offsetof(regexp_t
, program
) + state
.progLength
+ 1;
3234 re
= heap_alloc(resize
);
3238 assert(state
.classBitmapsMem
<= CLASS_BITMAPS_MEM_LIMIT
);
3239 re
->classCount
= state
.classCount
;
3240 if (re
->classCount
) {
3241 re
->classList
= heap_alloc(re
->classCount
* sizeof(RECharSet
));
3242 if (!re
->classList
) {
3247 for (i
= 0; i
< re
->classCount
; i
++)
3248 re
->classList
[i
].converted
= FALSE
;
3250 re
->classList
= NULL
;
3252 endPC
= EmitREBytecode(&state
, re
, state
.treeDepth
, re
->program
, state
.result
);
3258 *endPC
++ = REOP_END
;
3260 * Check whether size was overestimated and shrink using realloc.
3261 * This is safe since no pointers to newly parsed regexp or its parts
3262 * besides re exist here.
3264 if ((size_t)(endPC
- re
->program
) != state
.progLength
+ 1) {
3266 assert((size_t)(endPC
- re
->program
) < state
.progLength
+ 1);
3267 resize
= offsetof(regexp_t
, program
) + (endPC
- re
->program
);
3268 tmp
= heap_realloc(re
, resize
);
3274 re
->parenCount
= state
.parenCount
;
3276 re
->source_len
= str_len
;
3279 heap_pool_clear(mark
);
3283 HRESULT
regexp_set_flags(regexp_t
**regexp
, void *cx
, heap_pool_t
*pool
, WORD flags
)
3285 if(((*regexp
)->flags
& REG_FOLD
) != (flags
& REG_FOLD
)) {
3286 regexp_t
*new_regexp
= regexp_new(cx
, pool
, (*regexp
)->source
,
3287 (*regexp
)->source_len
, flags
, FALSE
);
3292 regexp_destroy(*regexp
);
3293 *regexp
= new_regexp
;
3295 (*regexp
)->flags
= flags
;