reactos/dll/win32/vbscript/regexp.c

   1 /*
   2  * Copyright 2008 Jacek Caban for CodeWeavers
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2.1 of the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
  17  */
  18
  19 /*
  20  * Code in this file is based on files:
  21  * js/src/jsregexp.h
  22  * js/src/jsregexp.c
  23  * from Mozilla project, released under LGPL 2.1 or later.
  24  *
  25  * The Original Code is Mozilla Communicator client code, released
  26  * March 31, 1998.
  27  *
  28  * The Initial Developer of the Original Code is
  29  * Netscape Communications Corporation.
  30  * Portions created by the Initial Developer are Copyright (C) 1998
  31  * the Initial Developer. All Rights Reserved.
  32  */
  33
  34 #include "vbscript.h"
  35
  36 /* FIXME: Better error handling */
  37 #define ReportRegExpError(a,b,c)
  38 #define ReportRegExpErrorHelper(a,b,c,d)
  39 #define JS_ReportErrorNumber(a,b,c,d)
  40 #define JS_ReportErrorFlagsAndNumber(a,b,c,d,e,f)
  41 #define js_ReportOutOfScriptQuota(a)
  42 #define JS_ReportOutOfMemory(a)
  43 #define JS_COUNT_OPERATION(a,b)
  44
  45
  46 typedef BYTE JSPackedBool;
  47
  48 /*
  49  * This struct holds a bitmap representation of a class from a regexp.
  50  * There's a list of these referenced by the classList field in the regexp_t
  51  * struct below. The initial state has startIndex set to the offset in the
  52  * original regexp source of the beginning of the class contents. The first
  53  * use of the class converts the source representation into a bitmap.
  54  *
  55  */
  56 typedef struct RECharSet {
  57     JSPackedBool    converted;
  58     JSPackedBool    sense;
  59     WORD            length;
  60     union {
  61         BYTE        *bits;
  62         struct {
  63             size_t  startIndex;
  64             size_t  length;
  65         } src;
  66     } u;
  67 } RECharSet;
  68
  69 #define JSMSG_MIN_TOO_BIG 47
  70 #define JSMSG_MAX_TOO_BIG 48
  71 #define JSMSG_OUT_OF_ORDER 49
  72 #define JSMSG_OUT_OF_MEMORY 137
  73
  74 #define LINE_SEPARATOR  0x2028
  75 #define PARA_SEPARATOR  0x2029
  76
  77 #define RE_IS_LETTER(c)     (((c >= 'A') && (c <= 'Z')) ||                    \
  78                              ((c >= 'a') && (c <= 'z')) )
  79 #define RE_IS_LINE_TERM(c)  ((c == '\n') || (c == '\r') ||                    \
  80                              (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
  81
  82 #define JS_ISWORD(c)    ((c) < 128 && (isalnum(c) || (c) == '_'))
  83
  84 #define JS7_ISDEC(c)    ((((unsigned)(c)) - '0') <= 9)
  85 #define JS7_UNDEC(c)    ((c) - '0')
  86
  87 typedef enum REOp {
  88     REOP_EMPTY,
  89     REOP_BOL,
  90     REOP_EOL,
  91     REOP_WBDRY,
  92     REOP_WNONBDRY,
  93     REOP_DOT,
  94     REOP_DIGIT,
  95     REOP_NONDIGIT,
  96     REOP_ALNUM,
  97     REOP_NONALNUM,
  98     REOP_SPACE,
  99     REOP_NONSPACE,
 100     REOP_BACKREF,
 101     REOP_FLAT,
 102     REOP_FLAT1,
 103     REOP_FLATi,
 104     REOP_FLAT1i,
 105     REOP_UCFLAT1,
 106     REOP_UCFLAT1i,
 107     REOP_UCFLAT,
 108     REOP_UCFLATi,
 109     REOP_CLASS,
 110     REOP_NCLASS,
 111     REOP_ALT,
 112     REOP_QUANT,
 113     REOP_STAR,
 114     REOP_PLUS,
 115     REOP_OPT,
 116     REOP_LPAREN,
 117     REOP_RPAREN,
 118     REOP_JUMP,
 119     REOP_DOTSTAR,
 120     REOP_LPARENNON,
 121     REOP_ASSERT,
 122     REOP_ASSERT_NOT,
 123     REOP_ASSERTTEST,
 124     REOP_ASSERTNOTTEST,
 125     REOP_MINIMALSTAR,
 126     REOP_MINIMALPLUS,
 127     REOP_MINIMALOPT,
 128     REOP_MINIMALQUANT,
 129     REOP_ENDCHILD,
 130     REOP_REPEAT,
 131     REOP_MINIMALREPEAT,
 132     REOP_ALTPREREQ,
 133     REOP_ALTPREREQ2,
 134     REOP_ENDALT,
 135     REOP_CONCAT,
 136     REOP_END,
 137     REOP_LIMIT /* META: no operator >= to this */
 138 } REOp;
 139
 140 #define REOP_IS_SIMPLE(op)  ((op) <= REOP_NCLASS)
 141
 142 static const char *reop_names[] = {
 143     "empty",
 144     "bol",
 145     "eol",
 146     "wbdry",
 147     "wnonbdry",
 148     "dot",
 149     "digit",
 150     "nondigit",
 151     "alnum",
 152     "nonalnum",
 153     "space",
 154     "nonspace",
 155     "backref",
 156     "flat",
 157     "flat1",
 158     "flati",
 159     "flat1i",
 160     "ucflat1",
 161     "ucflat1i",
 162     "ucflat",
 163     "ucflati",
 164     "class",
 165     "nclass",
 166     "alt",
 167     "quant",
 168     "star",
 169     "plus",
 170     "opt",
 171     "lparen",
 172     "rparen",
 173     "jump",
 174     "dotstar",
 175     "lparennon",
 176     "assert",
 177     "assert_not",
 178     "asserttest",
 179     "assertnottest",
 180     "minimalstar",
 181     "minimalplus",
 182     "minimalopt",
 183     "minimalquant",
 184     "endchild",
 185     "repeat",
 186     "minimalrepeat",
 187     "altprereq",
 188     "alrprereq2",
 189     "endalt",
 190     "concat",
 191     "end",
 192     NULL
 193 };
 194
 195 typedef struct REProgState {
 196     jsbytecode *continue_pc;        /* current continuation data */
 197     jsbytecode continue_op;
 198     ptrdiff_t index;                /* progress in text */
 199     size_t parenSoFar;              /* highest indexed paren started */
 200     union {
 201         struct {
 202             UINT min;               /* current quantifier limits */
 203             UINT max;
 204         } quantifier;
 205         struct {
 206             size_t top;             /* backtrack stack state */
 207             size_t sz;
 208         } assertion;
 209     } u;
 210 } REProgState;
 211
 212 typedef struct REBackTrackData {
 213     size_t sz;                      /* size of previous stack entry */
 214     jsbytecode *backtrack_pc;       /* where to backtrack to */
 215     jsbytecode backtrack_op;
 216     const WCHAR *cp;                /* index in text of match at backtrack */
 217     size_t parenIndex;              /* start index of saved paren contents */
 218     size_t parenCount;              /* # of saved paren contents */
 219     size_t saveStateStackTop;       /* number of parent states */
 220     /* saved parent states follow */
 221     /* saved paren contents follow */
 222 } REBackTrackData;
 223
 224 #define INITIAL_STATESTACK  100
 225 #define INITIAL_BACKTRACK   8000
 226
 227 typedef struct REGlobalData {
 228     void *cx;
 229     regexp_t *regexp;               /* the RE in execution */
 230     BOOL ok;                        /* runtime error (out_of_memory only?) */
 231     size_t start;                   /* offset to start at */
 232     ptrdiff_t skipped;              /* chars skipped anchoring this r.e. */
 233     const WCHAR    *cpbegin;        /* text base address */
 234     const WCHAR    *cpend;          /* text limit address */
 235
 236     REProgState *stateStack;        /* stack of state of current parents */
 237     size_t stateStackTop;
 238     size_t stateStackLimit;
 239
 240     REBackTrackData *backTrackStack;/* stack of matched-so-far positions */
 241     REBackTrackData *backTrackSP;
 242     size_t backTrackStackSize;
 243     size_t cursz;                   /* size of current stack entry */
 244     size_t backTrackCount;          /* how many times we've backtracked */
 245     size_t backTrackLimit;          /* upper limit on backtrack states */
 246
 247     heap_pool_t *pool;              /* It's faster to use one malloc'd pool
 248                                        than to malloc/free the three items
 249                                        that are allocated from this pool */
 250 } REGlobalData;
 251
 252 typedef struct RENode RENode;
 253 struct RENode {
 254     REOp            op;         /* r.e. op bytecode */
 255     RENode          *next;      /* next in concatenation order */
 256     void            *kid;       /* first operand */
 257     union {
 258         void        *kid2;      /* second operand */
 259         INT         num;        /* could be a number */
 260         size_t      parenIndex; /* or a parenthesis index */
 261         struct {                /* or a quantifier range */
 262             UINT  min;
 263             UINT  max;
 264             JSPackedBool greedy;
 265         } range;
 266         struct {                /* or a character class */
 267             size_t  startIndex;
 268             size_t  kidlen;     /* length of string at kid, in jschars */
 269             size_t  index;      /* index into class list */
 270             WORD  bmsize;       /* bitmap size, based on max char code */
 271             JSPackedBool sense;
 272         } ucclass;
 273         struct {                /* or a literal sequence */
 274             WCHAR   chr;        /* of one character */
 275             size_t  length;     /* or many (via the kid) */
 276         } flat;
 277         struct {
 278             RENode  *kid2;      /* second operand from ALT */
 279             WCHAR   ch1;        /* match char for ALTPREREQ */
 280             WCHAR   ch2;        /* ditto, or class index for ALTPREREQ2 */
 281         } altprereq;
 282     } u;
 283 };
 284
 285 #define CLASS_CACHE_SIZE    4
 286
 287 typedef struct CompilerState {
 288     void            *context;
 289     const WCHAR     *cpbegin;
 290     const WCHAR     *cpend;
 291     const WCHAR     *cp;
 292     size_t          parenCount;
 293     size_t          classCount;   /* number of [] encountered */
 294     size_t          treeDepth;    /* maximum depth of parse tree */
 295     size_t          progLength;   /* estimated bytecode length */
 296     RENode          *result;
 297     size_t          classBitmapsMem; /* memory to hold all class bitmaps */
 298     struct {
 299         const WCHAR *start;         /* small cache of class strings */
 300         size_t length;              /* since they're often the same */
 301         size_t index;
 302     } classCache[CLASS_CACHE_SIZE];
 303     WORD          flags;
 304
 305     heap_pool_t *pool;              /* It's faster to use one malloc'd pool
 306                                        than to malloc/free */
 307 } CompilerState;
 308
 309 typedef struct EmitStateStackEntry {
 310     jsbytecode      *altHead;       /* start of REOP_ALT* opcode */
 311     jsbytecode      *nextAltFixup;  /* fixup pointer to next-alt offset */
 312     jsbytecode      *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */
 313     jsbytecode      *endTermFixup;  /* fixup ptr. to REOPT_ALTPREREQ* offset */
 314     RENode          *continueNode;  /* original REOP_ALT* node being stacked */
 315     jsbytecode      continueOp;     /* REOP_JUMP or REOP_ENDALT continuation */
 316     JSPackedBool    jumpToJumpFlag; /* true if we've patched jump-to-jump to
 317                                        avoid 16-bit unsigned offset overflow */
 318 } EmitStateStackEntry;
 319
 320 /*
 321  * Immediate operand sizes and getter/setters.  Unlike the ones in jsopcode.h,
 322  * the getters and setters take the pc of the offset, not of the opcode before
 323  * the offset.
 324  */
 325 #define ARG_LEN             2
 326 #define GET_ARG(pc)         ((WORD)(((pc)[0] << 8) | (pc)[1]))
 327 #define SET_ARG(pc, arg)    ((pc)[0] = (jsbytecode) ((arg) >> 8),       \
 328                              (pc)[1] = (jsbytecode) (arg))
 329
 330 #define OFFSET_LEN          ARG_LEN
 331 #define OFFSET_MAX          ((1 << (ARG_LEN * 8)) - 1)
 332 #define GET_OFFSET(pc)      GET_ARG(pc)
 333
 334 static BOOL ParseRegExp(CompilerState*);
 335
 336 /*
 337  * Maximum supported tree depth is maximum size of EmitStateStackEntry stack.
 338  * For sanity, we limit it to 2^24 bytes.
 339  */
 340 #define TREE_DEPTH_MAX  ((1 << 24) / sizeof(EmitStateStackEntry))
 341
 342 /*
 343  * The maximum memory that can be allocated for class bitmaps.
 344  * For sanity, we limit it to 2^24 bytes.
 345  */
 346 #define CLASS_BITMAPS_MEM_LIMIT (1 << 24)
 347
 348 /*
 349  * Functions to get size and write/read bytecode that represent small indexes
 350  * compactly.
 351  * Each byte in the code represent 7-bit chunk of the index. 8th bit when set
 352  * indicates that the following byte brings more bits to the index. Otherwise
 353  * this is the last byte in the index bytecode representing highest index bits.
 354  */
 355 static size_t
 356 GetCompactIndexWidth(size_t index)
 357 {
 358     size_t width;
 359
 360     for (width = 1; (index >>= 7) != 0; ++width) { }
 361     return width;
 362 }
 363
 364 static inline jsbytecode *
 365 WriteCompactIndex(jsbytecode *pc, size_t index)
 366 {
 367     size_t next;
 368
 369     while ((next = index >> 7) != 0) {
 370         *pc++ = (jsbytecode)(index | 0x80);
 371         index = next;
 372     }
 373     *pc++ = (jsbytecode)index;
 374     return pc;
 375 }
 376
 377 static inline jsbytecode *
 378 ReadCompactIndex(jsbytecode *pc, size_t *result)
 379 {
 380     size_t nextByte;
 381
 382     nextByte = *pc++;
 383     if ((nextByte & 0x80) == 0) {
 384         /*
 385          * Short-circuit the most common case when compact index <= 127.
 386          */
 387         *result = nextByte;
 388     } else {
 389         size_t shift = 7;
 390         *result = 0x7F & nextByte;
 391         do {
 392             nextByte = *pc++;
 393             *result |= (nextByte & 0x7F) << shift;
 394             shift += 7;
 395         } while ((nextByte & 0x80) != 0);
 396     }
 397     return pc;
 398 }
 399
 400 /* Construct and initialize an RENode, returning NULL for out-of-memory */
 401 static RENode *
 402 NewRENode(CompilerState *state, REOp op)
 403 {
 404     RENode *ren;
 405
 406     ren = heap_pool_alloc(state->pool, sizeof(*ren));
 407     if (!ren) {
 408         /* js_ReportOutOfScriptQuota(cx); */
 409         return NULL;
 410     }
 411     ren->op = op;
 412     ren->next = NULL;
 413     ren->kid = NULL;
 414     return ren;
 415 }
 416
 417 /*
 418  * Validates and converts hex ascii value.
 419  */
 420 static BOOL
 421 isASCIIHexDigit(WCHAR c, UINT *digit)
 422 {
 423     UINT cv = c;
 424
 425     if (cv < '0')
 426         return FALSE;
 427     if (cv <= '9') {
 428         *digit = cv - '0';
 429         return TRUE;
 430     }
 431     cv |= 0x20;
 432     if (cv >= 'a' && cv <= 'f') {
 433         *digit = cv - 'a' + 10;
 434         return TRUE;
 435     }
 436     return FALSE;
 437 }
 438
 439 typedef struct {
 440     REOp op;
 441     const WCHAR *errPos;
 442     size_t parenIndex;
 443 } REOpData;
 444
 445 #define JUMP_OFFSET_HI(off)     ((jsbytecode)((off) >> 8))
 446 #define JUMP_OFFSET_LO(off)     ((jsbytecode)(off))
 447
 448 static BOOL
 449 SetForwardJumpOffset(jsbytecode *jump, jsbytecode *target)
 450 {
 451     ptrdiff_t offset = target - jump;
 452
 453     /* Check that target really points forward. */
 454     assert(offset >= 2);
 455     if ((size_t)offset > OFFSET_MAX)
 456         return FALSE;
 457
 458     jump[0] = JUMP_OFFSET_HI(offset);
 459     jump[1] = JUMP_OFFSET_LO(offset);
 460     return TRUE;
 461 }
 462
 463 /*
 464  * Generate bytecode for the tree rooted at t using an explicit stack instead
 465  * of recursion.
 466  */
 467 static jsbytecode *
 468 EmitREBytecode(CompilerState *state, regexp_t *re, size_t treeDepth,
 469                jsbytecode *pc, RENode *t)
 470 {
 471     EmitStateStackEntry *emitStateSP, *emitStateStack;
 472     RECharSet *charSet;
 473     REOp op;
 474
 475     if (treeDepth == 0) {
 476         emitStateStack = NULL;
 477     } else {
 478         emitStateStack = heap_alloc(sizeof(EmitStateStackEntry) * treeDepth);
 479         if (!emitStateStack)
 480             return NULL;
 481     }
 482     emitStateSP = emitStateStack;
 483     op = t->op;
 484     assert(op < REOP_LIMIT);
 485
 486     for (;;) {
 487         *pc++ = op;
 488         switch (op) {
 489           case REOP_EMPTY:
 490             --pc;
 491             break;
 492
 493           case REOP_ALTPREREQ2:
 494           case REOP_ALTPREREQ:
 495             assert(emitStateSP);
 496             emitStateSP->altHead = pc - 1;
 497             emitStateSP->endTermFixup = pc;
 498             pc += OFFSET_LEN;
 499             SET_ARG(pc, t->u.altprereq.ch1);
 500             pc += ARG_LEN;
 501             SET_ARG(pc, t->u.altprereq.ch2);
 502             pc += ARG_LEN;
 503
 504             emitStateSP->nextAltFixup = pc;    /* offset to next alternate */
 505             pc += OFFSET_LEN;
 506
 507             emitStateSP->continueNode = t;
 508             emitStateSP->continueOp = REOP_JUMP;
 509             emitStateSP->jumpToJumpFlag = FALSE;
 510             ++emitStateSP;
 511             assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
 512             t = t->kid;
 513             op = t->op;
 514             assert(op < REOP_LIMIT);
 515             continue;
 516
 517           case REOP_JUMP:
 518             emitStateSP->nextTermFixup = pc;    /* offset to following term */
 519             pc += OFFSET_LEN;
 520             if (!SetForwardJumpOffset(emitStateSP->nextAltFixup, pc))
 521                 goto jump_too_big;
 522             emitStateSP->continueOp = REOP_ENDALT;
 523             ++emitStateSP;
 524             assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
 525             t = t->u.kid2;
 526             op = t->op;
 527             assert(op < REOP_LIMIT);
 528             continue;
 529
 530           case REOP_ENDALT:
 531             /*
 532              * If we already patched emitStateSP->nextTermFixup to jump to
 533              * a nearer jump, to avoid 16-bit immediate offset overflow, we
 534              * are done here.
 535              */
 536             if (emitStateSP->jumpToJumpFlag)
 537                 break;
 538
 539             /*
 540              * Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT.
 541              * REOP_ENDALT is executed only on successful match of the last
 542              * alternate in a group.
 543              */
 544             if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
 545                 goto jump_too_big;
 546             if (t->op != REOP_ALT) {
 547                 if (!SetForwardJumpOffset(emitStateSP->endTermFixup, pc))
 548                     goto jump_too_big;
 549             }
 550
 551             /*
 552              * If the program is bigger than the REOP_JUMP offset range, then
 553              * we must check for alternates before this one that are part of
 554              * the same group, and fix up their jump offsets to target jumps
 555              * close enough to fit in a 16-bit unsigned offset immediate.
 556              */
 557             if ((size_t)(pc - re->program) > OFFSET_MAX &&
 558                 emitStateSP > emitStateStack) {
 559                 EmitStateStackEntry *esp, *esp2;
 560                 jsbytecode *alt, *jump;
 561                 ptrdiff_t span, header;
 562
 563                 esp2 = emitStateSP;
 564                 alt = esp2->altHead;
 565                 for (esp = esp2 - 1; esp >= emitStateStack; --esp) {
 566                     if (esp->continueOp == REOP_ENDALT &&
 567                         !esp->jumpToJumpFlag &&
 568                         esp->nextTermFixup + OFFSET_LEN == alt &&
 569                         (size_t)(pc - ((esp->continueNode->op != REOP_ALT)
 570                                        ? esp->endTermFixup
 571                                        : esp->nextTermFixup)) > OFFSET_MAX) {
 572                         alt = esp->altHead;
 573                         jump = esp->nextTermFixup;
 574
 575                         /*
 576                          * The span must be 1 less than the distance from
 577                          * jump offset to jump offset, so we actually jump
 578                          * to a REOP_JUMP bytecode, not to its offset!
 579                          */
 580                         for (;;) {
 581                             assert(jump < esp2->nextTermFixup);
 582                             span = esp2->nextTermFixup - jump - 1;
 583                             if ((size_t)span <= OFFSET_MAX)
 584                                 break;
 585                             do {
 586                                 if (--esp2 == esp)
 587                                     goto jump_too_big;
 588                             } while (esp2->continueOp != REOP_ENDALT);
 589                         }
 590
 591                         jump[0] = JUMP_OFFSET_HI(span);
 592                         jump[1] = JUMP_OFFSET_LO(span);
 593
 594                         if (esp->continueNode->op != REOP_ALT) {
 595                             /*
 596                              * We must patch the offset at esp->endTermFixup
 597                              * as well, for the REOP_ALTPREREQ{,2} opcodes.
 598                              * If we're unlucky and endTermFixup is more than
 599                              * OFFSET_MAX bytes from its target, we cheat by
 600                              * jumping 6 bytes to the jump whose offset is at
 601                              * esp->nextTermFixup, which has the same target.
 602                              */
 603                             jump = esp->endTermFixup;
 604                             header = esp->nextTermFixup - jump;
 605                             span += header;
 606                             if ((size_t)span > OFFSET_MAX)
 607                                 span = header;
 608
 609                             jump[0] = JUMP_OFFSET_HI(span);
 610                             jump[1] = JUMP_OFFSET_LO(span);
 611                         }
 612
 613                         esp->jumpToJumpFlag = TRUE;
 614                     }
 615                 }
 616             }
 617             break;
 618
 619           case REOP_ALT:
 620             assert(emitStateSP);
 621             emitStateSP->altHead = pc - 1;
 622             emitStateSP->nextAltFixup = pc;     /* offset to next alternate */
 623             pc += OFFSET_LEN;
 624             emitStateSP->continueNode = t;
 625             emitStateSP->continueOp = REOP_JUMP;
 626             emitStateSP->jumpToJumpFlag = FALSE;
 627             ++emitStateSP;
 628             assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
 629             t = t->kid;
 630             op = t->op;
 631             assert(op < REOP_LIMIT);
 632             continue;
 633
 634           case REOP_FLAT:
 635             /*
 636              * Coalesce FLATs if possible and if it would not increase bytecode
 637              * beyond preallocated limit. The latter happens only when bytecode
 638              * size for coalesced string with offset p and length 2 exceeds 6
 639              * bytes preallocated for 2 single char nodes, i.e. when
 640              * 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or
 641              * GetCompactIndexWidth(p) > 4.
 642              * Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more
 643              * nodes strictly decreases bytecode size, the check has to be
 644              * done only for the first coalescing.
 645              */
 646             if (t->kid &&
 647                 GetCompactIndexWidth((WCHAR*)t->kid - state->cpbegin) <= 4)
 648             {
 649                 while (t->next &&
 650                        t->next->op == REOP_FLAT &&
 651                        (WCHAR*)t->kid + t->u.flat.length ==
 652                        t->next->kid) {
 653                     t->u.flat.length += t->next->u.flat.length;
 654                     t->next = t->next->next;
 655                 }
 656             }
 657             if (t->kid && t->u.flat.length > 1) {
 658                 pc[-1] = (state->flags & REG_FOLD) ? REOP_FLATi : REOP_FLAT;
 659                 pc = WriteCompactIndex(pc, (WCHAR*)t->kid - state->cpbegin);
 660                 pc = WriteCompactIndex(pc, t->u.flat.length);
 661             } else if (t->u.flat.chr < 256) {
 662                 pc[-1] = (state->flags & REG_FOLD) ? REOP_FLAT1i : REOP_FLAT1;
 663                 *pc++ = (jsbytecode) t->u.flat.chr;
 664             } else {
 665                 pc[-1] = (state->flags & REG_FOLD)
 666                          ? REOP_UCFLAT1i
 667                          : REOP_UCFLAT1;
 668                 SET_ARG(pc, t->u.flat.chr);
 669                 pc += ARG_LEN;
 670             }
 671             break;
 672
 673           case REOP_LPAREN:
 674             assert(emitStateSP);
 675             pc = WriteCompactIndex(pc, t->u.parenIndex);
 676             emitStateSP->continueNode = t;
 677             emitStateSP->continueOp = REOP_RPAREN;
 678             ++emitStateSP;
 679             assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
 680             t = t->kid;
 681             op = t->op;
 682             continue;
 683
 684           case REOP_RPAREN:
 685             pc = WriteCompactIndex(pc, t->u.parenIndex);
 686             break;
 687
 688           case REOP_BACKREF:
 689             pc = WriteCompactIndex(pc, t->u.parenIndex);
 690             break;
 691
 692           case REOP_ASSERT:
 693             assert(emitStateSP);
 694             emitStateSP->nextTermFixup = pc;
 695             pc += OFFSET_LEN;
 696             emitStateSP->continueNode = t;
 697             emitStateSP->continueOp = REOP_ASSERTTEST;
 698             ++emitStateSP;
 699             assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
 700             t = t->kid;
 701             op = t->op;
 702             continue;
 703
 704           case REOP_ASSERTTEST:
 705           case REOP_ASSERTNOTTEST:
 706             if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
 707                 goto jump_too_big;
 708             break;
 709
 710           case REOP_ASSERT_NOT:
 711             assert(emitStateSP);
 712             emitStateSP->nextTermFixup = pc;
 713             pc += OFFSET_LEN;
 714             emitStateSP->continueNode = t;
 715             emitStateSP->continueOp = REOP_ASSERTNOTTEST;
 716             ++emitStateSP;
 717             assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
 718             t = t->kid;
 719             op = t->op;
 720             continue;
 721
 722           case REOP_QUANT:
 723             assert(emitStateSP);
 724             if (t->u.range.min == 0 && t->u.range.max == (UINT)-1) {
 725                 pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR;
 726             } else if (t->u.range.min == 0 && t->u.range.max == 1) {
 727                 pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT;
 728             } else if (t->u.range.min == 1 && t->u.range.max == (UINT) -1) {
 729                 pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS;
 730             } else {
 731                 if (!t->u.range.greedy)
 732                     pc[-1] = REOP_MINIMALQUANT;
 733                 pc = WriteCompactIndex(pc, t->u.range.min);
 734                 /*
 735                  * Write max + 1 to avoid using size_t(max) + 1 bytes
 736                  * for (UINT)-1 sentinel.
 737                  */
 738                 pc = WriteCompactIndex(pc, t->u.range.max + 1);
 739             }
 740             emitStateSP->nextTermFixup = pc;
 741             pc += OFFSET_LEN;
 742             emitStateSP->continueNode = t;
 743             emitStateSP->continueOp = REOP_ENDCHILD;
 744             ++emitStateSP;
 745             assert((size_t)(emitStateSP - emitStateStack) <= treeDepth);
 746             t = t->kid;
 747             op = t->op;
 748             continue;
 749
 750           case REOP_ENDCHILD:
 751             if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
 752                 goto jump_too_big;
 753             break;
 754
 755           case REOP_CLASS:
 756             if (!t->u.ucclass.sense)
 757                 pc[-1] = REOP_NCLASS;
 758             pc = WriteCompactIndex(pc, t->u.ucclass.index);
 759             charSet = &re->classList[t->u.ucclass.index];
 760             charSet->converted = FALSE;
 761             charSet->length = t->u.ucclass.bmsize;
 762             charSet->u.src.startIndex = t->u.ucclass.startIndex;
 763             charSet->u.src.length = t->u.ucclass.kidlen;
 764             charSet->sense = t->u.ucclass.sense;
 765             break;
 766
 767           default:
 768             break;
 769         }
 770
 771         t = t->next;
 772         if (t) {
 773             op = t->op;
 774         } else {
 775             if (emitStateSP == emitStateStack)
 776                 break;
 777             --emitStateSP;
 778             t = emitStateSP->continueNode;
 779             op = (REOp) emitStateSP->continueOp;
 780         }
 781     }
 782
 783   cleanup:
 784     heap_free(emitStateStack);
 785     return pc;
 786
 787   jump_too_big:
 788     ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
 789     pc = NULL;
 790     goto cleanup;
 791 }
 792
 793 /*
 794  * Process the op against the two top operands, reducing them to a single
 795  * operand in the penultimate slot. Update progLength and treeDepth.
 796  */
 797 static BOOL
 798 ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack,
 799           INT operandSP)
 800 {
 801     RENode *result;
 802
 803     switch (opData->op) {
 804       case REOP_ALT:
 805         result = NewRENode(state, REOP_ALT);
 806         if (!result)
 807             return FALSE;
 808         result->kid = operandStack[operandSP - 2];
 809         result->u.kid2 = operandStack[operandSP - 1];
 810         operandStack[operandSP - 2] = result;
 811
 812         if (state->treeDepth == TREE_DEPTH_MAX) {
 813             ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
 814             return FALSE;
 815         }
 816         ++state->treeDepth;
 817
 818         /*
 819          * Look at both alternates to see if there's a FLAT or a CLASS at
 820          * the start of each. If so, use a prerequisite match.
 821          */
 822         if (((RENode *) result->kid)->op == REOP_FLAT &&
 823             ((RENode *) result->u.kid2)->op == REOP_FLAT &&
 824             (state->flags & REG_FOLD) == 0) {
 825             result->op = REOP_ALTPREREQ;
 826             result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
 827             result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr;
 828             /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
 829                                             JUMP, <end> ... ENDALT */
 830             state->progLength += 13;
 831         }
 832         else
 833         if (((RENode *) result->kid)->op == REOP_CLASS &&
 834             ((RENode *) result->kid)->u.ucclass.index < 256 &&
 835             ((RENode *) result->u.kid2)->op == REOP_FLAT &&
 836             (state->flags & REG_FOLD) == 0) {
 837             result->op = REOP_ALTPREREQ2;
 838             result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr;
 839             result->u.altprereq.ch2 = ((RENode *) result->kid)->u.ucclass.index;
 840             /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
 841                                             JUMP, <end> ... ENDALT */
 842             state->progLength += 13;
 843         }
 844         else
 845         if (((RENode *) result->kid)->op == REOP_FLAT &&
 846             ((RENode *) result->u.kid2)->op == REOP_CLASS &&
 847             ((RENode *) result->u.kid2)->u.ucclass.index < 256 &&
 848             (state->flags & REG_FOLD) == 0) {
 849             result->op = REOP_ALTPREREQ2;
 850             result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
 851             result->u.altprereq.ch2 =
 852                 ((RENode *) result->u.kid2)->u.ucclass.index;
 853             /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
 854                                           JUMP, <end> ... ENDALT */
 855             state->progLength += 13;
 856         }
 857         else {
 858             /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
 859             state->progLength += 7;
 860         }
 861         break;
 862
 863       case REOP_CONCAT:
 864         result = operandStack[operandSP - 2];
 865         while (result->next)
 866             result = result->next;
 867         result->next = operandStack[operandSP - 1];
 868         break;
 869
 870       case REOP_ASSERT:
 871       case REOP_ASSERT_NOT:
 872       case REOP_LPARENNON:
 873       case REOP_LPAREN:
 874         /* These should have been processed by a close paren. */
 875         ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_MISSING_PAREN,
 876                                 opData->errPos);
 877         return FALSE;
 878
 879       default:;
 880     }
 881     return TRUE;
 882 }
 883
 884 /*
 885  * Hack two bits in CompilerState.flags, for use within FindParenCount to flag
 886  * its being on the stack, and to propagate errors to its callers.
 887  */
 888 #define JSREG_FIND_PAREN_COUNT  0x8000
 889 #define JSREG_FIND_PAREN_ERROR  0x4000
 890
 891 /*
 892  * Magic return value from FindParenCount and GetDecimalValue, to indicate
 893  * overflow beyond GetDecimalValue's max parameter, or a computed maximum if
 894  * its findMax parameter is non-null.
 895  */
 896 #define OVERFLOW_VALUE          ((UINT)-1)
 897
 898 static UINT
 899 FindParenCount(CompilerState *state)
 900 {
 901     CompilerState temp;
 902     int i;
 903
 904     if (state->flags & JSREG_FIND_PAREN_COUNT)
 905         return OVERFLOW_VALUE;
 906
 907     /*
 908      * Copy state into temp, flag it so we never report an invalid backref,
 909      * and reset its members to parse the entire regexp.  This is obviously
 910      * suboptimal, but GetDecimalValue calls us only if a backref appears to
 911      * refer to a forward parenthetical, which is rare.
 912      */
 913     temp = *state;
 914     temp.flags |= JSREG_FIND_PAREN_COUNT;
 915     temp.cp = temp.cpbegin;
 916     temp.parenCount = 0;
 917     temp.classCount = 0;
 918     temp.progLength = 0;
 919     temp.treeDepth = 0;
 920     temp.classBitmapsMem = 0;
 921     for (i = 0; i < CLASS_CACHE_SIZE; i++)
 922         temp.classCache[i].start = NULL;
 923
 924     if (!ParseRegExp(&temp)) {
 925         state->flags |= JSREG_FIND_PAREN_ERROR;
 926         return OVERFLOW_VALUE;
 927     }
 928     return temp.parenCount;
 929 }
 930
 931 /*
 932  * Extract and return a decimal value at state->cp.  The initial character c
 933  * has already been read.  Return OVERFLOW_VALUE if the result exceeds max.
 934  * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
 935  * state->flags to discover whether an error occurred under findMax.
 936  */
 937 static UINT
 938 GetDecimalValue(WCHAR c, UINT max, UINT (*findMax)(CompilerState *state),
 939                 CompilerState *state)
 940 {
 941     UINT value = JS7_UNDEC(c);
 942     BOOL overflow = (value > max && (!findMax || value > findMax(state)));
 943
 944     /* The following restriction allows simpler overflow checks. */
 945     assert(max <= ((UINT)-1 - 9) / 10);
 946     while (state->cp < state->cpend) {
 947         c = *state->cp;
 948         if (!JS7_ISDEC(c))
 949             break;
 950         value = 10 * value + JS7_UNDEC(c);
 951         if (!overflow && value > max && (!findMax || value > findMax(state)))
 952             overflow = TRUE;
 953         ++state->cp;
 954     }
 955     return overflow ? OVERFLOW_VALUE : value;
 956 }
 957
 958 /*
 959  * Calculate the total size of the bitmap required for a class expression.
 960  */
 961 static BOOL
 962 CalculateBitmapSize(CompilerState *state, RENode *target, const WCHAR *src,
 963                     const WCHAR *end)
 964 {
 965     UINT max = 0;
 966     BOOL inRange = FALSE;
 967     WCHAR c, rangeStart = 0;
 968     UINT n, digit, nDigits, i;
 969
 970     target->u.ucclass.bmsize = 0;
 971     target->u.ucclass.sense = TRUE;
 972
 973     if (src == end)
 974         return TRUE;
 975
 976     if (*src == '^') {
 977         ++src;
 978         target->u.ucclass.sense = FALSE;
 979     }
 980
 981     while (src != end) {
 982         BOOL canStartRange = TRUE;
 983         UINT localMax = 0;
 984
 985         switch (*src) {
 986           case '\\':
 987             ++src;
 988             c = *src++;
 989             switch (c) {
 990               case 'b':
 991                 localMax = 0x8;
 992                 break;
 993               case 'f':
 994                 localMax = 0xC;
 995                 break;
 996               case 'n':
 997                 localMax = 0xA;
 998                 break;
 999               case 'r':
1000                 localMax = 0xD;
1001                 break;
1002               case 't':
1003                 localMax = 0x9;
1004                 break;
1005               case 'v':
1006                 localMax = 0xB;
1007                 break;
1008               case 'c':
1009                 if (src < end && RE_IS_LETTER(*src)) {
1010                     localMax = (UINT) (*src++) & 0x1F;
1011                 } else {
1012                     --src;
1013                     localMax = '\\';
1014                 }
1015                 break;
1016               case 'x':
1017                 nDigits = 2;
1018                 goto lexHex;
1019               case 'u':
1020                 nDigits = 4;
1021 lexHex:
1022                 n = 0;
1023                 for (i = 0; (i < nDigits) && (src < end); i++) {
1024                     c = *src++;
1025                     if (!isASCIIHexDigit(c, &digit)) {
1026                         /*
1027                          * Back off to accepting the original
1028                          *'\' as a literal.
1029                          */
1030                         src -= i + 1;
1031                         n = '\\';
1032                         break;
1033                     }
1034                     n = (n << 4) | digit;
1035                 }
1036                 localMax = n;
1037                 break;
1038               case 'd':
1039                 canStartRange = FALSE;
1040                 if (inRange) {
1041                     JS_ReportErrorNumber(state->context,
1042                                          js_GetErrorMessage, NULL,
1043                                          JSMSG_BAD_CLASS_RANGE);
1044                     return FALSE;
1045                 }
1046                 localMax = '9';
1047                 break;
1048               case 'D':
1049               case 's':
1050               case 'S':
1051               case 'w':
1052               case 'W':
1053                 canStartRange = FALSE;
1054                 if (inRange) {
1055                     JS_ReportErrorNumber(state->context,
1056                                          js_GetErrorMessage, NULL,
1057                                          JSMSG_BAD_CLASS_RANGE);
1058                     return FALSE;
1059                 }
1060                 max = 65535;
1061
1062                 /*
1063                  * If this is the start of a range, ensure that it's less than
1064                  * the end.
1065                  */
1066                 localMax = 0;
1067                 break;
1068               case '0':
1069               case '1':
1070               case '2':
1071               case '3':
1072               case '4':
1073               case '5':
1074               case '6':
1075               case '7':
1076                 /*
1077                  *  This is a non-ECMA extension - decimal escapes (in this
1078                  *  case, octal!) are supposed to be an error inside class
1079                  *  ranges, but supported here for backwards compatibility.
1080                  *
1081                  */
1082                 n = JS7_UNDEC(c);
1083                 c = *src;
1084                 if ('0' <= c && c <= '7') {
1085                     src++;
1086                     n = 8 * n + JS7_UNDEC(c);
1087                     c = *src;
1088                     if ('0' <= c && c <= '7') {
1089                         src++;
1090                         i = 8 * n + JS7_UNDEC(c);
1091                         if (i <= 0377)
1092                             n = i;
1093                         else
1094                             src--;
1095                     }
1096                 }
1097                 localMax = n;
1098                 break;
1099
1100               default:
1101                 localMax = c;
1102                 break;
1103             }
1104             break;
1105           default:
1106             localMax = *src++;
1107             break;
1108         }
1109
1110         if (inRange) {
1111             /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
1112             if (rangeStart > localMax) {
1113                 JS_ReportErrorNumber(state->context,
1114                                      js_GetErrorMessage, NULL,
1115                                      JSMSG_BAD_CLASS_RANGE);
1116                 return FALSE;
1117             }
1118             inRange = FALSE;
1119         } else {
1120             if (canStartRange && src < end - 1) {
1121                 if (*src == '-') {
1122                     ++src;
1123                     inRange = TRUE;
1124                     rangeStart = (WCHAR)localMax;
1125                     continue;
1126                 }
1127             }
1128             if (state->flags & REG_FOLD)
1129                 rangeStart = localMax;   /* one run of the uc/dc loop below */
1130         }
1131
1132         if (state->flags & REG_FOLD) {
1133             WCHAR maxch = localMax;
1134
1135             for (i = rangeStart; i <= localMax; i++) {
1136                 WCHAR uch, dch;
1137
1138                 uch = toupperW(i);
1139                 dch = tolowerW(i);
1140                 if(maxch < uch)
1141                     maxch = uch;
1142                 if(maxch < dch)
1143                     maxch = dch;
1144             }
1145             localMax = maxch;
1146         }
1147
1148         if (localMax > max)
1149             max = localMax;
1150     }
1151     target->u.ucclass.bmsize = max;
1152     return TRUE;
1153 }
1154
1155 static INT
1156 ParseMinMaxQuantifier(CompilerState *state, BOOL ignoreValues)
1157 {
1158     UINT min, max;
1159     WCHAR c;
1160     const WCHAR *errp = state->cp++;
1161
1162     c = *state->cp;
1163     if (JS7_ISDEC(c)) {
1164         ++state->cp;
1165         min = GetDecimalValue(c, 0xFFFF, NULL, state);
1166         c = *state->cp;
1167
1168         if (!ignoreValues && min == OVERFLOW_VALUE)
1169             return JSMSG_MIN_TOO_BIG;
1170
1171         if (c == ',') {
1172             c = *++state->cp;
1173             if (JS7_ISDEC(c)) {
1174                 ++state->cp;
1175                 max = GetDecimalValue(c, 0xFFFF, NULL, state);
1176                 c = *state->cp;
1177                 if (!ignoreValues && max == OVERFLOW_VALUE)
1178                     return JSMSG_MAX_TOO_BIG;
1179                 if (!ignoreValues && min > max)
1180                     return JSMSG_OUT_OF_ORDER;
1181             } else {
1182                 max = (UINT)-1;
1183             }
1184         } else {
1185             max = min;
1186         }
1187         if (c == '}') {
1188             state->result = NewRENode(state, REOP_QUANT);
1189             if (!state->result)
1190                 return JSMSG_OUT_OF_MEMORY;
1191             state->result->u.range.min = min;
1192             state->result->u.range.max = max;
1193             /*
1194              * QUANT, <min>, <max>, <next> ... <ENDCHILD>
1195              * where <max> is written as compact(max+1) to make
1196              * (UINT)-1 sentinel to occupy 1 byte, not width_of(max)+1.
1197              */
1198             state->progLength += (1 + GetCompactIndexWidth(min)
1199                                   + GetCompactIndexWidth(max + 1)
1200                                   +3);
1201             return 0;
1202         }
1203     }
1204
1205     state->cp = errp;
1206     return -1;
1207 }
1208
1209 static BOOL
1210 ParseQuantifier(CompilerState *state)
1211 {
1212     RENode *term;
1213     term = state->result;
1214     if (state->cp < state->cpend) {
1215         switch (*state->cp) {
1216           case '+':
1217             state->result = NewRENode(state, REOP_QUANT);
1218             if (!state->result)
1219                 return FALSE;
1220             state->result->u.range.min = 1;
1221             state->result->u.range.max = (UINT)-1;
1222             /* <PLUS>, <next> ... <ENDCHILD> */
1223             state->progLength += 4;
1224             goto quantifier;
1225           case '*':
1226             state->result = NewRENode(state, REOP_QUANT);
1227             if (!state->result)
1228                 return FALSE;
1229             state->result->u.range.min = 0;
1230             state->result->u.range.max = (UINT)-1;
1231             /* <STAR>, <next> ... <ENDCHILD> */
1232             state->progLength += 4;
1233             goto quantifier;
1234           case '?':
1235             state->result = NewRENode(state, REOP_QUANT);
1236             if (!state->result)
1237                 return FALSE;
1238             state->result->u.range.min = 0;
1239             state->result->u.range.max = 1;
1240             /* <OPT>, <next> ... <ENDCHILD> */
1241             state->progLength += 4;
1242             goto quantifier;
1243           case '{':       /* balance '}' */
1244           {
1245             INT err;
1246
1247             err = ParseMinMaxQuantifier(state, FALSE);
1248             if (err == 0)
1249                 goto quantifier;
1250             if (err == -1)
1251                 return TRUE;
1252
1253             ReportRegExpErrorHelper(state, JSREPORT_ERROR, err, errp);
1254             return FALSE;
1255           }
1256           default:;
1257         }
1258     }
1259     return TRUE;
1260
1261 quantifier:
1262     if (state->treeDepth == TREE_DEPTH_MAX) {
1263         ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1264         return FALSE;
1265     }
1266
1267     ++state->treeDepth;
1268     ++state->cp;
1269     state->result->kid = term;
1270     if (state->cp < state->cpend && *state->cp == '?') {
1271         ++state->cp;
1272         state->result->u.range.greedy = FALSE;
1273     } else {
1274         state->result->u.range.greedy = TRUE;
1275     }
1276     return TRUE;
1277 }
1278
1279 /*
1280  *  item:       assertion               An item is either an assertion or
1281  *              quantatom               a quantified atom.
1282  *
1283  *  assertion:  '^'                     Assertions match beginning of string
1284  *                                      (or line if the class static property
1285  *                                      RegExp.multiline is true).
1286  *              '$'                     End of string (or line if the class
1287  *                                      static property RegExp.multiline is
1288  *                                      true).
1289  *              '\b'                    Word boundary (between \w and \W).
1290  *              '\B'                    Word non-boundary.
1291  *
1292  *  quantatom:  atom                    An unquantified atom.
1293  *              quantatom '{' n ',' m '}'
1294  *                                      Atom must occur between n and m times.
1295  *              quantatom '{' n ',' '}' Atom must occur at least n times.
1296  *              quantatom '{' n '}'     Atom must occur exactly n times.
1297  *              quantatom '*'           Zero or more times (same as {0,}).
1298  *              quantatom '+'           One or more times (same as {1,}).
1299  *              quantatom '?'           Zero or one time (same as {0,1}).
1300  *
1301  *              any of which can be optionally followed by '?' for ungreedy
1302  *
1303  *  atom:       '(' regexp ')'          A parenthesized regexp (what matched
1304  *                                      can be addressed using a backreference,
1305  *                                      see '\' n below).
1306  *              '.'                     Matches any char except '\n'.
1307  *              '[' classlist ']'       A character class.
1308  *              '[' '^' classlist ']'   A negated character class.
1309  *              '\f'                    Form Feed.
1310  *              '\n'                    Newline (Line Feed).
1311  *              '\r'                    Carriage Return.
1312  *              '\t'                    Horizontal Tab.
1313  *              '\v'                    Vertical Tab.
1314  *              '\d'                    A digit (same as [0-9]).
1315  *              '\D'                    A non-digit.
1316  *              '\w'                    A word character, [0-9a-z_A-Z].
1317  *              '\W'                    A non-word character.
1318  *              '\s'                    A whitespace character, [ \b\f\n\r\t\v].
1319  *              '\S'                    A non-whitespace character.
1320  *              '\' n                   A backreference to the nth (n decimal
1321  *                                      and positive) parenthesized expression.
1322  *              '\' octal               An octal escape sequence (octal must be
1323  *                                      two or three digits long, unless it is
1324  *                                      0 for the null character).
1325  *              '\x' hex                A hex escape (hex must be two digits).
1326  *              '\u' unicode            A unicode escape (must be four digits).
1327  *              '\c' ctrl               A control character, ctrl is a letter.
1328  *              '\' literalatomchar     Any character except one of the above
1329  *                                      that follow '\' in an atom.
1330  *              otheratomchar           Any character not first among the other
1331  *                                      atom right-hand sides.
1332  */
1333 static BOOL
1334 ParseTerm(CompilerState *state)
1335 {
1336     WCHAR c = *state->cp++;
1337     UINT nDigits;
1338     UINT num, tmp, n, i;
1339     const WCHAR *termStart;
1340
1341     switch (c) {
1342     /* assertions and atoms */
1343       case '^':
1344         state->result = NewRENode(state, REOP_BOL);
1345         if (!state->result)
1346             return FALSE;
1347         state->progLength++;
1348         return TRUE;
1349       case '$':
1350         state->result = NewRENode(state, REOP_EOL);
1351         if (!state->result)
1352             return FALSE;
1353         state->progLength++;
1354         return TRUE;
1355       case '\\':
1356         if (state->cp >= state->cpend) {
1357             /* a trailing '\' is an error */
1358             ReportRegExpError(state, JSREPORT_ERROR, JSMSG_TRAILING_SLASH);
1359             return FALSE;
1360         }
1361         c = *state->cp++;
1362         switch (c) {
1363         /* assertion escapes */
1364           case 'b' :
1365             state->result = NewRENode(state, REOP_WBDRY);
1366             if (!state->result)
1367                 return FALSE;
1368             state->progLength++;
1369             return TRUE;
1370           case 'B':
1371             state->result = NewRENode(state, REOP_WNONBDRY);
1372             if (!state->result)
1373                 return FALSE;
1374             state->progLength++;
1375             return TRUE;
1376           /* Decimal escape */
1377           case '0':
1378               /* Give a strict warning. See also the note below. */
1379               WARN("non-octal digit in an escape sequence that doesn't match a back-reference\n");
1380      doOctal:
1381             num = 0;
1382             while (state->cp < state->cpend) {
1383                 c = *state->cp;
1384                 if (c < '0' || '7' < c)
1385                     break;
1386                 state->cp++;
1387                 tmp = 8 * num + (UINT)JS7_UNDEC(c);
1388                 if (tmp > 0377)
1389                     break;
1390                 num = tmp;
1391             }
1392             c = (WCHAR)num;
1393     doFlat:
1394             state->result = NewRENode(state, REOP_FLAT);
1395             if (!state->result)
1396                 return FALSE;
1397             state->result->u.flat.chr = c;
1398             state->result->u.flat.length = 1;
1399             state->progLength += 3;
1400             break;
1401           case '1':
1402           case '2':
1403           case '3':
1404           case '4':
1405           case '5':
1406           case '6':
1407           case '7':
1408           case '8':
1409           case '9':
1410             termStart = state->cp - 1;
1411             num = GetDecimalValue(c, state->parenCount, FindParenCount, state);
1412             if (state->flags & JSREG_FIND_PAREN_ERROR)
1413                 return FALSE;
1414             if (num == OVERFLOW_VALUE) {
1415                 /* Give a strict mode warning. */
1416                 WARN("back-reference exceeds number of capturing parentheses\n");
1417
1418                 /*
1419                  * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax
1420                  * error here. However, for compatibility with IE, we treat the
1421                  * whole backref as flat if the first character in it is not a
1422                  * valid octal character, and as an octal escape otherwise.
1423                  */
1424                 state->cp = termStart;
1425                 if (c >= '8') {
1426                     /* Treat this as flat. termStart - 1 is the \. */
1427                     c = '\\';
1428                     goto asFlat;
1429                 }
1430
1431                 /* Treat this as an octal escape. */
1432                 goto doOctal;
1433             }
1434             assert(1 <= num && num <= 0x10000);
1435             state->result = NewRENode(state, REOP_BACKREF);
1436             if (!state->result)
1437                 return FALSE;
1438             state->result->u.parenIndex = num - 1;
1439             state->progLength
1440                 += 1 + GetCompactIndexWidth(state->result->u.parenIndex);
1441             break;
1442           /* Control escape */
1443           case 'f':
1444             c = 0xC;
1445             goto doFlat;
1446           case 'n':
1447             c = 0xA;
1448             goto doFlat;
1449           case 'r':
1450             c = 0xD;
1451             goto doFlat;
1452           case 't':
1453             c = 0x9;
1454             goto doFlat;
1455           case 'v':
1456             c = 0xB;
1457             goto doFlat;
1458           /* Control letter */
1459           case 'c':
1460             if (state->cp < state->cpend && RE_IS_LETTER(*state->cp)) {
1461                 c = (WCHAR) (*state->cp++ & 0x1F);
1462             } else {
1463                 /* back off to accepting the original '\' as a literal */
1464                 --state->cp;
1465                 c = '\\';
1466             }
1467             goto doFlat;
1468           /* HexEscapeSequence */
1469           case 'x':
1470             nDigits = 2;
1471             goto lexHex;
1472           /* UnicodeEscapeSequence */
1473           case 'u':
1474             nDigits = 4;
1475 lexHex:
1476             n = 0;
1477             for (i = 0; i < nDigits && state->cp < state->cpend; i++) {
1478                 UINT digit;
1479                 c = *state->cp++;
1480                 if (!isASCIIHexDigit(c, &digit)) {
1481                     /*
1482                      * Back off to accepting the original 'u' or 'x' as a
1483                      * literal.
1484                      */
1485                     state->cp -= i + 2;
1486                     n = *state->cp++;
1487                     break;
1488                 }
1489                 n = (n << 4) | digit;
1490             }
1491             c = (WCHAR) n;
1492             goto doFlat;
1493           /* Character class escapes */
1494           case 'd':
1495             state->result = NewRENode(state, REOP_DIGIT);
1496 doSimple:
1497             if (!state->result)
1498                 return FALSE;
1499             state->progLength++;
1500             break;
1501           case 'D':
1502             state->result = NewRENode(state, REOP_NONDIGIT);
1503             goto doSimple;
1504           case 's':
1505             state->result = NewRENode(state, REOP_SPACE);
1506             goto doSimple;
1507           case 'S':
1508             state->result = NewRENode(state, REOP_NONSPACE);
1509             goto doSimple;
1510           case 'w':
1511             state->result = NewRENode(state, REOP_ALNUM);
1512             goto doSimple;
1513           case 'W':
1514             state->result = NewRENode(state, REOP_NONALNUM);
1515             goto doSimple;
1516           /* IdentityEscape */
1517           default:
1518             state->result = NewRENode(state, REOP_FLAT);
1519             if (!state->result)
1520                 return FALSE;
1521             state->result->u.flat.chr = c;
1522             state->result->u.flat.length = 1;
1523             state->result->kid = (void *) (state->cp - 1);
1524             state->progLength += 3;
1525             break;
1526         }
1527         break;
1528       case '[':
1529         state->result = NewRENode(state, REOP_CLASS);
1530         if (!state->result)
1531             return FALSE;
1532         termStart = state->cp;
1533         state->result->u.ucclass.startIndex = termStart - state->cpbegin;
1534         for (;;) {
1535             if (state->cp == state->cpend) {
1536                 ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1537                                         JSMSG_UNTERM_CLASS, termStart);
1538
1539                 return FALSE;
1540             }
1541             if (*state->cp == '\\') {
1542                 state->cp++;
1543                 if (state->cp != state->cpend)
1544                     state->cp++;
1545                 continue;
1546             }
1547             if (*state->cp == ']') {
1548                 state->result->u.ucclass.kidlen = state->cp - termStart;
1549                 break;
1550             }
1551             state->cp++;
1552         }
1553         for (i = 0; i < CLASS_CACHE_SIZE; i++) {
1554             if (!state->classCache[i].start) {
1555                 state->classCache[i].start = termStart;
1556                 state->classCache[i].length = state->result->u.ucclass.kidlen;
1557                 state->classCache[i].index = state->classCount;
1558                 break;
1559             }
1560             if (state->classCache[i].length ==
1561                 state->result->u.ucclass.kidlen) {
1562                 for (n = 0; ; n++) {
1563                     if (n == state->classCache[i].length) {
1564                         state->result->u.ucclass.index
1565                             = state->classCache[i].index;
1566                         goto claim;
1567                     }
1568                     if (state->classCache[i].start[n] != termStart[n])
1569                         break;
1570                 }
1571             }
1572         }
1573         state->result->u.ucclass.index = state->classCount++;
1574
1575     claim:
1576         /*
1577          * Call CalculateBitmapSize now as we want any errors it finds
1578          * to be reported during the parse phase, not at execution.
1579          */
1580         if (!CalculateBitmapSize(state, state->result, termStart, state->cp++))
1581             return FALSE;
1582         /*
1583          * Update classBitmapsMem with number of bytes to hold bmsize bits,
1584          * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
1585          * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
1586          */
1587         n = (state->result->u.ucclass.bmsize >> 3) + 1;
1588         if (n > CLASS_BITMAPS_MEM_LIMIT - state->classBitmapsMem) {
1589             ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
1590             return FALSE;
1591         }
1592         state->classBitmapsMem += n;
1593         /* CLASS, <index> */
1594         state->progLength
1595             += 1 + GetCompactIndexWidth(state->result->u.ucclass.index);
1596         break;
1597
1598       case '.':
1599         state->result = NewRENode(state, REOP_DOT);
1600         goto doSimple;
1601
1602       case '{':
1603       {
1604         const WCHAR *errp = state->cp--;
1605         INT err;
1606
1607         err = ParseMinMaxQuantifier(state, TRUE);
1608         state->cp = errp;
1609
1610         if (err < 0)
1611             goto asFlat;
1612
1613         /* FALL THROUGH */
1614       }
1615       case '*':
1616       case '+':
1617       case '?':
1618         ReportRegExpErrorHelper(state, JSREPORT_ERROR,
1619                                 JSMSG_BAD_QUANTIFIER, state->cp - 1);
1620         return FALSE;
1621       default:
1622 asFlat:
1623         state->result = NewRENode(state, REOP_FLAT);
1624         if (!state->result)
1625             return FALSE;
1626         state->result->u.flat.chr = c;
1627         state->result->u.flat.length = 1;
1628         state->result->kid = (void *) (state->cp - 1);
1629         state->progLength += 3;
1630         break;
1631     }
1632     return ParseQuantifier(state);
1633 }
1634
1635 /*
1636  * Top-down regular expression grammar, based closely on Perl4.
1637  *
1638  *  regexp:     altern                  A regular expression is one or more
1639  *              altern '|' regexp       alternatives separated by vertical bar.
1640  */
1641 #define INITIAL_STACK_SIZE  128
1642
1643 static BOOL
1644 ParseRegExp(CompilerState *state)
1645 {
1646     size_t parenIndex;
1647     RENode *operand;
1648     REOpData *operatorStack;
1649     RENode **operandStack;
1650     REOp op;
1651     INT i;
1652     BOOL result = FALSE;
1653
1654     INT operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE;
1655     INT operandSP = 0, operandStackSize = INITIAL_STACK_SIZE;
1656
1657     /* Watch out for empty regexp */
1658     if (state->cp == state->cpend) {
1659         state->result = NewRENode(state, REOP_EMPTY);
1660         return (state->result != NULL);
1661     }
1662
1663     operatorStack = heap_alloc(sizeof(REOpData) * operatorStackSize);
1664     if (!operatorStack)
1665         return FALSE;
1666
1667     operandStack = heap_alloc(sizeof(RENode *) * operandStackSize);
1668     if (!operandStack)
1669         goto out;
1670
1671     for (;;) {
1672         parenIndex = state->parenCount;
1673         if (state->cp == state->cpend) {
1674             /*
1675              * If we are at the end of the regexp and we're short one or more
1676              * operands, the regexp must have the form /x|/ or some such, with
1677              * left parentheses making us short more than one operand.
1678              */
1679             if (operatorSP >= operandSP) {
1680                 operand = NewRENode(state, REOP_EMPTY);
1681                 if (!operand)
1682                     goto out;
1683                 goto pushOperand;
1684             }
1685         } else {
1686             switch (*state->cp) {
1687               case '(':
1688                 ++state->cp;
1689                 if (state->cp + 1 < state->cpend &&
1690                     *state->cp == '?' &&
1691                     (state->cp[1] == '=' ||
1692                      state->cp[1] == '!' ||
1693                      state->cp[1] == ':')) {
1694                     switch (state->cp[1]) {
1695                       case '=':
1696                         op = REOP_ASSERT;
1697                         /* ASSERT, <next>, ... ASSERTTEST */
1698                         state->progLength += 4;
1699                         break;
1700                       case '!':
1701                         op = REOP_ASSERT_NOT;
1702                         /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
1703                         state->progLength += 4;
1704                         break;
1705                       default:
1706                         op = REOP_LPARENNON;
1707                         break;
1708                     }
1709                     state->cp += 2;
1710                 } else {
1711                     op = REOP_LPAREN;
1712                     /* LPAREN, <index>, ... RPAREN, <index> */
1713                     state->progLength
1714                         += 2 * (1 + GetCompactIndexWidth(parenIndex));
1715                     state->parenCount++;
1716                     if (state->parenCount == 65535) {
1717                         ReportRegExpError(state, JSREPORT_ERROR,
1718                                           JSMSG_TOO_MANY_PARENS);
1719                         goto out;
1720                     }
1721                 }
1722                 goto pushOperator;
1723
1724               case ')':
1725                 /*
1726                  * If there's no stacked open parenthesis, throw syntax error.
1727                  */
1728                 for (i = operatorSP - 1; ; i--) {
1729                     if (i < 0) {
1730                         ReportRegExpError(state, JSREPORT_ERROR,
1731                                           JSMSG_UNMATCHED_RIGHT_PAREN);
1732                         goto out;
1733                     }
1734                     if (operatorStack[i].op == REOP_ASSERT ||
1735                         operatorStack[i].op == REOP_ASSERT_NOT ||
1736                         operatorStack[i].op == REOP_LPARENNON ||
1737                         operatorStack[i].op == REOP_LPAREN) {
1738                         break;
1739                     }
1740                 }
1741                 /* FALL THROUGH */
1742
1743               case '|':
1744                 /* Expected an operand before these, so make an empty one */
1745                 operand = NewRENode(state, REOP_EMPTY);
1746                 if (!operand)
1747                     goto out;
1748                 goto pushOperand;
1749
1750               default:
1751                 if (!ParseTerm(state))
1752                     goto out;
1753                 operand = state->result;
1754 pushOperand:
1755                 if (operandSP == operandStackSize) {
1756                     RENode **tmp;
1757                     operandStackSize += operandStackSize;
1758                     tmp = heap_realloc(operandStack, sizeof(RENode *) * operandStackSize);
1759                     if (!tmp)
1760                         goto out;
1761                     operandStack = tmp;
1762                 }
1763                 operandStack[operandSP++] = operand;
1764                 break;
1765             }
1766         }
1767
1768         /* At the end; process remaining operators. */
1769 restartOperator:
1770         if (state->cp == state->cpend) {
1771             while (operatorSP) {
1772                 --operatorSP;
1773                 if (!ProcessOp(state, &operatorStack[operatorSP],
1774                                operandStack, operandSP))
1775                     goto out;
1776                 --operandSP;
1777             }
1778             assert(operandSP == 1);
1779             state->result = operandStack[0];
1780             result = TRUE;
1781             goto out;
1782         }
1783
1784         switch (*state->cp) {
1785           case '|':
1786             /* Process any stacked 'concat' operators */
1787             ++state->cp;
1788             while (operatorSP &&
1789                    operatorStack[operatorSP - 1].op == REOP_CONCAT) {
1790                 --operatorSP;
1791                 if (!ProcessOp(state, &operatorStack[operatorSP],
1792                                operandStack, operandSP)) {
1793                     goto out;
1794                 }
1795                 --operandSP;
1796             }
1797             op = REOP_ALT;
1798             goto pushOperator;
1799
1800           case ')':
1801             /*
1802              * If there's no stacked open parenthesis, throw syntax error.
1803              */
1804             for (i = operatorSP - 1; ; i--) {
1805                 if (i < 0) {
1806                     ReportRegExpError(state, JSREPORT_ERROR,
1807                                       JSMSG_UNMATCHED_RIGHT_PAREN);
1808                     goto out;
1809                 }
1810                 if (operatorStack[i].op == REOP_ASSERT ||
1811                     operatorStack[i].op == REOP_ASSERT_NOT ||
1812                     operatorStack[i].op == REOP_LPARENNON ||
1813                     operatorStack[i].op == REOP_LPAREN) {
1814                     break;
1815                 }
1816             }
1817             ++state->cp;
1818
1819             /* Process everything on the stack until the open parenthesis. */
1820             for (;;) {
1821                 assert(operatorSP);
1822                 --operatorSP;
1823                 switch (operatorStack[operatorSP].op) {
1824                   case REOP_ASSERT:
1825                   case REOP_ASSERT_NOT:
1826                   case REOP_LPAREN:
1827                     operand = NewRENode(state, operatorStack[operatorSP].op);
1828                     if (!operand)
1829                         goto out;
1830                     operand->u.parenIndex =
1831                         operatorStack[operatorSP].parenIndex;
1832                     assert(operandSP);
1833                     operand->kid = operandStack[operandSP - 1];
1834                     operandStack[operandSP - 1] = operand;
1835                     if (state->treeDepth == TREE_DEPTH_MAX) {
1836                         ReportRegExpError(state, JSREPORT_ERROR,
1837                                           JSMSG_REGEXP_TOO_COMPLEX);
1838                         goto out;
1839                     }
1840                     ++state->treeDepth;
1841                     /* FALL THROUGH */
1842
1843                   case REOP_LPARENNON:
1844                     state->result = operandStack[operandSP - 1];
1845                     if (!ParseQuantifier(state))
1846                         goto out;
1847                     operandStack[operandSP - 1] = state->result;
1848                     goto restartOperator;
1849                   default:
1850                     if (!ProcessOp(state, &operatorStack[operatorSP],
1851                                    operandStack, operandSP))
1852                         goto out;
1853                     --operandSP;
1854                     break;
1855                 }
1856             }
1857             break;
1858
1859           case '{':
1860           {
1861             const WCHAR *errp = state->cp;
1862
1863             if (ParseMinMaxQuantifier(state, TRUE) < 0) {
1864                 /*
1865                  * This didn't even scan correctly as a quantifier, so we should
1866                  * treat it as flat.
1867                  */
1868                 op = REOP_CONCAT;
1869                 goto pushOperator;
1870             }
1871
1872             state->cp = errp;
1873             /* FALL THROUGH */
1874           }
1875
1876           case '+':
1877           case '*':
1878           case '?':
1879             ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_BAD_QUANTIFIER,
1880                                     state->cp);
1881             result = FALSE;
1882             goto out;
1883
1884           default:
1885             /* Anything else is the start of the next term. */
1886             op = REOP_CONCAT;
1887 pushOperator:
1888             if (operatorSP == operatorStackSize) {
1889                 REOpData *tmp;
1890                 operatorStackSize += operatorStackSize;
1891                 tmp = heap_realloc(operatorStack, sizeof(REOpData) * operatorStackSize);
1892                 if (!tmp)
1893                     goto out;
1894                 operatorStack = tmp;
1895             }
1896             operatorStack[operatorSP].op = op;
1897             operatorStack[operatorSP].errPos = state->cp;
1898             operatorStack[operatorSP++].parenIndex = parenIndex;
1899             break;
1900         }
1901     }
1902 out:
1903     heap_free(operatorStack);
1904     heap_free(operandStack);
1905     return result;
1906 }
1907
1908 /*
1909  * Save the current state of the match - the position in the input
1910  * text as well as the position in the bytecode. The state of any
1911  * parent expressions is also saved (preceding state).
1912  * Contents of parenCount parentheses from parenIndex are also saved.
1913  */
1914 static REBackTrackData *
1915 PushBackTrackState(REGlobalData *gData, REOp op,
1916                    jsbytecode *target, match_state_t *x, const WCHAR *cp,
1917                    size_t parenIndex, size_t parenCount)
1918 {
1919     size_t i;
1920     REBackTrackData *result =
1921         (REBackTrackData *) ((char *)gData->backTrackSP + gData->cursz);
1922
1923     size_t sz = sizeof(REBackTrackData) +
1924                 gData->stateStackTop * sizeof(REProgState) +
1925                 parenCount * sizeof(RECapture);
1926
1927     ptrdiff_t btsize = gData->backTrackStackSize;
1928     ptrdiff_t btincr = ((char *)result + sz) -
1929                        ((char *)gData->backTrackStack + btsize);
1930
1931     TRACE("\tBT_Push: %lu,%lu\n", (ULONG_PTR)parenIndex, (ULONG_PTR)parenCount);
1932
1933     JS_COUNT_OPERATION(gData->cx, JSOW_JUMP * (1 + parenCount));
1934     if (btincr > 0) {
1935         ptrdiff_t offset = (char *)result - (char *)gData->backTrackStack;
1936
1937         JS_COUNT_OPERATION(gData->cx, JSOW_ALLOCATION);
1938         btincr = ((btincr+btsize-1)/btsize)*btsize;
1939         gData->backTrackStack = heap_pool_grow(gData->pool, gData->backTrackStack, btsize, btincr);
1940         if (!gData->backTrackStack) {
1941             js_ReportOutOfScriptQuota(gData->cx);
1942             gData->ok = FALSE;
1943             return NULL;
1944         }
1945         gData->backTrackStackSize = btsize + btincr;
1946         result = (REBackTrackData *) ((char *)gData->backTrackStack + offset);
1947     }
1948     gData->backTrackSP = result;
1949     result->sz = gData->cursz;
1950     gData->cursz = sz;
1951
1952     result->backtrack_op = op;
1953     result->backtrack_pc = target;
1954     result->cp = cp;
1955     result->parenCount = parenCount;
1956     result->parenIndex = parenIndex;
1957
1958     result->saveStateStackTop = gData->stateStackTop;
1959     assert(gData->stateStackTop);
1960     memcpy(result + 1, gData->stateStack,
1961            sizeof(REProgState) * result->saveStateStackTop);
1962
1963     if (parenCount != 0) {
1964         memcpy((char *)(result + 1) +
1965                sizeof(REProgState) * result->saveStateStackTop,
1966                &x->parens[parenIndex],
1967                sizeof(RECapture) * parenCount);
1968         for (i = 0; i != parenCount; i++)
1969             x->parens[parenIndex + i].index = -1;
1970     }
1971
1972     return result;
1973 }
1974
1975 static inline match_state_t *
1976 FlatNIMatcher(REGlobalData *gData, match_state_t *x, const WCHAR *matchChars,
1977               size_t length)
1978 {
1979     size_t i;
1980     assert(gData->cpend >= x->cp);
1981     if (length > (size_t)(gData->cpend - x->cp))
1982         return NULL;
1983     for (i = 0; i != length; i++) {
1984         if (toupperW(matchChars[i]) != toupperW(x->cp[i]))
1985             return NULL;
1986     }
1987     x->cp += length;
1988     return x;
1989 }
1990
1991 /*
1992  * 1. Evaluate DecimalEscape to obtain an EscapeValue E.
1993  * 2. If E is not a character then go to step 6.
1994  * 3. Let ch be E's character.
1995  * 4. Let A be a one-element RECharSet containing the character ch.
1996  * 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
1997  * 6. E must be an integer. Let n be that integer.
1998  * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
1999  * 8. Return an internal Matcher closure that takes two arguments, a State x
2000  *    and a Continuation c, and performs the following:
2001  *     1. Let cap be x's captures internal array.
2002  *     2. Let s be cap[n].
2003  *     3. If s is undefined, then call c(x) and return its result.
2004  *     4. Let e be x's endIndex.
2005  *     5. Let len be s's length.
2006  *     6. Let f be e+len.
2007  *     7. If f>InputLength, return failure.
2008  *     8. If there exists an integer i between 0 (inclusive) and len (exclusive)
2009  *        such that Canonicalize(s[i]) is not the same character as
2010  *        Canonicalize(Input [e+i]), then return failure.
2011  *     9. Let y be the State (f, cap).
2012  *     10. Call c(y) and return its result.
2013  */
2014 static match_state_t *
2015 BackrefMatcher(REGlobalData *gData, match_state_t *x, size_t parenIndex)
2016 {
2017     size_t len, i;
2018     const WCHAR *parenContent;
2019     RECapture *cap = &x->parens[parenIndex];
2020
2021     if (cap->index == -1)
2022         return x;
2023
2024     len = cap->length;
2025     if (x->cp + len > gData->cpend)
2026         return NULL;
2027
2028     parenContent = &gData->cpbegin[cap->index];
2029     if (gData->regexp->flags & REG_FOLD) {
2030         for (i = 0; i < len; i++) {
2031             if (toupperW(parenContent[i]) != toupperW(x->cp[i]))
2032                 return NULL;
2033         }
2034     } else {
2035         for (i = 0; i < len; i++) {
2036             if (parenContent[i] != x->cp[i])
2037                 return NULL;
2038         }
2039     }
2040     x->cp += len;
2041     return x;
2042 }
2043
2044 /* Add a single character to the RECharSet */
2045 static void
2046 AddCharacterToCharSet(RECharSet *cs, WCHAR c)
2047 {
2048     UINT byteIndex = (UINT)(c >> 3);
2049     assert(c <= cs->length);
2050     cs->u.bits[byteIndex] |= 1 << (c & 0x7);
2051 }
2052
2053
2054 /* Add a character range, c1 to c2 (inclusive) to the RECharSet */
2055 static void
2056 AddCharacterRangeToCharSet(RECharSet *cs, UINT c1, UINT c2)
2057 {
2058     UINT i;
2059
2060     UINT byteIndex1 = c1 >> 3;
2061     UINT byteIndex2 = c2 >> 3;
2062
2063     assert(c2 <= cs->length && c1 <= c2);
2064
2065     c1 &= 0x7;
2066     c2 &= 0x7;
2067
2068     if (byteIndex1 == byteIndex2) {
2069         cs->u.bits[byteIndex1] |= ((BYTE)0xFF >> (7 - (c2 - c1))) << c1;
2070     } else {
2071         cs->u.bits[byteIndex1] |= 0xFF << c1;
2072         for (i = byteIndex1 + 1; i < byteIndex2; i++)
2073             cs->u.bits[i] = 0xFF;
2074         cs->u.bits[byteIndex2] |= (BYTE)0xFF >> (7 - c2);
2075     }
2076 }
2077
2078 /* Compile the source of the class into a RECharSet */
2079 static BOOL
2080 ProcessCharSet(REGlobalData *gData, RECharSet *charSet)
2081 {
2082     const WCHAR *src, *end;
2083     BOOL inRange = FALSE;
2084     WCHAR rangeStart = 0;
2085     UINT byteLength, n;
2086     WCHAR c, thisCh;
2087     INT nDigits, i;
2088
2089     assert(!charSet->converted);
2090     /*
2091      * Assert that startIndex and length points to chars inside [] inside
2092      * source string.
2093      */
2094     assert(1 <= charSet->u.src.startIndex);
2095     assert(charSet->u.src.startIndex < gData->regexp->source_len);
2096     assert(charSet->u.src.length <= gData->regexp->source_len
2097             - 1 - charSet->u.src.startIndex);
2098
2099     charSet->converted = TRUE;
2100     src = gData->regexp->source + charSet->u.src.startIndex;
2101
2102     end = src + charSet->u.src.length;
2103
2104     assert(src[-1] == '[' && end[0] == ']');
2105
2106     byteLength = (charSet->length >> 3) + 1;
2107     charSet->u.bits = heap_alloc(byteLength);
2108     if (!charSet->u.bits) {
2109         JS_ReportOutOfMemory(gData->cx);
2110         gData->ok = FALSE;
2111         return FALSE;
2112     }
2113     memset(charSet->u.bits, 0, byteLength);
2114
2115     if (src == end)
2116         return TRUE;
2117
2118     if (*src == '^') {
2119         assert(charSet->sense == FALSE);
2120         ++src;
2121     } else {
2122         assert(charSet->sense == TRUE);
2123     }
2124
2125     while (src != end) {
2126         switch (*src) {
2127           case '\\':
2128             ++src;
2129             c = *src++;
2130             switch (c) {
2131               case 'b':
2132                 thisCh = 0x8;
2133                 break;
2134               case 'f':
2135                 thisCh = 0xC;
2136                 break;
2137               case 'n':
2138                 thisCh = 0xA;
2139                 break;
2140               case 'r':
2141                 thisCh = 0xD;
2142                 break;
2143               case 't':
2144                 thisCh = 0x9;
2145                 break;
2146               case 'v':
2147                 thisCh = 0xB;
2148                 break;
2149               case 'c':
2150                 if (src < end && JS_ISWORD(*src)) {
2151                     thisCh = (WCHAR)(*src++ & 0x1F);
2152                 } else {
2153                     --src;
2154                     thisCh = '\\';
2155                 }
2156                 break;
2157               case 'x':
2158                 nDigits = 2;
2159                 goto lexHex;
2160               case 'u':
2161                 nDigits = 4;
2162             lexHex:
2163                 n = 0;
2164                 for (i = 0; (i < nDigits) && (src < end); i++) {
2165                     UINT digit;
2166                     c = *src++;
2167                     if (!isASCIIHexDigit(c, &digit)) {
2168                         /*
2169                          * Back off to accepting the original '\'
2170                          * as a literal
2171                          */
2172                         src -= i + 1;
2173                         n = '\\';
2174                         break;
2175                     }
2176                     n = (n << 4) | digit;
2177                 }
2178                 thisCh = (WCHAR)n;
2179                 break;
2180               case '0':
2181               case '1':
2182               case '2':
2183               case '3':
2184               case '4':
2185               case '5':
2186               case '6':
2187               case '7':
2188                 /*
2189                  *  This is a non-ECMA extension - decimal escapes (in this
2190                  *  case, octal!) are supposed to be an error inside class
2191                  *  ranges, but supported here for backwards compatibility.
2192                  */
2193                 n = JS7_UNDEC(c);
2194                 c = *src;
2195                 if ('0' <= c && c <= '7') {
2196                     src++;
2197                     n = 8 * n + JS7_UNDEC(c);
2198                     c = *src;
2199                     if ('0' <= c && c <= '7') {
2200                         src++;
2201                         i = 8 * n + JS7_UNDEC(c);
2202                         if (i <= 0377)
2203                             n = i;
2204                         else
2205                             src--;
2206                     }
2207                 }
2208                 thisCh = (WCHAR)n;
2209                 break;
2210
2211               case 'd':
2212                 AddCharacterRangeToCharSet(charSet, '0', '9');
2213                 continue;   /* don't need range processing */
2214               case 'D':
2215                 AddCharacterRangeToCharSet(charSet, 0, '0' - 1);
2216                 AddCharacterRangeToCharSet(charSet,
2217                                            (WCHAR)('9' + 1),
2218                                            (WCHAR)charSet->length);
2219                 continue;
2220               case 's':
2221                 for (i = (INT)charSet->length; i >= 0; i--)
2222                     if (isspaceW(i))
2223                         AddCharacterToCharSet(charSet, (WCHAR)i);
2224                 continue;
2225               case 'S':
2226                 for (i = (INT)charSet->length; i >= 0; i--)
2227                     if (!isspaceW(i))
2228                         AddCharacterToCharSet(charSet, (WCHAR)i);
2229                 continue;
2230               case 'w':
2231                 for (i = (INT)charSet->length; i >= 0; i--)
2232                     if (JS_ISWORD(i))
2233                         AddCharacterToCharSet(charSet, (WCHAR)i);
2234                 continue;
2235               case 'W':
2236                 for (i = (INT)charSet->length; i >= 0; i--)
2237                     if (!JS_ISWORD(i))
2238                         AddCharacterToCharSet(charSet, (WCHAR)i);
2239                 continue;
2240               default:
2241                 thisCh = c;
2242                 break;
2243
2244             }
2245             break;
2246
2247           default:
2248             thisCh = *src++;
2249             break;
2250
2251         }
2252         if (inRange) {
2253             if (gData->regexp->flags & REG_FOLD) {
2254                 assert(rangeStart <= thisCh);
2255                 for (i = rangeStart; i <= thisCh; i++) {
2256                     WCHAR uch, dch;
2257
2258                     AddCharacterToCharSet(charSet, i);
2259                     uch = toupperW(i);
2260                     dch = tolowerW(i);
2261                     if (i != uch)
2262                         AddCharacterToCharSet(charSet, uch);
2263                     if (i != dch)
2264                         AddCharacterToCharSet(charSet, dch);
2265                 }
2266             } else {
2267                 AddCharacterRangeToCharSet(charSet, rangeStart, thisCh);
2268             }
2269             inRange = FALSE;
2270         } else {
2271             if (gData->regexp->flags & REG_FOLD) {
2272                 AddCharacterToCharSet(charSet, toupperW(thisCh));
2273                 AddCharacterToCharSet(charSet, tolowerW(thisCh));
2274             } else {
2275                 AddCharacterToCharSet(charSet, thisCh);
2276             }
2277             if (src < end - 1) {
2278                 if (*src == '-') {
2279                     ++src;
2280                     inRange = TRUE;
2281                     rangeStart = thisCh;
2282                 }
2283             }
2284         }
2285     }
2286     return TRUE;
2287 }
2288
2289 static BOOL
2290 ReallocStateStack(REGlobalData *gData)
2291 {
2292     size_t limit = gData->stateStackLimit;
2293     size_t sz = sizeof(REProgState) * limit;
2294
2295     gData->stateStack = heap_pool_grow(gData->pool, gData->stateStack, sz, sz);
2296     if (!gData->stateStack) {
2297         js_ReportOutOfScriptQuota(gData->cx);
2298         gData->ok = FALSE;
2299         return FALSE;
2300     }
2301     gData->stateStackLimit = limit + limit;
2302     return TRUE;
2303 }
2304
2305 #define PUSH_STATE_STACK(data)                                                \
2306     do {                                                                      \
2307         ++(data)->stateStackTop;                                              \
2308         if ((data)->stateStackTop == (data)->stateStackLimit &&               \
2309             !ReallocStateStack((data))) {                                     \
2310             return NULL;                                                      \
2311         }                                                                     \
2312     }while(0)
2313
2314 /*
2315  * Apply the current op against the given input to see if it's going to match
2316  * or fail. Return false if we don't get a match, true if we do. If updatecp is
2317  * true, then update the current state's cp. Always update startpc to the next
2318  * op.
2319  */
2320 static inline match_state_t *
2321 SimpleMatch(REGlobalData *gData, match_state_t *x, REOp op,
2322             jsbytecode **startpc, BOOL updatecp)
2323 {
2324     match_state_t *result = NULL;
2325     WCHAR matchCh;
2326     size_t parenIndex;
2327     size_t offset, length, index;
2328     jsbytecode *pc = *startpc;  /* pc has already been incremented past op */
2329     const WCHAR *source;
2330     const WCHAR *startcp = x->cp;
2331     WCHAR ch;
2332     RECharSet *charSet;
2333
2334     const char *opname = reop_names[op];
2335     TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2336           (int)gData->stateStackTop * 2, "", opname);
2337
2338     switch (op) {
2339       case REOP_EMPTY:
2340         result = x;
2341         break;
2342       case REOP_BOL:
2343         if (x->cp != gData->cpbegin) {
2344             if (/*!gData->cx->regExpStatics.multiline &&  FIXME !!! */
2345                 !(gData->regexp->flags & REG_MULTILINE)) {
2346                 break;
2347             }
2348             if (!RE_IS_LINE_TERM(x->cp[-1]))
2349                 break;
2350         }
2351         result = x;
2352         break;
2353       case REOP_EOL:
2354         if (x->cp != gData->cpend) {
2355             if (/*!gData->cx->regExpStatics.multiline &&*/
2356                 !(gData->regexp->flags & REG_MULTILINE)) {
2357                 break;
2358             }
2359             if (!RE_IS_LINE_TERM(*x->cp))
2360                 break;
2361         }
2362         result = x;
2363         break;
2364       case REOP_WBDRY:
2365         if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2366             !(x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2367             result = x;
2368         }
2369         break;
2370       case REOP_WNONBDRY:
2371         if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
2372             (x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
2373             result = x;
2374         }
2375         break;
2376       case REOP_DOT:
2377         if (x->cp != gData->cpend && !RE_IS_LINE_TERM(*x->cp)) {
2378             result = x;
2379             result->cp++;
2380         }
2381         break;
2382       case REOP_DIGIT:
2383         if (x->cp != gData->cpend && JS7_ISDEC(*x->cp)) {
2384             result = x;
2385             result->cp++;
2386         }
2387         break;
2388       case REOP_NONDIGIT:
2389         if (x->cp != gData->cpend && !JS7_ISDEC(*x->cp)) {
2390             result = x;
2391             result->cp++;
2392         }
2393         break;
2394       case REOP_ALNUM:
2395         if (x->cp != gData->cpend && JS_ISWORD(*x->cp)) {
2396             result = x;
2397             result->cp++;
2398         }
2399         break;
2400       case REOP_NONALNUM:
2401         if (x->cp != gData->cpend && !JS_ISWORD(*x->cp)) {
2402             result = x;
2403             result->cp++;
2404         }
2405         break;
2406       case REOP_SPACE:
2407         if (x->cp != gData->cpend && isspaceW(*x->cp)) {
2408             result = x;
2409             result->cp++;
2410         }
2411         break;
2412       case REOP_NONSPACE:
2413         if (x->cp != gData->cpend && !isspaceW(*x->cp)) {
2414             result = x;
2415             result->cp++;
2416         }
2417         break;
2418       case REOP_BACKREF:
2419         pc = ReadCompactIndex(pc, &parenIndex);
2420         assert(parenIndex < gData->regexp->parenCount);
2421         result = BackrefMatcher(gData, x, parenIndex);
2422         break;
2423       case REOP_FLAT:
2424         pc = ReadCompactIndex(pc, &offset);
2425         assert(offset < gData->regexp->source_len);
2426         pc = ReadCompactIndex(pc, &length);
2427         assert(1 <= length);
2428         assert(length <= gData->regexp->source_len - offset);
2429         if (length <= (size_t)(gData->cpend - x->cp)) {
2430             source = gData->regexp->source + offset;
2431             TRACE("%s\n", debugstr_wn(source, length));
2432             for (index = 0; index != length; index++) {
2433                 if (source[index] != x->cp[index])
2434                     return NULL;
2435             }
2436             x->cp += length;
2437             result = x;
2438         }
2439         break;
2440       case REOP_FLAT1:
2441         matchCh = *pc++;
2442         TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2443         if (x->cp != gData->cpend && *x->cp == matchCh) {
2444             result = x;
2445             result->cp++;
2446         }
2447         break;
2448       case REOP_FLATi:
2449         pc = ReadCompactIndex(pc, &offset);
2450         assert(offset < gData->regexp->source_len);
2451         pc = ReadCompactIndex(pc, &length);
2452         assert(1 <= length);
2453         assert(length <= gData->regexp->source_len - offset);
2454         source = gData->regexp->source;
2455         result = FlatNIMatcher(gData, x, source + offset, length);
2456         break;
2457       case REOP_FLAT1i:
2458         matchCh = *pc++;
2459         if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2460             result = x;
2461             result->cp++;
2462         }
2463         break;
2464       case REOP_UCFLAT1:
2465         matchCh = GET_ARG(pc);
2466         TRACE(" '%c' == '%c'\n", (char)matchCh, (char)*x->cp);
2467         pc += ARG_LEN;
2468         if (x->cp != gData->cpend && *x->cp == matchCh) {
2469             result = x;
2470             result->cp++;
2471         }
2472         break;
2473       case REOP_UCFLAT1i:
2474         matchCh = GET_ARG(pc);
2475         pc += ARG_LEN;
2476         if (x->cp != gData->cpend && toupperW(*x->cp) == toupperW(matchCh)) {
2477             result = x;
2478             result->cp++;
2479         }
2480         break;
2481       case REOP_CLASS:
2482         pc = ReadCompactIndex(pc, &index);
2483         assert(index < gData->regexp->classCount);
2484         if (x->cp != gData->cpend) {
2485             charSet = &gData->regexp->classList[index];
2486             assert(charSet->converted);
2487             ch = *x->cp;
2488             index = ch >> 3;
2489             if (charSet->length != 0 &&
2490                 ch <= charSet->length &&
2491                 (charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2492                 result = x;
2493                 result->cp++;
2494             }
2495         }
2496         break;
2497       case REOP_NCLASS:
2498         pc = ReadCompactIndex(pc, &index);
2499         assert(index < gData->regexp->classCount);
2500         if (x->cp != gData->cpend) {
2501             charSet = &gData->regexp->classList[index];
2502             assert(charSet->converted);
2503             ch = *x->cp;
2504             index = ch >> 3;
2505             if (charSet->length == 0 ||
2506                 ch > charSet->length ||
2507                 !(charSet->u.bits[index] & (1 << (ch & 0x7)))) {
2508                 result = x;
2509                 result->cp++;
2510             }
2511         }
2512         break;
2513
2514       default:
2515         assert(FALSE);
2516     }
2517     if (result) {
2518         if (!updatecp)
2519             x->cp = startcp;
2520         *startpc = pc;
2521         TRACE(" *\n");
2522         return result;
2523     }
2524     x->cp = startcp;
2525     return NULL;
2526 }
2527
2528 static inline match_state_t *
2529 ExecuteREBytecode(REGlobalData *gData, match_state_t *x)
2530 {
2531     match_state_t *result = NULL;
2532     REBackTrackData *backTrackData;
2533     jsbytecode *nextpc, *testpc;
2534     REOp nextop;
2535     RECapture *cap;
2536     REProgState *curState;
2537     const WCHAR *startcp;
2538     size_t parenIndex, k;
2539     size_t parenSoFar = 0;
2540
2541     WCHAR matchCh1, matchCh2;
2542     RECharSet *charSet;
2543
2544     BOOL anchor;
2545     jsbytecode *pc = gData->regexp->program;
2546     REOp op = (REOp) *pc++;
2547
2548     /*
2549      * If the first node is a simple match, step the index into the string
2550      * until that match is made, or fail if it can't be found at all.
2551      */
2552     if (REOP_IS_SIMPLE(op) && !(gData->regexp->flags & REG_STICKY)) {
2553         anchor = FALSE;
2554         while (x->cp <= gData->cpend) {
2555             nextpc = pc;    /* reset back to start each time */
2556             result = SimpleMatch(gData, x, op, &nextpc, TRUE);
2557             if (result) {
2558                 anchor = TRUE;
2559                 x = result;
2560                 pc = nextpc;    /* accept skip to next opcode */
2561                 op = (REOp) *pc++;
2562                 assert(op < REOP_LIMIT);
2563                 break;
2564             }
2565             gData->skipped++;
2566             x->cp++;
2567         }
2568         if (!anchor)
2569             goto bad;
2570     }
2571
2572     for (;;) {
2573         const char *opname = reop_names[op];
2574         TRACE("\n%06d: %*s%s\n", (int)(pc - gData->regexp->program),
2575               (int)gData->stateStackTop * 2, "", opname);
2576
2577         if (REOP_IS_SIMPLE(op)) {
2578             result = SimpleMatch(gData, x, op, &pc, TRUE);
2579         } else {
2580             curState = &gData->stateStack[gData->stateStackTop];
2581             switch (op) {
2582               case REOP_END:
2583                 goto good;
2584               case REOP_ALTPREREQ2:
2585                 nextpc = pc + GET_OFFSET(pc);   /* start of next op */
2586                 pc += ARG_LEN;
2587                 matchCh2 = GET_ARG(pc);
2588                 pc += ARG_LEN;
2589                 k = GET_ARG(pc);
2590                 pc += ARG_LEN;
2591
2592                 if (x->cp != gData->cpend) {
2593                     if (*x->cp == matchCh2)
2594                         goto doAlt;
2595
2596                     charSet = &gData->regexp->classList[k];
2597                     if (!charSet->converted && !ProcessCharSet(gData, charSet))
2598                         goto bad;
2599                     matchCh1 = *x->cp;
2600                     k = matchCh1 >> 3;
2601                     if ((charSet->length == 0 ||
2602                          matchCh1 > charSet->length ||
2603                          !(charSet->u.bits[k] & (1 << (matchCh1 & 0x7)))) ^
2604                         charSet->sense) {
2605                         goto doAlt;
2606                     }
2607                 }
2608                 result = NULL;
2609                 break;
2610
2611               case REOP_ALTPREREQ:
2612                 nextpc = pc + GET_OFFSET(pc);   /* start of next op */
2613                 pc += ARG_LEN;
2614                 matchCh1 = GET_ARG(pc);
2615                 pc += ARG_LEN;
2616                 matchCh2 = GET_ARG(pc);
2617                 pc += ARG_LEN;
2618                 if (x->cp == gData->cpend ||
2619                     (*x->cp != matchCh1 && *x->cp != matchCh2)) {
2620                     result = NULL;
2621                     break;
2622                 }
2623                 /* else fall through... */
2624
2625               case REOP_ALT:
2626               doAlt:
2627                 nextpc = pc + GET_OFFSET(pc);   /* start of next alternate */
2628                 pc += ARG_LEN;                  /* start of this alternate */
2629                 curState->parenSoFar = parenSoFar;
2630                 PUSH_STATE_STACK(gData);
2631                 op = (REOp) *pc++;
2632                 startcp = x->cp;
2633                 if (REOP_IS_SIMPLE(op)) {
2634                     if (!SimpleMatch(gData, x, op, &pc, TRUE)) {
2635                         op = (REOp) *nextpc++;
2636                         pc = nextpc;
2637                         continue;
2638                     }
2639                     result = x;
2640                     op = (REOp) *pc++;
2641                 }
2642                 nextop = (REOp) *nextpc++;
2643                 if (!PushBackTrackState(gData, nextop, nextpc, x, startcp, 0, 0))
2644                     goto bad;
2645                 continue;
2646
2647               /*
2648                * Occurs at (successful) end of REOP_ALT,
2649                */
2650               case REOP_JUMP:
2651                 /*
2652                  * If we have not gotten a result here, it is because of an
2653                  * empty match.  Do the same thing REOP_EMPTY would do.
2654                  */
2655                 if (!result)
2656                     result = x;
2657
2658                 --gData->stateStackTop;
2659                 pc += GET_OFFSET(pc);
2660                 op = (REOp) *pc++;
2661                 continue;
2662
2663               /*
2664                * Occurs at last (successful) end of REOP_ALT,
2665                */
2666               case REOP_ENDALT:
2667                 /*
2668                  * If we have not gotten a result here, it is because of an
2669                  * empty match.  Do the same thing REOP_EMPTY would do.
2670                  */
2671                 if (!result)
2672                     result = x;
2673
2674                 --gData->stateStackTop;
2675                 op = (REOp) *pc++;
2676                 continue;
2677
2678               case REOP_LPAREN:
2679                 pc = ReadCompactIndex(pc, &parenIndex);
2680                 TRACE("[ %lu ]\n", (ULONG_PTR)parenIndex);
2681                 assert(parenIndex < gData->regexp->parenCount);
2682                 if (parenIndex + 1 > parenSoFar)
2683                     parenSoFar = parenIndex + 1;
2684                 x->parens[parenIndex].index = x->cp - gData->cpbegin;
2685                 x->parens[parenIndex].length = 0;
2686                 op = (REOp) *pc++;
2687                 continue;
2688
2689               case REOP_RPAREN:
2690               {
2691                 ptrdiff_t delta;
2692
2693                 pc = ReadCompactIndex(pc, &parenIndex);
2694                 assert(parenIndex < gData->regexp->parenCount);
2695                 cap = &x->parens[parenIndex];
2696                 delta = x->cp - (gData->cpbegin + cap->index);
2697                 cap->length = (delta < 0) ? 0 : (size_t) delta;
2698                 op = (REOp) *pc++;
2699
2700                 if (!result)
2701                     result = x;
2702                 continue;
2703               }
2704               case REOP_ASSERT:
2705                 nextpc = pc + GET_OFFSET(pc);  /* start of term after ASSERT */
2706                 pc += ARG_LEN;                 /* start of ASSERT child */
2707                 op = (REOp) *pc++;
2708                 testpc = pc;
2709                 if (REOP_IS_SIMPLE(op) &&
2710                     !SimpleMatch(gData, x, op, &testpc, FALSE)) {
2711                     result = NULL;
2712                     break;
2713                 }
2714                 curState->u.assertion.top =
2715                     (char *)gData->backTrackSP - (char *)gData->backTrackStack;
2716                 curState->u.assertion.sz = gData->cursz;
2717                 curState->index = x->cp - gData->cpbegin;
2718                 curState->parenSoFar = parenSoFar;
2719                 PUSH_STATE_STACK(gData);
2720                 if (!PushBackTrackState(gData, REOP_ASSERTTEST,
2721                                         nextpc, x, x->cp, 0, 0)) {
2722                     goto bad;
2723                 }
2724                 continue;
2725
2726               case REOP_ASSERT_NOT:
2727                 nextpc = pc + GET_OFFSET(pc);
2728                 pc += ARG_LEN;
2729                 op = (REOp) *pc++;
2730                 testpc = pc;
2731                 if (REOP_IS_SIMPLE(op) /* Note - fail to fail! */ &&
2732                     SimpleMatch(gData, x, op, &testpc, FALSE) &&
2733                     *testpc == REOP_ASSERTNOTTEST) {
2734                     result = NULL;
2735                     break;
2736                 }
2737                 curState->u.assertion.top
2738                     = (char *)gData->backTrackSP -
2739                       (char *)gData->backTrackStack;
2740                 curState->u.assertion.sz = gData->cursz;
2741                 curState->index = x->cp - gData->cpbegin;
2742                 curState->parenSoFar = parenSoFar;
2743                 PUSH_STATE_STACK(gData);
2744                 if (!PushBackTrackState(gData, REOP_ASSERTNOTTEST,
2745                                         nextpc, x, x->cp, 0, 0)) {
2746                     goto bad;
2747                 }
2748                 continue;
2749
2750               case REOP_ASSERTTEST:
2751                 --gData->stateStackTop;
2752                 --curState;
2753                 x->cp = gData->cpbegin + curState->index;
2754                 gData->backTrackSP =
2755                     (REBackTrackData *) ((char *)gData->backTrackStack +
2756                                          curState->u.assertion.top);
2757                 gData->cursz = curState->u.assertion.sz;
2758                 if (result)
2759                     result = x;
2760                 break;
2761
2762               case REOP_ASSERTNOTTEST:
2763                 --gData->stateStackTop;
2764                 --curState;
2765                 x->cp = gData->cpbegin + curState->index;
2766                 gData->backTrackSP =
2767                     (REBackTrackData *) ((char *)gData->backTrackStack +
2768                                          curState->u.assertion.top);
2769                 gData->cursz = curState->u.assertion.sz;
2770                 result = (!result) ? x : NULL;
2771                 break;
2772               case REOP_STAR:
2773                 curState->u.quantifier.min = 0;
2774                 curState->u.quantifier.max = (UINT)-1;
2775                 goto quantcommon;
2776               case REOP_PLUS:
2777                 curState->u.quantifier.min = 1;
2778                 curState->u.quantifier.max = (UINT)-1;
2779                 goto quantcommon;
2780               case REOP_OPT:
2781                 curState->u.quantifier.min = 0;
2782                 curState->u.quantifier.max = 1;
2783                 goto quantcommon;
2784               case REOP_QUANT:
2785                 pc = ReadCompactIndex(pc, &k);
2786                 curState->u.quantifier.min = k;
2787                 pc = ReadCompactIndex(pc, &k);
2788                 /* max is k - 1 to use one byte for (UINT)-1 sentinel. */
2789                 curState->u.quantifier.max = k - 1;
2790                 assert(curState->u.quantifier.min <= curState->u.quantifier.max);
2791               quantcommon:
2792                 if (curState->u.quantifier.max == 0) {
2793                     pc = pc + GET_OFFSET(pc);
2794                     op = (REOp) *pc++;
2795                     result = x;
2796                     continue;
2797                 }
2798                 /* Step over <next> */
2799                 nextpc = pc + ARG_LEN;
2800                 op = (REOp) *nextpc++;
2801                 startcp = x->cp;
2802                 if (REOP_IS_SIMPLE(op)) {
2803                     if (!SimpleMatch(gData, x, op, &nextpc, TRUE)) {
2804                         if (curState->u.quantifier.min == 0)
2805                             result = x;
2806                         else
2807                             result = NULL;
2808                         pc = pc + GET_OFFSET(pc);
2809                         break;
2810                     }
2811                     op = (REOp) *nextpc++;
2812                     result = x;
2813                 }
2814                 curState->index = startcp - gData->cpbegin;
2815                 curState->continue_op = REOP_REPEAT;
2816                 curState->continue_pc = pc;
2817                 curState->parenSoFar = parenSoFar;
2818                 PUSH_STATE_STACK(gData);
2819                 if (curState->u.quantifier.min == 0 &&
2820                     !PushBackTrackState(gData, REOP_REPEAT, pc, x, startcp,
2821                                         0, 0)) {
2822                     goto bad;
2823                 }
2824                 pc = nextpc;
2825                 continue;
2826
2827               case REOP_ENDCHILD: /* marks the end of a quantifier child */
2828                 pc = curState[-1].continue_pc;
2829                 op = (REOp) curState[-1].continue_op;
2830
2831                 if (!result)
2832                     result = x;
2833                 continue;
2834
2835               case REOP_REPEAT:
2836                 --curState;
2837                 do {
2838                     --gData->stateStackTop;
2839                     if (!result) {
2840                         /* Failed, see if we have enough children. */
2841                         if (curState->u.quantifier.min == 0)
2842                             goto repeatDone;
2843                         goto break_switch;
2844                     }
2845                     if (curState->u.quantifier.min == 0 &&
2846                         x->cp == gData->cpbegin + curState->index) {
2847                         /* matched an empty string, that'll get us nowhere */
2848                         result = NULL;
2849                         goto break_switch;
2850                     }
2851                     if (curState->u.quantifier.min != 0)
2852                         curState->u.quantifier.min--;
2853                     if (curState->u.quantifier.max != (UINT) -1)
2854                         curState->u.quantifier.max--;
2855                     if (curState->u.quantifier.max == 0)
2856                         goto repeatDone;
2857                     nextpc = pc + ARG_LEN;
2858                     nextop = (REOp) *nextpc;
2859                     startcp = x->cp;
2860                     if (REOP_IS_SIMPLE(nextop)) {
2861                         nextpc++;
2862                         if (!SimpleMatch(gData, x, nextop, &nextpc, TRUE)) {
2863                             if (curState->u.quantifier.min == 0)
2864                                 goto repeatDone;
2865                             result = NULL;
2866                             goto break_switch;
2867                         }
2868                         result = x;
2869                     }
2870                     curState->index = startcp - gData->cpbegin;
2871                     PUSH_STATE_STACK(gData);
2872                     if (curState->u.quantifier.min == 0 &&
2873                         !PushBackTrackState(gData, REOP_REPEAT,
2874                                             pc, x, startcp,
2875                                             curState->parenSoFar,
2876                                             parenSoFar -
2877                                             curState->parenSoFar)) {
2878                         goto bad;
2879                     }
2880                 } while (*nextpc == REOP_ENDCHILD);
2881                 pc = nextpc;
2882                 op = (REOp) *pc++;
2883                 parenSoFar = curState->parenSoFar;
2884                 continue;
2885
2886               repeatDone:
2887                 result = x;
2888                 pc += GET_OFFSET(pc);
2889                 goto break_switch;
2890
2891               case REOP_MINIMALSTAR:
2892                 curState->u.quantifier.min = 0;
2893                 curState->u.quantifier.max = (UINT)-1;
2894                 goto minimalquantcommon;
2895               case REOP_MINIMALPLUS:
2896                 curState->u.quantifier.min = 1;
2897                 curState->u.quantifier.max = (UINT)-1;
2898                 goto minimalquantcommon;
2899               case REOP_MINIMALOPT:
2900                 curState->u.quantifier.min = 0;
2901                 curState->u.quantifier.max = 1;
2902                 goto minimalquantcommon;
2903               case REOP_MINIMALQUANT:
2904                 pc = ReadCompactIndex(pc, &k);
2905                 curState->u.quantifier.min = k;
2906                 pc = ReadCompactIndex(pc, &k);
2907                 /* See REOP_QUANT comments about k - 1. */
2908                 curState->u.quantifier.max = k - 1;
2909                 assert(curState->u.quantifier.min
2910                           <= curState->u.quantifier.max);
2911               minimalquantcommon:
2912                 curState->index = x->cp - gData->cpbegin;
2913                 curState->parenSoFar = parenSoFar;
2914                 PUSH_STATE_STACK(gData);
2915                 if (curState->u.quantifier.min != 0) {
2916                     curState->continue_op = REOP_MINIMALREPEAT;
2917                     curState->continue_pc = pc;
2918                     /* step over <next> */
2919                     pc += OFFSET_LEN;
2920                     op = (REOp) *pc++;
2921                 } else {
2922                     if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2923                                             pc, x, x->cp, 0, 0)) {
2924                         goto bad;
2925                     }
2926                     --gData->stateStackTop;
2927                     pc = pc + GET_OFFSET(pc);
2928                     op = (REOp) *pc++;
2929                 }
2930                 continue;
2931
2932               case REOP_MINIMALREPEAT:
2933                 --gData->stateStackTop;
2934                 --curState;
2935
2936                 TRACE("{%d,%d}\n", curState->u.quantifier.min, curState->u.quantifier.max);
2937 #define PREPARE_REPEAT()                                                      \
2938     do {                                                                      \
2939         curState->index = x->cp - gData->cpbegin;                          \
2940         curState->continue_op = REOP_MINIMALREPEAT;                           \
2941         curState->continue_pc = pc;                                           \
2942         pc += ARG_LEN;                                                        \
2943         for (k = curState->parenSoFar; k < parenSoFar; k++)                   \
2944             x->parens[k].index = -1;                                          \
2945         PUSH_STATE_STACK(gData);                                              \
2946         op = (REOp) *pc++;                                                    \
2947         assert(op < REOP_LIMIT);                                              \
2948     }while(0)
2949
2950                 if (!result) {
2951                     TRACE(" -\n");
2952                     /*
2953                      * Non-greedy failure - try to consume another child.
2954                      */
2955                     if (curState->u.quantifier.max == (UINT) -1 ||
2956                         curState->u.quantifier.max > 0) {
2957                         PREPARE_REPEAT();
2958                         continue;
2959                     }
2960                     /* Don't need to adjust pc since we're going to pop. */
2961                     break;
2962                 }
2963                 if (curState->u.quantifier.min == 0 &&
2964                     x->cp == gData->cpbegin + curState->index) {
2965                     /* Matched an empty string, that'll get us nowhere. */
2966                     result = NULL;
2967                     break;
2968                 }
2969                 if (curState->u.quantifier.min != 0)
2970                     curState->u.quantifier.min--;
2971                 if (curState->u.quantifier.max != (UINT) -1)
2972                     curState->u.quantifier.max--;
2973                 if (curState->u.quantifier.min != 0) {
2974                     PREPARE_REPEAT();
2975                     continue;
2976                 }
2977                 curState->index = x->cp - gData->cpbegin;
2978                 curState->parenSoFar = parenSoFar;
2979                 PUSH_STATE_STACK(gData);
2980                 if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
2981                                         pc, x, x->cp,
2982                                         curState->parenSoFar,
2983                                         parenSoFar - curState->parenSoFar)) {
2984                     goto bad;
2985                 }
2986                 --gData->stateStackTop;
2987                 pc = pc + GET_OFFSET(pc);
2988                 op = (REOp) *pc++;
2989                 assert(op < REOP_LIMIT);
2990                 continue;
2991               default:
2992                 assert(FALSE);
2993                 result = NULL;
2994             }
2995           break_switch:;
2996         }
2997
2998         /*
2999          *  If the match failed and there's a backtrack option, take it.
3000          *  Otherwise this is a complete and utter failure.
3001          */
3002         if (!result) {
3003             if (gData->cursz == 0)
3004                 return NULL;
3005
3006             /* Potentially detect explosive regex here. */
3007             gData->backTrackCount++;
3008             if (gData->backTrackLimit &&
3009                 gData->backTrackCount >= gData->backTrackLimit) {
3010                 JS_ReportErrorNumber(gData->cx, js_GetErrorMessage, NULL,
3011                                      JSMSG_REGEXP_TOO_COMPLEX);
3012                 gData->ok = FALSE;
3013                 return NULL;
3014             }
3015
3016             backTrackData = gData->backTrackSP;
3017             gData->cursz = backTrackData->sz;
3018             gData->backTrackSP =
3019                 (REBackTrackData *) ((char *)backTrackData - backTrackData->sz);
3020             x->cp = backTrackData->cp;
3021             pc = backTrackData->backtrack_pc;
3022             op = (REOp) backTrackData->backtrack_op;
3023             assert(op < REOP_LIMIT);
3024             gData->stateStackTop = backTrackData->saveStateStackTop;
3025             assert(gData->stateStackTop);
3026
3027             memcpy(gData->stateStack, backTrackData + 1,
3028                    sizeof(REProgState) * backTrackData->saveStateStackTop);
3029             curState = &gData->stateStack[gData->stateStackTop - 1];
3030
3031             if (backTrackData->parenCount) {
3032                 memcpy(&x->parens[backTrackData->parenIndex],
3033                        (char *)(backTrackData + 1) +
3034                        sizeof(REProgState) * backTrackData->saveStateStackTop,
3035                        sizeof(RECapture) * backTrackData->parenCount);
3036                 parenSoFar = backTrackData->parenIndex + backTrackData->parenCount;
3037             } else {
3038                 for (k = curState->parenSoFar; k < parenSoFar; k++)
3039                     x->parens[k].index = -1;
3040                 parenSoFar = curState->parenSoFar;
3041             }
3042
3043             TRACE("\tBT_Pop: %ld,%ld\n",
3044                      (ULONG_PTR)backTrackData->parenIndex,
3045                      (ULONG_PTR)backTrackData->parenCount);
3046             continue;
3047         }
3048         x = result;
3049
3050         /*
3051          *  Continue with the expression.
3052          */
3053         op = (REOp)*pc++;
3054         assert(op < REOP_LIMIT);
3055     }
3056
3057 bad:
3058     TRACE("\n");
3059     return NULL;
3060
3061 good:
3062     TRACE("\n");
3063     return x;
3064 }
3065
3066 static match_state_t *MatchRegExp(REGlobalData *gData, match_state_t *x)
3067 {
3068     match_state_t *result;
3069     const WCHAR *cp = x->cp;
3070     const WCHAR *cp2;
3071     UINT j;
3072
3073     /*
3074      * Have to include the position beyond the last character
3075      * in order to detect end-of-input/line condition.
3076      */
3077     for (cp2 = cp; cp2 <= gData->cpend; cp2++) {
3078         gData->skipped = cp2 - cp;
3079         x->cp = cp2;
3080         for (j = 0; j < gData->regexp->parenCount; j++)
3081             x->parens[j].index = -1;
3082         result = ExecuteREBytecode(gData, x);
3083         if (!gData->ok || result || (gData->regexp->flags & REG_STICKY))
3084             return result;
3085         gData->backTrackSP = gData->backTrackStack;
3086         gData->cursz = 0;
3087         gData->stateStackTop = 0;
3088         cp2 = cp + gData->skipped;
3089     }
3090     return NULL;
3091 }
3092
3093 static HRESULT InitMatch(regexp_t *re, void *cx, heap_pool_t *pool, REGlobalData *gData)
3094 {
3095     UINT i;
3096
3097     gData->backTrackStackSize = INITIAL_BACKTRACK;
3098     gData->backTrackStack = heap_pool_alloc(gData->pool, INITIAL_BACKTRACK);
3099     if (!gData->backTrackStack)
3100         goto bad;
3101
3102     gData->backTrackSP = gData->backTrackStack;
3103     gData->cursz = 0;
3104     gData->backTrackCount = 0;
3105     gData->backTrackLimit = 0;
3106
3107     gData->stateStackLimit = INITIAL_STATESTACK;
3108     gData->stateStack = heap_pool_alloc(gData->pool, sizeof(REProgState) * INITIAL_STATESTACK);
3109     if (!gData->stateStack)
3110         goto bad;
3111
3112     gData->stateStackTop = 0;
3113     gData->cx = cx;
3114     gData->pool = pool;
3115     gData->regexp = re;
3116     gData->ok = TRUE;
3117
3118     for (i = 0; i < re->classCount; i++) {
3119         if (!re->classList[i].converted &&
3120                 !ProcessCharSet(gData, &re->classList[i])) {
3121             return E_FAIL;
3122         }
3123     }
3124
3125     return S_OK;
3126
3127 bad:
3128     js_ReportOutOfScriptQuota(cx);
3129     gData->ok = FALSE;
3130     return E_OUTOFMEMORY;
3131 }
3132
3133 HRESULT regexp_execute(regexp_t *regexp, void *cx, heap_pool_t *pool,
3134         const WCHAR *str, DWORD str_len, match_state_t *result)
3135 {
3136     match_state_t *res;
3137     REGlobalData gData;
3138     heap_pool_t *mark = heap_pool_mark(pool);
3139     const WCHAR *str_beg = result->cp;
3140     HRESULT hres;
3141
3142     assert(result->cp != NULL);
3143
3144     gData.cpbegin = str;
3145     gData.cpend = str+str_len;
3146     gData.start = result->cp-str;
3147     gData.skipped = 0;
3148     gData.pool = pool;
3149
3150     hres = InitMatch(regexp, cx, pool, &gData);
3151     if(FAILED(hres)) {
3152         WARN("InitMatch failed\n");
3153         heap_pool_clear(mark);
3154         return hres;
3155     }
3156
3157     res = MatchRegExp(&gData, result);
3158     heap_pool_clear(mark);
3159     if(!gData.ok) {
3160         WARN("MatchRegExp failed\n");
3161         return E_FAIL;
3162     }
3163
3164     if(!res) {
3165         result->match_len = 0;
3166         return S_FALSE;
3167     }
3168
3169     result->match_len = (result->cp-str_beg) - gData.skipped;
3170     result->paren_count = regexp->parenCount;
3171     return S_OK;
3172 }
3173
3174 void regexp_destroy(regexp_t *re)
3175 {
3176     if (re->classList) {
3177         UINT i;
3178         for (i = 0; i < re->classCount; i++) {
3179             if (re->classList[i].converted)
3180                 heap_free(re->classList[i].u.bits);
3181             re->classList[i].u.bits = NULL;
3182         }
3183         heap_free(re->classList);
3184     }
3185     heap_free(re);
3186 }
3187
3188 regexp_t* regexp_new(void *cx, heap_pool_t *pool, const WCHAR *str,
3189         DWORD str_len, WORD flags, BOOL flat)
3190 {
3191     regexp_t *re;
3192     heap_pool_t *mark;
3193     CompilerState state;
3194     size_t resize;
3195     jsbytecode *endPC;
3196     UINT i;
3197     size_t len;
3198
3199     re = NULL;
3200     mark = heap_pool_mark(pool);
3201     len = str_len;
3202
3203     state.context = cx;
3204     state.pool = pool;
3205     state.cp = str;
3206     if (!state.cp)
3207         goto out;
3208     state.cpbegin = state.cp;
3209     state.cpend = state.cp + len;
3210     state.flags = flags;
3211     state.parenCount = 0;
3212     state.classCount = 0;
3213     state.progLength = 0;
3214     state.treeDepth = 0;
3215     state.classBitmapsMem = 0;
3216     for (i = 0; i < CLASS_CACHE_SIZE; i++)
3217         state.classCache[i].start = NULL;
3218
3219     if (len != 0 && flat) {
3220         state.result = NewRENode(&state, REOP_FLAT);
3221         if (!state.result)
3222             goto out;
3223         state.result->u.flat.chr = *state.cpbegin;
3224         state.result->u.flat.length = len;
3225         state.result->kid = (void *) state.cpbegin;
3226         /* Flat bytecode: REOP_FLAT compact(string_offset) compact(len). */
3227         state.progLength += 1 + GetCompactIndexWidth(0)
3228                           + GetCompactIndexWidth(len);
3229     } else {
3230         if (!ParseRegExp(&state))
3231             goto out;
3232     }
3233     resize = offsetof(regexp_t, program) + state.progLength + 1;
3234     re = heap_alloc(resize);
3235     if (!re)
3236         goto out;
3237
3238     assert(state.classBitmapsMem <= CLASS_BITMAPS_MEM_LIMIT);
3239     re->classCount = state.classCount;
3240     if (re->classCount) {
3241         re->classList = heap_alloc(re->classCount * sizeof(RECharSet));
3242         if (!re->classList) {
3243             regexp_destroy(re);
3244             re = NULL;
3245             goto out;
3246         }
3247         for (i = 0; i < re->classCount; i++)
3248             re->classList[i].converted = FALSE;
3249     } else {
3250         re->classList = NULL;
3251     }
3252     endPC = EmitREBytecode(&state, re, state.treeDepth, re->program, state.result);
3253     if (!endPC) {
3254         regexp_destroy(re);
3255         re = NULL;
3256         goto out;
3257     }
3258     *endPC++ = REOP_END;
3259     /*
3260      * Check whether size was overestimated and shrink using realloc.
3261      * This is safe since no pointers to newly parsed regexp or its parts
3262      * besides re exist here.
3263      */
3264     if ((size_t)(endPC - re->program) != state.progLength + 1) {
3265         regexp_t *tmp;
3266         assert((size_t)(endPC - re->program) < state.progLength + 1);
3267         resize = offsetof(regexp_t, program) + (endPC - re->program);
3268         tmp = heap_realloc(re, resize);
3269         if (tmp)
3270             re = tmp;
3271     }
3272
3273     re->flags = flags;
3274     re->parenCount = state.parenCount;
3275     re->source = str;
3276     re->source_len = str_len;
3277
3278 out:
3279     heap_pool_clear(mark);
3280     return re;
3281 }
3282
3283 HRESULT regexp_set_flags(regexp_t **regexp, void *cx, heap_pool_t *pool, WORD flags)
3284 {
3285     if(((*regexp)->flags & REG_FOLD) != (flags & REG_FOLD)) {
3286         regexp_t *new_regexp = regexp_new(cx, pool, (*regexp)->source,
3287                 (*regexp)->source_len, flags, FALSE);
3288
3289         if(!new_regexp)
3290             return E_FAIL;
3291
3292         regexp_destroy(*regexp);
3293         *regexp = new_regexp;
3294     }else {
3295         (*regexp)->flags = flags;
3296     }
3297
3298     return S_OK;
3299 }