/* Path: blob/master/thirdparty/pcre2/src/pcre2_match.c
10278 views */
/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
          New API code Copyright (c) 2015-2024 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE36POSSIBILITY OF SUCH DAMAGE.37-----------------------------------------------------------------------------38*/394041#ifdef HAVE_CONFIG_H42#include "config.h"43#endif4445#include "pcre2_internal.h"4647/* These defines enable debugging code */4849/* #define DEBUG_FRAMES_DISPLAY */50/* #define DEBUG_SHOW_OPS */51/* #define DEBUG_SHOW_RMATCH */5253#ifdef DEBUG_FRAMES_DISPLAY54#include <stdarg.h>55#endif5657#ifdef DEBUG_SHOW_OPS58static const char *OP_names[] = { OP_NAME_LIST };59#endif6061/* These defines identify the name of the block containing "static"62information, and fields within it. */6364#define NLBLOCK mb /* Block containing newline information */65#define PSSTART start_subject /* Field containing processed string start */66#define PSEND end_subject /* Field containing processed string end */6768#define RECURSE_UNSET 0xffffffffu /* Bigger than max group number */6970/* Masks for identifying the public options that are permitted at match time. */7172#define PUBLIC_MATCH_OPTIONS \73(PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \74PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \75PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT|PCRE2_COPY_MATCHED_SUBJECT| \76PCRE2_DISABLE_RECURSELOOP_CHECK)7778#define PUBLIC_JIT_MATCH_OPTIONS \79(PCRE2_NO_UTF_CHECK|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY|\80PCRE2_NOTEMPTY_ATSTART|PCRE2_PARTIAL_SOFT|PCRE2_PARTIAL_HARD|\81PCRE2_COPY_MATCHED_SUBJECT)8283/* Non-error returns from and within the match() function. 
Error returns are84externally defined PCRE2_ERROR_xxx codes, which are all negative. */8586#define MATCH_MATCH 187#define MATCH_NOMATCH 08889/* Special internal returns used in the match() function. Make them90sufficiently negative to avoid the external error codes. */9192#define MATCH_ACCEPT (-999)93#define MATCH_KETRPOS (-998)94/* The next 5 must be kept together and in sequence so that a test that checks95for any one of them can use a range. */96#define MATCH_COMMIT (-997)97#define MATCH_PRUNE (-996)98#define MATCH_SKIP (-995)99#define MATCH_SKIP_ARG (-994)100#define MATCH_THEN (-993)101#define MATCH_BACKTRACK_MAX MATCH_THEN102#define MATCH_BACKTRACK_MIN MATCH_COMMIT103104/* Group frame type values. Zero means the frame is not a group frame. The105lower 16 bits are used for data (e.g. the capture number). Group frames are106used for most groups so that information about the start is easily available at107the end without having to scan back through intermediate frames (backtrack108points). */109110#define GF_CAPTURE 0x00010000u111#define GF_NOCAPTURE 0x00020000u112#define GF_CONDASSERT 0x00030000u113#define GF_RECURSE 0x00040000u114115/* Masks for the identity and data parts of the group frame type. */116117#define GF_IDMASK(a) ((a) & 0xffff0000u)118#define GF_DATAMASK(a) ((a) & 0x0000ffffu)119120/* Repetition types */121122enum { REPTYPE_MIN, REPTYPE_MAX, REPTYPE_POS };123124/* Min and max values for the common repeats; a maximum of UINT32_MAX =>125infinity. */126127static const uint32_t rep_min[] = {1280, 0, /* * and *? */1291, 1, /* + and +? */1300, 0, /* ? and ?? */1310, 0, /* dummy placefillers for OP_CR[MIN]RANGE */1320, 1, 0 }; /* OP_CRPOS{STAR, PLUS, QUERY} */133134static const uint32_t rep_max[] = {135UINT32_MAX, UINT32_MAX, /* * and *? */136UINT32_MAX, UINT32_MAX, /* + and +? */1371, 1, /* ? and ?? 
*/1380, 0, /* dummy placefillers for OP_CR[MIN]RANGE */139UINT32_MAX, UINT32_MAX, 1 }; /* OP_CRPOS{STAR, PLUS, QUERY} */140141/* Repetition types - must include OP_CRPOSRANGE (not needed above) */142143static const uint32_t rep_typ[] = {144REPTYPE_MAX, REPTYPE_MIN, /* * and *? */145REPTYPE_MAX, REPTYPE_MIN, /* + and +? */146REPTYPE_MAX, REPTYPE_MIN, /* ? and ?? */147REPTYPE_MAX, REPTYPE_MIN, /* OP_CRRANGE and OP_CRMINRANGE */148REPTYPE_POS, REPTYPE_POS, /* OP_CRPOSSTAR, OP_CRPOSPLUS */149REPTYPE_POS, REPTYPE_POS }; /* OP_CRPOSQUERY, OP_CRPOSRANGE */150151/* Numbers for RMATCH calls at backtracking points. When these lists are152changed, the code at RETURN_SWITCH below must be updated in sync. */153154enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,155RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,156RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,157RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39 };158159#ifdef SUPPORT_WIDE_CHARS160enum { RM100=100, RM101, RM102, RM103 };161#endif162163#ifdef SUPPORT_UNICODE164enum { RM200=200, RM201, RM202, RM203, RM204, RM205, RM206, RM207,165RM208, RM209, RM210, RM211, RM212, RM213, RM214, RM215,166RM216, RM217, RM218, RM219, RM220, RM221, RM222, RM223,167RM224 };168#endif169170/* Define short names for general fields in the current backtrack frame, which171is always pointed to by the F variable. Occasional references to fields in172other frames are written out explicitly. There are also some fields in the173current frame whose names start with "temp" that are used for short-term,174localised backtracking memory. These are #defined with Lxxx names at the point175of use and undefined afterwards. 
*/176177#define Fback_frame F->back_frame178#define Fcapture_last F->capture_last179#define Fcurrent_recurse F->current_recurse180#define Fecode F->ecode181#define Feptr F->eptr182#define Fgroup_frame_type F->group_frame_type183#define Flast_group_offset F->last_group_offset184#define Flength F->length185#define Fmark F->mark186#define Frdepth F->rdepth187#define Fstart_match F->start_match188#define Foffset_top F->offset_top189#define Foccu F->occu190#define Fop F->op191#define Fovector F->ovector192#define Freturn_id F->return_id193194195#ifdef DEBUG_FRAMES_DISPLAY196/*************************************************197* Display current frames and contents *198*************************************************/199200/* This debugging function displays the current set of frames and their201contents. It is not called automatically from anywhere, the intention being202that calls can be inserted where necessary when debugging frame-related203problems.204205Arguments:206f the file to write to207F the current top frame208P a previous frame of interest209frame_size the frame size210mb points to the match block211match_data points to the match data block212s identification text213214Returns: nothing215*/216217static void218display_frames(FILE *f, heapframe *F, heapframe *P, PCRE2_SIZE frame_size,219match_block *mb, pcre2_match_data *match_data, const char *s, ...)220{221uint32_t i;222heapframe *Q;223va_list ap;224va_start(ap, s);225226fprintf(f, "FRAMES ");227vfprintf(f, s, ap);228va_end(ap);229230if (P != NULL) fprintf(f, " P=%lu",231((char *)P - (char *)(match_data->heapframes))/frame_size);232fprintf(f, "\n");233234for (i = 0, Q = match_data->heapframes;235Q <= F;236i++, Q = (heapframe *)((char *)Q + frame_size))237{238fprintf(f, "Frame %d type=%x subj=%lu code=%d back=%lu id=%d",239i, Q->group_frame_type, Q->eptr - mb->start_subject, *(Q->ecode),240Q->back_frame, Q->return_id);241242if (Q->last_group_offset == PCRE2_UNSET)243fprintf(f, " 
lgoffset=unset\n");244else245fprintf(f, " lgoffset=%lu\n", Q->last_group_offset/frame_size);246}247}248249#endif250251252253/*************************************************254* Process a callout *255*************************************************/256257/* This function is called for all callouts, whether "standalone" or at the258start of a conditional group. Feptr will be pointing to either OP_CALLOUT or259OP_CALLOUT_STR. A callout block is allocated in pcre2_match() and initialized260with fixed values.261262Arguments:263F points to the current backtracking frame264mb points to the match block265lengthptr where to return the length of the callout item266267Returns: the return from the callout268or 0 if no callout function exists269*/270271static int272do_callout(heapframe *F, match_block *mb, PCRE2_SIZE *lengthptr)273{274int rc;275PCRE2_SIZE save0, save1;276PCRE2_SIZE *callout_ovector;277pcre2_callout_block *cb;278279*lengthptr = (*Fecode == OP_CALLOUT)?280PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE);281282if (mb->callout == NULL) return 0; /* No callout function provided */283284/* The original matching code (pre 10.30) worked directly with the ovector285passed by the user, and this was passed to callouts. Now that the working286ovector is in the backtracking frame, it no longer needs to reserve space for287the overall match offsets (which would waste space in the frame). For backward288compatibility, however, we pass capture_top and offset_vector to the callout as289if for the extended ovector, and we ensure that the first two slots are unset290by preserving and restoring their current contents. Picky compilers complain if291references such as Fovector[-2] are use directly, so we set up a separate292pointer. */293294callout_ovector = (PCRE2_SIZE *)(Fovector) - 2;295296/* The cb->version, cb->subject, cb->subject_length, and cb->start_match fields297are set externally. The first 3 never change; the last is updated for each298bumpalong. 
*/299300cb = mb->cb;301cb->capture_top = (uint32_t)Foffset_top/2 + 1;302cb->capture_last = Fcapture_last;303cb->offset_vector = callout_ovector;304cb->mark = mb->nomatch_mark;305cb->current_position = (PCRE2_SIZE)(Feptr - mb->start_subject);306cb->pattern_position = GET(Fecode, 1);307cb->next_item_length = GET(Fecode, 1 + LINK_SIZE);308309if (*Fecode == OP_CALLOUT) /* Numerical callout */310{311cb->callout_number = Fecode[1 + 2*LINK_SIZE];312cb->callout_string_offset = 0;313cb->callout_string = NULL;314cb->callout_string_length = 0;315}316else /* String callout */317{318cb->callout_number = 0;319cb->callout_string_offset = GET(Fecode, 1 + 3*LINK_SIZE);320cb->callout_string = Fecode + (1 + 4*LINK_SIZE) + 1;321cb->callout_string_length =322*lengthptr - (1 + 4*LINK_SIZE) - 2;323}324325save0 = callout_ovector[0];326save1 = callout_ovector[1];327callout_ovector[0] = callout_ovector[1] = PCRE2_UNSET;328rc = mb->callout(cb, mb->callout_data);329callout_ovector[0] = save0;330callout_ovector[1] = save1;331cb->callout_flags = 0;332return rc;333}334335336337/*************************************************338* Match a back-reference *339*************************************************/340341/* This function is called only when it is known that the offset lies within342the offsets that have so far been used in the match. Note that in caseless343UTF-8 mode, the number of subject bytes matched may be different to the number344of reference bytes. 
(In theory this could also happen in UTF-16 mode, but it345seems unlikely.)346347Arguments:348offset index into the offset vector349caseless TRUE if caseless350caseopts bitmask of REFI_FLAG_XYZ values351F the current backtracking frame pointer352mb points to match block353lengthptr pointer for returning the length matched354355Returns: = 0 sucessful match; number of code units matched is set356< 0 no match357> 0 partial match358*/359360static int361match_ref(PCRE2_SIZE offset, BOOL caseless, int caseopts, heapframe *F,362match_block *mb, PCRE2_SIZE *lengthptr)363{364PCRE2_SPTR p;365PCRE2_SIZE length;366PCRE2_SPTR eptr;367PCRE2_SPTR eptr_start;368369/* Deal with an unset group. The default is no match, but there is an option to370match an empty string. */371372if (offset >= Foffset_top || Fovector[offset] == PCRE2_UNSET)373{374if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)375{376*lengthptr = 0;377return 0; /* Match */378}379else return -1; /* No match */380}381382/* Separate the caseless and UTF cases for speed. */383384eptr = eptr_start = Feptr;385p = mb->start_subject + Fovector[offset];386length = Fovector[offset+1] - Fovector[offset];387388if (caseless)389{390#if defined SUPPORT_UNICODE391BOOL utf = (mb->poptions & PCRE2_UTF) != 0;392BOOL caseless_restrict = (caseopts & REFI_FLAG_CASELESS_RESTRICT) != 0;393BOOL turkish_casing = !caseless_restrict && (caseopts & REFI_FLAG_TURKISH_CASING) != 0;394395if (utf || (mb->poptions & PCRE2_UCP) != 0)396{397PCRE2_SPTR endptr = p + length;398399/* Match characters up to the end of the reference. NOTE: the number of400code units matched may differ, because in UTF-8 there are some characters401whose upper and lower case codes have different numbers of bytes. For402example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65 (3403bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a404sequence of two of the latter. 
It is important, therefore, to check the405length along the reference, not along the subject (earlier code did this406wrong). UCP without uses Unicode properties but without UTF encoding. */407408while (p < endptr)409{410uint32_t c, d;411const ucd_record *ur;412if (eptr >= mb->end_subject) return 1; /* Partial match */413414if (utf)415{416GETCHARINC(c, eptr);417GETCHARINC(d, p);418}419else420{421c = *eptr++;422d = *p++;423}424425if (turkish_casing && UCD_ANY_I(d))426{427c = UCD_FOLD_I_TURKISH(c);428d = UCD_FOLD_I_TURKISH(d);429if (c != d) return -1; /* No match */430}431else if (c != d && c != (uint32_t)((int)d + (ur = GET_UCD(d))->other_case))432{433const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset;434435/* When PCRE2_EXTRA_CASELESS_RESTRICT is set, ignore any caseless sets436that start with an ASCII character. */437if (caseless_restrict && *pp < 128) return -1; /* No match */438439for (;;)440{441if (c < *pp) return -1; /* No match */442if (c == *pp++) break;443}444}445}446}447else448#endif449450/* Not in UTF or UCP mode */451{452for (; length > 0; length--)453{454uint32_t cc, cp;455if (eptr >= mb->end_subject) return 1; /* Partial match */456cc = UCHAR21TEST(eptr);457cp = UCHAR21TEST(p);458if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc))459return -1; /* No match */460p++;461eptr++;462}463}464}465466/* In the caseful case, we can just compare the code units, whether or not we467are in UTF and/or UCP mode. When partial matching, we have to do this unit by468unit. 
*/469470else471{472if (mb->partial != 0)473{474for (; length > 0; length--)475{476if (eptr >= mb->end_subject) return 1; /* Partial match */477if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /* No match */478}479}480481/* Not partial matching */482483else484{485if ((PCRE2_SIZE)(mb->end_subject - eptr) < length) return 1; /* Partial */486if (memcmp(p, eptr, CU2BYTES(length)) != 0) return -1; /* No match */487eptr += length;488}489}490491*lengthptr = eptr - eptr_start;492return 0; /* Match */493}494495496497/******************************************************************************498*******************************************************************************499"Recursion" in the match() function500501The original match() function was highly recursive, but this proved to be the502source of a number of problems over the years, mostly because of the relatively503small system stacks that are commonly found. As new features were added to504patterns, various kludges were invented to reduce the amount of stack used,505making the code hard to understand in places.506507A version did exist that used individual frames on the heap instead of calling508match() recursively, but this ran substantially slower. The current version is509a refactoring that uses a vector of frames to remember backtracking points.510This runs no slower, and possibly even a bit faster than the original recursive511implementation.512513At first, an initial vector of size START_FRAMES_SIZE (enough for maybe 50514frames) was allocated on the system stack. If this was not big enough, the heap515was used for a larger vector. 
However, it turns out that there are environments516where taking as little as 20KiB from the system stack is an embarrassment.517After another refactoring, the heap is used exclusively, but a pointer the518frames vector and its size are cached in the match_data block, so that there is519no new memory allocation if the same match_data block is used for multiple520matches (unless the frames vector has to be extended).521*******************************************************************************522******************************************************************************/523524525526527/*************************************************528* Macros for the match() function *529*************************************************/530531/* These macros pack up tests that are used for partial matching several times532in the code. The second one is used when we already know we are past the end of533the subject. We set the "hit end" flag if the pointer is at the end of the534subject and either (a) the pointer is past the earliest inspected character535(i.e. something has been matched, even if not part of the actual matched536string), or (b) the pattern contains a lookbehind. These are the conditions for537which adding more characters may allow the current match to continue.538539For hard partial matching, we immediately return a partial match. Otherwise,540carrying on means that a complete match on the current subject will be sought.541A partial match is returned only if no complete match can be found. */542543#define CHECK_PARTIAL() \544do { \545if (Feptr >= mb->end_subject) \546{ \547SCHECK_PARTIAL(); \548} \549} \550while (0)551552#define SCHECK_PARTIAL() \553do { \554if (mb->partial != 0 && \555(Feptr > mb->start_used_ptr || mb->allowemptypartial)) \556{ \557mb->hitend = TRUE; \558if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \559} \560} \561while (0)562563564/* These macros are used to implement backtracking. 
They simulate a recursive565call to the match() function by means of a local vector of frames which566remember the backtracking points. */567568#define RMATCH(ra,rb) \569do { \570start_ecode = ra; \571Freturn_id = rb; \572goto MATCH_RECURSE; \573L_##rb:; \574} \575while (0)576577#define RRETURN(ra) \578do { \579rrc = ra; \580goto RETURN_SWITCH; \581} \582while (0)583584585586/*************************************************587* Match from current position *588*************************************************/589590/* This function is called to run one match attempt at a single starting point591in the subject.592593Performance note: It might be tempting to extract commonly used fields from the594mb structure (e.g. end_subject) into individual variables to improve595performance. Tests using gcc on a SPARC disproved this; in the first case, it596made performance worse.597598Arguments:599start_eptr starting character in subject600start_ecode starting position in compiled code601top_bracket number of capturing parentheses in the pattern602frame_size size of each backtracking frame603match_data pointer to the match_data block604mb pointer to "static" variables block605606Returns: MATCH_MATCH if matched ) these values are >= 0607MATCH_NOMATCH if failed to match )608negative MATCH_xxx value for PRUNE, SKIP, etc609negative PCRE2_ERROR_xxx value if aborted by an error condition610(e.g. 
stopped by repeated call or depth limit)611*/612613static int614match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket,615PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)616{617/* Frame-handling variables */618619heapframe *F; /* Current frame pointer */620heapframe *N = NULL; /* Temporary frame pointers */621heapframe *P = NULL;622623heapframe *frames_top; /* End of frames vector */624heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */625PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */626627/* Local variables that do not need to be preserved over calls to RRMATCH(). */628629PCRE2_SPTR branch_end = NULL;630PCRE2_SPTR branch_start;631PCRE2_SPTR bracode; /* Temp pointer to start of group */632PCRE2_SIZE offset; /* Used for group offsets */633PCRE2_SIZE length; /* Used for various length calculations */634635int rrc; /* Return from functions & backtracking "recursions" */636#ifdef SUPPORT_UNICODE637int proptype; /* Type of character property */638#endif639640uint32_t i; /* Used for local loops */641uint32_t fc; /* Character values */642uint32_t number; /* Used for group and other numbers */643uint32_t reptype = 0; /* Type of repetition (0 to avoid compiler warning) */644uint32_t group_frame_type; /* Specifies type for new group frames */645646BOOL condition; /* Used in conditional groups */647BOOL cur_is_word; /* Used in "word" tests */648BOOL prev_is_word; /* Used in "word" tests */649650/* UTF and UCP flags */651652#ifdef SUPPORT_UNICODE653BOOL utf = (mb->poptions & PCRE2_UTF) != 0;654BOOL ucp = (mb->poptions & PCRE2_UCP) != 0;655#else656BOOL utf = FALSE; /* Required for convenience even when no Unicode support */657#endif658659/* This is the length of the last part of a backtracking frame that must be660copied when a new frame is created. */661662frame_copy_size = frame_size - offsetof(heapframe, eptr);663664/* Set up the first frame and the end of the frames vector. 
*/665666F = match_data->heapframes;667frames_top = (heapframe *)((char *)F + match_data->heapframes_size);668669Frdepth = 0; /* "Recursion" depth */670Fcapture_last = 0; /* Number of most recent capture */671Fcurrent_recurse = RECURSE_UNSET; /* Not pattern recursing. */672Fstart_match = Feptr = start_eptr; /* Current data pointer and start match */673Fmark = NULL; /* Most recent mark */674Foffset_top = 0; /* End of captures within the frame */675Flast_group_offset = PCRE2_UNSET; /* Saved frame of most recent group */676group_frame_type = 0; /* Not a start of group frame */677goto NEW_FRAME; /* Start processing with this frame */678679/* Come back here when we want to create a new frame for remembering a680backtracking point. */681682MATCH_RECURSE:683684/* Set up a new backtracking frame. If the vector is full, get a new one,685doubling the size, but constrained by the heap limit (which is in KiB). */686687N = (heapframe *)((char *)F + frame_size);688if ((heapframe *)((char *)N + frame_size) >= frames_top)689{690heapframe *new;691PCRE2_SIZE newsize;692PCRE2_SIZE usedsize = (char *)N - (char *)(match_data->heapframes);693694if (match_data->heapframes_size >= PCRE2_SIZE_MAX / 2)695{696if (match_data->heapframes_size == PCRE2_SIZE_MAX - 1)697return PCRE2_ERROR_NOMEMORY;698newsize = PCRE2_SIZE_MAX - 1;699}700else701newsize = match_data->heapframes_size * 2;702703if (newsize / 1024 >= mb->heap_limit)704{705PCRE2_SIZE old_size = match_data->heapframes_size / 1024;706if (mb->heap_limit <= old_size)707return PCRE2_ERROR_HEAPLIMIT;708else709{710PCRE2_SIZE max_delta = 1024 * (mb->heap_limit - old_size);711int over_bytes = match_data->heapframes_size % 1024;712if (over_bytes) max_delta -= (1024 - over_bytes);713newsize = match_data->heapframes_size + max_delta;714}715}716717/* With a heap limit set, the permitted additional size may not be enough for718another frame, so do a final check. 
*/719720if (newsize - usedsize < frame_size) return PCRE2_ERROR_HEAPLIMIT;721new = match_data->memctl.malloc(newsize, match_data->memctl.memory_data);722if (new == NULL) return PCRE2_ERROR_NOMEMORY;723memcpy(new, match_data->heapframes, usedsize);724725N = (heapframe *)((char *)new + usedsize);726F = (heapframe *)((char *)N - frame_size);727728match_data->memctl.free(match_data->heapframes, match_data->memctl.memory_data);729match_data->heapframes = new;730match_data->heapframes_size = newsize;731frames_top = (heapframe *)((char *)new + newsize);732}733734#ifdef DEBUG_SHOW_RMATCH735fprintf(stderr, "++ RMATCH %d frame=%d", Freturn_id, Frdepth + 1);736if (group_frame_type != 0)737{738fprintf(stderr, " type=%x ", group_frame_type);739switch (GF_IDMASK(group_frame_type))740{741case GF_CAPTURE:742fprintf(stderr, "capture=%d", GF_DATAMASK(group_frame_type));743break;744745case GF_NOCAPTURE:746fprintf(stderr, "nocapture op=%d", GF_DATAMASK(group_frame_type));747break;748749case GF_CONDASSERT:750fprintf(stderr, "condassert op=%d", GF_DATAMASK(group_frame_type));751break;752753case GF_RECURSE:754fprintf(stderr, "recurse=%d", GF_DATAMASK(group_frame_type));755break;756757default:758fprintf(stderr, "*** unknown ***");759break;760}761}762fprintf(stderr, "\n");763#endif764765/* Copy those fields that must be copied into the new frame, increase the766"recursion" depth (i.e. the new frame's index) and then make the new frame767current. */768769memcpy((char *)N + offsetof(heapframe, eptr),770(char *)F + offsetof(heapframe, eptr),771frame_copy_size);772773N->rdepth = Frdepth + 1;774F = N;775776/* Carry on processing with a new frame. */777778NEW_FRAME:779Fgroup_frame_type = group_frame_type;780Fecode = start_ecode; /* Starting code pointer */781Fback_frame = frame_size; /* Default is go back one frame */782783/* If this is a special type of group frame, remember its offset for quick784access at the end of the group. If this is a recursion, set a new current785recursion value. 
*/786787if (group_frame_type != 0)788{789Flast_group_offset = (char *)F - (char *)match_data->heapframes;790if (GF_IDMASK(group_frame_type) == GF_RECURSE)791Fcurrent_recurse = GF_DATAMASK(group_frame_type);792group_frame_type = 0;793}794795796/* ========================================================================= */797/* This is the main processing loop. First check that we haven't recorded too798many backtracks (search tree is too large), or that we haven't exceeded the799recursive depth limit (used too many backtracking frames). If not, process the800opcodes. */801802if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;803if (Frdepth >= mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;804805#ifdef DEBUG_SHOW_OPS806fprintf(stderr, "\n++ New frame: type=0x%x subject offset %ld\n",807GF_IDMASK(Fgroup_frame_type), Feptr - mb->start_subject);808#endif809810for (;;)811{812#ifdef DEBUG_SHOW_OPS813fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,814OP_names[*Fecode]);815#endif816817Fop = (uint8_t)(*Fecode); /* Cast needed for 16-bit and 32-bit modes */818switch(Fop)819{820/* ===================================================================== */821/* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes, to close822any currently open capturing brackets. Unlike reaching the end of a group,823where we know the starting frame is at the top of the chained frames, in824this case we have to search back for the relevant frame in case other types825of group that use chained frames have intervened. Multiple OP_CLOSEs always826come innermost first, which matches the chain order. We can ignore this in827a recursion, because captures are not passed out of recursions. */828829case OP_CLOSE:830if (Fcurrent_recurse == RECURSE_UNSET)831{832number = GET2(Fecode, 1);833offset = Flast_group_offset;834for(;;)835{836/* Corrupted heapframes?. 
Trigger an assert and return an error */837PCRE2_ASSERT(offset != PCRE2_UNSET);838if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;839840N = (heapframe *)((char *)match_data->heapframes + offset);841P = (heapframe *)((char *)N - frame_size);842if (N->group_frame_type == (GF_CAPTURE | number)) break;843offset = P->last_group_offset;844}845offset = (number << 1) - 2;846Fcapture_last = number;847Fovector[offset] = P->eptr - mb->start_subject;848Fovector[offset+1] = Feptr - mb->start_subject;849if (offset >= Foffset_top) Foffset_top = offset + 2;850}851Fecode += PRIV(OP_lengths)[*Fecode];852break;853854855/* ===================================================================== */856/* Real or forced end of the pattern, assertion, or recursion. In an857assertion ACCEPT, update the last used pointer and remember the current858frame so that the captures and mark can be fished out of it. */859860case OP_ASSERT_ACCEPT:861if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;862assert_accept_frame = F;863RRETURN(MATCH_ACCEPT);864865/* For ACCEPT within a recursion, we have to find the most recent866recursion. If not in a recursion, fall through to code that is common with867OP_END. */868869case OP_ACCEPT:870if (Fcurrent_recurse != RECURSE_UNSET)871{872#ifdef DEBUG_SHOW_OPS873fprintf(stderr, "++ Accept within recursion\n");874#endif875offset = Flast_group_offset;876for(;;)877{878/* Corrupted heapframes?. Trigger an assert and return an error */879PCRE2_ASSERT(offset != PCRE2_UNSET);880if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;881882N = (heapframe *)((char *)match_data->heapframes + offset);883P = (heapframe *)((char *)N - frame_size);884if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break;885offset = P->last_group_offset;886}887888/* N is now the frame of the recursion; the previous frame is at the889OP_RECURSE position. 
Go back there, copying the current subject position890and mark, and the start_match position (\K might have changed it), and891then move on past the OP_RECURSE. */892893P->eptr = Feptr;894P->mark = Fmark;895P->start_match = Fstart_match;896F = P;897Fecode += 1 + LINK_SIZE;898continue;899}900/* Fall through */901902/* OP_END itself can never be reached within a recursion because that is903picked up when the OP_KET that always precedes OP_END is reached. */904905case OP_END:906907/* Fail for an empty string match if either PCRE2_NOTEMPTY is set, or if908PCRE2_NOTEMPTY_ATSTART is set and we have matched at the start of the909subject. In both cases, backtracking will then try other alternatives, if910any. */911912if (Feptr == Fstart_match &&913((mb->moptions & PCRE2_NOTEMPTY) != 0 ||914((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 &&915Fstart_match == mb->start_subject + mb->start_offset)))916{917#ifdef DEBUG_SHOW_OPS918fprintf(stderr, "++ Backtrack because empty string\n");919#endif920RRETURN(MATCH_NOMATCH);921}922923/* Fail if PCRE2_ENDANCHORED is set and the end of the match is not924the end of the subject. After (*ACCEPT) we fail the entire match (at this925position) but backtrack if we've reached the end of the pattern. This926applies whether or not we are in a recursion. */927928if (Feptr < mb->end_subject &&929((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0)930{931if (Fop == OP_END)932{933#ifdef DEBUG_SHOW_OPS934fprintf(stderr, "++ Backtrack because not at end (endanchored set)\n");935#endif936RRETURN(MATCH_NOMATCH);937}938939#ifdef DEBUG_SHOW_OPS940fprintf(stderr, "++ Failed ACCEPT not at end (endanchnored set)\n");941#endif942return MATCH_NOMATCH; /* (*ACCEPT) */943}944945/* We have a successful match of the whole pattern. Record the result and946then do a direct return from the function. 
If there is space in the offset947vector, set any pairs that follow the highest-numbered captured string but948are less than the number of capturing groups in the pattern to PCRE2_UNSET.949It is documented that this happens. "Gaps" are set to PCRE2_UNSET950dynamically. It is only those at the end that need setting here. */951952mb->end_match_ptr = Feptr; /* Record where we ended */953mb->end_offset_top = Foffset_top; /* and how many extracts were taken */954mb->mark = Fmark; /* and the last success mark */955if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;956957match_data->ovector[0] = Fstart_match - mb->start_subject;958match_data->ovector[1] = Feptr - mb->start_subject;959960/* Set i to the smaller of the sizes of the external and frame ovectors. */961962i = 2 * ((top_bracket + 1 > match_data->oveccount)?963match_data->oveccount : top_bracket + 1);964memcpy(match_data->ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE));965while (--i >= Foffset_top + 2) match_data->ovector[i] = PCRE2_UNSET;966return MATCH_MATCH; /* Note: NOT RRETURN */967968969/*===================================================================== */970/* Match any single character type except newline; have to take care with971CRLF newlines and partial matching. */972973case OP_ANY:974if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);975if (mb->partial != 0 &&976Feptr == mb->end_subject - 1 &&977NLBLOCK->nltype == NLTYPE_FIXED &&978NLBLOCK->nllen == 2 &&979UCHAR21TEST(Feptr) == NLBLOCK->nl[0])980{981mb->hitend = TRUE;982if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;983}984/* Fall through */985986/* Match any single character whatsoever. */987988case OP_ALLANY:989if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */990{ /* not be updated before SCHECK_PARTIAL. 
*/991SCHECK_PARTIAL();992RRETURN(MATCH_NOMATCH);993}994Feptr++;995#ifdef SUPPORT_UNICODE996if (utf) ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);997#endif998Fecode++;999break;100010011002/* ===================================================================== */1003/* Match a single code unit, even in UTF mode. This opcode really does1004match any code unit, even newline. (It really should be called ANYCODEUNIT,1005of course - the byte name is from pre-16 bit days.) */10061007case OP_ANYBYTE:1008if (Feptr >= mb->end_subject) /* DO NOT merge the Feptr++ here; it must */1009{ /* not be updated before SCHECK_PARTIAL. */1010SCHECK_PARTIAL();1011RRETURN(MATCH_NOMATCH);1012}1013Feptr++;1014Fecode++;1015break;101610171018/* ===================================================================== */1019/* Match a single character, casefully */10201021case OP_CHAR:1022#ifdef SUPPORT_UNICODE1023if (utf)1024{1025Flength = 1;1026Fecode++;1027GETCHARLEN(fc, Fecode, Flength);1028if (Flength > (PCRE2_SIZE)(mb->end_subject - Feptr))1029{1030CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */1031RRETURN(MATCH_NOMATCH);1032}1033for (; Flength > 0; Flength--)1034{1035if (*Fecode++ != UCHAR21INC(Feptr)) RRETURN(MATCH_NOMATCH);1036}1037}1038else1039#endif10401041/* Not UTF mode */1042{1043if (mb->end_subject - Feptr < 1)1044{1045SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */1046RRETURN(MATCH_NOMATCH);1047}1048if (Fecode[1] != *Feptr++) RRETURN(MATCH_NOMATCH);1049Fecode += 2;1050}1051break;105210531054/* ===================================================================== */1055/* Match a single character, caselessly. If we are at the end of the1056subject, give up immediately. We get here only when the pattern character1057has at most one other case. Characters with more than two cases are coded1058as OP_PROP with the pseudo-property PT_CLIST. 
*/10591060case OP_CHARI:1061if (Feptr >= mb->end_subject)1062{1063SCHECK_PARTIAL();1064RRETURN(MATCH_NOMATCH);1065}10661067#ifdef SUPPORT_UNICODE1068if (utf)1069{1070Flength = 1;1071Fecode++;1072GETCHARLEN(fc, Fecode, Flength);10731074/* If the pattern character's value is < 128, we know that its other case1075(if any) is also < 128 (and therefore only one code unit long in all1076code-unit widths), so we can use the fast lookup table. We checked above1077that there is at least one character left in the subject. */10781079if (fc < 128)1080{1081uint32_t cc = UCHAR21(Feptr);1082if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);1083Fecode++;1084Feptr++;1085}10861087/* Otherwise we must pick up the subject character and use Unicode1088property support to test its other case. Note that we cannot use the1089value of "Flength" to check for sufficient bytes left, because the other1090case of the character may have more or fewer code units. */10911092else1093{1094uint32_t dc;1095GETCHARINC(dc, Feptr);1096Fecode += Flength;1097if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);1098}1099}11001101/* If UCP is set without UTF we must do the same as above, but with one1102character per code unit. */11031104else if (ucp)1105{1106uint32_t cc = UCHAR21(Feptr);1107fc = Fecode[1];1108if (fc < 128)1109{1110if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);1111}1112else1113{1114if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);1115}1116Feptr++;1117Fecode += 2;1118}11191120else1121#endif /* SUPPORT_UNICODE */11221123/* Not UTF or UCP mode; use the table for characters < 256. */1124{1125if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])1126!= TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);1127Feptr++;1128Fecode += 2;1129}1130break;113111321133/* ===================================================================== */1134/* Match not a single character. 
*/11351136case OP_NOT:1137case OP_NOTI:1138if (Feptr >= mb->end_subject)1139{1140SCHECK_PARTIAL();1141RRETURN(MATCH_NOMATCH);1142}11431144#ifdef SUPPORT_UNICODE1145if (utf)1146{1147uint32_t ch;1148Fecode++;1149GETCHARINC(ch, Fecode);1150GETCHARINC(fc, Feptr);1151if (ch == fc)1152{1153RRETURN(MATCH_NOMATCH); /* Caseful match */1154}1155else if (Fop == OP_NOTI) /* If caseless */1156{1157if (ch > 127)1158ch = UCD_OTHERCASE(ch);1159else1160ch = (mb->fcc)[ch];1161if (ch == fc) RRETURN(MATCH_NOMATCH);1162}1163}11641165/* UCP without UTF is as above, but with one character per code unit. */11661167else if (ucp)1168{1169uint32_t ch;1170fc = UCHAR21INC(Feptr);1171ch = Fecode[1];1172Fecode += 2;11731174if (ch == fc)1175{1176RRETURN(MATCH_NOMATCH); /* Caseful match */1177}1178else if (Fop == OP_NOTI) /* If caseless */1179{1180if (ch > 127)1181ch = UCD_OTHERCASE(ch);1182else1183ch = (mb->fcc)[ch];1184if (ch == fc) RRETURN(MATCH_NOMATCH);1185}1186}11871188else1189#endif /* SUPPORT_UNICODE */11901191/* Neither UTF nor UCP is set */11921193{1194uint32_t ch = Fecode[1];1195fc = UCHAR21INC(Feptr);1196if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))1197RRETURN(MATCH_NOMATCH);1198Fecode += 2;1199}1200break;120112021203/* ===================================================================== */1204/* Match a single character repeatedly. 
*/12051206#define Loclength F->temp_size1207#define Lstart_eptr F->temp_sptr[0]1208#define Lcharptr F->temp_sptr[1]1209#define Lmin F->temp_32[0]1210#define Lmax F->temp_32[1]1211#define Lc F->temp_32[2]1212#define Loc F->temp_32[3]12131214case OP_EXACT:1215case OP_EXACTI:1216Lmin = Lmax = GET2(Fecode, 1);1217Fecode += 1 + IMM2_SIZE;1218goto REPEATCHAR;12191220case OP_POSUPTO:1221case OP_POSUPTOI:1222reptype = REPTYPE_POS;1223Lmin = 0;1224Lmax = GET2(Fecode, 1);1225Fecode += 1 + IMM2_SIZE;1226goto REPEATCHAR;12271228case OP_UPTO:1229case OP_UPTOI:1230reptype = REPTYPE_MAX;1231Lmin = 0;1232Lmax = GET2(Fecode, 1);1233Fecode += 1 + IMM2_SIZE;1234goto REPEATCHAR;12351236case OP_MINUPTO:1237case OP_MINUPTOI:1238reptype = REPTYPE_MIN;1239Lmin = 0;1240Lmax = GET2(Fecode, 1);1241Fecode += 1 + IMM2_SIZE;1242goto REPEATCHAR;12431244case OP_POSSTAR:1245case OP_POSSTARI:1246reptype = REPTYPE_POS;1247Lmin = 0;1248Lmax = UINT32_MAX;1249Fecode++;1250goto REPEATCHAR;12511252case OP_POSPLUS:1253case OP_POSPLUSI:1254reptype = REPTYPE_POS;1255Lmin = 1;1256Lmax = UINT32_MAX;1257Fecode++;1258goto REPEATCHAR;12591260case OP_POSQUERY:1261case OP_POSQUERYI:1262reptype = REPTYPE_POS;1263Lmin = 0;1264Lmax = 1;1265Fecode++;1266goto REPEATCHAR;12671268case OP_STAR:1269case OP_STARI:1270case OP_MINSTAR:1271case OP_MINSTARI:1272case OP_PLUS:1273case OP_PLUSI:1274case OP_MINPLUS:1275case OP_MINPLUSI:1276case OP_QUERY:1277case OP_QUERYI:1278case OP_MINQUERY:1279case OP_MINQUERYI:1280fc = *Fecode++ - ((Fop < OP_STARI)? OP_STAR : OP_STARI);1281Lmin = rep_min[fc];1282Lmax = rep_max[fc];1283reptype = rep_typ[fc];12841285/* Common code for all repeated single-character matches. We first check1286for the minimum number of characters. If the minimum equals the maximum, we1287are done. 
Otherwise, if minimizing, check the rest of the pattern for a1288match; if there isn't one, advance up to the maximum, one character at a1289time.12901291If maximizing, advance up to the maximum number of matching characters,1292until Feptr is past the end of the maximum run. If possessive, we are1293then done (no backing up). Otherwise, match at this position; anything1294other than no match is immediately returned. For nomatch, back up one1295character, unless we are matching \R and the last thing matched was1296\r\n, in which case, back up two code units until we reach the first1297optional character position.12981299The various UTF/non-UTF and caseful/caseless cases are handled separately,1300for speed. */13011302REPEATCHAR:1303#ifdef SUPPORT_UNICODE1304if (utf)1305{1306Flength = 1;1307Lcharptr = Fecode;1308GETCHARLEN(fc, Fecode, Flength);1309Fecode += Flength;13101311/* Handle multi-code-unit character matching, caseful and caseless. */13121313if (Flength > 1)1314{1315uint32_t othercase;13161317if (Fop >= OP_STARI && /* Caseless */1318(othercase = UCD_OTHERCASE(fc)) != fc)1319Loclength = PRIV(ord2utf)(othercase, Foccu);1320else Loclength = 0;13211322for (i = 1; i <= Lmin; i++)1323{1324if (Feptr <= mb->end_subject - Flength &&1325memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;1326else if (Loclength > 0 &&1327Feptr <= mb->end_subject - Loclength &&1328memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)1329Feptr += Loclength;1330else1331{1332CHECK_PARTIAL();1333RRETURN(MATCH_NOMATCH);1334}1335}13361337if (Lmin == Lmax) continue;13381339if (reptype == REPTYPE_MIN)1340{1341for (;;)1342{1343RMATCH(Fecode, RM202);1344if (rrc != MATCH_NOMATCH) RRETURN(rrc);1345if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);1346if (Feptr <= mb->end_subject - Flength &&1347memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0) Feptr += Flength;1348else if (Loclength > 0 &&1349Feptr <= mb->end_subject - Loclength &&1350memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)1351Feptr += 
Loclength;1352else1353{1354CHECK_PARTIAL();1355RRETURN(MATCH_NOMATCH);1356}1357}1358PCRE2_UNREACHABLE(); /* Control never reaches here */1359}13601361else /* Maximize */1362{1363Lstart_eptr = Feptr;1364for (i = Lmin; i < Lmax; i++)1365{1366if (Feptr <= mb->end_subject - Flength &&1367memcmp(Feptr, Lcharptr, CU2BYTES(Flength)) == 0)1368Feptr += Flength;1369else if (Loclength > 0 &&1370Feptr <= mb->end_subject - Loclength &&1371memcmp(Feptr, Foccu, CU2BYTES(Loclength)) == 0)1372Feptr += Loclength;1373else1374{1375CHECK_PARTIAL();1376break;1377}1378}13791380/* After \C in UTF mode, Lstart_eptr might be in the middle of a1381Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't1382go too far. */13831384if (reptype != REPTYPE_POS) for(;;)1385{1386if (Feptr <= Lstart_eptr) break;1387RMATCH(Fecode, RM203);1388if (rrc != MATCH_NOMATCH) RRETURN(rrc);1389Feptr--;1390BACKCHAR(Feptr);1391}1392}1393break; /* End of repeated wide character handling */1394}13951396/* Length of UTF character is 1. Put it into the preserved variable and1397fall through to the non-UTF code. */13981399Lc = fc;1400}1401else1402#endif /* SUPPORT_UNICODE */14031404/* When not in UTF mode, load a single-code-unit character. Then proceed as1405above, using Unicode casing if either UTF or UCP is set. */14061407Lc = *Fecode++;14081409/* Caseless comparison */14101411if (Fop >= OP_STARI)1412{1413#if PCRE2_CODE_UNIT_WIDTH == 81414#ifdef SUPPORT_UNICODE1415if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);1416else1417#endif /* SUPPORT_UNICODE */1418/* Lc will be < 128 in UTF-8 mode. 
*/1419Loc = mb->fcc[Lc];1420#else /* 16-bit & 32-bit */1421#ifdef SUPPORT_UNICODE1422if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc);1423else1424#endif /* SUPPORT_UNICODE */1425Loc = TABLE_GET(Lc, mb->fcc, Lc);1426#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */14271428for (i = 1; i <= Lmin; i++)1429{1430uint32_t cc; /* Faster than PCRE2_UCHAR */1431if (Feptr >= mb->end_subject)1432{1433SCHECK_PARTIAL();1434RRETURN(MATCH_NOMATCH);1435}1436cc = UCHAR21TEST(Feptr);1437if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);1438Feptr++;1439}1440if (Lmin == Lmax) continue;14411442if (reptype == REPTYPE_MIN)1443{1444for (;;)1445{1446uint32_t cc; /* Faster than PCRE2_UCHAR */1447RMATCH(Fecode, RM25);1448if (rrc != MATCH_NOMATCH) RRETURN(rrc);1449if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);1450if (Feptr >= mb->end_subject)1451{1452SCHECK_PARTIAL();1453RRETURN(MATCH_NOMATCH);1454}1455cc = UCHAR21TEST(Feptr);1456if (Lc != cc && Loc != cc) RRETURN(MATCH_NOMATCH);1457Feptr++;1458}1459PCRE2_UNREACHABLE(); /* Control never reaches here */1460}14611462else /* Maximize */1463{1464Lstart_eptr = Feptr;1465for (i = Lmin; i < Lmax; i++)1466{1467uint32_t cc; /* Faster than PCRE2_UCHAR */1468if (Feptr >= mb->end_subject)1469{1470SCHECK_PARTIAL();1471break;1472}1473cc = UCHAR21TEST(Feptr);1474if (Lc != cc && Loc != cc) break;1475Feptr++;1476}1477if (reptype != REPTYPE_POS) for (;;)1478{1479if (Feptr == Lstart_eptr) break;1480RMATCH(Fecode, RM26);1481Feptr--;1482if (rrc != MATCH_NOMATCH) RRETURN(rrc);1483}1484}1485}14861487/* Caseful comparisons (includes all multi-byte characters) */14881489else1490{1491for (i = 1; i <= Lmin; i++)1492{1493if (Feptr >= mb->end_subject)1494{1495SCHECK_PARTIAL();1496RRETURN(MATCH_NOMATCH);1497}1498if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);1499}15001501if (Lmin == Lmax) continue;15021503if (reptype == REPTYPE_MIN)1504{1505for (;;)1506{1507RMATCH(Fecode, RM27);1508if (rrc != MATCH_NOMATCH) RRETURN(rrc);1509if (Lmin++ >= Lmax) 
RRETURN(MATCH_NOMATCH);1510if (Feptr >= mb->end_subject)1511{1512SCHECK_PARTIAL();1513RRETURN(MATCH_NOMATCH);1514}1515if (Lc != UCHAR21INCTEST(Feptr)) RRETURN(MATCH_NOMATCH);1516}1517PCRE2_UNREACHABLE(); /* Control never reaches here */1518}1519else /* Maximize */1520{1521Lstart_eptr = Feptr;1522for (i = Lmin; i < Lmax; i++)1523{1524if (Feptr >= mb->end_subject)1525{1526SCHECK_PARTIAL();1527break;1528}15291530if (Lc != UCHAR21TEST(Feptr)) break;1531Feptr++;1532}15331534if (reptype != REPTYPE_POS) for (;;)1535{1536if (Feptr <= Lstart_eptr) break;1537RMATCH(Fecode, RM28);1538Feptr--;1539if (rrc != MATCH_NOMATCH) RRETURN(rrc);1540}1541}1542}1543break;15441545#undef Loclength1546#undef Lstart_eptr1547#undef Lcharptr1548#undef Lmin1549#undef Lmax1550#undef Lc1551#undef Loc155215531554/* ===================================================================== */1555/* Match a negated single one-byte character repeatedly. This is almost a1556repeat of the code for a repeated single character, but I haven't found a1557nice way of commoning these up that doesn't require a test of the1558positive/negative option for each character match. Maybe that wouldn't add1559very much to the time taken, but character matching *is* what this is all1560about... 
*/15611562#define Lstart_eptr F->temp_sptr[0]1563#define Lmin F->temp_32[0]1564#define Lmax F->temp_32[1]1565#define Lc F->temp_32[2]1566#define Loc F->temp_32[3]15671568case OP_NOTEXACT:1569case OP_NOTEXACTI:1570Lmin = Lmax = GET2(Fecode, 1);1571Fecode += 1 + IMM2_SIZE;1572goto REPEATNOTCHAR;15731574case OP_NOTUPTO:1575case OP_NOTUPTOI:1576Lmin = 0;1577Lmax = GET2(Fecode, 1);1578reptype = REPTYPE_MAX;1579Fecode += 1 + IMM2_SIZE;1580goto REPEATNOTCHAR;15811582case OP_NOTMINUPTO:1583case OP_NOTMINUPTOI:1584Lmin = 0;1585Lmax = GET2(Fecode, 1);1586reptype = REPTYPE_MIN;1587Fecode += 1 + IMM2_SIZE;1588goto REPEATNOTCHAR;15891590case OP_NOTPOSSTAR:1591case OP_NOTPOSSTARI:1592reptype = REPTYPE_POS;1593Lmin = 0;1594Lmax = UINT32_MAX;1595Fecode++;1596goto REPEATNOTCHAR;15971598case OP_NOTPOSPLUS:1599case OP_NOTPOSPLUSI:1600reptype = REPTYPE_POS;1601Lmin = 1;1602Lmax = UINT32_MAX;1603Fecode++;1604goto REPEATNOTCHAR;16051606case OP_NOTPOSQUERY:1607case OP_NOTPOSQUERYI:1608reptype = REPTYPE_POS;1609Lmin = 0;1610Lmax = 1;1611Fecode++;1612goto REPEATNOTCHAR;16131614case OP_NOTPOSUPTO:1615case OP_NOTPOSUPTOI:1616reptype = REPTYPE_POS;1617Lmin = 0;1618Lmax = GET2(Fecode, 1);1619Fecode += 1 + IMM2_SIZE;1620goto REPEATNOTCHAR;16211622case OP_NOTSTAR:1623case OP_NOTSTARI:1624case OP_NOTMINSTAR:1625case OP_NOTMINSTARI:1626case OP_NOTPLUS:1627case OP_NOTPLUSI:1628case OP_NOTMINPLUS:1629case OP_NOTMINPLUSI:1630case OP_NOTQUERY:1631case OP_NOTQUERYI:1632case OP_NOTMINQUERY:1633case OP_NOTMINQUERYI:1634fc = *Fecode++ - ((Fop >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);1635Lmin = rep_min[fc];1636Lmax = rep_max[fc];1637reptype = rep_typ[fc];16381639/* Common code for all repeated single-character non-matches. */16401641REPEATNOTCHAR:1642GETCHARINCTEST(Lc, Fecode);16431644/* The code is duplicated for the caseless and caseful cases, for speed,1645since matching characters is likely to be quite common. First, ensure the1646minimum number of matches are present. 
If Lmin = Lmax, we are done.1647Otherwise, if minimizing, keep trying the rest of the expression and1648advancing one matching character if failing, up to the maximum.1649Alternatively, if maximizing, find the maximum number of characters and1650work backwards. */16511652if (Fop >= OP_NOTSTARI) /* Caseless */1653{1654#ifdef SUPPORT_UNICODE1655if ((utf || ucp) && Lc > 127)1656Loc = UCD_OTHERCASE(Lc);1657else1658#endif /* SUPPORT_UNICODE */16591660Loc = TABLE_GET(Lc, mb->fcc, Lc); /* Other case from table */16611662#ifdef SUPPORT_UNICODE1663if (utf)1664{1665uint32_t d;1666for (i = 1; i <= Lmin; i++)1667{1668if (Feptr >= mb->end_subject)1669{1670SCHECK_PARTIAL();1671RRETURN(MATCH_NOMATCH);1672}1673GETCHARINC(d, Feptr);1674if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);1675}1676}1677else1678#endif /* SUPPORT_UNICODE */16791680/* Not UTF mode */1681{1682for (i = 1; i <= Lmin; i++)1683{1684if (Feptr >= mb->end_subject)1685{1686SCHECK_PARTIAL();1687RRETURN(MATCH_NOMATCH);1688}1689if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);1690Feptr++;1691}1692}16931694if (Lmin == Lmax) continue; /* Finished for exact count */16951696if (reptype == REPTYPE_MIN)1697{1698#ifdef SUPPORT_UNICODE1699if (utf)1700{1701uint32_t d;1702for (;;)1703{1704RMATCH(Fecode, RM204);1705if (rrc != MATCH_NOMATCH) RRETURN(rrc);1706if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);1707if (Feptr >= mb->end_subject)1708{1709SCHECK_PARTIAL();1710RRETURN(MATCH_NOMATCH);1711}1712GETCHARINC(d, Feptr);1713if (Lc == d || Loc == d) RRETURN(MATCH_NOMATCH);1714}1715}1716else1717#endif /*SUPPORT_UNICODE */17181719/* Not UTF mode */1720{1721for (;;)1722{1723RMATCH(Fecode, RM29);1724if (rrc != MATCH_NOMATCH) RRETURN(rrc);1725if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);1726if (Feptr >= mb->end_subject)1727{1728SCHECK_PARTIAL();1729RRETURN(MATCH_NOMATCH);1730}1731if (Lc == *Feptr || Loc == *Feptr) RRETURN(MATCH_NOMATCH);1732Feptr++;1733}1734}1735PCRE2_UNREACHABLE(); /* Control never reaches here */1736}17371738/* 
Maximize case */17391740else1741{1742Lstart_eptr = Feptr;17431744#ifdef SUPPORT_UNICODE1745if (utf)1746{1747uint32_t d;1748for (i = Lmin; i < Lmax; i++)1749{1750int len = 1;1751if (Feptr >= mb->end_subject)1752{1753SCHECK_PARTIAL();1754break;1755}1756GETCHARLEN(d, Feptr, len);1757if (Lc == d || Loc == d) break;1758Feptr += len;1759}17601761/* After \C in UTF mode, Lstart_eptr might be in the middle of a1762Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't1763go too far. */17641765if (reptype != REPTYPE_POS) for(;;)1766{1767if (Feptr <= Lstart_eptr) break;1768RMATCH(Fecode, RM205);1769if (rrc != MATCH_NOMATCH) RRETURN(rrc);1770Feptr--;1771BACKCHAR(Feptr);1772}1773}1774else1775#endif /* SUPPORT_UNICODE */17761777/* Not UTF mode */1778{1779for (i = Lmin; i < Lmax; i++)1780{1781if (Feptr >= mb->end_subject)1782{1783SCHECK_PARTIAL();1784break;1785}1786if (Lc == *Feptr || Loc == *Feptr) break;1787Feptr++;1788}1789if (reptype != REPTYPE_POS) for (;;)1790{1791if (Feptr == Lstart_eptr) break;1792RMATCH(Fecode, RM30);1793if (rrc != MATCH_NOMATCH) RRETURN(rrc);1794Feptr--;1795}1796}1797}1798}17991800/* Caseful comparisons */18011802else1803{1804#ifdef SUPPORT_UNICODE1805if (utf)1806{1807uint32_t d;1808for (i = 1; i <= Lmin; i++)1809{1810if (Feptr >= mb->end_subject)1811{1812SCHECK_PARTIAL();1813RRETURN(MATCH_NOMATCH);1814}1815GETCHARINC(d, Feptr);1816if (Lc == d) RRETURN(MATCH_NOMATCH);1817}1818}1819else1820#endif1821/* Not UTF mode */1822{1823for (i = 1; i <= Lmin; i++)1824{1825if (Feptr >= mb->end_subject)1826{1827SCHECK_PARTIAL();1828RRETURN(MATCH_NOMATCH);1829}1830if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);1831}1832}18331834if (Lmin == Lmax) continue;18351836if (reptype == REPTYPE_MIN)1837{1838#ifdef SUPPORT_UNICODE1839if (utf)1840{1841uint32_t d;1842for (;;)1843{1844RMATCH(Fecode, RM206);1845if (rrc != MATCH_NOMATCH) RRETURN(rrc);1846if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);1847if (Feptr >= 
mb->end_subject)1848{1849SCHECK_PARTIAL();1850RRETURN(MATCH_NOMATCH);1851}1852GETCHARINC(d, Feptr);1853if (Lc == d) RRETURN(MATCH_NOMATCH);1854}1855}1856else1857#endif1858/* Not UTF mode */1859{1860for (;;)1861{1862RMATCH(Fecode, RM31);1863if (rrc != MATCH_NOMATCH) RRETURN(rrc);1864if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);1865if (Feptr >= mb->end_subject)1866{1867SCHECK_PARTIAL();1868RRETURN(MATCH_NOMATCH);1869}1870if (Lc == *Feptr++) RRETURN(MATCH_NOMATCH);1871}1872}1873PCRE2_UNREACHABLE(); /* Control never reaches here */1874}18751876/* Maximize case */18771878else1879{1880Lstart_eptr = Feptr;18811882#ifdef SUPPORT_UNICODE1883if (utf)1884{1885uint32_t d;1886for (i = Lmin; i < Lmax; i++)1887{1888int len = 1;1889if (Feptr >= mb->end_subject)1890{1891SCHECK_PARTIAL();1892break;1893}1894GETCHARLEN(d, Feptr, len);1895if (Lc == d) break;1896Feptr += len;1897}18981899/* After \C in UTF mode, Lstart_eptr might be in the middle of a1900Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't1901go too far. */19021903if (reptype != REPTYPE_POS) for(;;)1904{1905if (Feptr <= Lstart_eptr) break;1906RMATCH(Fecode, RM207);1907if (rrc != MATCH_NOMATCH) RRETURN(rrc);1908Feptr--;1909BACKCHAR(Feptr);1910}1911}1912else1913#endif1914/* Not UTF mode */1915{1916for (i = Lmin; i < Lmax; i++)1917{1918if (Feptr >= mb->end_subject)1919{1920SCHECK_PARTIAL();1921break;1922}1923if (Lc == *Feptr) break;1924Feptr++;1925}1926if (reptype != REPTYPE_POS) for (;;)1927{1928if (Feptr == Lstart_eptr) break;1929RMATCH(Fecode, RM32);1930if (rrc != MATCH_NOMATCH) RRETURN(rrc);1931Feptr--;1932}1933}1934}1935}1936break;19371938#undef Lstart_eptr1939#undef Lmin1940#undef Lmax1941#undef Lc1942#undef Loc194319441945/* ===================================================================== */1946/* Match a bit-mapped character class, possibly repeatedly. 
These opcodes1947are used when all the characters in the class have values in the range19480-255, and either the matching is caseful, or the characters are in the1949range 0-127 when UTF processing is enabled. The only difference between1950OP_CLASS and OP_NCLASS occurs when a data character outside the range is1951encountered. */19521953#define Lmin F->temp_32[0]1954#define Lmax F->temp_32[1]1955#define Lstart_eptr F->temp_sptr[0]1956#define Lbyte_map_address F->temp_sptr[1]1957#define Lbyte_map ((const unsigned char *)Lbyte_map_address)19581959case OP_NCLASS:1960case OP_CLASS:1961{1962Lbyte_map_address = Fecode + 1; /* Save for matching */1963Fecode += 1 + (32 / sizeof(PCRE2_UCHAR)); /* Advance past the item */19641965/* Look past the end of the item to see if there is repeat information1966following. Then obey similar code to character type repeats. */19671968switch (*Fecode)1969{1970case OP_CRSTAR:1971case OP_CRMINSTAR:1972case OP_CRPLUS:1973case OP_CRMINPLUS:1974case OP_CRQUERY:1975case OP_CRMINQUERY:1976case OP_CRPOSSTAR:1977case OP_CRPOSPLUS:1978case OP_CRPOSQUERY:1979fc = *Fecode++ - OP_CRSTAR;1980Lmin = rep_min[fc];1981Lmax = rep_max[fc];1982reptype = rep_typ[fc];1983break;19841985case OP_CRRANGE:1986case OP_CRMINRANGE:1987case OP_CRPOSRANGE:1988Lmin = GET2(Fecode, 1);1989Lmax = GET2(Fecode, 1 + IMM2_SIZE);1990if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */1991reptype = rep_typ[*Fecode - OP_CRSTAR];1992Fecode += 1 + 2 * IMM2_SIZE;1993break;19941995default: /* No repeat follows */1996Lmin = Lmax = 1;1997break;1998}19992000/* First, ensure the minimum number of matches are present. 
*/20012002#ifdef SUPPORT_UNICODE2003if (utf)2004{2005for (i = 1; i <= Lmin; i++)2006{2007if (Feptr >= mb->end_subject)2008{2009SCHECK_PARTIAL();2010RRETURN(MATCH_NOMATCH);2011}2012GETCHARINC(fc, Feptr);2013if (fc > 255)2014{2015if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);2016}2017else2018if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);2019}2020}2021else2022#endif2023/* Not UTF mode */2024{2025for (i = 1; i <= Lmin; i++)2026{2027if (Feptr >= mb->end_subject)2028{2029SCHECK_PARTIAL();2030RRETURN(MATCH_NOMATCH);2031}2032fc = *Feptr++;2033#if PCRE2_CODE_UNIT_WIDTH != 82034if (fc > 255)2035{2036if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);2037}2038else2039#endif2040if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);2041}2042}20432044/* If Lmax == Lmin we are done. Continue with main loop. */20452046if (Lmin == Lmax) continue;20472048/* If minimizing, keep testing the rest of the expression and advancing2049the pointer while it matches the class. */20502051if (reptype == REPTYPE_MIN)2052{2053#ifdef SUPPORT_UNICODE2054if (utf)2055{2056for (;;)2057{2058RMATCH(Fecode, RM200);2059if (rrc != MATCH_NOMATCH) RRETURN(rrc);2060if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);2061if (Feptr >= mb->end_subject)2062{2063SCHECK_PARTIAL();2064RRETURN(MATCH_NOMATCH);2065}2066GETCHARINC(fc, Feptr);2067if (fc > 255)2068{2069if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);2070}2071else2072if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH);2073}2074}2075else2076#endif2077/* Not UTF mode */2078{2079for (;;)2080{2081RMATCH(Fecode, RM23);2082if (rrc != MATCH_NOMATCH) RRETURN(rrc);2083if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);2084if (Feptr >= mb->end_subject)2085{2086SCHECK_PARTIAL();2087RRETURN(MATCH_NOMATCH);2088}2089fc = *Feptr++;2090#if PCRE2_CODE_UNIT_WIDTH != 82091if (fc > 255)2092{2093if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH);2094}2095else2096#endif2097if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) 
RRETURN(MATCH_NOMATCH);2098}2099}2100PCRE2_UNREACHABLE(); /* Control never reaches here */2101}21022103/* If maximizing, find the longest possible run, then work backwards. */21042105else2106{2107Lstart_eptr = Feptr;21082109#ifdef SUPPORT_UNICODE2110if (utf)2111{2112for (i = Lmin; i < Lmax; i++)2113{2114int len = 1;2115if (Feptr >= mb->end_subject)2116{2117SCHECK_PARTIAL();2118break;2119}2120GETCHARLEN(fc, Feptr, len);2121if (fc > 255)2122{2123if (Fop == OP_CLASS) break;2124}2125else2126if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break;2127Feptr += len;2128}21292130if (reptype == REPTYPE_POS) continue; /* No backtracking */21312132/* After \C in UTF mode, Lstart_eptr might be in the middle of a2133Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't2134go too far. */21352136for (;;)2137{2138RMATCH(Fecode, RM201);2139if (rrc != MATCH_NOMATCH) RRETURN(rrc);2140if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */2141BACKCHAR(Feptr);2142}2143}2144else2145#endif2146/* Not UTF mode */2147{2148for (i = Lmin; i < Lmax; i++)2149{2150if (Feptr >= mb->end_subject)2151{2152SCHECK_PARTIAL();2153break;2154}2155fc = *Feptr;2156#if PCRE2_CODE_UNIT_WIDTH != 82157if (fc > 255)2158{2159if (Fop == OP_CLASS) break;2160}2161else2162#endif2163if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break;2164Feptr++;2165}21662167if (reptype == REPTYPE_POS) continue; /* No backtracking */21682169while (Feptr >= Lstart_eptr)2170{2171RMATCH(Fecode, RM24);2172if (rrc != MATCH_NOMATCH) RRETURN(rrc);2173Feptr--;2174}2175}21762177RRETURN(MATCH_NOMATCH);2178}2179}21802181PCRE2_UNREACHABLE(); /* Control never reaches here */21822183#undef Lbyte_map_address2184#undef Lbyte_map2185#undef Lstart_eptr2186#undef Lmin2187#undef Lmax218821892190/* ===================================================================== */2191/* Match an extended character class. In the 8-bit library, this opcode is2192encountered only when UTF-8 mode mode is supported. 
In the 16-bit and219332-bit libraries, codepoints greater than 255 may be encountered even when2194UTF is not supported. */21952196#define Lstart_eptr F->temp_sptr[0]2197#define Lxclass_data F->temp_sptr[1]2198#define Lmin F->temp_32[0]2199#define Lmax F->temp_32[1]22002201#ifdef SUPPORT_WIDE_CHARS2202case OP_XCLASS:2203{2204Lxclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */2205Fecode += GET(Fecode, 1); /* Advance past the item */22062207switch (*Fecode)2208{2209case OP_CRSTAR:2210case OP_CRMINSTAR:2211case OP_CRPLUS:2212case OP_CRMINPLUS:2213case OP_CRQUERY:2214case OP_CRMINQUERY:2215case OP_CRPOSSTAR:2216case OP_CRPOSPLUS:2217case OP_CRPOSQUERY:2218fc = *Fecode++ - OP_CRSTAR;2219Lmin = rep_min[fc];2220Lmax = rep_max[fc];2221reptype = rep_typ[fc];2222break;22232224case OP_CRRANGE:2225case OP_CRMINRANGE:2226case OP_CRPOSRANGE:2227Lmin = GET2(Fecode, 1);2228Lmax = GET2(Fecode, 1 + IMM2_SIZE);2229if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */2230reptype = rep_typ[*Fecode - OP_CRSTAR];2231Fecode += 1 + 2 * IMM2_SIZE;2232break;22332234default: /* No repeat follows */2235Lmin = Lmax = 1;2236break;2237}22382239/* First, ensure the minimum number of matches are present. */22402241for (i = 1; i <= Lmin; i++)2242{2243if (Feptr >= mb->end_subject)2244{2245SCHECK_PARTIAL();2246RRETURN(MATCH_NOMATCH);2247}2248GETCHARINCTEST(fc, Feptr);2249if (!PRIV(xclass)(fc, Lxclass_data,2250(const uint8_t*)mb->start_code, utf))2251RRETURN(MATCH_NOMATCH);2252}22532254/* If Lmax == Lmin we can just continue with the main loop. */22552256if (Lmin == Lmax) continue;22572258/* If minimizing, keep testing the rest of the expression and advancing2259the pointer while it matches the class. 
*/22602261if (reptype == REPTYPE_MIN)2262{2263for (;;)2264{2265RMATCH(Fecode, RM100);2266if (rrc != MATCH_NOMATCH) RRETURN(rrc);2267if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);2268if (Feptr >= mb->end_subject)2269{2270SCHECK_PARTIAL();2271RRETURN(MATCH_NOMATCH);2272}2273GETCHARINCTEST(fc, Feptr);2274if (!PRIV(xclass)(fc, Lxclass_data,2275(const uint8_t*)mb->start_code, utf))2276RRETURN(MATCH_NOMATCH);2277}2278PCRE2_UNREACHABLE(); /* Control never reaches here */2279}22802281/* If maximizing, find the longest possible run, then work backwards. */22822283else2284{2285Lstart_eptr = Feptr;2286for (i = Lmin; i < Lmax; i++)2287{2288int len = 1;2289if (Feptr >= mb->end_subject)2290{2291SCHECK_PARTIAL();2292break;2293}2294#ifdef SUPPORT_UNICODE2295GETCHARLENTEST(fc, Feptr, len);2296#else2297fc = *Feptr;2298#endif2299if (!PRIV(xclass)(fc, Lxclass_data,2300(const uint8_t*)mb->start_code, utf)) break;2301Feptr += len;2302}23032304if (reptype == REPTYPE_POS) continue; /* No backtracking */23052306/* After \C in UTF mode, Lstart_eptr might be in the middle of a2307Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't2308go too far. */23092310for(;;)2311{2312RMATCH(Fecode, RM101);2313if (rrc != MATCH_NOMATCH) RRETURN(rrc);2314if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */2315#ifdef SUPPORT_UNICODE2316if (utf) BACKCHAR(Feptr);2317#endif2318}2319RRETURN(MATCH_NOMATCH);2320}23212322PCRE2_UNREACHABLE(); /* Control never reaches here */2323}2324#endif /* SUPPORT_WIDE_CHARS: end of XCLASS */23252326#undef Lstart_eptr2327#undef Lxclass_data2328#undef Lmin2329#undef Lmax233023312332/* ===================================================================== */2333/* Match a complex, set-based character class. This opcodes are used when2334there is complex nesting or logical operations within the character2335class. 
*/23362337#define Lstart_eptr F->temp_sptr[0]2338#define Leclass_data F->temp_sptr[1]2339#define Leclass_len F->temp_size2340#define Lmin F->temp_32[0]2341#define Lmax F->temp_32[1]23422343#ifdef SUPPORT_WIDE_CHARS2344case OP_ECLASS:2345{2346Leclass_data = Fecode + 1 + LINK_SIZE; /* Save for matching */2347Fecode += GET(Fecode, 1); /* Advance past the item */2348Leclass_len = (PCRE2_SIZE)(Fecode - Leclass_data);23492350switch (*Fecode)2351{2352case OP_CRSTAR:2353case OP_CRMINSTAR:2354case OP_CRPLUS:2355case OP_CRMINPLUS:2356case OP_CRQUERY:2357case OP_CRMINQUERY:2358case OP_CRPOSSTAR:2359case OP_CRPOSPLUS:2360case OP_CRPOSQUERY:2361fc = *Fecode++ - OP_CRSTAR;2362Lmin = rep_min[fc];2363Lmax = rep_max[fc];2364reptype = rep_typ[fc];2365break;23662367case OP_CRRANGE:2368case OP_CRMINRANGE:2369case OP_CRPOSRANGE:2370Lmin = GET2(Fecode, 1);2371Lmax = GET2(Fecode, 1 + IMM2_SIZE);2372if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */2373reptype = rep_typ[*Fecode - OP_CRSTAR];2374Fecode += 1 + 2 * IMM2_SIZE;2375break;23762377default: /* No repeat follows */2378Lmin = Lmax = 1;2379break;2380}23812382/* First, ensure the minimum number of matches are present. */23832384for (i = 1; i <= Lmin; i++)2385{2386if (Feptr >= mb->end_subject)2387{2388SCHECK_PARTIAL();2389RRETURN(MATCH_NOMATCH);2390}2391GETCHARINCTEST(fc, Feptr);2392if (!PRIV(eclass)(fc, Leclass_data, Leclass_data + Leclass_len,2393(const uint8_t*)mb->start_code, utf))2394RRETURN(MATCH_NOMATCH);2395}23962397/* If Lmax == Lmin we can just continue with the main loop. */23982399if (Lmin == Lmax) continue;24002401/* If minimizing, keep testing the rest of the expression and advancing2402the pointer while it matches the class. 
*/24032404if (reptype == REPTYPE_MIN)2405{2406for (;;)2407{2408RMATCH(Fecode, RM102);2409if (rrc != MATCH_NOMATCH) RRETURN(rrc);2410if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);2411if (Feptr >= mb->end_subject)2412{2413SCHECK_PARTIAL();2414RRETURN(MATCH_NOMATCH);2415}2416GETCHARINCTEST(fc, Feptr);2417if (!PRIV(eclass)(fc, Leclass_data, Leclass_data + Leclass_len,2418(const uint8_t*)mb->start_code, utf))2419RRETURN(MATCH_NOMATCH);2420}2421PCRE2_UNREACHABLE(); /* Control never reaches here */2422}24232424/* If maximizing, find the longest possible run, then work backwards. */24252426else2427{2428Lstart_eptr = Feptr;2429for (i = Lmin; i < Lmax; i++)2430{2431int len = 1;2432if (Feptr >= mb->end_subject)2433{2434SCHECK_PARTIAL();2435break;2436}2437#ifdef SUPPORT_UNICODE2438GETCHARLENTEST(fc, Feptr, len);2439#else2440fc = *Feptr;2441#endif2442if (!PRIV(eclass)(fc, Leclass_data, Leclass_data + Leclass_len,2443(const uint8_t*)mb->start_code, utf))2444break;2445Feptr += len;2446}24472448if (reptype == REPTYPE_POS) continue; /* No backtracking */24492450/* After \C in UTF mode, Lstart_eptr might be in the middle of a2451Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't2452go too far. */24532454for(;;)2455{2456RMATCH(Fecode, RM103);2457if (rrc != MATCH_NOMATCH) RRETURN(rrc);2458if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */2459#ifdef SUPPORT_UNICODE2460if (utf) BACKCHAR(Feptr);2461#endif2462}2463RRETURN(MATCH_NOMATCH);2464}24652466PCRE2_UNREACHABLE(); /* Control never reaches here */2467}2468#endif /* SUPPORT_WIDE_CHARS: end of ECLASS */24692470#undef Lstart_eptr2471#undef Leclass_data2472#undef Leclass_len2473#undef Lmin2474#undef Lmax247524762477/* ===================================================================== */2478/* Match various character types when PCRE2_UCP is not set. These opcodes2479are not generated when PCRE2_UCP is set - instead appropriate property2480tests are compiled. 
*/24812482case OP_NOT_DIGIT:2483if (Feptr >= mb->end_subject)2484{2485SCHECK_PARTIAL();2486RRETURN(MATCH_NOMATCH);2487}2488GETCHARINCTEST(fc, Feptr);2489if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)2490RRETURN(MATCH_NOMATCH);2491Fecode++;2492break;24932494case OP_DIGIT:2495if (Feptr >= mb->end_subject)2496{2497SCHECK_PARTIAL();2498RRETURN(MATCH_NOMATCH);2499}2500GETCHARINCTEST(fc, Feptr);2501if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)2502RRETURN(MATCH_NOMATCH);2503Fecode++;2504break;25052506case OP_NOT_WHITESPACE:2507if (Feptr >= mb->end_subject)2508{2509SCHECK_PARTIAL();2510RRETURN(MATCH_NOMATCH);2511}2512GETCHARINCTEST(fc, Feptr);2513if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)2514RRETURN(MATCH_NOMATCH);2515Fecode++;2516break;25172518case OP_WHITESPACE:2519if (Feptr >= mb->end_subject)2520{2521SCHECK_PARTIAL();2522RRETURN(MATCH_NOMATCH);2523}2524GETCHARINCTEST(fc, Feptr);2525if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)2526RRETURN(MATCH_NOMATCH);2527Fecode++;2528break;25292530case OP_NOT_WORDCHAR:2531if (Feptr >= mb->end_subject)2532{2533SCHECK_PARTIAL();2534RRETURN(MATCH_NOMATCH);2535}2536GETCHARINCTEST(fc, Feptr);2537if (CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)2538RRETURN(MATCH_NOMATCH);2539Fecode++;2540break;25412542case OP_WORDCHAR:2543if (Feptr >= mb->end_subject)2544{2545SCHECK_PARTIAL();2546RRETURN(MATCH_NOMATCH);2547}2548GETCHARINCTEST(fc, Feptr);2549if (!CHMAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)2550RRETURN(MATCH_NOMATCH);2551Fecode++;2552break;25532554case OP_ANYNL:2555if (Feptr >= mb->end_subject)2556{2557SCHECK_PARTIAL();2558RRETURN(MATCH_NOMATCH);2559}2560GETCHARINCTEST(fc, Feptr);2561switch(fc)2562{2563default: RRETURN(MATCH_NOMATCH);25642565case CHAR_CR:2566if (Feptr >= mb->end_subject)2567{2568SCHECK_PARTIAL();2569}2570else if (UCHAR21TEST(Feptr) == CHAR_LF) Feptr++;2571break;25722573case CHAR_LF:2574break;25752576case CHAR_VT:2577case CHAR_FF:2578case CHAR_NEL:2579#ifndef 
EBCDIC2580case 0x2028:2581case 0x2029:2582#endif /* Not EBCDIC */2583if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);2584break;2585}2586Fecode++;2587break;25882589case OP_NOT_HSPACE:2590if (Feptr >= mb->end_subject)2591{2592SCHECK_PARTIAL();2593RRETURN(MATCH_NOMATCH);2594}2595GETCHARINCTEST(fc, Feptr);2596switch(fc)2597{2598HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */2599default: break;2600}2601Fecode++;2602break;26032604case OP_HSPACE:2605if (Feptr >= mb->end_subject)2606{2607SCHECK_PARTIAL();2608RRETURN(MATCH_NOMATCH);2609}2610GETCHARINCTEST(fc, Feptr);2611switch(fc)2612{2613HSPACE_CASES: break; /* Byte and multibyte cases */2614default: RRETURN(MATCH_NOMATCH);2615}2616Fecode++;2617break;26182619case OP_NOT_VSPACE:2620if (Feptr >= mb->end_subject)2621{2622SCHECK_PARTIAL();2623RRETURN(MATCH_NOMATCH);2624}2625GETCHARINCTEST(fc, Feptr);2626switch(fc)2627{2628VSPACE_CASES: RRETURN(MATCH_NOMATCH);2629default: break;2630}2631Fecode++;2632break;26332634case OP_VSPACE:2635if (Feptr >= mb->end_subject)2636{2637SCHECK_PARTIAL();2638RRETURN(MATCH_NOMATCH);2639}2640GETCHARINCTEST(fc, Feptr);2641switch(fc)2642{2643VSPACE_CASES: break;2644default: RRETURN(MATCH_NOMATCH);2645}2646Fecode++;2647break;264826492650#ifdef SUPPORT_UNICODE26512652/* ===================================================================== */2653/* Check the next character by Unicode property. We will get here only2654if the support is in the binary; otherwise a compile-time error occurs. 
*/26552656case OP_PROP:2657case OP_NOTPROP:2658if (Feptr >= mb->end_subject)2659{2660SCHECK_PARTIAL();2661RRETURN(MATCH_NOMATCH);2662}2663GETCHARINCTEST(fc, Feptr);2664{2665const uint32_t *cp;2666uint32_t chartype;2667const ucd_record *prop = GET_UCD(fc);2668BOOL notmatch = Fop == OP_NOTPROP;26692670switch(Fecode[1])2671{2672case PT_LAMP:2673chartype = prop->chartype;2674if ((chartype == ucp_Lu ||2675chartype == ucp_Ll ||2676chartype == ucp_Lt) == notmatch)2677RRETURN(MATCH_NOMATCH);2678break;26792680case PT_GC:2681if ((Fecode[2] == PRIV(ucp_gentype)[prop->chartype]) == notmatch)2682RRETURN(MATCH_NOMATCH);2683break;26842685case PT_PC:2686if ((Fecode[2] == prop->chartype) == notmatch)2687RRETURN(MATCH_NOMATCH);2688break;26892690case PT_SC:2691if ((Fecode[2] == prop->script) == notmatch)2692RRETURN(MATCH_NOMATCH);2693break;26942695case PT_SCX:2696{2697BOOL ok = (Fecode[2] == prop->script ||2698MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Fecode[2]) != 0);2699if (ok == notmatch) RRETURN(MATCH_NOMATCH);2700}2701break;27022703/* These are specials */27042705case PT_ALNUM:2706chartype = prop->chartype;2707if ((PRIV(ucp_gentype)[chartype] == ucp_L ||2708PRIV(ucp_gentype)[chartype] == ucp_N) == notmatch)2709RRETURN(MATCH_NOMATCH);2710break;27112712/* Perl space used to exclude VT, but from Perl 5.18 it is included,2713which means that Perl space and POSIX space are now identical. PCRE2714was changed at release 8.34. 
*/27152716case PT_SPACE: /* Perl space */2717case PT_PXSPACE: /* POSIX space */2718switch(fc)2719{2720HSPACE_CASES:2721VSPACE_CASES:2722if (notmatch) RRETURN(MATCH_NOMATCH);2723break;27242725default:2726if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == notmatch)2727RRETURN(MATCH_NOMATCH);2728break;2729}2730break;27312732case PT_WORD:2733chartype = prop->chartype;2734if ((PRIV(ucp_gentype)[chartype] == ucp_L ||2735PRIV(ucp_gentype)[chartype] == ucp_N ||2736chartype == ucp_Mn ||2737chartype == ucp_Pc) == notmatch)2738RRETURN(MATCH_NOMATCH);2739break;27402741case PT_CLIST:2742#if PCRE2_CODE_UNIT_WIDTH == 322743if (fc > MAX_UTF_CODE_POINT)2744{2745if (notmatch) break;;2746RRETURN(MATCH_NOMATCH);2747}2748#endif2749cp = PRIV(ucd_caseless_sets) + Fecode[2];2750for (;;)2751{2752if (fc < *cp)2753{ if (notmatch) break; else { RRETURN(MATCH_NOMATCH); } }2754if (fc == *cp++)2755{ if (notmatch) { RRETURN(MATCH_NOMATCH); } else break; }2756}2757break;27582759case PT_UCNC:2760if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||2761fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||2762fc >= 0xe000) == notmatch)2763RRETURN(MATCH_NOMATCH);2764break;27652766case PT_BIDICL:2767if ((UCD_BIDICLASS_PROP(prop) == Fecode[2]) == notmatch)2768RRETURN(MATCH_NOMATCH);2769break;27702771case PT_BOOL:2772{2773BOOL ok = MAPBIT(PRIV(ucd_boolprop_sets) +2774UCD_BPROPS_PROP(prop), Fecode[2]) != 0;2775if (ok == notmatch) RRETURN(MATCH_NOMATCH);2776}2777break;27782779/* This should never occur */27802781default:2782PCRE2_DEBUG_UNREACHABLE();2783return PCRE2_ERROR_INTERNAL;2784}27852786Fecode += 3;2787}2788break;278927902791/* ===================================================================== */2792/* Match an extended Unicode sequence. We will get here only if the support2793is in the binary; otherwise a compile-time error occurs. 
*/27942795case OP_EXTUNI:2796if (Feptr >= mb->end_subject)2797{2798SCHECK_PARTIAL();2799RRETURN(MATCH_NOMATCH);2800}2801else2802{2803GETCHARINCTEST(fc, Feptr);2804Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject, utf,2805NULL);2806}2807CHECK_PARTIAL();2808Fecode++;2809break;28102811#endif /* SUPPORT_UNICODE */281228132814/* ===================================================================== */2815/* Match a single character type repeatedly. Note that the property type2816does not need to be in a stack frame as it is not used within an RMATCH()2817loop. */28182819#define Lstart_eptr F->temp_sptr[0]2820#define Lmin F->temp_32[0]2821#define Lmax F->temp_32[1]2822#define Lctype F->temp_32[2]2823#define Lpropvalue F->temp_32[3]28242825case OP_TYPEEXACT:2826Lmin = Lmax = GET2(Fecode, 1);2827Fecode += 1 + IMM2_SIZE;2828goto REPEATTYPE;28292830case OP_TYPEUPTO:2831case OP_TYPEMINUPTO:2832Lmin = 0;2833Lmax = GET2(Fecode, 1);2834reptype = (*Fecode == OP_TYPEMINUPTO)? REPTYPE_MIN : REPTYPE_MAX;2835Fecode += 1 + IMM2_SIZE;2836goto REPEATTYPE;28372838case OP_TYPEPOSSTAR:2839reptype = REPTYPE_POS;2840Lmin = 0;2841Lmax = UINT32_MAX;2842Fecode++;2843goto REPEATTYPE;28442845case OP_TYPEPOSPLUS:2846reptype = REPTYPE_POS;2847Lmin = 1;2848Lmax = UINT32_MAX;2849Fecode++;2850goto REPEATTYPE;28512852case OP_TYPEPOSQUERY:2853reptype = REPTYPE_POS;2854Lmin = 0;2855Lmax = 1;2856Fecode++;2857goto REPEATTYPE;28582859case OP_TYPEPOSUPTO:2860reptype = REPTYPE_POS;2861Lmin = 0;2862Lmax = GET2(Fecode, 1);2863Fecode += 1 + IMM2_SIZE;2864goto REPEATTYPE;28652866case OP_TYPESTAR:2867case OP_TYPEMINSTAR:2868case OP_TYPEPLUS:2869case OP_TYPEMINPLUS:2870case OP_TYPEQUERY:2871case OP_TYPEMINQUERY:2872fc = *Fecode++ - OP_TYPESTAR;2873Lmin = rep_min[fc];2874Lmax = rep_max[fc];2875reptype = rep_typ[fc];28762877/* Common code for all repeated character type matches. 
*/28782879REPEATTYPE:2880Lctype = *Fecode++; /* Code for the character type */28812882#ifdef SUPPORT_UNICODE2883if (Lctype == OP_PROP || Lctype == OP_NOTPROP)2884{2885proptype = *Fecode++;2886Lpropvalue = *Fecode++;2887}2888else proptype = -1;2889#endif28902891/* First, ensure the minimum number of matches are present. Use inline2892code for maximizing the speed, and do the type test once at the start2893(i.e. keep it out of the loops). As there are no calls to RMATCH in the2894loops, we can use an ordinary variable for "notmatch". The code for UTF2895mode is separated out for tidiness, except for Unicode property tests. */28962897if (Lmin > 0)2898{2899#ifdef SUPPORT_UNICODE2900if (proptype >= 0) /* Property tests in all modes */2901{2902BOOL notmatch = Lctype == OP_NOTPROP;2903switch(proptype)2904{2905case PT_LAMP:2906for (i = 1; i <= Lmin; i++)2907{2908int chartype;2909if (Feptr >= mb->end_subject)2910{2911SCHECK_PARTIAL();2912RRETURN(MATCH_NOMATCH);2913}2914GETCHARINCTEST(fc, Feptr);2915chartype = UCD_CHARTYPE(fc);2916if ((chartype == ucp_Lu ||2917chartype == ucp_Ll ||2918chartype == ucp_Lt) == notmatch)2919RRETURN(MATCH_NOMATCH);2920}2921break;29222923case PT_GC:2924for (i = 1; i <= Lmin; i++)2925{2926if (Feptr >= mb->end_subject)2927{2928SCHECK_PARTIAL();2929RRETURN(MATCH_NOMATCH);2930}2931GETCHARINCTEST(fc, Feptr);2932if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch)2933RRETURN(MATCH_NOMATCH);2934}2935break;29362937case PT_PC:2938for (i = 1; i <= Lmin; i++)2939{2940if (Feptr >= mb->end_subject)2941{2942SCHECK_PARTIAL();2943RRETURN(MATCH_NOMATCH);2944}2945GETCHARINCTEST(fc, Feptr);2946if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch)2947RRETURN(MATCH_NOMATCH);2948}2949break;29502951case PT_SC:2952for (i = 1; i <= Lmin; i++)2953{2954if (Feptr >= mb->end_subject)2955{2956SCHECK_PARTIAL();2957RRETURN(MATCH_NOMATCH);2958}2959GETCHARINCTEST(fc, Feptr);2960if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch)2961RRETURN(MATCH_NOMATCH);2962}2963break;29642965case 
PT_SCX:2966for (i = 1; i <= Lmin; i++)2967{2968BOOL ok;2969const ucd_record *prop;2970if (Feptr >= mb->end_subject)2971{2972SCHECK_PARTIAL();2973RRETURN(MATCH_NOMATCH);2974}2975GETCHARINCTEST(fc, Feptr);2976prop = GET_UCD(fc);2977ok = (prop->script == Lpropvalue ||2978MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);2979if (ok == notmatch)2980RRETURN(MATCH_NOMATCH);2981}2982break;29832984case PT_ALNUM:2985for (i = 1; i <= Lmin; i++)2986{2987int category;2988if (Feptr >= mb->end_subject)2989{2990SCHECK_PARTIAL();2991RRETURN(MATCH_NOMATCH);2992}2993GETCHARINCTEST(fc, Feptr);2994category = UCD_CATEGORY(fc);2995if ((category == ucp_L || category == ucp_N) == notmatch)2996RRETURN(MATCH_NOMATCH);2997}2998break;29993000/* Perl space used to exclude VT, but from Perl 5.18 it is included,3001which means that Perl space and POSIX space are now identical. PCRE3002was changed at release 8.34. */30033004case PT_SPACE: /* Perl space */3005case PT_PXSPACE: /* POSIX space */3006for (i = 1; i <= Lmin; i++)3007{3008if (Feptr >= mb->end_subject)3009{3010SCHECK_PARTIAL();3011RRETURN(MATCH_NOMATCH);3012}3013GETCHARINCTEST(fc, Feptr);3014switch(fc)3015{3016HSPACE_CASES:3017VSPACE_CASES:3018if (notmatch) RRETURN(MATCH_NOMATCH);3019break;30203021default:3022if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch)3023RRETURN(MATCH_NOMATCH);3024break;3025}3026}3027break;30283029case PT_WORD:3030for (i = 1; i <= Lmin; i++)3031{3032int chartype, category;3033if (Feptr >= mb->end_subject)3034{3035SCHECK_PARTIAL();3036RRETURN(MATCH_NOMATCH);3037}3038GETCHARINCTEST(fc, Feptr);3039chartype = UCD_CHARTYPE(fc);3040category = PRIV(ucp_gentype)[chartype];3041if ((category == ucp_L || category == ucp_N ||3042chartype == ucp_Mn || chartype == ucp_Pc) == notmatch)3043RRETURN(MATCH_NOMATCH);3044}3045break;30463047case PT_CLIST:3048for (i = 1; i <= Lmin; i++)3049{3050const uint32_t *cp;3051if (Feptr >= 
mb->end_subject)3052{3053SCHECK_PARTIAL();3054RRETURN(MATCH_NOMATCH);3055}3056GETCHARINCTEST(fc, Feptr);3057#if PCRE2_CODE_UNIT_WIDTH == 323058if (fc > MAX_UTF_CODE_POINT)3059{3060if (notmatch) continue;3061RRETURN(MATCH_NOMATCH);3062}3063#endif3064cp = PRIV(ucd_caseless_sets) + Lpropvalue;3065for (;;)3066{3067if (fc < *cp)3068{3069if (notmatch) break;3070RRETURN(MATCH_NOMATCH);3071}3072if (fc == *cp++)3073{3074if (notmatch) RRETURN(MATCH_NOMATCH);3075break;3076}3077}3078}3079break;30803081case PT_UCNC:3082for (i = 1; i <= Lmin; i++)3083{3084if (Feptr >= mb->end_subject)3085{3086SCHECK_PARTIAL();3087RRETURN(MATCH_NOMATCH);3088}3089GETCHARINCTEST(fc, Feptr);3090if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||3091fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||3092fc >= 0xe000) == notmatch)3093RRETURN(MATCH_NOMATCH);3094}3095break;30963097case PT_BIDICL:3098for (i = 1; i <= Lmin; i++)3099{3100if (Feptr >= mb->end_subject)3101{3102SCHECK_PARTIAL();3103RRETURN(MATCH_NOMATCH);3104}3105GETCHARINCTEST(fc, Feptr);3106if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch)3107RRETURN(MATCH_NOMATCH);3108}3109break;31103111case PT_BOOL:3112for (i = 1; i <= Lmin; i++)3113{3114BOOL ok;3115const ucd_record *prop;3116if (Feptr >= mb->end_subject)3117{3118SCHECK_PARTIAL();3119RRETURN(MATCH_NOMATCH);3120}3121GETCHARINCTEST(fc, Feptr);3122prop = GET_UCD(fc);3123ok = MAPBIT(PRIV(ucd_boolprop_sets) +3124UCD_BPROPS_PROP(prop), Lpropvalue) != 0;3125if (ok == notmatch)3126RRETURN(MATCH_NOMATCH);3127}3128break;31293130/* This should not occur */31313132default:3133PCRE2_DEBUG_UNREACHABLE();3134return PCRE2_ERROR_INTERNAL;3135}3136}31373138/* Match extended Unicode sequences. We will get here only if the3139support is in the binary; otherwise a compile-time error occurs. 
*/31403141else if (Lctype == OP_EXTUNI)3142{3143for (i = 1; i <= Lmin; i++)3144{3145if (Feptr >= mb->end_subject)3146{3147SCHECK_PARTIAL();3148RRETURN(MATCH_NOMATCH);3149}3150else3151{3152GETCHARINCTEST(fc, Feptr);3153Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject,3154mb->end_subject, utf, NULL);3155}3156CHECK_PARTIAL();3157}3158}3159else3160#endif /* SUPPORT_UNICODE */31613162/* Handle all other cases in UTF mode */31633164#ifdef SUPPORT_UNICODE3165if (utf) switch(Lctype)3166{3167case OP_ANY:3168for (i = 1; i <= Lmin; i++)3169{3170if (Feptr >= mb->end_subject)3171{3172SCHECK_PARTIAL();3173RRETURN(MATCH_NOMATCH);3174}3175if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);3176if (mb->partial != 0 &&3177Feptr + 1 >= mb->end_subject &&3178NLBLOCK->nltype == NLTYPE_FIXED &&3179NLBLOCK->nllen == 2 &&3180UCHAR21(Feptr) == NLBLOCK->nl[0])3181{3182mb->hitend = TRUE;3183if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;3184}3185Feptr++;3186ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);3187}3188break;31893190case OP_ALLANY:3191for (i = 1; i <= Lmin; i++)3192{3193if (Feptr >= mb->end_subject)3194{3195SCHECK_PARTIAL();3196RRETURN(MATCH_NOMATCH);3197}3198Feptr++;3199ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);3200}3201break;32023203case OP_ANYBYTE:3204if (Feptr > mb->end_subject - Lmin) RRETURN(MATCH_NOMATCH);3205Feptr += Lmin;3206break;32073208case OP_ANYNL:3209for (i = 1; i <= Lmin; i++)3210{3211if (Feptr >= mb->end_subject)3212{3213SCHECK_PARTIAL();3214RRETURN(MATCH_NOMATCH);3215}3216GETCHARINC(fc, Feptr);3217switch(fc)3218{3219default: RRETURN(MATCH_NOMATCH);32203221case CHAR_CR:3222if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;3223break;32243225case CHAR_LF:3226break;32273228case CHAR_VT:3229case CHAR_FF:3230case CHAR_NEL:3231#ifndef EBCDIC3232case 0x2028:3233case 0x2029:3234#endif /* Not EBCDIC */3235if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);3236break;3237}3238}3239break;32403241case OP_NOT_HSPACE:3242for (i = 1; i 
<= Lmin; i++)3243{3244if (Feptr >= mb->end_subject)3245{3246SCHECK_PARTIAL();3247RRETURN(MATCH_NOMATCH);3248}3249GETCHARINC(fc, Feptr);3250switch(fc)3251{3252HSPACE_CASES: RRETURN(MATCH_NOMATCH);3253default: break;3254}3255}3256break;32573258case OP_HSPACE:3259for (i = 1; i <= Lmin; i++)3260{3261if (Feptr >= mb->end_subject)3262{3263SCHECK_PARTIAL();3264RRETURN(MATCH_NOMATCH);3265}3266GETCHARINC(fc, Feptr);3267switch(fc)3268{3269HSPACE_CASES: break;3270default: RRETURN(MATCH_NOMATCH);3271}3272}3273break;32743275case OP_NOT_VSPACE:3276for (i = 1; i <= Lmin; i++)3277{3278if (Feptr >= mb->end_subject)3279{3280SCHECK_PARTIAL();3281RRETURN(MATCH_NOMATCH);3282}3283GETCHARINC(fc, Feptr);3284switch(fc)3285{3286VSPACE_CASES: RRETURN(MATCH_NOMATCH);3287default: break;3288}3289}3290break;32913292case OP_VSPACE:3293for (i = 1; i <= Lmin; i++)3294{3295if (Feptr >= mb->end_subject)3296{3297SCHECK_PARTIAL();3298RRETURN(MATCH_NOMATCH);3299}3300GETCHARINC(fc, Feptr);3301switch(fc)3302{3303VSPACE_CASES: break;3304default: RRETURN(MATCH_NOMATCH);3305}3306}3307break;33083309case OP_NOT_DIGIT:3310for (i = 1; i <= Lmin; i++)3311{3312if (Feptr >= mb->end_subject)3313{3314SCHECK_PARTIAL();3315RRETURN(MATCH_NOMATCH);3316}3317GETCHARINC(fc, Feptr);3318if (fc < 128 && (mb->ctypes[fc] & ctype_digit) != 0)3319RRETURN(MATCH_NOMATCH);3320}3321break;33223323case OP_DIGIT:3324for (i = 1; i <= Lmin; i++)3325{3326uint32_t cc;3327if (Feptr >= mb->end_subject)3328{3329SCHECK_PARTIAL();3330RRETURN(MATCH_NOMATCH);3331}3332cc = UCHAR21(Feptr);3333if (cc >= 128 || (mb->ctypes[cc] & ctype_digit) == 0)3334RRETURN(MATCH_NOMATCH);3335Feptr++;3336/* No need to skip more code units - we know it has only one. 
*/3337}3338break;33393340case OP_NOT_WHITESPACE:3341for (i = 1; i <= Lmin; i++)3342{3343uint32_t cc;3344if (Feptr >= mb->end_subject)3345{3346SCHECK_PARTIAL();3347RRETURN(MATCH_NOMATCH);3348}3349cc = UCHAR21(Feptr);3350if (cc < 128 && (mb->ctypes[cc] & ctype_space) != 0)3351RRETURN(MATCH_NOMATCH);3352Feptr++;3353ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);3354}3355break;33563357case OP_WHITESPACE:3358for (i = 1; i <= Lmin; i++)3359{3360uint32_t cc;3361if (Feptr >= mb->end_subject)3362{3363SCHECK_PARTIAL();3364RRETURN(MATCH_NOMATCH);3365}3366cc = UCHAR21(Feptr);3367if (cc >= 128 || (mb->ctypes[cc] & ctype_space) == 0)3368RRETURN(MATCH_NOMATCH);3369Feptr++;3370/* No need to skip more code units - we know it has only one. */3371}3372break;33733374case OP_NOT_WORDCHAR:3375for (i = 1; i <= Lmin; i++)3376{3377uint32_t cc;3378if (Feptr >= mb->end_subject)3379{3380SCHECK_PARTIAL();3381RRETURN(MATCH_NOMATCH);3382}3383cc = UCHAR21(Feptr);3384if (cc < 128 && (mb->ctypes[cc] & ctype_word) != 0)3385RRETURN(MATCH_NOMATCH);3386Feptr++;3387ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);3388}3389break;33903391case OP_WORDCHAR:3392for (i = 1; i <= Lmin; i++)3393{3394uint32_t cc;3395if (Feptr >= mb->end_subject)3396{3397SCHECK_PARTIAL();3398RRETURN(MATCH_NOMATCH);3399}3400cc = UCHAR21(Feptr);3401if (cc >= 128 || (mb->ctypes[cc] & ctype_word) == 0)3402RRETURN(MATCH_NOMATCH);3403Feptr++;3404/* No need to skip more code units - we know it has only one. */3405}3406break;34073408default:3409PCRE2_DEBUG_UNREACHABLE();3410return PCRE2_ERROR_INTERNAL;3411} /* End switch(Lctype) */34123413else3414#endif /* SUPPORT_UNICODE */34153416/* Code for the non-UTF case for minimum matching of operators other3417than OP_PROP and OP_NOTPROP. 
*/34183419switch(Lctype)3420{3421case OP_ANY:3422for (i = 1; i <= Lmin; i++)3423{3424if (Feptr >= mb->end_subject)3425{3426SCHECK_PARTIAL();3427RRETURN(MATCH_NOMATCH);3428}3429if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);3430if (mb->partial != 0 &&3431Feptr + 1 >= mb->end_subject &&3432NLBLOCK->nltype == NLTYPE_FIXED &&3433NLBLOCK->nllen == 2 &&3434*Feptr == NLBLOCK->nl[0])3435{3436mb->hitend = TRUE;3437if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;3438}3439Feptr++;3440}3441break;34423443case OP_ALLANY:3444if (Feptr > mb->end_subject - Lmin)3445{3446SCHECK_PARTIAL();3447RRETURN(MATCH_NOMATCH);3448}3449Feptr += Lmin;3450break;34513452/* This OP_ANYBYTE case will never be reached because \C gets turned3453into OP_ALLANY in non-UTF mode. Cut out the code so that coverage3454reports don't complain about its never being used. */34553456/* case OP_ANYBYTE:3457* if (Feptr > mb->end_subject - Lmin)3458* {3459* SCHECK_PARTIAL();3460* RRETURN(MATCH_NOMATCH);3461* }3462* Feptr += Lmin;3463* break;3464*/3465case OP_ANYNL:3466for (i = 1; i <= Lmin; i++)3467{3468if (Feptr >= mb->end_subject)3469{3470SCHECK_PARTIAL();3471RRETURN(MATCH_NOMATCH);3472}3473switch(*Feptr++)3474{3475default: RRETURN(MATCH_NOMATCH);34763477case CHAR_CR:3478if (Feptr < mb->end_subject && *Feptr == CHAR_LF) Feptr++;3479break;34803481case CHAR_LF:3482break;34833484case CHAR_VT:3485case CHAR_FF:3486case CHAR_NEL:3487#if PCRE2_CODE_UNIT_WIDTH != 83488case 0x2028:3489case 0x2029:3490#endif3491if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) RRETURN(MATCH_NOMATCH);3492break;3493}3494}3495break;34963497case OP_NOT_HSPACE:3498for (i = 1; i <= Lmin; i++)3499{3500if (Feptr >= mb->end_subject)3501{3502SCHECK_PARTIAL();3503RRETURN(MATCH_NOMATCH);3504}3505switch(*Feptr++)3506{3507default: break;3508HSPACE_BYTE_CASES:3509#if PCRE2_CODE_UNIT_WIDTH != 83510HSPACE_MULTIBYTE_CASES:3511#endif3512RRETURN(MATCH_NOMATCH);3513}3514}3515break;35163517case OP_HSPACE:3518for (i = 1; i <= Lmin; i++)3519{3520if (Feptr >= 
mb->end_subject)3521{3522SCHECK_PARTIAL();3523RRETURN(MATCH_NOMATCH);3524}3525switch(*Feptr++)3526{3527default: RRETURN(MATCH_NOMATCH);3528HSPACE_BYTE_CASES:3529#if PCRE2_CODE_UNIT_WIDTH != 83530HSPACE_MULTIBYTE_CASES:3531#endif3532break;3533}3534}3535break;35363537case OP_NOT_VSPACE:3538for (i = 1; i <= Lmin; i++)3539{3540if (Feptr >= mb->end_subject)3541{3542SCHECK_PARTIAL();3543RRETURN(MATCH_NOMATCH);3544}3545switch(*Feptr++)3546{3547VSPACE_BYTE_CASES:3548#if PCRE2_CODE_UNIT_WIDTH != 83549VSPACE_MULTIBYTE_CASES:3550#endif3551RRETURN(MATCH_NOMATCH);3552default: break;3553}3554}3555break;35563557case OP_VSPACE:3558for (i = 1; i <= Lmin; i++)3559{3560if (Feptr >= mb->end_subject)3561{3562SCHECK_PARTIAL();3563RRETURN(MATCH_NOMATCH);3564}3565switch(*Feptr++)3566{3567default: RRETURN(MATCH_NOMATCH);3568VSPACE_BYTE_CASES:3569#if PCRE2_CODE_UNIT_WIDTH != 83570VSPACE_MULTIBYTE_CASES:3571#endif3572break;3573}3574}3575break;35763577case OP_NOT_DIGIT:3578for (i = 1; i <= Lmin; i++)3579{3580if (Feptr >= mb->end_subject)3581{3582SCHECK_PARTIAL();3583RRETURN(MATCH_NOMATCH);3584}3585if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)3586RRETURN(MATCH_NOMATCH);3587Feptr++;3588}3589break;35903591case OP_DIGIT:3592for (i = 1; i <= Lmin; i++)3593{3594if (Feptr >= mb->end_subject)3595{3596SCHECK_PARTIAL();3597RRETURN(MATCH_NOMATCH);3598}3599if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)3600RRETURN(MATCH_NOMATCH);3601Feptr++;3602}3603break;36043605case OP_NOT_WHITESPACE:3606for (i = 1; i <= Lmin; i++)3607{3608if (Feptr >= mb->end_subject)3609{3610SCHECK_PARTIAL();3611RRETURN(MATCH_NOMATCH);3612}3613if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)3614RRETURN(MATCH_NOMATCH);3615Feptr++;3616}3617break;36183619case OP_WHITESPACE:3620for (i = 1; i <= Lmin; i++)3621{3622if (Feptr >= mb->end_subject)3623{3624SCHECK_PARTIAL();3625RRETURN(MATCH_NOMATCH);3626}3627if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 
0)3628RRETURN(MATCH_NOMATCH);3629Feptr++;3630}3631break;36323633case OP_NOT_WORDCHAR:3634for (i = 1; i <= Lmin; i++)3635{3636if (Feptr >= mb->end_subject)3637{3638SCHECK_PARTIAL();3639RRETURN(MATCH_NOMATCH);3640}3641if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)3642RRETURN(MATCH_NOMATCH);3643Feptr++;3644}3645break;36463647case OP_WORDCHAR:3648for (i = 1; i <= Lmin; i++)3649{3650if (Feptr >= mb->end_subject)3651{3652SCHECK_PARTIAL();3653RRETURN(MATCH_NOMATCH);3654}3655if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)3656RRETURN(MATCH_NOMATCH);3657Feptr++;3658}3659break;36603661default:3662PCRE2_DEBUG_UNREACHABLE();3663return PCRE2_ERROR_INTERNAL;3664}3665}36663667/* If Lmin = Lmax we are done. Continue with the main loop. */36683669if (Lmin == Lmax) continue;36703671/* If minimizing, we have to test the rest of the pattern before each3672subsequent match. This means we cannot use a local "notmatch" variable as3673in the other cases. As all 4 temporary 32-bit values in the frame are3674already in use, just test the type each time. 
*/36753676if (reptype == REPTYPE_MIN)3677{3678#ifdef SUPPORT_UNICODE3679if (proptype >= 0)3680{3681switch(proptype)3682{3683case PT_LAMP:3684for (;;)3685{3686int chartype;3687RMATCH(Fecode, RM208);3688if (rrc != MATCH_NOMATCH) RRETURN(rrc);3689if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3690if (Feptr >= mb->end_subject)3691{3692SCHECK_PARTIAL();3693RRETURN(MATCH_NOMATCH);3694}3695GETCHARINCTEST(fc, Feptr);3696chartype = UCD_CHARTYPE(fc);3697if ((chartype == ucp_Lu ||3698chartype == ucp_Ll ||3699chartype == ucp_Lt) == (Lctype == OP_NOTPROP))3700RRETURN(MATCH_NOMATCH);3701}3702PCRE2_UNREACHABLE(); /* Control never reaches here */37033704case PT_GC:3705for (;;)3706{3707RMATCH(Fecode, RM209);3708if (rrc != MATCH_NOMATCH) RRETURN(rrc);3709if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3710if (Feptr >= mb->end_subject)3711{3712SCHECK_PARTIAL();3713RRETURN(MATCH_NOMATCH);3714}3715GETCHARINCTEST(fc, Feptr);3716if ((UCD_CATEGORY(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))3717RRETURN(MATCH_NOMATCH);3718}3719PCRE2_UNREACHABLE(); /* Control never reaches here */37203721case PT_PC:3722for (;;)3723{3724RMATCH(Fecode, RM210);3725if (rrc != MATCH_NOMATCH) RRETURN(rrc);3726if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3727if (Feptr >= mb->end_subject)3728{3729SCHECK_PARTIAL();3730RRETURN(MATCH_NOMATCH);3731}3732GETCHARINCTEST(fc, Feptr);3733if ((UCD_CHARTYPE(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))3734RRETURN(MATCH_NOMATCH);3735}3736PCRE2_UNREACHABLE(); /* Control never reaches here */37373738case PT_SC:3739for (;;)3740{3741RMATCH(Fecode, RM211);3742if (rrc != MATCH_NOMATCH) RRETURN(rrc);3743if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3744if (Feptr >= mb->end_subject)3745{3746SCHECK_PARTIAL();3747RRETURN(MATCH_NOMATCH);3748}3749GETCHARINCTEST(fc, Feptr);3750if ((UCD_SCRIPT(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))3751RRETURN(MATCH_NOMATCH);3752}3753PCRE2_UNREACHABLE(); /* Control never reaches here */37543755case PT_SCX:3756for (;;)3757{3758BOOL ok;3759const ucd_record 
*prop;3760RMATCH(Fecode, RM224);3761if (rrc != MATCH_NOMATCH) RRETURN(rrc);3762if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3763if (Feptr >= mb->end_subject)3764{3765SCHECK_PARTIAL();3766RRETURN(MATCH_NOMATCH);3767}3768GETCHARINCTEST(fc, Feptr);3769prop = GET_UCD(fc);3770ok = (prop->script == Lpropvalue3771|| MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);3772if (ok == (Lctype == OP_NOTPROP))3773RRETURN(MATCH_NOMATCH);3774}3775PCRE2_UNREACHABLE(); /* Control never reaches here */37763777case PT_ALNUM:3778for (;;)3779{3780int category;3781RMATCH(Fecode, RM212);3782if (rrc != MATCH_NOMATCH) RRETURN(rrc);3783if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3784if (Feptr >= mb->end_subject)3785{3786SCHECK_PARTIAL();3787RRETURN(MATCH_NOMATCH);3788}3789GETCHARINCTEST(fc, Feptr);3790category = UCD_CATEGORY(fc);3791if ((category == ucp_L || category == ucp_N) == (Lctype == OP_NOTPROP))3792RRETURN(MATCH_NOMATCH);3793}3794PCRE2_UNREACHABLE(); /* Control never reaches here */37953796/* Perl space used to exclude VT, but from Perl 5.18 it is included,3797which means that Perl space and POSIX space are now identical. PCRE3798was changed at release 8.34. 
*/37993800case PT_SPACE: /* Perl space */3801case PT_PXSPACE: /* POSIX space */3802for (;;)3803{3804RMATCH(Fecode, RM213);3805if (rrc != MATCH_NOMATCH) RRETURN(rrc);3806if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3807if (Feptr >= mb->end_subject)3808{3809SCHECK_PARTIAL();3810RRETURN(MATCH_NOMATCH);3811}3812GETCHARINCTEST(fc, Feptr);3813switch(fc)3814{3815HSPACE_CASES:3816VSPACE_CASES:3817if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);3818break;38193820default:3821if ((UCD_CATEGORY(fc) == ucp_Z) == (Lctype == OP_NOTPROP))3822RRETURN(MATCH_NOMATCH);3823break;3824}3825}3826PCRE2_UNREACHABLE(); /* Control never reaches here */38273828case PT_WORD:3829for (;;)3830{3831int chartype, category;3832RMATCH(Fecode, RM214);3833if (rrc != MATCH_NOMATCH) RRETURN(rrc);3834if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3835if (Feptr >= mb->end_subject)3836{3837SCHECK_PARTIAL();3838RRETURN(MATCH_NOMATCH);3839}3840GETCHARINCTEST(fc, Feptr);3841chartype = UCD_CHARTYPE(fc);3842category = PRIV(ucp_gentype)[chartype];3843if ((category == ucp_L ||3844category == ucp_N ||3845chartype == ucp_Mn ||3846chartype == ucp_Pc) == (Lctype == OP_NOTPROP))3847RRETURN(MATCH_NOMATCH);3848}3849PCRE2_UNREACHABLE(); /* Control never reaches here */38503851case PT_CLIST:3852for (;;)3853{3854const uint32_t *cp;3855RMATCH(Fecode, RM215);3856if (rrc != MATCH_NOMATCH) RRETURN(rrc);3857if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3858if (Feptr >= mb->end_subject)3859{3860SCHECK_PARTIAL();3861RRETURN(MATCH_NOMATCH);3862}3863GETCHARINCTEST(fc, Feptr);3864#if PCRE2_CODE_UNIT_WIDTH == 323865if (fc > MAX_UTF_CODE_POINT)3866{3867if (Lctype == OP_NOTPROP) continue;3868RRETURN(MATCH_NOMATCH);3869}3870#endif3871cp = PRIV(ucd_caseless_sets) + Lpropvalue;3872for (;;)3873{3874if (fc < *cp)3875{3876if (Lctype == OP_NOTPROP) break;3877RRETURN(MATCH_NOMATCH);3878}3879if (fc == *cp++)3880{3881if (Lctype == OP_NOTPROP) RRETURN(MATCH_NOMATCH);3882break;3883}3884}3885}3886PCRE2_UNREACHABLE(); /* Control never reaches here 
*/38873888case PT_UCNC:3889for (;;)3890{3891RMATCH(Fecode, RM216);3892if (rrc != MATCH_NOMATCH) RRETURN(rrc);3893if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3894if (Feptr >= mb->end_subject)3895{3896SCHECK_PARTIAL();3897RRETURN(MATCH_NOMATCH);3898}3899GETCHARINCTEST(fc, Feptr);3900if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||3901fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||3902fc >= 0xe000) == (Lctype == OP_NOTPROP))3903RRETURN(MATCH_NOMATCH);3904}3905PCRE2_UNREACHABLE(); /* Control never reaches here */39063907case PT_BIDICL:3908for (;;)3909{3910RMATCH(Fecode, RM223);3911if (rrc != MATCH_NOMATCH) RRETURN(rrc);3912if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3913if (Feptr >= mb->end_subject)3914{3915SCHECK_PARTIAL();3916RRETURN(MATCH_NOMATCH);3917}3918GETCHARINCTEST(fc, Feptr);3919if ((UCD_BIDICLASS(fc) == Lpropvalue) == (Lctype == OP_NOTPROP))3920RRETURN(MATCH_NOMATCH);3921}3922PCRE2_UNREACHABLE(); /* Control never reaches here */39233924case PT_BOOL:3925for (;;)3926{3927BOOL ok;3928const ucd_record *prop;3929RMATCH(Fecode, RM222);3930if (rrc != MATCH_NOMATCH) RRETURN(rrc);3931if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3932if (Feptr >= mb->end_subject)3933{3934SCHECK_PARTIAL();3935RRETURN(MATCH_NOMATCH);3936}3937GETCHARINCTEST(fc, Feptr);3938prop = GET_UCD(fc);3939ok = MAPBIT(PRIV(ucd_boolprop_sets) +3940UCD_BPROPS_PROP(prop), Lpropvalue) != 0;3941if (ok == (Lctype == OP_NOTPROP))3942RRETURN(MATCH_NOMATCH);3943}3944PCRE2_UNREACHABLE(); /* Control never reaches here */39453946/* This should never occur */3947default:3948PCRE2_DEBUG_UNREACHABLE();3949return PCRE2_ERROR_INTERNAL;3950}3951}39523953/* Match extended Unicode sequences. We will get here only if the3954support is in the binary; otherwise a compile-time error occurs. 
*/39553956else if (Lctype == OP_EXTUNI)3957{3958for (;;)3959{3960RMATCH(Fecode, RM217);3961if (rrc != MATCH_NOMATCH) RRETURN(rrc);3962if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3963if (Feptr >= mb->end_subject)3964{3965SCHECK_PARTIAL();3966RRETURN(MATCH_NOMATCH);3967}3968else3969{3970GETCHARINCTEST(fc, Feptr);3971Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,3972utf, NULL);3973}3974CHECK_PARTIAL();3975}3976}3977else3978#endif /* SUPPORT_UNICODE */39793980/* UTF mode for non-property testing character types. */39813982#ifdef SUPPORT_UNICODE3983if (utf)3984{3985for (;;)3986{3987RMATCH(Fecode, RM218);3988if (rrc != MATCH_NOMATCH) RRETURN(rrc);3989if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);3990if (Feptr >= mb->end_subject)3991{3992SCHECK_PARTIAL();3993RRETURN(MATCH_NOMATCH);3994}3995if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);3996GETCHARINC(fc, Feptr);3997switch(Lctype)3998{3999case OP_ANY: /* This is the non-NL case */4000if (mb->partial != 0 && /* Take care with CRLF partial */4001Feptr >= mb->end_subject &&4002NLBLOCK->nltype == NLTYPE_FIXED &&4003NLBLOCK->nllen == 2 &&4004fc == NLBLOCK->nl[0])4005{4006mb->hitend = TRUE;4007if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;4008}4009break;40104011case OP_ALLANY:4012case OP_ANYBYTE:4013break;40144015case OP_ANYNL:4016switch(fc)4017{4018default: RRETURN(MATCH_NOMATCH);40194020case CHAR_CR:4021if (Feptr < mb->end_subject && UCHAR21(Feptr) == CHAR_LF) Feptr++;4022break;40234024case CHAR_LF:4025break;40264027case CHAR_VT:4028case CHAR_FF:4029case CHAR_NEL:4030#ifndef EBCDIC4031case 0x2028:4032case 0x2029:4033#endif /* Not EBCDIC */4034if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)4035RRETURN(MATCH_NOMATCH);4036break;4037}4038break;40394040case OP_NOT_HSPACE:4041switch(fc)4042{4043HSPACE_CASES: RRETURN(MATCH_NOMATCH);4044default: break;4045}4046break;40474048case OP_HSPACE:4049switch(fc)4050{4051HSPACE_CASES: break;4052default: RRETURN(MATCH_NOMATCH);4053}4054break;40554056case 
OP_NOT_VSPACE:4057switch(fc)4058{4059VSPACE_CASES: RRETURN(MATCH_NOMATCH);4060default: break;4061}4062break;40634064case OP_VSPACE:4065switch(fc)4066{4067VSPACE_CASES: break;4068default: RRETURN(MATCH_NOMATCH);4069}4070break;40714072case OP_NOT_DIGIT:4073if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0)4074RRETURN(MATCH_NOMATCH);4075break;40764077case OP_DIGIT:4078if (fc >= 256 || (mb->ctypes[fc] & ctype_digit) == 0)4079RRETURN(MATCH_NOMATCH);4080break;40814082case OP_NOT_WHITESPACE:4083if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0)4084RRETURN(MATCH_NOMATCH);4085break;40864087case OP_WHITESPACE:4088if (fc >= 256 || (mb->ctypes[fc] & ctype_space) == 0)4089RRETURN(MATCH_NOMATCH);4090break;40914092case OP_NOT_WORDCHAR:4093if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0)4094RRETURN(MATCH_NOMATCH);4095break;40964097case OP_WORDCHAR:4098if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0)4099RRETURN(MATCH_NOMATCH);4100break;41014102default:4103PCRE2_DEBUG_UNREACHABLE();4104return PCRE2_ERROR_INTERNAL;4105}4106}4107}4108else4109#endif /* SUPPORT_UNICODE */41104111/* Not UTF mode */4112{4113for (;;)4114{4115RMATCH(Fecode, RM33);4116if (rrc != MATCH_NOMATCH) RRETURN(rrc);4117if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);4118if (Feptr >= mb->end_subject)4119{4120SCHECK_PARTIAL();4121RRETURN(MATCH_NOMATCH);4122}4123if (Lctype == OP_ANY && IS_NEWLINE(Feptr))4124RRETURN(MATCH_NOMATCH);4125fc = *Feptr++;4126switch(Lctype)4127{4128case OP_ANY: /* This is the non-NL case */4129if (mb->partial != 0 && /* Take care with CRLF partial */4130Feptr >= mb->end_subject &&4131NLBLOCK->nltype == NLTYPE_FIXED &&4132NLBLOCK->nllen == 2 &&4133fc == NLBLOCK->nl[0])4134{4135mb->hitend = TRUE;4136if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;4137}4138break;41394140case OP_ALLANY:4141case OP_ANYBYTE:4142break;41434144case OP_ANYNL:4145switch(fc)4146{4147default: RRETURN(MATCH_NOMATCH);41484149case CHAR_CR:4150if (Feptr < mb->end_subject && *Feptr == CHAR_LF) 
Feptr++;4151break;41524153case CHAR_LF:4154break;41554156case CHAR_VT:4157case CHAR_FF:4158case CHAR_NEL:4159#if PCRE2_CODE_UNIT_WIDTH != 84160case 0x2028:4161case 0x2029:4162#endif4163if (mb->bsr_convention == PCRE2_BSR_ANYCRLF)4164RRETURN(MATCH_NOMATCH);4165break;4166}4167break;41684169case OP_NOT_HSPACE:4170switch(fc)4171{4172default: break;4173HSPACE_BYTE_CASES:4174#if PCRE2_CODE_UNIT_WIDTH != 84175HSPACE_MULTIBYTE_CASES:4176#endif4177RRETURN(MATCH_NOMATCH);4178}4179break;41804181case OP_HSPACE:4182switch(fc)4183{4184default: RRETURN(MATCH_NOMATCH);4185HSPACE_BYTE_CASES:4186#if PCRE2_CODE_UNIT_WIDTH != 84187HSPACE_MULTIBYTE_CASES:4188#endif4189break;4190}4191break;41924193case OP_NOT_VSPACE:4194switch(fc)4195{4196default: break;4197VSPACE_BYTE_CASES:4198#if PCRE2_CODE_UNIT_WIDTH != 84199VSPACE_MULTIBYTE_CASES:4200#endif4201RRETURN(MATCH_NOMATCH);4202}4203break;42044205case OP_VSPACE:4206switch(fc)4207{4208default: RRETURN(MATCH_NOMATCH);4209VSPACE_BYTE_CASES:4210#if PCRE2_CODE_UNIT_WIDTH != 84211VSPACE_MULTIBYTE_CASES:4212#endif4213break;4214}4215break;42164217case OP_NOT_DIGIT:4218if (MAX_255(fc) && (mb->ctypes[fc] & ctype_digit) != 0)4219RRETURN(MATCH_NOMATCH);4220break;42214222case OP_DIGIT:4223if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_digit) == 0)4224RRETURN(MATCH_NOMATCH);4225break;42264227case OP_NOT_WHITESPACE:4228if (MAX_255(fc) && (mb->ctypes[fc] & ctype_space) != 0)4229RRETURN(MATCH_NOMATCH);4230break;42314232case OP_WHITESPACE:4233if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_space) == 0)4234RRETURN(MATCH_NOMATCH);4235break;42364237case OP_NOT_WORDCHAR:4238if (MAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0)4239RRETURN(MATCH_NOMATCH);4240break;42414242case OP_WORDCHAR:4243if (!MAX_255(fc) || (mb->ctypes[fc] & ctype_word) == 0)4244RRETURN(MATCH_NOMATCH);4245break;42464247default:4248PCRE2_DEBUG_UNREACHABLE();4249return PCRE2_ERROR_INTERNAL;4250}4251}4252}42534254PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */4255}42564257/* If 
maximizing, it is worth using inline code for speed, doing the type4258test once at the start (i.e. keep it out of the loops). Once again,4259"notmatch" can be an ordinary local variable because the loops do not call4260RMATCH. */42614262else4263{4264Lstart_eptr = Feptr; /* Remember where we started */42654266#ifdef SUPPORT_UNICODE4267if (proptype >= 0)4268{4269BOOL notmatch = Lctype == OP_NOTPROP;4270switch(proptype)4271{4272case PT_LAMP:4273for (i = Lmin; i < Lmax; i++)4274{4275int chartype;4276int len = 1;4277if (Feptr >= mb->end_subject)4278{4279SCHECK_PARTIAL();4280break;4281}4282GETCHARLENTEST(fc, Feptr, len);4283chartype = UCD_CHARTYPE(fc);4284if ((chartype == ucp_Lu ||4285chartype == ucp_Ll ||4286chartype == ucp_Lt) == notmatch)4287break;4288Feptr+= len;4289}4290break;42914292case PT_GC:4293for (i = Lmin; i < Lmax; i++)4294{4295int len = 1;4296if (Feptr >= mb->end_subject)4297{4298SCHECK_PARTIAL();4299break;4300}4301GETCHARLENTEST(fc, Feptr, len);4302if ((UCD_CATEGORY(fc) == Lpropvalue) == notmatch) break;4303Feptr+= len;4304}4305break;43064307case PT_PC:4308for (i = Lmin; i < Lmax; i++)4309{4310int len = 1;4311if (Feptr >= mb->end_subject)4312{4313SCHECK_PARTIAL();4314break;4315}4316GETCHARLENTEST(fc, Feptr, len);4317if ((UCD_CHARTYPE(fc) == Lpropvalue) == notmatch) break;4318Feptr+= len;4319}4320break;43214322case PT_SC:4323for (i = Lmin; i < Lmax; i++)4324{4325int len = 1;4326if (Feptr >= mb->end_subject)4327{4328SCHECK_PARTIAL();4329break;4330}4331GETCHARLENTEST(fc, Feptr, len);4332if ((UCD_SCRIPT(fc) == Lpropvalue) == notmatch) break;4333Feptr+= len;4334}4335break;43364337case PT_SCX:4338for (i = Lmin; i < Lmax; i++)4339{4340BOOL ok;4341const ucd_record *prop;4342int len = 1;4343if (Feptr >= mb->end_subject)4344{4345SCHECK_PARTIAL();4346break;4347}4348GETCHARLENTEST(fc, Feptr, len);4349prop = GET_UCD(fc);4350ok = (prop->script == Lpropvalue ||4351MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), Lpropvalue) != 0);4352if (ok == notmatch) 
break;4353Feptr+= len;4354}4355break;43564357case PT_ALNUM:4358for (i = Lmin; i < Lmax; i++)4359{4360int category;4361int len = 1;4362if (Feptr >= mb->end_subject)4363{4364SCHECK_PARTIAL();4365break;4366}4367GETCHARLENTEST(fc, Feptr, len);4368category = UCD_CATEGORY(fc);4369if ((category == ucp_L || category == ucp_N) == notmatch)4370break;4371Feptr+= len;4372}4373break;43744375/* Perl space used to exclude VT, but from Perl 5.18 it is included,4376which means that Perl space and POSIX space are now identical. PCRE4377was changed at release 8.34. */43784379case PT_SPACE: /* Perl space */4380case PT_PXSPACE: /* POSIX space */4381for (i = Lmin; i < Lmax; i++)4382{4383int len = 1;4384if (Feptr >= mb->end_subject)4385{4386SCHECK_PARTIAL();4387break;4388}4389GETCHARLENTEST(fc, Feptr, len);4390switch(fc)4391{4392HSPACE_CASES:4393VSPACE_CASES:4394if (notmatch) goto ENDLOOP99; /* Break the loop */4395break;43964397default:4398if ((UCD_CATEGORY(fc) == ucp_Z) == notmatch)4399goto ENDLOOP99; /* Break the loop */4400break;4401}4402Feptr+= len;4403}4404ENDLOOP99:4405break;44064407case PT_WORD:4408for (i = Lmin; i < Lmax; i++)4409{4410int chartype, category;4411int len = 1;4412if (Feptr >= mb->end_subject)4413{4414SCHECK_PARTIAL();4415break;4416}4417GETCHARLENTEST(fc, Feptr, len);4418chartype = UCD_CHARTYPE(fc);4419category = PRIV(ucp_gentype)[chartype];4420if ((category == ucp_L ||4421category == ucp_N ||4422chartype == ucp_Mn ||4423chartype == ucp_Pc) == notmatch)4424break;4425Feptr+= len;4426}4427break;44284429case PT_CLIST:4430for (i = Lmin; i < Lmax; i++)4431{4432const uint32_t *cp;4433int len = 1;4434if (Feptr >= mb->end_subject)4435{4436SCHECK_PARTIAL();4437break;4438}4439GETCHARLENTEST(fc, Feptr, len);4440#if PCRE2_CODE_UNIT_WIDTH == 324441if (fc > MAX_UTF_CODE_POINT)4442{4443if (!notmatch) goto GOT_MAX;4444}4445else4446#endif4447{4448cp = PRIV(ucd_caseless_sets) + Lpropvalue;4449for (;;)4450{4451if (fc < *cp)4452{ if (notmatch) break; else goto GOT_MAX; }4453if (fc == 
*cp++)4454{ if (notmatch) goto GOT_MAX; else break; }4455}4456}44574458Feptr += len;4459}4460GOT_MAX:4461break;44624463case PT_UCNC:4464for (i = Lmin; i < Lmax; i++)4465{4466int len = 1;4467if (Feptr >= mb->end_subject)4468{4469SCHECK_PARTIAL();4470break;4471}4472GETCHARLENTEST(fc, Feptr, len);4473if ((fc == CHAR_DOLLAR_SIGN || fc == CHAR_COMMERCIAL_AT ||4474fc == CHAR_GRAVE_ACCENT || (fc >= 0xa0 && fc <= 0xd7ff) ||4475fc >= 0xe000) == notmatch)4476break;4477Feptr += len;4478}4479break;44804481case PT_BIDICL:4482for (i = Lmin; i < Lmax; i++)4483{4484int len = 1;4485if (Feptr >= mb->end_subject)4486{4487SCHECK_PARTIAL();4488break;4489}4490GETCHARLENTEST(fc, Feptr, len);4491if ((UCD_BIDICLASS(fc) == Lpropvalue) == notmatch) break;4492Feptr+= len;4493}4494break;44954496case PT_BOOL:4497for (i = Lmin; i < Lmax; i++)4498{4499BOOL ok;4500const ucd_record *prop;4501int len = 1;4502if (Feptr >= mb->end_subject)4503{4504SCHECK_PARTIAL();4505break;4506}4507GETCHARLENTEST(fc, Feptr, len);4508prop = GET_UCD(fc);4509ok = MAPBIT(PRIV(ucd_boolprop_sets) +4510UCD_BPROPS_PROP(prop), Lpropvalue) != 0;4511if (ok == notmatch) break;4512Feptr+= len;4513}4514break;45154516default:4517PCRE2_DEBUG_UNREACHABLE();4518return PCRE2_ERROR_INTERNAL;4519}45204521/* Feptr is now past the end of the maximum run */45224523if (reptype == REPTYPE_POS) continue; /* No backtracking */45244525/* After \C in UTF mode, Lstart_eptr might be in the middle of a4526Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't4527go too far. */45284529for(;;)4530{4531if (Feptr <= Lstart_eptr) break;4532RMATCH(Fecode, RM221);4533if (rrc != MATCH_NOMATCH) RRETURN(rrc);4534Feptr--;4535if (utf) BACKCHAR(Feptr);4536}4537}45384539/* Match extended Unicode grapheme clusters. We will get here only if the4540support is in the binary; otherwise a compile-time error occurs. 
*/45414542else if (Lctype == OP_EXTUNI)4543{4544for (i = Lmin; i < Lmax; i++)4545{4546if (Feptr >= mb->end_subject)4547{4548SCHECK_PARTIAL();4549break;4550}4551else4552{4553GETCHARINCTEST(fc, Feptr);4554Feptr = PRIV(extuni)(fc, Feptr, mb->start_subject, mb->end_subject,4555utf, NULL);4556}4557CHECK_PARTIAL();4558}45594560/* Feptr is now past the end of the maximum run */45614562if (reptype == REPTYPE_POS) continue; /* No backtracking */45634564/* We use <= Lstart_eptr rather than == Lstart_eptr to detect the start4565of the run while backtracking because the use of \C in UTF mode can4566cause BACKCHAR to move back past Lstart_eptr. This is just palliative;4567the use of \C in UTF mode is fraught with danger. */45684569for(;;)4570{4571int lgb, rgb;4572PCRE2_SPTR fptr;45734574if (Feptr <= Lstart_eptr) break; /* At start of char run */4575RMATCH(Fecode, RM219);4576if (rrc != MATCH_NOMATCH) RRETURN(rrc);45774578/* Backtracking over an extended grapheme cluster involves inspecting4579the previous two characters (if present) to see if a break is4580permitted between them. 
*/45814582Feptr--;4583if (!utf) fc = *Feptr; else4584{4585BACKCHAR(Feptr);4586GETCHAR(fc, Feptr);4587}4588rgb = UCD_GRAPHBREAK(fc);45894590for (;;)4591{4592if (Feptr <= Lstart_eptr) break; /* At start of char run */4593fptr = Feptr - 1;4594if (!utf) fc = *fptr; else4595{4596BACKCHAR(fptr);4597GETCHAR(fc, fptr);4598}4599lgb = UCD_GRAPHBREAK(fc);4600if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;4601Feptr = fptr;4602rgb = lgb;4603}4604}4605}46064607else4608#endif /* SUPPORT_UNICODE */46094610#ifdef SUPPORT_UNICODE4611if (utf)4612{4613switch(Lctype)4614{4615case OP_ANY:4616for (i = Lmin; i < Lmax; i++)4617{4618if (Feptr >= mb->end_subject)4619{4620SCHECK_PARTIAL();4621break;4622}4623if (IS_NEWLINE(Feptr)) break;4624if (mb->partial != 0 && /* Take care with CRLF partial */4625Feptr + 1 >= mb->end_subject &&4626NLBLOCK->nltype == NLTYPE_FIXED &&4627NLBLOCK->nllen == 2 &&4628UCHAR21(Feptr) == NLBLOCK->nl[0])4629{4630mb->hitend = TRUE;4631if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;4632}4633Feptr++;4634ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);4635}4636break;46374638case OP_ALLANY:4639if (Lmax < UINT32_MAX)4640{4641for (i = Lmin; i < Lmax; i++)4642{4643if (Feptr >= mb->end_subject)4644{4645SCHECK_PARTIAL();4646break;4647}4648Feptr++;4649ACROSSCHAR(Feptr < mb->end_subject, Feptr, Feptr++);4650}4651}4652else4653{4654Feptr = mb->end_subject; /* Unlimited UTF-8 repeat */4655SCHECK_PARTIAL();4656}4657break;46584659/* The "byte" (i.e. 
"code unit") case is the same as non-UTF */46604661case OP_ANYBYTE:4662fc = Lmax - Lmin;4663if (fc > (uint32_t)(mb->end_subject - Feptr))4664{4665Feptr = mb->end_subject;4666SCHECK_PARTIAL();4667}4668else Feptr += fc;4669break;46704671case OP_ANYNL:4672for (i = Lmin; i < Lmax; i++)4673{4674int len = 1;4675if (Feptr >= mb->end_subject)4676{4677SCHECK_PARTIAL();4678break;4679}4680GETCHARLEN(fc, Feptr, len);4681if (fc == CHAR_CR)4682{4683if (++Feptr >= mb->end_subject) break;4684if (UCHAR21(Feptr) == CHAR_LF) Feptr++;4685}4686else4687{4688if (fc != CHAR_LF &&4689(mb->bsr_convention == PCRE2_BSR_ANYCRLF ||4690(fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL4691#ifndef EBCDIC4692&& fc != 0x2028 && fc != 0x20294693#endif /* Not EBCDIC */4694)))4695break;4696Feptr += len;4697}4698}4699break;47004701case OP_NOT_HSPACE:4702case OP_HSPACE:4703for (i = Lmin; i < Lmax; i++)4704{4705BOOL gotspace;4706int len = 1;4707if (Feptr >= mb->end_subject)4708{4709SCHECK_PARTIAL();4710break;4711}4712GETCHARLEN(fc, Feptr, len);4713switch(fc)4714{4715HSPACE_CASES: gotspace = TRUE; break;4716default: gotspace = FALSE; break;4717}4718if (gotspace == (Lctype == OP_NOT_HSPACE)) break;4719Feptr += len;4720}4721break;47224723case OP_NOT_VSPACE:4724case OP_VSPACE:4725for (i = Lmin; i < Lmax; i++)4726{4727BOOL gotspace;4728int len = 1;4729if (Feptr >= mb->end_subject)4730{4731SCHECK_PARTIAL();4732break;4733}4734GETCHARLEN(fc, Feptr, len);4735switch(fc)4736{4737VSPACE_CASES: gotspace = TRUE; break;4738default: gotspace = FALSE; break;4739}4740if (gotspace == (Lctype == OP_NOT_VSPACE)) break;4741Feptr += len;4742}4743break;47444745case OP_NOT_DIGIT:4746for (i = Lmin; i < Lmax; i++)4747{4748int len = 1;4749if (Feptr >= mb->end_subject)4750{4751SCHECK_PARTIAL();4752break;4753}4754GETCHARLEN(fc, Feptr, len);4755if (fc < 256 && (mb->ctypes[fc] & ctype_digit) != 0) break;4756Feptr+= len;4757}4758break;47594760case OP_DIGIT:4761for (i = Lmin; i < Lmax; i++)4762{4763int len = 1;4764if (Feptr >= 
mb->end_subject)4765{4766SCHECK_PARTIAL();4767break;4768}4769GETCHARLEN(fc, Feptr, len);4770if (fc >= 256 ||(mb->ctypes[fc] & ctype_digit) == 0) break;4771Feptr+= len;4772}4773break;47744775case OP_NOT_WHITESPACE:4776for (i = Lmin; i < Lmax; i++)4777{4778int len = 1;4779if (Feptr >= mb->end_subject)4780{4781SCHECK_PARTIAL();4782break;4783}4784GETCHARLEN(fc, Feptr, len);4785if (fc < 256 && (mb->ctypes[fc] & ctype_space) != 0) break;4786Feptr+= len;4787}4788break;47894790case OP_WHITESPACE:4791for (i = Lmin; i < Lmax; i++)4792{4793int len = 1;4794if (Feptr >= mb->end_subject)4795{4796SCHECK_PARTIAL();4797break;4798}4799GETCHARLEN(fc, Feptr, len);4800if (fc >= 256 ||(mb->ctypes[fc] & ctype_space) == 0) break;4801Feptr+= len;4802}4803break;48044805case OP_NOT_WORDCHAR:4806for (i = Lmin; i < Lmax; i++)4807{4808int len = 1;4809if (Feptr >= mb->end_subject)4810{4811SCHECK_PARTIAL();4812break;4813}4814GETCHARLEN(fc, Feptr, len);4815if (fc < 256 && (mb->ctypes[fc] & ctype_word) != 0) break;4816Feptr+= len;4817}4818break;48194820case OP_WORDCHAR:4821for (i = Lmin; i < Lmax; i++)4822{4823int len = 1;4824if (Feptr >= mb->end_subject)4825{4826SCHECK_PARTIAL();4827break;4828}4829GETCHARLEN(fc, Feptr, len);4830if (fc >= 256 || (mb->ctypes[fc] & ctype_word) == 0) break;4831Feptr+= len;4832}4833break;48344835default:4836PCRE2_DEBUG_UNREACHABLE();4837return PCRE2_ERROR_INTERNAL;4838}48394840if (reptype == REPTYPE_POS) continue; /* No backtracking */48414842/* After \C in UTF mode, Lstart_eptr might be in the middle of a4843Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't go4844too far. 
*/48454846for(;;)4847{4848if (Feptr <= Lstart_eptr) break;4849RMATCH(Fecode, RM220);4850if (rrc != MATCH_NOMATCH) RRETURN(rrc);4851Feptr--;4852BACKCHAR(Feptr);4853if (Lctype == OP_ANYNL && Feptr > Lstart_eptr &&4854UCHAR21(Feptr) == CHAR_NL && UCHAR21(Feptr - 1) == CHAR_CR)4855Feptr--;4856}4857}4858else4859#endif /* SUPPORT_UNICODE */48604861/* Not UTF mode */4862{4863switch(Lctype)4864{4865case OP_ANY:4866for (i = Lmin; i < Lmax; i++)4867{4868if (Feptr >= mb->end_subject)4869{4870SCHECK_PARTIAL();4871break;4872}4873if (IS_NEWLINE(Feptr)) break;4874if (mb->partial != 0 && /* Take care with CRLF partial */4875Feptr + 1 >= mb->end_subject &&4876NLBLOCK->nltype == NLTYPE_FIXED &&4877NLBLOCK->nllen == 2 &&4878*Feptr == NLBLOCK->nl[0])4879{4880mb->hitend = TRUE;4881if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;4882}4883Feptr++;4884}4885break;48864887case OP_ALLANY:4888case OP_ANYBYTE:4889fc = Lmax - Lmin;4890if (fc > (uint32_t)(mb->end_subject - Feptr))4891{4892Feptr = mb->end_subject;4893SCHECK_PARTIAL();4894}4895else Feptr += fc;4896break;48974898case OP_ANYNL:4899for (i = Lmin; i < Lmax; i++)4900{4901if (Feptr >= mb->end_subject)4902{4903SCHECK_PARTIAL();4904break;4905}4906fc = *Feptr;4907if (fc == CHAR_CR)4908{4909if (++Feptr >= mb->end_subject) break;4910if (*Feptr == CHAR_LF) Feptr++;4911}4912else4913{4914if (fc != CHAR_LF && (mb->bsr_convention == PCRE2_BSR_ANYCRLF ||4915(fc != CHAR_VT && fc != CHAR_FF && fc != CHAR_NEL4916#if PCRE2_CODE_UNIT_WIDTH != 84917&& fc != 0x2028 && fc != 0x20294918#endif4919))) break;4920Feptr++;4921}4922}4923break;49244925case OP_NOT_HSPACE:4926for (i = Lmin; i < Lmax; i++)4927{4928if (Feptr >= mb->end_subject)4929{4930SCHECK_PARTIAL();4931break;4932}4933switch(*Feptr)4934{4935default: Feptr++; break;4936HSPACE_BYTE_CASES:4937#if PCRE2_CODE_UNIT_WIDTH != 84938HSPACE_MULTIBYTE_CASES:4939#endif4940goto ENDLOOP00;4941}4942}4943ENDLOOP00:4944break;49454946case OP_HSPACE:4947for (i = Lmin; i < Lmax; i++)4948{4949if (Feptr >= 
mb->end_subject)4950{4951SCHECK_PARTIAL();4952break;4953}4954switch(*Feptr)4955{4956default: goto ENDLOOP01;4957HSPACE_BYTE_CASES:4958#if PCRE2_CODE_UNIT_WIDTH != 84959HSPACE_MULTIBYTE_CASES:4960#endif4961Feptr++; break;4962}4963}4964ENDLOOP01:4965break;49664967case OP_NOT_VSPACE:4968for (i = Lmin; i < Lmax; i++)4969{4970if (Feptr >= mb->end_subject)4971{4972SCHECK_PARTIAL();4973break;4974}4975switch(*Feptr)4976{4977default: Feptr++; break;4978VSPACE_BYTE_CASES:4979#if PCRE2_CODE_UNIT_WIDTH != 84980VSPACE_MULTIBYTE_CASES:4981#endif4982goto ENDLOOP02;4983}4984}4985ENDLOOP02:4986break;49874988case OP_VSPACE:4989for (i = Lmin; i < Lmax; i++)4990{4991if (Feptr >= mb->end_subject)4992{4993SCHECK_PARTIAL();4994break;4995}4996switch(*Feptr)4997{4998default: goto ENDLOOP03;4999VSPACE_BYTE_CASES:5000#if PCRE2_CODE_UNIT_WIDTH != 85001VSPACE_MULTIBYTE_CASES:5002#endif5003Feptr++; break;5004}5005}5006ENDLOOP03:5007break;50085009case OP_NOT_DIGIT:5010for (i = Lmin; i < Lmax; i++)5011{5012if (Feptr >= mb->end_subject)5013{5014SCHECK_PARTIAL();5015break;5016}5017if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_digit) != 0)5018break;5019Feptr++;5020}5021break;50225023case OP_DIGIT:5024for (i = Lmin; i < Lmax; i++)5025{5026if (Feptr >= mb->end_subject)5027{5028SCHECK_PARTIAL();5029break;5030}5031if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_digit) == 0)5032break;5033Feptr++;5034}5035break;50365037case OP_NOT_WHITESPACE:5038for (i = Lmin; i < Lmax; i++)5039{5040if (Feptr >= mb->end_subject)5041{5042SCHECK_PARTIAL();5043break;5044}5045if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_space) != 0)5046break;5047Feptr++;5048}5049break;50505051case OP_WHITESPACE:5052for (i = Lmin; i < Lmax; i++)5053{5054if (Feptr >= mb->end_subject)5055{5056SCHECK_PARTIAL();5057break;5058}5059if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_space) == 0)5060break;5061Feptr++;5062}5063break;50645065case OP_NOT_WORDCHAR:5066for (i = Lmin; i < Lmax; i++)5067{5068if (Feptr >= 
mb->end_subject)5069{5070SCHECK_PARTIAL();5071break;5072}5073if (MAX_255(*Feptr) && (mb->ctypes[*Feptr] & ctype_word) != 0)5074break;5075Feptr++;5076}5077break;50785079case OP_WORDCHAR:5080for (i = Lmin; i < Lmax; i++)5081{5082if (Feptr >= mb->end_subject)5083{5084SCHECK_PARTIAL();5085break;5086}5087if (!MAX_255(*Feptr) || (mb->ctypes[*Feptr] & ctype_word) == 0)5088break;5089Feptr++;5090}5091break;50925093default:5094PCRE2_DEBUG_UNREACHABLE();5095return PCRE2_ERROR_INTERNAL;5096}50975098if (reptype == REPTYPE_POS) continue; /* No backtracking */50995100for (;;)5101{5102if (Feptr == Lstart_eptr) break;5103RMATCH(Fecode, RM34);5104if (rrc != MATCH_NOMATCH) RRETURN(rrc);5105Feptr--;5106if (Lctype == OP_ANYNL && Feptr > Lstart_eptr && *Feptr == CHAR_LF &&5107Feptr[-1] == CHAR_CR) Feptr--;5108}5109}5110}5111break; /* End of repeat character type processing */51125113#undef Lstart_eptr5114#undef Lmin5115#undef Lmax5116#undef Lctype5117#undef Lpropvalue511851195120/* ===================================================================== */5121/* Match a back reference, possibly repeatedly. Look past the end of the5122item to see if there is repeat information following. The OP_REF and5123OP_REFI opcodes are used for a reference to a numbered group or to a5124non-duplicated named group. For a duplicated named group, OP_DNREF and5125OP_DNREFI are used. In this case we must scan the list of groups to which5126the name refers, and use the first one that is set. */51275128#define Lmin F->temp_32[0]5129#define Lmax F->temp_32[1]5130#define Lcaseless F->temp_32[2]5131#define Lcaseopts F->temp_32[3]5132#define Lstart F->temp_sptr[0]5133#define Loffset F->temp_size51345135case OP_DNREF:5136case OP_DNREFI:5137Lcaseless = (Fop == OP_DNREFI);5138Lcaseopts = (Fop == OP_DNREFI)? Fecode[1 + 2*IMM2_SIZE] : 0;5139{5140int count = GET2(Fecode, 1+IMM2_SIZE);5141PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;5142Fecode += 1 + 2*IMM2_SIZE + (Fop == OP_DNREFI? 
1 : 0);51435144while (count-- > 0)5145{5146Loffset = (GET2(slot, 0) << 1) - 2;5147if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET) break;5148slot += mb->name_entry_size;5149}5150}5151goto REF_REPEAT;51525153case OP_REF:5154case OP_REFI:5155Lcaseless = (Fop == OP_REFI);5156Lcaseopts = (Fop == OP_REFI)? Fecode[1 + IMM2_SIZE] : 0;5157Loffset = (GET2(Fecode, 1) << 1) - 2;5158Fecode += 1 + IMM2_SIZE + (Fop == OP_REFI? 1 : 0);51595160/* Set up for repetition, or handle the non-repeated case. The maximum and5161minimum must be in the heap frame, but as they are short-term values, we5162use temporary fields. */51635164REF_REPEAT:5165switch (*Fecode)5166{5167case OP_CRSTAR:5168case OP_CRMINSTAR:5169case OP_CRPLUS:5170case OP_CRMINPLUS:5171case OP_CRQUERY:5172case OP_CRMINQUERY:5173fc = *Fecode++ - OP_CRSTAR;5174Lmin = rep_min[fc];5175Lmax = rep_max[fc];5176reptype = rep_typ[fc];5177break;51785179case OP_CRRANGE:5180case OP_CRMINRANGE:5181Lmin = GET2(Fecode, 1);5182Lmax = GET2(Fecode, 1 + IMM2_SIZE);5183reptype = rep_typ[*Fecode - OP_CRSTAR];5184if (Lmax == 0) Lmax = UINT32_MAX; /* Max 0 => infinity */5185Fecode += 1 + 2 * IMM2_SIZE;5186break;51875188default: /* No repeat follows */5189{5190rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &length);5191if (rrc != 0)5192{5193if (rrc > 0) Feptr = mb->end_subject; /* Partial match */5194CHECK_PARTIAL();5195RRETURN(MATCH_NOMATCH);5196}5197}5198Feptr += length;5199continue; /* With the main loop */5200}52015202/* Handle repeated back references. If a set group has length zero, just5203continue with the main loop, because it matches however many times. For an5204unset reference, if the minimum is zero, we can also just continue. We can5205also continue if PCRE2_MATCH_UNSET_BACKREF is set, because this makes unset5206group behave as a zero-length group. For any other unset cases, carrying5207on will result in NOMATCH. 
*/52085209if (Loffset < Foffset_top && Fovector[Loffset] != PCRE2_UNSET)5210{5211if (Fovector[Loffset] == Fovector[Loffset + 1]) continue;5212}5213else /* Group is not set */5214{5215if (Lmin == 0 || (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)5216continue;5217}52185219/* First, ensure the minimum number of matches are present. */52205221for (i = 1; i <= Lmin; i++)5222{5223PCRE2_SIZE slength;5224rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);5225if (rrc != 0)5226{5227if (rrc > 0) Feptr = mb->end_subject; /* Partial match */5228CHECK_PARTIAL();5229RRETURN(MATCH_NOMATCH);5230}5231Feptr += slength;5232}52335234/* If min = max, we are done. They are not both allowed to be zero. */52355236if (Lmin == Lmax) continue;52375238/* If minimizing, keep trying and advancing the pointer. */52395240if (reptype == REPTYPE_MIN)5241{5242for (;;)5243{5244PCRE2_SIZE slength;5245RMATCH(Fecode, RM20);5246if (rrc != MATCH_NOMATCH) RRETURN(rrc);5247if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);5248rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);5249if (rrc != 0)5250{5251if (rrc > 0) Feptr = mb->end_subject; /* Partial match */5252CHECK_PARTIAL();5253RRETURN(MATCH_NOMATCH);5254}5255Feptr += slength;5256}52575258PCRE2_UNREACHABLE(); /* Control never reaches here */5259}52605261/* If maximizing, find the longest string and work backwards, as long as5262the matched lengths for each iteration are the same. */52635264else5265{5266BOOL samelengths = TRUE;5267Lstart = Feptr; /* Starting position */5268Flength = Fovector[Loffset+1] - Fovector[Loffset];52695270for (i = Lmin; i < Lmax; i++)5271{5272PCRE2_SIZE slength;5273rrc = match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);5274if (rrc != 0)5275{5276/* Can't use CHECK_PARTIAL because we don't want to update Feptr in5277the soft partial matching case. 
*/52785279if (rrc > 0 && mb->partial != 0 &&5280mb->end_subject > mb->start_used_ptr)5281{5282mb->hitend = TRUE;5283if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;5284}5285break;5286}52875288if (slength != Flength) samelengths = FALSE;5289Feptr += slength;5290}52915292/* If the length matched for each repetition is the same as the length of5293the captured group, we can easily work backwards. This is the normal5294case. However, in caseless UTF-8 mode there are pairs of case-equivalent5295characters whose lengths (in terms of code units) differ. However, this5296is very rare, so we handle it by re-matching fewer and fewer times. */52975298if (samelengths)5299{5300while (Feptr >= Lstart)5301{5302RMATCH(Fecode, RM21);5303if (rrc != MATCH_NOMATCH) RRETURN(rrc);5304Feptr -= Flength;5305}5306}53075308/* The rare case of non-matching lengths. Re-scan the repetition for each5309iteration. We know that match_ref() will succeed every time. */53105311else5312{5313Lmax = i;5314for (;;)5315{5316RMATCH(Fecode, RM22);5317if (rrc != MATCH_NOMATCH) RRETURN(rrc);5318if (Feptr == Lstart) break; /* Failed after minimal repetition */5319Feptr = Lstart;5320Lmax--;5321for (i = Lmin; i < Lmax; i++)5322{5323PCRE2_SIZE slength;5324(void)match_ref(Loffset, Lcaseless, Lcaseopts, F, mb, &slength);5325Feptr += slength;5326}5327}5328}53295330RRETURN(MATCH_NOMATCH);5331}53325333PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */53345335#undef Lcaseless5336#undef Lmin5337#undef Lmax5338#undef Lstart5339#undef Loffset5340534153425343/* ========================================================================= */5344/* Opcodes for the start of various parenthesized items */5345/* ========================================================================= */53465347/* In all cases, if the result of RMATCH() is MATCH_THEN, check whether the5348(*THEN) is within the current branch by comparing the address of OP_THEN5349that is passed back with the end of the branch. 
If (*THEN) is within the5350current branch, and the branch is one of two or more alternatives (it5351either starts or ends with OP_ALT), we have reached the limit of THEN's5352action, so convert the return code to NOMATCH, which will cause normal5353backtracking to happen from now on. Otherwise, THEN is passed back to an5354outer alternative. This implements Perl's treatment of parenthesized5355groups, where a group not containing | does not affect the current5356alternative, that is, (X) is NOT the same as (X|(*F)). */535753585359/* ===================================================================== */5360/* BRAZERO, BRAMINZERO and SKIPZERO occur just before a non-possessive5361bracket group, indicating that it may occur zero times. It may repeat5362infinitely, or not at all - i.e. it could be ()* or ()? or even (){0} in5363the pattern. Brackets with fixed upper repeat limits are compiled as a5364number of copies, with the optional ones preceded by BRAZERO or BRAMINZERO.5365Possessive groups with possible zero repeats are preceded by BRAPOSZERO. */53665367#define Lnext_ecode F->temp_sptr[0]53685369case OP_BRAZERO:5370Lnext_ecode = Fecode + 1;5371RMATCH(Lnext_ecode, RM9);5372if (rrc != MATCH_NOMATCH) RRETURN(rrc);5373do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT);5374Fecode = Lnext_ecode + 1 + LINK_SIZE;5375break;53765377case OP_BRAMINZERO:5378Lnext_ecode = Fecode + 1;5379do Lnext_ecode += GET(Lnext_ecode, 1); while (*Lnext_ecode == OP_ALT);5380RMATCH(Lnext_ecode + 1 + LINK_SIZE, RM10);5381if (rrc != MATCH_NOMATCH) RRETURN(rrc);5382Fecode++;5383break;53845385#undef Lnext_ecode53865387case OP_SKIPZERO:5388Fecode++;5389do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);5390Fecode += 1 + LINK_SIZE;5391break;539253935394/* ===================================================================== */5395/* Handle possessive brackets with an unlimited repeat. 
The end of these5396brackets will always be OP_KETRPOS, which returns MATCH_KETRPOS without5397going further in the pattern. */53985399#define Lframe_type F->temp_32[0]5400#define Lmatched_once F->temp_32[1]5401#define Lzero_allowed F->temp_32[2]5402#define Lstart_eptr F->temp_sptr[0]5403#define Lstart_group F->temp_sptr[1]54045405case OP_BRAPOSZERO:5406Lzero_allowed = TRUE; /* Zero repeat is allowed */5407Fecode += 1;5408if (*Fecode == OP_CBRAPOS || *Fecode == OP_SCBRAPOS)5409goto POSSESSIVE_CAPTURE;5410goto POSSESSIVE_NON_CAPTURE;54115412case OP_BRAPOS:5413case OP_SBRAPOS:5414Lzero_allowed = FALSE; /* Zero repeat not allowed */54155416POSSESSIVE_NON_CAPTURE:5417Lframe_type = GF_NOCAPTURE; /* Remembered frame type */5418goto POSSESSIVE_GROUP;54195420case OP_CBRAPOS:5421case OP_SCBRAPOS:5422Lzero_allowed = FALSE; /* Zero repeat not allowed */54235424POSSESSIVE_CAPTURE:5425number = GET2(Fecode, 1+LINK_SIZE);5426Lframe_type = GF_CAPTURE | number; /* Remembered frame type */54275428POSSESSIVE_GROUP:5429Lmatched_once = FALSE; /* Never matched */5430Lstart_group = Fecode; /* Start of this group */54315432for (;;)5433{5434Lstart_eptr = Feptr; /* Position at group start */5435group_frame_type = Lframe_type;5436RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM8);5437if (rrc == MATCH_KETRPOS)5438{5439Lmatched_once = TRUE; /* Matched at least once */5440if (Feptr == Lstart_eptr) /* Empty match; skip to end */5441{5442do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);5443break;5444}54455446Fecode = Lstart_group;5447continue;5448}54495450/* See comment above about handling THEN. 
*/54515452if (rrc == MATCH_THEN)5453{5454PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);5455if (mb->verb_ecode_ptr < next_ecode &&5456(*Fecode == OP_ALT || *next_ecode == OP_ALT))5457rrc = MATCH_NOMATCH;5458}54595460if (rrc != MATCH_NOMATCH) RRETURN(rrc);5461Fecode += GET(Fecode, 1);5462if (*Fecode != OP_ALT) break;5463}54645465/* Success if matched something or zero repeat allowed */54665467if (Lmatched_once || Lzero_allowed)5468{5469Fecode += 1 + LINK_SIZE;5470break;5471}54725473RRETURN(MATCH_NOMATCH);54745475#undef Lmatched_once5476#undef Lzero_allowed5477#undef Lframe_type5478#undef Lstart_eptr5479#undef Lstart_group548054815482/* ===================================================================== */5483/* Handle non-capturing brackets that cannot match an empty string. When we5484get to the final alternative within the brackets, as long as there are no5485THEN's in the pattern, we can optimize by not recording a new backtracking5486point. (Ideally we should test for a THEN within this group, but we don't5487have that information.) Don't do this if we are at the very top level,5488however, because that would make handling assertions and once-only brackets5489messier when there is nothing to go back to. */54905491#define Lframe_type F->temp_32[0] /* Set for all that use GROUPLOOP */5492#define Lnext_branch F->temp_sptr[0] /* Used only in OP_BRA handling */54935494case OP_BRA:5495if (mb->hasthen || Frdepth == 0)5496{5497Lframe_type = 0;5498goto GROUPLOOP;5499}55005501for (;;)5502{5503Lnext_branch = Fecode + GET(Fecode, 1);5504if (*Lnext_branch != OP_ALT) break;55055506/* This is never the final branch. We do not need to test for MATCH_THEN5507here because this code is not used when there is a THEN in the pattern. */55085509RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM1);5510if (rrc != MATCH_NOMATCH) RRETURN(rrc);5511Fecode = Lnext_branch;5512}55135514/* Hit the start of the final branch. Continue at this level. 
*/55155516Fecode += PRIV(OP_lengths)[*Fecode];5517break;55185519#undef Lnext_branch552055215522/* ===================================================================== */5523/* Handle a capturing bracket, other than those that are possessive with an5524unlimited repeat. */55255526case OP_CBRA:5527case OP_SCBRA:5528Lframe_type = GF_CAPTURE | GET2(Fecode, 1+LINK_SIZE);5529goto GROUPLOOP;553055315532/* ===================================================================== */5533/* Atomic groups and non-capturing brackets that can match an empty string5534must record a backtracking point and also set up a chained frame. */55355536case OP_ONCE:5537case OP_SCRIPT_RUN:5538case OP_SBRA:5539Lframe_type = GF_NOCAPTURE | Fop;55405541GROUPLOOP:5542for (;;)5543{5544group_frame_type = Lframe_type;5545RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM2);5546if (rrc == MATCH_THEN)5547{5548PCRE2_SPTR next_ecode = Fecode + GET(Fecode,1);5549if (mb->verb_ecode_ptr < next_ecode &&5550(*Fecode == OP_ALT || *next_ecode == OP_ALT))5551rrc = MATCH_NOMATCH;5552}5553if (rrc != MATCH_NOMATCH) RRETURN(rrc);5554Fecode += GET(Fecode, 1);5555if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);5556}5557PCRE2_UNREACHABLE(); /* Control never reaches here */55585559#undef Lframe_type556055615562/* ===================================================================== */5563/* Pattern recursion either matches the current regex, or some5564subexpression. The offset data is the offset to the starting bracket from5565the start of the whole pattern. This is so that it works from duplicated5566subpatterns. For a whole-pattern recursion, we have to infer the number5567zero. */55685569#define Lframe_type F->temp_32[0]5570#define Lstart_branch F->temp_sptr[0]55715572case OP_RECURSE:5573bracode = mb->start_code + GET(Fecode, 1);5574number = (bracode == mb->start_code)? 
0 : GET2(bracode, 1 + LINK_SIZE);55755576/* If we are already in a pattern recursion, check for repeating the same5577one without changing the subject pointer or the last referenced character5578in the subject. This should catch convoluted mutual recursions; some5579simple cases are caught at compile time. However, there are rare cases when5580this check needs to be turned off. In this case, actual recursion loops5581will be caught by the match or heap limits. */55825583if (Fcurrent_recurse != RECURSE_UNSET)5584{5585offset = Flast_group_offset;5586while (offset != PCRE2_UNSET)5587{5588N = (heapframe *)((char *)match_data->heapframes + offset);5589P = (heapframe *)((char *)N - frame_size);5590if (N->group_frame_type == (GF_RECURSE | number))5591{5592if (Feptr == P->eptr && mb->last_used_ptr == P->recurse_last_used &&5593(mb->moptions & PCRE2_DISABLE_RECURSELOOP_CHECK) == 0)5594return PCRE2_ERROR_RECURSELOOP;5595break;5596}5597offset = P->last_group_offset;5598}5599}56005601/* Remember the current last referenced character and then run the5602recursion branch by branch. */56035604F->recurse_last_used = mb->last_used_ptr;5605Lstart_branch = bracode;5606Lframe_type = GF_RECURSE | number;56075608for (;;)5609{5610PCRE2_SPTR next_ecode;56115612group_frame_type = Lframe_type;5613RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM11);5614next_ecode = Lstart_branch + GET(Lstart_branch,1);56155616/* Handle backtracking verbs, which are defined in a range that can5617easily be tested for. PCRE does not allow THEN, SKIP, PRUNE or COMMIT to5618escape beyond a recursion; they cause a NOMATCH for the entire recursion.56195620When one of these verbs triggers, the current recursion group number is5621recorded. If it matches the recursion we are processing, the verb5622happened within the recursion and we must deal with it. Otherwise it must5623have happened after the recursion completed, and so has to be passed5624back. See comment above about handling THEN. 
*/56255626if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX &&5627mb->verb_current_recurse == (Lframe_type ^ GF_RECURSE))5628{5629if (rrc == MATCH_THEN && mb->verb_ecode_ptr < next_ecode &&5630(*Lstart_branch == OP_ALT || *next_ecode == OP_ALT))5631rrc = MATCH_NOMATCH;5632else RRETURN(MATCH_NOMATCH);5633}56345635/* Note that carrying on after (*ACCEPT) in a recursion is handled in the5636OP_ACCEPT code. Nothing needs to be done here. */56375638if (rrc != MATCH_NOMATCH) RRETURN(rrc);5639Lstart_branch = next_ecode;5640if (*Lstart_branch != OP_ALT) RRETURN(MATCH_NOMATCH);5641}5642PCRE2_UNREACHABLE(); /* Control never reaches here */56435644#undef Lframe_type5645#undef Lstart_branch564656475648/* ===================================================================== */5649/* Positive assertions are like other groups except that PCRE doesn't allow5650the effect of (*THEN) to escape beyond an assertion; it is therefore5651treated as NOMATCH. (*ACCEPT) is treated as successful assertion, with its5652captures and mark retained. Any other return is an error. 
*/56535654#define Lframe_type F->temp_32[0]56555656case OP_ASSERT:5657case OP_ASSERTBACK:5658case OP_ASSERT_NA:5659case OP_ASSERTBACK_NA:5660Lframe_type = GF_NOCAPTURE | Fop;5661for (;;)5662{5663group_frame_type = Lframe_type;5664RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM3);5665if (rrc == MATCH_ACCEPT)5666{5667memcpy(Fovector,5668(char *)assert_accept_frame + offsetof(heapframe, ovector),5669assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));5670Foffset_top = assert_accept_frame->offset_top;5671Fmark = assert_accept_frame->mark;5672break;5673}5674if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);5675Fecode += GET(Fecode, 1);5676if (*Fecode != OP_ALT) RRETURN(MATCH_NOMATCH);5677}56785679do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);5680Fecode += 1 + LINK_SIZE;5681break;56825683#undef Lframe_type568456855686/* ===================================================================== */5687/* Handle negative assertions. Loop for each non-matching branch as for5688positive assertions. */56895690#define Lframe_type F->temp_32[0]56915692case OP_ASSERT_NOT:5693case OP_ASSERTBACK_NOT:5694Lframe_type = GF_NOCAPTURE | Fop;56955696for (;;)5697{5698group_frame_type = Lframe_type;5699RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM4);5700switch(rrc)5701{5702case MATCH_ACCEPT: /* Assertion matched, therefore it fails. */5703case MATCH_MATCH:5704RRETURN (MATCH_NOMATCH);57055706case MATCH_NOMATCH: /* Branch failed, try next if present. */5707case MATCH_THEN:5708Fecode += GET(Fecode, 1);5709if (*Fecode != OP_ALT) goto ASSERT_NOT_FAILED;5710break;57115712case MATCH_COMMIT: /* Assertion forced to fail, therefore continue. */5713case MATCH_SKIP:5714case MATCH_PRUNE:5715do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);5716goto ASSERT_NOT_FAILED;57175718default: /* Pass back any other return */5719RRETURN(rrc);5720}5721}57225723/* None of the branches have matched or there was a backtrack to (*COMMIT),5724(*SKIP), (*PRUNE), or (*THEN) in the last branch. 
This is success for a5725negative assertion, so carry on. */57265727ASSERT_NOT_FAILED:5728Fecode += 1 + LINK_SIZE;5729break;57305731#undef Lframe_type57325733/* ===================================================================== */5734/* Handle scan substring operation. */57355736#define Lframe_type F->temp_32[0]5737#define Lextra_size F->temp_32[1]5738#define Lsaved_moptions F->temp_32[2]5739#define Lsaved_end_subject F->temp_sptr[0]5740#define Lsaved_eptr F->temp_sptr[1]5741#define Ltrue_end_extra F->temp_size57425743case OP_ASSERT_SCS:5744{5745PCRE2_SPTR ecode = Fecode + 1 + LINK_SIZE;5746uint32_t extra_size = 0;5747int count;5748PCRE2_SPTR slot;57495750/* Disable compiler warning. */5751offset = 0;5752(void)offset;57535754for (;;)5755{5756if (*ecode == OP_CREF)5757{5758extra_size += 1+IMM2_SIZE;5759offset = (GET2(ecode, 1) << 1) - 2;5760ecode += 1+IMM2_SIZE;5761if (offset < Foffset_top && Fovector[offset] != PCRE2_UNSET)5762goto SCS_OFFSET_FOUND;5763continue;5764}57655766if (*ecode != OP_DNCREF) RRETURN(MATCH_NOMATCH);57675768count = GET2(ecode, 1 + IMM2_SIZE);5769slot = mb->name_table + GET2(ecode, 1) * mb->name_entry_size;5770extra_size += 1+2*IMM2_SIZE;5771ecode += 1+2*IMM2_SIZE;57725773while (count > 0)5774{5775offset = (GET2(slot, 0) << 1) - 2;5776if (offset < Foffset_top && Fovector[offset] != PCRE2_UNSET)5777goto SCS_OFFSET_FOUND;5778slot += mb->name_entry_size;5779count--;5780}5781}57825783SCS_OFFSET_FOUND:57845785/* Skip remaining options. 
*/5786for (;;)5787{5788if (*ecode == OP_CREF)5789{5790extra_size += 1+IMM2_SIZE;5791ecode += 1+IMM2_SIZE;5792}5793else if (*ecode == OP_DNCREF)5794{5795extra_size += 1+2*IMM2_SIZE;5796ecode += 1+2*IMM2_SIZE;5797}5798else break;5799}58005801Lextra_size = extra_size;5802}58035804Lsaved_end_subject = mb->end_subject;5805Ltrue_end_extra = mb->true_end_subject - mb->end_subject;5806Lsaved_eptr = Feptr;5807Lsaved_moptions = mb->moptions;58085809Feptr = mb->start_subject + Fovector[offset];5810mb->true_end_subject = mb->end_subject =5811mb->start_subject + Fovector[offset + 1];5812mb->moptions &= ~PCRE2_NOTEOL;58135814Lframe_type = GF_NOCAPTURE | Fop;5815for (;;)5816{5817group_frame_type = Lframe_type;5818RMATCH(Fecode + 1 + LINK_SIZE + Lextra_size, RM38);5819if (rrc == MATCH_ACCEPT)5820{5821memcpy(Fovector,5822(char *)assert_accept_frame + offsetof(heapframe, ovector),5823assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));5824Foffset_top = assert_accept_frame->offset_top;5825Fmark = assert_accept_frame->mark;5826break;5827}58285829if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)5830{5831mb->end_subject = Lsaved_end_subject;5832mb->true_end_subject = mb->end_subject + Ltrue_end_extra;5833mb->moptions = Lsaved_moptions;5834RRETURN(rrc);5835}58365837Fecode += GET(Fecode, 1);5838if (*Fecode != OP_ALT)5839{5840mb->end_subject = Lsaved_end_subject;5841mb->true_end_subject = mb->end_subject + Ltrue_end_extra;5842mb->moptions = Lsaved_moptions;5843RRETURN(MATCH_NOMATCH);5844}5845Lextra_size = 0;5846}58475848do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);5849Fecode += 1 + LINK_SIZE;5850Feptr = Lsaved_eptr;5851break;58525853#undef Lframe_type5854#undef Lextra_size5855#undef Lsaved_end_subject5856#undef Lsaved_eptr5857#undef Ltrue_end_extra5858#undef Lsave_moptions58595860/* ===================================================================== */5861/* The callout item calls an external function, if one is provided, passing5862details of the match so far. 
This is mainly for debugging, though the5863function is able to force a failure. */58645865case OP_CALLOUT:5866case OP_CALLOUT_STR:5867rrc = do_callout(F, mb, &length);5868if (rrc > 0) RRETURN(MATCH_NOMATCH);5869if (rrc < 0) RRETURN(rrc);5870Fecode += length;5871break;587258735874/* ===================================================================== */5875/* Conditional group: compilation checked that there are no more than two5876branches. If the condition is false, skipping the first branch takes us5877past the end of the item if there is only one branch, but that's exactly5878what we want. */58795880case OP_COND:5881case OP_SCOND:58825883/* The variable Flength will be added to Fecode when the condition is5884false, to get to the second branch. Setting it to the offset to the ALT or5885KET, then incrementing Fecode achieves this effect. However, if the second5886branch is non-existent, we must point to the KET so that the end of the5887group is correctly processed. We now have Fecode pointing to the condition5888or callout. */58895890Flength = GET(Fecode, 1); /* Offset to the second branch */5891if (Fecode[Flength] != OP_ALT) Flength -= 1 + LINK_SIZE;5892Fecode += 1 + LINK_SIZE; /* From this opcode */58935894/* Because of the way auto-callout works during compile, a callout item is5895inserted between OP_COND and an assertion condition. Such a callout can5896also be inserted manually. */58975898if (*Fecode == OP_CALLOUT || *Fecode == OP_CALLOUT_STR)5899{5900rrc = do_callout(F, mb, &length);5901if (rrc > 0) RRETURN(MATCH_NOMATCH);5902if (rrc < 0) RRETURN(rrc);59035904/* Advance Fecode past the callout, so it now points to the condition. We5905must adjust Flength so that the value of Fecode+Flength is unchanged. 
*/59065907Fecode += length;5908Flength -= length;5909}59105911/* Test the various possible conditions */59125913condition = FALSE;5914switch(*Fecode)5915{5916case OP_RREF: /* Group recursion test */5917if (Fcurrent_recurse != RECURSE_UNSET)5918{5919number = GET2(Fecode, 1);5920condition = (number == RREF_ANY || number == Fcurrent_recurse);5921}5922break;59235924case OP_DNRREF: /* Duplicate named group recursion test */5925if (Fcurrent_recurse != RECURSE_UNSET)5926{5927int count = GET2(Fecode, 1 + IMM2_SIZE);5928PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;5929while (count-- > 0)5930{5931number = GET2(slot, 0);5932condition = number == Fcurrent_recurse;5933if (condition) break;5934slot += mb->name_entry_size;5935}5936}5937break;59385939case OP_CREF: /* Numbered group used test */5940offset = (GET2(Fecode, 1) << 1) - 2; /* Doubled ref number */5941condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;5942break;59435944case OP_DNCREF: /* Duplicate named group used test */5945{5946int count = GET2(Fecode, 1 + IMM2_SIZE);5947PCRE2_SPTR slot = mb->name_table + GET2(Fecode, 1) * mb->name_entry_size;5948while (count-- > 0)5949{5950offset = (GET2(slot, 0) << 1) - 2;5951condition = offset < Foffset_top && Fovector[offset] != PCRE2_UNSET;5952if (condition) break;5953slot += mb->name_entry_size;5954}5955}5956break;59575958case OP_FALSE:5959case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */5960break;59615962case OP_TRUE:5963condition = TRUE;5964break;59655966/* The condition is an assertion. Run code similar to the assertion code5967above. 
*/59685969#define Lpositive F->temp_32[0]5970#define Lstart_branch F->temp_sptr[0]59715972default:5973Lpositive = (*Fecode == OP_ASSERT || *Fecode == OP_ASSERTBACK);5974Lstart_branch = Fecode;59755976for (;;)5977{5978group_frame_type = GF_CONDASSERT | *Fecode;5979RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM5);59805981switch(rrc)5982{5983case MATCH_ACCEPT: /* Save captures */5984memcpy(Fovector,5985(char *)assert_accept_frame + offsetof(heapframe, ovector),5986assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));5987Foffset_top = assert_accept_frame->offset_top;59885989/* Fall through */5990/* In the case of a match, the captures have already been put into5991the current frame. */59925993case MATCH_MATCH:5994condition = Lpositive; /* TRUE for positive assertion */5995break;59965997/* PCRE doesn't allow the effect of (*THEN) to escape beyond an5998assertion; it is therefore always treated as NOMATCH. */59996000case MATCH_NOMATCH:6001case MATCH_THEN:6002Lstart_branch += GET(Lstart_branch, 1);6003if (*Lstart_branch == OP_ALT) continue; /* Try next branch */6004condition = !Lpositive; /* TRUE for negative assertion */6005break;60066007/* These force no match without checking other branches. */60086009case MATCH_COMMIT:6010case MATCH_SKIP:6011case MATCH_PRUNE:6012condition = !Lpositive;6013break;60146015default:6016RRETURN(rrc);6017}6018break; /* Out of the branch loop */6019}60206021/* If the condition is true, find the end of the assertion so that6022advancing past it gets us to the start of the first branch. */60236024if (condition)6025{6026do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT);6027}6028break; /* End of assertion condition */6029}60306031#undef Lpositive6032#undef Lstart_branch60336034/* Choose branch according to the condition. */60356036Fecode += condition? PRIV(OP_lengths)[*Fecode] : Flength;60376038/* If the opcode is OP_SCOND it means we are at a repeated conditional6039group that might match an empty string. 
We must therefore descend a level6040so that the start is remembered for checking. For OP_COND we can just6041continue at this level. */60426043if (Fop == OP_SCOND)6044{6045group_frame_type = GF_NOCAPTURE | Fop;6046RMATCH(Fecode, RM35);6047RRETURN(rrc);6048}6049break;6050605160526053/* ========================================================================= */6054/* End of start of parenthesis opcodes */6055/* ========================================================================= */605660576058/* ===================================================================== */6059/* Move the subject pointer back by one fixed amount. This occurs at the6060start of each branch that has a fixed length in a lookbehind assertion. If6061we are too close to the start to move back, fail. When working with UTF-86062we move back a number of characters, not bytes. */60636064case OP_REVERSE:6065number = GET2(Fecode, 1);6066#ifdef SUPPORT_UNICODE6067if (utf)6068{6069/* We used to do a simpler `while (number-- > 0)` but that triggers6070clang's unsigned integer overflow sanitizer. */6071while (number > 0)6072{6073--number;6074if (Feptr <= mb->check_subject) RRETURN(MATCH_NOMATCH);6075Feptr--;6076BACKCHAR(Feptr);6077}6078}6079else6080#endif60816082/* No UTF support, or not in UTF mode: count is code unit count */60836084{6085if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH);6086Feptr -= number;6087}60886089/* Save the earliest consulted character, then skip to next opcode */60906091if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr;6092Fecode += 1 + IMM2_SIZE;6093break;609460956096/* ===================================================================== */6097/* Move the subject pointer back by a variable amount. This occurs at the6098start of each branch of a lookbehind assertion when the branch has a6099variable, but limited, length. A loop is needed to try matching the branch6100after moving back different numbers of characters. 
If we are too close to6101the start to move back even the minimum amount, fail. When working with6102UTF-8 we move back a number of characters, not bytes. */61036104#define Lmin F->temp_32[0]6105#define Lmax F->temp_32[1]6106#define Leptr F->temp_sptr[0]61076108case OP_VREVERSE:6109Lmin = GET2(Fecode, 1);6110Lmax = GET2(Fecode, 1 + IMM2_SIZE);6111Leptr = Feptr;61126113/* Move back by the maximum branch length and then work forwards. This6114ensures that items such as \d{3,5} get the maximum length, which is6115relevant for captures, and makes for Perl compatibility. */61166117#ifdef SUPPORT_UNICODE6118if (utf)6119{6120for (i = 0; i < Lmax; i++)6121{6122if (Feptr == mb->start_subject)6123{6124if (i < Lmin) RRETURN(MATCH_NOMATCH);6125Lmax = i;6126break;6127}6128Feptr--;6129BACKCHAR(Feptr);6130}6131}6132else6133#endif61346135/* No UTF support or not in UTF mode */61366137{6138ptrdiff_t diff = Feptr - mb->start_subject;6139uint32_t available = (diff > 65535)? 65535 : ((diff > 0)? (int)diff : 0);6140if (Lmin > available) RRETURN(MATCH_NOMATCH);6141if (Lmax > available) Lmax = available;6142Feptr -= Lmax;6143}61446145/* Now try matching, moving forward one character on failure, until we6146reach the minimum back length. */61476148for (;;)6149{6150RMATCH(Fecode + 1 + 2 * IMM2_SIZE, RM37);6151if (rrc != MATCH_NOMATCH) RRETURN(rrc);6152if (Lmax-- <= Lmin) RRETURN(MATCH_NOMATCH);6153Feptr++;6154#ifdef SUPPORT_UNICODE6155if (utf) { FORWARDCHARTEST(Feptr, mb->end_subject); }6156#endif6157}6158PCRE2_UNREACHABLE(); /* Control never reaches here */61596160#undef Lmin6161#undef Lmax6162#undef Leptr61636164/* ===================================================================== */6165/* An alternation is the end of a branch; scan along to find the end of the6166bracketed group. 
*/61676168case OP_ALT:6169branch_end = Fecode;6170do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);6171break;617261736174/* ===================================================================== */6175/* The end of a parenthesized group. For all but OP_BRA and OP_COND, the6176starting frame was added to the chained frames in order to remember the6177starting subject position for the group. (Not true for OP_BRA when it's a6178whole pattern recursion, but that is handled separately below.)*/61796180case OP_KET:6181case OP_KETRMIN:6182case OP_KETRMAX:6183case OP_KETRPOS:61846185bracode = Fecode - GET(Fecode, 1);61866187if (branch_end == NULL) branch_end = Fecode;6188branch_start = bracode;6189while (branch_start + GET(branch_start, 1) != branch_end)6190branch_start += GET(branch_start, 1);6191branch_end = NULL;61926193/* Point N to the frame at the start of the most recent group, and P to its6194predecessor. Remember the subject pointer at the start of the group. */61956196if (*bracode != OP_BRA && *bracode != OP_COND)6197{6198N = (heapframe *)((char *)match_data->heapframes + Flast_group_offset);6199P = (heapframe *)((char *)N - frame_size);6200Flast_group_offset = P->last_group_offset;62016202#ifdef DEBUG_SHOW_RMATCH6203fprintf(stderr, "++ KET for frame=%d type=%x prev char offset=%lu\n",6204N->rdepth, N->group_frame_type,6205(char *)P->eptr - (char *)mb->start_subject);6206#endif62076208/* If we are at the end of an assertion that is a condition, first check6209to see if we are at the end of a variable-length branch in a lookbehind.6210If this is the case and we have not landed on the current character,6211return no match. Compare code below for non-condition lookbehinds. In6212other cases, return a match, discarding any intermediate backtracking6213points. Copy back the mark setting and the captures into the frame before6214N so that they are set on return. Doing this for all assertions, both6215positive and negative, seems to match what Perl does. 
*/62166217if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT)6218{6219if ((*bracode == OP_ASSERTBACK || *bracode == OP_ASSERTBACK_NOT) &&6220branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)6221RRETURN(MATCH_NOMATCH);6222memcpy((char *)P + offsetof(heapframe, ovector), Fovector,6223Foffset_top * sizeof(PCRE2_SIZE));6224P->offset_top = Foffset_top;6225P->mark = Fmark;6226Fback_frame = (char *)F - (char *)P;6227RRETURN(MATCH_MATCH);6228}6229}6230else P = NULL; /* Indicates starting frame not recorded */62316232/* The group was not a conditional assertion. */62336234switch (*bracode)6235{6236/* Whole pattern recursion is handled as a recursion into group 0, but6237the entire pattern is wrapped in OP_BRA/OP_KET rather than a capturing6238group - a design mistake: it should perhaps have been capture group 0.6239Anyway, that means the end of such recursion must be handled here. It is6240detected by checking for an immediately following OP_END when we are6241recursing in group 0. If this is not the end of a whole-pattern6242recursion, there is nothing to be done. */62436244case OP_BRA:6245if (Fcurrent_recurse != 0 || Fecode[1+LINK_SIZE] != OP_END) break;62466247/* It is the end of whole-pattern recursion. */62486249offset = Flast_group_offset;62506251/* Corrupted heapframes?. Trigger an assert and return an error */6252PCRE2_ASSERT(offset != PCRE2_UNSET);6253if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;62546255N = (heapframe *)((char *)match_data->heapframes + offset);6256P = (heapframe *)((char *)N - frame_size);6257Flast_group_offset = P->last_group_offset;62586259/* Reinstate the previous set of captures and then carry on after the6260recursion call. 
*/62616262memcpy((char *)F + offsetof(heapframe, ovector), P->ovector,6263Foffset_top * sizeof(PCRE2_SIZE));6264Foffset_top = P->offset_top;6265Fcapture_last = P->capture_last;6266Fcurrent_recurse = P->current_recurse;6267Fecode = P->ecode + 1 + LINK_SIZE;6268continue; /* With next opcode */62696270case OP_COND: /* No need to do anything for these */6271case OP_SCOND:6272break;62736274/* Non-atomic positive assertions are like OP_BRA, except that the6275subject pointer must be put back to where it was at the start of the6276assertion. For a variable lookbehind, check its end point. */62776278case OP_ASSERTBACK_NA:6279if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)6280RRETURN(MATCH_NOMATCH);6281/* Fall through */62826283case OP_ASSERT_NA:6284if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;6285Feptr = P->eptr;6286break;62876288/* Atomic positive assertions are like OP_ONCE, except that in addition6289the subject pointer must be put back to where it was at the start of the6290assertion. For a variable lookbehind, check its end point. */62916292case OP_ASSERTBACK:6293if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)6294RRETURN(MATCH_NOMATCH);6295/* Fall through */62966297case OP_ASSERT:6298if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;6299Feptr = P->eptr;6300/* Fall through */63016302/* For an atomic group, discard internal backtracking points. We must6303also ensure that any remaining branches within the top-level of the group6304are not tried. Do this by adjusting the code pointer within the backtrack6305frame so that it points to the final branch. */63066307case OP_ONCE:6308Fback_frame = ((char *)F - (char *)P);6309for (;;)6310{6311uint32_t y = GET(P->ecode,1);6312if ((P->ecode)[y] != OP_ALT) break;6313P->ecode += y;6314}6315break;63166317/* A matching negative assertion returns MATCH, which is turned into6318NOMATCH at the assertion level. For a variable lookbehind, check its end6319point. 
*/63206321case OP_ASSERTBACK_NOT:6322if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)6323RRETURN(MATCH_NOMATCH);6324/* Fall through */63256326case OP_ASSERT_NOT:6327RRETURN(MATCH_MATCH);63286329/* A scan substring group must preserve the current end_subject,6330and restore it before the backtracking is performed into its sub6331pattern. */63326333case OP_ASSERT_SCS:6334F->temp_sptr[0] = mb->end_subject;6335mb->end_subject = P->temp_sptr[0];6336mb->true_end_subject = mb->end_subject + P->temp_size;6337Feptr = P->temp_sptr[1];63386339RMATCH(Fecode + 1 + LINK_SIZE, RM39);63406341mb->end_subject = F->temp_sptr[0];6342mb->true_end_subject = mb->end_subject;6343RRETURN(rrc);6344break;63456346/* At the end of a script run, apply the script-checking rules. This code6347will never by exercised if Unicode support it not compiled, because in6348that environment script runs cause an error at compile time. */63496350case OP_SCRIPT_RUN:6351if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH);6352break;63536354/* Whole-pattern recursion is coded as a recurse into group 0, and is6355handled with OP_BRA above. Other recursion is handled here. */63566357case OP_CBRA:6358case OP_CBRAPOS:6359case OP_SCBRA:6360case OP_SCBRAPOS:6361number = GET2(bracode, 1+LINK_SIZE);63626363/* Handle a recursively called group. We reinstate the previous set of6364captures and then carry on after the recursion call. */63656366if (Fcurrent_recurse == number)6367{6368P = (heapframe *)((char *)N - frame_size);6369memcpy((char *)F + offsetof(heapframe, ovector), P->ovector,6370Foffset_top * sizeof(PCRE2_SIZE));6371Foffset_top = P->offset_top;6372Fcapture_last = P->capture_last;6373Fcurrent_recurse = P->current_recurse;6374Fecode = P->ecode + 1 + LINK_SIZE;6375continue; /* With next opcode */6376}63776378/* Deal with actual capturing. 
*/63796380offset = (number << 1) - 2;6381Fcapture_last = number;6382Fovector[offset] = P->eptr - mb->start_subject;6383Fovector[offset+1] = Feptr - mb->start_subject;6384if (offset >= Foffset_top) Foffset_top = offset + 2;6385break;6386} /* End actions relating to the starting opcode */63876388/* OP_KETRPOS is a possessive repeating ket. Remember the current position,6389and return the MATCH_KETRPOS. This makes it possible to do the repeats one6390at a time from the outer level. This must precede the empty string test -6391in this case that test is done at the outer level. */63926393if (*Fecode == OP_KETRPOS)6394{6395memcpy((char *)P + offsetof(heapframe, eptr),6396(char *)F + offsetof(heapframe, eptr),6397frame_copy_size);6398RRETURN(MATCH_KETRPOS);6399}64006401/* Handle the different kinds of closing brackets. A non-repeating ket6402needs no special action, just continuing at this level. This also happens6403for the repeating kets if the group matched no characters, in order to6404forcibly break infinite loops. Otherwise, the repeating kets try the rest6405of the pattern or restart from the preceding bracket, in the appropriate6406order. */64076408if (Fop != OP_KET && (P == NULL || Feptr != P->eptr))6409{6410if (Fop == OP_KETRMIN)6411{6412RMATCH(Fecode + 1 + LINK_SIZE, RM6);6413if (rrc != MATCH_NOMATCH) RRETURN(rrc);6414Fecode -= GET(Fecode, 1);6415break; /* End of ket processing */6416}64176418/* Repeat the maximum number of times (KETRMAX) */64196420RMATCH(bracode, RM7);6421if (rrc != MATCH_NOMATCH) RRETURN(rrc);6422}64236424/* Carry on at this level for a non-repeating ket, or after matching an6425empty string, or after repeating for a maximum number of times. */64266427Fecode += 1 + LINK_SIZE;6428break;642964306431/* ===================================================================== */6432/* Start and end of line assertions, not multiline mode. */64336434case OP_CIRC: /* Start of line, unless PCRE2_NOTBOL is set. 
*/6435if (Feptr != mb->start_subject || (mb->moptions & PCRE2_NOTBOL) != 0)6436RRETURN(MATCH_NOMATCH);6437Fecode++;6438break;64396440case OP_SOD: /* Unconditional start of subject */6441if (Feptr != mb->start_subject) RRETURN(MATCH_NOMATCH);6442Fecode++;6443break;64446445/* When PCRE2_NOTEOL is unset, assert before the subject end, or a6446terminating newline unless PCRE2_DOLLAR_ENDONLY is set. */64476448case OP_DOLL:6449if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);6450if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS;64516452/* Fall through */6453/* Unconditional end of subject assertion (\z). */64546455case OP_EOD:6456if (Feptr < mb->true_end_subject) RRETURN(MATCH_NOMATCH);6457if (mb->partial != 0)6458{6459mb->hitend = TRUE;6460if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;6461}6462Fecode++;6463break;64646465/* End of subject or ending \n assertion (\Z) */64666467case OP_EODN:6468ASSERT_NL_OR_EOS:6469if (Feptr < mb->true_end_subject &&6470(!IS_NEWLINE(Feptr) || Feptr != mb->true_end_subject - mb->nllen))6471{6472if (mb->partial != 0 &&6473Feptr + 1 >= mb->end_subject &&6474NLBLOCK->nltype == NLTYPE_FIXED &&6475NLBLOCK->nllen == 2 &&6476UCHAR21TEST(Feptr) == NLBLOCK->nl[0])6477{6478mb->hitend = TRUE;6479if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;6480}6481RRETURN(MATCH_NOMATCH);6482}64836484/* Either at end of string or \n before end. */64856486if (mb->partial != 0)6487{6488mb->hitend = TRUE;6489if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;6490}6491Fecode++;6492break;649364946495/* ===================================================================== */6496/* Start and end of line assertions, multiline mode. */64976498/* Start of subject unless notbol, or after any newline except for one at6499the very end, unless PCRE2_ALT_CIRCUMFLEX is set. 
*/65006501case OP_CIRCM:6502if ((mb->moptions & PCRE2_NOTBOL) != 0 && Feptr == mb->start_subject)6503RRETURN(MATCH_NOMATCH);6504if (Feptr != mb->start_subject &&6505((Feptr == mb->end_subject &&6506(mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) ||6507!WAS_NEWLINE(Feptr)))6508RRETURN(MATCH_NOMATCH);6509Fecode++;6510break;65116512/* Assert before any newline, or before end of subject unless noteol is6513set. */65146515case OP_DOLLM:6516if (Feptr < mb->end_subject)6517{6518if (!IS_NEWLINE(Feptr))6519{6520if (mb->partial != 0 &&6521Feptr + 1 >= mb->end_subject &&6522NLBLOCK->nltype == NLTYPE_FIXED &&6523NLBLOCK->nllen == 2 &&6524UCHAR21TEST(Feptr) == NLBLOCK->nl[0])6525{6526mb->hitend = TRUE;6527if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;6528}6529RRETURN(MATCH_NOMATCH);6530}6531}6532else6533{6534if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);6535SCHECK_PARTIAL();6536}6537Fecode++;6538break;653965406541/* ===================================================================== */6542/* Start of match assertion */65436544case OP_SOM:6545if (Feptr != mb->start_subject + mb->start_offset) RRETURN(MATCH_NOMATCH);6546Fecode++;6547break;654865496550/* ===================================================================== */6551/* Reset the start of match point */65526553case OP_SET_SOM:6554Fstart_match = Feptr;6555Fecode++;6556break;655765586559/* ===================================================================== */6560/* Word boundary assertions. Find out if the previous and current6561characters are "word" characters. It takes a bit more work in UTF mode.6562Characters > 255 are assumed to be "non-word" characters when PCRE2_UCP is6563not set. When it is set, use Unicode properties if available, even when not6564in UTF mode. Remember the earliest and latest consulted characters. 
*/65656566case OP_NOT_WORD_BOUNDARY:6567case OP_WORD_BOUNDARY:6568case OP_NOT_UCP_WORD_BOUNDARY:6569case OP_UCP_WORD_BOUNDARY:6570if (Feptr == mb->check_subject) prev_is_word = FALSE; else6571{6572PCRE2_SPTR lastptr = Feptr - 1;6573#ifdef SUPPORT_UNICODE6574if (utf)6575{6576BACKCHAR(lastptr);6577GETCHAR(fc, lastptr);6578}6579else6580#endif /* SUPPORT_UNICODE */6581fc = *lastptr;6582if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr;6583#ifdef SUPPORT_UNICODE6584if (Fop == OP_UCP_WORD_BOUNDARY || Fop == OP_NOT_UCP_WORD_BOUNDARY)6585{6586int chartype = UCD_CHARTYPE(fc);6587int category = PRIV(ucp_gentype)[chartype];6588prev_is_word = (category == ucp_L || category == ucp_N ||6589chartype == ucp_Mn || chartype == ucp_Pc);6590}6591else6592#endif /* SUPPORT_UNICODE */6593prev_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;6594}65956596/* Get status of next character */65976598if (Feptr >= mb->end_subject)6599{6600SCHECK_PARTIAL();6601cur_is_word = FALSE;6602}6603else6604{6605PCRE2_SPTR nextptr = Feptr + 1;6606#ifdef SUPPORT_UNICODE6607if (utf)6608{6609FORWARDCHARTEST(nextptr, mb->end_subject);6610GETCHAR(fc, Feptr);6611}6612else6613#endif /* SUPPORT_UNICODE */6614fc = *Feptr;6615if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr;6616#ifdef SUPPORT_UNICODE6617if (Fop == OP_UCP_WORD_BOUNDARY || Fop == OP_NOT_UCP_WORD_BOUNDARY)6618{6619int chartype = UCD_CHARTYPE(fc);6620int category = PRIV(ucp_gentype)[chartype];6621cur_is_word = (category == ucp_L || category == ucp_N ||6622chartype == ucp_Mn || chartype == ucp_Pc);6623}6624else6625#endif /* SUPPORT_UNICODE */6626cur_is_word = CHMAX_255(fc) && (mb->ctypes[fc] & ctype_word) != 0;6627}66286629/* Now see if the situation is what we want */66306631if ((*Fecode++ == OP_WORD_BOUNDARY || Fop == OP_UCP_WORD_BOUNDARY)?6632cur_is_word == prev_is_word : cur_is_word != prev_is_word)6633RRETURN(MATCH_NOMATCH);6634break;663566366637/* 
===================================================================== */6638/* Backtracking (*VERB)s, with and without arguments. Note that if the6639pattern is successfully matched, we do not come back from RMATCH. */66406641case OP_MARK:6642Fmark = mb->nomatch_mark = Fecode + 2;6643RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM12);66446645/* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an6646argument, and we must check whether that argument matches this MARK's6647argument. It is passed back in mb->verb_skip_ptr. If it does match, we6648return MATCH_SKIP with mb->verb_skip_ptr now pointing to the subject6649position that corresponds to this mark. Otherwise, pass back the return6650code unaltered. */66516652if (rrc == MATCH_SKIP_ARG &&6653PRIV(strcmp)(Fecode + 2, mb->verb_skip_ptr) == 0)6654{6655mb->verb_skip_ptr = Feptr; /* Pass back current position */6656RRETURN(MATCH_SKIP);6657}6658RRETURN(rrc);66596660case OP_FAIL:6661RRETURN(MATCH_NOMATCH);66626663/* Record the current recursing group number in mb->verb_current_recurse6664when a backtracking return such as MATCH_COMMIT is given. This enables the6665recurse processing to catch verbs from within the recursion. 
*/66666667case OP_COMMIT:6668RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM13);6669if (rrc != MATCH_NOMATCH) RRETURN(rrc);6670mb->verb_current_recurse = Fcurrent_recurse;6671RRETURN(MATCH_COMMIT);66726673case OP_COMMIT_ARG:6674Fmark = mb->nomatch_mark = Fecode + 2;6675RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM36);6676if (rrc != MATCH_NOMATCH) RRETURN(rrc);6677mb->verb_current_recurse = Fcurrent_recurse;6678RRETURN(MATCH_COMMIT);66796680case OP_PRUNE:6681RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14);6682if (rrc != MATCH_NOMATCH) RRETURN(rrc);6683mb->verb_current_recurse = Fcurrent_recurse;6684RRETURN(MATCH_PRUNE);66856686case OP_PRUNE_ARG:6687Fmark = mb->nomatch_mark = Fecode + 2;6688RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM15);6689if (rrc != MATCH_NOMATCH) RRETURN(rrc);6690mb->verb_current_recurse = Fcurrent_recurse;6691RRETURN(MATCH_PRUNE);66926693case OP_SKIP:6694RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM16);6695if (rrc != MATCH_NOMATCH) RRETURN(rrc);6696mb->verb_skip_ptr = Feptr; /* Pass back current position */6697mb->verb_current_recurse = Fcurrent_recurse;6698RRETURN(MATCH_SKIP);66996700/* Note that, for Perl compatibility, SKIP with an argument does NOT set6701nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was6702not a matching mark, we have to re-run the match, ignoring the SKIP_ARG6703that failed and any that precede it (either they also failed, or were not6704triggered). To do this, we maintain a count of executed SKIP_ARGs. If a6705SKIP_ARG gets to top level, the match is re-run with mb->ignore_skip_arg6706set to the count of the one that failed. 
*/67076708case OP_SKIP_ARG:6709mb->skip_arg_count++;6710if (mb->skip_arg_count <= mb->ignore_skip_arg)6711{6712Fecode += PRIV(OP_lengths)[*Fecode] + Fecode[1];6713break;6714}6715RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM17);6716if (rrc != MATCH_NOMATCH) RRETURN(rrc);67176718/* Pass back the current skip name and return the special MATCH_SKIP_ARG6719return code. This will either be caught by a matching MARK, or get to the6720top, where it causes a rematch with mb->ignore_skip_arg set to the value of6721mb->skip_arg_count. */67226723mb->verb_skip_ptr = Fecode + 2;6724mb->verb_current_recurse = Fcurrent_recurse;6725RRETURN(MATCH_SKIP_ARG);67266727/* For THEN (and THEN_ARG) we pass back the address of the opcode, so that6728the branch in which it occurs can be determined. */67296730case OP_THEN:6731RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM18);6732if (rrc != MATCH_NOMATCH) RRETURN(rrc);6733mb->verb_ecode_ptr = Fecode;6734mb->verb_current_recurse = Fcurrent_recurse;6735RRETURN(MATCH_THEN);67366737case OP_THEN_ARG:6738Fmark = mb->nomatch_mark = Fecode + 2;6739RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM19);6740if (rrc != MATCH_NOMATCH) RRETURN(rrc);6741mb->verb_ecode_ptr = Fecode;6742mb->verb_current_recurse = Fcurrent_recurse;6743RRETURN(MATCH_THEN);674467456746/* ===================================================================== */6747/* There's been some horrible disaster. Arrival here can only mean there is6748something seriously wrong in the code above or the OP_xxx definitions. */67496750default:6751PCRE2_DEBUG_UNREACHABLE();6752return PCRE2_ERROR_INTERNAL;6753}67546755/* Do not insert any code in here without much thought; it is assumed6756that "continue" in the code above comes out to here to repeat the main6757loop. 
*/67586759} /* End of main loop */67606761PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */67626763/* ========================================================================= */6764/* The RRETURN() macro jumps here. The number that is saved in Freturn_id6765indicates which label we actually want to return to. The value in Frdepth is6766the index number of the frame in the vector. The return value has been placed6767in rrc. */67686769#define LBL(val) case val: goto L_RM##val;67706771RETURN_SWITCH:6772if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;6773if (Frdepth == 0) return rrc; /* Exit from the top level */6774F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */6775mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */67766777#ifdef DEBUG_SHOW_RMATCH6778fprintf(stderr, "++ RETURN %d to RM%d\n", rrc, Freturn_id);6779#endif67806781switch (Freturn_id)6782{6783LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)6784LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)6785LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)6786LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)6787LBL(33) LBL(34) LBL(35) LBL(36) LBL(37) LBL(38) LBL(39)67886789#ifdef SUPPORT_WIDE_CHARS6790LBL(100) LBL(101) LBL(102) LBL(103)6791#endif67926793#ifdef SUPPORT_UNICODE6794LBL(200) LBL(201) LBL(202) LBL(203) LBL(204) LBL(205) LBL(206)6795LBL(207) LBL(208) LBL(209) LBL(210) LBL(211) LBL(212) LBL(213)6796LBL(214) LBL(215) LBL(216) LBL(217) LBL(218) LBL(219) LBL(220)6797LBL(221) LBL(222) LBL(223) LBL(224)6798#endif67996800default:6801PCRE2_DEBUG_UNREACHABLE();6802return PCRE2_ERROR_INTERNAL;6803}6804#undef LBL6805}680668076808/*************************************************6809* Match a Regular Expression *6810*************************************************/68116812/* This function applies a compiled pattern to a subject string and picks out6813portions of the string if it matches. 
Two elements in the vector are set for6814each substring: the offsets to the start and end of the substring.68156816Arguments:6817code points to the compiled expression6818subject points to the subject string6819length length of subject string (may contain binary zeros)6820start_offset where to start in the subject string6821options option bits6822match_data points to a match_data block6823mcontext points to a PCRE2 context68246825Returns: > 0 => success; value is the number of ovector pairs filled6826= 0 => success, but ovector is not big enough6827= -1 => failed to match (PCRE2_ERROR_NOMATCH)6828= -2 => partial match (PCRE2_ERROR_PARTIAL)6829< -2 => some kind of unexpected problem6830*/68316832PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION6833pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,6834PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,6835pcre2_match_context *mcontext)6836{6837int rc;6838int was_zero_terminated = 0;6839const uint8_t *start_bits = NULL;6840const pcre2_real_code *re = (const pcre2_real_code *)code;68416842BOOL anchored;6843BOOL firstline;6844BOOL has_first_cu = FALSE;6845BOOL has_req_cu = FALSE;6846BOOL startline;68476848#if PCRE2_CODE_UNIT_WIDTH == 86849PCRE2_SPTR memchr_found_first_cu;6850PCRE2_SPTR memchr_found_first_cu2;6851#endif68526853PCRE2_UCHAR first_cu = 0;6854PCRE2_UCHAR first_cu2 = 0;6855PCRE2_UCHAR req_cu = 0;6856PCRE2_UCHAR req_cu2 = 0;68576858PCRE2_SPTR bumpalong_limit;6859PCRE2_SPTR end_subject;6860PCRE2_SPTR true_end_subject;6861PCRE2_SPTR start_match;6862PCRE2_SPTR req_cu_ptr;6863PCRE2_SPTR start_partial;6864PCRE2_SPTR match_partial;68656866#ifdef SUPPORT_JIT6867BOOL use_jit;6868#endif68696870/* This flag is needed even when Unicode is not supported for convenience6871(it is used by the IS_NEWLINE macro). 
*/68726873BOOL utf = FALSE;68746875#ifdef SUPPORT_UNICODE6876BOOL ucp = FALSE;6877BOOL allow_invalid;6878uint32_t fragment_options = 0;6879#ifdef SUPPORT_JIT6880BOOL jit_checked_utf = FALSE;6881#endif6882#endif /* SUPPORT_UNICODE */68836884PCRE2_SIZE frame_size;6885PCRE2_SIZE heapframes_size;68866887/* We need to have mb as a pointer to a match block, because the IS_NEWLINE6888macro is used below, and it expects NLBLOCK to be defined as a pointer. */68896890pcre2_callout_block cb;6891match_block actual_match_block;6892match_block *mb = &actual_match_block;68936894/* Recognize NULL, length 0 as an empty string. */68956896if (subject == NULL && length == 0) subject = (PCRE2_SPTR)"";68976898/* Plausibility checks */68996900if ((options & ~PUBLIC_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;6901if (code == NULL || subject == NULL || match_data == NULL)6902return PCRE2_ERROR_NULL;69036904start_match = subject + start_offset;6905req_cu_ptr = start_match - 1;6906if (length == PCRE2_ZERO_TERMINATED)6907{6908length = PRIV(strlen)(subject);6909was_zero_terminated = 1;6910}6911true_end_subject = end_subject = subject + length;69126913if (start_offset > length) return PCRE2_ERROR_BADOFFSET;69146915/* Check that the first field in the block is the magic number. */69166917if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;69186919/* Check the code unit width. */69206921if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)6922return PCRE2_ERROR_BADMODE;69236924/* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the6925options variable for this function. Users of PCRE2 who are not calling the6926function directly would like to have a way of setting these flags, in the same6927way that they can set pcre2_compile() flags like PCRE2_NO_AUTO_POSSESS with6928constructions like (*NO_AUTOPOSSESS). 
To enable this, (*NOTEMPTY) and6929(*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which we now6930transfer to the options for this function. The bits are guaranteed to be6931adjacent, but do not have the same values. This bit of Boolean trickery assumes6932that the match-time bits are not more significant than the flag bits. If by6933accident this is not the case, a compile-time division by zero error will6934occur. */69356936#define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)6937#define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)6938options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));6939#undef FF6940#undef OO69416942/* If the pattern was successfully studied with JIT support, we will run the6943JIT executable instead of the rest of this function. Most options must be set6944at compile time for the JIT code to be usable. */69456946#ifdef SUPPORT_JIT6947use_jit = (re->executable_jit != NULL &&6948(options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0);6949#endif69506951/* Initialize UTF/UCP parameters. */69526953#ifdef SUPPORT_UNICODE6954utf = (re->overall_options & PCRE2_UTF) != 0;6955allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0;6956ucp = (re->overall_options & PCRE2_UCP) != 0;6957#endif /* SUPPORT_UNICODE */69586959/* Convert the partial matching flags into an integer. */69606961mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 :6962((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0;69636964/* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same6965time. */69666967if (mb->partial != 0 &&6968((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)6969return PCRE2_ERROR_BADOPTION;69706971/* It is an error to set an offset limit without setting the flag at compile6972time. 
*/69736974if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET &&6975(re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)6976return PCRE2_ERROR_BADOFFSETLIMIT;69776978/* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,6979free the memory that was obtained. Set the field to NULL for no match cases. */69806981if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)6982{6983match_data->memctl.free((void *)match_data->subject,6984match_data->memctl.memory_data);6985match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;6986}6987match_data->subject = NULL;69886989/* Zero the error offset in case the first code unit is invalid UTF. */69906991match_data->startchar = 0;699269936994/* ============================= JIT matching ============================== */69956996/* Prepare for JIT matching. Check a UTF string for validity unless no check is6997requested or invalid UTF can be handled. We check only the portion of the6998subject that might be inspected during matching - from the offset minus the6999maximum lookbehind to the given length. This saves time when a small part of a7000large subject is being matched by the use of a starting offset. Note that the7001maximum lookbehind is a number of characters, not code units. */70027003#ifdef SUPPORT_JIT7004if (use_jit)7005{7006#ifdef SUPPORT_UNICODE7007if (utf && (options & PCRE2_NO_UTF_CHECK) == 0 && !allow_invalid)7008{70097010/* For 8-bit and 16-bit UTF, check that the first code unit is a valid7011character start. */70127013#if PCRE2_CODE_UNIT_WIDTH != 327014if (start_match < end_subject && NOT_FIRSTCU(*start_match))7015{7016if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET;7017#if PCRE2_CODE_UNIT_WIDTH == 87018return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */7019#else7020return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */7021#endif7022}7023#endif /* WIDTH != 32 */70247025/* Move back by the maximum lookbehind, just in case it happens at the very7026start of matching. 
*/70277028#if PCRE2_CODE_UNIT_WIDTH != 327029for (unsigned int i = re->max_lookbehind; i > 0 && start_match > subject; i--)7030{7031start_match--;7032while (start_match > subject &&7033#if PCRE2_CODE_UNIT_WIDTH == 87034(*start_match & 0xc0) == 0x80)7035#else /* 16-bit */7036(*start_match & 0xfc00) == 0xdc00)7037#endif7038start_match--;7039}7040#else /* PCRE2_CODE_UNIT_WIDTH != 32 */70417042/* In the 32-bit library, one code unit equals one character. However,7043we cannot just subtract the lookbehind and then compare pointers, because7044a very large lookbehind could create an invalid pointer. */70457046if (start_offset >= re->max_lookbehind)7047start_match -= re->max_lookbehind;7048else7049start_match = subject;7050#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */70517052/* Validate the relevant portion of the subject. Adjust the offset of an7053invalid code point to be an absolute offset in the whole string. */70547055match_data->rc = PRIV(valid_utf)(start_match,7056length - (start_match - subject), &(match_data->startchar));7057if (match_data->rc != 0)7058{7059match_data->startchar += start_match - subject;7060return match_data->rc;7061}7062jit_checked_utf = TRUE;7063}7064#endif /* SUPPORT_UNICODE */70657066/* If JIT returns BADOPTION, which means that the selected complete or7067partial matching mode was not compiled, fall through to the interpreter. 
*/70687069rc = pcre2_jit_match(code, subject, length, start_offset, options,7070match_data, mcontext);7071if (rc != PCRE2_ERROR_JIT_BADOPTION)7072{7073match_data->subject_length = length;7074if (rc >= 0 && (options & PCRE2_COPY_MATCHED_SUBJECT) != 0)7075{7076length = CU2BYTES(length + was_zero_terminated);7077match_data->subject = match_data->memctl.malloc(length,7078match_data->memctl.memory_data);7079if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;7080memcpy((void *)match_data->subject, subject, length);7081match_data->flags |= PCRE2_MD_COPIED_SUBJECT;7082}7083return rc;7084}7085}7086#endif /* SUPPORT_JIT */70877088/* ========================= End of JIT matching ========================== */708970907091/* Proceed with non-JIT matching. The default is to allow lookbehinds to the7092start of the subject. A UTF check when there is a non-zero offset may change7093this. */70947095mb->check_subject = subject;70967097/* If a UTF subject string was not checked for validity in the JIT code above,7098check it here, and handle support for invalid UTF strings. The check above7099happens only when invalid UTF is not supported and PCRE2_NO_UTF_CHECK is unset.7100If we get here in those circumstances, it means the subject string is valid,7101but for some reason JIT matching was not successful. There is no need to check7102the subject again.71037104We check only the portion of the subject that might be inspected during7105matching - from the offset minus the maximum lookbehind to the given length.7106This saves time when a small part of a large subject is being matched by the7107use of a starting offset. Note that the maximum lookbehind is a number of7108characters, not code units.71097110Note also that support for invalid UTF forces a check, overriding the setting7111of PCRE2_NO_UTF_CHECK. 
*/71127113#ifdef SUPPORT_UNICODE7114if (utf &&7115#ifdef SUPPORT_JIT7116!jit_checked_utf &&7117#endif7118((options & PCRE2_NO_UTF_CHECK) == 0 || allow_invalid))7119{7120#if PCRE2_CODE_UNIT_WIDTH != 327121BOOL skipped_bad_start = FALSE;7122#endif71237124/* For 8-bit and 16-bit UTF, check that the first code unit is a valid7125character start. If we are handling invalid UTF, just skip over such code7126units. Otherwise, give an appropriate error. */71277128#if PCRE2_CODE_UNIT_WIDTH != 327129if (allow_invalid)7130{7131while (start_match < end_subject && NOT_FIRSTCU(*start_match))7132{7133start_match++;7134skipped_bad_start = TRUE;7135}7136}7137else if (start_match < end_subject && NOT_FIRSTCU(*start_match))7138{7139if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET;7140#if PCRE2_CODE_UNIT_WIDTH == 87141return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */7142#else7143return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */7144#endif7145}7146#endif /* WIDTH != 32 */71477148/* The mb->check_subject field points to the start of UTF checking;7149lookbehinds can go back no further than this. */71507151mb->check_subject = start_match;71527153/* Move back by the maximum lookbehind, just in case it happens at the very7154start of matching, but don't do this if we skipped bad 8-bit or 16-bit code7155units above. */71567157#if PCRE2_CODE_UNIT_WIDTH != 327158if (!skipped_bad_start)7159{7160unsigned int i;7161for (i = re->max_lookbehind; i > 0 && mb->check_subject > subject; i--)7162{7163mb->check_subject--;7164while (mb->check_subject > subject &&7165#if PCRE2_CODE_UNIT_WIDTH == 87166(*mb->check_subject & 0xc0) == 0x80)7167#else /* 16-bit */7168(*mb->check_subject & 0xfc00) == 0xdc00)7169#endif7170mb->check_subject--;7171}7172}7173#else /* PCRE2_CODE_UNIT_WIDTH != 32 */71747175/* In the 32-bit library, one code unit equals one character. 
However,7176we cannot just subtract the lookbehind and then compare pointers, because7177a very large lookbehind could create an invalid pointer. */71787179if (start_offset >= re->max_lookbehind)7180mb->check_subject -= re->max_lookbehind;7181else7182mb->check_subject = subject;7183#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */71847185/* Validate the relevant portion of the subject. There's a loop in case we7186encounter bad UTF in the characters preceding start_match which we are7187scanning because of a lookbehind. */71887189for (;;)7190{7191match_data->rc = PRIV(valid_utf)(mb->check_subject,7192length - (mb->check_subject - subject), &(match_data->startchar));71937194if (match_data->rc == 0) break; /* Valid UTF string */71957196/* Invalid UTF string. Adjust the offset to be an absolute offset in the7197whole string. If we are handling invalid UTF strings, set end_subject to7198stop before the bad code unit, and set the options to "not end of line".7199Otherwise return the error. */72007201match_data->startchar += mb->check_subject - subject;7202if (!allow_invalid || match_data->rc > 0) return match_data->rc;7203end_subject = subject + match_data->startchar;72047205/* If the end precedes start_match, it means there is invalid UTF in the7206extra code units we reversed over because of a lookbehind. Advance past the7207first bad code unit, and then skip invalid character starting code units in72088-bit and 16-bit modes, and try again with the original end point. */72097210if (end_subject < start_match)7211{7212mb->check_subject = end_subject + 1;7213#if PCRE2_CODE_UNIT_WIDTH != 327214while (mb->check_subject < start_match && NOT_FIRSTCU(*mb->check_subject))7215mb->check_subject++;7216#endif7217end_subject = true_end_subject;7218}72197220/* Otherwise, set the not end of line option, and do the match. 
*/72217222else7223{7224fragment_options = PCRE2_NOTEOL;7225break;7226}7227}7228}7229#endif /* SUPPORT_UNICODE */72307231/* A NULL match context means "use a default context", but we take the memory7232control functions from the pattern. */72337234if (mcontext == NULL)7235{7236mcontext = (pcre2_match_context *)(&PRIV(default_match_context));7237mb->memctl = re->memctl;7238}7239else mb->memctl = mcontext->memctl;72407241anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0;7242firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0;7243startline = (re->flags & PCRE2_STARTLINE) != 0;7244bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)?7245true_end_subject : subject + mcontext->offset_limit;72467247/* Initialize and set up the fixed fields in the callout block, with a pointer7248in the match block. */72497250mb->cb = &cb;7251cb.version = 2;7252cb.subject = subject;7253cb.subject_length = (PCRE2_SIZE)(end_subject - subject);7254cb.callout_flags = 0;72557256/* Fill in the remaining fields in the match block, except for moptions, which7257gets set later. */72587259mb->callout = mcontext->callout;7260mb->callout_data = mcontext->callout_data;72617262mb->start_subject = subject;7263mb->start_offset = start_offset;7264mb->end_subject = end_subject;7265mb->true_end_subject = true_end_subject;7266mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;7267mb->allowemptypartial = (re->max_lookbehind > 0) ||7268(re->flags & PCRE2_MATCH_EMPTY) != 0;7269mb->poptions = re->overall_options; /* Pattern options */7270mb->ignore_skip_arg = 0;7271mb->mark = mb->nomatch_mark = NULL; /* In case never set */72727273/* The name table is needed for finding all the numbers associated with a7274given name, for condition testing. The code follows the name table. 
*/72757276mb->name_table = (PCRE2_SPTR)((const uint8_t *)re + sizeof(pcre2_real_code));7277mb->name_count = re->name_count;7278mb->name_entry_size = re->name_entry_size;7279mb->start_code = (PCRE2_SPTR)((const uint8_t *)re + re->code_start);72807281/* Process the \R and newline settings. */72827283mb->bsr_convention = re->bsr_convention;7284mb->nltype = NLTYPE_FIXED;7285switch(re->newline_convention)7286{7287case PCRE2_NEWLINE_CR:7288mb->nllen = 1;7289mb->nl[0] = CHAR_CR;7290break;72917292case PCRE2_NEWLINE_LF:7293mb->nllen = 1;7294mb->nl[0] = CHAR_NL;7295break;72967297case PCRE2_NEWLINE_NUL:7298mb->nllen = 1;7299mb->nl[0] = CHAR_NUL;7300break;73017302case PCRE2_NEWLINE_CRLF:7303mb->nllen = 2;7304mb->nl[0] = CHAR_CR;7305mb->nl[1] = CHAR_NL;7306break;73077308case PCRE2_NEWLINE_ANY:7309mb->nltype = NLTYPE_ANY;7310break;73117312case PCRE2_NEWLINE_ANYCRLF:7313mb->nltype = NLTYPE_ANYCRLF;7314break;73157316default:7317PCRE2_DEBUG_UNREACHABLE();7318return PCRE2_ERROR_INTERNAL;7319}73207321/* The backtracking frames have fixed data at the front, and a PCRE2_SIZE7322vector at the end, whose size depends on the number of capturing parentheses in7323the pattern. It is not used at all if there are no capturing parentheses.73247325frame_size is the total size of each frame7326match_data->heapframes is the pointer to the frames vector7327match_data->heapframes_size is the allocated size of the vector73287329We must pad the frame_size for alignment to ensure subsequent frames are as7330aligned as heapframe. Whilst ovector is word-aligned due to being a PCRE2_SIZE7331array, that does not guarantee it is suitably aligned for pointers, as some7332architectures have pointers that are larger than a size_t. */73337334frame_size = (offsetof(heapframe, ovector) +7335re->top_bracket * 2 * sizeof(PCRE2_SIZE) + HEAPFRAME_ALIGNMENT - 1) &7336~(HEAPFRAME_ALIGNMENT - 1);73377338/* Limits set in the pattern override the match context only if they are7339smaller. 
*/73407341mb->heap_limit = ((mcontext->heap_limit < re->limit_heap)?7342mcontext->heap_limit : re->limit_heap);73437344mb->match_limit = (mcontext->match_limit < re->limit_match)?7345mcontext->match_limit : re->limit_match;73467347mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)?7348mcontext->depth_limit : re->limit_depth;73497350/* If a pattern has very many capturing parentheses, the frame size may be very7351large. Set the initial frame vector size to ensure that there are at least 107352available frames, but enforce a minimum of START_FRAMES_SIZE. If this is7353greater than the heap limit, get as large a vector as possible. */73547355heapframes_size = frame_size * 10;7356if (heapframes_size < START_FRAMES_SIZE) heapframes_size = START_FRAMES_SIZE;7357if (heapframes_size / 1024 > mb->heap_limit)7358{7359PCRE2_SIZE max_size = 1024 * mb->heap_limit;7360if (max_size < frame_size) return PCRE2_ERROR_HEAPLIMIT;7361heapframes_size = max_size;7362}73637364/* If an existing frame vector in the match_data block is large enough, we can7365use it. Otherwise, free any pre-existing vector and get a new one. */73667367if (match_data->heapframes_size < heapframes_size)7368{7369match_data->memctl.free(match_data->heapframes,7370match_data->memctl.memory_data);7371match_data->heapframes = match_data->memctl.malloc(heapframes_size,7372match_data->memctl.memory_data);7373if (match_data->heapframes == NULL)7374{7375match_data->heapframes_size = 0;7376return PCRE2_ERROR_NOMEMORY;7377}7378match_data->heapframes_size = heapframes_size;7379}73807381/* Write to the ovector within the first frame to mark every capture unset and7382to avoid uninitialized memory read errors when it is copied to a new frame. 
*/73837384memset((char *)(match_data->heapframes) + offsetof(heapframe, ovector), 0xff,7385frame_size - offsetof(heapframe, ovector));73867387/* Pointers to the individual character tables */73887389mb->lcc = re->tables + lcc_offset;7390mb->fcc = re->tables + fcc_offset;7391mb->ctypes = re->tables + ctypes_offset;73927393/* Set up the first code unit to match, if available. If there's no first code7394unit there may be a bitmap of possible first characters. */73957396if ((re->flags & PCRE2_FIRSTSET) != 0)7397{7398has_first_cu = TRUE;7399first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);7400if ((re->flags & PCRE2_FIRSTCASELESS) != 0)7401{7402first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);7403#ifdef SUPPORT_UNICODE7404#if PCRE2_CODE_UNIT_WIDTH == 87405if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu);7406#else7407if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu);7408#endif7409#endif /* SUPPORT_UNICODE */7410}7411}7412else7413if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)7414start_bits = re->start_bitmap;74157416/* There may also be a "last known required character" set. */74177418if ((re->flags & PCRE2_LASTSET) != 0)7419{7420has_req_cu = TRUE;7421req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);7422if ((re->flags & PCRE2_LASTCASELESS) != 0)7423{7424req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);7425#ifdef SUPPORT_UNICODE7426#if PCRE2_CODE_UNIT_WIDTH == 87427if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu);7428#else7429if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu);7430#endif7431#endif /* SUPPORT_UNICODE */7432}7433}743474357436/* ==========================================================================*/74377438/* Loop for handling unanchored repeated matching attempts; for anchored regexs7439the loop runs just once. 
*/74407441#ifdef SUPPORT_UNICODE7442FRAGMENT_RESTART:7443#endif74447445start_partial = match_partial = NULL;7446mb->hitend = FALSE;74477448#if PCRE2_CODE_UNIT_WIDTH == 87449memchr_found_first_cu = NULL;7450memchr_found_first_cu2 = NULL;7451#endif74527453for(;;)7454{7455PCRE2_SPTR new_start_match;74567457/* ----------------- Start of match optimizations ---------------- */74587459/* There are some optimizations that avoid running the match if a known7460starting point is not found, or if a known later code unit is not present.7461However, there is an option (settable at compile time) that disables these,7462for testing and for ensuring that all callouts do actually occur. */74637464if ((re->optimization_flags & PCRE2_OPTIM_START_OPTIMIZE) != 0)7465{7466/* If firstline is TRUE, the start of the match is constrained to the first7467line of a multiline string. That is, the match must be before or at the7468first newline following the start of matching. Temporarily adjust7469end_subject so that we stop the scans for a first code unit at a newline.7470If the match fails at the newline, later code breaks the loop. */74717472if (firstline)7473{7474PCRE2_SPTR t = start_match;7475#ifdef SUPPORT_UNICODE7476if (utf)7477{7478while (t < end_subject && !IS_NEWLINE(t))7479{7480t++;7481ACROSSCHAR(t < end_subject, t, t++);7482}7483}7484else7485#endif7486while (t < end_subject && !IS_NEWLINE(t)) t++;7487end_subject = t;7488}74897490/* Anchored: check the first code unit if one is recorded. This may seem7491pointless but it can help in detecting a no match case without scanning for7492the required code unit. 
*/74937494if (anchored)7495{7496if (has_first_cu || start_bits != NULL)7497{7498BOOL ok = start_match < end_subject;7499if (ok)7500{7501PCRE2_UCHAR c = UCHAR21TEST(start_match);7502ok = has_first_cu && (c == first_cu || c == first_cu2);7503if (!ok && start_bits != NULL)7504{7505#if PCRE2_CODE_UNIT_WIDTH != 87506if (c > 255) c = 255;7507#endif7508ok = (start_bits[c/8] & (1u << (c&7))) != 0;7509}7510}7511if (!ok)7512{7513rc = MATCH_NOMATCH;7514break;7515}7516}7517}75187519/* Not anchored. Advance to a unique first code unit if there is one. */75207521else7522{7523if (has_first_cu)7524{7525if (first_cu != first_cu2) /* Caseless */7526{7527/* In 16-bit and 32_bit modes we have to do our own search, so can7528look for both cases at once. */75297530#if PCRE2_CODE_UNIT_WIDTH != 87531PCRE2_UCHAR smc;7532while (start_match < end_subject &&7533(smc = UCHAR21TEST(start_match)) != first_cu &&7534smc != first_cu2)7535start_match++;7536#else7537/* In 8-bit mode, the use of memchr() gives a big speed up, even7538though we have to call it twice in order to find the earliest7539occurrence of the code unit in either of its cases. Caching is used7540to remember the positions of previously found code units. This can7541make a huge difference when the strings are very long and only one7542case is actually present. */75437544PCRE2_SPTR pp1 = NULL;7545PCRE2_SPTR pp2 = NULL;7546PCRE2_SIZE searchlength = end_subject - start_match;75477548/* If we haven't got a previously found position for first_cu, or if7549the current starting position is later, we need to do a search. If7550the code unit is not found, set it to the end. */75517552if (memchr_found_first_cu == NULL ||7553start_match > memchr_found_first_cu)7554{7555pp1 = memchr(start_match, first_cu, searchlength);7556memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;7557}75587559/* If the start is before a previously found position, use the7560previous position, or NULL if a previous search failed. 
*/75617562else pp1 = (memchr_found_first_cu == end_subject)? NULL :7563memchr_found_first_cu;75647565/* Do the same thing for the other case. */75667567if (memchr_found_first_cu2 == NULL ||7568start_match > memchr_found_first_cu2)7569{7570pp2 = memchr(start_match, first_cu2, searchlength);7571memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;7572}75737574else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :7575memchr_found_first_cu2;75767577/* Set the start to the end of the subject if neither case was found.7578Otherwise, use the earlier found point. */75797580if (pp1 == NULL)7581start_match = (pp2 == NULL)? end_subject : pp2;7582else7583start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;75847585#endif /* 8-bit handling */7586}75877588/* The caseful case is much simpler. */75897590else7591{7592#if PCRE2_CODE_UNIT_WIDTH != 87593while (start_match < end_subject && UCHAR21TEST(start_match) !=7594first_cu)7595start_match++;7596#else7597start_match = memchr(start_match, first_cu, end_subject - start_match);7598if (start_match == NULL) start_match = end_subject;7599#endif7600}76017602/* If we can't find the required first code unit, having reached the7603true end of the subject, break the bumpalong loop, to force a match7604failure, except when doing partial matching, when we let the next cycle7605run at the end of the subject. To see why, consider the pattern7606/(?<=abc)def/, which partially matches "abc", even though the string7607does not contain the starting character "d". If we have not reached the7608true end of the subject (PCRE2_FIRSTLINE caused end_subject to be7609temporarily modified) we also let the cycle run, because the matching7610string is legitimately allowed to start with the first code unit of a7611newline. */76127613if (mb->partial == 0 && start_match >= mb->end_subject)7614{7615rc = MATCH_NOMATCH;7616break;7617}7618}76197620/* If there's no first code unit, advance to just after a linebreak for a7621multiline match if required. 
*/76227623else if (startline)7624{7625if (start_match > mb->start_subject + start_offset)7626{7627#ifdef SUPPORT_UNICODE7628if (utf)7629{7630while (start_match < end_subject && !WAS_NEWLINE(start_match))7631{7632start_match++;7633ACROSSCHAR(start_match < end_subject, start_match, start_match++);7634}7635}7636else7637#endif7638while (start_match < end_subject && !WAS_NEWLINE(start_match))7639start_match++;76407641/* If we have just passed a CR and the newline option is ANY or7642ANYCRLF, and we are now at a LF, advance the match position by one7643more code unit. */76447645if (start_match[-1] == CHAR_CR &&7646(mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&7647start_match < end_subject &&7648UCHAR21TEST(start_match) == CHAR_NL)7649start_match++;7650}7651}76527653/* If there's no first code unit or a requirement for a multiline line7654start, advance to a non-unique first code unit if any have been7655identified. The bitmap contains only 256 bits. When code units are 16 or765632 bits wide, all code units greater than 254 set the 255 bit. */76577658else if (start_bits != NULL)7659{7660while (start_match < end_subject)7661{7662uint32_t c = UCHAR21TEST(start_match);7663#if PCRE2_CODE_UNIT_WIDTH != 87664if (c > 255) c = 255;7665#endif7666if ((start_bits[c/8] & (1u << (c&7))) != 0) break;7667start_match++;7668}76697670/* See comment above in first_cu checking about the next few lines. */76717672if (mb->partial == 0 && start_match >= mb->end_subject)7673{7674rc = MATCH_NOMATCH;7675break;7676}7677}7678} /* End first code unit handling */76797680/* Restore fudged end_subject */76817682end_subject = mb->end_subject;76837684/* The following two optimizations must be disabled for partial matching. */76857686if (mb->partial == 0)7687{7688PCRE2_SPTR p;76897690/* The minimum matching length is a lower bound; no string of that length7691may actually match the pattern. 
Although the value is, strictly, in7692characters, we treat it as code units to avoid spending too much time in7693this optimization. */76947695if (end_subject - start_match < re->minlength)7696{7697rc = MATCH_NOMATCH;7698break;7699}77007701/* If req_cu is set, we know that that code unit must appear in the7702subject for the (non-partial) match to succeed. If the first code unit is7703set, req_cu must be later in the subject; otherwise the test starts at7704the match point. This optimization can save a huge amount of backtracking7705in patterns with nested unlimited repeats that aren't going to match.7706Writing separate code for caseful/caseless versions makes it go faster,7707as does using an autoincrement and backing off on a match. As in the case7708of the first code unit, using memchr() in the 8-bit library gives a big7709speed up. Unlike the first_cu check above, we do not need to call7710memchr() twice in the caseless case because we only need to check for the7711presence of the character in either case, not find the first occurrence.77127713The search can be skipped if the code unit was found later than the7714current starting point in a previous iteration of the bumpalong loop.77157716HOWEVER: when the subject string is very, very long, searching to its end7717can take a long time, and give bad performance on quite ordinary7718anchored patterns. This showed up when somebody was matching something7719like /^\d+C/ on a 32-megabyte string... so we don't do this when the7720string is sufficiently long, but it's worth searching a lot more for7721unanchored patterns. */77227723p = start_match + (has_first_cu? 
1:0);7724if (has_req_cu && p > req_cu_ptr)7725{7726PCRE2_SIZE check_length = end_subject - start_match;77277728if (check_length < REQ_CU_MAX ||7729(!anchored && check_length < REQ_CU_MAX * 1000))7730{7731if (req_cu != req_cu2) /* Caseless */7732{7733#if PCRE2_CODE_UNIT_WIDTH != 87734while (p < end_subject)7735{7736uint32_t pp = UCHAR21INCTEST(p);7737if (pp == req_cu || pp == req_cu2) { p--; break; }7738}7739#else /* 8-bit code units */7740PCRE2_SPTR pp = p;7741p = memchr(pp, req_cu, end_subject - pp);7742if (p == NULL)7743{7744p = memchr(pp, req_cu2, end_subject - pp);7745if (p == NULL) p = end_subject;7746}7747#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */7748}77497750/* The caseful case */77517752else7753{7754#if PCRE2_CODE_UNIT_WIDTH != 87755while (p < end_subject)7756{7757if (UCHAR21INCTEST(p) == req_cu) { p--; break; }7758}77597760#else /* 8-bit code units */7761p = memchr(p, req_cu, end_subject - p);7762if (p == NULL) p = end_subject;7763#endif7764}77657766/* If we can't find the required code unit, break the bumpalong loop,7767forcing a match failure. */77687769if (p >= end_subject)7770{7771rc = MATCH_NOMATCH;7772break;7773}77747775/* If we have found the required code unit, save the point where we7776found it, so that we don't search again next time round the bumpalong7777loop if the start hasn't yet passed this code unit. */77787779req_cu_ptr = p;7780}7781}7782}7783}77847785/* ------------ End of start of match optimizations ------------ */77867787/* Give no match if we have passed the bumpalong limit. */77887789if (start_match > bumpalong_limit)7790{7791rc = MATCH_NOMATCH;7792break;7793}77947795/* OK, we can now run the match. If "hitend" is set afterwards, remember the7796first starting point for which a partial match was found. 
*/77977798cb.start_match = (PCRE2_SIZE)(start_match - subject);7799cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH;78007801mb->start_used_ptr = start_match;7802mb->last_used_ptr = start_match;7803#ifdef SUPPORT_UNICODE7804mb->moptions = options | fragment_options;7805#else7806mb->moptions = options;7807#endif7808mb->match_call_count = 0;7809mb->end_offset_top = 0;7810mb->skip_arg_count = 0;78117812#ifdef DEBUG_SHOW_OPS7813fprintf(stderr, "++ Calling match()\n");7814#endif78157816rc = match(start_match, mb->start_code, re->top_bracket, frame_size,7817match_data, mb);78187819#ifdef DEBUG_SHOW_OPS7820fprintf(stderr, "++ match() returned %d\n\n", rc);7821#endif78227823if (mb->hitend && start_partial == NULL)7824{7825start_partial = mb->start_used_ptr;7826match_partial = start_match;7827}78287829switch(rc)7830{7831/* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched7832the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP7833entirely. The only way we can do that is to re-do the match at the same7834point, with a flag to force SKIP with an argument to be ignored. Just7835treating this case as NOMATCH does not work because it does not check other7836alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */78377838case MATCH_SKIP_ARG:7839new_start_match = start_match;7840mb->ignore_skip_arg = mb->skip_arg_count;7841break;78427843/* SKIP passes back the next starting point explicitly, but if it is no7844greater than the match we have just done, treat it as NOMATCH. */78457846case MATCH_SKIP:7847if (mb->verb_skip_ptr > start_match)7848{7849new_start_match = mb->verb_skip_ptr;7850break;7851}7852/* Fall through */78537854/* NOMATCH and PRUNE advance by one character. THEN at this level acts7855exactly like PRUNE. Unset ignore SKIP-with-argument. 
*/78567857case MATCH_NOMATCH:7858case MATCH_PRUNE:7859case MATCH_THEN:7860mb->ignore_skip_arg = 0;7861new_start_match = start_match + 1;7862#ifdef SUPPORT_UNICODE7863if (utf)7864ACROSSCHAR(new_start_match < end_subject, new_start_match,7865new_start_match++);7866#endif7867break;78687869/* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */78707871case MATCH_COMMIT:7872rc = MATCH_NOMATCH;7873goto ENDLOOP;78747875/* Any other return is either a match, or some kind of error. */78767877default:7878goto ENDLOOP;7879}78807881/* Control reaches here for the various types of "no match at this point"7882result. Reset the code to MATCH_NOMATCH for subsequent checking. */78837884rc = MATCH_NOMATCH;78857886/* If PCRE2_FIRSTLINE is set, the match must happen before or at the first7887newline in the subject (though it may continue over the newline). Therefore,7888if we have just failed to match, starting at a newline, do not continue. */78897890if (firstline && IS_NEWLINE(start_match)) break;78917892/* Advance to new matching position */78937894start_match = new_start_match;78957896/* Break the loop if the pattern is anchored or if we have passed the end of7897the subject. */78987899if (anchored || start_match > end_subject) break;79007901/* If we have just passed a CR and we are now at a LF, and the pattern does7902not contain any explicit matches for \r or \n, and the newline option is CRLF7903or ANY or ANYCRLF, advance the match position by one more code unit. In7904normal matching start_match will always be greater than the first position at7905this stage, but a failed *SKIP can cause a return at the same point, which is7906why the first test exists. 
*/79077908if (start_match > subject + start_offset &&7909start_match[-1] == CHAR_CR &&7910start_match < end_subject &&7911*start_match == CHAR_NL &&7912(re->flags & PCRE2_HASCRORLF) == 0 &&7913(mb->nltype == NLTYPE_ANY ||7914mb->nltype == NLTYPE_ANYCRLF ||7915mb->nllen == 2))7916start_match++;79177918mb->mark = NULL; /* Reset for start of next match attempt */7919} /* End of for(;;) "bumpalong" loop */79207921/* ==========================================================================*/79227923/* When we reach here, one of the following stopping conditions is true:79247925(1) The match succeeded, either completely, or partially;79267927(2) The pattern is anchored or the match was failed after (*COMMIT);79287929(3) We are past the end of the subject or the bumpalong limit;79307931(4) PCRE2_FIRSTLINE is set and we have failed to match at a newline, because7932this option requests that a match occur at or before the first newline in7933the subject.79347935(5) Some kind of error occurred.79367937*/79387939ENDLOOP:79407941/* If end_subject != true_end_subject, it means we are handling invalid UTF,7942and have just processed a non-terminal fragment. If this resulted in no match7943or a partial match we must carry on to the next fragment (a partial match is7944returned to the caller only at the very end of the subject). A loop is used to7945avoid trying to match against empty fragments; if the pattern can match an7946empty string it would have done so already. */79477948#ifdef SUPPORT_UNICODE7949if (utf && end_subject != true_end_subject &&7950(rc == MATCH_NOMATCH || rc == PCRE2_ERROR_PARTIAL))7951{7952for (;;)7953{7954/* Advance past the first bad code unit, and then skip invalid character7955starting code units in 8-bit and 16-bit modes. 
*/79567957start_match = end_subject + 1;79587959#if PCRE2_CODE_UNIT_WIDTH != 327960while (start_match < true_end_subject && NOT_FIRSTCU(*start_match))7961start_match++;7962#endif79637964/* If we have hit the end of the subject, there isn't another non-empty7965fragment, so give up. */79667967if (start_match >= true_end_subject)7968{7969rc = MATCH_NOMATCH; /* In case it was partial */7970match_partial = NULL;7971break;7972}79737974/* Check the rest of the subject */79757976mb->check_subject = start_match;7977rc = PRIV(valid_utf)(start_match, length - (start_match - subject),7978&(match_data->startchar));79797980/* The rest of the subject is valid UTF. */79817982if (rc == 0)7983{7984mb->end_subject = end_subject = true_end_subject;7985fragment_options = PCRE2_NOTBOL;7986goto FRAGMENT_RESTART;7987}79887989/* A subsequent UTF error has been found; if the next fragment is7990non-empty, set up to process it. Otherwise, let the loop advance. */79917992else if (rc < 0)7993{7994mb->end_subject = end_subject = start_match + match_data->startchar;7995if (end_subject > start_match)7996{7997fragment_options = PCRE2_NOTBOL|PCRE2_NOTEOL;7998goto FRAGMENT_RESTART;7999}8000}8001}8002}8003#endif /* SUPPORT_UNICODE */80048005/* Fill in fields that are always returned in the match data. */80068007match_data->code = re;8008match_data->mark = mb->mark;8009match_data->matchedby = PCRE2_MATCHEDBY_INTERPRETER;80108011/* Handle a fully successful match. Set the return code to the number of8012captured strings, or 0 if there were too many to fit into the ovector, and then8013set the remaining returned values before returning. Make a copy of the subject8014string if requested. 
*/80158016if (rc == MATCH_MATCH)8017{8018match_data->rc = ((int)mb->end_offset_top >= 2 * match_data->oveccount)?80190 : (int)mb->end_offset_top/2 + 1;8020match_data->subject_length = length;8021match_data->startchar = start_match - subject;8022match_data->leftchar = mb->start_used_ptr - subject;8023match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)?8024mb->last_used_ptr : mb->end_match_ptr) - subject;8025if ((options & PCRE2_COPY_MATCHED_SUBJECT) != 0)8026{8027length = CU2BYTES(length + was_zero_terminated);8028match_data->subject = match_data->memctl.malloc(length,8029match_data->memctl.memory_data);8030if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;8031memcpy((void *)match_data->subject, subject, length);8032match_data->flags |= PCRE2_MD_COPIED_SUBJECT;8033}8034else match_data->subject = subject;80358036return match_data->rc;8037}80388039/* Control gets here if there has been a partial match, an error, or if the8040overall match attempt has failed at all permitted starting positions. Any mark8041data is in the nomatch_mark field. */80428043match_data->mark = mb->nomatch_mark;80448045/* For anything other than nomatch or partial match, just return the code. */80468047if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL) match_data->rc = rc;80488049/* Handle a partial match. If a "soft" partial match was requested, searching8050for a complete match will have continued, and the value of rc at this point8051will be MATCH_NOMATCH. For a "hard" partial match, it will already be8052PCRE2_ERROR_PARTIAL. 
*/80538054else if (match_partial != NULL)8055{8056match_data->subject = subject;8057match_data->subject_length = length;8058match_data->ovector[0] = match_partial - subject;8059match_data->ovector[1] = end_subject - subject;8060match_data->startchar = match_partial - subject;8061match_data->leftchar = start_partial - subject;8062match_data->rightchar = end_subject - subject;8063match_data->rc = PCRE2_ERROR_PARTIAL;8064}80658066/* Else this is the classic nomatch case. */80678068else match_data->rc = PCRE2_ERROR_NOMATCH;80698070return match_data->rc;8071}80728073/* These #undefs are here to enable unity builds with CMake. */80748075#undef NLBLOCK /* Block containing newline information */8076#undef PSSTART /* Field containing processed string start */8077#undef PSEND /* Field containing processed string end */80788079/* End of pcre2_match.c */808080818082