ICU 57.1  57.1
regex.h
Go to the documentation of this file.
1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2016, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: regex.h
7 * encoding: US-ASCII
8 * indentation:4
9 *
10 * created on: 2002oct22
11 * created by: Andy Heninger
12 *
13 * ICU Regular Expressions, API for C++
14 */
15 
16 #ifndef REGEX_H
17 #define REGEX_H
18 
19 //#define REGEX_DEBUG
20 
45 #include "unicode/utypes.h"
46 
47 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
48 
49 #include "unicode/uobject.h"
50 #include "unicode/unistr.h"
51 #include "unicode/utext.h"
52 #include "unicode/parseerr.h"
53 
54 #include "unicode/uregex.h"
55 
56 // Forward Declarations
// The types named here appear in this header only as pointer members,
// pointer/reference parameters, or friend declarations, so their full
// definitions are not required at this point.
57 
58 struct UHashtable;
59 
61 
62 struct Regex8BitSet;
63 class RegexCImpl;
64 class RegexMatcher;
65 class RegexPattern;
66 struct REStackFrame;
67 class RuleBasedBreakIterator;
68 class UnicodeSet;
69 class UVector;
70 class UVector32;
71 class UVector64;
72 
73 
// Class RegexPattern represents a compiled regular expression.
// It is declared U_FINAL and derives from UObject. The static compile()
// overloads return a RegexPattern pointer; matcher() returns a RegexMatcher
// pointer for this pattern.
85 class U_I18N_API RegexPattern U_FINAL : public UObject {
86 public:
87 
    // Construction, destruction, comparison, assignment and cloning.
95  RegexPattern();
96 
103  RegexPattern(const RegexPattern &source);
104 
110  virtual ~RegexPattern();
111 
120  UBool operator==(const RegexPattern& that) const;
121 
    // Defined inline as the logical negation of operator==.
130  inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);}
131 
137  RegexPattern &operator =(const RegexPattern &source);
138 
146  virtual RegexPattern *clone() const;
147 
148 
    // Static compile() overloads. The pattern source is either a UnicodeString
    // or a UText; the variants differ in whether a flags word and/or a
    // UParseError out-parameter are supplied. All take a UErrorCode reference.
173  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
174  UParseError &pe,
175  UErrorCode &status);
176 
203  static RegexPattern * U_EXPORT2 compile( UText *regex,
204  UParseError &pe,
205  UErrorCode &status);
206 
231  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
232  uint32_t flags,
233  UParseError &pe,
234  UErrorCode &status);
235 
262  static RegexPattern * U_EXPORT2 compile( UText *regex,
263  uint32_t flags,
264  UParseError &pe,
265  UErrorCode &status);
266 
289  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
290  uint32_t flags,
291  UErrorCode &status);
292 
317  static RegexPattern * U_EXPORT2 compile( UText *regex,
318  uint32_t flags,
319  UErrorCode &status);
320 
326  virtual uint32_t flags() const;
327 
345  virtual RegexMatcher *matcher(const UnicodeString &input,
346  UErrorCode &status) const;
347 
348 private:
    // NOTE(review): this (const UChar *) overload is private, so code outside
    // this class and its friends cannot call it. Presumably that is deliberate,
    // to stop a raw UChar string being passed where a UnicodeString is
    // expected - confirm against the implementation.
361  RegexMatcher *matcher(const UChar *input,
362  UErrorCode &status) const;
363 public:
364 
365 
    // Overload that takes no input text.
377  virtual RegexMatcher *matcher(UErrorCode &status) const;
378 
379 
    // Static convenience overloads taking both the pattern and the input,
    // as UnicodeString or as UText.
394  static UBool U_EXPORT2 matches(const UnicodeString &regex,
395  const UnicodeString &input,
396  UParseError &pe,
397  UErrorCode &status);
398 
413  static UBool U_EXPORT2 matches(UText *regex,
414  UText *input,
415  UParseError &pe,
416  UErrorCode &status);
417 
    // Accessors for the pattern text, as UnicodeString or as UText.
426  virtual UnicodeString pattern() const;
427 
428 
439  virtual UText *patternText(UErrorCode &status) const;
440 
441 
    // Named capture group lookup, by UnicodeString name or by char* name
    // with an explicit length.
455  virtual int32_t groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const;
456 
457 
474  virtual int32_t groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const;
475 
476 
    // split(): the caller supplies the destination array and its capacity;
    // an int32_t is returned.
515  virtual int32_t split(const UnicodeString &input,
516  UnicodeString dest[],
517  int32_t destCapacity,
518  UErrorCode &status) const;
519 
520 
559  virtual int32_t split(UText *input,
560  UText *dest[],
561  int32_t destCapacity,
562  UErrorCode &status) const;
563 
564 
    // ICU class identification (UClassID is used to identify classes
    // without using the compiler's RTTI).
570  virtual UClassID getDynamicClassID() const;
571 
577  static UClassID U_EXPORT2 getStaticClassID();
578 
579 private:
580  //
581  // Implementation Data
582  //
583  UText *fPattern; // The original pattern string.
584  UnicodeString *fPatternString; // The original pattern UnicodeString if relevant
585  uint32_t fFlags; // The flags used when compiling the pattern.
586  //
587  UVector64 *fCompiledPat; // The compiled pattern p-code.
588  UnicodeString fLiteralText; // Any literal string data from the pattern,
589  // after un-escaping, for use during the match.
590 
591  UVector *fSets; // Any UnicodeSets referenced from the pattern.
592  Regex8BitSet *fSets8; // (and fast sets for latin-1 range.)
593 
594 
595  UErrorCode fDeferredStatus; // status if some prior error has left this
596  // RegexPattern in an unusable state.
597 
598  int32_t fMinMatchLen; // Minimum Match Length. All matches will have length
599  // >= this value. For some patterns, this calculated
600  // value may be less than the true shortest
601  // possible match.
602 
603  int32_t fFrameSize; // Size of a state stack frame in the
604  // execution engine.
605 
606  int32_t fDataSize; // The size of the data needed by the pattern that
607  // does not go on the state stack, but has just
608  // a single copy per matcher.
609 
610  UVector32 *fGroupMap; // Map from capture group number to position of
611  // the group's variables in the matcher stack frame.
612 
613  UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
614  // regex character classes, e.g. Word.
615 
616  Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only
617  // sets for predefined regex classes.
618 
619  int32_t fStartType; // Info on how a match must start.
620  int32_t fInitialStringIdx; //
621  int32_t fInitialStringLen;
622  UnicodeSet *fInitialChars;
623  UChar32 fInitialChar;
624  Regex8BitSet *fInitialChars8;
625  UBool fNeedsAltInput;
626 
627  UHashtable *fNamedCaptureMap; // Map from capture group names to numbers.
628 
    // These classes are granted access to the private data and methods.
629  friend class RegexCompile;
630  friend class RegexMatcher;
631  friend class RegexCImpl;
632 
633  //
634  // Implementation Methods
635  //
636  void init(); // Common initialization, for use by constructors.
637  void zap(); // Common cleanup
638 
639  void dumpOp(int32_t index) const;
640 
641  public:
    // Internal API: only declared when U_HIDE_INTERNAL_API is not defined.
642 #ifndef U_HIDE_INTERNAL_API
643 
647  void dumpPattern() const;
648 #endif /* U_HIDE_INTERNAL_API */
649 };
650 
651 
652 
// Class RegexMatcher bundles together a regular expression pattern and the
// input text it is applied to. It is declared U_FINAL and derives from
// UObject. The copy constructor and assignment operator are private
// (see the private section below).
662 class U_I18N_API RegexMatcher U_FINAL : public UObject {
663 public:
664 
    // Public constructors: the pattern is given as a UnicodeString or a UText
    // together with compile flags, optionally along with the input text.
679  RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
680 
696  RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
697 
719  RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
720  uint32_t flags, UErrorCode &status);
721 
743  RegexMatcher(UText *regexp, UText *input,
744  uint32_t flags, UErrorCode &status);
745 
746 private:
    // NOTE(review): this (const UChar *) input overload is private, so code
    // outside this class and its friends cannot call it. Presumably that is
    // deliberate, to stop a raw UChar string being passed where a
    // UnicodeString is expected - confirm against the implementation.
759  RegexMatcher(const UnicodeString &regexp, const UChar *input,
760  uint32_t flags, UErrorCode &status);
761 public:
762 
763 
769  virtual ~RegexMatcher();
770 
771 
    // Match operations. matches() and lookingAt() each have a form that
    // takes an explicit int64_t start index; find() additionally has a
    // form that takes no UErrorCode.
778  virtual UBool matches(UErrorCode &status);
779 
780 
791  virtual UBool matches(int64_t startIndex, UErrorCode &status);
792 
793 
807  virtual UBool lookingAt(UErrorCode &status);
808 
809 
823  virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
824 
825 
838  virtual UBool find();
839 
840 
854  virtual UBool find(UErrorCode &status);
855 
865  virtual UBool find(int64_t start, UErrorCode &status);
866 
867 
    // Capture group access, returned either as a UnicodeString or as a UText
    // (the UText forms also report a length through group_len).
877  virtual UnicodeString group(UErrorCode &status) const;
878 
879 
897  virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
898 
904  virtual int32_t groupCount() const;
905 
906 
921  virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const;
922 
943  virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
944 
    // start()/end() return int32_t; the 64-suffixed variants return int64_t.
    // Each has a form that takes a capture group number.
952  virtual int32_t start(UErrorCode &status) const;
953 
961  virtual int64_t start64(UErrorCode &status) const;
962 
963 
977  virtual int32_t start(int32_t group, UErrorCode &status) const;
978 
992  virtual int64_t start64(int32_t group, UErrorCode &status) const;
993 
1007  virtual int32_t end(UErrorCode &status) const;
1008 
1022  virtual int64_t end64(UErrorCode &status) const;
1023 
1024 
1042  virtual int32_t end(int32_t group, UErrorCode &status) const;
1043 
1061  virtual int64_t end64(int32_t group, UErrorCode &status) const;
1062 
    // reset() overloads: with no argument, with an index, or with new input
    // given as a UnicodeString or a UText. All return a reference to a
    // RegexMatcher.
1071  virtual RegexMatcher &reset();
1072 
1073 
1089  virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
1090 
1091 
1109  virtual RegexMatcher &reset(const UnicodeString &input);
1110 
1111 
1125  virtual RegexMatcher &reset(UText *input);
1126 
1127 
1152  virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
1153 
1154 private:
    // NOTE(review): private (const UChar *) overload, like the private
    // constructor above; presumably present to block accidental use with a
    // raw UChar string - confirm.
1167  RegexMatcher &reset(const UChar *input);
1168 public:
1169 
    // Input text accessors.
1177  virtual const UnicodeString &input() const;
1178 
1187  virtual UText *inputText() const;
1188 
1199  virtual UText *getInput(UText *dest, UErrorCode &status) const;
1200 
1201 
    // Region configuration and queries. regionStart()/regionEnd() return
    // int32_t; the 64-suffixed variants return int64_t.
1220  virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
1221 
1233  virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
1234 
1243  virtual int32_t regionStart() const;
1244 
1253  virtual int64_t regionStart64() const;
1254 
1255 
1264  virtual int32_t regionEnd() const;
1265 
1274  virtual int64_t regionEnd64() const;
1275 
1284  virtual UBool hasTransparentBounds() const;
1285 
1304  virtual RegexMatcher &useTransparentBounds(UBool b);
1305 
1306 
1314  virtual UBool hasAnchoringBounds() const;
1315 
1316 
1329  virtual RegexMatcher &useAnchoringBounds(UBool b);
1330 
1331 
1344  virtual UBool hitEnd() const;
1345 
1355  virtual UBool requireEnd() const;
1356 
1357 
1363  virtual const RegexPattern &pattern() const;
1364 
1365 
    // Replacement operations, in UnicodeString and UText forms.
1382  virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
1383 
1384 
1405  virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
1406 
1407 
1428  virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
1429 
1430 
1455  virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
1456 
1457 
1485  virtual RegexMatcher &appendReplacement(UnicodeString &dest,
1486  const UnicodeString &replacement, UErrorCode &status);
1487 
1488 
1516  virtual RegexMatcher &appendReplacement(UText *dest,
1517  UText *replacement, UErrorCode &status);
1518 
1519 
1530  virtual UnicodeString &appendTail(UnicodeString &dest);
1531 
1532 
1546  virtual UText *appendTail(UText *dest, UErrorCode &status);
1547 
1548 
    // split(): the caller supplies the destination array and its capacity;
    // an int32_t is returned. Unlike RegexPattern::split, these are non-const.
1572  virtual int32_t split(const UnicodeString &input,
1573  UnicodeString dest[],
1574  int32_t destCapacity,
1575  UErrorCode &status);
1576 
1577 
1601  virtual int32_t split(UText *input,
1602  UText *dest[],
1603  int32_t destCapacity,
1604  UErrorCode &status);
1605 
    // Resource limits; see the fTimeLimit and fStackLimit members below for
    // the units and the meaning of zero.
1627  virtual void setTimeLimit(int32_t limit, UErrorCode &status);
1628 
1635  virtual int32_t getTimeLimit() const;
1636 
1658  virtual void setStackLimit(int32_t limit, UErrorCode &status);
1659 
1667  virtual int32_t getStackLimit() const;
1668 
1669 
    // Callback registration. Each setter takes a callback function pointer
    // and a user context pointer; each getter returns both through
    // reference out-parameters.
1683  virtual void setMatchCallback(URegexMatchCallback *callback,
1684  const void *context,
1685  UErrorCode &status);
1686 
1687 
1698  virtual void getMatchCallback(URegexMatchCallback *&callback,
1699  const void *&context,
1700  UErrorCode &status);
1701 
1702 
1716  virtual void setFindProgressCallback(URegexFindProgressCallback *callback,
1717  const void *context,
1718  UErrorCode &status);
1719 
1720 
1731  virtual void getFindProgressCallback(URegexFindProgressCallback *&callback,
1732  const void *&context,
1733  UErrorCode &status);
1734 
    // Internal API: only declared when U_HIDE_INTERNAL_API is not defined.
1735 #ifndef U_HIDE_INTERNAL_API
1736 
1741  void setTrace(UBool state);
1742 #endif /* U_HIDE_INTERNAL_API */
1743 
    // ICU class identification (UClassID is used to identify classes
    // without using the compiler's RTTI).
1749  static UClassID U_EXPORT2 getStaticClassID();
1750 
1756  virtual UClassID getDynamicClassID() const;
1757 
1758 private:
1759  // Constructors and other object boilerplate are private.
1760  // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
1761  RegexMatcher(); // default constructor not implemented
1762  RegexMatcher(const RegexPattern *pat);
1763  RegexMatcher(const RegexMatcher &other);
1764  RegexMatcher &operator =(const RegexMatcher &rhs);
1765  void init(UErrorCode &status); // Common initialization
1766  void init2(UText *t, UErrorCode &e); // Common initialization, part 2.
1767 
    // These classes are granted access to the private constructors and data.
1768  friend class RegexPattern;
1769  friend class RegexCImpl;
1770 public:
    // Internal API: only declared when U_HIDE_INTERNAL_API is not defined.
1771 #ifndef U_HIDE_INTERNAL_API
1772 
1773  void resetPreserveRegion(); // Reset matcher state, but preserve any region.
1774 #endif /* U_HIDE_INTERNAL_API */
1775 private:
1776 
1777  //
1778  // MatchAt This is the internal interface to the match engine itself.
1779  // Match status comes back in matcher member variables.
1780  //
1781  void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
1782  inline void backTrack(int64_t &inputIdx, int32_t &patIdx);
1783  UBool isWordBoundary(int64_t pos); // perform Perl-like \b test
1784  UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test
1785  REStackFrame *resetStack();
1786  inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
1787  void IncrementTime(UErrorCode &status);
1788 
1789  // Call user find callback function, if set. Return TRUE if operation should be interrupted.
1790  inline UBool findProgressInterrupt(int64_t matchIndex, UErrorCode &status);
1791 
1792  int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
1793 
    // "Chunk" variants of find / MatchAt / isWordBoundary; note that they
    // take int32_t positions where the counterparts above take int64_t.
1794  UBool findUsingChunk(UErrorCode &status);
1795  void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
1796  UBool isChunkWordBoundary(int32_t pos);
1797 
1798  const RegexPattern *fPattern;
1799  RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and
1800  // should delete it when through.
1801 
1802  const UnicodeString *fInput; // The string being matched. Only used for input()
1803  UText *fInputText; // The text being matched. Is never NULL.
1804  UText *fAltInputText; // A shallow copy of the text being matched.
1805  // Only created if the pattern contains backreferences.
1806  int64_t fInputLength; // Full length of the input text.
1807  int32_t fFrameSize; // The size of a frame in the backtrack stack.
1808 
1809  int64_t fRegionStart; // Start of the input region, default = 0.
1810  int64_t fRegionLimit; // End of input region, default to input.length.
1811 
1812  int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $).
1813  int64_t fAnchorLimit; // See useAnchoringBounds
1814 
1815  int64_t fLookStart; // Region bounds for look-ahead/behind and
1816  int64_t fLookLimit; // other boundary tests. See
1817  // useTransparentBounds
1818 
1819  int64_t fActiveStart; // Currently active bounds for matching.
1820  int64_t fActiveLimit; // Usually is the same as region, but
1821  // is changed to fLookStart/Limit when
1822  // entering look around regions.
1823 
1824  UBool fTransparentBounds; // True if using transparent bounds.
1825  UBool fAnchoringBounds; // True if using anchoring bounds.
1826 
1827  UBool fMatch; // True if the last attempted match was successful.
1828  int64_t fMatchStart; // Position of the start of the most recent match
1829  int64_t fMatchEnd; // First position after the end of the most recent match
1830  // Zero if no previous match, even when a region
1831  // is active.
1832  int64_t fLastMatchEnd; // First position after the end of the previous match,
1833  // or -1 if there was no previous match.
1834  int64_t fAppendPosition; // First position after the end of the previous
1835  // appendReplacement(). As described by the
1836  // JavaDoc for Java Matcher, where it is called
1837  // "append position"
1838  UBool fHitEnd; // True if the last match touched the end of input.
1839  UBool fRequireEnd; // True if the last match required end-of-input
1840  // (matched $ or Z)
1841 
1842  UVector64 *fStack;
1843  REStackFrame *fFrame; // After finding a match, the last active stack frame,
1844  // which will contain the capture group results.
1845  // NOT valid while match engine is running.
1846 
1847  int64_t *fData; // Data area for use by the compiled pattern.
1848  int64_t fSmallData[8]; // Use this for data if it's enough.
1849 
1850  int32_t fTimeLimit; // Max time (in arbitrary steps) to let the
1851  // match engine run. Zero for unlimited.
1852 
1853  int32_t fTime; // Match time, accumulates while matching.
1854  int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves.
1855  // Kept separately from fTime to keep as much
1856  // code as possible out of the inline
1857  // StateSave function.
1858 
1859  int32_t fStackLimit; // Maximum memory size to use for the backtrack
1860  // stack, in bytes. Zero for unlimited.
1861 
1862  URegexMatchCallback *fCallbackFn; // Pointer to match progress callback funct.
1863  // NULL if there is no callback.
1864  const void *fCallbackContext; // User Context ptr for callback function.
1865 
1866  URegexFindProgressCallback *fFindProgressCallbackFn; // Pointer to find progress callback funct.
1867  // NULL if there is no callback.
1868  const void *fFindProgressCallbackContext; // User Context ptr for callback function.
1869 
1870 
1871  UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
1872 
1873  UBool fTraceDebug; // Set true for debug tracing of match engine.
1874 
1875  UErrorCode fDeferredStatus; // Save error state that cannot be immediately
1876  // reported, or that permanently disables this matcher.
1877 
1878  RuleBasedBreakIterator *fWordBreakItr;
1879 };
1880 
1882 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
1883 #endif
C++ API: Unicode String.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:91
Class RegexPattern represents a compiled regular expression.
Definition: regex.h:85
UBool URegexFindProgressCallback(const void *context, int64_t matchIndex)
Function pointer for a regular expression find callback function.
Definition: uregex.h:1571
C API: Abstract Unicode Text API.
class RegexMatcher bundles together a regular expression pattern and input text to which the expressi...
Definition: regex.h:662
UBool operator!=(const RegexPattern &that) const
Comparison operator.
Definition: regex.h:130
#define U_I18N_API
Set to export library symbols from inside the i18n library, and to import them from outside...
Definition: utypes.h:358
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:129
C API: Regular Expressions.
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:332
virtual UClassID getDynamicClassID() const
ICU4C "poor man's RTTI", returns a UClassID for the actual ICU class.
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:276
C++ API: Common ICU base class UObject.
uint16_t UChar
Define UChar to be UCHAR_TYPE, if that is #defined (for example, to char16_t), or wchar_t if that is ...
Definition: umachine.h:312
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:130
UBool URegexMatchCallback(const void *context, int32_t steps)
Function pointer for a regular expression matching callback function.
Definition: uregex.h:1497
C API: Parse Error Information.
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:476
UText struct.
Definition: utext.h:1343
A subclass of BreakIterator whose behavior is specified using a list of rules.
Definition: rbbi.h:65
A UParseError struct is used to return detailed information about parsing errors.
Definition: parseerr.h:56
Basic definitions for ICU, for both C and C++ APIs.
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:293
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:221
int8_t UBool
The ICU boolean type.
Definition: umachine.h:234