Frames | No Frames |
1: /* gnu/regexp/RESyntax.java 2: Copyright (C) 2006 Free Software Foundation, Inc. 3: 4: This file is part of GNU Classpath. 5: 6: GNU Classpath is free software; you can redistribute it and/or modify 7: it under the terms of the GNU General Public License as published by 8: the Free Software Foundation; either version 2, or (at your option) 9: any later version. 10: 11: GNU Classpath is distributed in the hope that it will be useful, but 12: WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: General Public License for more details. 15: 16: You should have received a copy of the GNU General Public License 17: along with GNU Classpath; see the file COPYING. If not, write to the 18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19: 02110-1301 USA. 20: 21: Linking this library statically or dynamically with other modules is 22: making a combined work based on this library. Thus, the terms and 23: conditions of the GNU General Public License cover the whole 24: combination. 25: 26: As a special exception, the copyright holders of this library give you 27: permission to link this library with independent modules to produce an 28: executable, regardless of the license terms of these independent 29: modules, and to copy and distribute the resulting executable under 30: terms of your choice, provided that you also meet, for each linked 31: independent module, the terms and conditions of the license of that 32: module. An independent module is a module which is not derived from 33: or based on this library. If you modify this library, you may extend 34: this exception to your version of the library, but you are not 35: obligated to do so. If you do not wish to do so, delete this 36: exception statement from your version. */ 37: 38: 39: package gnu.java.util.regex; 40: import java.io.Serializable; 41: import java.util.BitSet; 42: 43: /** 44: * An RESyntax specifies the way a regular expression will be compiled. 45: * This class provides a number of predefined useful constants for 46: * emulating popular regular expression syntaxes. Additionally the 47: * user may construct his or her own syntax, using any combination of the 48: * syntax bit constants. The syntax is an optional argument to any of the 49: * matching methods on class RE. 50: * 51: * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A> 52: */ 53: 54: public final class RESyntax implements Serializable { 55: static final String DEFAULT_LINE_SEPARATOR = System.getProperty("line.separator"); 56: 57: private BitSet bits; 58: 59: // true for the constant defined syntaxes 60: private boolean isFinal = false; 61: 62: private String lineSeparator = DEFAULT_LINE_SEPARATOR; 63: 64: // Values for constants are bit indexes 65: 66: /** 67: * Syntax bit. Backslash is an escape character in lists. 68: */ 69: public static final int RE_BACKSLASH_ESCAPE_IN_LISTS = 0; 70: 71: /** 72: * Syntax bit. Use \? instead of ? and \+ instead of +. 73: */ 74: public static final int RE_BK_PLUS_QM = 1; 75: 76: /** 77: * Syntax bit. POSIX character classes ([:...:]) in lists are allowed. 78: */ 79: public static final int RE_CHAR_CLASSES = 2; 80: 81: /** 82: * Syntax bit. ^ and $ are special everywhere. 83: * <B>Not implemented.</B> 84: */ 85: public static final int RE_CONTEXT_INDEP_ANCHORS = 3; 86: 87: /** 88: * Syntax bit. Repetition operators are only special in valid positions. 89: * <B>Not implemented.</B> 90: */ 91: public static final int RE_CONTEXT_INDEP_OPS = 4; 92: 93: /** 94: * Syntax bit. Repetition and alternation operators are invalid 95: * at start and end of pattern and other places. 96: * <B>Not implemented</B>. 97: */ 98: public static final int RE_CONTEXT_INVALID_OPS = 5; 99: 100: /** 101: * Syntax bit. Match-any-character operator (.) matches a newline. 102: */ 103: public static final int RE_DOT_NEWLINE = 6; 104: 105: /** 106: * Syntax bit. Match-any-character operator (.) does not match a null. 107: */ 108: public static final int RE_DOT_NOT_NULL = 7; 109: 110: /** 111: * Syntax bit. Intervals ({x}, {x,}, {x,y}) are allowed. 112: */ 113: public static final int RE_INTERVALS = 8; 114: 115: /** 116: * Syntax bit. No alternation (|), match one-or-more (+), or 117: * match zero-or-one (?) operators. 118: */ 119: public static final int RE_LIMITED_OPS = 9; 120: 121: /** 122: * Syntax bit. Newline is an alternation operator. 123: */ 124: public static final int RE_NEWLINE_ALT = 10; // impl. 125: 126: /** 127: * Syntax bit. Intervals use { } instead of \{ \} 128: */ 129: public static final int RE_NO_BK_BRACES = 11; 130: 131: /** 132: * Syntax bit. Grouping uses ( ) instead of \( \). 133: */ 134: public static final int RE_NO_BK_PARENS = 12; 135: 136: /** 137: * Syntax bit. Backreferences not allowed. 138: */ 139: public static final int RE_NO_BK_REFS = 13; 140: 141: /** 142: * Syntax bit. Alternation uses | instead of \| 143: */ 144: public static final int RE_NO_BK_VBAR = 14; 145: 146: /** 147: * Syntax bit. <B>Not implemented</B>. 148: */ 149: public static final int RE_NO_EMPTY_RANGES = 15; 150: 151: /** 152: * Syntax bit. An unmatched right parenthesis (')' or '\)', depending 153: * on RE_NO_BK_PARENS) will throw an exception when compiling. 154: */ 155: public static final int RE_UNMATCHED_RIGHT_PAREN_ORD = 16; 156: 157: /** 158: * Syntax bit. <B>Not implemented.</B> 159: */ 160: public static final int RE_HAT_LISTS_NOT_NEWLINE = 17; 161: 162: /** 163: * Syntax bit. Stingy matching is allowed (+?, *?, ??, {x,y}?). 164: */ 165: public static final int RE_STINGY_OPS = 18; 166: 167: /** 168: * Syntax bit. Allow character class escapes (\d, \D, \s, \S, \w, \W). 169: */ 170: public static final int RE_CHAR_CLASS_ESCAPES = 19; 171: 172: /** 173: * Syntax bit. Allow use of (?:xxx) grouping (subexpression is not saved). 174: */ 175: public static final int RE_PURE_GROUPING = 20; 176: 177: /** 178: * Syntax bit. Allow use of (?=xxx) and (?!xxx) apply the subexpression 179: * to the text following the current position without consuming that text. 180: */ 181: public static final int RE_LOOKAHEAD = 21; 182: 183: /** 184: * Syntax bit. Allow beginning- and end-of-string anchors (\A, \Z). 185: */ 186: public static final int RE_STRING_ANCHORS = 22; 187: 188: /** 189: * Syntax bit. Allow embedded comments, (?#comment), as in Perl5. 190: */ 191: public static final int RE_COMMENTS = 23; 192: 193: /** 194: * Syntax bit. Allow character class escapes within lists, as in Perl5. 195: */ 196: public static final int RE_CHAR_CLASS_ESC_IN_LISTS = 24; 197: 198: /** 199: * Syntax bit. Possessive matching is allowed (++, *+, ?+, {x,y}+). 200: */ 201: public static final int RE_POSSESSIVE_OPS = 25; 202: 203: /** 204: * Syntax bit. Allow embedded flags, (?is-x), as in Perl5. 205: */ 206: public static final int RE_EMBEDDED_FLAGS = 26; 207: 208: /** 209: * Syntax bit. Allow octal char (\0377), as in Perl5. 210: */ 211: public static final int RE_OCTAL_CHAR = 27; 212: 213: /** 214: * Syntax bit. Allow hex char (\x1b), as in Perl5. 215: */ 216: public static final int RE_HEX_CHAR = 28; 217: 218: /** 219: * Syntax bit. Allow Unicode char (\u1234), as in Java 1.4. 220: */ 221: public static final int RE_UNICODE_CHAR = 29; 222: 223: /** 224: * Syntax bit. Allow named property (\p{P}, \P{p}), as in Perl5. 225: */ 226: public static final int RE_NAMED_PROPERTY = 30; 227: 228: /** 229: * Syntax bit. Allow nested characterclass ([a-z&&[^p-r]]), as in Java 1.4. 230: */ 231: public static final int RE_NESTED_CHARCLASS = 31; 232: 233: private static final int BIT_TOTAL = 32; 234: 235: /** 236: * Predefined syntax. 237: * Emulates regular expression support in the awk utility. 238: */ 239: public static final RESyntax RE_SYNTAX_AWK; 240: 241: /** 242: * Predefined syntax. 243: * Emulates regular expression support in the ed utility. 244: */ 245: public static final RESyntax RE_SYNTAX_ED; 246: 247: /** 248: * Predefined syntax. 249: * Emulates regular expression support in the egrep utility. 250: */ 251: public static final RESyntax RE_SYNTAX_EGREP; 252: 253: /** 254: * Predefined syntax. 255: * Emulates regular expression support in the GNU Emacs editor. 256: */ 257: public static final RESyntax RE_SYNTAX_EMACS; 258: 259: /** 260: * Predefined syntax. 261: * Emulates regular expression support in the grep utility. 262: */ 263: public static final RESyntax RE_SYNTAX_GREP; 264: 265: /** 266: * Predefined syntax. 267: * Emulates regular expression support in the POSIX awk specification. 268: */ 269: public static final RESyntax RE_SYNTAX_POSIX_AWK; 270: 271: /** 272: * Predefined syntax. 273: * Emulates POSIX basic regular expression support. 274: */ 275: public static final RESyntax RE_SYNTAX_POSIX_BASIC; 276: 277: /** 278: * Predefined syntax. 279: * Emulates regular expression support in the POSIX egrep specification. 280: */ 281: public static final RESyntax RE_SYNTAX_POSIX_EGREP; 282: 283: /** 284: * Predefined syntax. 285: * Emulates POSIX extended regular expression support. 286: */ 287: public static final RESyntax RE_SYNTAX_POSIX_EXTENDED; 288: 289: /** 290: * Predefined syntax. 291: * Emulates POSIX basic minimal regular expressions. 292: */ 293: public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_BASIC; 294: 295: /** 296: * Predefined syntax. 297: * Emulates POSIX extended minimal regular expressions. 298: */ 299: public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_EXTENDED; 300: 301: /** 302: * Predefined syntax. 303: * Emulates regular expression support in the sed utility. 304: */ 305: public static final RESyntax RE_SYNTAX_SED; 306: 307: /** 308: * Predefined syntax. 309: * Emulates regular expression support in Larry Wall's perl, version 4, 310: */ 311: public static final RESyntax RE_SYNTAX_PERL4; 312: 313: /** 314: * Predefined syntax. 315: * Emulates regular expression support in Larry Wall's perl, version 4, 316: * using single line mode (/s modifier). 317: */ 318: public static final RESyntax RE_SYNTAX_PERL4_S; // single line mode (/s) 319: 320: /** 321: * Predefined syntax. 322: * Emulates regular expression support in Larry Wall's perl, version 5. 323: */ 324: public static final RESyntax RE_SYNTAX_PERL5; 325: 326: /** 327: * Predefined syntax. 328: * Emulates regular expression support in Larry Wall's perl, version 5, 329: * using single line mode (/s modifier). 330: */ 331: public static final RESyntax RE_SYNTAX_PERL5_S; 332: 333: /** 334: * Predefined syntax. 335: * Emulates regular expression support in Java 1.4's java.util.regex 336: * package. 337: */ 338: public static final RESyntax RE_SYNTAX_JAVA_1_4; 339: 340: static { 341: // Define syntaxes 342: 343: RE_SYNTAX_EMACS = new RESyntax().makeFinal(); 344: 345: RESyntax RE_SYNTAX_POSIX_COMMON = new RESyntax() 346: .set(RE_CHAR_CLASSES) 347: .set(RE_DOT_NEWLINE) 348: .set(RE_DOT_NOT_NULL) 349: .set(RE_INTERVALS) 350: .set(RE_NO_EMPTY_RANGES) 351: .makeFinal(); 352: 353: RE_SYNTAX_POSIX_BASIC = new RESyntax(RE_SYNTAX_POSIX_COMMON) 354: .set(RE_BK_PLUS_QM) 355: .makeFinal(); 356: 357: RE_SYNTAX_POSIX_EXTENDED = new RESyntax(RE_SYNTAX_POSIX_COMMON) 358: .set(RE_CONTEXT_INDEP_ANCHORS) 359: .set(RE_CONTEXT_INDEP_OPS) 360: .set(RE_NO_BK_BRACES) 361: .set(RE_NO_BK_PARENS) 362: .set(RE_NO_BK_VBAR) 363: .set(RE_UNMATCHED_RIGHT_PAREN_ORD) 364: .makeFinal(); 365: 366: RE_SYNTAX_AWK = new RESyntax() 367: .set(RE_BACKSLASH_ESCAPE_IN_LISTS) 368: .set(RE_DOT_NOT_NULL) 369: .set(RE_NO_BK_PARENS) 370: .set(RE_NO_BK_REFS) 371: .set(RE_NO_BK_VBAR) 372: .set(RE_NO_EMPTY_RANGES) 373: .set(RE_UNMATCHED_RIGHT_PAREN_ORD) 374: .makeFinal(); 375: 376: RE_SYNTAX_POSIX_AWK = new RESyntax(RE_SYNTAX_POSIX_EXTENDED) 377: .set(RE_BACKSLASH_ESCAPE_IN_LISTS) 378: .makeFinal(); 379: 380: RE_SYNTAX_GREP = new RESyntax() 381: .set(RE_BK_PLUS_QM) 382: .set(RE_CHAR_CLASSES) 383: .set(RE_HAT_LISTS_NOT_NEWLINE) 384: .set(RE_INTERVALS) 385: .set(RE_NEWLINE_ALT) 386: .makeFinal(); 387: 388: RE_SYNTAX_EGREP = new RESyntax() 389: .set(RE_CHAR_CLASSES) 390: .set(RE_CONTEXT_INDEP_ANCHORS) 391: .set(RE_CONTEXT_INDEP_OPS) 392: .set(RE_HAT_LISTS_NOT_NEWLINE) 393: .set(RE_NEWLINE_ALT) 394: .set(RE_NO_BK_PARENS) 395: .set(RE_NO_BK_VBAR) 396: .makeFinal(); 397: 398: RE_SYNTAX_POSIX_EGREP = new RESyntax(RE_SYNTAX_EGREP) 399: .set(RE_INTERVALS) 400: .set(RE_NO_BK_BRACES) 401: .makeFinal(); 402: 403: /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */ 404: 405: RE_SYNTAX_ED = new RESyntax(RE_SYNTAX_POSIX_BASIC) 406: .makeFinal(); 407: 408: RE_SYNTAX_SED = new RESyntax(RE_SYNTAX_POSIX_BASIC) 409: .makeFinal(); 410: 411: RE_SYNTAX_POSIX_MINIMAL_BASIC = new RESyntax(RE_SYNTAX_POSIX_COMMON) 412: .set(RE_LIMITED_OPS) 413: .makeFinal(); 414: 415: /* Differs from RE_SYNTAX_POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS 416: replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */ 417: 418: RE_SYNTAX_POSIX_MINIMAL_EXTENDED = new RESyntax(RE_SYNTAX_POSIX_COMMON) 419: .set(RE_CONTEXT_INDEP_ANCHORS) 420: .set(RE_CONTEXT_INVALID_OPS) 421: .set(RE_NO_BK_BRACES) 422: .set(RE_NO_BK_PARENS) 423: .set(RE_NO_BK_REFS) 424: .set(RE_NO_BK_VBAR) 425: .set(RE_UNMATCHED_RIGHT_PAREN_ORD) 426: .makeFinal(); 427: 428: /* There is no official Perl spec, but here's a "best guess" */ 429: 430: RE_SYNTAX_PERL4 = new RESyntax() 431: .set(RE_BACKSLASH_ESCAPE_IN_LISTS) 432: .set(RE_CONTEXT_INDEP_ANCHORS) 433: .set(RE_CONTEXT_INDEP_OPS) // except for '{', apparently 434: .set(RE_INTERVALS) 435: .set(RE_NO_BK_BRACES) 436: .set(RE_NO_BK_PARENS) 437: .set(RE_NO_BK_VBAR) 438: .set(RE_NO_EMPTY_RANGES) 439: .set(RE_CHAR_CLASS_ESCAPES) // \d,\D,\w,\W,\s,\S 440: .makeFinal(); 441: 442: RE_SYNTAX_PERL4_S = new RESyntax(RE_SYNTAX_PERL4) 443: .set(RE_DOT_NEWLINE) 444: .makeFinal(); 445: 446: RE_SYNTAX_PERL5 = new RESyntax(RE_SYNTAX_PERL4) 447: .set(RE_PURE_GROUPING) // (?:) 448: .set(RE_STINGY_OPS) // *?,??,+?,{}? 449: .set(RE_LOOKAHEAD) // (?=)(?!) 450: .set(RE_STRING_ANCHORS) // \A,\Z 451: .set(RE_CHAR_CLASS_ESC_IN_LISTS)// \d,\D,\w,\W,\s,\S within [] 452: .set(RE_COMMENTS) // (?#) 453: .set(RE_EMBEDDED_FLAGS) // (?imsx-imsx) 454: .set(RE_OCTAL_CHAR) // \0377 455: .set(RE_HEX_CHAR) // \x1b 456: .set(RE_NAMED_PROPERTY) // \p{prop}, \P{prop} 457: .makeFinal(); 458: 459: RE_SYNTAX_PERL5_S = new RESyntax(RE_SYNTAX_PERL5) 460: .set(RE_DOT_NEWLINE) 461: .makeFinal(); 462: 463: RE_SYNTAX_JAVA_1_4 = new RESyntax(RE_SYNTAX_PERL5) 464: // XXX 465: .set(RE_POSSESSIVE_OPS) // *+,?+,++,{}+ 466: .set(RE_UNICODE_CHAR) // \u1234 467: .set(RE_NESTED_CHARCLASS) // [a-z&&[^p-r]] 468: .makeFinal(); 469: } 470: 471: /** 472: * Construct a new syntax object with all bits turned off. 473: * This is equivalent to RE_SYNTAX_EMACS. 474: */ 475: public RESyntax() { 476: bits = new BitSet(BIT_TOTAL); 477: } 478: 479: /** 480: * Called internally when constructing predefined syntaxes 481: * so their interpretation cannot vary. Conceivably useful 482: * for your syntaxes as well. Causes IllegalAccessError to 483: * be thrown if any attempt to modify the syntax is made. 484: * 485: * @return this object for convenient chaining 486: */ 487: public RESyntax makeFinal() { 488: isFinal = true; 489: return this; 490: } 491: 492: /** 493: * Construct a new syntax object with all bits set the same 494: * as the other syntax. 495: */ 496: public RESyntax(RESyntax other) { 497: bits = (BitSet) other.bits.clone(); 498: } 499: 500: /** 501: * Check if a given bit is set in this syntax. 502: */ 503: public boolean get(int index) { 504: return bits.get(index); 505: } 506: 507: /** 508: * Set a given bit in this syntax. 509: * 510: * @param index the constant (RESyntax.RE_xxx) bit to set. 511: * @return a reference to this object for easy chaining. 512: */ 513: public RESyntax set(int index) { 514: if (isFinal) 515: throw new IllegalAccessError(RE.getLocalizedMessage("syntax.final")); 516: bits.set(index); 517: return this; 518: } 519: 520: /** 521: * Clear a given bit in this syntax. 522: * 523: * @param index the constant (RESyntax.RE_xxx) bit to clear. 524: * @return a reference to this object for easy chaining. 525: */ 526: public RESyntax clear(int index) { 527: if (isFinal) 528: throw new IllegalAccessError(RE.getLocalizedMessage("syntax.final")); 529: bits.clear(index); 530: return this; 531: } 532: 533: /** 534: * Changes the line separator string for regular expressions 535: * created using this RESyntax. The default separator is the 536: * value returned by the system property "line.separator", which 537: * should be correct when reading platform-specific files from a 538: * filesystem. However, many programs may collect input from 539: * sources where the line separator is differently specified (for 540: * example, in the applet environment, the text box widget 541: * interprets line breaks as single-character newlines, 542: * regardless of the host platform. 543: * 544: * Note that setting the line separator to a character or 545: * characters that have specific meaning within the current syntax 546: * can cause unexpected chronosynclastic infundibula. 547: * 548: * @return this object for convenient chaining 549: */ 550: public RESyntax setLineSeparator(String aSeparator) { 551: if (isFinal) 552: throw new IllegalAccessError(RE.getLocalizedMessage("syntax.final")); 553: lineSeparator = aSeparator; 554: return this; 555: } 556: 557: /** 558: * Returns the currently active line separator string. The default 559: * is the platform-dependent system property "line.separator". 560: */ 561: public String getLineSeparator() { 562: return lineSeparator; 563: } 564: }