Source for gnu.java.util.regex.RESyntax

   1: /* gnu/regexp/RESyntax.java
   2:    Copyright (C) 2006 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: 
  39: package gnu.java.util.regex;
  40: import java.io.Serializable;
  41: import java.util.BitSet;
  42: 
  43: /**
  44:  * An RESyntax specifies the way a regular expression will be compiled.
  45:  * This class provides a number of predefined useful constants for
  46:  * emulating popular regular expression syntaxes.  Additionally the
  47:  * user may construct his or her own syntax, using any combination of the
  48:  * syntax bit constants.  The syntax is an optional argument to any of the
  49:  * matching methods on class RE.
  50:  *
  51:  * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
  52:  */
  53: 
  54: public final class RESyntax implements Serializable {
  55:     static final String DEFAULT_LINE_SEPARATOR = System.getProperty("line.separator");
  56: 
  57:     private BitSet bits;
  58: 
  59:     // true for the constant defined syntaxes
  60:     private boolean isFinal = false;
  61: 
  62:     private String lineSeparator = DEFAULT_LINE_SEPARATOR;
  63: 
  64:   // Values for constants are bit indexes
  65: 
  66:   /**
  67:    * Syntax bit. Backslash is an escape character in lists.
  68:    */
  69:   public static final int RE_BACKSLASH_ESCAPE_IN_LISTS =  0;
  70: 
  71:   /**
  72:    * Syntax bit. Use \? instead of ? and \+ instead of +.
  73:    */
  74:   public static final int RE_BK_PLUS_QM                =  1;
  75: 
  76:   /**
  77:    * Syntax bit. POSIX character classes ([:...:]) in lists are allowed.
  78:    */
  79:   public static final int RE_CHAR_CLASSES              =  2;
  80: 
  81:   /**
  82:    * Syntax bit. ^ and $ are special everywhere.
  83:    * <B>Not implemented.</B>
  84:    */
  85:   public static final int RE_CONTEXT_INDEP_ANCHORS     =  3; 
  86: 
  87:   /**
  88:    * Syntax bit. Repetition operators are only special in valid positions.
  89:    * <B>Not implemented.</B>
  90:    */
  91:   public static final int RE_CONTEXT_INDEP_OPS         =  4; 
  92: 
  93:   /**
  94:    * Syntax bit. Repetition and alternation operators are invalid
  95:    * at start and end of pattern and other places. 
  96:    * <B>Not implemented</B>.
  97:    */
  98:   public static final int RE_CONTEXT_INVALID_OPS       =  5; 
  99: 
 100:   /**
 101:    * Syntax bit. Match-any-character operator (.) matches a newline.
 102:    */
 103:   public static final int RE_DOT_NEWLINE               =  6;
 104: 
 105:   /**
 106:    * Syntax bit. Match-any-character operator (.) does not match a null.
 107:    */
 108:   public static final int RE_DOT_NOT_NULL              =  7;
 109: 
 110:   /**
 111:    * Syntax bit. Intervals ({x}, {x,}, {x,y}) are allowed.
 112:    */
 113:   public static final int RE_INTERVALS                 =  8;
 114: 
 115:   /**
 116:    * Syntax bit. No alternation (|), match one-or-more (+), or 
 117:    * match zero-or-one (?) operators.
 118:    */
 119:   public static final int RE_LIMITED_OPS               =  9;
 120: 
 121:   /**
 122:    * Syntax bit. Newline is an alternation operator.
 123:    */
 124:   public static final int RE_NEWLINE_ALT               = 10; // impl.
 125: 
 126:   /**
 127:    * Syntax bit. Intervals use { } instead of \{ \}
 128:    */
 129:   public static final int RE_NO_BK_BRACES              = 11; 
 130: 
 131:   /**
 132:    * Syntax bit. Grouping uses ( ) instead of \( \).
 133:    */
 134:   public static final int RE_NO_BK_PARENS              = 12;
 135: 
 136:   /**
 137:    * Syntax bit. Backreferences not allowed.
 138:    */
 139:   public static final int RE_NO_BK_REFS                = 13;
 140: 
 141:   /**
 142:    * Syntax bit. Alternation uses | instead of \|
 143:    */
 144:   public static final int RE_NO_BK_VBAR                = 14;
 145: 
 146:   /**
 147:    * Syntax bit. <B>Not implemented</B>.
 148:    */
 149:   public static final int RE_NO_EMPTY_RANGES           = 15;
 150: 
 151:   /**
 152:    * Syntax bit. An unmatched right parenthesis (')' or '\)', depending
 153:    * on RE_NO_BK_PARENS) will throw an exception when compiling.
 154:    */
 155:   public static final int RE_UNMATCHED_RIGHT_PAREN_ORD = 16;
 156: 
 157:   /**
 158:    * Syntax bit. <B>Not implemented.</B>
 159:    */
 160:   public static final int RE_HAT_LISTS_NOT_NEWLINE     = 17;
 161: 
 162:   /**
 163:    * Syntax bit.  Stingy matching is allowed (+?, *?, ??, {x,y}?).
 164:    */
 165:   public static final int RE_STINGY_OPS                = 18;
 166: 
 167:   /**
 168:    * Syntax bit. Allow character class escapes (\d, \D, \s, \S, \w, \W).
 169:    */
 170:   public static final int RE_CHAR_CLASS_ESCAPES        = 19;
 171: 
 172:   /**
 173:    * Syntax bit. Allow use of (?:xxx) grouping (subexpression is not saved).
 174:    */
 175:   public static final int RE_PURE_GROUPING             = 20;
 176: 
 177:   /**
 178:    * Syntax bit. Allow use of (?=xxx) and (?!xxx) apply the subexpression
 179:    * to the text following the current position without consuming that text.
 180:    */
 181:   public static final int RE_LOOKAHEAD                 = 21;
 182: 
 183:   /**
 184:    * Syntax bit. Allow beginning- and end-of-string anchors (\A, \Z).
 185:    */
 186:   public static final int RE_STRING_ANCHORS            = 22;
 187: 
 188:   /**
 189:    * Syntax bit. Allow embedded comments, (?#comment), as in Perl5.
 190:    */
 191:   public static final int RE_COMMENTS                  = 23;
 192: 
 193:   /**
 194:    * Syntax bit. Allow character class escapes within lists, as in Perl5.
 195:    */
 196:   public static final int RE_CHAR_CLASS_ESC_IN_LISTS   = 24;
 197: 
 198:   /**
 199:    * Syntax bit.  Possessive matching is allowed (++, *+, ?+, {x,y}+).
 200:    */
 201:   public static final int RE_POSSESSIVE_OPS            = 25;
 202: 
 203:   /**
 204:    * Syntax bit.  Allow embedded flags, (?is-x), as in Perl5.
 205:    */
 206:   public static final int RE_EMBEDDED_FLAGS            = 26;
 207: 
 208:   /**
 209:    * Syntax bit.  Allow octal char (\0377), as in Perl5.
 210:    */
 211:   public static final int RE_OCTAL_CHAR                = 27;
 212: 
 213:   /**
 214:    * Syntax bit.  Allow hex char (\x1b), as in Perl5.
 215:    */
 216:   public static final int RE_HEX_CHAR                  = 28;
 217: 
 218:   /**
 219:    * Syntax bit.  Allow Unicode char (\u1234), as in Java 1.4.
 220:    */
 221:   public static final int RE_UNICODE_CHAR              = 29;
 222: 
 223:   /**
 224:    * Syntax bit.  Allow named property (\p{P}, \P{p}), as in Perl5.
 225:    */
 226:   public static final int RE_NAMED_PROPERTY            = 30;
 227: 
 228:   /**
 229:    * Syntax bit.  Allow nested characterclass ([a-z&&[^p-r]]), as in Java 1.4.
 230:    */
 231:   public static final int RE_NESTED_CHARCLASS          = 31;
 232: 
 233:   private static final int BIT_TOTAL                   = 32;
 234: 
 235:   /**
 236:    * Predefined syntax.
 237:    * Emulates regular expression support in the awk utility.
 238:    */
 239:   public static final RESyntax RE_SYNTAX_AWK;
 240: 
 241:   /**
 242:    * Predefined syntax.
 243:    * Emulates regular expression support in the ed utility.
 244:    */
 245:   public static final RESyntax RE_SYNTAX_ED;
 246: 
 247:   /**
 248:    * Predefined syntax.
 249:    * Emulates regular expression support in the egrep utility.
 250:    */
 251:   public static final RESyntax RE_SYNTAX_EGREP;
 252: 
 253:   /**
 254:    * Predefined syntax.
 255:    * Emulates regular expression support in the GNU Emacs editor.
 256:    */
 257:   public static final RESyntax RE_SYNTAX_EMACS;
 258: 
 259:   /**
 260:    * Predefined syntax.
 261:    * Emulates regular expression support in the grep utility.
 262:    */
 263:   public static final RESyntax RE_SYNTAX_GREP;
 264: 
 265:   /**
 266:    * Predefined syntax.
 267:    * Emulates regular expression support in the POSIX awk specification.
 268:    */
 269:   public static final RESyntax RE_SYNTAX_POSIX_AWK;
 270: 
 271:   /**
 272:    * Predefined syntax.
 273:    * Emulates POSIX basic regular expression support.
 274:    */
 275:   public static final RESyntax RE_SYNTAX_POSIX_BASIC;
 276: 
 277:   /**
 278:    * Predefined syntax.
 279:    * Emulates regular expression support in the POSIX egrep specification.
 280:    */
 281:   public static final RESyntax RE_SYNTAX_POSIX_EGREP;
 282: 
 283:   /**
 284:    * Predefined syntax.
 285:    * Emulates POSIX extended regular expression support.
 286:    */
 287:   public static final RESyntax RE_SYNTAX_POSIX_EXTENDED;
 288: 
 289:   /**
 290:    * Predefined syntax.
 291:    * Emulates POSIX basic minimal regular expressions.
 292:    */
 293:   public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_BASIC;
 294: 
 295:   /**
 296:    * Predefined syntax.
 297:    * Emulates POSIX extended minimal regular expressions.
 298:    */
 299:   public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_EXTENDED;
 300: 
 301:   /**
 302:    * Predefined syntax.
 303:    * Emulates regular expression support in the sed utility.
 304:    */
 305:   public static final RESyntax RE_SYNTAX_SED;
 306: 
 307:   /**
 308:    * Predefined syntax.
 309:    * Emulates regular expression support in Larry Wall's perl, version 4,
 310:    */
 311:   public static final RESyntax RE_SYNTAX_PERL4;
 312: 
 313:   /**
 314:    * Predefined syntax.
 315:    * Emulates regular expression support in Larry Wall's perl, version 4,
 316:    * using single line mode (/s modifier).
 317:    */
 318:   public static final RESyntax RE_SYNTAX_PERL4_S; // single line mode (/s)
 319: 
 320:   /**
 321:    * Predefined syntax.
 322:    * Emulates regular expression support in Larry Wall's perl, version 5.
 323:    */
 324:   public static final RESyntax RE_SYNTAX_PERL5;  
 325: 
 326:   /**
 327:    * Predefined syntax.
 328:    * Emulates regular expression support in Larry Wall's perl, version 5,
 329:    * using single line mode (/s modifier).
 330:    */
 331:   public static final RESyntax RE_SYNTAX_PERL5_S;
 332: 
 333:     /**
 334:      * Predefined syntax.
 335:      * Emulates regular expression support in Java 1.4's java.util.regex
 336:      * package.
 337:      */
 338:     public static final RESyntax RE_SYNTAX_JAVA_1_4;
 339: 
 340:   static {
 341:       // Define syntaxes
 342:       
 343:       RE_SYNTAX_EMACS = new RESyntax().makeFinal();
 344:       
 345:       RESyntax RE_SYNTAX_POSIX_COMMON = new RESyntax()
 346:       .set(RE_CHAR_CLASSES)
 347:       .set(RE_DOT_NEWLINE)
 348:       .set(RE_DOT_NOT_NULL)
 349:       .set(RE_INTERVALS)
 350:       .set(RE_NO_EMPTY_RANGES)
 351:       .makeFinal();
 352:       
 353:       RE_SYNTAX_POSIX_BASIC = new RESyntax(RE_SYNTAX_POSIX_COMMON)
 354:       .set(RE_BK_PLUS_QM)
 355:       .makeFinal();
 356:       
 357:       RE_SYNTAX_POSIX_EXTENDED = new RESyntax(RE_SYNTAX_POSIX_COMMON)
 358:       .set(RE_CONTEXT_INDEP_ANCHORS)
 359:       .set(RE_CONTEXT_INDEP_OPS)
 360:       .set(RE_NO_BK_BRACES)
 361:       .set(RE_NO_BK_PARENS)
 362:       .set(RE_NO_BK_VBAR)
 363:       .set(RE_UNMATCHED_RIGHT_PAREN_ORD)
 364:       .makeFinal();
 365: 
 366:       RE_SYNTAX_AWK = new RESyntax()
 367:       .set(RE_BACKSLASH_ESCAPE_IN_LISTS)
 368:       .set(RE_DOT_NOT_NULL)
 369:       .set(RE_NO_BK_PARENS)
 370:       .set(RE_NO_BK_REFS)
 371:       .set(RE_NO_BK_VBAR)
 372:       .set(RE_NO_EMPTY_RANGES)
 373:       .set(RE_UNMATCHED_RIGHT_PAREN_ORD)
 374:       .makeFinal();
 375:       
 376:       RE_SYNTAX_POSIX_AWK = new RESyntax(RE_SYNTAX_POSIX_EXTENDED)
 377:       .set(RE_BACKSLASH_ESCAPE_IN_LISTS)
 378:       .makeFinal();
 379:       
 380:       RE_SYNTAX_GREP = new RESyntax()
 381:       .set(RE_BK_PLUS_QM)
 382:       .set(RE_CHAR_CLASSES)
 383:       .set(RE_HAT_LISTS_NOT_NEWLINE)
 384:       .set(RE_INTERVALS)
 385:       .set(RE_NEWLINE_ALT)
 386:       .makeFinal();
 387:       
 388:       RE_SYNTAX_EGREP = new RESyntax()
 389:       .set(RE_CHAR_CLASSES)
 390:       .set(RE_CONTEXT_INDEP_ANCHORS)
 391:       .set(RE_CONTEXT_INDEP_OPS)
 392:       .set(RE_HAT_LISTS_NOT_NEWLINE)
 393:       .set(RE_NEWLINE_ALT)
 394:       .set(RE_NO_BK_PARENS)
 395:       .set(RE_NO_BK_VBAR)
 396:       .makeFinal();
 397:     
 398:       RE_SYNTAX_POSIX_EGREP = new RESyntax(RE_SYNTAX_EGREP)
 399:       .set(RE_INTERVALS)
 400:       .set(RE_NO_BK_BRACES)
 401:       .makeFinal();
 402:     
 403:       /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff.  */
 404:     
 405:       RE_SYNTAX_ED = new RESyntax(RE_SYNTAX_POSIX_BASIC)
 406:       .makeFinal();
 407:     
 408:       RE_SYNTAX_SED = new RESyntax(RE_SYNTAX_POSIX_BASIC)
 409:       .makeFinal();
 410:       
 411:       RE_SYNTAX_POSIX_MINIMAL_BASIC = new RESyntax(RE_SYNTAX_POSIX_COMMON)
 412:       .set(RE_LIMITED_OPS)
 413:       .makeFinal();
 414:       
 415:       /* Differs from RE_SYNTAX_POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS
 416:      replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */
 417:       
 418:       RE_SYNTAX_POSIX_MINIMAL_EXTENDED = new RESyntax(RE_SYNTAX_POSIX_COMMON)
 419:       .set(RE_CONTEXT_INDEP_ANCHORS)
 420:       .set(RE_CONTEXT_INVALID_OPS)
 421:       .set(RE_NO_BK_BRACES)
 422:       .set(RE_NO_BK_PARENS)
 423:       .set(RE_NO_BK_REFS)
 424:       .set(RE_NO_BK_VBAR)
 425:       .set(RE_UNMATCHED_RIGHT_PAREN_ORD)
 426:       .makeFinal();
 427:       
 428:       /* There is no official Perl spec, but here's a "best guess" */
 429:       
 430:       RE_SYNTAX_PERL4 = new RESyntax()
 431:       .set(RE_BACKSLASH_ESCAPE_IN_LISTS)
 432:       .set(RE_CONTEXT_INDEP_ANCHORS)
 433:       .set(RE_CONTEXT_INDEP_OPS)          // except for '{', apparently
 434:       .set(RE_INTERVALS)
 435:       .set(RE_NO_BK_BRACES)
 436:       .set(RE_NO_BK_PARENS)
 437:       .set(RE_NO_BK_VBAR)
 438:       .set(RE_NO_EMPTY_RANGES)
 439:       .set(RE_CHAR_CLASS_ESCAPES)    // \d,\D,\w,\W,\s,\S
 440:       .makeFinal();
 441:       
 442:       RE_SYNTAX_PERL4_S = new RESyntax(RE_SYNTAX_PERL4)
 443:       .set(RE_DOT_NEWLINE)
 444:       .makeFinal();
 445:       
 446:       RE_SYNTAX_PERL5 = new RESyntax(RE_SYNTAX_PERL4)
 447:       .set(RE_PURE_GROUPING)          // (?:)
 448:       .set(RE_STINGY_OPS)             // *?,??,+?,{}?
 449:       .set(RE_LOOKAHEAD)              // (?=)(?!)
 450:       .set(RE_STRING_ANCHORS)         // \A,\Z
 451:       .set(RE_CHAR_CLASS_ESC_IN_LISTS)// \d,\D,\w,\W,\s,\S within []
 452:       .set(RE_COMMENTS)              // (?#)
 453:       .set(RE_EMBEDDED_FLAGS)         // (?imsx-imsx)
 454:       .set(RE_OCTAL_CHAR)             // \0377
 455:       .set(RE_HEX_CHAR)               // \x1b
 456:       .set(RE_NAMED_PROPERTY)         // \p{prop}, \P{prop}
 457:       .makeFinal();
 458:       
 459:       RE_SYNTAX_PERL5_S = new RESyntax(RE_SYNTAX_PERL5)
 460:       .set(RE_DOT_NEWLINE)
 461:       .makeFinal();
 462: 
 463:       RE_SYNTAX_JAVA_1_4 = new RESyntax(RE_SYNTAX_PERL5)
 464:       // XXX
 465:       .set(RE_POSSESSIVE_OPS)         // *+,?+,++,{}+
 466:       .set(RE_UNICODE_CHAR)           // \u1234
 467:       .set(RE_NESTED_CHARCLASS)       // [a-z&&[^p-r]]
 468:       .makeFinal();
 469:   }
 470: 
 471:   /**
 472:    * Construct a new syntax object with all bits turned off.
 473:    * This is equivalent to RE_SYNTAX_EMACS.
 474:    */
 475:   public RESyntax() {
 476:     bits = new BitSet(BIT_TOTAL);
 477:   }
 478: 
 479:     /**
 480:      * Called internally when constructing predefined syntaxes
 481:      * so their interpretation cannot vary.  Conceivably useful
 482:      * for your syntaxes as well.  Causes IllegalAccessError to
 483:      * be thrown if any attempt to modify the syntax is made.
 484:      *
 485:      * @return this object for convenient chaining
 486:      */
 487:     public RESyntax makeFinal() {
 488:     isFinal = true;
 489:     return this;
 490:     }
 491: 
 492:   /**
 493:    * Construct a new syntax object with all bits set the same 
 494:    * as the other syntax.
 495:    */
 496:   public RESyntax(RESyntax other) {
 497:     bits = (BitSet) other.bits.clone();
 498:   }
 499: 
 500:   /**
 501:    * Check if a given bit is set in this syntax.
 502:    */
 503:   public boolean get(int index) {
 504:     return bits.get(index);
 505:   }
 506: 
 507:   /**
 508:    * Set a given bit in this syntax. 
 509:    *
 510:    * @param index the constant (RESyntax.RE_xxx) bit to set.
 511:    * @return a reference to this object for easy chaining.
 512:    */
 513:   public RESyntax set(int index) {
 514:     if (isFinal)
 515:       throw new IllegalAccessError(RE.getLocalizedMessage("syntax.final"));
 516:     bits.set(index);
 517:     return this;
 518:   }
 519: 
 520:   /**
 521:    * Clear a given bit in this syntax. 
 522:    *
 523:    * @param index the constant (RESyntax.RE_xxx) bit to clear.
 524:    * @return a reference to this object for easy chaining.
 525:    */
 526:   public RESyntax clear(int index) {
 527:       if (isFinal)
 528:         throw new IllegalAccessError(RE.getLocalizedMessage("syntax.final"));
 529:       bits.clear(index);
 530:       return this;
 531:   }
 532: 
 533:     /**
 534:      * Changes the line separator string for regular expressions
 535:      * created using this RESyntax.  The default separator is the
 536:      * value returned by the system property "line.separator", which
 537:      * should be correct when reading platform-specific files from a
 538:      * filesystem.  However, many programs may collect input from
 539:      * sources where the line separator is differently specified (for
 540:      * example, in the applet environment, the text box widget
 541:      * interprets line breaks as single-character newlines,
 542:      * regardless of the host platform.
 543:      *
 544:      * Note that setting the line separator to a character or
 545:      * characters that have specific meaning within the current syntax
 546:      * can cause unexpected chronosynclastic infundibula.
 547:      *
 548:      * @return this object for convenient chaining 
 549:      */
 550:     public RESyntax setLineSeparator(String aSeparator) {
 551:     if (isFinal)
 552:           throw new IllegalAccessError(RE.getLocalizedMessage("syntax.final"));
 553:     lineSeparator = aSeparator;
 554:     return this;
 555:     }
 556: 
 557:     /**
 558:      * Returns the currently active line separator string.  The default
 559:      * is the platform-dependent system property "line.separator".
 560:      */
 561:     public String getLineSeparator() {
 562:     return lineSeparator;
 563:     }
 564: }