Source for gnu.xml.xpath.XPathTokenizer

   1: /* XPathTokenizer.java -- 
   2:    Copyright (C) 2004 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: package gnu.xml.xpath;
  39: 
  40: import java.io.BufferedReader;
  41: import java.io.IOException;
  42: import java.io.Reader;
  43: import java.io.StringReader;
  44: import java.util.Map;
  45: import java.util.TreeMap;
  46: 
  47: /*import antlr.Token;
  48: import antlr.TokenStream;
  49: import antlr.TokenStreamException;
  50: import antlr.TokenStreamIOException;*/
  51: 
  52: /**
  53:  * XPath 1.0 expression tokenizer.
  54:  * 
  55:  * @author <a href='mailto:dog@gnu.org'>Chris Burdess</a>
  56:  */
  57: public class XPathTokenizer
  58: implements XPathParser.yyInput
  59: //implements TokenStream
  60: {
  61: 
  62:   static class XPathToken
  63:   //extends Token
  64:   {
  65: 
  66:     int type;
  67:     String val;
  68: 
  69:     XPathToken (int type)
  70:     {
  71:       this (type, null);
  72:     }
  73: 
  74:     XPathToken (int type, String val)
  75:     {
  76:       //super (type);
  77:       this.type = type;
  78:       this.val = val;
  79:     }
  80: 
  81:     public String getText ()
  82:     {
  83:       return val;
  84:     }
  85: 
  86:     public String toString ()
  87:     {
  88:       return val;
  89:     }
  90:     
  91:   }
  92: 
  /**
   * Reserved words recognised by the tokenizer. Each key is the word as
   * it appears in an expression; each value is an Integer wrapping the
   * corresponding XPathParser token code.
   */
  static final Map keywords = new TreeMap ();
  static
  {
    final Object[][] entries = {
      // Axis names
      { "ancestor", new Integer (XPathParser.ANCESTOR) },
      { "ancestor-or-self", new Integer (XPathParser.ANCESTOR_OR_SELF) },
      { "attribute", new Integer (XPathParser.ATTRIBUTE) },
      { "child", new Integer (XPathParser.CHILD) },
      { "descendant", new Integer (XPathParser.DESCENDANT) },
      { "descendant-or-self", new Integer (XPathParser.DESCENDANT_OR_SELF) },
      { "following", new Integer (XPathParser.FOLLOWING) },
      { "following-sibling", new Integer (XPathParser.FOLLOWING_SIBLING) },
      { "namespace", new Integer (XPathParser.NAMESPACE) },
      { "parent", new Integer (XPathParser.PARENT) },
      { "preceding", new Integer (XPathParser.PRECEDING) },
      { "preceding-sibling", new Integer (XPathParser.PRECEDING_SIBLING) },
      { "self", new Integer (XPathParser.SELF) },
      // Operator names
      { "div", new Integer (XPathParser.DIV) },
      { "mod", new Integer (XPathParser.MOD) },
      { "or", new Integer (XPathParser.OR) },
      { "and", new Integer (XPathParser.AND) },
      // Node type tests
      { "comment", new Integer (XPathParser.COMMENT) },
      { "processing-instruction", new Integer (XPathParser.PROCESSING_INSTRUCTION) },
      { "text", new Integer (XPathParser.TEXT) },
      { "node", new Integer (XPathParser.NODE) }
    };
    for (int i = 0; i < entries.length; i++)
      {
        keywords.put (entries[i][0], entries[i][1]);
      }
  }
 118: 
 119:   Reader in;
 120:   XPathToken token;
 121:   XPathToken lastToken;
 122: 
 123:   public XPathTokenizer (String expr)
 124:   {
 125:     this (new StringReader (expr));
 126:   }
 127: 
 128:   XPathTokenizer (Reader in)
 129:   {
 130:     this.in = in.markSupported () ? in : new BufferedReader (in);
 131:   }
 132: 
 133:   /* Begin ANTLR specific *
 134: 
 135:   public Token nextToken ()
 136:     throws TokenStreamException
 137:   {
 138:     try
 139:       {
 140:         if (!advance ())
 141:           {
 142:             throw new TokenStreamException ("eof");
 143:           }
 144:         token ();
 145:         return token;
 146:       }
 147:     catch (IOException e)
 148:       {
 149:         throw new TokenStreamIOException (e);
 150:       }
 151:   }
 152:   
 153:   * End ANTLR specific */
 154: 
 155:   public boolean advance ()
 156:     throws IOException
 157:   {
 158:     lastToken = token;
 159:     int c = in.read ();
 160:     switch (c)
 161:       {
 162:       case -1: // eof
 163:         return false;
 164:       case 0x20:
 165:       case 0x09:
 166:       case 0x0d:
 167:       case 0x0a: // skip whitespace
 168:         return advance ();
 169:       case 0x22: // "
 170:       case 0x27: // '
 171:         token = consume_literal (c);
 172:         break;
 173:       case 0x28: // (
 174:         token = new XPathToken (XPathParser.LP);
 175:         break;
 176:       case 0x29: // )
 177:         token = new XPathToken (XPathParser.RP);
 178:         break;
 179:       case 0x5b: // [
 180:         token = new XPathToken (XPathParser.LB);
 181:         break;
 182:       case 0x5d: // ]
 183:         token = new XPathToken (XPathParser.RB);
 184:         break;
 185:       case 0x2c: // ,
 186:         token = new XPathToken (XPathParser.COMMA);
 187:         break;
 188:       case 0x7c: // |
 189:         token = new XPathToken (XPathParser.PIPE);
 190:         break;
 191:       case 0x2f: // /
 192:         in.mark (1);
 193:         int d1 = in.read ();
 194:         if (d1 == 0x2f)
 195:           {
 196:             token = new XPathToken (XPathParser.DOUBLE_SLASH);
 197:           }
 198:         else
 199:           {
 200:             in.reset ();
 201:             token = new XPathToken (XPathParser.SLASH);
 202:           }
 203:         break;
 204:       case 0x3d: // =
 205:         token = new XPathToken (XPathParser.EQ);
 206:         break;
 207:       case 0x21: // !
 208:         in.mark (1);
 209:         int d2 = in.read ();
 210:         if (d2 == 0x3d) // =
 211:           {
 212:             token = new XPathToken (XPathParser.NE);
 213:           }
 214:         else
 215:           {
 216:             in.reset ();
 217:             token = new XPathToken (XPathParser.yyErrorCode);
 218:           }
 219:         break;
 220:       case 0x3e: // >
 221:         in.mark (1);
 222:         int d3 = in.read ();
 223:         if (d3 == 0x3d) // =
 224:           {
 225:             token = new XPathToken (XPathParser.GTE);
 226:           }
 227:         else
 228:           {
 229:             in.reset ();
 230:             token = new XPathToken (XPathParser.GT);
 231:           }
 232:         break;
 233:       case 0x3c: // <
 234:         in.mark (1);
 235:         int d4 = in.read ();
 236:         if (d4 == 0x3d) // =
 237:           {
 238:             token = new XPathToken (XPathParser.LTE);
 239:           }
 240:         else
 241:           {
 242:             in.reset ();
 243:             token = new XPathToken (XPathParser.LT);
 244:           }
 245:         break;
 246:       case 0x2b: // +
 247:         token = new XPathToken (XPathParser.PLUS);
 248:         break;
 249:       case 0x2d: // -
 250:         token = new XPathToken (XPathParser.MINUS);
 251:         break;
 252:       case 0x40: // @
 253:         token = new XPathToken (XPathParser.AT);
 254:         break;
 255:       case 0x2a: // *
 256:         token = new XPathToken (XPathParser.STAR);
 257:         break;
 258:       case 0x24: // $
 259:         token = new XPathToken (XPathParser.DOLLAR);
 260:         break;
 261:       case 0x3a: // :
 262:         in.mark (1);
 263:         int d5 = in.read ();
 264:         if (d5 == 0x3a)
 265:           {
 266:             token = new XPathToken (XPathParser.DOUBLE_COLON);
 267:           }
 268:         else
 269:           {
 270:             in.reset ();
 271:             token = new XPathToken (XPathParser.COLON);
 272:           }
 273:         break;
 274:       case 0x2e: // .
 275:         in.mark (1);
 276:         int d6 = in.read ();
 277:         if (d6 == 0x2e)
 278:           {
 279:             token = new XPathToken (XPathParser.DOUBLE_DOT);
 280:           }
 281:         else
 282:           {
 283:             in.reset ();
 284:             token = new XPathToken (XPathParser.DOT);
 285:           }
 286:         break;
 287:       default:
 288:         if (c >= 0x30 && c <= 0x39)
 289:           {
 290:             token = consume_digits (c);
 291:           }
 292:         else if (c == 0x5f || Character.isLetter ((char) c))
 293:           {
 294:             token = consume_name (c);
 295:           }
 296:         else
 297:           {
 298:             token = new XPathToken (XPathParser.yyErrorCode);
 299:           }
 300:       }
 301:     return true;
 302:   }
 303: 
 304:   public int token ()
 305:   {
 306:     return token.type;
 307:   }
 308: 
 309:   public Object value ()
 310:   {
 311:     return token.val;
 312:   }
 313: 
 314:   XPathToken consume_literal (int delimiter)
 315:     throws IOException
 316:   {
 317:     StringBuffer buf = new StringBuffer ();
 318:     while (true)
 319:       {
 320:         int c = in.read ();
 321:         if (c == -1)
 322:           {
 323:             return new XPathToken (XPathParser.yyErrorCode);
 324:           }
 325:         else if (c == delimiter)
 326:           {
 327:             return new XPathToken (XPathParser.LITERAL, buf.toString ());
 328:           }
 329:         else
 330:           {
 331:             buf.append ((char) c);
 332:           }
 333:       }
 334:   }
 335: 
 336:   XPathToken consume_digits (int c)
 337:     throws IOException
 338:   {
 339:     StringBuffer buf = new StringBuffer ();
 340:     buf.append ((char) c);
 341:     while (true)
 342:       {
 343:         in.mark (1);
 344:         c = in.read ();
 345:         if (c >= 0x30 && c <= 0x39)
 346:           {
 347:             buf.append ((char) c);
 348:           }
 349:         else
 350:           {
 351:             in.reset ();
 352:             return new XPathToken (XPathParser.DIGITS, buf.toString ());
 353:           }
 354:       }
 355:   }
 356: 
 357:   XPathToken consume_name (int c)
 358:     throws IOException
 359:   {
 360:     StringBuffer buf = new StringBuffer ();
 361:     buf.append ((char) c);
 362:     while (true)
 363:       {
 364:         in.mark (1);
 365:         c = in.read ();
 366:         if (isNameChar (c))
 367:           {
 368:             buf.append ((char) c);
 369:           }
 370:         else
 371:           {
 372:             in.reset ();
 373:             String name = buf.toString ();
 374:             Integer keyword = (Integer) keywords.get (name);
 375:             if (keyword == null)
 376:               {
 377:                 return new XPathToken (XPathParser.NAME, name);
 378:               }
 379:             else
 380:               {
 381:                 int val = keyword.intValue ();
 382:                 switch (val)
 383:                   {
 384:                   case XPathParser.NODE:
 385:                   case XPathParser.COMMENT:
 386:                   case XPathParser.TEXT:
 387:                   case XPathParser.PROCESSING_INSTRUCTION:
 388:                     // Consume subsequent (
 389:                     in.mark (1);
 390:                     do
 391:                       {
 392:                         c = in.read ();
 393:                       }
 394:                     while (c == 0x20 || c == 0x09);
 395:                     if (c != 0x28)
 396:                       {
 397:                         in.reset ();
 398:                         return new XPathToken (XPathParser.NAME, name);
 399:                       }
 400:                     break;
 401:                   case XPathParser.CHILD:
 402:                   case XPathParser.PARENT:
 403:                   case XPathParser.SELF:
 404:                   case XPathParser.DESCENDANT:
 405:                   case XPathParser.ANCESTOR:
 406:                   case XPathParser.DESCENDANT_OR_SELF:
 407:                   case XPathParser.ANCESTOR_OR_SELF:
 408:                   case XPathParser.ATTRIBUTE:
 409:                   case XPathParser.NAMESPACE:
 410:                   case XPathParser.FOLLOWING:
 411:                   case XPathParser.FOLLOWING_SIBLING:
 412:                   case XPathParser.PRECEDING:
 413:                   case XPathParser.PRECEDING_SIBLING:
 414:                     // Check that this is an axis specifier
 415:                     in.mark(1);
 416:                     do
 417:                       {
 418:                         c = in.read();
 419:                       }
 420:                     while (c == 0x20 || c == 0x09);
 421:                     if (c == 0x3a)
 422:                       {
 423:                         c = in.read();
 424:                         if (c == 0x3a)
 425:                           {
 426:                             in.reset();
 427:                             return new XPathToken(val);
 428:                           }
 429:                       }
 430:                     in.reset();
 431:                     return new XPathToken(XPathParser.NAME, name);
 432:                   case XPathParser.DIV:
 433:                   case XPathParser.MOD:
 434:                     // May be a name
 435:                     if (lastToken == null)
 436:                       {
 437:                         return new XPathToken(XPathParser.NAME, name);
 438:                       }
 439:                     switch (lastToken.type)
 440:                       {
 441:                       case XPathParser.LP:
 442:                       case XPathParser.LB:
 443:                       case XPathParser.COMMA:
 444:                       case XPathParser.PIPE:
 445:                       case XPathParser.EQ:
 446:                       case XPathParser.NE:
 447:                       case XPathParser.GT:
 448:                       case XPathParser.LT:
 449:                       case XPathParser.GTE:
 450:                       case XPathParser.LTE:
 451:                       case XPathParser.PLUS:
 452:                       case XPathParser.MINUS:
 453:                       case XPathParser.STAR:
 454:                       case XPathParser.AT:
 455:                       case XPathParser.DOLLAR:
 456:                       case XPathParser.COLON:
 457:                       case XPathParser.DOUBLE_COLON:
 458:                       case XPathParser.DIV:
 459:                       case XPathParser.MOD:
 460:                       case XPathParser.OR:
 461:                       case XPathParser.AND:
 462:                       case XPathParser.SLASH:
 463:                         return new XPathToken(XPathParser.NAME, name);
 464:                       }
 465:                     break;
 466:                   }
 467:                 return new XPathToken (val);
 468:               }
 469:           }
 470:       }
 471:   }
 472: 
 473:   boolean isNameChar (int c)
 474:   {
 475:     /* Name */
 476:     return (c == 0x5f
 477:             || c == 0x2d
 478:             || c == 0x2e
 479:             || (c >= 0x30 && c <= 0x39)
 480:             /* CombiningChar */
 481:             || (c >= 0x0300 && c <= 0x0345)
 482:             || (c >= 0x0360 && c <= 0x0361)
 483:             || (c >= 0x0483 && c <= 0x0486)
 484:             || (c >= 0x0591 && c <= 0x05A1)
 485:             || (c >= 0x05A3 && c <= 0x05B9)
 486:             || (c >= 0x05BB && c <= 0x05BD)
 487:             || c == 0x05BF
 488:             || (c >= 0x05C1 && c <= 0x05C2)
 489:             || c == 0x05C4
 490:             || (c >= 0x064B && c <= 0x0652)
 491:             || c == 0x0670
 492:             || (c >= 0x06D6 && c <= 0x06DC)
 493:             || (c >= 0x06DD && c <= 0x06DF)
 494:             || (c >= 0x06E0 && c <= 0x06E4)
 495:             || (c >= 0x06E7 && c <= 0x06E8)
 496:             || (c >= 0x06EA && c <= 0x06ED)
 497:             || (c >= 0x0901 && c <= 0x0903)
 498:             || c == 0x093C
 499:             || (c >= 0x093E && c <= 0x094C)
 500:             || c == 0x094D
 501:             || (c >= 0x0951 && c <= 0x0954)
 502:             || (c >= 0x0962 && c <= 0x0963)
 503:             || (c >= 0x0981 && c <= 0x0983)
 504:             || c == 0x09BC
 505:             || c == 0x09BE
 506:             || c == 0x09BF
 507:             || (c >= 0x09C0 && c <= 0x09C4)
 508:             || (c >= 0x09C7 && c <= 0x09C8)
 509:             || (c >= 0x09CB && c <= 0x09CD)
 510:             || c == 0x09D7
 511:             || (c >= 0x09E2 && c <= 0x09E3)
 512:             || c == 0x0A02
 513:             || c == 0x0A3C
 514:             || c == 0x0A3E
 515:             || c == 0x0A3F
 516:             || (c >= 0x0A40 && c <= 0x0A42)
 517:             || (c >= 0x0A47 && c <= 0x0A48)
 518:             || (c >= 0x0A4B && c <= 0x0A4D)
 519:             || (c >= 0x0A70 && c <= 0x0A71)
 520:             || (c >= 0x0A81 && c <= 0x0A83)
 521:             || c == 0x0ABC
 522:             || (c >= 0x0ABE && c <= 0x0AC5)
 523:             || (c >= 0x0AC7 && c <= 0x0AC9)
 524:             || (c >= 0x0ACB && c <= 0x0ACD)
 525:             || (c >= 0x0B01 && c <= 0x0B03)
 526:             || c == 0x0B3C
 527:             || (c >= 0x0B3E && c <= 0x0B43)
 528:             || (c >= 0x0B47 && c <= 0x0B48)
 529:             || (c >= 0x0B4B && c <= 0x0B4D)
 530:             || (c >= 0x0B56 && c <= 0x0B57)
 531:             || (c >= 0x0B82 && c <= 0x0B83)
 532:             || (c >= 0x0BBE && c <= 0x0BC2)
 533:             || (c >= 0x0BC6 && c <= 0x0BC8)
 534:             || (c >= 0x0BCA && c <= 0x0BCD)
 535:             || c == 0x0BD7
 536:             || (c >= 0x0C01 && c <= 0x0C03)
 537:             || (c >= 0x0C3E && c <= 0x0C44)
 538:             || (c >= 0x0C46 && c <= 0x0C48)
 539:             || (c >= 0x0C4A && c <= 0x0C4D)
 540:             || (c >= 0x0C55 && c <= 0x0C56)
 541:             || (c >= 0x0C82 && c <= 0x0C83)
 542:             || (c >= 0x0CBE && c <= 0x0CC4)
 543:             || (c >= 0x0CC6 && c <= 0x0CC8)
 544:             || (c >= 0x0CCA && c <= 0x0CCD)
 545:             || (c >= 0x0CD5 && c <= 0x0CD6)
 546:             || (c >= 0x0D02 && c <= 0x0D03)
 547:             || (c >= 0x0D3E && c <= 0x0D43)
 548:             || (c >= 0x0D46 && c <= 0x0D48)
 549:             || (c >= 0x0D4A && c <= 0x0D4D)
 550:             || c == 0x0D57
 551:             || c == 0x0E31
 552:             || (c >= 0x0E34 && c <= 0x0E3A)
 553:             || (c >= 0x0E47 && c <= 0x0E4E)
 554:             || c == 0x0EB1
 555:             || (c >= 0x0EB4 && c <= 0x0EB9)
 556:             || (c >= 0x0EBB && c <= 0x0EBC)
 557:             || (c >= 0x0EC8 && c <= 0x0ECD)
 558:             || (c >= 0x0F18 && c <= 0x0F19)
 559:             || c == 0x0F35
 560:             || c == 0x0F37
 561:             || c == 0x0F39
 562:             || c == 0x0F3E
 563:             || c == 0x0F3F
 564:             || (c >= 0x0F71 && c <= 0x0F84)
 565:             || (c >= 0x0F86 && c <= 0x0F8B)
 566:             || (c >= 0x0F90 && c <= 0x0F95)
 567:             || c == 0x0F97
 568:             || (c >= 0x0F99 && c <= 0x0FAD)
 569:             || (c >= 0x0FB1 && c <= 0x0FB7)
 570:             || c == 0x0FB9
 571:             || (c >= 0x20D0 && c <= 0x20DC)
 572:             || c == 0x20E1
 573:             || (c >= 0x302A && c <= 0x302F)
 574:             || c == 0x3099
 575:             || c == 0x309A
 576:             /* Extender */
 577:             || c == 0x00B7
 578:             || c == 0x02D0
 579:             || c == 0x02D1
 580:             || c == 0x0387
 581:             || c == 0x0640
 582:             || c == 0x0E46
 583:             || c == 0x0EC6
 584:             || c == 0x3005
 585:             || (c >= 0x3031 && c <= 0x3035)
 586:             || (c >= 0x309D && c <= 0x309E)
 587:             || (c >= 0x30FC && c <= 0x30FE)
 588:             /* Name */
 589:             || Character.isLetter ((char) c));
 590:   }
 591: 
 592: }