Source for gnu.java.text.WordBreakIterator

   1: /* WordBreakIterator.java - Default word BreakIterator.
   2:    Copyright (C) 1999, 2001, 2004 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10:  
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: 
  39: package gnu.java.text;
  40: 
  41: import java.text.CharacterIterator;
  42: 
  43: /**
  44:  * @author Tom Tromey <tromey@cygnus.com>
  45:  * @date March 22, 1999
  46:  * Written using The Unicode Standard, Version 2.0.
  47:  */
  48: 
  49: public class WordBreakIterator extends BaseBreakIterator
  50: {
  51:   public Object clone ()
  52:   {
  53:     return new WordBreakIterator (this);
  54:   }
  55: 
  56:   public WordBreakIterator ()
  57:   {
  58:   }
  59: 
  60:   private WordBreakIterator (WordBreakIterator other)
  61:   {
  62:     iter = (CharacterIterator) other.iter.clone();
  63:   }
  64: 
  65:   // Some methods to tell us different properties of characters.
  66:   private final boolean isHira (char c)
  67:   {
  68:     return c >= 0x3040 && c <= 0x309f;
  69:   }
  70:   private final boolean isKata (char c)
  71:   {
  72:     return c >= 0x30a0 && c <= 0x30ff;
  73:   }
  74:   private final boolean isHan (char c)
  75:   {
  76:     return c >= 0x4e00 && c <= 0x9fff;
  77:   }
  78: 
  79:   public int next ()
  80:   {
  81:     int end = iter.getEndIndex();
  82:     if (iter.getIndex() == end)
  83:       return DONE;
  84: 
  85:     while (iter.getIndex() < end)
  86:       {
  87:     char c = iter.current();
  88:     if (c == CharacterIterator.DONE)
  89:       break;
  90:     int type = Character.getType(c);
  91: 
  92:     char n = iter.next();
  93:     if (n == CharacterIterator.DONE)
  94:       break;
  95: 
  96:     // Break after paragraph separators.
  97:     if (type == Character.PARAGRAPH_SEPARATOR
  98:         || type == Character.LINE_SEPARATOR)
  99:       break;
 100: 
 101:     // Break between letters and non-letters.
 102:     // FIXME: we treat apostrophe as part of a word.  This
 103:     // is an English-ism.
 104:     boolean is_letter = Character.isLetter(c);
 105:     if (c != '\'' && ! is_letter && type != Character.NON_SPACING_MARK
 106:         && Character.isLetter(n))
 107:       break;
 108: 
 109:     // Always break after certain symbols, such as punctuation.
 110:     // This heuristic is derived from hints in the JCL book and is
 111:     // not part of Unicode.  It seems to be right, however.
 112:     // FIXME: we treat apostrophe as part of a word.  This
 113:     // is an English-ism.
 114:     if (c != '\''
 115:         && (type == Character.DASH_PUNCTUATION
 116:         || type == Character.START_PUNCTUATION
 117:         || type == Character.END_PUNCTUATION
 118:         || type == Character.CONNECTOR_PUNCTUATION
 119:         || type == Character.OTHER_PUNCTUATION
 120:         || type == Character.MATH_SYMBOL
 121:         || type == Character.CURRENCY_SYMBOL
 122:         || type == Character.MODIFIER_SYMBOL
 123:         || type == Character.OTHER_SYMBOL
 124:         || type == Character.FORMAT
 125:         || type == Character.CONTROL))
 126:       break;
 127: 
 128:     boolean is_hira = isHira (c);
 129:     boolean is_kata = isKata (c);
 130:     boolean is_han = isHan (c);
 131: 
 132:     // Special case Japanese.
 133:     if (! is_hira && ! is_kata && ! is_han
 134:         && type != Character.NON_SPACING_MARK
 135:         && (isHira (n) || isKata (n) || isHan (n)))
 136:       break;
 137: 
 138:     if (is_hira || is_kata || is_han || is_letter)
 139:       {
 140:         // Now we need to do some lookahead.  We might need to do
 141:         // quite a bit of lookahead, so we save our position and
 142:         // restore it later.
 143:         int save = iter.getIndex();
 144:         // Skip string of non spacing marks.
 145:         while (n != CharacterIterator.DONE
 146:            && Character.getType(n) == Character.NON_SPACING_MARK)
 147:           n = iter.next();
 148:         if (n == CharacterIterator.DONE)
 149:           break;
 150:         if ((is_hira && ! isHira (n))
 151:         || (is_kata && ! isHira (n) && ! isKata (n))
 152:         || (is_han && ! isHira (n) && ! isHan (n))
 153:         // FIXME: we treat apostrophe as part of a word.  This
 154:         // is an English-ism.
 155:         || (is_letter && ! Character.isLetter(n) && n != '\''))
 156:           break;
 157:         iter.setIndex(save);
 158:       }
 159:       }
 160: 
 161:     return iter.getIndex();
 162:   }
 163: 
 164:   public int previous ()
 165:   {
 166:     int start = iter.getBeginIndex();
 167:     if (iter.getIndex() == start)
 168:       return DONE;
 169: 
 170:     while (iter.getIndex() >= start)
 171:       {
 172:     char c = iter.previous();
 173:     if (c == CharacterIterator.DONE)
 174:       break;
 175: 
 176:     boolean is_hira = isHira (c);
 177:     boolean is_kata = isKata (c);
 178:     boolean is_han = isHan (c);
 179:     boolean is_letter = Character.isLetter(c);
 180: 
 181:     char n = iter.previous();
 182:     if (n == CharacterIterator.DONE)
 183:       break;
 184:     iter.next();
 185:     int type = Character.getType(n);
 186:     // Break after paragraph separators.
 187:     if (type == Character.PARAGRAPH_SEPARATOR
 188:         || type == Character.LINE_SEPARATOR)
 189:       break;
 190: 
 191:     // Break between letters and non-letters.
 192:     // FIXME: we treat apostrophe as part of a word.  This
 193:     // is an English-ism.
 194:     if (n != '\'' && ! Character.isLetter(n)
 195:         && type != Character.NON_SPACING_MARK
 196:         && is_letter)
 197:       break;
 198: 
 199:     // Always break after certain symbols, such as punctuation.
 200:     // This heuristic is derived from hints in the JCL book and is
 201:     // not part of Unicode.  It seems to be right, however.
 202:     // FIXME: we treat apostrophe as part of a word.  This
 203:     // is an English-ism.
 204:     if (n != '\''
 205:         && (type == Character.DASH_PUNCTUATION
 206:         || type == Character.START_PUNCTUATION
 207:         || type == Character.END_PUNCTUATION
 208:         || type == Character.CONNECTOR_PUNCTUATION
 209:         || type == Character.OTHER_PUNCTUATION
 210:         || type == Character.MATH_SYMBOL
 211:         || type == Character.CURRENCY_SYMBOL
 212:         || type == Character.MODIFIER_SYMBOL
 213:         || type == Character.OTHER_SYMBOL
 214:         || type == Character.FORMAT
 215:         || type == Character.CONTROL))
 216:       break;
 217: 
 218:     // Special case Japanese.
 219:     if ((is_hira || is_kata || is_han)
 220:         && ! isHira (n) && ! isKata (n) && ! isHan (n)
 221:         && type != Character.NON_SPACING_MARK)
 222:       break;
 223: 
 224:     // We might have to skip over non spacing marks to see what's
 225:     // on the other side.
 226:     if (! is_hira || (! is_letter && c != '\''))
 227:       {
 228:         int save = iter.getIndex();
 229:         while (n != CharacterIterator.DONE
 230:            && Character.getType(n) == Character.NON_SPACING_MARK)
 231:           n = iter.previous();
 232:         iter.setIndex(save);
 233:         // This is a strange case: a bunch of non-spacing marks at
 234:         // the beginning.  We treat the current location as a word
 235:         // break.
 236:         if (n == CharacterIterator.DONE)
 237:           break;
 238:         if ((isHira (n) && ! is_hira)
 239:         || (isKata (n) && ! is_hira && ! is_kata)
 240:         || (isHan (n) && ! is_hira && ! is_han)
 241:         // FIXME: we treat apostrophe as part of a word.  This
 242:         // is an English-ism.
 243:         || (! is_letter && c != '\'' && Character.isLetter(n)))
 244:           break;
 245:       }
 246:       }
 247: 
 248:     return iter.getIndex();
 249:   }
 250: }