1 /* ==================================================================== 2 * The Apache Software License, Version 1.1 3 * 4 * Copyright (c) 2002 The Apache Software Foundation. All rights 5 * reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 19 * 3. The end-user documentation included with the redistribution, 20 * if any, must include the following acknowledgment: 21 * "This product includes software developed by the 22 * Apache Software Foundation (http://www.apache.org/)." 23 * Alternately, this acknowledgment may appear in the software itself, 24 * if and wherever such third-party acknowledgments normally appear. 25 * 26 * 4. The names "Apache" and "Apache Software Foundation" and 27 * "Apache POI" must not be used to endorse or promote products 28 * derived from this software without prior written permission. For 29 * written permission, please contact apache@apache.org. 30 * 31 * 5. Products derived from this software may not be called "Apache", 32 * "Apache POI", nor may "Apache" appear in their name, without 33 * prior written permission of the Apache Software Foundation. 34 * 35 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED 36 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 37 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 38 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR 39 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 41 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF 42 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 43 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 44 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 45 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 46 * SUCH DAMAGE. 47 * ==================================================================== 48 * 49 * This software consists of voluntary contributions made by many 50 * individuals on behalf of the Apache Software Foundation. For more 51 * information on the Apache Software Foundation, please see 52 * <http://www.apache.org/>. 53 */ 54 55 package org.apache.poi.hssf.record; 56 57 import org.apache.poi.util.BinaryTree; 58 import org.apache.poi.util.LittleEndian; 59 import org.apache.poi.util.LittleEndianConsts; 60 61 /** 62 * Handles the task of deserializing a SST string. The two main entry points are 63 * 64 * @author Glen Stampoultzis (glens at apache.org) 65 */ 66 class SSTDeserializer 67 { 68 69 private BinaryTree strings; 70 /** this is the number of characters we expect in the first sub-record in a subsequent continuation record */ 71 private int continuationExpectedChars; 72 /** this is the string we were working on before hitting the end of the current record. This string is NOT finished. */ 73 private String unfinishedString; 74 /** this is true if the string uses wide characters */ 75 private boolean wideChar; 76 /** this is true if the string is a rich text string */ 77 private boolean richText; 78 /** this is true if the string is a far east string or some other wierd string */ 79 private boolean extendedText; 80 /** Number of formatting runs in this rich text field */ 81 private short runCount; 82 /** Number of characters in current string */ 83 private int charCount; 84 private int extensionLength; 85 86 87 public SSTDeserializer( BinaryTree strings ) 88 { 89 this.strings = strings; 90 initVars(); 91 } 92 93 private void initVars() 94 { 95 runCount = 0; 96 continuationExpectedChars = 0; 97 unfinishedString = ""; 98 // bytesInCurrentSegment = 0; 99 // stringDataOffset = 0; 100 wideChar = false; 101 richText = false; 102 extendedText = false; 103 } 104 105 /** 106 * This is the starting point where strings are constructed. Note that 107 * strings may span across multiple continuations. Read the SST record 108 * carefully before beginning to hack. 109 */ 110 public void manufactureStrings( final byte[] data, final int initialOffset, short dataSize ) 111 { 112 initVars(); 113 114 int offset = initialOffset; 115 while ( ( offset - initialOffset ) < dataSize ) 116 { 117 int remaining = dataSize - offset + initialOffset; 118 119 if ( ( remaining > 0 ) && ( remaining < LittleEndianConsts.SHORT_SIZE ) ) 120 { 121 throw new RecordFormatException( "Cannot get length of the last string in SSTRecord" ); 122 } 123 if ( remaining == LittleEndianConsts.SHORT_SIZE ) 124 { 125 setContinuationExpectedChars( LittleEndian.getUShort( data, offset ) ); 126 unfinishedString = ""; 127 break; 128 } 129 charCount = LittleEndian.getUShort( data, offset ); 130 readStringHeader( data, offset ); 131 boolean stringContinuesOverContinuation = remaining < totalStringSize(); 132 if ( stringContinuesOverContinuation ) 133 { 134 int remainingBytes = ( initialOffset + dataSize ) - offset - stringHeaderOverhead(); 135 setContinuationExpectedChars( charCount - calculateCharCount( remainingBytes ) ); 136 charCount -= getContinuationExpectedChars(); 137 } 138 else 139 { 140 setContinuationExpectedChars( 0 ); 141 } 142 processString( data, offset, charCount ); 143 offset += totalStringSize(); 144 if ( getContinuationExpectedChars() != 0 ) 145 { 146 break; 147 } 148 } 149 } 150 151 // private void dump( final byte[] data, int offset, int length ) 152 // { 153 // try 154 // { 155 // System.out.println( "------------------- SST DUMP -------------------------" ); 156 // HexDump.dump( (byte[]) data, offset, System.out, offset, length ); 157 // } 158 // catch ( IOException e ) 159 // { 160 // } 161 // catch ( ArrayIndexOutOfBoundsException e ) 162 // { 163 // } 164 // catch ( IllegalArgumentException e ) 165 // { 166 // } 167 // } 168 169 /** 170 * Detemines the option types for the string (ie, compressed or uncompressed unicode, rich text string or 171 * plain string etc) and calculates the length and offset for the string. 172 * 173 */ 174 private void readStringHeader( final byte[] data, final int index ) 175 { 176 177 byte optionFlag = data[index + LittleEndianConsts.SHORT_SIZE]; 178 179 wideChar = ( optionFlag & 1 ) == 1; 180 extendedText = ( optionFlag & 4 ) == 4; 181 richText = ( optionFlag & 8 ) == 8; 182 runCount = 0; 183 if ( richText ) 184 { 185 runCount = LittleEndian.getShort( data, index + SSTRecord.STRING_MINIMAL_OVERHEAD ); 186 } 187 extensionLength = 0; 188 if ( extendedText ) 189 { 190 extensionLength = LittleEndian.getInt( data, index + SSTRecord.STRING_MINIMAL_OVERHEAD 191 + (richText ? LittleEndianConsts.SHORT_SIZE : 0) ); 192 } 193 194 } 195 196 197 /** 198 * Reads a string or the first part of a string. 199 * 200 * @param characters the number of characters to write. 201 * 202 * @return the number of bytes written. 203 */ 204 private int processString( final byte[] data, final int dataIndex, final int characters ) 205 { 206 207 // length is the length we store it as. not the length that is read. 208 int length = SSTRecord.STRING_MINIMAL_OVERHEAD + calculateByteCount( characters ); 209 byte[] unicodeStringBuffer = new byte[length]; 210 211 int offset = 0; 212 213 // Set the length in characters 214 LittleEndian.putUShort( unicodeStringBuffer, offset, characters ); 215 offset += LittleEndianConsts.SHORT_SIZE; 216 // Set the option flags 217 unicodeStringBuffer[offset] = data[dataIndex + offset]; 218 // Copy in the string data 219 int bytesRead = unicodeStringBuffer.length - SSTRecord.STRING_MINIMAL_OVERHEAD; 220 arraycopy( data, dataIndex + stringHeaderOverhead(), unicodeStringBuffer, SSTRecord.STRING_MINIMAL_OVERHEAD, bytesRead ); 221 // Create the unicode string 222 UnicodeString string = new UnicodeString( UnicodeString.sid, 223 (short) unicodeStringBuffer.length, 224 unicodeStringBuffer ); 225 226 if ( isStringFinished() ) 227 { 228 Integer integer = new Integer( strings.size() ); 229 addToStringTable( strings, integer, string ); 230 } 231 else 232 { 233 unfinishedString = string.getString(); 234 } 235 236 return bytesRead; 237 } 238 239 private boolean isStringFinished() 240 { 241 return getContinuationExpectedChars() == 0; 242 } 243 244 /** 245 * Okay, we are doing some major cheating here. Because we can't handle rich text strings properly 246 * we end up getting duplicate strings. To get around this I'm doing two things: 1. Converting rich 247 * text to normal text and 2. If there's a duplicate I'm adding a space onto the end. Sneaky perhaps 248 * but it gets the job done until we can handle this a little better. 249 */ 250 static public void addToStringTable( BinaryTree strings, Integer integer, UnicodeString string ) 251 { 252 253 if ( string.isRichText() ) 254 string.setOptionFlags( (byte) ( string.getOptionFlags() & ( ~8 ) ) ); 255 if ( string.isExtendedText() ) 256 string.setOptionFlags( (byte) ( string.getOptionFlags() & ( ~4 ) ) ); 257 258 boolean added = false; 259 while ( added == false ) 260 { 261 try 262 { 263 strings.put( integer, string ); 264 added = true; 265 } 266 catch ( Exception ignore ) 267 { 268 string.setString( string.getString() + " " ); 269 } 270 } 271 272 } 273 274 275 private int calculateCharCount( final int byte_count ) 276 { 277 return byte_count / ( wideChar ? LittleEndianConsts.SHORT_SIZE : LittleEndianConsts.BYTE_SIZE ); 278 } 279 280 /** 281 * Process a Continue record. A Continue record for an SST record 282 * contains the same kind of data that the SST record contains, 283 * with the following exceptions: 284 * <P> 285 * <OL> 286 * <LI>The string counts at the beginning of the SST record are 287 * not in the Continue record 288 * <LI>The first string in the Continue record might NOT begin 289 * with a size. If the last string in the previous record is 290 * continued in this record, the size is determined by that 291 * last string in the previous record; the first string will 292 * begin with a flag byte, followed by the remaining bytes (or 293 * words) of the last string from the previous 294 * record. Otherwise, the first string in the record will 295 * begin with a string length 296 * </OL> 297 * 298 * @param record the Continue record's byte data 299 */ 300 public void processContinueRecord( final byte[] record ) 301 { 302 if ( isStringFinished() ) 303 { 304 initVars(); 305 manufactureStrings( record, 0, (short) record.length ); 306 } 307 else 308 { 309 // reset the wide bit because that can change across a continuation. the fact that it's 310 // actually rich text doesn't change across continuations even though the rich text 311 // may on longer be set in the "new" option flag. confusing huh? 312 wideChar = ( record[0] & 1 ) == 1; 313 314 if ( stringSpansContinuation( record.length - LittleEndianConsts.BYTE_SIZE ) ) 315 { 316 processEntireContinuation( record ); 317 } 318 else 319 { 320 readStringRemainder( record ); 321 } 322 } 323 324 } 325 326 /** 327 * Reads the remainder string and any subsequent strings from the continuation record. 328 * 329 * @param record The entire continuation record data. 330 */ 331 private void readStringRemainder( final byte[] record ) 332 { 333 int stringRemainderSizeInBytes = calculateByteCount( getContinuationExpectedChars() ); 334 // stringDataOffset = LittleEndianConsts.BYTE_SIZE; 335 byte[] unicodeStringData = new byte[SSTRecord.STRING_MINIMAL_OVERHEAD 336 + calculateByteCount( getContinuationExpectedChars() )]; 337 338 // write the string length 339 LittleEndian.putShort( unicodeStringData, 0, (short) getContinuationExpectedChars() ); 340 341 // write the options flag 342 unicodeStringData[LittleEndianConsts.SHORT_SIZE] = createOptionByte( wideChar, richText, extendedText ); 343 344 // copy the bytes/words making up the string; skipping 345 // past all the overhead of the str_data array 346 arraycopy( record, LittleEndianConsts.BYTE_SIZE, unicodeStringData, 347 SSTRecord.STRING_MINIMAL_OVERHEAD, 348 unicodeStringData.length - SSTRecord.STRING_MINIMAL_OVERHEAD ); 349 350 // use special constructor to create the final string 351 UnicodeString string = new UnicodeString( UnicodeString.sid, 352 (short) unicodeStringData.length, unicodeStringData, 353 unfinishedString ); 354 Integer integer = new Integer( strings.size() ); 355 356 addToStringTable( strings, integer, string ); 357 358 int newOffset = offsetForContinuedRecord( stringRemainderSizeInBytes ); 359 manufactureStrings( record, newOffset, (short) ( record.length - newOffset ) ); 360 } 361 362 /** 363 * Calculates the size of the string in bytes based on the character width 364 */ 365 private int stringSizeInBytes() 366 { 367 return calculateByteCount( charCount ); 368 } 369 370 /** 371 * Calculates the size of the string in byes. This figure includes all the over 372 * heads for the string. 373 */ 374 private int totalStringSize() 375 { 376 return stringSizeInBytes() 377 + stringHeaderOverhead() 378 + LittleEndianConsts.INT_SIZE * runCount 379 + extensionLength; 380 } 381 382 private int stringHeaderOverhead() 383 { 384 return SSTRecord.STRING_MINIMAL_OVERHEAD 385 + ( richText ? LittleEndianConsts.SHORT_SIZE : 0 ) 386 + ( extendedText ? LittleEndianConsts.INT_SIZE : 0 ); 387 } 388 389 private int offsetForContinuedRecord( int stringRemainderSizeInBytes ) 390 { 391 return stringRemainderSizeInBytes + LittleEndianConsts.BYTE_SIZE 392 + runCount * LittleEndianConsts.INT_SIZE + extensionLength; 393 } 394 395 private byte createOptionByte( boolean wideChar, boolean richText, boolean farEast ) 396 { 397 return (byte) ( ( wideChar ? 1 : 0 ) + ( farEast ? 4 : 0 ) + ( richText ? 8 : 0 ) ); 398 } 399 400 /** 401 * If the continued record is so long is spans into the next continue then 402 * simply suck the remaining string data into the existing <code>unfinishedString</code>. 403 * 404 * @param record The data from the continuation record. 405 */ 406 private void processEntireContinuation( final byte[] record ) 407 { 408 // create artificial data to create a UnicodeString 409 int dataLengthInBytes = record.length - LittleEndianConsts.BYTE_SIZE; 410 byte[] unicodeStringData = new byte[record.length + LittleEndianConsts.SHORT_SIZE]; 411 412 LittleEndian.putShort( unicodeStringData, (byte) 0, (short) calculateCharCount( dataLengthInBytes ) ); 413 arraycopy( record, 0, unicodeStringData, LittleEndianConsts.SHORT_SIZE, record.length ); 414 UnicodeString ucs = new UnicodeString( UnicodeString.sid, (short) unicodeStringData.length, unicodeStringData ); 415 416 unfinishedString = unfinishedString + ucs.getString(); 417 setContinuationExpectedChars( getContinuationExpectedChars() - calculateCharCount( dataLengthInBytes ) ); 418 } 419 420 private boolean stringSpansContinuation( int continuationSizeInBytes ) 421 { 422 return calculateByteCount( getContinuationExpectedChars() ) > continuationSizeInBytes; 423 } 424 425 /** 426 * @return the number of characters we expect in the first 427 * sub-record in a subsequent continuation record 428 */ 429 430 int getContinuationExpectedChars() 431 { 432 return continuationExpectedChars; 433 } 434 435 private void setContinuationExpectedChars( final int count ) 436 { 437 continuationExpectedChars = count; 438 } 439 440 private int calculateByteCount( final int character_count ) 441 { 442 return character_count * ( wideChar ? LittleEndianConsts.SHORT_SIZE : LittleEndianConsts.BYTE_SIZE ); 443 } 444 445 446 /** 447 * Copies an array from the specified source array, beginning at the 448 * specified position, to the specified position of the destination array. 449 * A subsequence of array components are copied from the source 450 * array referenced by <code>src</code> to the destination array 451 * referenced by <code>dst</code>. The number of components copied is 452 * equal to the <code>length</code> argument. The components at 453 * positions <code>srcOffset</code> through 454 * <code>srcOffset+length-1</code> in the source array are copied into 455 * positions <code>dstOffset</code> through 456 * <code>dstOffset+length-1</code>, respectively, of the destination 457 * array. 458 * <p> 459 * If the <code>src</code> and <code>dst</code> arguments refer to the 460 * same array object, then the copying is performed as if the 461 * components at positions <code>srcOffset</code> through 462 * <code>srcOffset+length-1</code> were first copied to a temporary 463 * array with <code>length</code> components and then the contents of 464 * the temporary array were copied into positions 465 * <code>dstOffset</code> through <code>dstOffset+length-1</code> of the 466 * destination array. 467 * <p> 468 * If <code>dst</code> is <code>null</code>, then a 469 * <code>NullPointerException</code> is thrown. 470 * <p> 471 * If <code>src</code> is <code>null</code>, then a 472 * <code>NullPointerException</code> is thrown and the destination 473 * array is not modified. 474 * <p> 475 * Otherwise, if any of the following is true, an 476 * <code>ArrayStoreException</code> is thrown and the destination is 477 * not modified: 478 * <ul> 479 * <li>The <code>src</code> argument refers to an object that is not an 480 * array. 481 * <li>The <code>dst</code> argument refers to an object that is not an 482 * array. 483 * <li>The <code>src</code> argument and <code>dst</code> argument refer to 484 * arrays whose component types are different primitive types. 485 * <li>The <code>src</code> argument refers to an array with a primitive 486 * component type and the <code>dst</code> argument refers to an array 487 * with a reference component type. 488 * <li>The <code>src</code> argument refers to an array with a reference 489 * component type and the <code>dst</code> argument refers to an array 490 * with a primitive component type. 491 * </ul> 492 * <p> 493 * Otherwise, if any of the following is true, an 494 * <code>IndexOutOfBoundsException</code> is 495 * thrown and the destination is not modified: 496 * <ul> 497 * <li>The <code>srcOffset</code> argument is negative. 498 * <li>The <code>dstOffset</code> argument is negative. 499 * <li>The <code>length</code> argument is negative. 500 * <li><code>srcOffset+length</code> is greater than 501 * <code>src.length</code>, the length of the source array. 502 * <li><code>dstOffset+length</code> is greater than 503 * <code>dst.length</code>, the length of the destination array. 504 * </ul> 505 * <p> 506 * Otherwise, if any actual component of the source array from 507 * position <code>srcOffset</code> through 508 * <code>srcOffset+length-1</code> cannot be converted to the component 509 * type of the destination array by assignment conversion, an 510 * <code>ArrayStoreException</code> is thrown. In this case, let 511 * <b><i>k</i></b> be the smallest nonnegative integer less than 512 * length such that <code>src[srcOffset+</code><i>k</i><code>]</code> 513 * cannot be converted to the component type of the destination 514 * array; when the exception is thrown, source array components from 515 * positions <code>srcOffset</code> through 516 * <code>srcOffset+</code><i>k</i><code>-1</code> 517 * will already have been copied to destination array positions 518 * <code>dstOffset</code> through 519 * <code>dstOffset+</code><i>k</I><code>-1</code> and no other 520 * positions of the destination array will have been modified. 521 * (Because of the restrictions already itemized, this 522 * paragraph effectively applies only to the situation where both 523 * arrays have component types that are reference types.) 524 * 525 * @param src the source array. 526 * @param src_position start position in the source array. 527 * @param dst the destination array. 528 * @param dst_position pos start position in the destination data. 529 * @param length the number of array elements to be copied. 530 * @exception IndexOutOfBoundsException if copying would cause 531 * access of data outside array bounds. 532 * @exception ArrayStoreException if an element in the <code>src</code> 533 * array could not be stored into the <code>dest</code> array 534 * because of a type mismatch. 535 * @exception NullPointerException if either <code>src</code> or 536 * <code>dst</code> is <code>null</code>. 537 */ 538 private void arraycopy( byte[] src, int src_position, 539 byte[] dst, int dst_position, 540 int length ) 541 { 542 System.arraycopy( src, src_position, dst, dst_position, length ); 543 } 544 545 /** 546 * @return the unfinished string 547 */ 548 String getUnfinishedString() 549 { 550 return unfinishedString; 551 } 552 553 /** 554 * @return true if current string uses wide characters 555 */ 556 boolean isWideChar() 557 { 558 return wideChar; 559 } 560 561 562 } 563