00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #ifndef __CS_CSUCTRANSFORM_H__
00020 #define __CS_CSUCTRANSFORM_H__
00021
00022 #include "csunicode.h"
00023
00031 /* Longest UTF-8 sequence, in utf8_char code units, that EncodeUTF8() emits for one code point. */
00032 #define CS_UC_MAX_UTF8_ENCODED 6
00033 /* Longest UTF-16 sequence, in utf16_char code units, that EncodeUTF16() emits for one code point (a surrogate pair). */
00034 #define CS_UC_MAX_UTF16_ENCODED 2
00035 /* UTF-32 always uses exactly one utf32_char code unit per code point. */
00036 #define CS_UC_MAX_UTF32_ENCODED 1
00037
00041 class csUnicodeTransform
00042 {
00043 public:
00044 #define FAIL(ret) \
00045 { \
00046 if (isValid) *isValid = false; \
00047 ch = CS_UC_CHAR_REPLACER; \
00048 return ret; \
00049 }
00050
00051 #define SUCCEED \
00052 if (isValid) *isValid = true; \
00053 return chUsed;
00054
00055 #define GET_NEXT(next) \
00056 if (chUsed == strlen) \
00057 { \
00058 FAIL(chUsed); \
00059 } \
00060 next = *str++; \
00061 if (next == 0) \
00062 { \
00063 FAIL(chUsed); \
00064 } \
00065 chUsed++;
00066
00082 inline static int UTF8Decode (const utf8_char* str, size_t strlen,
00083 utf32_char& ch, bool* isValid = 0)
00084 {
00085 if (str == 0)
00086 {
00087 FAIL(0);
00088 }
00089 size_t chUsed = 0;
00090
00091 utf8_char curCh;
00092 GET_NEXT(curCh);
00093 if ((curCh & 0x80) == 0)
00094 {
00095
00096 ch = curCh;
00097 SUCCEED;
00098 }
00099 else
00100 {
00101
00102 int n = 0;
00103 while ((n < 7) && ((curCh & (1 << (7 - n))) != 0)) { n++; }
00104
00105 if ((n < 2) || (n > 6))
00106 {
00107
00108
00109 FAIL(1);
00110 }
00111
00112 ch = (curCh & ((1 << (8 - n)) - 1));
00113
00114 for (int i = 1; i < n; i++)
00115 {
00116 GET_NEXT(curCh);
00117 if ((curCh & 0xc0) != 0x80)
00118 {
00119 FAIL(chUsed);
00120 }
00121 else
00122 {
00123 ch <<= 6;
00124 ch |= (curCh & 0x3f);
00125 }
00126 }
00127
00128
00129 if ((ch < 0x80) && (n > 0))
00130 {
00131 FAIL(chUsed);
00132 }
00133 else if ((ch < 0x800) && (n > 2))
00134 {
00135 FAIL(chUsed);
00136 }
00137 else if ((ch < 0x10000) && (n > 3))
00138 {
00139 FAIL(chUsed);
00140 }
00141 else if ((ch < 0x200000) && (n > 4))
00142 {
00143 FAIL(chUsed);
00144 }
00145 else if ((ch < 0x4000000) && (n > 5))
00146 {
00147 FAIL(chUsed);
00148 }
00149 else if ((ch < 0x80000000) && (n > 6))
00150 {
00151 FAIL(chUsed);
00152 }
00153
00154 if (CS_UC_IS_INVALID(ch) || CS_UC_IS_SURROGATE(ch))
00155 FAIL(chUsed);
00156 SUCCEED;
00157 }
00158 }
00159
00164 inline static int UTF16Decode (const utf16_char* str, size_t strlen,
00165 utf32_char& ch, bool* isValid = 0)
00166 {
00167 if (str == 0)
00168 {
00169 FAIL(0);
00170 }
00171 size_t chUsed = 0;
00172
00173 utf16_char curCh;
00174 GET_NEXT(curCh);
00175
00176 if (CS_UC_IS_SURROGATE (curCh))
00177 {
00178
00179 if (!CS_UC_IS_HIGH_SURROGATE (curCh))
00180 {
00181 FAIL(chUsed);
00182 }
00183 ch = (curCh & 0x03ff) << 10;
00184 GET_NEXT(curCh);
00185
00186 if (!CS_UC_IS_LOW_SURROGATE (curCh))
00187 {
00188
00189 FAIL(1);
00190 }
00191 ch |= (curCh & 0x3ff);
00192
00193 if ((ch == 0) || (ch < 0x10000))
00194 FAIL(chUsed);
00195 }
00196 else
00197 {
00198 ch = curCh;
00199 }
00200 if (CS_UC_IS_INVALID(ch))
00201 FAIL(chUsed);
00202 SUCCEED;
00203 }
00204
00209 inline static int UTF32Decode (const utf32_char* str, size_t strlen,
00210 utf32_char& ch, bool* isValid = 0)
00211 {
00212 if (str == 0)
00213 {
00214 FAIL(0);
00215 }
00216 size_t chUsed = 0;
00217
00218 GET_NEXT(ch);
00219 if (CS_UC_IS_INVALID(ch))
00220 FAIL(chUsed);
00221 SUCCEED;
00222 }
00224 #undef FAIL
00225 #undef SUCCEED
00226 #undef GET_NEXT
00227
00228 #define _OUTPUT_CHAR(buf, chr) \
00229 if (bufRemaining > 0) \
00230 { \
00231 if(buf) *buf++ = chr; \
00232 bufRemaining--; \
00233 } \
00234 encodedLen++;
00235
00236 #define OUTPUT_CHAR(chr) _OUTPUT_CHAR(buf, chr)
00237
00250 inline static int EncodeUTF8 (const utf32_char ch, utf8_char* buf,
00251 size_t bufsize)
00252 {
00253 if ((CS_UC_IS_INVALID(ch)) || (CS_UC_IS_SURROGATE(ch)))
00254 return 0;
00255 size_t bufRemaining = bufsize, encodedLen = 0;
00256
00257 if (ch < 0x80)
00258 {
00259 OUTPUT_CHAR ((utf8_char)ch);
00260 }
00261 else if (ch < 0x800)
00262 {
00263 OUTPUT_CHAR ((utf8_char)(0xc0 | (ch >> 6)));
00264 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f)));
00265 }
00266 else if (ch < 0x10000)
00267 {
00268 OUTPUT_CHAR ((utf8_char)(0xe0 | (ch >> 12)));
00269 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f)));
00270 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f)));
00271 }
00272 else if (ch < 0x200000)
00273 {
00274 OUTPUT_CHAR ((utf8_char)(0xf0 | (ch >> 18)));
00275 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f)));
00276 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f)));
00277 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f)));
00278 }
00279 else if (ch < 0x4000000)
00280 {
00281 OUTPUT_CHAR ((utf8_char)(0xf8 | (ch >> 24)));
00282 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 18) & 0x3f)));
00283 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f)));
00284 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f)));
00285 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f)));
00286 }
00287 else if (ch < 0x80000000)
00288 {
00289 OUTPUT_CHAR ((utf8_char)(0xfc | (ch >> 30)));
00290 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 24) & 0x3f)));
00291 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 18) & 0x3f)));
00292 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f)));
00293 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f)));
00294 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f)));
00295 }
00296 return encodedLen;
00297 }
00298
00303 inline static int EncodeUTF16 (const utf32_char ch, utf16_char* buf,
00304 size_t bufsize)
00305 {
00306 if ((CS_UC_IS_INVALID(ch)) || (CS_UC_IS_SURROGATE(ch)))
00307 return 0;
00308 size_t bufRemaining = bufsize, encodedLen = 0;
00309
00310 if (ch < 0x10000)
00311 {
00312 OUTPUT_CHAR((utf16_char)ch);
00313 }
00314 else if (ch < 0x100000)
00315 {
00316 OUTPUT_CHAR((utf16_char)((ch >> 10) | CS_UC_CHAR_HIGH_SURROGATE_FIRST));
00317 OUTPUT_CHAR((utf16_char)((ch & 0x3ff) | CS_UC_CHAR_LOW_SURROGATE_FIRST));
00318 }
00319 else
00320 return 0;
00321
00322 return encodedLen;
00323 }
00324
00329 inline static int EncodeUTF32 (const utf32_char ch, utf32_char* buf,
00330 size_t bufsize)
00331 {
00332 if ((CS_UC_IS_INVALID(ch)) || (CS_UC_IS_SURROGATE(ch)))
00333 return 0;
00334 size_t bufRemaining = bufsize, encodedLen = 0;
00335
00336 OUTPUT_CHAR(ch);
00337
00338 return encodedLen;
00339 }
00341 #undef OUTPUT_CHAR
00342
00343 #define OUTPUT_CHAR(chr) _OUTPUT_CHAR(dest, chr)
00344
00345 #define UCTF_CONVERTER(funcName, fromType, decoder, toType, encoder) \
00346 inline static size_t funcName (toType* dest, size_t destSize, \
00347 const fromType* source, size_t srcSize = (size_t)-1) \
00348 { \
00349 if ((srcSize == 0) || (source == 0)) \
00350 return 0; \
00351 \
00352 size_t bufRemaining = (destSize > 0) ? destSize - 1 : 0; \
00353 size_t encodedLen = 0; \
00354 \
00355 size_t srcChars = srcSize; \
00356 \
00357 if (srcSize == (size_t)-1) \
00358 { \
00359 srcChars = 0; \
00360 const fromType* sptr = source; \
00361 while (*sptr++ != 0) srcChars++; \
00362 } \
00363 \
00364 while (srcChars > 0) \
00365 { \
00366 utf32_char ch; \
00367 int scnt = decoder (source, srcChars, ch, 0); \
00368 if (scnt == 0) break; \
00369 int dcnt = encoder (ch, dest, bufRemaining); \
00370 if (dcnt == 0) \
00371 { \
00372 dcnt = encoder (CS_UC_CHAR_REPLACER, dest, bufRemaining); \
00373 } \
00374 \
00375 if ((size_t)dcnt >= bufRemaining) \
00376 { \
00377 if (dest && (destSize > 0)) dest += bufRemaining; \
00378 bufRemaining = 0; \
00379 } \
00380 else \
00381 { \
00382 bufRemaining -= dcnt; \
00383 if (dest && (destSize > 0)) dest += dcnt; \
00384 } \
00385 encodedLen += dcnt; \
00386 if ((size_t)scnt >= srcChars) break; \
00387 srcChars -= scnt; \
00388 source += scnt; \
00389 } \
00390 \
00391 if (dest) *dest = 0; \
00392 \
00393 return encodedLen + 1; \
00394 }
00395
00413 UCTF_CONVERTER (UTF8to16, utf8_char, UTF8Decode, utf16_char, EncodeUTF16);
00418 UCTF_CONVERTER (UTF8to32, utf8_char, UTF8Decode, utf32_char, EncodeUTF32);
00419
00424 UCTF_CONVERTER (UTF16to8, utf16_char, UTF16Decode, utf8_char, EncodeUTF8);
00429 UCTF_CONVERTER (UTF16to32, utf16_char, UTF16Decode, utf32_char, EncodeUTF32);
00430
00435 UCTF_CONVERTER (UTF32to8, utf32_char, UTF32Decode, utf8_char, EncodeUTF8);
00440 UCTF_CONVERTER (UTF32to16, utf32_char, UTF32Decode, utf16_char, EncodeUTF16);
00443 #undef UCTF_CONVERTER
00444 #undef OUTPUT_CHAR
00445
00446 #if (CS_WCHAR_T_SIZE == 1)
00447 inline static size_t UTF8toWC (wchar_t* dest, size_t destSize,
00448 const utf8_char* source, size_t srcSize)
00449 {
00450 size_t srcChars = srcSize;
00451 if (srcSize == (size_t)-1)
00452 {
00453 srcChars = 0;
00454 const utf8_char* sptr = source;
00455 while (*sptr++ != 0) srcChars++;
00456 }
00457 if ((dest != 0) && (destSize != 0))
00458 {
00459 size_t len = MIN (destSize - 1, srcChars);
00460 memcpy (dest, source, size * sizeof (wchar_t));
00461 *(dest + len) = 0;
00462 }
00463 return srcChars + 1;
00464 };
00465
00466 inline static size_t UTF16toWC (wchar_t* dest, size_t destSize,
00467 const utf16_char* source, size_t srcSize)
00468 {
00469 return UTF16to8 ((utf8_char*)dest, destSize, source, srcSize);
00470 };
00471
00472 inline static size_t UTF32toWC (wchar_t* dest, size_t destSize,
00473 const utf32_char* source, size_t srcSize)
00474 {
00475 return UTF32to8 ((utf8_char*)dest, destSize, source, srcSize);
00476 };
00477
00478 inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize,
00479 const wchar_t* source, size_t srcSize)
00480 {
00481 size_t srcChars = srcSize;
00482 if (srcSize == (size_t)-1)
00483 {
00484 srcChars = 0;
00485 const wchar_t* sptr = source;
00486 while (*sptr++ != 0) srcChars++;
00487 }
00488 if ((dest != 0) && (destSize != 0))
00489 {
00490 size_t len = MIN (destSize - 1, srcChars);
00491 memcpy (dest, source, len * sizeof (wchar_t));
00492 *(dest + len) = 0;
00493 }
00494 return srcChars + 1;
00495 };
00496
00497 inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize,
00498 const wchar_t* source, size_t srcSize)
00499 {
00500 return UTF8to16 (dest, destSize, source, srcSize);
00501 };
00502
00503 inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize,
00504 const wchar_t* source, size_t srcSize)
00505 {
00506 return UTF8to32 (dest, destSize, source, srcSize);
00507 };
00508 #elif (CS_WCHAR_T_SIZE == 2)
00509
00510
00511
00518 inline static size_t UTF8toWC (wchar_t* dest, size_t destSize,
00519 const utf8_char* source, size_t srcSize)
00520 {
00521 return UTF8to16 ((utf16_char*)dest, destSize, source, srcSize);
00522 };
00523
00528 inline static size_t UTF16toWC (wchar_t* dest, size_t destSize,
00529 const utf16_char* source, size_t srcSize)
00530 {
00531 size_t srcChars = srcSize;
00532 if (srcSize == (size_t)-1)
00533 {
00534 srcChars = 0;
00535 const utf16_char* sptr = source;
00536 while (*sptr++ != 0) srcChars++;
00537 }
00538 if ((dest != 0) && (destSize != 0))
00539 {
00540 size_t len = MIN (destSize - 1, srcChars);
00541 memcpy (dest, source, len * sizeof (wchar_t));
00542 *(dest + len) = 0;
00543 }
00544 return srcChars + 1;
00545 };
00546
00551 inline static size_t UTF32toWC (wchar_t* dest, size_t destSize,
00552 const utf32_char* source, size_t srcSize)
00553 {
00554 return UTF32to16 ((utf16_char*)dest, destSize, source, srcSize);
00555 };
00556
00561 inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize,
00562 const wchar_t* source, size_t srcSize)
00563 {
00564 return UTF16to8 (dest, destSize, (utf16_char*)source, srcSize);
00565 };
00566
00571 inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize,
00572 const wchar_t* source, size_t srcSize)
00573 {
00574 size_t srcChars = srcSize;
00575 if (srcSize == (size_t)-1)
00576 {
00577 srcChars = 0;
00578 const wchar_t* sptr = source;
00579 while (*sptr++ != 0) srcChars++;
00580 }
00581 if ((dest != 0) && (destSize != 0))
00582 {
00583 size_t len = MIN (destSize - 1, srcChars);
00584 memcpy (dest, source, len * sizeof (wchar_t));
00585 *(dest + len) = 0;
00586 }
00587 return srcChars + 1;
00588 };
00589
00594 inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize,
00595 const wchar_t* source, size_t srcSize)
00596 {
00597 return UTF16to32 (dest, destSize, (utf16_char*)source, srcSize);
00598 };
00600 #elif (CS_WCHAR_T_SIZE == 4)
00601 inline static size_t UTF8toWC (wchar_t* dest, size_t destSize,
00602 const utf8_char* source, size_t srcSize)
00603 {
00604 return UTF8to32 ((utf32_char*)dest, destSize, source, srcSize);
00605 };
00606
00607 inline static size_t UTF16toWC (wchar_t* dest, size_t destSize,
00608 const utf16_char* source, size_t srcSize)
00609 {
00610 return UTF16to32 ((utf32_char*)dest, destSize, source, srcSize);
00611 };
00612
00613 inline static size_t UTF32toWC (wchar_t* dest, size_t destSize,
00614 const utf32_char* source, size_t srcSize)
00615 {
00616 size_t srcChars = srcSize;
00617 if (srcSize == (size_t)-1)
00618 {
00619 srcChars = 0;
00620 const utf32_char* sptr = source;
00621 while (*sptr++ != 0) srcChars++;
00622 }
00623 if ((dest != 0) && (destSize != 0))
00624 {
00625 size_t len = MIN (destSize - 1, srcChars);
00626 memcpy (dest, source, len * sizeof (wchar_t));
00627 *(dest + len) = 0;
00628 }
00629 return srcChars + 1;
00630 };
00631
00632 inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize,
00633 const wchar_t* source, size_t srcSize)
00634 {
00635 return UTF32to8 (dest, destSize, (utf32_char*)source, srcSize);
00636 };
00637
00638 inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize,
00639 const wchar_t* source, size_t srcSize)
00640 {
00641 return UTF32to16 (dest, destSize, (utf32_char*)source, srcSize);
00642 };
00643
00644 inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize,
00645 const wchar_t* source, size_t srcSize)
00646 {
00647 size_t srcChars = srcSize;
00648 if (srcSize == (size_t)-1)
00649 {
00650 srcChars = 0;
00651 const wchar_t* sptr = source;
00652 while (*sptr++ != 0) srcChars++;
00653 }
00654 if ((dest != 0) && (destSize != 0))
00655 {
00656 size_t len = MIN (destSize - 1, srcChars);
00657 memcpy (dest, source, len * sizeof (wchar_t));
00658 *(dest + len) = 0;
00659 }
00660 return srcChars + 1;
00661 };
00662 #else
00663 #error Odd-sized, unsupported wchar_t!
00664 #endif
00665
00666 };
00667
00670 #endif
00671