CrystalSpace

Public API Reference

Main Page   Modules   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members   Related Pages  

csuctransform.h

Go to the documentation of this file.
00001 /*
00002     Copyright (C) 2003 by Frank Richter
00003 
00004     This library is free software; you can redistribute it and/or
00005     modify it under the terms of the GNU Library General Public
00006     License as published by the Free Software Foundation; either
00007     version 2 of the License, or (at your option) any later version.
00008 
00009     This library is distributed in the hope that it will be useful,
00010     but WITHOUT ANY WARRANTY; without even the implied warranty of
00011     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00012     Library General Public License for more details.
00013 
00014     You should have received a copy of the GNU Library General Public
00015     License along with this library; if not, write to the Free
00016     Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
00017 */
00018 
00019 #ifndef __CS_CSUCTRANSFORM_H__
00020 #define __CS_CSUCTRANSFORM_H__
00021 
00022 #include "csunicode.h"
00023 
00031 
/// Matches the longest sequence EncodeUTF8 can emit: six utf8_char units.
00032 #define CS_UC_MAX_UTF8_ENCODED          6
00033 
/// Matches the longest sequence EncodeUTF16 can emit: two utf16_char units.
00034 #define CS_UC_MAX_UTF16_ENCODED         2
00035 
/// EncodeUTF32 emits exactly one utf32_char per code point.
00036 #define CS_UC_MAX_UTF32_ENCODED         1
00037  
00041 class csUnicodeTransform
00042 {
00043 public:
/*
 * Helper macros for the UTF8Decode/UTF16Decode/UTF32Decode bodies. They
 * expand in place and use the names those functions declare: 'str',
 * 'strlen', 'ch', 'isValid' and the local 'chUsed'.
 *
 * FAIL(ret): report a decoding failure. Clears *isValid when the caller
 * passed a non-null pointer, stores CS_UC_CHAR_REPLACER in 'ch' and returns
 * 'ret' from the enclosing function.
 */
00044 #define FAIL(ret)                               \
00045   {                                             \
00046     if (isValid) *isValid = false;              \
00047     ch = CS_UC_CHAR_REPLACER;                   \
00048     return ret;                                 \
00049   }
00050 
/*
 * SUCCEED: report success. Sets *isValid (when non-null) and returns the
 * number of code units consumed. It expands to two unbraced statements, so
 * it must stand as a statement of its own and never be the body of an
 * unbraced if/else.
 */
00051 #define SUCCEED                                 \
00052     if (isValid) *isValid = true;               \
00053     return chUsed;
00054   
/*
 * GET_NEXT(next): fetch the next code unit into 'next'.
 * Fails, returning the units consumed so far, when 'strlen' units have
 * already been read or when the fetched unit is 0 (a zero unit is treated
 * as end of input; 'str' has been advanced past it but 'chUsed' has not).
 * Otherwise 'chUsed' is incremented.
 */
00055 #define GET_NEXT(next)  \
00056   if (chUsed == strlen)                         \
00057   {                                             \
00058     FAIL(chUsed);                               \
00059   }                                             \
00060   next = *str++;                                \
00061   if (next == 0)                                \
00062   {                                             \
00063     FAIL(chUsed);                               \
00064   }                                             \
00065   chUsed++;                                     
00066   
/**
 * Decode one code point from a UTF-8 sequence.
 * \param str     Input code units. A null pointer fails with return value 0.
 * \param strlen  Number of code units available at \a str; decoding also
 *                stops at a zero unit. (The name shadows the C library
 *                function inside this body.)
 * \param ch      Receives the decoded code point, or CS_UC_CHAR_REPLACER on
 *                any failure.
 * \param isValid Optional. When non-null it is set to true on success and
 *                false on failure.
 * \return Number of code units consumed. This is 0 when \a str is null or
 *         the first unit cannot be read (no input left, or a zero unit),
 *         and 1 when the lead byte is not a valid sequence start.
 */
00082   inline static int UTF8Decode (const utf8_char* str, size_t strlen, 
00083     utf32_char& ch, bool* isValid = 0)
00084   {
00085     if (str == 0)
00086     {
00087       FAIL(0);
00088     }
00089     size_t chUsed = 0;
00090     
00091     utf8_char curCh;
00092     GET_NEXT(curCh);
00093     if ((curCh & 0x80) == 0)
00094     {
00095       // easy case
00096       ch = curCh;
00097       SUCCEED;
00098     }
00099     else
00100     {
00101       // Count with how many bytes this char is encoded.
            // n becomes the number of consecutive set bits starting at bit 7.
00102       int n = 0;
00103       while ((n < 7) && ((curCh & (1 << (7 - n))) != 0)) { n++; }
00104 
00105       if ((n < 2) || (n > 6))
00106       {
00107         // Invalid code: first char of a "sequence" must have
00108         // at least two and at most six MSBs set
00109         FAIL(1);
00110       }
00111 
            // Keep only the payload bits below the length marker of the lead byte.
00112       ch = (curCh & ((1 << (8 - n)) - 1));
00113       
00114       for (int i = 1; i < n; i++)
00115       {
00116         GET_NEXT(curCh);
              // Every trailing byte must have the form 10xxxxxx.
00117         if ((curCh & 0xc0) != 0x80)
00118         {
00119           FAIL(chUsed);
00120         }
00121         else
00122         {
00123           ch <<= 6;
00124           ch |= (curCh & 0x3f);
00125         }
00126       }
00127       
00128       // Check for "overlong" codes.
            // n is in 2..6 at this point, so "n > 0" below always holds and
            // "n > 6" in the last branch can never hold.
00129       if ((ch < 0x80) && (n > 0))
00130       {
00131         FAIL(chUsed);
00132       }
00133       else if ((ch < 0x800) && (n > 2))
00134       {
00135         FAIL(chUsed);
00136       }
00137       else if ((ch < 0x10000) && (n > 3))
00138       {
00139         FAIL(chUsed);
00140       }
00141       else if ((ch < 0x200000) && (n > 4))
00142       {
00143         FAIL(chUsed);
00144       }
00145       else if ((ch < 0x4000000) && (n > 5))
00146       {
00147         FAIL(chUsed);
00148       }
00149       else if ((ch < 0x80000000) && (n > 6))
00150       {
00151         FAIL(chUsed);
00152       }
00153       
            // Multi-byte results that are invalid or surrogate code points are rejected.
00154       if (CS_UC_IS_INVALID(ch) || CS_UC_IS_SURROGATE(ch))
00155         FAIL(chUsed);
00156       SUCCEED;
00157     }
00158   }
00159   
/**
 * Decode one code point from a UTF-16 sequence.
 * \param str     Input code units. A null pointer fails with return value 0.
 * \param strlen  Number of code units available at \a str; decoding also
 *                stops at a zero unit.
 * \param ch      Receives the decoded code point, or CS_UC_CHAR_REPLACER on
 *                any failure.
 * \param isValid Optional. When non-null it is set to true on success and
 *                false on failure.
 * \return Number of code units consumed (0 when nothing could be read).
 *         A high surrogate that is not followed by a low surrogate consumes
 *         only 1 unit, so the following unit is decoded by the next call.
 */
00164   inline static int UTF16Decode (const utf16_char* str, size_t strlen, 
00165     utf32_char& ch, bool* isValid = 0)
00166   {
00167     if (str == 0)
00168     {
00169       FAIL(0);
00170     }
00171     size_t chUsed = 0;
00172     
00173     utf16_char curCh;
00174     GET_NEXT(curCh);
00175     // Decode surrogate
00176     if (CS_UC_IS_SURROGATE (curCh))
00177     {
00178       // Invalid code: a pair must start with the high surrogate.
00179       if (!CS_UC_IS_HIGH_SURROGATE (curCh))
00180       {
00181         FAIL(chUsed);
00182       }
            // A surrogate pair stores (code point - 0x10000) as two 10-bit
            // halves; add the offset back while placing the upper half.
00183       ch = 0x10000 + ((curCh & 0x03ff) << 10);
00184       GET_NEXT(curCh);
00185       // Invalid code
00186       if (!CS_UC_IS_LOW_SURROGATE (curCh))
00187       {
00188         // Fail with 1 so the char is handled upon the next Decode.
00189         FAIL(1);
00190       }
00191       ch |= (curCh & 0x3ff);
            // The result is always within 0x10000..0x10FFFF here, so a pair
            // cannot be "overlong" and needs no further range check.
00195     }
00196     else
00197     {
00198       ch = curCh;
00199     }
00200     if (CS_UC_IS_INVALID(ch))
00201       FAIL(chUsed);
00202     SUCCEED;
00203   }
00204   
/**
 * Read one code point from a UTF-32 sequence.
 * \param str     Input code units. A null pointer fails with return value 0.
 * \param strlen  Number of code units available at \a str; reading also
 *                stops at a zero unit.
 * \param ch      Receives the code unit read, or CS_UC_CHAR_REPLACER when
 *                reading fails or CS_UC_IS_INVALID rejects the value.
 * \param isValid Optional. When non-null it is set to true on success and
 *                false on failure.
 * \return Number of code units consumed: 1 once a unit was read (even if it
 *         was rejected), 0 otherwise.
 */
00209   inline static int UTF32Decode (const utf32_char* str, size_t strlen, 
00210     utf32_char& ch, bool* isValid = 0)
00211   {
00212     if (str == 0)
00213     {
00214       FAIL(0);
00215     }
00216     size_t chUsed = 0;
00217     
          // The unit is read straight into the output parameter.
00218     GET_NEXT(ch);
00219     if (CS_UC_IS_INVALID(ch))
00220       FAIL(chUsed);
00221     SUCCEED;
00222   }
00224 #undef FAIL
00225 #undef SUCCEED
00226 #undef GET_NEXT
00227 
/*
 * _OUTPUT_CHAR(buf, chr): emit one code unit while encoding.
 * Uses the locals 'bufRemaining' and 'encodedLen' of the expanding function.
 * The unit is stored through 'buf' (which is then advanced) only while
 * 'bufRemaining' is positive and 'buf' is non-null; 'bufRemaining' is
 * decremented whenever it is positive, with or without a buffer.
 * 'encodedLen' is always incremented, so it ends up as the full encoded
 * length even when the output was truncated or there was no buffer.
 * The expansion is several unbraced statements, so use it only as a
 * statement of its own.
 * NOTE(review): names starting with an underscore followed by an uppercase
 * letter are reserved for the implementation.
 */
00228 #define _OUTPUT_CHAR(buf, chr)                          \
00229   if (bufRemaining > 0)                                 \
00230   {                                                     \
00231     if(buf) *buf++ = chr;                               \
00232     bufRemaining--;                                     \
00233   }                                                     \
00234   encodedLen++;
00235 
/* Shorthand used by the Encode* functions, whose output pointer is 'buf'. */
00236 #define OUTPUT_CHAR(chr) _OUTPUT_CHAR(buf, chr)
00237   
/**
 * Encode one code point as UTF-8.
 * \param ch      Code point to encode.
 * \param buf     Output buffer; may be null, in which case nothing is stored
 *                and only the length is computed.
 * \param bufsize Capacity of \a buf in utf8_char units. Units that do not
 *                fit are dropped, so the output may be a truncated sequence.
 * \return Number of units the complete encoding needs (1 to 6), regardless
 *         of how many were actually stored. 0 when \a ch is rejected by
 *         CS_UC_IS_INVALID or CS_UC_IS_SURROGATE, or when it is 0x80000000
 *         or above (no branch handles that range).
 */
00250   inline static int EncodeUTF8 (const utf32_char ch, utf8_char* buf, 
00251     size_t bufsize)
00252   {
00253     if ((CS_UC_IS_INVALID(ch)) || (CS_UC_IS_SURROGATE(ch))) 
00254       return 0;
00255     size_t bufRemaining = bufsize, encodedLen = 0;
00256     
          // Each branch emits a lead byte carrying the length marker and the
          // top payload bits, followed by 10xxxxxx bytes of 6 payload bits.
00257     if (ch < 0x80)
00258     {
00259       OUTPUT_CHAR ((utf8_char)ch);
00260     }
00261     else if (ch < 0x800)
00262     {
00263       OUTPUT_CHAR ((utf8_char)(0xc0 | (ch >> 6)));
00264       OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f)));
00265     }
00266     else if (ch < 0x10000)
00267     {
00268       OUTPUT_CHAR ((utf8_char)(0xe0 | (ch >> 12)));
00269       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f)));
00270       OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f)));
00271     }
00272     else if (ch < 0x200000)
00273     {
00274       OUTPUT_CHAR ((utf8_char)(0xf0 | (ch >> 18)));
00275       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f)));
00276       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f)));
00277       OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f)));
00278     }
00279     else if (ch < 0x4000000)
00280     {
00281       OUTPUT_CHAR ((utf8_char)(0xf8 | (ch >> 24)));
00282       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 18) & 0x3f)));
00283       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f)));
00284       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f)));
00285       OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f)));
00286     }
00287     else if (ch < 0x80000000)
00288     {
00289       OUTPUT_CHAR ((utf8_char)(0xfc | (ch >> 30)));
00290       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 24) & 0x3f)));
00291       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 18) & 0x3f)));
00292       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f)));
00293       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f)));
00294       OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f)));
00295     }
00296     return encodedLen;
00297   }
00298     
/**
 * Encode one code point as UTF-16.
 * \param ch      Code point to encode.
 * \param buf     Output buffer; may be null, in which case nothing is stored
 *                and only the length is computed.
 * \param bufsize Capacity of \a buf in utf16_char units. Units that do not
 *                fit are dropped, so a pair may be stored only partially.
 * \return Number of units the complete encoding needs (1, or 2 for a
 *         surrogate pair), regardless of how many were actually stored.
 *         0 when \a ch is rejected by CS_UC_IS_INVALID or
 *         CS_UC_IS_SURROGATE, or lies above 0x10FFFF, the highest code
 *         point UTF-16 can represent.
 */
00303   inline static int EncodeUTF16 (const utf32_char ch, utf16_char* buf, 
00304     size_t bufsize)
00305   {
00306     if ((CS_UC_IS_INVALID(ch)) || (CS_UC_IS_SURROGATE(ch))) 
00307       return 0;
00308     size_t bufRemaining = bufsize, encodedLen = 0;
00309     
00310     if (ch < 0x10000)
00311     {
00312       OUTPUT_CHAR((utf16_char)ch);
00313     }
00314     else if (ch < 0x110000)
00315     {
            // A surrogate pair carries (ch - 0x10000) as two 10-bit halves.
            // 0x10000 has no bits in the low 10, so the low half can be
            // taken from ch directly.
00316       OUTPUT_CHAR((utf16_char)(((ch - 0x10000) >> 10) | CS_UC_CHAR_HIGH_SURROGATE_FIRST));
00317       OUTPUT_CHAR((utf16_char)((ch & 0x3ff) | CS_UC_CHAR_LOW_SURROGATE_FIRST));
00318     }
00319     else
00320       return 0;
00321     
00322     return encodedLen;
00323   }
00324 
/**
 * Encode one code point as UTF-32, i.e. store it unchanged.
 * \param ch      Code point to encode.
 * \param buf     Output buffer; may be null, in which case nothing is stored.
 * \param bufsize Capacity of \a buf in utf32_char units; with 0 nothing is
 *                stored.
 * \return 1 for an accepted code point, whether or not it was stored;
 *         0 when \a ch is rejected by CS_UC_IS_INVALID or CS_UC_IS_SURROGATE.
 */
00329   inline static int EncodeUTF32 (const utf32_char ch, utf32_char* buf, 
00330     size_t bufsize)
00331   {
00332     if ((CS_UC_IS_INVALID(ch)) || (CS_UC_IS_SURROGATE(ch))) 
00333       return 0;
00334     size_t bufRemaining = bufsize, encodedLen = 0;
00335     
00336     OUTPUT_CHAR(ch);
00337     
00338     return encodedLen;
00339   }
00341 #undef OUTPUT_CHAR
00342   
/*
 * Rebind OUTPUT_CHAR so that it writes through a pointer named 'dest'
 * instead of 'buf'.
 * NOTE(review): nothing between this definition and its #undef further down
 * expands OUTPUT_CHAR -- confirm whether this variant is still needed.
 */
00343 #define OUTPUT_CHAR(chr) _OUTPUT_CHAR(dest, chr)
00344   
/*
 * UCTF_CONVERTER(funcName, fromType, decoder, toType, encoder)
 *
 * Generates a static string conversion function
 *
 *   size_t funcName (toType* dest, size_t destSize,
 *                    const fromType* source, size_t srcSize = (size_t)-1)
 *
 * which decodes 'source' one code point at a time with 'decoder' and
 * re-encodes it into 'dest' with 'encoder'.
 *
 *  - dest      Output buffer; may be null to only compute the needed size.
 *  - destSize  Capacity of 'dest' in code units, terminator included. One
 *              unit is reserved for the terminator. With destSize == 0
 *              nothing at all is written to 'dest'.
 *  - source    Input string. A null pointer makes the function return 0.
 *  - srcSize   Number of input code units, or (size_t)-1 for a
 *              zero-terminated string. 0 makes the function return 0.
 *
 * A code point the encoder refuses (it returns 0) is replaced by
 * CS_UC_CHAR_REPLACER. The decoder's validity flag is not requested; on
 * failure it already yields CS_UC_CHAR_REPLACER.
 *
 * Returns the number of code units of the complete conversion including the
 * terminator, even when 'dest' was too small or null. In the two
 * early-return cases it returns 0 and leaves 'dest' untouched.
 *
 * NOTE(review): a multi-unit character that only partly fits is stored
 * partially, because the encoder writes as many units as still fit.
 */
00345 #define UCTF_CONVERTER(funcName, fromType, decoder, toType, encoder)    \
00346   inline static size_t funcName (toType* dest, size_t destSize,         \
00347     const fromType* source, size_t srcSize = (size_t)-1)                \
00348   {                                                                     \
00349     if ((srcSize == 0) || (source == 0))                                \
00350       return 0;                                                         \
00351                                                                         \
00352     size_t bufRemaining = (destSize > 0) ? destSize - 1 : 0;            \
00353     size_t encodedLen = 0;                                              \
00354                                                                         \
00355     size_t srcChars = srcSize;                                          \
00356                                                                         \
00357     if (srcSize == (size_t)-1)                                          \
00358     {                                                                   \
00359       srcChars = 0;                                                     \
00360       const fromType* sptr = source;                                    \
00361       while (*sptr++ != 0) srcChars++;                                  \
00362     }                                                                   \
00363                                                                         \
00364     while (srcChars > 0)                                                \
00365     {                                                                   \
00366       utf32_char ch;                                                    \
00367       int scnt = decoder (source, srcChars, ch, 0);                     \
00368       if (scnt == 0) break;                                             \
00369       int dcnt = encoder (ch, dest, bufRemaining);                      \
00370       if (dcnt == 0)                                                    \
00371       {                                                                 \
00372         dcnt = encoder (CS_UC_CHAR_REPLACER, dest, bufRemaining);       \
00373       }                                                                 \
00374                                                                         \
00375       if ((size_t)dcnt >= bufRemaining)                                 \
00376       {                                                                 \
00377         if (dest && (destSize > 0)) dest += bufRemaining;               \
00378         bufRemaining = 0;                                               \
00379       }                                                                 \
00380       else                                                              \
00381       {                                                                 \
00382         bufRemaining -= dcnt;                                           \
00383         if (dest && (destSize > 0)) dest += dcnt;                       \
00384       }                                                                 \
00385       encodedLen += dcnt;                                               \
00386       if ((size_t)scnt >= srcChars) break;                              \
00387       srcChars -= scnt;                                                 \
00388       source += scnt;                                                   \
00389     }                                                                   \
00390                                                                         \
          /* A zero-sized buffer has no room for the terminator either. */  \
00391     if (dest && (destSize > 0)) *dest = 0;                              \
00392                                                                         \
00393     return encodedLen + 1;                                              \
00394   }
00395 
/*
 * String converters generated by UCTF_CONVERTER. Each one has the signature
 *   size_t NAME (TO* dest, size_t destSize,
 *                const FROM* source, size_t srcSize = (size_t)-1)
 * and returns the length of the complete conversion including terminator.
 */
/* UTF-8 source: to UTF-16 and to UTF-32. */
00413   UCTF_CONVERTER (UTF8to16, utf8_char, UTF8Decode, utf16_char, EncodeUTF16);
00418   UCTF_CONVERTER (UTF8to32, utf8_char, UTF8Decode, utf32_char, EncodeUTF32);
00419 
/* UTF-16 source: to UTF-8 and to UTF-32. */
00424   UCTF_CONVERTER (UTF16to8, utf16_char, UTF16Decode, utf8_char, EncodeUTF8);
00429   UCTF_CONVERTER (UTF16to32, utf16_char, UTF16Decode, utf32_char, EncodeUTF32);
00430   
/* UTF-32 source: to UTF-8 and to UTF-16. */
00435   UCTF_CONVERTER (UTF32to8, utf32_char, UTF32Decode, utf8_char, EncodeUTF8);
00440   UCTF_CONVERTER (UTF32to16, utf32_char, UTF32Decode, utf16_char, EncodeUTF16);
/*
 * Remove the helper macros so they do not leak into code that includes this
 * header. _OUTPUT_CHAR is included because OUTPUT_CHAR is only a thin alias
 * for it and nothing below this point expands it.
 */
00443 #undef UCTF_CONVERTER
00444 #undef OUTPUT_CHAR
00445 #undef _OUTPUT_CHAR
00446 #if (CS_WCHAR_T_SIZE == 1)
/**
 * Copy a UTF-8 string into a wchar_t buffer (branch compiled when
 * CS_WCHAR_T_SIZE == 1). The code units are copied verbatim with memcpy;
 * nothing is decoded or validated.
 * \param dest     Output buffer; may be null to only compute the size.
 * \param destSize Capacity of \a dest in units, terminator included. At
 *                 most destSize - 1 units are copied, then a 0 is appended.
 * \param source   Input string.
 * \param srcSize  Number of input units, or (size_t)-1 for a
 *                 zero-terminated string.
 * \return Number of source units plus one for the terminator, whether or
 *         not the copy was truncated.
 */
00447   inline static size_t UTF8toWC (wchar_t* dest, size_t destSize, 
00448     const utf8_char* source, size_t srcSize)
00449   {
00450     size_t srcChars = srcSize;
00451     if (srcSize == (size_t)-1)
00452     {
            // No explicit length: count units up to the terminating zero.
00453       srcChars = 0;
00454       const utf8_char* sptr = source;
00455       while (*sptr++ != 0) srcChars++;
00456     }
00457     if ((dest != 0) && (destSize != 0))
00458     {
00459       size_t len = MIN (destSize - 1, srcChars);
            // Copy exactly the clamped number of units computed above.
00460       memcpy (dest, source, len * sizeof (wchar_t));
00461       *(dest + len) = 0;
00462     }
00463     return srcChars + 1;
00464   };
00465 
/**
 * Convert UTF-16 to a wchar_t string (branch compiled when
 * CS_WCHAR_T_SIZE == 1): the wchar_t buffer is treated as a utf8_char
 * buffer and the work is done by UTF16to8.
 * \return The value returned by UTF16to8.
 */
00466   inline static size_t UTF16toWC (wchar_t* dest, size_t destSize, 
00467     const utf16_char* source, size_t srcSize)
00468   {
00469     return UTF16to8 ((utf8_char*)dest, destSize, source, srcSize);
00470   };
00471 
/**
 * Convert UTF-32 to a wchar_t string (branch compiled when
 * CS_WCHAR_T_SIZE == 1): the wchar_t buffer is treated as a utf8_char
 * buffer and the work is done by UTF32to8.
 * \return The value returned by UTF32to8.
 */
00472   inline static size_t UTF32toWC (wchar_t* dest, size_t destSize, 
00473     const utf32_char* source, size_t srcSize)
00474   {
00475     return UTF32to8 ((utf8_char*)dest, destSize, source, srcSize);
00476   };
00477   
/**
 * Copy a wchar_t string into a UTF-8 buffer (branch compiled when
 * CS_WCHAR_T_SIZE == 1). The units are copied verbatim with memcpy; nothing
 * is decoded or validated.
 * \param dest     Output buffer; may be null to only compute the size.
 * \param destSize Capacity of \a dest in units, terminator included. At
 *                 most destSize - 1 units are copied, then a 0 is appended.
 * \param source   Input string.
 * \param srcSize  Number of input units, or (size_t)-1 for a
 *                 zero-terminated string.
 * \return Number of source units plus one for the terminator, whether or
 *         not the copy was truncated.
 */
00478   inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize, 
00479     const wchar_t* source, size_t srcSize)
00480   {
00481     size_t unitCount = srcSize;
00482     if (unitCount == (size_t)-1)
00483     {
00484       // No explicit length: count units up to the terminating zero.
00485       unitCount = 0;
00486       while (source[unitCount] != 0) ++unitCount;
00487     }
00488     if (dest && destSize)
00489     {
00490       const size_t copyCount = MIN (destSize - 1, unitCount);
00491       memcpy (dest, source, copyCount * sizeof (wchar_t));
00492       dest[copyCount] = 0;
00493     }
00494     return 1 + unitCount;
00495   };
00496 
/**
 * Convert a wchar_t string to UTF-16 (branch compiled when
 * CS_WCHAR_T_SIZE == 1): the wchar_t input is treated as utf8_char data and
 * the work is done by UTF8to16.
 * \return The value returned by UTF8to16.
 */
00497   inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize, 
00498     const wchar_t* source, size_t srcSize)
00499   {
          // wchar_t* does not convert implicitly to another character
          // pointer type, so the reinterpretation has to be spelled out.
00500     return UTF8to16 (dest, destSize, (const utf8_char*)source, srcSize);
00501   };
00502 
/**
 * Convert a wchar_t string to UTF-32 (branch compiled when
 * CS_WCHAR_T_SIZE == 1): the wchar_t input is treated as utf8_char data and
 * the work is done by UTF8to32.
 * \return The value returned by UTF8to32.
 */
00503   inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize, 
00504     const wchar_t* source, size_t srcSize)
00505   {
          // wchar_t* does not convert implicitly to another character
          // pointer type, so the reinterpretation has to be spelled out.
00506     return UTF8to32 (dest, destSize, (const utf8_char*)source, srcSize);
00507   };
00508 #elif (CS_WCHAR_T_SIZE == 2)
00509   // The doxygen-documented versions of the wchar_t methods live in this
00510   // branch, because a wchar_t size of 2 is the default.
00511   
/**
 * Convert UTF-8 to a wchar_t string (branch compiled when
 * CS_WCHAR_T_SIZE == 2): the wchar_t buffer is treated as a utf16_char
 * buffer and the work is done by UTF8to16.
 * \return The value returned by UTF8to16.
 */
00518   inline static size_t UTF8toWC (wchar_t* dest, size_t destSize, 
00519     const utf8_char* source, size_t srcSize)
00520   {
00521     return UTF8to16 ((utf16_char*)dest, destSize, source, srcSize);
00522   };
00523 
/**
 * Copy a UTF-16 string into a wchar_t buffer (branch compiled when
 * CS_WCHAR_T_SIZE == 2). The units are copied verbatim with memcpy; nothing
 * is decoded or validated.
 * \param dest     Output buffer; may be null to only compute the size.
 * \param destSize Capacity of \a dest in units, terminator included. At
 *                 most destSize - 1 units are copied, then a 0 is appended.
 * \param source   Input string.
 * \param srcSize  Number of input units, or (size_t)-1 for a
 *                 zero-terminated string.
 * \return Number of source units plus one for the terminator, whether or
 *         not the copy was truncated.
 */
00528   inline static size_t UTF16toWC (wchar_t* dest, size_t destSize, 
00529     const utf16_char* source, size_t srcSize)
00530   {
00531     size_t unitCount = srcSize;
00532     if (unitCount == (size_t)-1)
00533     {
00534       // No explicit length: count units up to the terminating zero.
00535       unitCount = 0;
00536       while (source[unitCount] != 0) ++unitCount;
00537     }
00538     if (dest && destSize)
00539     {
00540       const size_t copyCount = MIN (destSize - 1, unitCount);
00541       memcpy (dest, source, copyCount * sizeof (wchar_t));
00542       dest[copyCount] = 0;
00543     }
00544     return 1 + unitCount;
00545   };
00546 
/**
 * Convert UTF-32 to a wchar_t string (branch compiled when
 * CS_WCHAR_T_SIZE == 2): the wchar_t buffer is treated as a utf16_char
 * buffer and the work is done by UTF32to16.
 * \return The value returned by UTF32to16.
 */
00551   inline static size_t UTF32toWC (wchar_t* dest, size_t destSize, 
00552     const utf32_char* source, size_t srcSize)
00553   {
00554     return UTF32to16 ((utf16_char*)dest, destSize, source, srcSize);
00555   };
00556   
/**
 * Convert a wchar_t string to UTF-8 (branch compiled when
 * CS_WCHAR_T_SIZE == 2): the wchar_t input is treated as utf16_char data
 * and the work is done by UTF16to8.
 * \return The value returned by UTF16to8.
 */
00561   inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize, 
00562     const wchar_t* source, size_t srcSize)
00563   {
          // Keep the const qualifier while reinterpreting the input.
00564     return UTF16to8 (dest, destSize, (const utf16_char*)source, srcSize);
00565   };
00566 
/**
 * Copy a wchar_t string into a UTF-16 buffer (branch compiled when
 * CS_WCHAR_T_SIZE == 2). The units are copied verbatim with memcpy; nothing
 * is decoded or validated.
 * \param dest     Output buffer; may be null to only compute the size.
 * \param destSize Capacity of \a dest in units, terminator included. At
 *                 most destSize - 1 units are copied, then a 0 is appended.
 * \param source   Input string.
 * \param srcSize  Number of input units, or (size_t)-1 for a
 *                 zero-terminated string.
 * \return Number of source units plus one for the terminator, whether or
 *         not the copy was truncated.
 */
00571   inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize, 
00572     const wchar_t* source, size_t srcSize)
00573   {
00574     size_t unitCount = srcSize;
00575     if (unitCount == (size_t)-1)
00576     {
00577       // No explicit length: count units up to the terminating zero.
00578       unitCount = 0;
00579       while (source[unitCount] != 0) ++unitCount;
00580     }
00581     if (dest && destSize)
00582     {
00583       const size_t copyCount = MIN (destSize - 1, unitCount);
00584       memcpy (dest, source, copyCount * sizeof (wchar_t));
00585       dest[copyCount] = 0;
00586     }
00587     return 1 + unitCount;
00588   };
00589 
/**
 * Convert a wchar_t string to UTF-32 (branch compiled when
 * CS_WCHAR_T_SIZE == 2): the wchar_t input is treated as utf16_char data
 * and the work is done by UTF16to32.
 * \return The value returned by UTF16to32.
 */
00594   inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize, 
00595     const wchar_t* source, size_t srcSize)
00596   {
          // Keep the const qualifier while reinterpreting the input.
00597     return UTF16to32 (dest, destSize, (const utf16_char*)source, srcSize);
00598   };
00600 #elif (CS_WCHAR_T_SIZE == 4)
/**
 * Convert UTF-8 to a wchar_t string (branch compiled when
 * CS_WCHAR_T_SIZE == 4): the wchar_t buffer is treated as a utf32_char
 * buffer and the work is done by UTF8to32.
 * \return The value returned by UTF8to32.
 */
00601   inline static size_t UTF8toWC (wchar_t* dest, size_t destSize, 
00602     const utf8_char* source, size_t srcSize)
00603   {
00604     return UTF8to32 ((utf32_char*)dest, destSize, source, srcSize);
00605   };
00606 
/**
 * Convert UTF-16 to a wchar_t string (branch compiled when
 * CS_WCHAR_T_SIZE == 4): the wchar_t buffer is treated as a utf32_char
 * buffer and the work is done by UTF16to32.
 * \return The value returned by UTF16to32.
 */
00607   inline static size_t UTF16toWC (wchar_t* dest, size_t destSize, 
00608     const utf16_char* source, size_t srcSize)
00609   {
00610     return UTF16to32 ((utf32_char*)dest, destSize, source, srcSize);
00611   };
00612 
/**
 * Copy a UTF-32 string into a wchar_t buffer (branch compiled when
 * CS_WCHAR_T_SIZE == 4). The units are copied verbatim with memcpy; nothing
 * is decoded or validated.
 * \param dest     Output buffer; may be null to only compute the size.
 * \param destSize Capacity of \a dest in units, terminator included. At
 *                 most destSize - 1 units are copied, then a 0 is appended.
 * \param source   Input string.
 * \param srcSize  Number of input units, or (size_t)-1 for a
 *                 zero-terminated string.
 * \return Number of source units plus one for the terminator, whether or
 *         not the copy was truncated.
 */
00613   inline static size_t UTF32toWC (wchar_t* dest, size_t destSize, 
00614     const utf32_char* source,  size_t srcSize)
00615   {
00616     size_t unitCount = srcSize;
00617     if (unitCount == (size_t)-1)
00618     {
00619       // No explicit length: count units up to the terminating zero.
00620       unitCount = 0;
00621       while (source[unitCount] != 0) ++unitCount;
00622     }
00623     if (dest && destSize)
00624     {
00625       const size_t copyCount = MIN (destSize - 1, unitCount);
00626       memcpy (dest, source, copyCount * sizeof (wchar_t));
00627       dest[copyCount] = 0;
00628     }
00629     return 1 + unitCount;
00630   };
00631   
/**
 * Convert a wchar_t string to UTF-8 (branch compiled when
 * CS_WCHAR_T_SIZE == 4): the wchar_t input is treated as utf32_char data
 * and the work is done by UTF32to8.
 * \return The value returned by UTF32to8.
 */
00632   inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize, 
00633     const wchar_t* source, size_t srcSize)
00634   {
          // Keep the const qualifier while reinterpreting the input.
00635     return UTF32to8 (dest, destSize, (const utf32_char*)source, srcSize);
00636   };
00637 
/**
 * Convert a wchar_t string to UTF-16 (branch compiled when
 * CS_WCHAR_T_SIZE == 4): the wchar_t input is treated as utf32_char data
 * and the work is done by UTF32to16.
 * \return The value returned by UTF32to16.
 */
00638   inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize, 
00639     const wchar_t* source, size_t srcSize)
00640   {
          // Keep the const qualifier while reinterpreting the input.
00641     return UTF32to16 (dest, destSize, (const utf32_char*)source, srcSize);
00642   };
00643 
/**
 * Copy a wchar_t string into a UTF-32 buffer (branch compiled when
 * CS_WCHAR_T_SIZE == 4). The units are copied verbatim with memcpy; nothing
 * is decoded or validated.
 * \param dest     Output buffer; may be null to only compute the size.
 * \param destSize Capacity of \a dest in units, terminator included. At
 *                 most destSize - 1 units are copied, then a 0 is appended.
 * \param source   Input string.
 * \param srcSize  Number of input units, or (size_t)-1 for a
 *                 zero-terminated string.
 * \return Number of source units plus one for the terminator, whether or
 *         not the copy was truncated.
 */
00644   inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize, 
00645     const wchar_t* source, size_t srcSize)
00646   {
00647     size_t unitCount = srcSize;
00648     if (unitCount == (size_t)-1)
00649     {
00650       // No explicit length: count units up to the terminating zero.
00651       unitCount = 0;
00652       while (source[unitCount] != 0) ++unitCount;
00653     }
00654     if (dest && destSize)
00655     {
00656       const size_t copyCount = MIN (destSize - 1, unitCount);
00657       memcpy (dest, source, copyCount * sizeof (wchar_t));
00658       dest[copyCount] = 0;
00659     }
00660     return 1 + unitCount;
00661   };
00662 #else
00663   #error Odd-sized, unsupported wchar_t!
00664 #endif
00665 
00666 };
00667 
00670 #endif
00671 

Generated for Crystal Space by doxygen 1.2.14