#include "unicode/utypes.h"
#include "unicode/ucnv_err.h"
Go to the source code of this file.
Defines | |
#define | UCNV_MAX_CONVERTER_NAME_LENGTH 60 |
#define | UCNV_MAX_FULL_FILE_NAME_LENGTH (600+UCNV_MAX_CONVERTER_NAME_LENGTH) |
#define | UCNV_SI 0x0F |
#define | UCNV_SO 0x0E |
#define | UCNV_OPTION_SEP_CHAR ',' |
Character that separates converter names from options and options from each other. | |
#define | UCNV_OPTION_SEP_STRING "," |
String version of UCNV_OPTION_SEP_CHAR. | |
#define | UCNV_VALUE_SEP_CHAR '=' |
Character that separates a converter option from its value. | |
#define | UCNV_VALUE_SEP_STRING "=" |
String version of UCNV_VALUE_SEP_CHAR. | |
#define | UCNV_LOCALE_OPTION_STRING ",locale=" |
Converter option for specifying a locale. | |
#define | U_CNV_SAFECLONE_BUFFERSIZE 512 |
ICU 1.8 | |
Typedefs | |
typedef UConverter | UConverter |
typedef UConverterToUnicodeArgs * | args |
typedef UConverterToUnicodeArgs const char * | codePoints |
typedef UConverterToUnicodeArgs const char int32_t | length |
typedef UConverterToUnicodeArgs const char int32_t UConverterCallbackReason | reason |
typedef UConverterFromUnicodeArgs const UChar * | codeUnits |
typedef UConverterFromUnicodeArgs const UChar int32_t UChar32 | codePoint |
Enumerations | |
enum | UConverterType { UCNV_UNSUPPORTED_CONVERTER = -1, UCNV_SBCS = 0, UCNV_DBCS = 1, UCNV_MBCS = 2, UCNV_LATIN_1 = 3, UCNV_UTF8 = 4, UCNV_UTF16_BigEndian = 5, UCNV_UTF16_LittleEndian = 6, UCNV_UTF32_BigEndian = 7, UCNV_UTF32_LittleEndian = 8, UCNV_EBCDIC_STATEFUL = 9, UCNV_ISO_2022 = 10, UCNV_LMBCS_1 = 11, UCNV_LMBCS_2, UCNV_LMBCS_3, UCNV_LMBCS_4, UCNV_LMBCS_5, UCNV_LMBCS_6, UCNV_LMBCS_8, UCNV_LMBCS_11, UCNV_LMBCS_16, UCNV_LMBCS_17, UCNV_LMBCS_18, UCNV_LMBCS_19, UCNV_LMBCS_LAST = UCNV_LMBCS_19, UCNV_HZ, UCNV_SCSU, UCNV_ISCII, UCNV_US_ASCII, UCNV_UTF7, UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES } |
Enum for specifying basic types of converters. More... | |
enum | UConverterPlatform { UCNV_UNKNOWN = -1, UCNV_IBM = 0 } |
Enum for specifying which platform a converter ID refers to. More... | |
Functions | |
typedef | void (U_EXPORT2 *UConverterToUCallback)(const void *context |
Function pointer for error callback in the codepage to unicode direction. | |
U_CAPI int U_EXPORT2 | ucnv_compareNames (const char *name1, const char *name2) |
Do a fuzzy compare of two converter/alias names. | |
U_CAPI UConverter *U_EXPORT2 | ucnv_open (const char *converterName, UErrorCode *err) |
Creates a UConverter object with the names specified as a C string. | |
U_CAPI UConverter *U_EXPORT2 | ucnv_openU (const UChar *name, UErrorCode *err) |
Creates a Unicode converter with the names specified as a Unicode string. | |
U_CAPI UConverter *U_EXPORT2 | ucnv_openCCSID (int32_t codepage, UConverterPlatform platform, UErrorCode *err) |
Creates a UConverter object from a CCSID number and platform pair. | |
U_CAPI UConverter *U_EXPORT2 | ucnv_safeClone (const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status) |
Thread safe cloning operation. | |
U_CAPI void U_EXPORT2 | ucnv_close (UConverter *converter) |
Deletes the unicode converter and releases resources associated with just this instance. | |
U_CAPI void U_EXPORT2 | ucnv_getSubstChars (const UConverter *converter, char *subChars, int8_t *len, UErrorCode *err) |
Fills in the output parameter, subChars, with the substitution characters as multiple bytes. | |
U_CAPI void U_EXPORT2 | ucnv_setSubstChars (UConverter *converter, const char *subChars, int8_t len, UErrorCode *err) |
Sets the substitution chars when converting from unicode to a codepage. | |
U_CAPI void U_EXPORT2 | ucnv_getInvalidChars (const UConverter *converter, char *errBytes, int8_t *len, UErrorCode *err) |
Fills in the output parameter, errBytes, with the error characters from the last failing conversion. | |
U_CAPI void U_EXPORT2 | ucnv_getInvalidUChars (const UConverter *converter, UChar *errUChars, int8_t *len, UErrorCode *err) |
Fills in the output parameter, errChars, with the error characters from the last failing conversion. | |
U_CAPI void U_EXPORT2 | ucnv_reset (UConverter *converter) |
Resets the state of a converter to the default state. | |
U_CAPI void U_EXPORT2 | ucnv_resetToUnicode (UConverter *converter) |
Resets the to-Unicode part of a converter state to the default state. | |
U_CAPI void U_EXPORT2 | ucnv_resetFromUnicode (UConverter *converter) |
Resets the from-Unicode part of a converter state to the default state. | |
U_CAPI int8_t U_EXPORT2 | ucnv_getMaxCharSize (const UConverter *converter) |
Returns the maximum length of bytes used by a character. | |
U_CAPI int8_t U_EXPORT2 | ucnv_getMinCharSize (const UConverter *converter) |
Returns the minimum byte length for characters in this codepage. | |
U_CAPI int32_t U_EXPORT2 | ucnv_getDisplayName (const UConverter *converter, const char *displayLocale, UChar *displayName, int32_t displayNameCapacity, UErrorCode *err) |
Returns the display name of the converter passed in based on the Locale passed in. | |
U_CAPI const char *U_EXPORT2 | ucnv_getName (const UConverter *converter, UErrorCode *err) |
Gets the internal, canonical name of the converter (zero-terminated). | |
U_CAPI int32_t U_EXPORT2 | ucnv_getCCSID (const UConverter *converter, UErrorCode *err) |
Gets a codepage number associated with the converter. | |
U_CAPI UConverterPlatform U_EXPORT2 | ucnv_getPlatform (const UConverter *converter, UErrorCode *err) |
Gets a codepage platform associated with the converter. | |
U_CAPI UConverterType U_EXPORT2 | ucnv_getType (const UConverter *converter) |
Gets the type of the converter e.g. | |
U_CAPI void U_EXPORT2 | ucnv_getStarters (const UConverter *converter, UBool starters[256], UErrorCode *err) |
Gets the "starter" (lead) bytes for converters of type MBCS. | |
U_CAPI void U_EXPORT2 | ucnv_getToUCallBack (const UConverter *converter, UConverterToUCallback *action, const void **context) |
Gets the current callback function used by the converter when an illegal or invalid codepage sequence is found. | |
U_CAPI void U_EXPORT2 | ucnv_getFromUCallBack (const UConverter *converter, UConverterFromUCallback *action, const void **context) |
Gets the current callback function used by the converter when illegal or invalid Unicode sequence is found. | |
U_CAPI void U_EXPORT2 | ucnv_setToUCallBack (UConverter *converter, UConverterToUCallback newAction, const void *newContext, UConverterToUCallback *oldAction, const void **oldContext, UErrorCode *err) |
Changes the callback function used by the converter when an illegal or invalid sequence is found. | |
U_CAPI void U_EXPORT2 | ucnv_setFromUCallBack (UConverter *converter, UConverterFromUCallback newAction, const void *newContext, UConverterFromUCallback *oldAction, const void **oldContext, UErrorCode *err) |
Changes the current callback function used by the converter when an illegal or invalid sequence is found. | |
U_CAPI void U_EXPORT2 | ucnv_fromUnicode (UConverter *converter, char **target, const char *targetLimit, const UChar **source, const UChar *sourceLimit, int32_t *offsets, UBool flush, UErrorCode *err) |
Converts an array of unicode characters to an array of codepage characters. | |
U_CAPI void U_EXPORT2 | ucnv_toUnicode (UConverter *converter, UChar **target, const UChar *targetLimit, const char **source, const char *sourceLimit, int32_t *offsets, UBool flush, UErrorCode *err) |
Converts a buffer of codepage bytes into an array of unicode UChars characters. | |
U_CAPI int32_t U_EXPORT2 | ucnv_fromUChars (UConverter *cnv, char *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode) |
Convert the Unicode string into a codepage string using an existing UConverter. | |
U_CAPI int32_t U_EXPORT2 | ucnv_toUChars (UConverter *cnv, UChar *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode) |
Convert the codepage string into a Unicode string using an existing UConverter. | |
U_CAPI UChar32 U_EXPORT2 | ucnv_getNextUChar (UConverter *converter, const char **source, const char *sourceLimit, UErrorCode *err) |
Will convert a codepage buffer into unicode one character at a time. | |
U_CAPI int32_t U_EXPORT2 | ucnv_convert (const char *toConverterName, const char *fromConverterName, char *target, int32_t targetCapacity, const char *source, int32_t sourceLength, UErrorCode *err) |
Will convert a sequence of bytes from one codepage to another. | |
U_CAPI int32_t U_EXPORT2 | ucnv_flushCache (void) |
Frees up memory occupied by unused, cached converter shared data. | |
U_CAPI int32_t U_EXPORT2 | ucnv_countAvailable (void) |
Returns the number of available converters, as per the alias file. | |
U_CAPI const char *U_EXPORT2 | ucnv_getAvailableName (int32_t n) |
Gets the name of the specified converter from a list of all converters contained in the alias file. | |
U_CAPI uint16_t U_EXPORT2 | ucnv_countAliases (const char *alias, UErrorCode *pErrorCode) |
Gives the number of aliases for a given converter or alias name. | |
U_CAPI const char *U_EXPORT2 | ucnv_getAlias (const char *alias, uint16_t n, UErrorCode *pErrorCode) |
Gives the name of the alias at given index of alias list. | |
U_CAPI void U_EXPORT2 | ucnv_getAliases (const char *alias, const char **aliases, UErrorCode *pErrorCode) |
Fill-up the list of alias names for the given alias. | |
U_CAPI uint16_t U_EXPORT2 | ucnv_countStandards (void) |
Gives the number of standards associated to converter names. | |
U_CAPI const char *U_EXPORT2 | ucnv_getStandard (uint16_t n, UErrorCode *pErrorCode) |
Gives the name of the standard at given index of standard list. | |
U_CAPI const char *U_EXPORT2 | ucnv_getStandardName (const char *name, const char *standard, UErrorCode *pErrorCode) |
Returns a standard name for a given converter name. | |
U_CAPI const char *U_EXPORT2 | ucnv_getDefaultName (void) |
Returns the current default converter name. | |
U_CAPI void U_EXPORT2 | ucnv_setDefaultName (const char *name) |
Sets the current default converter name. | |
U_CAPI void U_EXPORT2 | ucnv_fixFileSeparator (const UConverter *cnv, UChar *source, int32_t sourceLen) |
Fixes the backslash character mismapping. | |
U_CAPI UBool U_EXPORT2 | ucnv_isAmbiguous (const UConverter *cnv) |
Determines if the converter contains ambiguous mappings of the same character or not. | |
U_CAPI void U_EXPORT2 | ucnv_setFallback (UConverter *cnv, UBool usesFallback) |
Sets the converter to use fallback mapping or not. | |
U_CAPI UBool U_EXPORT2 | ucnv_usesFallback (const UConverter *cnv) |
Determines if the converter uses fallback mappings or not. | |
U_CAPI const char *U_EXPORT2 | ucnv_detectUnicodeSignature (const char *source, int32_t sourceLength, int32_t *signatureLength, UErrorCode *pErrorCode) |
Detects Unicode signatures in the given byte stream. |
|
Converter option for specifying a locale.
|
|
Character that separates converter names from options and options from each other.
|
|
String version of UCNV_OPTION_SEP_CHAR.
|
|
Character that separates a converter option from its value.
|
|
String version of UCNV_VALUE_SEP_CHAR.
|
|
Enum for specifying which platform a converter ID refers to. The use of platform/CCSID is not recommended. See ucnv_openCCSID().
|
|
Enum for specifying basic types of converters.
|
|
Deletes the unicode converter and releases resources associated with just this instance. Does not free up shared converter tables.
|
|
Do a fuzzy compare of two converter/alias names. The comparison is case-insensitive. It also ignores the characters '-', '_', and ' ' (dash, underscore, and space). Thus the strings "UTF-8", "utf_8", and "Utf 8" are exactly equivalent.
|
|
Will convert a sequence of bytes from one codepage to another. This is NOT AN EFFICIENT way to transcode. Use {ucnv_toUnicode} and {ucnv_fromUnicode} for efficiency.
|
|
Gives the number of aliases for a given converter or alias name. Note that additional aliases are recognized by ucnv_open(). This method only enumerates the listed entries in the alias file.
|
|
Returns the number of available converters, as per the alias file.
|
|
Gives the number of standards associated to converter names.
|
|
Detects Unicode signatures in the given byte stream. The signature bytes are not consumed; instead, the number of bytes that make up the signature is returned. The conversion APIs do not discard signature bytes, so if the caller wishes to discard them, the caller should explicitly add code to do that after calling this function. Usage: UErrorCode err = U_ZERO_ERROR; char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' }; char* source = input; int32_t signatureLength = 0; const char* encoding = ucnv_detectUnicodeSignature(source,sizeof(input),&signatureLength,&err); UConverter* conv = NULL; if(encoding!=NULL && U_SUCCESS(err)){ // should signature be discarded ? if (discardSignature){ source += signatureLength; } conv = ucnv_open(encoding, &err); .... do the conversion .... }
|
|
Fixes the backslash character mismapping. For example, in SJIS, the backslash character in the ASCII portion is also used to represent the yen currency sign. When mapping from Unicode character 0x005C, it's unclear whether to map the character back to yen or backslash in SJIS. This function will take the input buffer and replace all the yen sign characters with backslash. This is necessary when the user tries to open a file with the input buffer on Windows. This function will test the converter to see whether such mapping is required. You can sometimes avoid using this function by using the correct version of Shift-JIS.
|
|
Frees up memory occupied by unused, cached converter shared data.
|
|
Convert the Unicode string into a codepage string using an existing UConverter. The output string is NUL-terminated if possible. This function is a more convenient but less powerful version of ucnv_fromUnicode(). It is only useful for whole strings, not for streaming conversion. The maximum output buffer capacity required (barring output from callbacks) will be srcLength*ucnv_getMaxCharSize(cnv).
|
|
Converts an array of Unicode characters to an array of codepage characters. This function is optimized for converting a continuous stream of data in buffer-sized chunks, where the entire source and target do not fit in the available buffers. The source pointer is an in/out parameter. It starts out pointing where the conversion is to begin, and ends up pointing after the last UChar consumed. Target similarly starts out pointing at the first available byte in the output buffer, and ends up pointing after the last byte written to the output. The converter always attempts to consume the entire source buffer, unless (1.) the target buffer is full, or (2.) a failing error is returned from the current callback function. When a successful error status has been returned, it means that all of the source buffer has been consumed. At that point, the caller should reset the source and sourceLimit pointers to point to the next chunk.
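This is a stateful conversion. Additionally, even when all source data has been consumed, some data may be in the converter's internal state. Call this function repeatedly, updating the target pointers with the next empty chunk of target in case of a U_BUFFER_OVERFLOW_ERROR, and updating the source pointers with the next chunk of source when a successful error status is returned, until there are no more chunks of source data.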
|
|
Gives the name of the alias at given index of alias list. Note that additional aliases are recognized by ucnv_open(). This method only enumerates the listed entries in the alias file.
|
|
Fill-up the list of alias names for the given alias. Note that additional aliases are recognized by ucnv_open(). This method only enumerates the listed entries in the alias file.
|
|
Gets the name of the specified converter from a list of all converters contained in the alias file.
|
|
Gets a codepage number associated with the converter.
This is not guaranteed to be the one used to create the converter. Some converters do not represent platform registered codepages and return zero for the codepage number. The error code fill-in parameter indicates if the codepage number is available. Does not check if the converter is NULL or if the converter's data table is NULL. Important: The use of CCSIDs is not recommended because it is limited to only two platforms in principle and only one (UCNV_IBM) in the current ICU converter API. Also, CCSIDs are insufficient to identify IBM Unicode conversion tables precisely. For more details see ucnv_openCCSID().
|
|
Returns the current default converter name.
|
|
Returns the display name of the converter passed in based on the Locale passed in. If the locale contains no display name, the internal ASCII name will be filled in.
|
|
Gets the current callback function used by the converter when illegal or invalid Unicode sequence is found. Context pointers are always owned by the caller.
|
|
Fills in the output parameter, errBytes, with the error characters from the last failing conversion.
|
|
Fills in the output parameter, errChars, with the error characters from the last failing conversion.
|
|
Returns the maximum length of bytes used by a character. This varies between 1 and 4.
|
|
Returns the minimum byte length for characters in this codepage. This is either 1 or 2 for all supported codepages.
|
|
Gets the internal, canonical name of the converter (zero-terminated). The lifetime of the returned string will be that of the converter passed to this function.
|
|
Will convert a codepage buffer into unicode one character at a time. This function was written to be efficient when transcoding small amounts of data at a time. In that case it will be more efficient than {ucnv_toUnicode}. When converting large buffers use {ucnv_toUnicode}.
Handling of surrogate pairs and supplementary-plane code points:
|
|
Gets a codepage platform associated with the converter.
Currently, only UCNV_IBM will be returned.
|
|
Gives the name of the standard at given index of standard list.
|
|
Returns a standard name for a given converter name.
|
|
Gets the "starter" (lead) bytes for converters of type MBCS.
Will fill in a U_ILLEGAL_ARGUMENT_ERROR if the converter passed in is not of type MBCS.
|
|
Fills in the output parameter, subChars, with the substitution characters as multiple bytes.
|
|
Gets the current callback function used by the converter when an illegal or invalid codepage sequence is found. Context pointers are always owned by the caller.
|
|
Gets the type of the converter e.g. SBCS, MBCS, DBCS, UTF8, UTF16_BE, UTF16_LE, ISO_2022, EBCDIC_STATEFUL, LATIN_1
|
|
Determines if the converter contains ambiguous mappings of the same character or not.
|
|
Creates a UConverter object with the names specified as a C string.
The actual name will be resolved with the alias file using a case-insensitive string comparison that ignores the delimiters '-', '_', and ' ' (dash, underscore, and space). E.g., the names "UTF8", "utf-8", and "Utf 8" are all equivalent. If NULL is passed for the converter name, it will create one with the ucnv_getDefaultName() return value. A converter name for ICU 1.5 and above may contain options like a locale specification to control the specific behavior of the newly instantiated converter. The meaning of the options depends on the particular converter. If an option is not defined for or recognized by a given converter, then it is ignored.
Options are appended to the converter name string, with a UCNV_OPTION_SEP_CHAR between the name and the first option and also between adjacent options.
|
|
Creates a UConverter object from a CCSID number and platform pair. Note that the usefulness of this function is limited to platforms with numeric encoding IDs. Only IBM and Microsoft platforms use numeric (16-bit) identifiers for encodings. In addition, IBM CCSIDs and Unicode conversion tables are not 1:1 related. For many IBM CCSIDs there are multiple (up to six) Unicode conversion tables, and for some Unicode conversion tables there are multiple CCSIDs. Some "alternate" Unicode conversion tables are provided by the IBM CDRA conversion table registry. The most prominent example of a systematic modification of conversion tables that is not provided in the form of conversion table files in the repository is that S/390 Unix System Services swaps the codes for Line Feed and New Line in all EBCDIC codepages, which requires such a swap in the Unicode conversion tables as well. Only IBM default conversion tables are accessible with ucnv_openCCSID(). ucnv_getCCSID() will return the same CCSID for all conversion tables that are associated with that CCSID. Currently, the only "platform" supported in the ICU converter API is UCNV_IBM. In summary, the use of CCSIDs and the associated API functions is not recommended. In order to open a converter with the default IBM CDRA Unicode conversion table, you can use this function or use the prefix "ibm-": char name[20]; sprintf(name, "ibm-%hu", ccsid); cnv=ucnv_open(name, &errorCode); In order to open a converter with the IBM S/390 Unix System Services variant of a Unicode/EBCDIC conversion table, you can use the prefix "ibm-" together with the suffix "-s390": char name[20]; sprintf(name, "ibm-%hu-s390", ccsid); cnv=ucnv_open(name, &errorCode); In order to open a converter from a Microsoft codepage number, use the prefix "cp": char name[20]; sprintf(name, "cp%hu", codepageID); cnv=ucnv_open(name, &errorCode);
|
|
Creates a Unicode converter with the names specified as a Unicode string.
The name should be limited to the ASCII-7 alphanumerics range. The actual name will be resolved with the alias file using a case-insensitive string comparison that ignores the delimiters '-', '_', and ' ' (dash, underscore, and space). E.g., the names "UTF8", "utf-8", and "Utf 8" are all equivalent. If NULL is passed for the converter name, it will create one with the ucnv_getDefaultName() return value.
|
|
Resets the state of a converter to the default state. This is used in the case of an error, to restart a conversion from a known default state. It will also empty the internal output buffers.
|
|
Resets the from-Unicode part of a converter state to the default state. This is used in the case of an error to restart a conversion from Unicode to a known default state. It will also empty the internal output buffers used for the conversion from Unicode codepoints.
|
|
Resets the to-Unicode part of a converter state to the default state. This is used in the case of an error to restart a conversion to Unicode to a known default state. It will also empty the internal output buffers used for the conversion to Unicode codepoints.
|
|
Thread safe cloning operation.
|
|
Sets the current default converter name. Caller must own the storage for 'name' and preserve it indefinitely.
|
|
Sets the converter to use fallback mapping or not.
|
|
Changes the current callback function used by the converter when an illegal or invalid sequence is found. Context pointers are always owned by the caller.
|
|
Sets the substitution chars when converting from unicode to a codepage.
The substitution is specified as a string of 1-4 bytes, and may contain NUL bytes.
|
|
Changes the callback function used by the converter when an illegal or invalid sequence is found. Context pointers are always owned by the caller.
|
|
Convert the codepage string into a Unicode string using an existing UConverter. The output string is NUL-terminated if possible. This function is a more convenient but less powerful version of ucnv_toUnicode(). It is only useful for whole strings, not for streaming conversion. The maximum output buffer capacity required (barring output from callbacks) will be 2*srcLength (each char may be converted into a surrogate pair).
|
|
Converts a buffer of codepage bytes into an array of Unicode UChars. This function is optimized for converting a continuous stream of data in buffer-sized chunks, where the entire source and target do not fit in the available buffers. The source pointer is an in/out parameter. It starts out pointing where the conversion is to begin, and ends up pointing after the last byte of source consumed. Target similarly starts out pointing at the first available UChar in the output buffer, and ends up pointing after the last UChar written to the output. It does NOT necessarily keep UChar sequences together. The converter always attempts to consume the entire source buffer, unless (1.) the target buffer is full, or (2.) a failing error is returned from the current callback function. When a successful error status has been returned, it means that all of the source buffer has been consumed. At that point, the caller should reset the source and sourceLimit pointers to point to the next chunk.
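This is a stateful conversion. Additionally, even when all source data has been consumed, some data may be in the converter's internal state. Call this function repeatedly, updating the target pointers with the next empty chunk of target in case of a U_BUFFER_OVERFLOW_ERROR, and updating the source pointers with the next chunk of source when a successful error status is returned, until there are no more chunks of source data.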
|
|
Determines if the converter uses fallback mappings or not.
|
|
Function pointer for error callback in the codepage to unicode direction. Called when an error has occurred in conversion to unicode, or on open/close of the callback (see reason).
|