一种对汉字更环保的 Unicode 编码方案的实现。
/*
Interface:
utf*_encode converts a UTF-8/UTF-16 string (is is its length) to the UTF-C format.
os is the size of the output buffer; if the converted result does not fit in the
buffer, -1 is returned; otherwise the converted length is returned.
utf*_decode converts a UTF-C string to the UTF-8 or UTF-16 format.
is/os and the return value have the same meaning as above.
*/
/* UTF-8 -> UTF-C. is counts input bytes; the result counts UTF-C bytes. */
int utf8_encode(const char *utf8,size_t is,unsigned char *utfc,size_t os);
/* UTF-C -> UTF-8. is counts input bytes; the result counts UTF-8 bytes. */
int utf8_decode(const unsigned char *utfc,size_t is,char *utf8,size_t os);
/* UTF-16 -> UTF-C. is counts wchar_t elements; the result counts UTF-C bytes. */
int utf16_encode(const wchar_t *utf16,size_t is,unsigned char *utfc,size_t os);
/* UTF-C -> UTF-16. is counts input bytes; the result counts wchar_t elements. */
int utf16_decode(const unsigned char *utfc,size_t is,wchar_t *utf16,size_t os);
#include <stddef.h>
#include <wchar.h>
/*
 * Convert UTF-8 text to UTF-C.
 *
 *   utf8  input bytes (no NUL terminator is required or looked for)
 *   is    number of input bytes
 *   utfc  output buffer
 *   os    capacity of the output buffer, in bytes
 *
 * Returns the number of bytes written to utfc, or -1 when the buffer is
 * too small.  Empty input yields 0.
 *
 * Bytes below 0x80 are copied unchanged.  Every other character is reduced
 * to a 16-bit value (high byte, low byte): when the high byte is in
 * 0x20..0x9f and is not 0x3f, it is stored as two bytes (high + 0x60, low);
 * otherwise it is stored as three bytes (0x9f, high, low).
 *
 * Only the low 16 bits of a decoded code point are kept, so characters
 * above U+FFFF are not represented faithfully.
 */
int
utf8_encode(const char *utf8,size_t is,unsigned char *utfc,size_t os)
{
	const unsigned char *input=(const unsigned char *)utf8;
	unsigned char *output=utfc;

	/* Skip continuation bytes left over from a character that started
	   before this buffer.  The length is tested first so that an empty
	   buffer is never dereferenced. */
	while (is!=0 && (*input & 0xc0) == 0x80) {
		++input;
		--is;
	}
	if (is==0) {
		return 0;
	}
	do {
		if (os==0) {
			return -1;
		}
		if (*input < 128) {
			*output++ = *input++;
			--is;
			--os;
		}
		else {
			unsigned char b=*input++;
			/* Payload mask of the lead byte: 0x1f when bit 5 is clear
			   (two-byte form), 0x0f otherwise.  The accumulator is
			   unsigned so that an over-long run of continuation bytes
			   wraps instead of overflowing a signed int. */
			unsigned c=(unsigned)(b&(0x0f | (~(b>>1) &0x1f)));
			unsigned char hi;
			unsigned char low;
			--is;
			while (is!=0 && (*input & 0xc0)==0x80) {
				c=c<<6 | (unsigned)(*input & 0x3f);
				++input;
				--is;
			}
			hi = (unsigned char)(c >> 8);
			low = (unsigned char)(c&0xff);
			if (hi>=0x20 && hi<=0x9f && hi!=0x3f) {
				if (os<2) {
					return -1;
				}
				/* 0x3f is excluded because 0x3f + 0x60 would collide
				   with the 0x9f escape byte. */
				*output++ = (unsigned char)(hi + 0x60);
				*output++ = low;
				os-=2;
			}
			else {
				if (os<3) {
					return -1;
				}
				*output++ = 0x9f;
				*output++ = hi;
				*output++ = low;
				os-=3;
			}
		}
	} while (is!=0);
	return (int)(output-utfc);
}
/*
 * Convert UTF-C text to UTF-8.
 *
 *   utfc  input bytes in UTF-C format
 *   is    number of input bytes
 *   utf8  output buffer
 *   os    capacity of the output buffer, in bytes
 *
 * Returns the number of bytes written to utf8, or -1 when the buffer is
 * too small.  Empty input yields 0.
 *
 * Input bytes below 0x80 are copied unchanged.  A 0x9f byte introduces a
 * three-byte unit (0x9f, high, low); any other byte >= 0x80 starts a
 * two-byte unit (high + 0x60, low).  A unit cut short by the end of the
 * input decodes to 0xffff and ends the conversion.
 */
int
utf8_decode(const unsigned char *utfc,size_t is,char *utf8,size_t os)
{
	const unsigned char *input=utfc;
	unsigned char *output=(unsigned char *)utf8;
	if (is==0) {
		return 0;
	}
	do {
		if (os==0) {
			return -1;
		}
		if (*input < 128) {
			*output++ = *input++;
			--is;
			--os;
		}
		else {
			int c;
			if (*input==0x9f) {
				if (is<3) {
					c=0xffff;
					is=0;
				}
				else {
					c=input[1]<<8 | input[2];
					is-=3;
					input+=3;
				}
			}
			else {
				if (is<2) {
					c=0xffff;
					is=0;
				}
				else {
					c=(input[0] - 0x60)<<8 | input[1];
					is-=2;
					input+=2;
				}
			}
			if (c<0x80) {
				/* An escaped unit may carry a value below 0x80; write
				   it as one byte, because a two-byte form would be an
				   overlong (invalid) UTF-8 sequence.  os >= 1 is
				   guaranteed by the check at the top of the loop. */
				*output++ =(unsigned char)c;
				--os;
			}
			else if (c<=0x7ff) {
				if (os<2) {
					return -1;
				}
				*output++ =(unsigned char)((c>>6) | 0xc0);
				*output++ =(unsigned char)((c&0x3f) | 0x80);
				os-=2;
			}
			else {
				if (os<3) {
					return -1;
				}
				*output++ =(unsigned char)((c>>12) | 0xe0);
				*output++ =(unsigned char)(((c>>6) & 0x3f) | 0x80);
				*output++ =(unsigned char)((c&0x3f) | 0x80);
				os-=3;
			}
		}
	} while (is!=0);
	return (int)(output-(unsigned char *)utf8);
}
/*
 * Convert a sequence of 16-bit units held in wchar_t elements to UTF-C.
 *
 *   utf16  input units
 *   is     number of input elements
 *   utfc   output buffer
 *   os     capacity of the output buffer, in bytes
 *
 * Returns the number of bytes written to utfc, or -1 when the buffer is
 * too small.  Empty input yields 0.
 *
 * Units below 128 become a single byte.  Otherwise the unit is split into
 * a high and a low byte: a high byte in 0x20..0x9f other than 0x3f gives
 * the two-byte form (high + 0x60, low); everything else gives the
 * three-byte form (0x9f, high, low).
 */
int
utf16_encode(const wchar_t *utf16,size_t is,unsigned char *utfc,size_t os)
{
	unsigned char *dst=utfc;
	size_t i;

	for (i=0; i<is; ++i) {
		unsigned unit=(unsigned)utf16[i];
		int value;
		unsigned char top;
		unsigned char bottom;
		size_t need;

		if (os==0) {
			return -1;
		}
		if (unit<128) {
			*dst++ = (unsigned char)utf16[i];
			--os;
			continue;
		}

		value=unit;
		top=(unsigned char)(value>>8);
		bottom=(unsigned char)(value&0xff);

		/* The short form is available unless the high byte falls outside
		   0x20..0x9f or equals 0x3f. */
		need = (top<0x20 || top>0x9f || top==0x3f) ? 3 : 2;
		if (os<need) {
			return -1;
		}
		if (need==2) {
			*dst++ = (unsigned char)(top + 0x60);
		}
		else {
			*dst++ = 0x9f;
			*dst++ = top;
		}
		*dst++ = bottom;
		os-=need;
	}
	return dst-utfc;
}
/*
 * Convert UTF-C text to a sequence of 16-bit units stored in wchar_t.
 *
 *   utfc   input bytes in UTF-C format
 *   is     number of input bytes
 *   utf16  output buffer
 *   os     capacity of the output buffer, in wchar_t elements
 *
 * Returns the number of elements written to utf16, or -1 when the buffer
 * is too small.  Empty input yields 0.
 *
 * Input bytes below 0x80 map to one element each.  A 0x9f byte introduces
 * a three-byte unit (0x9f, high, low); any other byte >= 0x80 starts a
 * two-byte unit (high + 0x60, low).  A unit cut short by the end of the
 * input decodes to 0xffff and ends the conversion.
 */
int
utf16_decode(const unsigned char *utfc,size_t is,wchar_t *utf16,size_t os)
{
	const unsigned char *input=utfc;
	wchar_t *output=utf16;
	if (is==0) {
		return 0;
	}
	do {
		if (os==0) {
			return -1;
		}
		if (*input < 128) {
			*output++ = (wchar_t)*input++;
			--is;
			--os;
		}
		else {
			int c;
			if (*input==0x9f) {
				if (is<3) {
					c=0xffff;
					is=0;
				}
				else {
					c=input[1]<<8 | input[2];
					is-=3;
					input+=3;
				}
			}
			else {
				if (is<2) {
					c=0xffff;
					is=0;
				}
				else {
					c=(input[0] - 0x60)<<8 | input[1];
					is-=2;
					input+=2;
				}
			}
			*output++ =(wchar_t)c;
			/* Account for the element just written; without this the
			   capacity check at the top of the loop never triggers for
			   multi-byte units and the buffer can be overrun. */
			--os;
		}
	} while (is!=0);
	return (int)(output-utf16);
}