云风的个人空间 : UTF-8/UTF-16 到 UTF-C 转换代码[UtfC]

首页 :: 索引 :: 修订历史 :: 你好, 3.145.167.58
你的足迹: » UTF-8/UTF-16 到 UTF-C 转换代码
[InterWiki]一种对汉字更环保的 Unicode 编码方案 的实现。

/*
    Interface:
        utf*_encode converts a UTF-8 / UTF-16 string (`is` is its length)
        into UTF-C format.
        `os` is the size of the output buffer. If the converted result does
        not fit into the buffer, -1 is returned; otherwise the length of the
        converted result is returned.

        utf*_decode converts a UTF-C string into UTF-8 or UTF-16 format.
        `is` / `os` and the return value have the same meaning as above.

        Lengths are counted in elements of the corresponding array: bytes
        for the char / unsigned char buffers, wchar_t units for the UTF-16
        buffers. When -1 is returned, part of the output buffer may already
        have been written.
 */

int utf8_encode(const char *utf8,size_t is,unsigned char *utfc,size_t os);
int utf8_decode(const unsigned char *utfc,size_t is,char *utf8,size_t os);
int utf16_encode(const wchar_t *utf16,size_t is,unsigned char *utfc,size_t os);
int utf16_decode(const unsigned char *utfc,size_t is,wchar_t *utf16,size_t os);


#include <stddef.h>
#include <wchar.h>
 
/*
 * Convert a UTF-8 byte string into UTF-C.
 *
 *   utf8  input bytes (need not be NUL-terminated)
 *   is    number of input bytes
 *   utfc  output buffer
 *   os    capacity of the output buffer in bytes
 *
 * Returns the number of bytes written to utfc, or -1 if the output buffer
 * is too small (utfc may then hold a partial result).
 *
 * Output format, per decoded 16-bit value with high byte `hi` and low
 * byte `low`:
 *   - bytes below 0x80 are copied unchanged;
 *   - hi in 0x20..0x9f except 0x3f  ->  2 bytes: hi+0x60, low;
 *   - everything else               ->  3 bytes: 0x9f, hi, low.
 * hi == 0x3f is excluded from the 2-byte form because 0x3f+0x60 would
 * collide with the 0x9f escape byte.
 *
 * Only the low 16 bits of a decoded value are emitted, so sequences that
 * decode to something above 0xffff are truncated.
 */
int
utf8_encode(const char *utf8,size_t is,unsigned char *utfc,size_t os)
{
	const unsigned char *input=(const unsigned char *)utf8;
	unsigned char *output=utfc;

	/* Skip stray continuation bytes at the start. The length is tested
	   first so that no byte beyond the input is ever read. */
	while (is!=0 && (*input & 0xc0) == 0x80) {
		++input;
		--is;
	}

	if (is==0) {
		return 0;
	}

	do {
		if (os==0) {
			return -1;
		}
		if (*input < 128) {
			*output++ = *input++;
			--is;
			--os;
		}
		else {
			unsigned char b=*input++;
			/* Payload mask of the lead byte: 0x1f when bit 5 is clear
			   (110xxxxx), 0x0f otherwise (1110xxxx and longer forms).
			   Unsigned arithmetic keeps the shifts below well defined
			   even for an over-long run of continuation bytes. */
			unsigned int c=(unsigned int)(b&(0x0f | (~(b>>1) &0x1f)));
			--is;
			while (is!=0 && (*input & 0xc0)==0x80) {
				c=c<<6 | (unsigned int)(*input & 0x3f);
				++input;
				--is;
			}

			unsigned char hi = (unsigned char)(c >> 8);
			unsigned char low = (unsigned char)(c&0xff);
			if (hi>=0x20 && hi<=0x9f && hi!=0x3f) {
				/* compact 2-byte form */
				if (os<2) {
					return -1;
				}
				*output++ = (unsigned char)(hi + 0x60);
				*output++ = low;
				os-=2;
			}
			else {
				/* escaped 3-byte form */
				if (os<3) {
					return -1;
				}
				*output++ = 0x9f;
				*output++ = hi;
				*output++ = low;
				os-=3;
			}
		}
	} while (is!=0);

	return (int)(output-utfc);
}
 
/*
 * Convert a UTF-C byte string into UTF-8.
 *
 *   utfc  input bytes in UTF-C format
 *   is    number of input bytes
 *   utf8  output buffer
 *   os    capacity of the output buffer in bytes
 *
 * Returns the number of bytes written to utf8, or -1 if the output buffer
 * is too small (utf8 may then hold a partial result).
 *
 * A multi-byte unit that is cut off by the end of the input is decoded
 * as 0xffff and ends the conversion.
 */
int
utf8_decode(const unsigned char *utfc,size_t is,char *utf8,size_t os)
{
	const unsigned char *src=utfc;
	unsigned char *dst=(unsigned char *)utf8;

	if (is==0) {
		return 0;
	}

	while (is!=0) {
		int code;
		size_t unit;

		if (os==0) {
			return -1;
		}

		/* Plain ASCII passes straight through. */
		if (*src < 128) {
			*dst++ = *src++;
			--is;
			--os;
			continue;
		}

		/* 0x9f introduces the 3-byte escaped form; any other byte with
		   the top bit set starts the 2-byte compact form. */
		unit = (*src==0x9f) ? 3 : 2;
		if (is<unit) {
			code=0xffff;
			is=0;
		}
		else {
			if (unit==3) {
				code=src[1]<<8 | src[2];
			}
			else {
				code=(src[0] - 0x60)<<8 | src[1];
			}
			src+=unit;
			is-=unit;
		}

		/* Emit the lead byte(s); the trailing continuation byte is the
		   same for both output lengths and is written afterwards. */
		if (code>0x7ff) {
			if (os<3) {
				return -1;
			}
			*dst++ =(unsigned char)((code>>12) | 0xe0);
			*dst++ =(unsigned char)(((code>>6) & 0x3f) | 0x80);
			os-=3;
		}
		else {
			if (os<2) {
				return -1;
			}
			*dst++ =(unsigned char)((code>>6) | 0xc0);
			os-=2;
		}
		*dst++ =(unsigned char)((code&0x3f) | 0x80);
	}

	return (int)(dst-(unsigned char *)utf8);
}
 
/*
 * Convert a string of 16-bit code units held in wchar_t into UTF-C.
 *
 *   utf16  input units
 *   is     number of input units
 *   utfc   output buffer
 *   os     capacity of the output buffer in bytes
 *
 * Returns the number of bytes written to utfc, or -1 if the output buffer
 * is too small (utfc may then hold a partial result).
 *
 * Units below 128 are copied as a single byte. For any other unit only
 * bits 0..15 are used: it becomes the 2-byte form (high byte + 0x60, low
 * byte) when the high byte lies in 0x20..0x9f and is not 0x3f, and the
 * 3-byte form (0x9f, high byte, low byte) otherwise.
 */
int
utf16_encode(const wchar_t *utf16,size_t is,unsigned char *utfc,size_t os)
{
	unsigned char *dst=utfc;
	size_t i;

	if (is==0) {
		return 0;
	}

	for (i=0; i<is; ++i) {
		if (os==0) {
			return -1;
		}

		if ((unsigned)utf16[i] < 128) {
			*dst++ = (unsigned char)utf16[i];
			--os;
			continue;
		}

		int unit=(unsigned)utf16[i];
		unsigned char high = (unsigned char)(unit >> 8);
		unsigned char low = (unsigned char)(unit&0xff);

		if (high<0x20 || high>0x9f || high==0x3f) {
			/* Outside the compact range (or clashing with the 0x9f
			   marker): use the escaped 3-byte form. */
			if (os<3) {
				return -1;
			}
			dst[0] = 0x9f;
			dst[1] = high;
			dst[2] = low;
			dst+=3;
			os-=3;
		}
		else {
			if (os<2) {
				return -1;
			}
			dst[0] = (unsigned char)(high + 0x60);
			dst[1] = low;
			dst+=2;
			os-=2;
		}
	}

	return (int)(dst-utfc);
}
 
/*
 * Convert a UTF-C byte string into 16-bit code units stored in wchar_t.
 *
 *   utfc   input bytes in UTF-C format
 *   is     number of input bytes
 *   utf16  output buffer
 *   os     capacity of the output buffer in wchar_t units
 *
 * Returns the number of units written to utf16, or -1 if the output
 * buffer is too small (utf16 may then hold a partial result).
 *
 * A multi-byte unit that is cut off by the end of the input is decoded
 * as 0xffff and ends the conversion.
 */
int
utf16_decode(const unsigned char *utfc,size_t is,wchar_t *utf16,size_t os)
{
	const unsigned char *input=utfc;
	wchar_t *output=utf16;

	if (is==0) {
		return 0;
	}

	do {
		/* Every iteration writes exactly one unit, so one free slot is
		   all that has to be checked. */
		if (os==0) {
			return -1;
		}
		if (*input < 128) {
			*output++ = (wchar_t)*input++;
			--is;
			--os;
		}
		else {
			int c;
			if (*input==0x9f) {
				/* escaped 3-byte form: 0x9f, high byte, low byte */
				if (is<3) {
					c=0xffff;
					is=0;
				}
				else {
					c=input[1]<<8 | input[2];
					is-=3;
					input+=3;
				}
			}
			else {
				/* compact 2-byte form: high byte + 0x60, low byte */
				if (is<2) {
					c=0xffff;
					is=0;
				}
				else {
					c=(input[0] - 0x60)<<8 | input[1];
					is-=2;
					input+=2;
				}
			}
			*output++ =(wchar_t)c;
			/* Account for the unit just written; without this the
			   capacity check above never triggers for non-ASCII
			   input and the buffer can be overrun. */
			--os;
		}
	} while (is!=0);

	return (int)(output-utf16);
}