unichr(), uniord(), RFC 3629

さらに改良。

  • RFC 3629 および Unicode 3.2 に準拠し、UTF-16 の範囲に対応。
  • unichr() にサロゲート領域の値 (0xD800〜0xDFFF) を与えると false を返す。
  • uniord() は (先頭の 1〜4 バイトに限り) UTF-8 としての妥当性をチェックする。

調査の過程で知ったのですが、RFC 2279 では UCS-4 の全範囲 (最大 6 バイト) が UTF-8 に対応づけられていたのに対し、RFC 3629 では UTF-16 で表現できる範囲 (U+0000〜U+10FFFF、最大 4 バイト) に制限されているのですね。
RFC 2279 は RFC 3629 によって廃止・置換されているので PEAR::I18N_UnicodeString のように UCS-4 をフルサポートする必要はないと判断しました。

<?xml version="1.0" ?>
<extension name="unichr" version="0.0.2">
<summary>Unicode character PHP extension</summary>
<description><![CDATA[
	Converts between UTF-8 strings and Unicode code points.
	This extension conforms to RFC 3629 - UTF-8, a transformation format of ISO 10646.
]]></description>
<function name="unichr">
	<proto>string unichr(int codepoint)</proto>
	<description><![CDATA[
		Get the UTF-8 encoded string which corresponds to the given code point.
		Returns false if the code point is negative, lies in the surrogate
		range (0xD800-0xDFFF), or is greater than 0x10FFFF.
	]]></description>
	<code><![CDATA[
/* Locals are declared before the first statement. */
unsigned char buf[5] = { 0 };	/* room for 4 encoded bytes; zero-filled */
int len = 0;			/* number of bytes written to buf */

/* Reject anything that is not a Unicode scalar value (RFC 3629, section 3). */
if (codepoint < 0 || (codepoint > 0xD7FF && codepoint < 0xE000) || codepoint > 0x10FFFF) {
	RETURN_FALSE;
}

if (codepoint < 0x80) {
	/* UTF8-1, US-ASCII: 0xxxxxxx */
	buf[0] = (unsigned char)codepoint;
	len = 1;
} else if (codepoint < 0x800) {
	/* UTF8-2: 110xxxxx 10xxxxxx */
	buf[0] = (unsigned char)(0xC0 | (codepoint >> 6));
	buf[1] = (unsigned char)(0x80 | (codepoint & 0x3F));
	len = 2;
} else if (codepoint < 0x10000) {
	/* UTF8-3: 1110xxxx 10xxxxxx 10xxxxxx */
	buf[0] = (unsigned char)(0xE0 | (codepoint >> 12));
	buf[1] = (unsigned char)(0x80 | ((codepoint >> 6) & 0x3F));
	buf[2] = (unsigned char)(0x80 | (codepoint & 0x3F));
	len = 3;
} else {
	/*
	 * UTF8-4: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	 * Code points U+10000..U+10FFFF, i.e. those that UTF-16 would
	 * represent with a surrogate pair.
	 */
	buf[0] = (unsigned char)(0xF0 | (codepoint >> 18));
	buf[1] = (unsigned char)(0x80 | ((codepoint >> 12) & 0x3F));
	buf[2] = (unsigned char)(0x80 | ((codepoint >> 6) & 0x3F));
	buf[3] = (unsigned char)(0x80 | (codepoint & 0x3F));
	len = 4;
}

/* The length is passed explicitly; the third argument asks for a copy of buf. */
RETURN_STRINGL((char *)buf, len, 1);
	]]></code>
</function>
<function name="uniord">
	<proto>int uniord(string str)</proto>
	<description><![CDATA[
		Get the code point which corresponds to the given UTF-8 encoded string.
		Only the first character (the leading 1 to 4 bytes) is decoded.
		Returns false if the string is empty or if those leading bytes are
		not a well-formed UTF-8 sequence as defined by RFC 3629.
	]]></description>
	<code><![CDATA[
/* Locals are declared before the first statement. */
unsigned char flags[5] = { 0 };	/* leading bytes of str being inspected */
unsigned char mask = 0;		/* payload bits of the lead byte */
int len = 0;			/* length of the sequence in bytes */
int pos = 0;
long codepoint = 0;

if (str_len == 0) {
	RETURN_FALSE;
}
flags[0] = (unsigned char)str[0];

if (flags[0] < 0x80) {
	/* UTF8-1, US-ASCII */
	mask = 0x7F;
	len = 1;
} else if (flags[0] > 0xC1 && flags[0] < 0xE0) {
	/* UTF8-2: lead byte C2..DF (C0 and C1 would be overlong forms) */
	if (str_len < 2) {
		RETURN_FALSE;
	}
	flags[1] = (unsigned char)str[1];
	if (flags[1] < 0x80 || flags[1] > 0xBF) {
		RETURN_FALSE;
	}
	mask = 0x1F;
	len = 2;
} else if (flags[0] > 0xDF && flags[0] < 0xF0) {
	/* UTF8-3: lead byte E0..EF */
	if (str_len < 3) {
		RETURN_FALSE;
	}
	flags[1] = (unsigned char)str[1];
	flags[2] = (unsigned char)str[2];
	if (flags[0] == 0xE0) {
		/* E0 must be followed by A0..BF, otherwise the form is overlong */
		if (flags[1] < 0xA0 || flags[1] > 0xBF) {
			RETURN_FALSE;
		}
	} else if (flags[0] == 0xED) {
		/* ED must be followed by 80..9F; A0..BF would encode U+D800..U+DFFF */
		if (flags[1] < 0x80 || flags[1] > 0x9F) {
			RETURN_FALSE;
		}
	} else {
		if (flags[1] < 0x80 || flags[1] > 0xBF) {
			RETURN_FALSE;
		}
	}
	if (flags[2] < 0x80 || flags[2] > 0xBF) {
		RETURN_FALSE;
	}
	mask = 0xF;
	len = 3;
} else if (flags[0] > 0xEF && flags[0] < 0xF5) {
	/* UTF8-4: lead byte F0..F4, code points U+10000..U+10FFFF */
	if (str_len < 4) {
		RETURN_FALSE;
	}
	flags[1] = (unsigned char)str[1];
	flags[2] = (unsigned char)str[2];
	flags[3] = (unsigned char)str[3];
	if (flags[0] == 0xF0) {
		/* F0 must be followed by 90..BF, otherwise the form is overlong */
		if (flags[1] < 0x90 || flags[1] > 0xBF) {
			RETURN_FALSE;
		}
	} else if (flags[0] == 0xF4) {
		/* F4 must be followed by 80..8F to stay at or below U+10FFFF */
		if (flags[1] < 0x80 || flags[1] > 0x8F) {
			RETURN_FALSE;
		}
	} else {
		if (flags[1] < 0x80 || flags[1] > 0xBF) {
			RETURN_FALSE;
		}
	}
	if (flags[2] < 0x80 || flags[2] > 0xBF || flags[3] < 0x80 || flags[3] > 0xBF) {
		RETURN_FALSE;
	}
	mask = 0x7;
	len = 4;
} else {
	/* 80..C1 and F5..FF can never start a well-formed sequence */
	RETURN_FALSE;
}

/* Lead byte supplies the top bits; each continuation byte adds 6 more. */
codepoint = ((long)(flags[0] & mask)) << (6 * (len - 1));
for (pos = 1; pos < len; pos++) {
	codepoint |= ((long)(flags[pos] & 0x3F)) << (6 * (len - pos - 1));
}
RETURN_LONG(codepoint);
	]]></code>
</function>
</extension>