Surrogate (Cs) に加え、 Noncharacter のコードポイントが与えられたときも False を返すように修正したバージョンです。
インストール方法はこちら。
<?xml version="1.0" ?>
<extension name="unichr" version="0.0.2">
  <summary>Unicode character PHP extension</summary>
  <description><![CDATA[ Converts between UTF-8 string and Unicode code point. This extension is in accordance with RFC 3629 - UTF-8, a transformation format of ISO 10646. ]]></description>

  <function name="unichr">
    <proto>string unichr(int code)</proto>
    <description><![CDATA[ Get the UTF-8 encoded string which corresponds to the given code point. ]]></description>
    <code><![CDATA[
      /*
       * Encode the integer parameter `code` as a UTF-8 byte sequence and
       * return it as a string.  FALSE is returned instead when `code` is
       * outside 0..0x10FFFF, is a surrogate, or is a noncharacter.
       *
       * Every declaration precedes the first statement so the snippet does
       * not depend on C99 mixed declarations, and the output buffer is
       * unsigned so that bytes >= 0x80 are never produced through an
       * implementation-defined narrowing to a signed char.
       */
      unsigned char utf8[5] = { 0 };   /* at most 4 bytes are written */
      int len = 0;

      /* Out of the Unicode code space. */
      if (code < 0 || code > 0x10FFFF) {
        RETURN_FALSE;
      }
      /* Surrogates (U+D800..U+DFFF). */
      if (code >= 0xD800 && code <= 0xDFFF) {
        RETURN_FALSE;
      }
      /* Noncharacters: U+FDD0..U+FDEF and the last two code points of every
       * plane (low 16 bits equal to 0xFFFE or 0xFFFF). */
      if ((code >= 0xFDD0 && code <= 0xFDEF) || (code & 0xFFFE) == 0xFFFE) {
        RETURN_FALSE;
      }

      if (code < 0x80) {
        /* UTF8-1, US-ASCII */
        utf8[len++] = (unsigned char)code;
      } else if (code < 0x800) {
        /* UTF8-2 */
        utf8[len++] = (unsigned char)(0xC0 | (code >> 6));
        utf8[len++] = (unsigned char)(0x80 | (code & 0x3F));
      } else if (code < 0x10000) {
        /* UTF8-3 */
        utf8[len++] = (unsigned char)(0xE0 | (code >> 12));
        utf8[len++] = (unsigned char)(0x80 | ((code >> 6) & 0x3F));
        utf8[len++] = (unsigned char)(0x80 | (code & 0x3F));
      } else {
        /* UTF8-4 (code points that need surrogate pairs in UTF-16) */
        utf8[len++] = (unsigned char)(0xF0 | (code >> 18));
        utf8[len++] = (unsigned char)(0x80 | ((code >> 12) & 0x3F));
        utf8[len++] = (unsigned char)(0x80 | ((code >> 6) & 0x3F));
        utf8[len++] = (unsigned char)(0x80 | (code & 0x3F));
      }

      /* NOTE(review): the trailing 1 is presumably the PHP 5 "duplicate"
       * flag, which is required here because utf8 is a local buffer. */
      RETURN_STRINGL((char *)utf8, len, 1);
    ]]></code>
  </function>

  <function name="uniord">
    <proto>int uniord(string str)</proto>
    <description><![CDATA[ Get the code point which corresponds to the given UTF-8 encoded string. 
]]></description>
    <code><![CDATA[
      /*
       * Decode the first UTF-8 encoded character of `str` and return its
       * code point.  FALSE is returned when `str` is empty, is shorter than
       * the sequence announced by its lead byte, or does not start with a
       * well-formed sequence (overlong forms, surrogates and values above
       * U+10FFFF are rejected through the per-lead-byte ranges below).
       * Bytes after the first character are ignored.  Noncharacters are
       * not rejected here.
       *
       * `str` and `str_len` are not declared in this snippet; presumably
       * they are supplied by the code generator from the proto.
       *
       * Declarations come first and str[0] is only read after the length
       * check, so nothing is read from an empty string and no non-constant
       * aggregate initializer is needed.
       */
      unsigned char b[4] = { 0 };        /* bytes of the sequence          */
      unsigned char lead_mask = 0;       /* payload bits of the lead byte  */
      unsigned char second_min = 0x80;   /* allowed range of byte 2,       */
      unsigned char second_max = 0xBF;   /* narrowed for E0/ED/F0/F4       */
      int len = 0;
      int pos;
      long code;

      if (str_len == 0) {
        RETURN_FALSE;
      }
      b[0] = (unsigned char)str[0];

      if (b[0] < 0x80) {
        /* UTF8-1, US-ASCII */
        lead_mask = 0x7F;
        len = 1;
      } else if (b[0] > 0xC1 && b[0] < 0xE0) {
        /* UTF8-2 (C0 and C1 would be overlong) */
        if (str_len < 2) {
          RETURN_FALSE;
        }
        b[1] = (unsigned char)str[1];
        if (b[1] < 0x80 || b[1] > 0xBF) {
          RETURN_FALSE;
        }
        lead_mask = 0x1F;
        len = 2;
      } else if (b[0] > 0xDF && b[0] < 0xF0) {
        /* UTF8-3 */
        if (str_len < 3) {
          RETURN_FALSE;
        }
        b[1] = (unsigned char)str[1];
        b[2] = (unsigned char)str[2];
        if (b[0] == 0xE0) {
          second_min = 0xA0;             /* below A0 would be overlong   */
        } else if (b[0] == 0xED) {
          second_max = 0x9F;             /* above 9F would be surrogates */
        }
        if (b[1] < second_min || b[1] > second_max) {
          RETURN_FALSE;
        }
        if (b[2] < 0x80 || b[2] > 0xBF) {
          RETURN_FALSE;
        }
        lead_mask = 0xF;
        len = 3;
      } else if (b[0] > 0xEF && b[0] < 0xF5) {
        /* UTF8-4 (code points that need surrogate pairs in UTF-16) */
        if (str_len < 4) {
          RETURN_FALSE;
        }
        b[1] = (unsigned char)str[1];
        b[2] = (unsigned char)str[2];
        b[3] = (unsigned char)str[3];
        if (b[0] == 0xF0) {
          second_min = 0x90;             /* below 90 would be overlong      */
        } else if (b[0] == 0xF4) {
          second_max = 0x8F;             /* above 8F would exceed U+10FFFF  */
        }
        if (b[1] < second_min || b[1] > second_max) {
          RETURN_FALSE;
        }
        if (b[2] < 0x80 || b[2] > 0xBF || b[3] < 0x80 || b[3] > 0xBF) {
          RETURN_FALSE;
        }
        lead_mask = 0x7;
        len = 4;
      } else {
        /* Continuation byte, C0/C1, or F5..FF as the first byte. */
        RETURN_FALSE;
      }

      /* Payload of the lead byte, then 6 bits from each continuation byte. */
      code = (long)(b[0] & lead_mask);
      for (pos = 1; pos < len; pos++) {
        code = (code << 6) | (long)(b[pos] & 0x3F);
      }

      RETURN_LONG(code);
    ]]></code>
  </function>
</extension>