PHP の正規表現では、コールバック関数を用いた置換 (preg_replace_callback) の際に名前付き捕獲式集合が無視されます。以前からこれは困ったことだと思っていました。
ちょっと複雑な正規表現を扱うときはコールバック関数側でのマッピングが面倒だし、さらにその正規表現を改変して捕獲式集合を増やしたときなどはコールバック関数の修正も求められ、保守が困難になるからです。
しかし以前に PHP のソースコード (ext/pcre/php_pcre.c) に手を加えてコールバック関数でも名前付き捕獲 - マッチした部分を連想配列として受け取る - ができるようにしようと思ったときは (今でも大概だけど、今より) C が読めなかったのでギブアップした覚えがあります。
で、ふと「もしかして今ならイケるんじゃね?」という根拠の無い自信に基づいて挑戦してみたところ、できてしまいました。
以下は ext/pcre/php_pcre.c の差分です。追加したコードの 99% は同ファイルの他の箇所からのコピペなので大した仕事はしてません。
--- php_pcre.c.odig +++ php_pcre.c @@ -792,7 +792,7 @@ /* {{{ preg_do_repl_func */ -static int preg_do_repl_func(zval *function, char *subject, int *offsets, int count, char **result TSRMLS_DC) +static int preg_do_repl_func(zval *function, char *subject, int *offsets, int count, char **subpat_names, char **result TSRMLS_DC) { zval *retval_ptr; /* Function return value */ zval **args[1]; /* Argument to pass to function */ @@ -802,8 +802,12 @@ MAKE_STD_ZVAL(subpats); array_init(subpats); - for (i = 0; i < count; i++) + for (i = 0; i < count; i++) { + if (subpat_names[i]) { + add_assoc_stringl(subpats, subpat_names[i], &subject[offsets[i<<1]], offsets[(i<<1)+1] - offsets[i<<1], 1); + } add_next_index_stringl(subpats, &subject[offsets[i<<1]], offsets[(i<<1)+1] - offsets[i<<1], 1); + } args[0] = &subpats; if (call_user_function_ex(EG(function_table), NULL, function, &retval_ptr, 1, args, 0, NULL TSRMLS_CC) == SUCCESS && retval_ptr) { @@ -938,6 +942,7 @@ int exoptions = 0; /* Execution options */ int count = 0; /* Count of matched subpatterns */ int *offsets; /* Array of subpattern offsets */ + int num_subpats; /* Number of captured subpatterns */ int size_offsets; /* Size of the offsets array */ int new_len; /* Length of needed storage */ int alloc_len; /* Actual allocated length */ @@ -959,6 +964,7 @@ *replace_end=NULL, /* End of replacement string */ *eval_result, /* Result of eval or custom function */ walk_last; /* Last walked character */ + char **subpat_names = NULL;/* Array for named subpatterns */ int rc; if (extra == NULL) { @@ -981,17 +987,66 @@ } /* Calculate the size of the offsets array, and allocate memory for it. 
*/ - rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &size_offsets); + rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &num_subpats); if (rc < 0) { php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc); return NULL; } - size_offsets = (size_offsets + 1) * 3; + num_subpats++; + size_offsets = num_subpats * 3; offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0); alloc_len = 2 * subject_len + 1; result = safe_emalloc(alloc_len, sizeof(char), 0); + /* + * Build a mapping from subpattern numbers to their names. We will always + * allocate the table, even though there may be no named subpatterns. This + * avoids somewhat more complicated logic in the inner loops. + */ + if (is_callable_replace) { + subpat_names = (char **)safe_emalloc(num_subpats, sizeof(char *), 0); + memset(subpat_names, 0, sizeof(char *) * num_subpats); + + int name_cnt = 0, name_size, ni = 0; + char *name_table; + unsigned short name_idx; + + rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMECOUNT, &name_cnt); + if (rc < 0) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc); + efree(offsets); + efree(subpat_names); + return NULL; + } + if (name_cnt > 0) { + int rc1, rc2; + long dummy_l; + double dummy_d; + rc1 = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMETABLE, &name_table); + rc2 = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMEENTRYSIZE, &name_size); + rc = rc2 ? 
rc2 : rc1; + if (rc < 0) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc); + efree(offsets); + efree(subpat_names); + return NULL; + } + + while (ni++ < name_cnt) { + name_idx = 0xff * name_table[0] + name_table[1]; + subpat_names[name_idx] = name_table + 2; + if (is_numeric_string(subpat_names[name_idx], strlen(subpat_names[name_idx]), &dummy_l, &dummy_d, 0) > 0) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Numeric named subpatterns are not allowed"); + efree(offsets); + efree(subpat_names); + return NULL; + } + name_table += name_size; + } + } + } + /* Initialize */ match = NULL; *result_len = 0; @@ -1028,7 +1083,7 @@ } else if (is_callable_replace) { /* Use custom function to get replacement string and its length. */ eval_result_len = preg_do_repl_func(replace_val, subject, offsets, - count, &eval_result TSRMLS_CC); + count, subpat_names, &eval_result TSRMLS_CC); new_len += eval_result_len; } else { /* do regular substitution */ walk = replace; @@ -1145,6 +1200,9 @@ } efree(offsets); + if (subpat_names != NULL) { + efree(subpat_names); + } return result; }
検証用スクリプト:
<?php
/**
 * Callback for preg_replace_callback(): dumps the match array it receives
 * and replaces every match with an empty string.
 *
 * @param array $matches Captured subpatterns handed over by the regex engine.
 * @return string Always the empty string, so each match is removed.
 */
function test_cb($matches)
{
    print_r($matches);
    return '';
}

// Sample document. The heredoc body must start on the line after the opening
// identifier, and its lines are left unindented so the string content stays
// exactly as written.
$str = <<<EOS
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8">
<meta http-equiv="content-style-type" content="text/css; charset=utf-8">
<meta http-equiv="content-script-type" content="text/javascript; charset=utf-8">
<title>hoge</title>
</head>
<body>
<h1>hoge</h1>
</html>
EOS;

// Variant of the pattern without named groups, kept for comparison runs.
//$re = '/<meta (http-equiv)="(.*?)" (content)="(.*?)">/';
// Pattern mixing unnamed groups with the named groups http_equiv and content.
$re = '/<meta (http-equiv)="(?P<http_equiv>.*?)" (content)="(?P<content>.*?)">/';
$result = preg_replace_callback($re, 'test_cb', $str);

// Show the subject before and after the replacement.
var_dump($str, $result);
?>
結果:
Array ( [0] => <meta http-equiv="content-type" content="text/html; charset=utf-8"> [1] => http-equiv [http_equiv] => content-type [2] => content-type [3] => content [content] => text/html; charset=utf-8 [4] => text/html; charset=utf-8 ) Array ( [0] => <meta http-equiv="content-style-type" content="text/css; charset=utf-8"> [1] => http-equiv [http_equiv] => content-style-type [2] => content-style-type [3] => content [content] => text/css; charset=utf-8 [4] => text/css; charset=utf-8 ) Array ( [0] => <meta http-equiv="content-script-type" content="text/javascript; charset=utf-8"> [1] => http-equiv [http_equiv] => content-script-type [2] => content-script-type [3] => content [content] => text/javascript; charset=utf-8 [4] => text/javascript; charset=utf-8 ) string(292) "<html> <head> <meta http-equiv="content-type" content="text/html; charset=utf-8"> <meta http-equiv="content-style-type" content="text/css; charset=utf-8"> <meta http-equiv="content-script-type" content="text/javascript; charset=utf-8"> <title>hoge</title> </head> <body> <h1>hoge</h1> </html>" string(73) "<html> <head> <title>hoge</title> </head> <body> <h1>hoge</h1> </html>"
大したコードは書けなくても、多少読めるだけでだいぶ世界が変わるなあ。