JavaScript Unicode Regex - 字符类中的范围顺序错误


JavaScript Unicode Regex - Range out of order in character class

为什么下面的代码给出"字符类中的范围顺序错误"错误?

// Bounds (in characters) for the prefix captured before the word boundary.
var min_wordsafe_length = 1;
var max_length = 20;
// Sample input to truncate.
var string = 'some-slug-like-string-with-!@£!%-special-chars-';
// Body of a regex character class listing word-boundary code points, kept in
// PCRE's \x{h...} notation. Every backslash is doubled in the string literals
// so that a single backslash reaches the RegExp constructor. The fragments
// are joined into one string with no separator.
var PREG_CLASS_UNICODE_WORD_BOUNDARY = [
  '\\x{0}-\\x{2F}\\x{3A}-\\x{40}\\x{5B}-\\x{60}\\x{7B}-\\x{A9}\\x{AB}-\\x{B1}\\x{B4}',
  '\\x{B6}-\\x{B8}\\x{BB}\\x{BF}\\x{D7}\\x{F7}\\x{2C2}-\\x{2C5}\\x{2D2}-\\x{2DF}',
  '\\x{2E5}-\\x{2EB}\\x{2ED}\\x{2EF}-\\x{2FF}\\x{375}\\x{37E}-\\x{385}\\x{387}\\x{3F6}',
  '\\x{482}\\x{55A}-\\x{55F}\\x{589}-\\x{58A}\\x{5BE}\\x{5C0}\\x{5C3}\\x{5C6}',
  '\\x{5F3}-\\x{60F}\\x{61B}-\\x{61F}\\x{66A}-\\x{66D}\\x{6D4}\\x{6DD}\\x{6E9}',
  '\\x{6FD}-\\x{6FE}\\x{700}-\\x{70F}\\x{7F6}-\\x{7F9}\\x{830}-\\x{83E}',
  '\\x{964}-\\x{965}\\x{970}\\x{9F2}-\\x{9F3}\\x{9FA}-\\x{9FB}\\x{AF1}\\x{B70}',
  '\\x{BF3}-\\x{BFA}\\x{C7F}\\x{CF1}-\\x{CF2}\\x{D79}\\x{DF4}\\x{E3F}\\x{E4F}',
  '\\x{E5A}-\\x{E5B}\\x{F01}-\\x{F17}\\x{F1A}-\\x{F1F}\\x{F34}\\x{F36}\\x{F38}',
  '\\x{F3A}-\\x{F3D}\\x{F85}\\x{FBE}-\\x{FC5}\\x{FC7}-\\x{FD8}\\x{104A}-\\x{104F}',
  '\\x{109E}-\\x{109F}\\x{10FB}\\x{1360}-\\x{1368}\\x{1390}-\\x{1399}\\x{1400}',
  '\\x{166D}-\\x{166E}\\x{1680}\\x{169B}-\\x{169C}\\x{16EB}-\\x{16ED}',
  '\\x{1735}-\\x{1736}\\x{17B4}-\\x{17B5}\\x{17D4}-\\x{17D6}\\x{17D8}-\\x{17DB}',
  '\\x{1800}-\\x{180A}\\x{180E}\\x{1940}-\\x{1945}\\x{19DE}-\\x{19FF}',
  '\\x{1A1E}-\\x{1A1F}\\x{1AA0}-\\x{1AA6}\\x{1AA8}-\\x{1AAD}\\x{1B5A}-\\x{1B6A}',
  '\\x{1B74}-\\x{1B7C}\\x{1C3B}-\\x{1C3F}\\x{1C7E}-\\x{1C7F}\\x{1CD3}\\x{1FBD}',
  '\\x{1FBF}-\\x{1FC1}\\x{1FCD}-\\x{1FCF}\\x{1FDD}-\\x{1FDF}\\x{1FED}-\\x{1FEF}',
  '\\x{1FFD}-\\x{206F}\\x{207A}-\\x{207E}\\x{208A}-\\x{208E}\\x{20A0}-\\x{20B8}',
  '\\x{2100}-\\x{2101}\\x{2103}-\\x{2106}\\x{2108}-\\x{2109}\\x{2114}',
  '\\x{2116}-\\x{2118}\\x{211E}-\\x{2123}\\x{2125}\\x{2127}\\x{2129}\\x{212E}',
  '\\x{213A}-\\x{213B}\\x{2140}-\\x{2144}\\x{214A}-\\x{214D}\\x{214F}',
  '\\x{2190}-\\x{244A}\\x{249C}-\\x{24E9}\\x{2500}-\\x{2775}\\x{2794}-\\x{2B59}',
  '\\x{2CE5}-\\x{2CEA}\\x{2CF9}-\\x{2CFC}\\x{2CFE}-\\x{2CFF}\\x{2E00}-\\x{2E2E}',
  '\\x{2E30}-\\x{3004}\\x{3008}-\\x{3020}\\x{3030}\\x{3036}-\\x{3037}',
  '\\x{303D}-\\x{303F}\\x{309B}-\\x{309C}\\x{30A0}\\x{30FB}\\x{3190}-\\x{3191}',
  '\\x{3196}-\\x{319F}\\x{31C0}-\\x{31E3}\\x{3200}-\\x{321E}\\x{322A}-\\x{3250}',
  '\\x{3260}-\\x{327F}\\x{328A}-\\x{32B0}\\x{32C0}-\\x{33FF}\\x{4DC0}-\\x{4DFF}',
  '\\x{A490}-\\x{A4C6}\\x{A4FE}-\\x{A4FF}\\x{A60D}-\\x{A60F}\\x{A673}\\x{A67E}',
  '\\x{A6F2}-\\x{A716}\\x{A720}-\\x{A721}\\x{A789}-\\x{A78A}\\x{A828}-\\x{A82B}',
  '\\x{A836}-\\x{A839}\\x{A874}-\\x{A877}\\x{A8CE}-\\x{A8CF}\\x{A8F8}-\\x{A8FA}',
  '\\x{A92E}-\\x{A92F}\\x{A95F}\\x{A9C1}-\\x{A9CD}\\x{A9DE}-\\x{A9DF}',
  '\\x{AA5C}-\\x{AA5F}\\x{AA77}-\\x{AA79}\\x{AADE}-\\x{AADF}\\x{ABEB}',
  '\\x{E000}-\\x{F8FF}\\x{FB29}\\x{FD3E}-\\x{FD3F}\\x{FDFC}-\\x{FDFD}',
  '\\x{FE10}-\\x{FE19}\\x{FE30}-\\x{FE6B}\\x{FEFF}-\\x{FF0F}\\x{FF1A}-\\x{FF20}',
  '\\x{FF3B}-\\x{FF40}\\x{FF5B}-\\x{FF65}\\x{FFE0}-\\x{FFFD}'].join('');
// Build the pattern: capture between min_wordsafe_length and max_length
// characters from the start, followed by one character of the boundary class.
new RegExp("^(.{" + min_wordsafe_length + ","+ max_length +"})[" + PREG_CLASS_UNICODE_WORD_BOUNDARY + "]");

http://jsfiddle.net/52zz0drz/

错误似乎与PREG_CLASS_UNICODE_WORD_BOUNDARY有关,但是我无法确定范围在哪里出现故障。

我正在尝试将以下行从Drupal的truncate_utf8函数移植到JavaScript:

// Find the last word boundary, if there is one within $min_wordsafe_length
// to $max_length characters. preg_match() is always greedy, so it will
// find the longest string possible.
// The /u modifier makes PCRE treat the pattern and subject as UTF-8; the
// captured prefix (group 1) is returned in $matches[1].
$found = preg_match('/^(.{' . $min_wordsafe_length . ',' . $max_length . '})[' . PREG_CLASS_UNICODE_WORD_BOUNDARY . ']/u', $string, $matches);

我在 JavaScript 中对 Drupal 的 PREG_CLASS_UNICODE_WORD_BOUNDARY 所做的唯一更改,是对 \x 进行双重转义(写成 \\x),以防止反斜杠 \ 在传入正则表达式之前丢失,并将 PHP 的字符串连接替换为数组 join。我没有做任何会改变范围顺序的事情。

如何修复此代码?我需要JavaScript正则表达式尽可能匹配PHP正则表达式的行为。

JavaScript 使用语法 \uhhhh(正好 4 个十六进制数字)在正则表达式中指定 UTF-16 代码单元。对于 BMP 范围内的字符,除范围 D800-DFFF 外,字符的代码点直接映射到一个具有相同数值的 UTF-16 代码单元。这里的情况正是如此,所以我们不需要处理代理对及其怪癖。

在这种情况下,只需将 PCRE(在 PHP 中)的 \x{h...hh} 语法替换为 \uhhhh 语法(不足 4 位时在前面补零):

// Body of a regex character class listing word-boundary characters, written
// with JavaScript's \uhhhh escapes (exactly four hex digits each). Every
// backslash is doubled in the string literals so that the RegExp constructor
// receives \uXXXX and interprets the escapes itself. All values lie in the
// BMP outside D800-DFFF, so each escape denotes a single UTF-16 code unit.
// Intended use: new RegExp("[" + PREG_CLASS_UNICODE_WORD_BOUNDARY + "]").
var PREG_CLASS_UNICODE_WORD_BOUNDARY = [
    "\\u0000-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u00A9\\u00AB-\\u00B1\\u00B4",
    "\\u00B6-\\u00B8\\u00BB\\u00BF\\u00D7\\u00F7\\u02C2-\\u02C5\\u02D2-\\u02DF",
    "\\u02E5-\\u02EB\\u02ED\\u02EF-\\u02FF\\u0375\\u037E-\\u0385\\u0387\\u03F6",
    "\\u0482\\u055A-\\u055F\\u0589-\\u058A\\u05BE\\u05C0\\u05C3\\u05C6",
    "\\u05F3-\\u060F\\u061B-\\u061F\\u066A-\\u066D\\u06D4\\u06DD\\u06E9",
    "\\u06FD-\\u06FE\\u0700-\\u070F\\u07F6-\\u07F9\\u0830-\\u083E",
    "\\u0964-\\u0965\\u0970\\u09F2-\\u09F3\\u09FA-\\u09FB\\u0AF1\\u0B70",
    "\\u0BF3-\\u0BFA\\u0C7F\\u0CF1-\\u0CF2\\u0D79\\u0DF4\\u0E3F\\u0E4F",
    "\\u0E5A-\\u0E5B\\u0F01-\\u0F17\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38",
    "\\u0F3A-\\u0F3D\\u0F85\\u0FBE-\\u0FC5\\u0FC7-\\u0FD8\\u104A-\\u104F",
    "\\u109E-\\u109F\\u10FB\\u1360-\\u1368\\u1390-\\u1399\\u1400",
    "\\u166D-\\u166E\\u1680\\u169B-\\u169C\\u16EB-\\u16ED",
    "\\u1735-\\u1736\\u17B4-\\u17B5\\u17D4-\\u17D6\\u17D8-\\u17DB",
    "\\u1800-\\u180A\\u180E\\u1940-\\u1945\\u19DE-\\u19FF",
    "\\u1A1E-\\u1A1F\\u1AA0-\\u1AA6\\u1AA8-\\u1AAD\\u1B5A-\\u1B6A",
    "\\u1B74-\\u1B7C\\u1C3B-\\u1C3F\\u1C7E-\\u1C7F\\u1CD3\\u1FBD",
    "\\u1FBF-\\u1FC1\\u1FCD-\\u1FCF\\u1FDD-\\u1FDF\\u1FED-\\u1FEF",
    "\\u1FFD-\\u206F\\u207A-\\u207E\\u208A-\\u208E\\u20A0-\\u20B8",
    "\\u2100-\\u2101\\u2103-\\u2106\\u2108-\\u2109\\u2114",
    "\\u2116-\\u2118\\u211E-\\u2123\\u2125\\u2127\\u2129\\u212E",
    "\\u213A-\\u213B\\u2140-\\u2144\\u214A-\\u214D\\u214F",
    "\\u2190-\\u244A\\u249C-\\u24E9\\u2500-\\u2775\\u2794-\\u2B59",
    "\\u2CE5-\\u2CEA\\u2CF9-\\u2CFC\\u2CFE-\\u2CFF\\u2E00-\\u2E2E",
    "\\u2E30-\\u3004\\u3008-\\u3020\\u3030\\u3036-\\u3037",
    "\\u303D-\\u303F\\u309B-\\u309C\\u30A0\\u30FB\\u3190-\\u3191",
    "\\u3196-\\u319F\\u31C0-\\u31E3\\u3200-\\u321E\\u322A-\\u3250",
    "\\u3260-\\u327F\\u328A-\\u32B0\\u32C0-\\u33FF\\u4DC0-\\u4DFF",
    "\\uA490-\\uA4C6\\uA4FE-\\uA4FF\\uA60D-\\uA60F\\uA673\\uA67E",
    "\\uA6F2-\\uA716\\uA720-\\uA721\\uA789-\\uA78A\\uA828-\\uA82B",
    "\\uA836-\\uA839\\uA874-\\uA877\\uA8CE-\\uA8CF\\uA8F8-\\uA8FA",
    "\\uA92E-\\uA92F\\uA95F\\uA9C1-\\uA9CD\\uA9DE-\\uA9DF",
    "\\uAA5C-\\uAA5F\\uAA77-\\uAA79\\uAADE-\\uAADF\\uABEB",
    "\\uE000-\\uF8FF\\uFB29\\uFD3E-\\uFD3F\\uFDFC-\\uFDFD",
    "\\uFE10-\\uFE19\\uFE30-\\uFE6B\\uFEFF-\\uFF0F\\uFF1A-\\uFF20",
    "\\uFF3B-\\uFF40\\uFF5B-\\uFF65\\uFFE0-\\uFFFD"].join('');