我正在使用Amazon Mechanical Turk API,它只允许我使用正则表达式来过滤数据字段。
我想向函数输入一个整数范围,例如256-311或45-1233,并返回一个只匹配该范围的正则表达式。
匹配256-321的正则表达式是:
'b((25[6-9])|(2[6-9][0-9])|(3[0-1][0-9])|(32[0-1]))'b
这部分相当容易,但是我在创建这个正则表达式的循环中遇到了麻烦。
我试图建立一个函数定义如下:
function getRangeRegex( int fromInt, int toInt)
{
return regexString;
}
我在网上看了看,我很惊讶,过去似乎没有人解决过这个问题。这是一个难题……
谢谢你的时间。
这是一个快速攻击:
<?php
function regex_range($from, $to) {
if($from < 0 || $to < 0) {
throw new Exception("Negative values not supported");
}
if($from > $to) {
throw new Exception("Invalid range $from..$to, from > to");
}
$ranges = array($from);
$increment = 1;
$next = $from;
$higher = true;
while(true) {
$next += $increment;
if($next + $increment > $to) {
if($next <= $to) {
$ranges[] = $next;
}
$increment /= 10;
$higher = false;
}
else if($next % ($increment*10) === 0) {
$ranges[] = $next;
$increment = $higher ? $increment*10 : $increment/10;
}
if(!$higher && $increment < 10) {
break;
}
}
$ranges[] = $to + 1;
$regex = '/^(?:';
for($i = 0; $i < sizeof($ranges) - 1; $i++) {
$str_from = (string)($ranges[$i]);
$str_to = (string)($ranges[$i + 1] - 1);
for($j = 0; $j < strlen($str_from); $j++) {
if($str_from[$j] == $str_to[$j]) {
$regex .= $str_from[$j];
}
else {
$regex .= "[" . $str_from[$j] . "-" . $str_to[$j] . "]";
}
}
$regex .= "|";
}
return substr($regex, 0, strlen($regex)-1) . ')$/';
}
function test($from, $to) {
try {
printf("%-10s %s'n", $from . '-' . $to, regex_range($from, $to));
} catch (Exception $e) {
echo $e->getMessage() . "'n";
}
}
test(2, 8);
test(5, 35);
test(5, 100);
test(12, 1234);
test(123, 123);
test(256, 321);
test(256, 257);
test(180, 195);
test(2,1);
test(-2,4);
?>
生产:
2-8 /^(?:[2-7]|8)$/
5-35 /^(?:[5-9]|[1-2][0-9]|3[0-5])$/
5-100 /^(?:[5-9]|[1-9][0-9]|100)$/
12-1234 /^(?:1[2-9]|[2-9][0-9]|[1-9][0-9][0-9]|1[0-2][0-3][0-4])$/
123-123 /^(?:123)$/
256-321 /^(?:25[6-9]|2[6-9][0-9]|3[0-2][0-1])$/
256-257 /^(?:256|257)$/
180-195 /^(?:18[0-9]|19[0-5])$/
Invalid range 2..1, from > to
Negative values not supported
未经适当测试,使用风险自负!
是的,生成的正则表达式在许多情况下可以写得更紧凑,但我把它留给读者作为练习:)
对于像我一样正在寻找javascript版本的伟大的@Bart Kiers的生产
//Credit: Bart Kiers 2011
function regex_range(from, to){
if(from < 0 || to < 0) {
//throw new Exception("Negative values not supported");
return null;
}
if(from > to) {
//throw new Exception("Invalid range from..to, from > to");
return null;
}
var ranges = [];
ranges.push(from);
var increment = 1;
var next = from;
var higher = true;
while(true){
next += increment;
if(next + increment > to) {
if(next <= to) {
ranges.push(next);
}
increment /= 10;
higher = false;
}else{
if(next % (increment*10) == 0) {
ranges.push(next);
increment = higher ? increment*10 : increment/10;
}
}
if(!higher && increment < 10) {
break;
}
}
ranges.push(to + 1);
var regex = '/^(?:';
for(var i = 0; i < ranges.length - 1; i++) {
var str_from = ranges[i];
str_from = str_from.toString();
var str_to = ranges[i + 1] - 1;
str_to = str_to.toString();
for(var j = 0; j < str_from.length; j++) {
if(str_from[j] == str_to[j]) {
regex += str_from[j];
}
else {
regex += "[" + str_from[j] + "-" + str_to[j] + "]";
}
}
regex += "|";
}
return regex.substr(0, regex.length - 1 ) + ')$/';
}
RegexNumericRangeGenerator的PHP端口
class RegexRangeNumberGenerator {
static function parse($min, $max, $MatchWholeWord = FALSE, $MatchWholeLine = FALSE, $MatchLeadingZero = FALSE) {
if (!is_int($min) || !is_int($max) || $min > $max || $min < 0 || $max < 0) {
return FALSE;
}
if ($min == $max) {
return self::parseIntoPattern($min, $MatchWholeWord, $MatchWholeLine, $MatchLeadingZero);
}
$s = [];
$x = self::parseStartRange($min, $max);
foreach ($x as $o) {
$s[] = self::parseEndRange($o[0], $o[1]);
}
$n = self::reformatArray($s);
$h = self::parseIntoRegex($n);
return self::parseIntoPattern($h, $MatchWholeWord, $MatchWholeLine, $MatchLeadingZero);
}
static private function parseIntoPattern($t, $MatchWholeWord = FALSE, $MatchWholeLine = FALSE, $MatchLeadingZero = FALSE) {
$r = ((is_array($t)) ? implode("|", $t) : $t);
return (($MatchWholeLine && $MatchLeadingZero) ? "^0*(" . $r . ")$" : (($MatchLeadingZero) ? "0*(" . $r . ")" : (($MatchWholeLine) ? "^(" . $r . ")$" : (($MatchWholeWord) ? "''b(" . $r . ")''b" : "(" . $r . ")"))));
}
static private function parseIntoRegex($t) {
if (!is_array($t)) {
throw new Exception("Argument needs to be an array!");
}
$r = [];
for ($i = 0; $i < count($t); $i++) {
$e = str_split($t[$i][0]);
$n = str_split($t[$i][1]);
$s = "";
$o = 0;
$h = "";
for ($a = 0; $a < count($e); $a++) {
if ($e[$a] === $n[$a]) {
$h .= $e[$a];
} else {
if ((intval($e[$a]) + 1) === intval($n[$a])) {
$h .= "[" . $e[$a] . $n[$a] . "]";
} else {
if ($s === ($e[$a] . $n[$a])) {
$o++;
}
$s = $e[$a] . $n[$a];
if ($a == (count($e) - 1)) {
$h .= (($o > 0) ? "{" . ($o + 1) . "}" : "[" . $e[$a] . "-" . $n[$a] . "]");
} else {
if ($o === 0) {
$h .= "[" . $e[$a] . "-" . $n[$a] . "]";
}
}
}
}
}
$r[] = $h;
}
return $r;
}
static private function reformatArray($t) {
$arrReturn = [];
for ($i = 0; $i < count($t); $i++) {
$page = count($t[$i]) / 2;
for ($a = 0; $a < $page; $a++) {
$arrReturn[] = array_slice($t[$i], (2 * $a), 2);
}
}
return $arrReturn;
}
static private function parseStartRange($t, $r) {
if (strlen($t) === strlen($r)) {
return [[$t, $r]];
}
$break = pow(10, strlen($t)) - 1;
return array_merge([[$t, $break]], self::parseStartRange($break + 1, $r));
}
static private function parseEndRange($t, $r) {
if (strlen($t) == 1) {
return [$t, $r];
}
if (str_repeat("0", strlen($t)) === "0" . substr($t, 1)) {
if (str_repeat("0", strlen($r)) == "9" . substr($r, 1)) {
return [$t, $r];
}
if ((int) substr($t, 0, 1) < (int) substr($r, 0, 1)) {
$e = intval(substr($r, 0, 1) . str_repeat("0", strlen($r) - 1)) - 1;
return array_merge([$t, self::strBreakPoint($e)], self::parseEndRange(self::strBreakPoint($e + 1), $r));
}
}
if (str_repeat("9", strlen($r)) === "9" . substr($r, 1) && (int) substr($t, 0, 1) < (int) substr($r, 0, 1)) {
$e = intval(intval((int) substr($t, 0, 1) + 1) . "" . str_repeat("0", strlen($r) - 1)) - 1;
return array_merge(self::parseEndRange($t, self::strBreakPoint($e)), [self::strBreakPoint($e + 1), $r]);
}
if ((int) substr($t, 0, 1) < (int) substr($r, 0, 1)) {
$e = intval(intval((int) substr($t, 0, 1) + 1) . "" . str_repeat("0", strlen($r) - 1)) - 1;
return array_merge(self::parseEndRange($t, self::strBreakPoint($e)), self::parseEndRange(self::strBreakPoint($e + 1), $r));
}
$a = (int) substr($t, 0, 1);
$o = self::parseEndRange(substr($t, 1), substr($r, 1));
$h = [];
for ($u = 0; $u < count($o); $u++) {
$h[] = ($a . $o[$u]);
}
return $h;
}
static private function strBreakPoint($t) {
return str_pad($t, strlen(($t + 1)), "0", STR_PAD_LEFT);
}
}
测试结果2-8 ^([2-8])$
5-35 ^([5-9]|[12][0-9]|3[0-5])$
5-100 ^([5-9]|[1-8][0-9]|9[0-9]|100)$
12-1234 ^(1[2-9]|[2-9][0-9]|[1-8][0-9]{2}|9[0-8][0-9]|99[0-9]|1[01][0-9]{2}|12[0-2][0-9]|123[0-4])$
123-123 ^(123)$
256-321 ^(25[6-9]|2[6-9][0-9]|3[01][0-9]|32[01])$
256-257 ^(25[67])$
180-195 ^(18[0-9]|19[0-5])$
是否有理由必须是正则表达式?不能做这样的事情:
if ($number >= 256 && $number <= 321){
// do something
}
更新:有一种简单但丑陋的方法:range:
function getRangeRegex($from, $to)
{
$range = implode('|', range($from, $to));
// returns: 256|257|...|321
return $range;
}
要小心,优秀的@Bart Kiers的代码(和JS版本的Travis J)在某些情况下会失败。例如:
12-1234 /^(?:1[2-9]|[2-9][0-9]|[1-9][0-9][0-9]|1[0-2][0-3][0-4])$/
不匹配" 1229 "," 1115 "," 1 [0][0] (5 - 9) "
实际上已经完成了。
看看这个网站。它包含一个python脚本的链接,该脚本会自动生成这些正则表达式。
这个答案与这个问题重复。我还写了一篇博文
使用正则表达式验证数值范围
需要说明的是:当一个简单的if语句就足够了
if(num < -2055 || num > 2055) {
throw new IllegalArgumentException("num (" + num + ") must be between -2055 and 2055");
}
不建议使用正则表达式来验证数字范围。
另外,由于正则表达式分析字符串,必须首先将数字转换为字符串才能对其进行测试(当数字恰好已经是字符串时,例如从控制台获取用户输入时,会出现例外)。
(为了确保字符串是从一个数字开始的,您可以使用org.apache.commons.lang3.math.NumberUtils#isNumber(s)
)
尽管如此,弄清楚如何用正则表达式验证数字范围是有趣的和有指导意义的。
一个数字范围
规则:数量必须完全15
。
是最简单的范围。与此匹配的正则表达式是
'b15'b
为了避免在8215242
中匹配15
,必须使用词边界。
两个数字范围
规则:数字必须在15
和16
之间。三个可能的正则表达式:
'b(15|16)'b
'b1(5|6)'b
'b1[5-6]'b
数字范围"mirrored"在零附近
规则:号码必须在-12
和12
之间
以下是0
到12
的正则表达式,仅限正数:
'b('d|1[0-2])'b
无线:
'b( //The beginning of a word (or number), followed by either
'd // Any digit 0 through 9
| //Or
1[0-2] // A 1 followed by any digit between 0 and 2.
)'b //The end of a word
让这个工作为负数和正数都很简单,只需在开始处添加一个可选的破折号:
-?'b('d|1[0-2])'b
(假设破折号前面没有不合适的字符)
要禁止负数,必须使用负向后看:
(?<!-)'b('d|1[0-2])'b
将lookbehind放在外面会导致-11
中的11
匹配。(这篇文章中的第一个例子应该添加这个)
注:'d
与[0-9]
为了与所有的正则表达式兼容,所有的'd
-s都应该更改为[0-9]
。例如,. net将非ASCII数字(如不同语言中的数字)视为'd
的合法值。除最后一个示例外,为简洁起见,将其保留为'd
。
<我>(由于TimPietzcker在stackoverflow ) 我>
三位数字,除第一位数字外均为零
规则:必须0
和400
之间。
一个可能的正则表达式:
(?<!-)'b([1-3]?'d{1,2}|400)'b
免费间隔:
(?<!-) //Something not preceded by a dash
'b( //Word-start, followed by either
[1-3]? // No digit, or the digit 1, 2, or 3
'd{1,2} // Followed by one or two digits (between 0 and 9)
| //Or
400 // The number 400
)'b //Word-end
另一种不应该使用的可能性:
'b(0|1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31|32|33|34|35|36|37|38|39|40|41|42|43|44|45|46|47|48|49|50|51|52|53|54|55|56|57|58|59|60|61|62|63|64|65|66|67|68|69|70|71|72|73|74|75|76|77|78|79|80|81|82|83|84|85|86|87|88|89|90|91|92|93|94|95|96|97|98|99|100|101|102|103|104|105|106|107|108|109|110|111|112|113|114|115|116|117|118|119|120|121|122|123|124|125|126|127|128|129|130|131|132|133|134|135|136|137|138|139|140|141|142|143|144|145|146|147|148|149|150|151|152|153|154|155|156|157|158|159|160|161|162|163|164|165|166|167|168|169|170|171|172|173|174|175|176|177|178|179|180|181|182|183|184|185|186|187|188|189|190|191|192|193|194|195|196|197|198|199|200|201|202|203|204|205|206|207|208|209|210|211|212|213|214|215|216|217|218|219|220|221|222|223|224|225|226|227|228|229|230|231|232|233|234|235|236|237|238|239|240|241|242|243|244|245|246|247|248|249|250|251|252|253|254|255|256|257|258|259|260|261|262|263|264|265|266|267|268|269|270|271|272|273|274|275|276|277|278|279|280|281|282|283|284|285|286|287|288|289|290|291|292|293|294|295|296|297|298|299|300|301|302|303|304|305|306|307|308|309|310|311|312|313|314|315|316|317|318|319|320|321|322|323|324|325|326|327|328|329|330|331|332|333|334|335|336|337|338|339|340|341|342|343|344|345|346|347|348|349|350|351|352|353|354|355|356|357|358|359|360|361|362|363|364|365|366|367|368|369|370|371|372|373|374|375|376|377|378|379|380|381|382|383|384|385|386|387|388|389|390|391|392|393|394|395|396|397|398|399|400)'b
最后一个例子:四个数字,围绕零镜像,不以零结束
规则:必须在-2055
和2055
之间
这是来自一个关于stackoverflow的问题
正则表达式:
-?'b(20(5[0-5]|[0-4][0-9])|1?[0-9]{1,3})'b
无线:
-? //Optional dash
'b( //Followed by word boundary, followed by either of the following
20( // "20", followed by either
5[0-5] // A "5" followed by a digit 0-5
| // or
[0-4][0-9] // A digit 0-4, followed by any digit
)
| //OR
1?[0-9]{1,3} // An optional "1", followed by one through three digits (0-9)
)'b //Followed by a word boundary.
下面是这个正则表达式的可视化表示:
你可以在这里自己试一试:Debuggex demonstration
(感谢stackoverflow上的PlasmaPower为调试提供帮助)
最后注意
根据您捕获的内容,很可能所有子组都应该被制作成非捕获组。例如:
(-?'b(?:20(?:5[0-5]|[0-4][0-9])|1?[0-9]{1,3})'b)
代替:
-?'b(20(5[0-5]|[0-4][0-9])|1?[0-9]{1,3})'b
Java实现示例
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.math.NumberUtils;
/**
<P>Confirm a user-input number is a valid number by reading a string an testing it is numeric before converting it to an it--this loops until a valid number is provided.</P>
<P>{@code java UserInputNumInRangeWRegex}</P>
**/
public class UserInputNumInRangeWRegex {
public static final void main(String[] ignored) {
int num = -1;
boolean isNum = false;
int iRangeMax = 2055;
//"": Dummy string, to reuse matcher
Matcher mtchrNumNegThrPos = Pattern.compile("-?''b(20(5[0-5]|[0-4][0-9])|1?[0-9]{1,3})''b").matcher("");
do {
System.out.print("Enter a number between -" + iRangeMax + " and " + iRangeMax + ": ");
String strInput = (new Scanner(System.in)).next();
if(!NumberUtils.isNumber(strInput)) {
System.out.println("Not a number. Try again.");
} else if(!mtchrNumNegThrPos.reset(strInput).matches()) {
System.out.println("Not in range. Try again.");
} else {
//Safe to convert
num = Integer.parseInt(strInput);
isNum = true;
}
} while(!isNum);
System.out.println("Number: " + num);
}
}
输出 [C:'java_code']java UserInputNumInRangeWRegex
Enter a number between -2055 and 2055: tuhet
Not a number. Try again.
Enter a number between -2055 and 2055: 283837483
Not in range. Try again.
Enter a number between -2055 and 2055: -200000
Not in range. Try again.
Enter a number between -2055 and 2055: -300
Number: -300
我已经把Bart Kiers的答案转换成c++了。该函数接受两个整数作为输入,并生成数字范围的正则表达式。
#include <stdio.h>
#include <iostream>
#include <vector>
#include <string>
std::string regex_range(int from, int to);
int main(int argc, char **argv)
{
std::string regex = regex_range(1,100);
std::cout << regex << std::endl;
return 0;
}
std::string regex_range(int from, int to) //Credit: Bart Kiers 2011
{
if(from < 0 || to < 0)
{
std::cout << "Negative values not supported. Exiting." << std::endl;
return 0;
}
if(from > to)
{
std::cout << "Invalid range, from > to. Exiting." << std::endl;
return 0;
}
std::vector<int> ranges;
ranges.push_back(from);
int increment = 1;
int next = from;
bool higher = true;
while(true)
{
next += increment;
if(next + increment > to)
{
if(next <= to)
{
ranges.push_back(next);
}
increment /= 10;
higher = false;
}
else if(next % (increment*10) == 0)
{
ranges.push_back(next);
increment = higher ? increment*10 : increment/10;
}
if(!higher && (increment < 10))
{
break;
}
}
ranges.push_back(to + 1);
std::string regex("^(?:");
for(int i = 0; i < ranges.size() - 1; i++)
{
int current_from = ranges.at(i);
std::string str_from = std::to_string(current_from);
int current_to = ranges.at(i + 1) - 1;
std::string str_to = std::to_string(current_to);
for(int j = 0; j < str_from.length(); j++)
{
if(str_from.at(j) == str_to.at(j))
{
std::string str_from_at_j(&str_from.at(j));
regex.append(str_from_at_j);
}
else
{
std::string str_from_at_j(&str_from.at(j));
std::string str_to_at_j(&str_to.at(j));
regex.append("[");
regex.append(str_from_at_j);
regex.append("-");
regex.append(str_to_at_j);
regex.append("]");
}
}
regex.append("|");
}
regex = regex.substr(0, regex.length() - 1);
regex.append(")$");
return regex;
}
因为我遇到了与@EmilianoT已经报告的相同的问题,我试图修复它,但最终我选择移植RegexNumericRangeGenerator的PHP端口(由@EmilianoT移植),虽然不是在一个类中。我对这个JS端口不太满意,因为所有toString()
和parseInt()
方法仍然可以优化(它们可能是不必要的),但它适用于所有情况。
我改变的是参数。我用parse(min, max, width = 0, prefix = '', suffix = '')
代替了parse($min, $max, $MatchWholeWord = FALSE, $MatchWholeLine = FALSE, $MatchLeadingZero = FALSE)
,这给了它更多的选项(有些人可能想把正则表达式变成斜杠,其他人想匹配行[prefix = '^'; suffix = '$'
]等)。我还希望能够配置数字的宽度(width = 3
→000
, 001
, 052
, 800
, 1000
,…)。
我替换了我以前的答案,因为它不工作所有的时间。如果有人想阅读它,他们可以在回答历史中看到它。
function parse(min, max, width = 0, prefix = '', suffix = '') {
if (! Number.isInteger(min) || ! Number.isInteger(max) || min > max || min < 0 || max < 0) {
return false
}
if (min == max) {
return parseIntoPattern(min, prefix, suffix)
}
let x = parseStartRange(min, max)
let s = []
x.forEach(o => {
s.push(parseEndRange(o[0], o[1]))
})
let n = reformatArray(s)
let h = parseIntoRegex(n, width)
return parseIntoPattern(h, prefix, suffix)
}
function parseIntoPattern(t, prefix = '', suffix = '') {
let r = Array.isArray(t) ? t.join('|') : t
return prefix + '(' + r + ')' + suffix
}
function parseIntoRegex(t, width = 0) {
if (! Array.isArray(t)) {
throw new Error('Argument needs to be an array!')
}
let r = []
for (let i = 0; i < t.length; i++) {
let e = t[i][0].split('')
let n = t[i][1].split('')
let s = ''
let o = 0
let h = ''
for (let a = 0; a < e.length; a++) {
if (e[a] === n[a]) {
h += e[a]
} else if (parseInt(e[a]) + 1 === parseInt(n[a])) {
h += '[' + e[a] + n[a] + ']'
} else {
if (s === e[a] + n[a]) {
o++
}
s = e[a] + n[a]
if (a == e.length - 1) {
h += o > 0 ? '{' + (o + 1) + '}' : '[' + e[a] + '-' + n[a] + ']'
} else if (o === 0) {
h += '[' + e[a] + '-' + n[a] + ']'
}
}
}
if (e.length < width) {
h = '0'.repeat(width - e.length, '0') + h
}
r.push(h)
}
return r
}
function reformatArray(t) {
let arrReturn = []
for (let i = 0; i < t.length; i++) {
let page = t[i].length / 2
for (let a = 0; a < page; a++) {
arrReturn.push(t[i].slice(2 * a))
}
}
return arrReturn
}
function parseStartRange(t, r) {
t = t.toString()
r = r.toString()
if (t.length === r.length) {
return [[t, r]]
}
let breakOut = 10 ** t.length - 1
return [[t, breakOut.toString()]].concat(parseStartRange(breakOut + 1, r))
}
function parseEndRange(t, r) {
if (t.length == 1) {
return [t, r]
}
if ('0'.repeat(t.length) === '0' + t.substr(1)) {
if ('0'.repeat(r.length) == '9' + r.substr(1)) {
return [t, r]
}
if (parseInt(t.toString().substr(0, 1)) < parseInt(r.toString().substr(0, 1))) {
let e = parseInt(r.toString().substr(0, 1) + '0'.repeat(r.length - 1)) - 1
return [t, strBreakPoint(e)].concat(parseEndRange(strBreakPoint(e + 1), r))
}
}
if ('9'.repeat(r.length) === '9' + r.toString().substr(1) && parseInt(t.toString().substr(0, 1)) < parseInt(r.toString().substr(0, 1))) {
let e = parseInt(parseInt(parseInt(t.toString().substr(0, 1)) + 1) + '0'.repeat(r.length - 1)) - 1
return parseEndRange(t, strBreakPoint(e)).concat(strBreakPoint(e + 1), r)
}
if (parseInt(t.toString().substr(0, 1)) < parseInt(r.toString().substr(0, 1))) {
let e = parseInt(parseInt(parseInt(t.toString().substr(0, 1)) + 1) + '0'.repeat(r.length - 1)) - 1
return parseEndRange(t, strBreakPoint(e)).concat(parseEndRange(strBreakPoint(e + 1), r))
}
let a = parseInt(t.toString().substr(0, 1))
let o = parseEndRange(t.toString().substr(1), r.toString().substr(1))
let h = []
for (let u = 0; u < o.length; u++) {
h.push(a + o[u])
}
return h
}
function strBreakPoint(t) {
return t.toString().padStart((parseInt(t) + 1).toString().length, '0')
}