Regex - check if input still has chances to become matching

We've got such regexp:

var regexp = /^one (two)+ three/;

So only string like "one two three" or "one two three four" or "one twotwo three" etc. will match it.

However, if we've got string like

"one " - is still 'promising' that maybe soon it will match

but this string: "one three" will never match no matter what we'll do.

Is there some way to check if given string have chances to become matching or not?

I need it for some tips during writing when I want to recommend all options that begins with given input (regexp's I'm using are pretty long and I dont want really to mess with them).


In other words - I want to check if string has ended during checking and nothing 'not matching' was faced.

In even more other words - Answer would be inside reason of not matching. If reason is end of string - then it would be promissing. However I dont know any way to check why some string didnt match

Answers:

Answer

This is a regex feature known as partial matching, it's available in several regex engines such as PCRE, Boost, Java but not in JavaScript.

Andacious's answer shows a very nice way to overcome this limitation, we just need to automate this.

Well... challenge accepted :)

Fortunately, JavaScript has a very limited regex feature set with a simple syntax, so I wrote a simple parser and on-the-fly transformation for this task, based on the features listed on MDN.

A couple points of interest:

  • This produces a regex which will almost always match the empty string. Therefore a failed partial match occurs when the result of exec is null or an array whose first element is the empty string
  • Negative lookaheads are kept as-is. I believe that's the right thing to do. The only ways to fail a match is through them (ie put a (?!) in the regex) and anchors (^ and $).
  • The parser assumes a valid input pattern: you can't create a RegExp object from an invalid pattern in the first place.
  • This code won't handle backreferences properly: ^(\w+)\s+\1$ won't yield a partial match against hello hel for instance

RegExp.prototype.toPartialMatchRegex = function() {
    "use strict";
    
    var re = this,
        source = this.source,
        i = 0;
    
    function process () {
        var result = "",
            tmp;

        function appendRaw(nbChars) {
            result += source.substr(i, nbChars);
            i += nbChars;
        };
        
        function appendOptional(nbChars) {
            result += "(?:" + source.substr(i, nbChars) + "|$)";
            i += nbChars;
        };

        while (i < source.length) {
            switch (source[i])
            {
                case "\\":
                    switch (source[i + 1])
                    {
                        case "c":
                            appendOptional(3);
                            break;
                            
                        case "x":
                            appendOptional(4);
                            break;
                            
                        case "u":
                            if (re.unicode) {
                                if (source[i + 2] === "{") {
                                    appendOptional(source.indexOf("}", i) - i + 1);
                                } else {
                                    appendOptional(6);
                                }
                            } else {
                                appendOptional(2);
                            }
                            break;
                            
                        default:
                            appendOptional(2);
                            break;
                    }
                    break;
                    
                case "[":
                    tmp = /\[(?:\\.|.)*?\]/g;
                    tmp.lastIndex = i;
                    tmp = tmp.exec(source);
                    appendOptional(tmp[0].length);
                    break;
                    
                case "|":
                case "^":
                case "$":
                case "*":
                case "+":
                case "?":
                    appendRaw(1);
                    break;
                    
                case "{":
                    tmp = /\{\d+,?\d*\}/g;
                    tmp.lastIndex = i;
                    tmp = tmp.exec(source);
                    if (tmp) {
                        appendRaw(tmp[0].length);
                    } else {
                        appendOptional(1);
                    }
                    break;
                    
                case "(":
                    if (source[i + 1] == "?") {
                        switch (source[i + 2])
                        {
                            case ":":
                                result += "(?:";
                                i += 3;
                                result += process() + "|$)";
                                break;
                                
                            case "=":
                                result += "(?=";
                                i += 3;
                                result += process() + ")";
                                break;
                                
                            case "!":
                                tmp = i;
                                i += 3;
                                process();
                                result += source.substr(tmp, i - tmp);
                                break;
                        }
                    } else {
                        appendRaw(1);
                        result += process() + "|$)";
                    }
                    break;
                    
                case ")":
                    ++i;
                    return result;
                    
                default:
                    appendOptional(1);
                    break;
            }
        }
        
        return result;
    }
    
    return new RegExp(process(), this.flags);
};






// Test code
(function() {
    document.write('<span style="display: inline-block; width: 60px;">Regex: </span><input id="re" value="^one (two)+ three"/><br><span style="display: inline-block; width: 60px;">Input: </span><input id="txt" value="one twotw"/><br><pre id="result"></pre>');
    document.close();

    var run = function() {
        var output = document.getElementById("result");
        try
        {
            var regex = new RegExp(document.getElementById("re").value);
            var input = document.getElementById("txt").value;
            var partialMatchRegex = regex.toPartialMatchRegex();
            var result = partialMatchRegex.exec(input);
            var matchType = regex.exec(input) ? "Full match" : result && result[0] ? "Partial match" : "No match";
            output.innerText = partialMatchRegex + "\n\n" + matchType + "\n" + JSON.stringify(result);
        }
        catch (e)
        {
            output.innerText = e;
        }
    };

    document.getElementById("re").addEventListener("input", run);
    document.getElementById("txt").addEventListener("input", run);
    run();
}());

I tested it a little bit and it seems to work fine, let me know if you find any bugs.

Answer

Another interesting option that I have used before is to OR every character expected with the $ symbol. This may not work well for every case but for cases where you are looking at specific characters and need a partial match on each character, this works.

For example (in Javascript):

var reg = /^(o|$)(n|$)(e|$)(\s|$)$/;

reg.test('')      -> true;
reg.test('o')     -> true;
reg.test('on')    -> true;
reg.test('one')   -> true;
reg.test('one ')  -> true;
reg.test('one t') -> false;
reg.test('x')     -> false;
reg.test('n')     -> false;
reg.test('e')     -> false;
reg.test(' ')     -> false;

While this isn't the prettiest regex, it is repeatable so if you need to generate it dynamically for some reason, you know the general pattern.

The same pattern can be applied to whole words as well, which probably isn't as helpful because they couldn't type one-by-one to get to these points.

var reg = /^(one|$)(\stwo|$)$/;

reg.test('')        -> true;
reg.test('one')     -> true;
reg.test('one ')    -> false;
reg.test('one two') -> true;
Answer

Your original question was simply testing a string's placement within another, specifically, the start. The fastest way to do that would be to use substr on the match string, followed by indexOf. I've updated my original answer to reflect that:

function check(str){
  return 'one two three'.substr(0, str.length) === str;
};
console.log(check('one'));           // true
console.log(check('one two'));       // true
console.log(check('one two three')); // true
console.log(check('one three'));     // false

If you need case-insensitivity, it's still fastest to simply toLowerCase the match & input strings. (If interested, here's a jsperf of substr, indexOf and RegExp testing for the start of a string, case-insensitively: http://jsperf.com/substr-vs-indexof-vs-regex )

Answer

I have found a npm package with a JavaScript implementation of RegEx with incremental regular expression matching: https://www.npmjs.com/package/incr-regex-package. It looks like it is worth checking out. It can report DONE, MORE, MAYBE and FAILED results for a given input.

There is also an example implementation of an input component for React here: https://www.npmjs.com/package/react-maskedinput. It uses {RXInputMask} from incr-regex-package to provide a more user input focused view of interacting with the RegEx library.

Answer

That's actually quite an interesting question.

Personally, I'd do it with the RegExp constructor, so the query can be broken down:

var input = "one ";
var query = "^one two three";
var q_len = query.length;
var match;
for( var i=1; i<=q_len; i++) {
    match = input.match(new RegExp(query.substr(0,i));
    if( !match) {alert("Pattern does NOT match"); break;}
    if( match[0].length == input.length) {alert("Input is promising!"); break;}
    // else continue to the next iteration
}

Obviously you can handle the "exact match" case up front to avoid the whole loop thing. If the whole pattern matches the input, then you're all good.

EDIT: I've just realised that this will not work for groups and stuff. It will fail on account of malformed regexes, but I hope it can serve as a basis for your query.

Answer

EDIT: after you edited your post the clarify, this answer might not be relevant to your specific question. Leaving it here for reference.


Inside the regex itself, on the subject of failing quickly once you know you won't find anything:

The anchor in your regex means that once the regex engine reaches the h of three, it will stop looking for matches and not try to start a match on the second character. On this case, I don't think you can do better than that (but it's already linear complexity, so not so bad).

On other cases I believe you have some usual best practices to learn in order to fail as quickly as possible when you know a match can't be found anymore.

You can look at possessive quantifiers if you don't already know them, but there are many other tricks...

Answer

Here's a simple proof of concept in a jsFiddle. You basically just loop through the proposed regex in reverse and look for the longest sub-match.

Note: this has a known issue in that it doesn't always handle groups well. For example, it will say foo bar b doesn't match foo( bar)+ at all, when it ought to say there's still hope. This could be fixed by getting much more creative with the following line:

temp_re = new RegExp(regex_part+'$'); // make sure this partial match ends in a piece that could still match the entire regex

Basically, you would need to parse the partial regex to see if it ends in a group, then check recursively for partial matches against that ending group. That's rather complicated, so I'm not getting into it for purposes of my demo.

JavaScript (using jQuery only for demo purposes):

var strings = {
    matches: "Matches!",
    stillhope: "There's still hope...",
    mismatch: "Does not match",
    empty: "No text yet"
};

// Object to handle watching for partial matches
function PartialRegexMonitor(regex, input_id) {
    var self = this;
    this.relen = regex.length;
    $('body').on('keyup', '#'+input_id, function() {
         self.update_test_results(regex, input_id);
    });
}

PartialRegexMonitor.prototype.update_test_results = function(regex, input_id) {
    var input = $('#'+input_id).val(),
        matches = find_partial_matches(regex, input),
        span = $('#'+input_id).siblings('.results');
    span.removeClass('match');
    span.removeClass('stillhope');
    span.removeClass('mismatch');
    span.removeClass('empty');
    span.addClass(matches.type)
        .html(strings[matches.type]);
}

// Test a partial regex against a string
function partial_match_tester(regex_part, str) {
    var matched = false;
    try {
        var re = new RegExp(regex_part, 'g'),
            matches = str.match(re),
            match_count = matches.length,
            temp_re;
        for(var i = 0; i < match_count; i++) {
            temp_re = new RegExp(regex_part+'$'); // make sure this partial match ends in a piece that could still match the entire regex
            matched = temp_re.test(str);
            if(matched) break;
        }
    }
    catch(e) {
    }
    return matched;
}

// Find the longest matching partial regex
function find_partial_matches(regex, str) {
    var relen = regex.length,
        matched = false,
        matches = {type: 'mismatch',
                   len: 0},
        regex_part = '';
    if(str.length == 0) {
        matches.type = 'empty';
        return matches;
    }

    for(var i=relen; i>=1; i--) {
        if(i==1 && str[0] == '^') { // don't allow matching against only '^'
            continue;
        }
        regex_part = regex.substr(0,i);
        // replace {\d}$ with {0,\d} for testing
        regex_part = regex_part.replace(/\{(\d)\}$/g, '{0,$1}');

        matched = partial_match_tester(regex_part, str);
        if(matched) {
            matches.type = (i==relen ? 'matches' : 'stillhope');
            console.log(matches.type + ": "+regex.substr(0,i)+" "+str);
            matches.len = i;
            break;
        }
    }
    return matches;
}

// Demo
$(function() {
    new PartialRegexMonitor('foo bar', 'input_0');
    new PartialRegexMonitor('^foo bar$', 'input_1');
    new PartialRegexMonitor('^fo+(\\s*b\\S[rz])+$', 'input_2');
    new PartialRegexMonitor('^\\d{3}-\\d{3}-\\d{4}$', 'input_3');
});

HTML for the demo:

<p>
Test against <span class="regex">foo bar</span>:<br/>
    <input type="text" id="input_0" />
    <span class="results empty">No text yet</span>
</p>
<p>
    Test against <span class="regex">^foo bar$</span>:<br/>
    <input type="text" id="input_1" />
    <span class="results empty">No text yet</span>
</p>
<p>
    Test against <span class="regex">^fo+(\s*b\S[rz])+$</span> (e.g., "foo bar", "foo baz", "foo bar baz"):<br/>
    <input type="text" id="input_2" />
    <span class="results empty">No text yet</span>
</p>
<p>
    Test against <span class="regex">^\d{3}-\d{3}-\d{4}$</span>:<br/>
    <input type="text" id="input_3" />
    <span class="results empty">No text yet</span>
</p>

CSS for the demo

.empty {
    background-color: #eeeeee;
}
.matches {
    background-color: #ccffcc;
    font-weight: bold;
}
.stillhope {
    background-color: #ccffff;
}
.mismatch {
    background-color: #ffcccc;
    font-weight: bold;
}
.regex {
    border-top:1px solid #999;
    border-bottom:1px solid #999;
    font-family: Courier New, monospace;
    background-color: #eee;
    color: #666;
}

Sample screenshot from the demo

enter image description here

Answer

Basing on @Andacious answer, I've made my own, a little more advanced and it seems to work.

I've made method that modifies regex making it accepting promising answers.

It replaces any literal char with "(?:OLD_LETTER|$)" so k becomes (?:k|$) looking for matching letter or end of input.

It also looks for parts not to replace like {1,2} and leave them as they are.

I'm sure it's not complete, but its very easy to add new rules of checking and main trick with any_sign or end of input seems to work in any case as end of string match and dont continue so basically we need to make such modification to main regex, that any literal char or group of chars will have alternative |$ and every syntax (that sometimes seems to have literal chars too) can't be destroyed.

RegExp.prototype.promising = function(){
    var source = this.source;
    var regexps = {
        replaceCandidates : /(\{[\d,]\})|(\[[\w-]+\])|((?:\\\w))|([\w\s-])/g,   //parts of regexp that may be replaced
        dontReplace : /\{[\d,]\}/g,     //dont replace those
    }

    source =  source.replace(regexps.replaceCandidates, function(n){ 
        if ( regexps.dontReplace.test(n) ) return n;
        return "(?:" + n + "|$)";
    });


    source = source.replace(/\s/g, function(s){
        return "(?:" + s + "|$)";
    });

    return new RegExp(source);

}

Test on jsFiddle

Answer

Still not 100% sure what you are asking for, but you can also nest them, like this:

var regexp = /^(one)+((\s+two)+((\s+three)+((\s+four)+)?)?)?$/;

Matches:

  • one
  • one two two
  • one two two three
  • one two two three three four

Does not match:

  • one three
Answer

Not sure if there is a way to do this with regex without making an extremely complicated pattern. But if you are just checking against a string then you can do something like this:

function matchPartialFromBeginning(pattern, value) {
    if(pattern.length == value.length)
        return pattern == value;
    if(pattern.length > value.length)
        return pattern.substr(0, value.length) == value;
    return false;
}
Answer

I think there is no automatic "Apply to every Pattern"-Solution. However you can manually derive an "optional" pattern out of your regex, and check against both patterns:

var fullPattern = /^one (two)+ three/;
var optPattern = /^o?n?e? ?(t?w?o?)+ ?t?h?r?e?e?$/;

//verbal
if (fullPattern.matches()){
   //matched
} else {
  if (optPattern.matches()){
     //looks promising
  }else{
     //no match.
  }
}

You could also implement your own optionalizePattern() method, that converts Regex Pattern into such an Optional derivate. (Will be a difficult task to cover ALL possible patterns. Dunno, if possible at all.)

Tags

Recent Questions

Top Questions

Home Tags Terms of Service Privacy Policy DMCA Contact Us

©2020 All rights reserved.