Improve performance for concatened queries

This modifies the behaviour of the regexpToQuery method to make it return whether the generated query has the exact same behaviour as the original regexp.
When that happens, the matchTree method skips creating a regexpMatchTree node, which prevents running the regex engine.

Change-Id: I26532b238fb5f03f2d145fe7ca118ec9eb331b18
diff --git a/matchtree.go b/matchtree.go
index 5b90be5..750f999 100644
--- a/matchtree.go
+++ b/matchtree.go
@@ -496,7 +496,7 @@
 	}
 	switch s := q.(type) {
 	case *query.Regexp:
-		subQ := query.RegexpToQuery(s.Regexp, ngramSize)
+		subQ, isEq := query.RegexpToQuery(s.Regexp, ngramSize)
 		subQ = query.Map(subQ, func(q query.Q) query.Q {
 			if sub, ok := q.(*query.Substring); ok {
 				sub.FileName = s.FileName
@@ -510,6 +510,12 @@
 			return nil, err
 		}
 
+		// if the query can be used in place of the regexp
+		// return the subtree
+		if isEq {
+			return subMT, nil
+		}
+
 		prefix := ""
 		if !s.CaseSensitive {
 			prefix = "(?i)"
diff --git a/matchtree_test.go b/matchtree_test.go
index 7585b6c..07aa4a5 100644
--- a/matchtree_test.go
+++ b/matchtree_test.go
@@ -17,6 +17,8 @@
 import (
 	"reflect"
 	"testing"
+
+	"github.com/google/zoekt/query"
 )
 
 func Test_breakOnNewlines(t *testing.T) {
@@ -152,3 +154,38 @@
 		})
 	}
 }
+
+func TestEquivalentQuerySkipRegexpTree(t *testing.T) {
+	tests := []struct {
+		query string
+		skip  bool
+	}{
+		{query: "^foo", skip: false},
+		{query: "foo", skip: true},
+		{query: "thread|needle|haystack", skip: true},
+		{query: "contain(er|ing)", skip: false},
+		{query: "thread (needle|haystack)", skip: true},
+		{query: "thread (needle|)", skip: false},
+	}
+
+	for _, tt := range tests {
+		q, err := query.Parse(tt.query)
+		if err != nil {
+			t.Errorf("Error parsing query: %s", "sym:"+tt.query)
+			continue
+		}
+
+		d := &indexData{}
+		mt, err := d.newMatchTree(q)
+		if err != nil {
+			t.Errorf("Error creating match tree from query: %s", q)
+			continue
+		}
+
+		visitMatchTree(mt, func(m matchTree) {
+			if _, ok := m.(*regexpMatchTree); ok && tt.skip {
+				t.Errorf("Expected regexpMatchTree to be skipped for query: %s", q)
+			}
+		})
+	}
+}
diff --git a/query/regexp.go b/query/regexp.go
index 25a75d8..426022d 100644
--- a/query/regexp.go
+++ b/query/regexp.go
@@ -45,13 +45,16 @@
 
 // RegexpToQuery tries to distill a substring search query that
 // matches a superset of the regexp.
-func RegexpToQuery(r *syntax.Regexp, minTextSize int) Q {
-	q := regexpToQueryRecursive(r, minTextSize)
+// If the returned query is equivalent with the original regexp,
+// it returns true. An equivalent query has the same behaviour as the original
+// regexp and can be used instead.
+func RegexpToQuery(r *syntax.Regexp, minTextSize int) (query Q, isEquivalent bool) {
+	q, isEq := regexpToQueryRecursive(r, minTextSize)
 	q = Simplify(q)
-	return q
+	return q, isEq
 }
 
-func regexpToQueryRecursive(r *syntax.Regexp, minTextSize int) Q {
+func regexpToQueryRecursive(r *syntax.Regexp, minTextSize int) (query Q, isEquivalent bool) {
 	// TODO - we could perhaps transform Begin/EndText in '\n'?
 	// TODO - we could perhaps transform CharClass in (OrQuery )
 	// if there are just a few runes, and part of a OpConcat?
@@ -59,7 +62,7 @@
 	case syntax.OpLiteral:
 		s := string(r.Rune)
 		if len(s) >= minTextSize {
-			return &Substring{Pattern: s}
+			return &Substring{Pattern: s}, true
 		}
 	case syntax.OpCapture:
 		return regexpToQueryRecursive(r.Sub[0], minTextSize)
@@ -74,15 +77,22 @@
 
 	case syntax.OpConcat, syntax.OpAlternate:
 		var qs []Q
+		isEq := true
 		for _, sr := range r.Sub {
-			if sq := regexpToQueryRecursive(sr, minTextSize); sq != nil {
+			if sq, sm := regexpToQueryRecursive(sr, minTextSize); sq != nil {
+				if !sm {
+					isEq = false
+				}
 				qs = append(qs, sq)
 			}
 		}
 		if r.Op == syntax.OpConcat {
-			return &And{qs}
+			if len(qs) > 1 {
+				isEq = false
+			}
+			return &And{qs}, isEq
 		}
-		return &Or{qs}
+		return &Or{qs}, isEq
 	}
-	return &Const{true}
+	return &Const{true}, false
 }
diff --git a/query/regexp_test.go b/query/regexp_test.go
index ebf9ce6..87780e5 100644
--- a/query/regexp_test.go
+++ b/query/regexp_test.go
@@ -52,13 +52,14 @@
 
 func TestRegexpParse(t *testing.T) {
 	type testcase struct {
-		in   string
-		want Q
+		in           string
+		query        Q
+		isEquivalent bool
 	}
 
 	cases := []testcase{
-		{"(foo|)bar", &Substring{Pattern: "bar"}},
-		{"(foo|)", &Const{true}},
+		{"(foo|)bar", &Substring{Pattern: "bar"}, false},
+		{"(foo|)", &Const{true}, false},
 		{"(foo|bar)baz.*bla", &And{[]Q{
 			&Or{[]Q{
 				&Substring{Pattern: "foo"},
@@ -66,12 +67,20 @@
 			}},
 			&Substring{Pattern: "baz"},
 			&Substring{Pattern: "bla"},
-		}}},
+		}}, false},
 		{"^[a-z](People)+barrabas$",
 			&And{[]Q{
 				&Substring{Pattern: "People"},
 				&Substring{Pattern: "barrabas"},
-			}}},
+			}}, false},
+		{"foo", &Substring{Pattern: "foo"}, true},
+		{"^foo", &Substring{Pattern: "foo"}, false},
+		{"(foo) (bar)", &And{[]Q{&Substring{Pattern: "foo"}, &Substring{Pattern: "bar"}}}, false},
+		{"(thread|needle|haystack)", &Or{[]Q{
+			&Substring{Pattern: "thread"},
+			&Substring{Pattern: "needle"},
+			&Substring{Pattern: "haystack"},
+		}}, true},
 	}
 
 	for _, c := range cases {
@@ -81,10 +90,14 @@
 			continue
 		}
 
-		got := RegexpToQuery(r, 3)
-		if !reflect.DeepEqual(c.want, got) {
+		query, isEq := RegexpToQuery(r, 3)
+		if !reflect.DeepEqual(c.query, query) {
 			printRegexp(t, r, 0)
-			t.Errorf("regexpToQuery(%q): got %v, want %v", c.in, got, c.want)
+			t.Errorf("regexpToQuery(%q): got %v, want %v", c.in, query, c.query)
+		}
+		if isEq != c.isEquivalent {
+			printRegexp(t, r, 0)
+			t.Errorf("regexpToQuery(%q): got %v, want %v", c.in, isEq, c.isEquivalent)
 		}
 	}
 }