Use newline index for regexp content searches

For content searches trying to match multiple terms on the same line, we
check whether the matches of the individual terms intersect before
calling the regex engine. If they don't intersect, we skip the document.

This optimisation is useful whenever terms of the query appear often in
the same document but rarely on the same line.

The table below shows latencies of search queries of master vs.
this change on a set of medium-sized repos. All latencies are
reported in ms.

+-------------------------+-------------+-----------------+-----------+
| Query                   | Master (ms) | Patchset 7 (ms) | Delta (%) |
+-------------------------+-------------+-----------------+-----------+
| (func).*?(rtoorg)       | 21.43       | 2.67            | 87.56     |
+-------------------------+-------------+-----------------+-----------+
| (func).*?(bool).*?(bar) | 723.60      | 170.1           | 76.49     |
+-------------------------+-------------+-----------------+-----------+
| (func).*?(bool).*?      | 1320        | 186.2           | 85.89     |
| (int32).*?(error)       |             |                 |           |
+-------------------------+-------------+-----------------+-----------+
| (func).*?(return)       | 236.97      | 73.73           | 68.88     |
+-------------------------+-------------+-----------------+-----------+
| (config).*?(override)   | 827.90      | 328.67          | 60.30     |
+-------------------------+-------------+-----------------+-----------+

Change-Id: Ic4e4c3e7a4ad61ec56efada35a70aa14d0766ff7
diff --git a/api.go b/api.go
index af22e6c..bec0e51 100644
--- a/api.go
+++ b/api.go
@@ -131,6 +131,9 @@
 
 	// Wall clock time for queued search.
 	Wait time.Duration
+
+	// Number of times regexp was called on files that we evaluated.
+	RegexpsConsidered int
 }
 
 func (s *Stats) Add(o Stats) {
diff --git a/eval.go b/eval.go
index 616f738..37108c0 100644
--- a/eval.go
+++ b/eval.go
@@ -18,6 +18,7 @@
 	"context"
 	"fmt"
 	"log"
+	"regexp/syntax"
 	"sort"
 	"strings"
 
@@ -418,3 +419,84 @@
 	}
 	return l, nil
 }
+
+// regexpToMatchTreeRecursive converts a regular expression to a matchTree mt. If
+// mt is equivalent to the input r, isEqual = true and the matchTree can be used
+// in place of the regex r. If singleLine = true, then the matchTree and all
+// its children only match terms on the same line. singleLine is used during
+// recursion to decide whether to return an andLineMatchTree (singleLine = true)
+// or a andMatchTree (singleLine = false).
+func (d *indexData) regexpToMatchTreeRecursive(r *syntax.Regexp, minTextSize int, fileName bool, caseSensitive bool) (mt matchTree, isEqual bool, singleLine bool, err error) {
+	// TODO - we could perhaps transform Begin/EndText in '\n'?
+	// TODO - we could perhaps transform CharClass in (OrQuery )
+	// if there are just a few runes, and part of a OpConcat?
+	switch r.Op {
+	case syntax.OpLiteral:
+		s := string(r.Rune)
+		if len(s) >= minTextSize {
+			mt, err := d.newSubstringMatchTree(&query.Substring{Pattern: s, FileName: fileName, CaseSensitive: caseSensitive})
+			return mt, true, !strings.Contains(s, "\n"), err
+		}
+	case syntax.OpCapture:
+		return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
+
+	case syntax.OpPlus:
+		return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
+
+	case syntax.OpRepeat:
+		if r.Min >= 1 {
+			return d.regexpToMatchTreeRecursive(r.Sub[0], minTextSize, fileName, caseSensitive)
+		}
+
+	case syntax.OpConcat, syntax.OpAlternate:
+		var qs []matchTree
+		isEq := true
+		singleLine = true
+		for _, sr := range r.Sub {
+			if sq, subIsEq, subSingleLine, err := d.regexpToMatchTreeRecursive(sr, minTextSize, fileName, caseSensitive); sq != nil {
+				if err != nil {
+					return nil, false, false, err
+				}
+				isEq = isEq && subIsEq
+				singleLine = singleLine && subSingleLine
+				qs = append(qs, sq)
+			}
+		}
+		if r.Op == syntax.OpConcat {
+			if len(qs) > 1 {
+				isEq = false
+			}
+			newQs := make([]matchTree, 0, len(qs))
+			for _, q := range qs {
+				if _, ok := q.(*bruteForceMatchTree); ok {
+					continue
+				}
+				newQs = append(newQs, q)
+			}
+			if len(newQs) == 1 {
+				return newQs[0], isEq, singleLine, nil
+			}
+			if len(newQs) == 0 {
+				return &bruteForceMatchTree{}, isEq, singleLine, nil
+			}
+			if singleLine {
+				return &andLineMatchTree{andMatchTree{children: newQs}}, isEq, singleLine, nil
+			}
+			return &andMatchTree{newQs}, isEq, singleLine, nil
+		}
+		for _, q := range qs {
+			if _, ok := q.(*bruteForceMatchTree); ok {
+				return q, isEq, false, nil
+			}
+		}
+		if len(qs) == 0 {
+			return &noMatchTree{"const"}, isEq, false, nil
+		}
+		return &orMatchTree{qs}, isEq, false, nil
+	case syntax.OpStar:
+		if r.Sub[0].Op == syntax.OpAnyCharNotNL {
+			return &bruteForceMatchTree{}, false, true, nil
+		}
+	}
+	return &bruteForceMatchTree{}, false, false, nil
+}
diff --git a/eval_test.go b/eval_test.go
new file mode 100644
index 0000000..105e1a3
--- /dev/null
+++ b/eval_test.go
@@ -0,0 +1,125 @@
+// Copyright 2020 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package zoekt
+
+import (
+	"reflect"
+	"regexp/syntax"
+	"strings"
+	"testing"
+
+	"github.com/google/zoekt/query"
+)
+
+var opnames = map[syntax.Op]string{
+	syntax.OpNoMatch:        "OpNoMatch",
+	syntax.OpEmptyMatch:     "OpEmptyMatch",
+	syntax.OpLiteral:        "OpLiteral",
+	syntax.OpCharClass:      "OpCharClass",
+	syntax.OpAnyCharNotNL:   "OpAnyCharNotNL",
+	syntax.OpAnyChar:        "OpAnyChar",
+	syntax.OpBeginLine:      "OpBeginLine",
+	syntax.OpEndLine:        "OpEndLine",
+	syntax.OpBeginText:      "OpBeginText",
+	syntax.OpEndText:        "OpEndText",
+	syntax.OpWordBoundary:   "OpWordBoundary",
+	syntax.OpNoWordBoundary: "OpNoWordBoundary",
+	syntax.OpCapture:        "OpCapture",
+	syntax.OpStar:           "OpStar",
+	syntax.OpPlus:           "OpPlus",
+	syntax.OpQuest:          "OpQuest",
+	syntax.OpRepeat:         "OpRepeat",
+	syntax.OpConcat:         "OpConcat",
+	syntax.OpAlternate:      "OpAlternate",
+}
+
+func printRegexp(t *testing.T, r *syntax.Regexp, lvl int) {
+	t.Logf("%s%s ch: %d", strings.Repeat(" ", lvl), opnames[r.Op], len(r.Sub))
+	for _, s := range r.Sub {
+		printRegexp(t, s, lvl+1)
+	}
+}
+
+func substrMT(pattern string) matchTree {
+	d := &indexData{}
+	mt, _ := d.newSubstringMatchTree(&query.Substring{
+		Pattern: pattern,
+	})
+	return mt
+}
+
+func TestRegexpParse(t *testing.T) {
+	type testcase struct {
+		in           string
+		query        matchTree
+		isEquivalent bool
+	}
+
+	cases := []testcase{
+		{"(foo|)bar", substrMT("bar"), false},
+		{"(foo|)", &bruteForceMatchTree{}, false},
+		{"(foo|bar)baz.*bla", &andMatchTree{[]matchTree{
+			&orMatchTree{[]matchTree{
+				substrMT("foo"),
+				substrMT("bar"),
+			}},
+			substrMT("baz"),
+			substrMT("bla"),
+		}}, false},
+		{"^[a-z](People)+barrabas$",
+			&andMatchTree{[]matchTree{
+				substrMT("People"),
+				substrMT("barrabas"),
+			}}, false},
+		{"foo", substrMT("foo"), true},
+		{"^foo", substrMT("foo"), false},
+		{"(foo) (bar)", &andMatchTree{[]matchTree{substrMT("foo"), substrMT("bar")}}, false},
+		{"(thread|needle|haystack)", &orMatchTree{[]matchTree{
+			substrMT("thread"),
+			substrMT("needle"),
+			substrMT("haystack"),
+		}}, true},
+		{"(foo)(?-s:.)*?(bar)", &andLineMatchTree{andMatchTree{[]matchTree{
+			substrMT("foo"),
+			substrMT("bar"),
+		}}}, false},
+		{"(foo)(?-s:.)*?[[:space:]](?-s:.)*?(bar)", &andMatchTree{[]matchTree{
+			substrMT("foo"),
+			substrMT("bar"),
+		}}, false},
+		{"(...)(...)", &bruteForceMatchTree{}, false},
+	}
+
+	for _, c := range cases {
+		r, err := syntax.Parse(c.in, syntax.Perl)
+		if err != nil {
+			t.Errorf("Parse(%q): %v", c.in, err)
+			continue
+		}
+		d := indexData{}
+		q := query.Regexp{
+			Regexp: r,
+		}
+		gotQuery, isEq, _, _ := d.regexpToMatchTreeRecursive(q.Regexp, 3, q.FileName, q.CaseSensitive)
+		if !reflect.DeepEqual(c.query, gotQuery) {
+			printRegexp(t, r, 0)
+			t.Errorf("regexpToQuery(%q): got %v, want %v", c.in, gotQuery, c.query)
+		}
+		if isEq != c.isEquivalent {
+			printRegexp(t, r, 0)
+			t.Errorf("regexpToQuery(%q): got %v, want %v", c.in, isEq, c.isEquivalent)
+		}
+	}
+}
diff --git a/index_test.go b/index_test.go
index b8c638d..d731444 100644
--- a/index_test.go
+++ b/index_test.go
@@ -1859,3 +1859,71 @@
 		}
 	}
 }
+
+func TestLineAnd(t *testing.T) {
+	b := testIndexBuilder(t, &Repository{Name: "reponame"},
+		Document{Name: "f1", Content: []byte("apple\nbanana\napple banana chocolate apple pudding banana\ngrape")},
+		Document{Name: "f2", Content: []byte("apple orange\nbanana")},
+		Document{Name: "f3", Content: []byte("banana grape")},
+	)
+	pattern := "(apple)(?-s:.)*?(banana)"
+	r, _ := syntax.Parse(pattern, syntax.Perl)
+
+	q := query.Regexp{
+		Regexp:  r,
+		Content: true,
+	}
+	res := searchForTest(t, b, &q)
+	wantRegexpCount := 1
+	if gotRegexpCount := res.RegexpsConsidered; gotRegexpCount != wantRegexpCount {
+		t.Errorf("got %d, wanted %d", gotRegexpCount, wantRegexpCount)
+	}
+	if len(res.Files) != 1 || res.Files[0].FileName != "f1" {
+		t.Errorf("got %v, want 1 result", res.Files)
+	}
+}
+
+func TestLineAndFileName(t *testing.T) {
+	b := testIndexBuilder(t, &Repository{Name: "reponame"},
+		Document{Name: "f1", Content: []byte("apple banana\ngrape")},
+		Document{Name: "f2", Content: []byte("apple banana\norange")},
+		Document{Name: "apple banana", Content: []byte("banana grape")},
+	)
+	pattern := "(apple)(?-s:.)*?(banana)"
+	r, _ := syntax.Parse(pattern, syntax.Perl)
+
+	q := query.Regexp{
+		Regexp:   r,
+		FileName: true,
+	}
+	res := searchForTest(t, b, &q)
+	wantRegexpCount := 1
+	if gotRegexpCount := res.RegexpsConsidered; gotRegexpCount != wantRegexpCount {
+		t.Errorf("got %d, wanted %d", gotRegexpCount, wantRegexpCount)
+	}
+	if len(res.Files) != 1 || res.Files[0].FileName != "apple banana" {
+		t.Errorf("got %v, want 1 result", res.Files)
+	}
+}
+
+func TestMultiLineRegex(t *testing.T) {
+	b := testIndexBuilder(t, &Repository{Name: "reponame"},
+		Document{Name: "f1", Content: []byte("apple banana\ngrape")},
+		Document{Name: "f2", Content: []byte("apple orange")},
+		Document{Name: "f3", Content: []byte("grape apple")},
+	)
+	pattern := "(apple).*?[[:space:]].*?(grape)"
+	r, _ := syntax.Parse(pattern, syntax.Perl)
+
+	q := query.Regexp{
+		Regexp: r,
+	}
+	res := searchForTest(t, b, &q)
+	wantRegexpCount := 2
+	if gotRegexpCount := res.RegexpsConsidered; gotRegexpCount != wantRegexpCount {
+		t.Errorf("got %d, wanted %d", gotRegexpCount, wantRegexpCount)
+	}
+	if len(res.Files) != 1 || res.Files[0].FileName != "f1" {
+		t.Errorf("got %v, want 1 result", res.Files)
+	}
+}
diff --git a/matchtree.go b/matchtree.go
index 750f999..0bb80cf 100644
--- a/matchtree.go
+++ b/matchtree.go
@@ -90,6 +90,10 @@
 	docID     uint32
 }
 
+type andLineMatchTree struct {
+	andMatchTree
+}
+
 type andMatchTree struct {
 	children []matchTree
 }
@@ -300,6 +304,8 @@
 		for _, ch := range s.children {
 			visitMatchTree(ch, f)
 		}
+	case *andLineMatchTree:
+		visitMatchTree(&s.andMatchTree, f)
 	case *noVisitMatchTree:
 		visitMatchTree(s.matchTree, f)
 	case *notMatchTree:
@@ -319,6 +325,8 @@
 				visitMatches(ch, known, f)
 			}
 		}
+	case *andLineMatchTree:
+		visitMatches(&s.andMatchTree, known, f)
 	case *orMatchTree:
 		for _, ch := range s.children {
 			if known[ch] {
@@ -343,6 +351,88 @@
 	return true, true
 }
 
+// andLineMatchTree is a performance optimization of andMatchTree. For content
+// searches we don't want to run the regex engine if there is no line that
+// contains matches from all terms.
+func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
+	matches, sure := t.andMatchTree.matches(cp, cost, known)
+	if !(sure && matches) {
+		return matches, sure
+	}
+
+	// find child with fewest candidates
+	min := maxUInt32
+	fewestChildren := 0
+	for ix, child := range t.children {
+		v, ok := child.(*substrMatchTree)
+		// make sure we are running a content search and that all candidates are a
+		// substrMatchTree
+		if !ok || v.fileName {
+			return matches, sure
+		}
+		if len(v.current) < min {
+			min = len(v.current)
+			fewestChildren = ix
+		}
+	}
+
+	type lineRange struct {
+		start int
+		end   int
+	}
+	lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
+	prev := -1
+	for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
+		line, byteStart, byteEnd := candidate.line(cp.newlines(), cp.fileSize)
+		if line == prev {
+			continue
+		}
+		prev = line
+		lines = append(lines, lineRange{byteStart, byteEnd})
+	}
+
+	// children keeps track of the children's candidates we have already seen.
+	children := make([][]*candidateMatch, 0, len(t.children)-1)
+	for j, child := range t.children {
+		if j == fewestChildren {
+			continue
+		}
+		children = append(children, child.(*substrMatchTree).current)
+	}
+
+nextLine:
+	for i := 0; i < len(lines); i++ {
+		hits := 1
+	nextChild:
+		for j := range children {
+		nextCandidate:
+			for len(children[j]) > 0 {
+				candidate := children[j][0]
+				bo := int(cp.findOffset(false, candidate.runeOffset))
+				if bo < lines[i].start {
+					children[j] = children[j][1:]
+					continue nextCandidate
+				}
+				if bo <= lines[i].end {
+					hits++
+					continue nextChild
+				}
+				// move the `lines` iterator forward until bo <= line.end
+				for i < len(lines) && bo > lines[i].end {
+					i++
+				}
+				i--
+				continue nextLine
+			}
+		}
+		// return early once we found any line that contains matches from all children
+		if hits == len(t.children) {
+			return matches, true
+		}
+	}
+	return false, true
+}
+
 func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 	sure := true
 
@@ -355,6 +445,7 @@
 			sure = false
 		}
 	}
+
 	return true, sure
 }
 
@@ -388,6 +479,7 @@
 		return false, false
 	}
 
+	cp.stats.RegexpsConsidered++
 	idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
 	found := t.found[:0]
 	for _, idx := range idxs {
@@ -496,20 +588,15 @@
 	}
 	switch s := q.(type) {
 	case *query.Regexp:
-		subQ, isEq := query.RegexpToQuery(s.Regexp, ngramSize)
-		subQ = query.Map(subQ, func(q query.Q) query.Q {
-			if sub, ok := q.(*query.Substring); ok {
-				sub.FileName = s.FileName
-				sub.CaseSensitive = s.CaseSensitive
-			}
-			return q
-		})
-
-		subMT, err := d.newMatchTree(subQ)
+		// RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
+		// superset of the regexp. If the returned matchTree is equivalent to the
+		// original regexp, it returns true. An equivalent matchTree has the same
+		// behaviour as the original regexp and can be used instead.
+		//
+		subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
 		if err != nil {
 			return nil, err
 		}
-
 		// if the query can be used in place of the regexp
 		// return the subtree
 		if isEq {
diff --git a/query/regexp.go b/query/regexp.go
index 426022d..64c9def 100644
--- a/query/regexp.go
+++ b/query/regexp.go
@@ -42,57 +42,3 @@
 
 	return &newRE
 }
-
-// RegexpToQuery tries to distill a substring search query that
-// matches a superset of the regexp.
-// If the returned query is equivalent with the original regexp,
-// it returns true. An equivalent query has the same behaviour as the original
-// regexp and can be used instead.
-func RegexpToQuery(r *syntax.Regexp, minTextSize int) (query Q, isEquivalent bool) {
-	q, isEq := regexpToQueryRecursive(r, minTextSize)
-	q = Simplify(q)
-	return q, isEq
-}
-
-func regexpToQueryRecursive(r *syntax.Regexp, minTextSize int) (query Q, isEquivalent bool) {
-	// TODO - we could perhaps transform Begin/EndText in '\n'?
-	// TODO - we could perhaps transform CharClass in (OrQuery )
-	// if there are just a few runes, and part of a OpConcat?
-	switch r.Op {
-	case syntax.OpLiteral:
-		s := string(r.Rune)
-		if len(s) >= minTextSize {
-			return &Substring{Pattern: s}, true
-		}
-	case syntax.OpCapture:
-		return regexpToQueryRecursive(r.Sub[0], minTextSize)
-
-	case syntax.OpPlus:
-		return regexpToQueryRecursive(r.Sub[0], minTextSize)
-
-	case syntax.OpRepeat:
-		if r.Min >= 1 {
-			return regexpToQueryRecursive(r.Sub[0], minTextSize)
-		}
-
-	case syntax.OpConcat, syntax.OpAlternate:
-		var qs []Q
-		isEq := true
-		for _, sr := range r.Sub {
-			if sq, sm := regexpToQueryRecursive(sr, minTextSize); sq != nil {
-				if !sm {
-					isEq = false
-				}
-				qs = append(qs, sq)
-			}
-		}
-		if r.Op == syntax.OpConcat {
-			if len(qs) > 1 {
-				isEq = false
-			}
-			return &And{qs}, isEq
-		}
-		return &Or{qs}, isEq
-	}
-	return &Const{true}, false
-}
diff --git a/query/regexp_test.go b/query/regexp_test.go
index 87780e5..a3a12a0 100644
--- a/query/regexp_test.go
+++ b/query/regexp_test.go
@@ -15,7 +15,6 @@
 package query
 
 import (
-	"reflect"
 	"regexp/syntax"
 	"strings"
 	"testing"
@@ -50,58 +49,6 @@
 	}
 }
 
-func TestRegexpParse(t *testing.T) {
-	type testcase struct {
-		in           string
-		query        Q
-		isEquivalent bool
-	}
-
-	cases := []testcase{
-		{"(foo|)bar", &Substring{Pattern: "bar"}, false},
-		{"(foo|)", &Const{true}, false},
-		{"(foo|bar)baz.*bla", &And{[]Q{
-			&Or{[]Q{
-				&Substring{Pattern: "foo"},
-				&Substring{Pattern: "bar"},
-			}},
-			&Substring{Pattern: "baz"},
-			&Substring{Pattern: "bla"},
-		}}, false},
-		{"^[a-z](People)+barrabas$",
-			&And{[]Q{
-				&Substring{Pattern: "People"},
-				&Substring{Pattern: "barrabas"},
-			}}, false},
-		{"foo", &Substring{Pattern: "foo"}, true},
-		{"^foo", &Substring{Pattern: "foo"}, false},
-		{"(foo) (bar)", &And{[]Q{&Substring{Pattern: "foo"}, &Substring{Pattern: "bar"}}}, false},
-		{"(thread|needle|haystack)", &Or{[]Q{
-			&Substring{Pattern: "thread"},
-			&Substring{Pattern: "needle"},
-			&Substring{Pattern: "haystack"},
-		}}, true},
-	}
-
-	for _, c := range cases {
-		r, err := syntax.Parse(c.in, syntax.Perl)
-		if err != nil {
-			t.Errorf("Parse(%q): %v", c.in, err)
-			continue
-		}
-
-		query, isEq := RegexpToQuery(r, 3)
-		if !reflect.DeepEqual(c.query, query) {
-			printRegexp(t, r, 0)
-			t.Errorf("regexpToQuery(%q): got %v, want %v", c.in, query, c.query)
-		}
-		if isEq != c.isEquivalent {
-			printRegexp(t, r, 0)
-			t.Errorf("regexpToQuery(%q): got %v, want %v", c.in, isEq, c.isEquivalent)
-		}
-	}
-}
-
 func TestLowerRegexp(t *testing.T) {
 	in := "[a-zA-Z]fooBAR"
 	re := mustParseRE(in)