make LineMatches only contain single lines
Fixes https://github.com/google/zoekt/issues/88.
Change-Id: Ida958551f018b197c7c71412f4f737f6b9b43e95
diff --git a/contentprovider.go b/contentprovider.go
index 1fc3cfe..f6aa94d 100644
--- a/contentprovider.go
+++ b/contentprovider.go
@@ -150,6 +150,7 @@
result = []LineMatch{res}
}
} else {
+ ms = breakMatchesOnNewlines(ms, p.data(false))
result = p.fillContentMatches(ms)
}
diff --git a/index_test.go b/index_test.go
index fca6911..2c0daf4 100644
--- a/index_test.go
+++ b/index_test.go
@@ -176,6 +176,23 @@
}
}
+// A result spanning multiple lines should have LineMatches that only cover
+// single lines.
+func TestQueryNewlines(t *testing.T) {
+ text := "line1\nline2\nbla"
+ b := testIndexBuilder(t, nil,
+ Document{Name: "filename", Content: []byte(text)})
+ sres := searchForTest(t, b, &query.Substring{Pattern: "ine2\nbla"})
+ matches := sres.Files
+ if len(matches) != 1 {
+ t.Fatalf("got %d file matches, want exactly one", len(matches))
+ }
+ m := matches[0]
+ if len(m.LineMatches) != 2 {
+ t.Fatalf("got %d line matches, want exactly two", len(m.LineMatches))
+ }
+}
+
func searchForTest(t *testing.T, b *IndexBuilder, q query.Q, o ...SearchOptions) *SearchResult {
searcher := searcherForTest(t, b)
var opts SearchOptions
@@ -1197,7 +1214,7 @@
sres := searchForTest(t, b, &query.Regexp{Regexp: re, CaseSensitive: true})
if len(sres.Files) != 1 {
t.Errorf("got %v, wanted 1 matches", sres.Files)
- } else if l := sres.Files[0].LineMatches[0].Line; !bytes.Equal(l, content) {
+ } else if l := sres.Files[0].LineMatches[0].Line; !bytes.Equal(l, content[len("pqr\n"):]) {
t.Errorf("got match line %q, want %q", l, content)
}
}
diff --git a/matchtree.go b/matchtree.go
index 5109f5e..4e86549 100644
--- a/matchtree.go
+++ b/matchtree.go
@@ -391,11 +391,13 @@
idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
found := t.found[:0]
for _, idx := range idxs {
- found = append(found, &candidateMatch{
+ cm := &candidateMatch{
byteOffset: uint32(idx[0]),
byteMatchSz: uint32(idx[1] - idx[0]),
fileName: t.fileName,
- })
+ }
+
+ found = append(found, cm)
}
t.found = found
t.reEvaluated = true
@@ -403,6 +405,41 @@
return len(t.found) > 0, true
}
+// breakMatchesOnNewlines returns matches resulting from breaking each element
+// of cms on newlines within text.
+func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
+ var lineCMs []*candidateMatch
+ for _, cm := range cms {
+ lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
+ }
+ return lineCMs
+}
+
+// breakOnNewlines returns matches resulting from breaking cm on newlines
+// within text.
+func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
+ var cms []*candidateMatch
+ addMe := &candidateMatch{}
+ *addMe = *cm
+ for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
+ if text[i] == '\n' {
+ addMe.byteMatchSz = i - addMe.byteOffset
+ if addMe.byteMatchSz != 0 {
+ cms = append(cms, addMe)
+ }
+
+ addMe = &candidateMatch{}
+ *addMe = *cm
+ addMe.byteOffset = i + 1
+ }
+ }
+ addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
+ if addMe.byteMatchSz != 0 {
+ cms = append(cms, addMe)
+ }
+ return cms
+}
+
func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) (bool, bool) {
if v, ok := known[mt]; ok {
return v, true
diff --git a/matchtree_test.go b/matchtree_test.go
new file mode 100644
index 0000000..7585b6c
--- /dev/null
+++ b/matchtree_test.go
@@ -0,0 +1,154 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package zoekt
+
+import (
+ "reflect"
+ "testing"
+)
+
+func Test_breakOnNewlines(t *testing.T) {
+ type args struct {
+ cm *candidateMatch
+ text []byte
+ }
+ tests := []struct {
+ name string
+ args args
+ want []*candidateMatch
+ }{
+ {
+ name: "trivial case",
+ args: args{
+ cm: &candidateMatch{
+ byteOffset: 0,
+ byteMatchSz: 0,
+ },
+ text: nil,
+ },
+ want: nil,
+ },
+ {
+ name: "no newlines",
+ args: args{
+ cm: &candidateMatch{
+ byteOffset: 0,
+ byteMatchSz: 1,
+ },
+ text: []byte("a"),
+ },
+ want: []*candidateMatch{
+ {
+ byteOffset: 0,
+ byteMatchSz: 1,
+ },
+ },
+ },
+ {
+ name: "newline at start",
+ args: args{
+ cm: &candidateMatch{
+ byteOffset: 0,
+ byteMatchSz: 2,
+ },
+ text: []byte("\na"),
+ },
+ want: []*candidateMatch{
+ {
+ byteOffset: 1,
+ byteMatchSz: 1,
+ },
+ },
+ },
+ {
+ name: "newline at end",
+ args: args{
+ cm: &candidateMatch{
+ byteOffset: 0,
+ byteMatchSz: 2,
+ },
+ text: []byte("a\n"),
+ },
+ want: []*candidateMatch{
+ {
+ byteOffset: 0,
+ byteMatchSz: 1,
+ },
+ },
+ },
+ {
+ name: "newline in middle",
+ args: args{
+ cm: &candidateMatch{
+ byteOffset: 0,
+ byteMatchSz: 3,
+ },
+ text: []byte("a\nb"),
+ },
+ want: []*candidateMatch{
+ {
+ byteOffset: 0,
+ byteMatchSz: 1,
+ },
+ {
+ byteOffset: 2,
+ byteMatchSz: 1,
+ },
+ },
+ },
+ {
+ name: "two newlines",
+ args: args{
+ cm: &candidateMatch{
+ byteOffset: 0,
+ byteMatchSz: 5,
+ },
+ text: []byte("a\nb\nc"),
+ },
+ want: []*candidateMatch{
+ {
+ byteOffset: 0,
+ byteMatchSz: 1,
+ },
+ {
+ byteOffset: 2,
+ byteMatchSz: 1,
+ },
+ {
+ byteOffset: 4,
+ byteMatchSz: 1,
+ },
+ },
+ },
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ if got := breakOnNewlines(tt.args.cm, tt.args.text); !reflect.DeepEqual(got, tt.want) {
+ type PrintableCm struct {
+ byteOffset uint32
+ byteMatchSz uint32
+ }
+ var got2, want2 []PrintableCm
+ for _, g := range got {
+ got2 = append(got2, PrintableCm{byteOffset: g.byteOffset, byteMatchSz: g.byteMatchSz})
+ }
+ for _, w := range tt.want {
+ want2 = append(want2, PrintableCm{byteOffset: w.byteOffset, byteMatchSz: w.byteMatchSz})
+ }
+ t.Errorf("breakMatchOnNewlines() = %+v, want %+v", got2, want2)
+ }
+ })
+ }
+}