make LineMatches only contain single lines

Fixes https://github.com/google/zoekt/issues/88.

Change-Id: Ida958551f018b197c7c71412f4f737f6b9b43e95
diff --git a/contentprovider.go b/contentprovider.go
index 1fc3cfe..f6aa94d 100644
--- a/contentprovider.go
+++ b/contentprovider.go
@@ -150,6 +150,7 @@
 			result = []LineMatch{res}
 		}
 	} else {
+		ms = breakMatchesOnNewlines(ms, p.data(false))
 		result = p.fillContentMatches(ms)
 	}
 
diff --git a/index_test.go b/index_test.go
index fca6911..2c0daf4 100644
--- a/index_test.go
+++ b/index_test.go
@@ -176,6 +176,23 @@
 	}
 }
 
+// A result spanning multiple lines should have LineMatches that only cover
+// single lines.
+func TestQueryNewlines(t *testing.T) {
+	text := "line1\nline2\nbla"
+	b := testIndexBuilder(t, nil,
+		Document{Name: "filename", Content: []byte(text)})
+	sres := searchForTest(t, b, &query.Substring{Pattern: "ine2\nbla"})
+	matches := sres.Files
+	if len(matches) != 1 {
+		t.Fatalf("got %d file matches, want exactly one", len(matches))
+	}
+	m := matches[0]
+	if len(m.LineMatches) != 2 {
+		t.Fatalf("got %d line matches, want exactly two", len(m.LineMatches))
+	}
+}
+
 func searchForTest(t *testing.T, b *IndexBuilder, q query.Q, o ...SearchOptions) *SearchResult {
 	searcher := searcherForTest(t, b)
 	var opts SearchOptions
@@ -1197,7 +1214,7 @@
 	sres := searchForTest(t, b, &query.Regexp{Regexp: re, CaseSensitive: true})
 	if len(sres.Files) != 1 {
 		t.Errorf("got %v, wanted 1 matches", sres.Files)
-	} else if l := sres.Files[0].LineMatches[0].Line; !bytes.Equal(l, content) {
+	} else if l := sres.Files[0].LineMatches[0].Line; !bytes.Equal(l, content[len("pqr\n"):]) {
 		t.Errorf("got match line %q, want %q", l, content)
 	}
 }
diff --git a/matchtree.go b/matchtree.go
index 5109f5e..4e86549 100644
--- a/matchtree.go
+++ b/matchtree.go
@@ -391,11 +391,13 @@
 	idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
 	found := t.found[:0]
 	for _, idx := range idxs {
-		found = append(found, &candidateMatch{
+		cm := &candidateMatch{
 			byteOffset:  uint32(idx[0]),
 			byteMatchSz: uint32(idx[1] - idx[0]),
 			fileName:    t.fileName,
-		})
+		}
+
+		found = append(found, cm)
 	}
 	t.found = found
 	t.reEvaluated = true
@@ -403,6 +405,41 @@
 	return len(t.found) > 0, true
 }
 
+// breakMatchesOnNewlines returns matches resulting from breaking each element
+// of cms on newlines within text.
+func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
+	var lineCMs []*candidateMatch
+	for _, cm := range cms {
+		lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
+	}
+	return lineCMs
+}
+
+// breakOnNewlines returns matches resulting from breaking cm on newlines
+// within text.
+func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
+	var cms []*candidateMatch
+	addMe := &candidateMatch{}
+	*addMe = *cm
+	for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
+		if text[i] == '\n' {
+			addMe.byteMatchSz = i - addMe.byteOffset
+			if addMe.byteMatchSz != 0 {
+				cms = append(cms, addMe)
+			}
+
+			addMe = &candidateMatch{}
+			*addMe = *cm
+			addMe.byteOffset = i + 1
+		}
+	}
+	addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
+	if addMe.byteMatchSz != 0 {
+		cms = append(cms, addMe)
+	}
+	return cms
+}
+
 func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) (bool, bool) {
 	if v, ok := known[mt]; ok {
 		return v, true
diff --git a/matchtree_test.go b/matchtree_test.go
new file mode 100644
index 0000000..7585b6c
--- /dev/null
+++ b/matchtree_test.go
@@ -0,0 +1,154 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package zoekt
+
+import (
+	"reflect"
+	"testing"
+)
+
+func Test_breakOnNewlines(t *testing.T) {
+	type args struct {
+		cm   *candidateMatch
+		text []byte
+	}
+	tests := []struct {
+		name string
+		args args
+		want []*candidateMatch
+	}{
+		{
+			name: "trivial case",
+			args: args{
+				cm: &candidateMatch{
+					byteOffset:  0,
+					byteMatchSz: 0,
+				},
+				text: nil,
+			},
+			want: nil,
+		},
+		{
+			name: "no newlines",
+			args: args{
+				cm: &candidateMatch{
+					byteOffset:  0,
+					byteMatchSz: 1,
+				},
+				text: []byte("a"),
+			},
+			want: []*candidateMatch{
+				{
+					byteOffset:  0,
+					byteMatchSz: 1,
+				},
+			},
+		},
+		{
+			name: "newline at start",
+			args: args{
+				cm: &candidateMatch{
+					byteOffset:  0,
+					byteMatchSz: 2,
+				},
+				text: []byte("\na"),
+			},
+			want: []*candidateMatch{
+				{
+					byteOffset:  1,
+					byteMatchSz: 1,
+				},
+			},
+		},
+		{
+			name: "newline at end",
+			args: args{
+				cm: &candidateMatch{
+					byteOffset:  0,
+					byteMatchSz: 2,
+				},
+				text: []byte("a\n"),
+			},
+			want: []*candidateMatch{
+				{
+					byteOffset:  0,
+					byteMatchSz: 1,
+				},
+			},
+		},
+		{
+			name: "newline in middle",
+			args: args{
+				cm: &candidateMatch{
+					byteOffset:  0,
+					byteMatchSz: 3,
+				},
+				text: []byte("a\nb"),
+			},
+			want: []*candidateMatch{
+				{
+					byteOffset:  0,
+					byteMatchSz: 1,
+				},
+				{
+					byteOffset:  2,
+					byteMatchSz: 1,
+				},
+			},
+		},
+		{
+			name: "two newlines",
+			args: args{
+				cm: &candidateMatch{
+					byteOffset:  0,
+					byteMatchSz: 5,
+				},
+				text: []byte("a\nb\nc"),
+			},
+			want: []*candidateMatch{
+				{
+					byteOffset:  0,
+					byteMatchSz: 1,
+				},
+				{
+					byteOffset:  2,
+					byteMatchSz: 1,
+				},
+				{
+					byteOffset:  4,
+					byteMatchSz: 1,
+				},
+			},
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := breakOnNewlines(tt.args.cm, tt.args.text); !reflect.DeepEqual(got, tt.want) {
+				type PrintableCm struct {
+					byteOffset  uint32
+					byteMatchSz uint32
+				}
+				var got2, want2 []PrintableCm
+				for _, g := range got {
+					got2 = append(got2, PrintableCm{byteOffset: g.byteOffset, byteMatchSz: g.byteMatchSz})
+				}
+				for _, w := range tt.want {
+					want2 = append(want2, PrintableCm{byteOffset: w.byteOffset, byteMatchSz: w.byteMatchSz})
+				}
+				t.Errorf("breakMatchOnNewlines() = %+v, want %+v", got2, want2)
+			}
+		})
+	}
+}