Use filename content for determining byte offsets in filename matches.
Change-Id: I1339ff375e275f13982161fb24db10bb439ce5ad
diff --git a/index_test.go b/index_test.go
index 4819d5a..fc0112e 100644
--- a/index_test.go
+++ b/index_test.go
@@ -1347,3 +1347,25 @@
t.Errorf("got FilesConsidered = %d, want 0", sres.Stats.FilesConsidered)
}
}
+
+func TestUTF8CorrectCorpus(t *testing.T) {
+ needle := "neeedle"
+
+ // "世界" is 2 runes but 6 bytes in UTF-8, so rune and byte offsets differ in f1's content.
+ unicode := "世界"
+ b := testIndexBuilder(t, nil,
+ Document{
+ Name: "f1",
+ Content: []byte(strings.Repeat(unicode, 100)),
+ },
+ Document{
+ Name: "xxxxxneeedle", // the only document name containing the needle
+ Content: []byte("hello"),
+ })
+
+ q := &query.Substring{Pattern: needle, FileName: true} // FileName is set: presumably matches names, not content
+ res := searchForTest(t, b, q)
+ if len(res.Files) != 1 {
+ t.Errorf("got %v, want 1 result", res)
+ }
+}
diff --git a/search.go b/search.go
index 406d964..bc388b3 100644
--- a/search.go
+++ b/search.go
@@ -82,8 +82,9 @@
return p._data
}
-// Find offset in bytes (relative to file start) for an offset in
-// runes (relative to file start).
+// Find offset in bytes (relative to corpus start) for an offset in
+// runes (relative to document start). If filename is set, the corpus
+// is the set of filenames, with the document being the name itself.
func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
sample := p.id.runeOffsets
runeEnds := p.id.fileEndRunes
@@ -103,11 +104,15 @@
left := absR % runeOffsetFrequency
var data []byte
- data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
- if p.err != nil {
- return 0
- }
+ if filename {
+ data = p.id.fileNameContent[byteOff:]
+ } else {
+ data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
+ if p.err != nil {
+ return 0
+ }
+ }
for left > 0 {
_, sz := utf8.DecodeRune(data)
byteOff += uint32(sz)