Check for binary content using bytes.IndexByte before building postings
In commit 8029ba ("Include files without text on index"), I bailed
halfway in newSearchableString on finding problems. Unfortunately, the
postingsBuilder was not meant to process a file only halfway. This led
to OOB access during the rune/byte translation for shards that had
half-processed files.
Change-Id: I49d6a82bb1e6d4129f03de9a4293fc1cc5dcb4b3
diff --git a/indexbuilder.go b/indexbuilder.go
index e4678d7..40182ce 100644
--- a/indexbuilder.go
+++ b/indexbuilder.go
@@ -15,6 +15,7 @@
package zoekt
import (
+ "bytes"
"encoding/binary"
"fmt"
"hash/crc64"
@@ -64,6 +65,9 @@
}
}
+// Store trigram offsets for the given UTF-8 data. The
+// DocumentSections must correspond to rune boundaries in the UTF-8
+// data.
func (s *postingsBuilder) newSearchableString(data []byte, byteSections []DocumentSection) (*searchableString, []DocumentSection, error) {
dest := searchableString{
data: data,
@@ -87,9 +91,6 @@
if sz > 1 {
s.isPlainASCII = false
}
- if c == 0 {
- return nil, nil, &skipError{fmt.Sprintf("binary content at byte offset %d", byteCount)}
- }
data = data[sz:]
runeGram[0], runeGram[1], runeGram[2] = runeGram[1], runeGram[2], c
@@ -328,19 +329,15 @@
const notIndexedMarker = "NOT-INDEXED: "
-// skipError is an error for conditions that we can record in the index.
-type skipError struct {
- reason string
-}
-
-func (e *skipError) Error() string {
- return e.reason
-}
-
// Add a file which only occurs in certain branches.
func (b *IndexBuilder) Add(doc Document) error {
hasher := crc64.New(crc64.MakeTable(crc64.ISO))
+ if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 {
+ doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx)
+ doc.Language = "binary"
+ }
+
if doc.SkipReason != "" {
doc.Content = []byte(notIndexedMarker + doc.SkipReason)
doc.Symbols = nil
@@ -367,14 +364,7 @@
}
}
docStr, runeSecs, err := b.contentPostings.newSearchableString(doc.Content, doc.Symbols)
- if t, ok := err.(*skipError); err != nil && ok {
- doc.SkipReason = t.reason
- doc.Content = []byte(notIndexedMarker + doc.SkipReason)
- doc.Symbols = nil
- doc.Language = "binary"
-
- docStr, runeSecs, _ = b.contentPostings.newSearchableString(doc.Content, doc.Symbols)
- } else if err != nil {
+ if err != nil {
return err
}
nameStr, _, err := b.namePostings.newSearchableString([]byte(doc.Name), nil)