Remove indexes with absolute file offsets
This fixes a potential bug in which findOffset is incorrect if the
content sections are rearranged so that they are no longer at the
start of the index shard.
Change-Id: Id2f3885883e54d4822433cbba9926c5aa43f91df
diff --git a/index.go b/index.go
index a2d1655..d259fdf 100644
--- a/index.go
+++ b/index.go
@@ -33,12 +33,16 @@
ngrams map[ngram]simpleSection
- newlinesIndex []uint32
+ newlinesStart uint32
+ newlinesIndex []uint32
+
+ docSectionsStart uint32
docSectionsIndex []uint32
runeOffsets []uint32
// offsets of file contents. Includes end of last file.
- boundaries []uint32
+ boundariesStart uint32
+ boundaries []uint32
fileEnds []uint32
fileEndRunes []uint32
diff --git a/read.go b/read.go
index 8b212b4..5db0074 100644
--- a/read.go
+++ b/read.go
@@ -139,22 +139,25 @@
return nil, err
}
- d.boundaries = toc.fileContents.absoluteIndex()
- d.newlinesIndex = toc.newlines.absoluteIndex()
- d.docSectionsIndex = toc.fileSections.absoluteIndex()
+ d.boundariesStart = toc.fileContents.data.off
+ d.boundaries = toc.fileContents.relativeIndex()
+ d.newlinesStart = toc.newlines.data.off
+ d.newlinesIndex = toc.newlines.relativeIndex()
+ d.docSectionsStart = toc.fileSections.data.off
+ d.docSectionsIndex = toc.fileSections.relativeIndex()
textContent, err := d.readSectionBlob(toc.ngramText)
if err != nil {
return nil, err
}
- postingsIndex := toc.postings.absoluteIndex()
+ postingsIndex := toc.postings.relativeIndex()
const ngramEncoding = 8
for i := 0; i < len(textContent); i += ngramEncoding {
j := i / ngramEncoding
ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding]))
d.ngrams[ng] = simpleSection{
- postingsIndex[j],
+ toc.postings.data.off + postingsIndex[j],
postingsIndex[j+1] - postingsIndex[j],
}
}
@@ -254,7 +257,7 @@
func (d *indexData) readContents(i uint32) ([]byte, error) {
return d.readSectionBlob(simpleSection{
- off: d.boundaries[i],
+ off: d.boundariesStart + d.boundaries[i],
sz: d.boundaries[i+1] - d.boundaries[i],
})
}
@@ -263,13 +266,13 @@
// TODO(hanwen): cap result if it is at the end of the content
// section.
return d.readSectionBlob(simpleSection{
- off: d.boundaries[0] + off,
+ off: d.boundariesStart + off,
sz: sz})
}
func (d *indexData) readNewlines(i uint32, buf []uint32) ([]uint32, error) {
blob, err := d.readSectionBlob(simpleSection{
- off: d.newlinesIndex[i],
+ off: d.newlinesStart + d.newlinesIndex[i],
sz: d.newlinesIndex[i+1] - d.newlinesIndex[i],
})
if err != nil {
@@ -281,7 +284,7 @@
func (d *indexData) readDocSections(i uint32) ([]DocumentSection, error) {
blob, err := d.readSectionBlob(simpleSection{
- off: d.docSectionsIndex[i],
+ off: d.docSectionsStart + d.docSectionsIndex[i],
sz: d.docSectionsIndex[i+1] - d.docSectionsIndex[i],
})
if err != nil {
@@ -337,7 +340,11 @@
}
func (d *indexData) calculateStats() {
- last := d.boundaries[len(d.boundaries)-1]
+ var last uint32
+ if len(d.boundaries) > 0 {
+ last += d.boundaries[len(d.boundaries)-1]
+ }
+
lastFN := last
if len(d.fileNameIndex) > 0 {
lastFN = d.fileNameIndex[len(d.fileNameIndex)-1]
diff --git a/section.go b/section.go
index 4d5de30..642d6e0 100644
--- a/section.go
+++ b/section.go
@@ -141,14 +141,6 @@
return err
}
-// absoluteIndex returns the offsets of items, plus a final marking the end of the
-// last item.
-func (s *compoundSection) absoluteIndex() []uint32 {
- index := s.offsets
- index = append(index, s.data.off+s.data.sz)
- return index
-}
-
// relativeIndex returns the relative offsets of the items (first
// element is 0), plus a final marking the end of the last item.
func (s *compoundSection) relativeIndex() []uint32 {
diff --git a/write.go b/write.go
index a7f59cc..c046f88 100644
--- a/write.go
+++ b/write.go
@@ -139,7 +139,6 @@
toc := indexTOC{}
toc.fileContents.writeStrings(w, b.contentStrings)
-
toc.newlines.start(w)
for _, f := range b.contentStrings {
toc.newlines.addItem(w, toSizedDeltas(newLinesIndices(f.data)))