Remove indexes with absolute file offsets

This fixes a potential bug in which findOffset is incorrect if the content sections are rearranged so that they are no longer at the start of the index shard.

Change-Id: Id2f3885883e54d4822433cbba9926c5aa43f91df

commit: 8c3d9b6213bf35a2b20deccb0804aab51cd6494f [log] [tgz]
author: Han-Wen Nienhuys <hanwen@google.com> Wed Mar 22 12:19:20 2017 +0100
committer: Han-Wen Nienhuys <hanwen@google.com> Wed Mar 22 12:19:20 2017 +0100
tree: 01ee387e498e737731439ba5702359f5fe592217
parent: 607c0a0ea66e729eef6622f6efa64a7e03f75200 [diff]
diff --git a/index.go b/index.go
index a2d1655..d259fdf 100644
--- a/index.go
+++ b/index.go

@@ -33,12 +33,16 @@
 
 	ngrams map[ngram]simpleSection
 
-	newlinesIndex    []uint32
+	newlinesStart uint32
+	newlinesIndex []uint32
+
+	docSectionsStart uint32
 	docSectionsIndex []uint32
 	runeOffsets      []uint32
 
 	// offsets of file contents. Includes end of last file.
-	boundaries []uint32
+	boundariesStart uint32
+	boundaries      []uint32
 
 	fileEnds     []uint32
 	fileEndRunes []uint32

diff --git a/read.go b/read.go
index 8b212b4..5db0074 100644
--- a/read.go
+++ b/read.go

@@ -139,22 +139,25 @@
 		return nil, err
 	}
 
-	d.boundaries = toc.fileContents.absoluteIndex()
-	d.newlinesIndex = toc.newlines.absoluteIndex()
-	d.docSectionsIndex = toc.fileSections.absoluteIndex()
+	d.boundariesStart = toc.fileContents.data.off
+	d.boundaries = toc.fileContents.relativeIndex()
+	d.newlinesStart = toc.newlines.data.off
+	d.newlinesIndex = toc.newlines.relativeIndex()
+	d.docSectionsStart = toc.fileSections.data.off
+	d.docSectionsIndex = toc.fileSections.relativeIndex()
 
 	textContent, err := d.readSectionBlob(toc.ngramText)
 	if err != nil {
 		return nil, err
 	}
-	postingsIndex := toc.postings.absoluteIndex()
+	postingsIndex := toc.postings.relativeIndex()
 
 	const ngramEncoding = 8
 	for i := 0; i < len(textContent); i += ngramEncoding {
 		j := i / ngramEncoding
 		ng := ngram(binary.BigEndian.Uint64(textContent[i : i+ngramEncoding]))
 		d.ngrams[ng] = simpleSection{
-			postingsIndex[j],
+			toc.postings.data.off + postingsIndex[j],
 			postingsIndex[j+1] - postingsIndex[j],
 		}
 	}
@@ -254,7 +257,7 @@
 
 func (d *indexData) readContents(i uint32) ([]byte, error) {
 	return d.readSectionBlob(simpleSection{
-		off: d.boundaries[i],
+		off: d.boundariesStart + d.boundaries[i],
 		sz:  d.boundaries[i+1] - d.boundaries[i],
 	})
 }
@@ -263,13 +266,13 @@
 	// TODO(hanwen): cap result if it is at the end of the content
 	// section.
 	return d.readSectionBlob(simpleSection{
-		off: d.boundaries[0] + off,
+		off: d.boundariesStart + off,
 		sz:  sz})
 }
 
 func (d *indexData) readNewlines(i uint32, buf []uint32) ([]uint32, error) {
 	blob, err := d.readSectionBlob(simpleSection{
-		off: d.newlinesIndex[i],
+		off: d.newlinesStart + d.newlinesIndex[i],
 		sz:  d.newlinesIndex[i+1] - d.newlinesIndex[i],
 	})
 	if err != nil {
@@ -281,7 +284,7 @@
 
 func (d *indexData) readDocSections(i uint32) ([]DocumentSection, error) {
 	blob, err := d.readSectionBlob(simpleSection{
-		off: d.docSectionsIndex[i],
+		off: d.docSectionsStart + d.docSectionsIndex[i],
 		sz:  d.docSectionsIndex[i+1] - d.docSectionsIndex[i],
 	})
 	if err != nil {
@@ -337,7 +340,11 @@
 }
 
 func (d *indexData) calculateStats() {
-	last := d.boundaries[len(d.boundaries)-1]
+	var last uint32
+	if len(d.boundaries) > 0 {
+		last += d.boundaries[len(d.boundaries)-1]
+	}
+
 	lastFN := last
 	if len(d.fileNameIndex) > 0 {
 		lastFN = d.fileNameIndex[len(d.fileNameIndex)-1]

diff --git a/section.go b/section.go
index 4d5de30..642d6e0 100644
--- a/section.go
+++ b/section.go

@@ -141,14 +141,6 @@
 	return err
 }
 
-// absoluteIndex returns the offsets of items, plus a final marking the end of the
-// last item.
-func (s *compoundSection) absoluteIndex() []uint32 {
-	index := s.offsets
-	index = append(index, s.data.off+s.data.sz)
-	return index
-}
-
 // relativeIndex returns the relative offsets of the items (first
 // element is 0), plus a final marking the end of the last item.
 func (s *compoundSection) relativeIndex() []uint32 {

diff --git a/write.go b/write.go
index a7f59cc..c046f88 100644
--- a/write.go
+++ b/write.go

@@ -139,7 +139,6 @@
 	toc := indexTOC{}
 
 	toc.fileContents.writeStrings(w, b.contentStrings)
-
 	toc.newlines.start(w)
 	for _, f := range b.contentStrings {
 		toc.newlines.addItem(w, toSizedDeltas(newLinesIndices(f.data)))
commit	8c3d9b6213bf35a2b20deccb0804aab51cd6494f	[log] [tgz]
author	Han-Wen Nienhuys <hanwen@google.com>	Wed Mar 22 12:19:20 2017 +0100
committer	Han-Wen Nienhuys <hanwen@google.com>	Wed Mar 22 12:19:20 2017 +0100
tree	01ee387e498e737731439ba5702359f5fe592217
parent	607c0a0ea66e729eef6622f6efa64a7e03f75200 [diff]