toc: modify disk format to include names for backwards/forwards compatiblity.

Including section names in the table of contents permits simpler
forwards and backwards compatibility. Instead of having to bump
the entire IndexFormatVersion when a new section is added, there is
simply a new section present in the table of contents.

Older versions can read TOCs with unknown sections and skip over them
with a warning. This is useful to permit downgrades without always
requiring a reindex.

Newer versions can read TOCs from older version with missing sections
and handle them gracefully, by checking for empty sections when loading
an index file and implementing whatever fallback code is necessary.

Section evolution is possible by having a new name for a tagged section,
adding the old section to the CompatibilityList, and writing the
conversion code when loading the file, or modifying the users of the
section to use whichever one is loaded.

Change-Id: I9aa05f29eb9d64fd0fff218f008d2031f1a15c8c
diff --git a/api.go b/api.go
index bec0e51..2bd8a88 100644
--- a/api.go
+++ b/api.go
@@ -216,12 +216,13 @@
 // IndexMetadata holds metadata stored in the index file. It contains
 // data generated by the core indexing library.
 type IndexMetadata struct {
-	IndexFormatVersion  int
-	IndexFeatureVersion int
-	IndexTime           time.Time
-	PlainASCII          bool
-	LanguageMap         map[string]byte
-	ZoektVersion        string
+	IndexFormatVersion    int
+	IndexFeatureVersion   int
+	IndexMinReaderVersion int
+	IndexTime             time.Time
+	PlainASCII            bool
+	LanguageMap           map[string]byte
+	ZoektVersion          string
 }
 
 // Statistics of a (collection of) repositories.
diff --git a/read.go b/read.go
index bac97b1..8351a23 100644
--- a/read.go
+++ b/read.go
@@ -18,6 +18,7 @@
 	"encoding/binary"
 	"encoding/json"
 	"fmt"
+	"log"
 	"sort"
 )
 
@@ -58,6 +59,36 @@
 	return binary.BigEndian.Uint64(b), nil
 }
 
+func (r *reader) ReadByte() (byte, error) {
+	b, err := r.r.Read(r.off, 1)
+	r.off += 1
+	if err != nil {
+		return 0, err
+	}
+	return b[0], nil
+}
+
+func (r *reader) Varint() (uint64, error) {
+	v, err := binary.ReadUvarint(r)
+	if err != nil {
+		return 0, err
+	}
+	return v, nil
+}
+
+func (r *reader) Str() (string, error) {
+	slen, err := r.Varint()
+	if err != nil {
+		return "", err
+	}
+	b, err := r.r.Read(r.off, uint32(slen))
+	if err != nil {
+		return "", err
+	}
+	r.off += uint32(slen)
+	return string(b), nil
+}
+
 func (r *reader) readTOC(toc *indexTOC) error {
 	sz, err := r.r.Size()
 	if err != nil {
@@ -77,15 +108,53 @@
 		return err
 	}
 
-	secs := toc.sections()
+	if sectionCount == 0 {
+		// tagged sections are indicated by a 0 sectionCount,
+		// and then a list of string-tagged type-indicated sections.
+		secs := toc.sectionsTagged()
+		for r.off < tocSection.off+tocSection.sz {
+			tag, err := r.Str()
+			if err != nil {
+				return err
+			}
+			kind, err := r.Varint()
+			if err != nil {
+				return err
+			}
+			sec := secs[tag]
+			if sec != nil && sec.kind() == sectionKind(kind) {
+				// happy path
+				if err := sec.read(r); err != nil {
+					return err
+				}
+				continue
+			}
+			// error case: skip over unknown section
+			if sec == nil {
+				log.Printf("file %s TOC has unknown section %q", r.r.Name(), tag)
+			} else {
+				return fmt.Errorf("file %s TOC section %q expects kind %d, got kind %d", r.r.Name(), tag,
+					kind, sec.kind())
+			}
+			if kind == 0 {
+				(&simpleSection{}).read(r)
+			} else if kind == 1 {
+				(&compoundSection{}).read(r)
+			}
+		}
+	} else {
+		// TODO: Remove this branch when ReaderMinFeatureVersion >= 10
 
-	if len(secs) != int(sectionCount) {
-		return fmt.Errorf("section count mismatch: got %d want %d", sectionCount, len(secs))
-	}
+		secs := toc.sections()
 
-	for _, s := range toc.sections() {
-		if err := s.read(r); err != nil {
-			return err
+		if len(secs) != int(sectionCount) {
+			return fmt.Errorf("section count mismatch: got %d want %d", sectionCount, len(secs))
+		}
+
+		for _, s := range secs {
+			if err := s.read(r); err != nil {
+				return err
+			}
 		}
 	}
 	return nil
@@ -158,6 +227,14 @@
 		return nil, fmt.Errorf("file is v%d, want v%d", d.metaData.IndexFormatVersion, IndexFormatVersion)
 	}
 
+	if d.metaData.IndexFeatureVersion < ReadMinFeatureVersion {
+		return nil, fmt.Errorf("file is feature version %d, want feature version >= %d", d.metaData.IndexFeatureVersion, ReadMinFeatureVersion)
+	}
+
+	if d.metaData.IndexMinReaderVersion > FeatureVersion {
+		return nil, fmt.Errorf("file needs read feature version >= %d, have read feature version %d", d.metaData.IndexMinReaderVersion, FeatureVersion)
+	}
+
 	blob, err = d.readSectionBlob(toc.repoMetaData)
 	if err != nil {
 		return nil, err
diff --git a/read_test.go b/read_test.go
index 4faa741..b6b57cd 100644
--- a/read_test.go
+++ b/read_test.go
@@ -16,10 +16,17 @@
 
 import (
 	"bytes"
+	"flag"
+	"fmt"
+	"io/fs"
+	"os"
+	"path"
 	"reflect"
 	"testing"
 )
 
+var update = flag.Bool("update", false, "update the golden files of this test")
+
 func TestReadWrite(t *testing.T) {
 	b, err := NewIndexBuilder(nil)
 	if err != nil {
@@ -98,3 +105,74 @@
 		t.Errorf("got trigram bcd at bits %v, want sz 2", data.fileNameNgrams)
 	}
 }
+
+func TestBackwardsCompat(t *testing.T) {
+	if *update {
+		b, err := NewIndexBuilder(nil)
+		if err != nil {
+			t.Fatalf("NewIndexBuilder: %v", err)
+		}
+
+		if err := b.AddFile("filename", []byte("abcde")); err != nil {
+			t.Fatalf("AddFile: %v", err)
+		}
+
+		var buf bytes.Buffer
+		b.Write(&buf)
+
+		outname := fmt.Sprintf("testdata/backcompat/new_v%d.%05d.zoekt", IndexFormatVersion, 0)
+		t.Log("writing new file", outname)
+
+		err = os.WriteFile(outname, buf.Bytes(), 0644)
+		if err != nil {
+			t.Fatalf("Creating output file: %v", err)
+		}
+	}
+
+	compatibleFiles, err := fs.Glob(os.DirFS("."), "testdata/backcompat/*.zoekt")
+	if err != nil {
+		t.Fatalf("fs.Glob: %v", err)
+	}
+
+	for _, fname := range compatibleFiles {
+		t.Run(path.Base(fname),
+			func(t *testing.T) {
+				f, err := os.Open(fname)
+				if err != nil {
+					t.Fatal("os.Open", err)
+				}
+				idx, err := NewIndexFile(f)
+				if err != nil {
+					t.Fatal("NewIndexFile", err)
+				}
+				r := reader{r: idx}
+
+				var toc indexTOC
+				err = r.readTOC(&toc)
+
+				if err != nil {
+					t.Errorf("got read error %v", err)
+				}
+				if toc.fileContents.data.sz != 5 {
+					t.Errorf("got contents size %d, want 5", toc.fileContents.data.sz)
+				}
+
+				data, err := r.readIndexData(&toc)
+				if err != nil {
+					t.Fatalf("readIndexData: %v", err)
+				}
+				if got := data.fileName(0); string(got) != "filename" {
+					t.Errorf("got filename %q, want %q", got, "filename")
+				}
+
+				if len(data.ngrams) != 3 {
+					t.Fatalf("got ngrams %v, want 3 ngrams", data.ngrams)
+				}
+
+				if _, ok := data.ngrams[stringToNGram("bcq")]; ok {
+					t.Errorf("found ngram bcd in %v", data.ngrams)
+				}
+			},
+		)
+	}
+}
diff --git a/section.go b/section.go
index a757804..90e4e94 100644
--- a/section.go
+++ b/section.go
@@ -65,6 +65,12 @@
 	w.Write(enc[:m])
 }
 
+func (w *writer) String(s string) {
+	b := []byte(s)
+	w.Varint(uint32(len(b)))
+	w.Write(b)
+}
+
 func (s *simpleSection) start(w *writer) {
 	s.off = w.Off()
 }
@@ -77,14 +83,26 @@
 type section interface {
 	read(*reader) error
 	write(*writer)
+	kind() sectionKind // simple or complex, used in serialization
 }
 
+type sectionKind int
+
+const (
+	sectionKindSimple  sectionKind = 0
+	sectionKindComplex sectionKind = 1
+)
+
 // simpleSection is a simple range of bytes.
 type simpleSection struct {
 	off uint32
 	sz  uint32
 }
 
+func (s *simpleSection) kind() sectionKind {
+	return sectionKindSimple
+}
+
 func (s *simpleSection) read(r *reader) error {
 	var err error
 	s.off, err = r.U32()
@@ -112,6 +130,10 @@
 	index   simpleSection
 }
 
+func (s *compoundSection) kind() sectionKind {
+	return sectionKindComplex
+}
+
 func (s *compoundSection) start(w *writer) {
 	s.data.start(w)
 }
diff --git a/testdata/backcompat/static_toc_v15.00000.zoekt b/testdata/backcompat/static_toc_v15.00000.zoekt
new file mode 100644
index 0000000..a070892
--- /dev/null
+++ b/testdata/backcompat/static_toc_v15.00000.zoekt
Binary files differ
diff --git a/toc.go b/toc.go
index c956e94..9eab283 100644
--- a/toc.go
+++ b/toc.go
@@ -27,6 +27,7 @@
 // 13: content checksums
 // 14: languages
 // 15: rune based symbol sections
+// 16 (TBA): TODO: remove fallback parsing in readTOC
 const IndexFormatVersion = 15
 
 // FeatureVersion is increased if a feature is added that requires reindexing data
@@ -39,7 +40,28 @@
 // 7: Record skip reasons in the index.
 // 8: Record source path in the index.
 // 9: Bump default max file size.
-const FeatureVersion = 9
+// 10: Switch to a more flexible TOC format.
+const FeatureVersion = 10
+
+// WriteMinFeatureVersion and ReadMinFeatureVersion constrain forwards and backwards
+// compatibility. For example, if a new way to encode filenameNgrams on disk is
+// added using a new section but the old one is retained, this would only bump
+// FeatureVersion, since the previous version can read the file and ignore the
+// new section, but the index files should be regenerated.
+// When the new encoding is fully rolled out and stable, the section with the old
+// encoding and the associated reader can be removed, and WriteMinFeatureVersion and
+// ReadMinFeatureVersion can be set to the current FeatureVersion, indicating
+// that the reader must handle the new version and that older versions are no
+// longer valid.
+// In this way, compatibility with arbitrary version offsets can be indicated.
+
+// WriteMinFeatureVersion constrains forwards compatibility by emitting files
+// that won't load in zoekt with a FeatureVersion below it.
+const WriteMinFeatureVersion = 10
+
+// ReadMinFeatureVersion constrains backwards compatibility by refusing to
+// load a file with a FeatureVersion below it.
+const ReadMinFeatureVersion = 8
 
 type indexTOC struct {
 	fileContents compoundSection
@@ -66,6 +88,8 @@
 }
 
 func (t *indexTOC) sections() []section {
+	// This old sections list is only needed to maintain backwards compatibility,
+	// and can be removed when a migration to tagged sections is complete.
 	return []section{
 		// This must be first, so it can be reliably read across
 		// file format versions.
@@ -90,3 +114,50 @@
 		&t.runeDocSections,
 	}
 }
+
+type taggedSection struct {
+	tag string
+	sec section
+}
+
+func (t *indexTOC) sectionsTagged() map[string]section {
+	out := map[string]section{}
+	for _, ent := range t.sectionsTaggedList() {
+		out[ent.tag] = ent.sec
+	}
+	for _, ent := range t.sectionsTaggedCompatibilityList() {
+		out[ent.tag] = ent.sec
+	}
+	return out
+}
+
+func (t *indexTOC) sectionsTaggedList() []taggedSection {
+	return []taggedSection{
+		{"metadata", &t.metaData},
+		{"repoMetaData", &t.repoMetaData},
+		{"fileContents", &t.fileContents},
+		{"fileNames", &t.fileNames},
+		{"fileSections", &t.fileSections},
+		{"newlines", &t.newlines},
+		{"ngramText", &t.ngramText},
+		{"postings", &t.postings},
+		{"nameNgramText", &t.nameNgramText},
+		{"namePostings", &t.namePostings},
+		{"branchMasks", &t.branchMasks},
+		{"subRepos", &t.subRepos},
+		{"runeOffsets", &t.runeOffsets},
+		{"nameRuneOffsets", &t.nameRuneOffsets},
+		{"fileEndRunes", &t.fileEndRunes},
+		{"nameEndRunes", &t.nameEndRunes},
+		{"contentChecksums", &t.contentChecksums},
+		{"languages", &t.languages},
+		{"runeDocSections", &t.runeDocSections},
+	}
+}
+
+// sectionsTaggedCompatibilityList returns a list of sections that will be
+// handled or converted for backwards compatiblity, but aren't written by
+// the current iteration of the indexer.
+func (t *indexTOC) sectionsTaggedCompatibilityList() []taggedSection {
+	return []taggedSection{}
+}
diff --git a/write.go b/write.go
index 8c5081e..7a89167 100644
--- a/write.go
+++ b/write.go
@@ -25,10 +25,22 @@
 )
 
 func (w *writer) writeTOC(toc *indexTOC) {
-	secs := toc.sections()
-	w.U32(uint32(len(secs)))
+	// Tagged sections are indicated with a 0 section count.
+	// Tagged sections allow easier forwards and backwards
+	// compatibility when evolving zoekt index files with new
+	// sections.
+	//
+	// A tagged section is:
+	// Varint TagLen, Tag String, Varint SecType, Section
+	//
+	// Section type is indicated because simpleSections and
+	// compoundSections have different lengths.
+	w.U32(0)
+	secs := toc.sectionsTaggedList()
 	for _, s := range secs {
-		s.write(w)
+		w.String(s.tag)
+		w.Varint(uint32(s.sec.kind()))
+		s.sec.write(w)
 	}
 }
 
@@ -121,12 +133,13 @@
 	toc.runeDocSections.end(w)
 
 	if err := b.writeJSON(&IndexMetadata{
-		IndexFormatVersion:  IndexFormatVersion,
-		IndexTime:           time.Now(),
-		IndexFeatureVersion: FeatureVersion,
-		PlainASCII:          b.contentPostings.isPlainASCII && b.namePostings.isPlainASCII,
-		LanguageMap:         b.languageMap,
-		ZoektVersion:        Version,
+		IndexFormatVersion:    IndexFormatVersion,
+		IndexTime:             time.Now(),
+		IndexFeatureVersion:   FeatureVersion,
+		IndexMinReaderVersion: WriteMinFeatureVersion,
+		PlainASCII:            b.contentPostings.isPlainASCII && b.namePostings.isPlainASCII,
+		LanguageMap:           b.languageMap,
+		ZoektVersion:          Version,
 	}, &toc.metaData, w); err != nil {
 		return err
 	}