Dismantle the docIterator type
Put candidate matches directly in the matchTree. No behavioral changes.
Change-Id: I34625bdeb5c80efb59cb65c3101fe5bec0d82e97
diff --git a/dociter.go b/dociter.go
index a293947..60dca9b 100644
--- a/dociter.go
+++ b/dociter.go
@@ -84,49 +84,22 @@
return idx + 1, start, end
}
-type docIterator interface {
- // TODO - reconsider this name? Or don't get all the
- // candidateMatch in one go?
- next() []*candidateMatch
- coversContent() bool
-
- // amount of I/O reads
- ioBytes() uint32
-}
-
type ngramDocIterator struct {
- ng1, ng2 ngram
- query *query.Substring
+ query *query.Substring
leftPad uint32
rightPad uint32
distance uint32
first []uint32
last []uint32
-
- fileIdx int
- ends []uint32
-
- // The ngram matches cover the pattern, so no need to check
- // contents.
- _coversContent bool
-
- // The number of posting bytes
- bytesRead uint32
+ ends []uint32
}
-func (s *ngramDocIterator) ioBytes() uint32 {
- return s.bytesRead
-}
-
-func (s *ngramDocIterator) coversContent() bool {
- return s._coversContent
-}
-
-func (s *ngramDocIterator) next() []*candidateMatch {
+func (s *ngramDocIterator) candidates() []*candidateMatch {
patBytes := []byte(s.query.Pattern)
lowerPatBytes := toLower(patBytes)
+ fileIdx := 0
var candidates []*candidateMatch
for {
if len(s.first) == 0 || len(s.last) == 0 {
@@ -135,8 +108,8 @@
p1 := s.first[0]
p2 := s.last[0]
- for s.fileIdx < len(s.ends) && s.ends[s.fileIdx] <= p1 {
- s.fileIdx++
+ for fileIdx < len(s.ends) && s.ends[fileIdx] <= p1 {
+ fileIdx++
}
if p1+s.distance < p2 {
@@ -148,10 +121,10 @@
s.last = s.last[1:]
var fileStart uint32
- if s.fileIdx > 0 {
- fileStart = s.ends[s.fileIdx-1]
+ if fileIdx > 0 {
+ fileStart = s.ends[fileIdx-1]
}
- if p1 < s.leftPad+fileStart || p1+s.distance+ngramSize+s.rightPad > s.ends[s.fileIdx] {
+ if p1 < s.leftPad+fileStart || p1+s.distance+ngramSize+s.rightPad > s.ends[fileIdx] {
continue
}
@@ -162,7 +135,7 @@
substrLowered: lowerPatBytes,
// TODO - this is wrong for casefolding searches.
byteMatchSz: uint32(len(lowerPatBytes)),
- file: uint32(s.fileIdx),
+ file: uint32(fileIdx),
runeOffset: p1 - fileStart - s.leftPad,
}
candidates = append(candidates, cand)
diff --git a/eval.go b/eval.go
index b4dc688..9d3cda9 100644
--- a/eval.go
+++ b/eval.go
@@ -20,6 +20,7 @@
"regexp"
"sort"
"strings"
+ "unicode/utf8"
"golang.org/x/net/context"
@@ -245,6 +246,8 @@
for _, ch := range s.children {
collectAtoms(ch, f)
}
+ case *notMatchTree:
+ collectAtoms(s.child, f)
default:
f(t)
}
@@ -268,23 +271,6 @@
}
}
-func collectRegexps(t matchTree, f func(*regexpMatchTree)) {
- switch s := t.(type) {
- case *andMatchTree:
- for _, ch := range s.children {
- collectRegexps(ch, f)
- }
- case *orMatchTree:
- for _, ch := range s.children {
- collectRegexps(ch, f)
- }
- case *notMatchTree:
- collectRegexps(s.child, f)
- case *regexpMatchTree:
- f(s)
- }
-}
-
func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) {
switch s := t.(type) {
case *andMatchTree:
@@ -437,7 +423,7 @@
return true, sure
}
-func (d *indexData) newMatchTree(q query.Q, sq map[*substrMatchTree]struct{}, stats *Stats) (matchTree, error) {
+func (d *indexData) newMatchTree(q query.Q, stats *Stats) (matchTree, error) {
switch s := q.(type) {
case *query.Regexp:
subQ := query.RegexpToQuery(s.Regexp, ngramSize)
@@ -449,7 +435,7 @@
return q
})
- subMT, err := d.newMatchTree(subQ, sq, stats)
+ subMT, err := d.newMatchTree(subQ, stats)
if err != nil {
return nil, err
}
@@ -468,7 +454,7 @@
case *query.And:
var r []matchTree
for _, ch := range s.Children {
- ct, err := d.newMatchTree(ch, sq, stats)
+ ct, err := d.newMatchTree(ch, stats)
if err != nil {
return nil, err
}
@@ -478,7 +464,7 @@
case *query.Or:
var r []matchTree
for _, ch := range s.Children {
- ct, err := d.newMatchTree(ch, sq, stats)
+ ct, err := d.newMatchTree(ch, stats)
if err != nil {
return nil, err
}
@@ -486,25 +472,13 @@
}
return &orMatchTree{r}, nil
case *query.Not:
- ct, err := d.newMatchTree(s.Child, sq, stats)
+ ct, err := d.newMatchTree(s.Child, stats)
return ¬MatchTree{
child: ct,
}, err
+
case *query.Substring:
- iter, err := d.getDocIterator(s)
- if err != nil {
- return nil, err
- }
- st := &substrMatchTree{
- query: s,
- caseSensitive: s.CaseSensitive,
- fileName: s.FileName,
- coversContent: iter.coversContent(),
- cands: iter.next(),
- }
- stats.IndexBytesLoaded += int64(iter.ioBytes())
- sq[st] = struct{}{}
- return st, nil
+ return d.newSubstringMatchTree(s, stats)
case *query.Branch:
mask := uint64(0)
@@ -534,6 +508,33 @@
return nil, nil
}
+func (d *indexData) newSubstringMatchTree(s *query.Substring, stats *Stats) (matchTree, error) {
+ st := &substrMatchTree{
+ query: s,
+ caseSensitive: s.CaseSensitive,
+ fileName: s.FileName,
+ }
+
+ if utf8.RuneCountInString(s.Pattern) < ngramSize {
+ if !s.FileName {
+ return nil, fmt.Errorf("pattern %q less than %d characters", s.Pattern, ngramSize)
+ }
+
+ st.cands = d.bruteForceMatchFilenames(s)
+ st.coversContent = true
+ return st, nil
+ }
+
+ result, err := d.iterateNgrams(s)
+ if err != nil {
+ return nil, err
+ }
+ st.coversContent = result.coversContent
+ st.cands = result.cands
+ stats.IndexBytesLoaded += int64(result.bytesRead)
+ return st, nil
+}
+
func (d *indexData) simplify(in query.Q) query.Q {
eval := query.Map(in, func(q query.Q) query.Q {
if r, ok := q.(*query.Repo); ok {
@@ -584,36 +585,35 @@
q = query.Map(q, query.ExpandFileContent)
- atoms := map[*substrMatchTree]struct{}{}
- mt, err := d.newMatchTree(q, atoms, &res.Stats)
+ mt, err := d.newMatchTree(q, &res.Stats)
if err != nil {
return nil, err
}
- for st := range atoms {
- res.Stats.NgramMatches += len(st.cands)
- }
-
+ totalAtomCount := 0
+ var substrAtoms, fileAtoms []*substrMatchTree
var regexpAtoms []*regexpMatchTree
- collectRegexps(mt, func(re *regexpMatchTree) {
- regexpAtoms = append(regexpAtoms, re)
- })
+ collectAtoms(mt, func(t matchTree) {
+ totalAtomCount++
+ if st, ok := t.(*substrMatchTree); ok {
+ res.Stats.NgramMatches += len(st.cands)
+ if st.fileName {
+ fileAtoms = append(fileAtoms, st)
+ } else {
+ substrAtoms = append(substrAtoms, st)
+ }
- var fileAtoms []*substrMatchTree
- for st := range atoms {
- if st.fileName {
- fileAtoms = append(fileAtoms, st)
}
- }
+ if re, ok := t.(*regexpMatchTree); ok {
+ regexpAtoms = append(regexpAtoms, re)
+ }
+ })
cp := contentProvider{
id: d,
stats: &res.Stats,
}
- totalAtomCount := 0
- collectAtoms(mt, func(t matchTree) { totalAtomCount++ })
-
docCount := uint32(len(d.fileBranchMasks))
canceled := false
lastDoc := int(-1)
@@ -661,7 +661,7 @@
}
}
- for st := range atoms {
+ for _, st := range substrAtoms {
// TODO - this may evaluate too much.
cp.evalContentMatches(st)
}
diff --git a/index_portable.go b/index_portable.go
index 68f21c9..73765ce 100644
--- a/index_portable.go
+++ b/index_portable.go
@@ -127,33 +127,7 @@
return sz
}
-func (data *indexData) getDocIterator(s *query.Substring) (docIterator, error) {
- if utf8.RuneCountInString(s.Pattern) < ngramSize {
- if !s.FileName {
- return nil, fmt.Errorf("pattern %q less than %d characters", s.Pattern, ngramSize)
- }
-
- return data.getBruteForceFileNameDocIterator(s), nil
- }
-
- return data.getNgramDocIterator(s)
-}
-
-type bruteForceIter struct {
- cands []*candidateMatch
-}
-
-func (i *bruteForceIter) ioBytes() uint32 { return 0 }
-
-func (i *bruteForceIter) next() []*candidateMatch {
- return i.cands
-}
-
-func (i *bruteForceIter) coversContent() bool {
- return true
-}
-
-func (data *indexData) getBruteForceFileNameDocIterator(query *query.Substring) docIterator {
+func (data *indexData) bruteForceMatchFilenames(query *query.Substring) []*candidateMatch {
quoted := regexp.QuoteMeta(query.Pattern)
if !query.CaseSensitive {
quoted = "(?i)" + quoted
@@ -197,7 +171,7 @@
})
}
- return &bruteForceIter{cands}
+ return cands
}
const maxUInt32 = 0xffffffff
@@ -222,7 +196,18 @@
return data.ngrams[ng].sz
}
-func (data *indexData) getNgramDocIterator(query *query.Substring) (docIterator, error) {
+type ngramIterationResults struct {
+ cands []*candidateMatch
+
+ // The ngram matches cover the pattern, so no need to check
+ // contents.
+ coversContent bool
+
+ // The number of posting bytes
+ bytesRead uint32
+}
+
+func (data *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
iter := &ngramDocIterator{
query: query,
}
@@ -249,7 +234,7 @@
}
if freq == 0 {
- return iter, nil
+ return &ngramIterationResults{}, nil
}
frequencies = append(frequencies, freq)
@@ -274,12 +259,13 @@
}
iter.first = postings
+
if firstI != lastI {
postings, sz, err := data.readPostings(lastNG, query.CaseSensitive, query.FileName)
+ bytesRead += sz
if err != nil {
return nil, err
}
- iter.bytesRead += sz
iter.last = postings
} else {
// TODO - we could be a little faster and skip the
@@ -287,13 +273,12 @@
iter.last = iter.first
}
- iter.bytesRead = bytesRead
- iter.ng1 = firstNG
- iter.ng2 = lastNG
- if lastI-firstI <= ngramSize && iter.leftPad == 0 && iter.rightPad == 0 {
- iter._coversContent = true
+ result := ngramIterationResults{
+ bytesRead: bytesRead,
+ cands: iter.candidates(),
+ coversContent: lastI-firstI <= ngramSize && iter.leftPad == 0 && iter.rightPad == 0,
}
- return iter, nil
+ return &result, nil
}
func (d *indexData) readPostings(ng ngram, caseSensitive, fileName bool) ([]uint32, uint32, error) {
diff --git a/index_test.go b/index_test.go
index 2c16149..e706b31 100644
--- a/index_test.go
+++ b/index_test.go
@@ -258,13 +258,13 @@
wantStats := Stats{
FilesLoaded: 1,
ContentBytesLoaded: 18,
- IndexBytesLoaded: 4,
+ IndexBytesLoaded: 8,
NgramMatches: 4,
MatchCount: 1,
FileCount: 1,
FilesConsidered: 2,
}
- if diff := pretty.Compare(sres.Stats, wantStats); diff != "" {
+ if diff := pretty.Compare(wantStats, sres.Stats); diff != "" {
t.Errorf("got stats diff %s", diff)
}
}