| // Copyright 2016 Google Inc. All rights reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package zoekt |
| |
| import ( |
| "bytes" |
| "fmt" |
| "sort" |
| "unicode/utf8" |
| |
| "github.com/google/zoekt/query" |
| ) |
| |
| // candidateMatch is a candidate match for a substring. |
| type candidateMatch struct { |
| caseSensitive bool |
| fileName bool |
| |
| substrBytes []byte |
| substrLowered []byte |
| |
| file uint32 |
| |
| // Offsets are relative to the start of the filename or file contents. |
| runeOffset uint32 |
| byteOffset uint32 |
| byteMatchSz uint32 |
| } |
| |
| // Matches content against the substring, and populates byteMatchSz on success |
| func (m *candidateMatch) matchContent(content []byte) bool { |
| if m.caseSensitive { |
| comp := bytes.Equal(m.substrBytes, content[m.byteOffset:m.byteOffset+uint32(len(m.substrBytes))]) |
| |
| m.byteMatchSz = uint32(len(m.substrBytes)) |
| return comp |
| } else { |
| // It is tempting to try a simple ASCII based |
| // comparison if possible, but we need more |
| // information. Simple ASCII chars have unicode upper |
| // case variants (the ASCII 'k' has the Kelvin symbol |
| // as upper case variant). We can only degrade to |
| // ASCII if we are sure that both the corpus and the |
| // query is ASCII only |
| sz, ok := caseFoldingEqualsRunes(m.substrLowered, content[m.byteOffset:]) |
| m.byteMatchSz = uint32(sz) |
| return ok |
| } |
| } |
| |
| // line returns the line holding the match. If the match starts with |
| // the newline ending line M, we return M. The line is characterized |
| // by its linenumber (base-1, byte index of line start, byte index of |
| // line end). The line end is the index of a newline, or the filesize |
| // (if matching the last line of the file.) |
| func (m *candidateMatch) line(newlines []uint32, fileSize uint32) (lineNum, lineStart, lineEnd int) { |
| idx := sort.Search(len(newlines), func(n int) bool { |
| return newlines[n] >= m.byteOffset |
| }) |
| |
| end := int(fileSize) |
| if idx < len(newlines) { |
| end = int(newlines[idx]) |
| } |
| |
| start := 0 |
| if idx > 0 { |
| start = int(newlines[idx-1] + 1) |
| } |
| |
| return idx + 1, start, end |
| } |
| |
| // matchIterator is a docIterator that produces candidateMatches for a given document |
| type matchIterator interface { |
| docIterator |
| |
| candidates() []*candidateMatch |
| updateStats(*Stats) |
| } |
| |
| // noMatchTree is both matchIterator and matchTree that matches nothing. |
| type noMatchTree struct { |
| Why string |
| } |
| |
| func (t *noMatchTree) String() string { |
| return fmt.Sprintf("not(%q)", t.Why) |
| } |
| |
| func (t *noMatchTree) candidates() []*candidateMatch { |
| return nil |
| } |
| |
| func (t *noMatchTree) nextDoc() uint32 { |
| return maxUInt32 |
| } |
| |
| func (t *noMatchTree) prepare(uint32) {} |
| |
| func (t *noMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) { |
| return false, true |
| } |
| |
| func (t *noMatchTree) updateStats(*Stats) {} |
| |
| func (m *candidateMatch) String() string { |
| return fmt.Sprintf("%d:%d", m.file, m.runeOffset) |
| } |
| |
| type ngramDocIterator struct { |
| leftPad uint32 |
| rightPad uint32 |
| |
| iter hitIterator |
| ends []uint32 |
| |
| // mutable |
| fileIdx uint32 |
| matchCount int |
| } |
| |
| // nextFileIndex returns the smallest index j of ends such that |
| // ends[j] > offset, assuming ends[f] <= offset. |
| func nextFileIndex(offset, f uint32, ends []uint32) uint32 { |
| d := uint32(1) |
| for f < uint32(len(ends)) && ends[f] <= offset { |
| if f+d < uint32(len(ends)) && ends[f+d] <= offset { |
| f += d |
| d *= 2 |
| } else if d > 1 { |
| d = d/4 + 1 |
| } else { |
| f++ |
| } |
| } |
| return f |
| } |
| |
| func (i *ngramDocIterator) nextDoc() uint32 { |
| i.fileIdx = nextFileIndex(i.iter.first(), i.fileIdx, i.ends) |
| if i.fileIdx >= uint32(len(i.ends)) { |
| return maxUInt32 |
| } |
| return i.fileIdx |
| } |
| |
| func (i *ngramDocIterator) String() string { |
| return fmt.Sprintf("ngram(L=%d,R=%d,%v)", i.leftPad, i.rightPad, i.iter) |
| } |
| |
| func (i *ngramDocIterator) prepare(nextDoc uint32) { |
| var start uint32 |
| if nextDoc > 0 { |
| start = i.ends[nextDoc-1] |
| } |
| if start > 0 { |
| i.iter.next(start + i.leftPad - 1) |
| } |
| i.fileIdx = nextDoc |
| } |
| |
| func (i *ngramDocIterator) updateStats(s *Stats) { |
| i.iter.updateStats(s) |
| s.NgramMatches += i.matchCount |
| } |
| |
| func (i *ngramDocIterator) candidates() []*candidateMatch { |
| if i.fileIdx >= uint32(len(i.ends)) { |
| return nil |
| } |
| |
| var fileStart uint32 |
| if i.fileIdx > 0 { |
| fileStart = i.ends[i.fileIdx-1] |
| } |
| fileEnd := i.ends[i.fileIdx] |
| |
| var candidates []*candidateMatch |
| for { |
| p1 := i.iter.first() |
| if p1 == maxUInt32 || p1 >= i.ends[i.fileIdx] { |
| break |
| } |
| i.iter.next(p1) |
| |
| if p1 < i.leftPad+fileStart || p1+i.rightPad > fileEnd { |
| continue |
| } |
| |
| candidates = append(candidates, &candidateMatch{ |
| file: uint32(i.fileIdx), |
| runeOffset: p1 - fileStart - i.leftPad, |
| }) |
| } |
| i.matchCount += len(candidates) |
| return candidates |
| } |
| |
| type trimBySectionMatchIter struct { |
| matchIterator |
| |
| patternSize uint32 |
| fileEndRunes []uint32 |
| |
| // mutable |
| doc uint32 |
| sections []DocumentSection |
| } |
| |
| func (i *trimBySectionMatchIter) String() string { |
| return fmt.Sprintf("trimSection(sz=%d, %v)", i.patternSize, i.matchIterator) |
| } |
| |
| func (d *indexData) newTrimByDocSectionIter(q *query.Substring, iter matchIterator) *trimBySectionMatchIter { |
| return &trimBySectionMatchIter{ |
| matchIterator: iter, |
| patternSize: uint32(utf8.RuneCountInString(q.Pattern)), |
| fileEndRunes: d.fileEndRunes, |
| sections: d.runeDocSections, |
| } |
| } |
| |
| func (i *trimBySectionMatchIter) prepare(doc uint32) { |
| i.matchIterator.prepare(doc) |
| i.doc = doc |
| |
| var fileStart uint32 |
| if doc > 0 { |
| fileStart = i.fileEndRunes[doc-1] |
| } |
| |
| for len(i.sections) > 0 && i.sections[0].Start < fileStart { |
| i.sections = i.sections[1:] |
| } |
| } |
| |
| func (i *trimBySectionMatchIter) candidates() []*candidateMatch { |
| var fileStart uint32 |
| if i.doc > 0 { |
| fileStart = i.fileEndRunes[i.doc-1] |
| } |
| |
| ms := i.matchIterator.candidates() |
| trimmed := ms[:0] |
| for len(i.sections) > 0 && len(ms) > 0 { |
| start := fileStart + ms[0].runeOffset |
| end := start + i.patternSize |
| if start >= i.sections[0].End { |
| i.sections = i.sections[1:] |
| continue |
| } |
| |
| if start < i.sections[0].Start { |
| ms = ms[1:] |
| continue |
| } |
| |
| // here we have: sec.Start <= start < sec.End |
| if end <= i.sections[0].End { |
| // complete match falls inside section. |
| trimmed = append(trimmed, ms[0]) |
| } |
| |
| ms = ms[1:] |
| } |
| return trimmed |
| } |