blob: 62fd1490ff0b5298eaa4d74b7789878d06c6c498 [file] [log] [blame]
// Copyright 2016 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zoekt
import (
"bytes"
"log"
"sort"
"unicode/utf8"
)
var _ = log.Println
// contentProvider is an abstraction to treat matches for names and
// content with the same code.
type contentProvider struct {
id *indexData
stats *Stats
// mutable
err error
idx uint32
_data []byte
_nl []uint32
_nlBuf []uint32
_sects []DocumentSection
_sectBuf []DocumentSection
fileSize uint32
}
// setDocument skips to the given document.
func (p *contentProvider) setDocument(docID uint32) {
fileStart := p.id.boundaries[docID]
p.idx = docID
p.fileSize = p.id.boundaries[docID+1] - fileStart
p._nl = nil
p._sects = nil
p._data = nil
}
func (p *contentProvider) docSections() []DocumentSection {
if p._sects == nil {
var sz uint32
p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
p.stats.ContentBytesLoaded += int64(sz)
p._sectBuf = p._sects
}
return p._sects
}
func (p *contentProvider) newlines() []uint32 {
if p._nl == nil {
var sz uint32
p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
p._nlBuf = p._nl
p.stats.ContentBytesLoaded += int64(sz)
}
return p._nl
}
func (p *contentProvider) data(fileName bool) []byte {
if fileName {
return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
}
if p._data == nil {
p._data, p.err = p.id.readContents(p.idx)
p.stats.FilesLoaded++
p.stats.ContentBytesLoaded += int64(len(p._data))
}
return p._data
}
// Find offset in bytes (relative to corpus start) for an offset in
// runes (relative to document start). If filename is set, the corpus
// is the set of filenames, with the document being the name itself.
func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
if p.id.metaData.PlainASCII {
return r
}
sample := p.id.runeOffsets
runeEnds := p.id.fileEndRunes
fileStartByte := p.id.boundaries[p.idx]
if filename {
sample = p.id.fileNameRuneOffsets
runeEnds = p.id.fileNameEndRunes
fileStartByte = p.id.fileNameIndex[p.idx]
}
absR := r
if p.idx > 0 {
absR += runeEnds[p.idx-1]
}
byteOff := sample[absR/runeOffsetFrequency]
left := absR % runeOffsetFrequency
var data []byte
if filename {
data = p.id.fileNameContent[byteOff:]
} else {
data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
if p.err != nil {
return 0
}
}
for left > 0 {
_, sz := utf8.DecodeRune(data)
byteOff += uint32(sz)
data = data[sz:]
left--
}
byteOff -= fileStartByte
return byteOff
}
func (p *contentProvider) fillMatches(ms []*candidateMatch) []LineMatch {
var result []LineMatch
if ms[0].fileName {
// There is only "line" in a filename.
res := LineMatch{
Line: p.id.fileName(p.idx),
FileName: true,
}
for _, m := range ms {
res.LineFragments = append(res.LineFragments, LineFragmentMatch{
LineOffset: int(m.byteOffset),
MatchLength: int(m.byteMatchSz),
Offset: m.byteOffset,
})
result = []LineMatch{res}
}
} else {
ms = breakMatchesOnNewlines(ms, p.data(false))
result = p.fillContentMatches(ms)
}
sects := p.docSections()
for i, m := range result {
result[i].Score = matchScore(sects, &m)
}
return result
}
func (p *contentProvider) fillContentMatches(ms []*candidateMatch) []LineMatch {
var result []LineMatch
for len(ms) > 0 {
m := ms[0]
num, lineStart, lineEnd := m.line(p.newlines(), p.fileSize)
var lineCands []*candidateMatch
endMatch := m.byteOffset + m.byteMatchSz
for len(ms) > 0 {
m := ms[0]
if int(m.byteOffset) <= lineEnd {
endMatch = m.byteOffset + m.byteMatchSz
lineCands = append(lineCands, m)
ms = ms[1:]
} else {
break
}
}
if len(lineCands) == 0 {
log.Panicf(
"%s %v infinite loop: num %d start,end %d,%d, offset %d",
p.id.fileName(p.idx), p.id.metaData,
num, lineStart, lineEnd,
m.byteOffset)
}
data := p.data(false)
// Due to merging matches, we may have a match that
// crosses a line boundary. Prevent confusion by
// taking lines until we pass the last match
for lineEnd < len(data) && endMatch > uint32(lineEnd) {
next := bytes.IndexByte(data[lineEnd+1:], '\n')
if next == -1 {
lineEnd = len(data)
} else {
// TODO(hanwen): test that checks "+1" part here.
lineEnd += next + 1
}
}
finalMatch := LineMatch{
LineStart: lineStart,
LineEnd: lineEnd,
LineNumber: num,
}
finalMatch.Line = data[lineStart:lineEnd]
for _, m := range lineCands {
fragment := LineFragmentMatch{
Offset: m.byteOffset,
LineOffset: int(m.byteOffset) - lineStart,
MatchLength: int(m.byteMatchSz),
}
finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
}
result = append(result, finalMatch)
}
return result
}
const (
// TODO - how to scale this relative to rank?
scorePartialWordMatch = 50.0
scoreWordMatch = 500.0
scoreImportantThreshold = 2000.0
scorePartialSymbol = 4000.0
scoreSymbol = 7000.0
scoreFactorAtomMatch = 400.0
scoreShardRankFactor = 20.0
scoreFileOrderFactor = 10.0
scoreLineOrderFactor = 1.0
)
func findSection(secs []DocumentSection, off, sz uint32) *DocumentSection {
j := sort.Search(len(secs), func(i int) bool {
return secs[i].End >= off+sz
})
if j == len(secs) {
return nil
}
if secs[j].Start <= off && off+sz <= secs[j].End {
return &secs[j]
}
return nil
}
func matchScore(secs []DocumentSection, m *LineMatch) float64 {
var maxScore float64
for _, f := range m.LineFragments {
startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 || byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset]))
end := int(f.LineOffset) + f.MatchLength
endBoundary := end > 0 && (end == len(m.Line) || byteClass(m.Line[end-1]) != byteClass(m.Line[end]))
score := 0.0
if startBoundary && endBoundary {
score = scoreWordMatch
} else if startBoundary || endBoundary {
score = scorePartialWordMatch
}
sec := findSection(secs, f.Offset, uint32(f.MatchLength))
if sec != nil {
startMatch := sec.Start == f.Offset
endMatch := sec.End == f.Offset+uint32(f.MatchLength)
if startMatch && endMatch {
score += scoreSymbol
} else if startMatch || endMatch {
score += (scoreSymbol + scorePartialSymbol) / 2
} else {
score += scorePartialSymbol
}
}
if score > maxScore {
maxScore = score
}
}
return maxScore
}
type matchScoreSlice []LineMatch
func (m matchScoreSlice) Len() int { return len(m) }
func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
type fileMatchSlice []FileMatch
func (m fileMatchSlice) Len() int { return len(m) }
func (m fileMatchSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
func (m fileMatchSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }
func sortMatchesByScore(ms []LineMatch) {
sort.Sort(matchScoreSlice(ms))
}
// Sort a slice of results.
func SortFilesByScore(ms []FileMatch) {
sort.Sort(fileMatchSlice(ms))
}