contentprovider.go - zoekt - Git at Google

 // Copyright 2016 Google Inc. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //    http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 package zoekt

 import (
 	"bytes"
 	"log"
 	"sort"
 	"unicode/utf8"
 )

 var _ = log.Println

 // contentProvider is an abstraction to treat matches for names and
 // content with the same code.
 type contentProvider struct {
 	id    *indexData
 	stats *Stats

 	// mutable
 	err      error
 	idx      uint32
 	_data    []byte
 	_nl      []uint32
 	_nlBuf   []uint32
 	_sects   []DocumentSection
 	_sectBuf []DocumentSection
 	fileSize uint32
 }

 // setDocument skips to the given document.
 func (p *contentProvider) setDocument(docID uint32) {
 	fileStart := p.id.boundaries[docID]

 	p.idx = docID
 	p.fileSize = p.id.boundaries[docID+1] - fileStart

 	p._nl = nil
 	p._sects = nil
 	p._data = nil
 }

 func (p *contentProvider) docSections() []DocumentSection {
 	if p._sects == nil {
 		var sz uint32
 		p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
 		p.stats.ContentBytesLoaded += int64(sz)
 		p._sectBuf = p._sects
 	}
 	return p._sects
 }

 func (p *contentProvider) newlines() []uint32 {
 	if p._nl == nil {
 		var sz uint32
 		p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
 		p._nlBuf = p._nl
 		p.stats.ContentBytesLoaded += int64(sz)
 	}
 	return p._nl
 }

 func (p *contentProvider) data(fileName bool) []byte {
 	if fileName {
 		return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
 	}

 	if p._data == nil {
 		p._data, p.err = p.id.readContents(p.idx)
 		p.stats.FilesLoaded++
 		p.stats.ContentBytesLoaded += int64(len(p._data))
 	}
 	return p._data
 }

 // Find offset in bytes (relative to corpus start) for an offset in
 // runes (relative to document start). If filename is set, the corpus
 // is the set of filenames, with the document being the name itself.
 func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
 	if p.id.metaData.PlainASCII {
 		return r
 	}

 	sample := p.id.runeOffsets
 	runeEnds := p.id.fileEndRunes
 	fileStartByte := p.id.boundaries[p.idx]
 	if filename {
 		sample = p.id.fileNameRuneOffsets
 		runeEnds = p.id.fileNameEndRunes
 		fileStartByte = p.id.fileNameIndex[p.idx]
 	}

 	absR := r
 	if p.idx > 0 {
 		absR += runeEnds[p.idx-1]
 	}

 	byteOff := sample[absR/runeOffsetFrequency]
 	left := absR % runeOffsetFrequency

 	var data []byte

 	if filename {
 		data = p.id.fileNameContent[byteOff:]
 	} else {
 		data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
 		if p.err != nil {
 			return 0
 		}
 	}
 	for left > 0 {
 		_, sz := utf8.DecodeRune(data)
 		byteOff += uint32(sz)
 		data = data[sz:]
 		left--
 	}

 	byteOff -= fileStartByte
 	return byteOff
 }

 func (p *contentProvider) fillMatches(ms []*candidateMatch) []LineMatch {
 	var result []LineMatch
 	if ms[0].fileName {
 		// There is only "line" in a filename.
 		res := LineMatch{
 			Line:     p.id.fileName(p.idx),
 			FileName: true,
 		}

 		for _, m := range ms {
 			res.LineFragments = append(res.LineFragments, LineFragmentMatch{
 				LineOffset:  int(m.byteOffset),
 				MatchLength: int(m.byteMatchSz),
 				Offset:      m.byteOffset,
 			})

 			result = []LineMatch{res}
 		}
 	} else {
 		ms = breakMatchesOnNewlines(ms, p.data(false))
 		result = p.fillContentMatches(ms)
 	}

 	sects := p.docSections()
 	for i, m := range result {
 		result[i].Score = matchScore(sects, &m)
 	}

 	return result
 }

 func (p *contentProvider) fillContentMatches(ms []*candidateMatch) []LineMatch {
 	var result []LineMatch
 	for len(ms) > 0 {
 		m := ms[0]
 		num, lineStart, lineEnd := m.line(p.newlines(), p.fileSize)

 		var lineCands []*candidateMatch

 		endMatch := m.byteOffset + m.byteMatchSz

 		for len(ms) > 0 {
 			m := ms[0]
 			if int(m.byteOffset) <= lineEnd {
 				endMatch = m.byteOffset + m.byteMatchSz
 				lineCands = append(lineCands, m)
 				ms = ms[1:]
 			} else {
 				break
 			}
 		}

 		if len(lineCands) == 0 {
 			log.Panicf(
 				"%s %v infinite loop: num %d start,end %d,%d, offset %d",
 				p.id.fileName(p.idx), p.id.metaData,
 				num, lineStart, lineEnd,
 				m.byteOffset)
 		}

 		data := p.data(false)

 		// Due to merging matches, we may have a match that
 		// crosses a line boundary. Prevent confusion by
 		// taking lines until we pass the last match
 		for lineEnd < len(data) && endMatch > uint32(lineEnd) {
 			next := bytes.IndexByte(data[lineEnd+1:], '\n')
 			if next == -1 {
 				lineEnd = len(data)
 			} else {
 				// TODO(hanwen): test that checks "+1" part here.
 				lineEnd += next + 1
 			}
 		}

 		finalMatch := LineMatch{
 			LineStart:  lineStart,
 			LineEnd:    lineEnd,
 			LineNumber: num,
 		}
 		finalMatch.Line = data[lineStart:lineEnd]

 		for _, m := range lineCands {
 			fragment := LineFragmentMatch{
 				Offset:      m.byteOffset,
 				LineOffset:  int(m.byteOffset) - lineStart,
 				MatchLength: int(m.byteMatchSz),
 			}
 			finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
 		}
 		result = append(result, finalMatch)
 	}
 	return result
 }

 const (
 	// TODO - how to scale this relative to rank?
 	scorePartialWordMatch   = 50.0
 	scoreWordMatch          = 500.0
 	scoreImportantThreshold = 2000.0
 	scorePartialSymbol      = 4000.0
 	scoreSymbol             = 7000.0
 	scoreFactorAtomMatch    = 400.0
 	scoreShardRankFactor    = 20.0
 	scoreFileOrderFactor    = 10.0
 	scoreLineOrderFactor    = 1.0
 )

 func findSection(secs []DocumentSection, off, sz uint32) *DocumentSection {
 	j := sort.Search(len(secs), func(i int) bool {
 		return secs[i].End >= off+sz
 	})

 	if j == len(secs) {
 		return nil
 	}

 	if secs[j].Start <= off && off+sz <= secs[j].End {
 		return &secs[j]
 	}
 	return nil
 }

 func matchScore(secs []DocumentSection, m *LineMatch) float64 {
 	var maxScore float64
 	for _, f := range m.LineFragments {
 		startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 || byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset]))

 		end := int(f.LineOffset) + f.MatchLength
 		endBoundary := end > 0 && (end == len(m.Line) || byteClass(m.Line[end-1]) != byteClass(m.Line[end]))

 		score := 0.0
 		if startBoundary && endBoundary {
 			score = scoreWordMatch
 		} else if startBoundary || endBoundary {
 			score = scorePartialWordMatch
 		}

 		sec := findSection(secs, f.Offset, uint32(f.MatchLength))
 		if sec != nil {
 			startMatch := sec.Start == f.Offset
 			endMatch := sec.End == f.Offset+uint32(f.MatchLength)
 			if startMatch && endMatch {
 				score += scoreSymbol
 			} else if startMatch || endMatch {
 				score += (scoreSymbol + scorePartialSymbol) / 2
 			} else {
 				score += scorePartialSymbol
 			}
 		}
 		if score > maxScore {
 			maxScore = score
 		}
 	}
 	return maxScore
 }

 type matchScoreSlice []LineMatch

 func (m matchScoreSlice) Len() int           { return len(m) }
 func (m matchScoreSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
 func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }

 type fileMatchSlice []FileMatch

 func (m fileMatchSlice) Len() int           { return len(m) }
 func (m fileMatchSlice) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
 func (m fileMatchSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }

 func sortMatchesByScore(ms []LineMatch) {
 	sort.Sort(matchScoreSlice(ms))
 }

 // Sort a slice of results.
 func SortFilesByScore(ms []FileMatch) {
 	sort.Sort(fileMatchSlice(ms))
 }
	// Copyright 2016 Google Inc. All rights reserved.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	package zoekt

	import (
	"bytes"
	"log"
	"sort"
	"unicode/utf8"
	)

	var _ = log.Println

	// contentProvider is an abstraction to treat matches for names and
	// content with the same code.
	type contentProvider struct {
	id *indexData
	stats *Stats

	// mutable
	err error
	idx uint32
	_data []byte
	_nl []uint32
	_nlBuf []uint32
	_sects []DocumentSection
	_sectBuf []DocumentSection
	fileSize uint32
	}

	// setDocument skips to the given document.
	func (p *contentProvider) setDocument(docID uint32) {
	fileStart := p.id.boundaries[docID]

	p.idx = docID
	p.fileSize = p.id.boundaries[docID+1] - fileStart

	p._nl = nil
	p._sects = nil
	p._data = nil
	}

	func (p *contentProvider) docSections() []DocumentSection {
	if p._sects == nil {
	var sz uint32
	p._sects, sz, p.err = p.id.readDocSections(p.idx, p._sectBuf)
	p.stats.ContentBytesLoaded += int64(sz)
	p._sectBuf = p._sects
	}
	return p._sects
	}

	func (p *contentProvider) newlines() []uint32 {
	if p._nl == nil {
	var sz uint32
	p._nl, sz, p.err = p.id.readNewlines(p.idx, p._nlBuf)
	p._nlBuf = p._nl
	p.stats.ContentBytesLoaded += int64(sz)
	}
	return p._nl
	}

	func (p *contentProvider) data(fileName bool) []byte {
	if fileName {
	return p.id.fileNameContent[p.id.fileNameIndex[p.idx]:p.id.fileNameIndex[p.idx+1]]
	}

	if p._data == nil {
	p._data, p.err = p.id.readContents(p.idx)
	p.stats.FilesLoaded++
	p.stats.ContentBytesLoaded += int64(len(p._data))
	}
	return p._data
	}

	// Find offset in bytes (relative to corpus start) for an offset in
	// runes (relative to document start). If filename is set, the corpus
	// is the set of filenames, with the document being the name itself.
	func (p *contentProvider) findOffset(filename bool, r uint32) uint32 {
	if p.id.metaData.PlainASCII {
	return r
	}

	sample := p.id.runeOffsets
	runeEnds := p.id.fileEndRunes
	fileStartByte := p.id.boundaries[p.idx]
	if filename {
	sample = p.id.fileNameRuneOffsets
	runeEnds = p.id.fileNameEndRunes
	fileStartByte = p.id.fileNameIndex[p.idx]
	}

	absR := r
	if p.idx > 0 {
	absR += runeEnds[p.idx-1]
	}

	byteOff := sample[absR/runeOffsetFrequency]
	left := absR % runeOffsetFrequency

	var data []byte

	if filename {
	data = p.id.fileNameContent[byteOff:]
	} else {
	data, p.err = p.id.readContentSlice(byteOff, 3*runeOffsetFrequency)
	if p.err != nil {
	return 0
	}
	}
	for left > 0 {
	_, sz := utf8.DecodeRune(data)
	byteOff += uint32(sz)
	data = data[sz:]
	left--
	}

	byteOff -= fileStartByte
	return byteOff
	}

	func (p contentProvider) fillMatches(ms []candidateMatch) []LineMatch {
	var result []LineMatch
	if ms[0].fileName {
	// There is only "line" in a filename.
	res := LineMatch{
	Line: p.id.fileName(p.idx),
	FileName: true,
	}

	for _, m := range ms {
	res.LineFragments = append(res.LineFragments, LineFragmentMatch{
	LineOffset: int(m.byteOffset),
	MatchLength: int(m.byteMatchSz),
	Offset: m.byteOffset,
	})

	result = []LineMatch{res}
	}
	} else {
	ms = breakMatchesOnNewlines(ms, p.data(false))
	result = p.fillContentMatches(ms)
	}

	sects := p.docSections()
	for i, m := range result {
	result[i].Score = matchScore(sects, &m)
	}

	return result
	}

	func (p contentProvider) fillContentMatches(ms []candidateMatch) []LineMatch {
	var result []LineMatch
	for len(ms) > 0 {
	m := ms[0]
	num, lineStart, lineEnd := m.line(p.newlines(), p.fileSize)

	var lineCands []*candidateMatch

	endMatch := m.byteOffset + m.byteMatchSz

	for len(ms) > 0 {
	m := ms[0]
	if int(m.byteOffset) <= lineEnd {
	endMatch = m.byteOffset + m.byteMatchSz
	lineCands = append(lineCands, m)
	ms = ms[1:]
	} else {
	break
	}
	}

	if len(lineCands) == 0 {
	log.Panicf(
	"%s %v infinite loop: num %d start,end %d,%d, offset %d",
	p.id.fileName(p.idx), p.id.metaData,
	num, lineStart, lineEnd,
	m.byteOffset)
	}

	data := p.data(false)

	// Due to merging matches, we may have a match that
	// crosses a line boundary. Prevent confusion by
	// taking lines until we pass the last match
	for lineEnd < len(data) && endMatch > uint32(lineEnd) {
	next := bytes.IndexByte(data[lineEnd+1:], '\n')
	if next == -1 {
	lineEnd = len(data)
	} else {
	// TODO(hanwen): test that checks "+1" part here.
	lineEnd += next + 1
	}
	}

	finalMatch := LineMatch{
	LineStart: lineStart,
	LineEnd: lineEnd,
	LineNumber: num,
	}
	finalMatch.Line = data[lineStart:lineEnd]

	for _, m := range lineCands {
	fragment := LineFragmentMatch{
	Offset: m.byteOffset,
	LineOffset: int(m.byteOffset) - lineStart,
	MatchLength: int(m.byteMatchSz),
	}
	finalMatch.LineFragments = append(finalMatch.LineFragments, fragment)
	}
	result = append(result, finalMatch)
	}
	return result
	}

	const (
	// TODO - how to scale this relative to rank?
	scorePartialWordMatch = 50.0
	scoreWordMatch = 500.0
	scoreImportantThreshold = 2000.0
	scorePartialSymbol = 4000.0
	scoreSymbol = 7000.0
	scoreFactorAtomMatch = 400.0
	scoreShardRankFactor = 20.0
	scoreFileOrderFactor = 10.0
	scoreLineOrderFactor = 1.0
	)

	func findSection(secs []DocumentSection, off, sz uint32) *DocumentSection {
	j := sort.Search(len(secs), func(i int) bool {
	return secs[i].End >= off+sz
	})

	if j == len(secs) {
	return nil
	}

	if secs[j].Start <= off && off+sz <= secs[j].End {
	return &secs[j]
	}
	return nil
	}

	func matchScore(secs []DocumentSection, m *LineMatch) float64 {
	var maxScore float64
	for _, f := range m.LineFragments {
	startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 \|\| byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset]))

	end := int(f.LineOffset) + f.MatchLength
	endBoundary := end > 0 && (end == len(m.Line) \|\| byteClass(m.Line[end-1]) != byteClass(m.Line[end]))

	score := 0.0
	if startBoundary && endBoundary {
	score = scoreWordMatch
	} else if startBoundary \|\| endBoundary {
	score = scorePartialWordMatch
	}

	sec := findSection(secs, f.Offset, uint32(f.MatchLength))
	if sec != nil {
	startMatch := sec.Start == f.Offset
	endMatch := sec.End == f.Offset+uint32(f.MatchLength)
	if startMatch && endMatch {
	score += scoreSymbol
	} else if startMatch \|\| endMatch {
	score += (scoreSymbol + scorePartialSymbol) / 2
	} else {
	score += scorePartialSymbol
	}
	}
	if score > maxScore {
	maxScore = score
	}
	}
	return maxScore
	}

	type matchScoreSlice []LineMatch

	func (m matchScoreSlice) Len() int { return len(m) }
	func (m matchScoreSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
	func (m matchScoreSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }

	type fileMatchSlice []FileMatch

	func (m fileMatchSlice) Len() int { return len(m) }
	func (m fileMatchSlice) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
	func (m fileMatchSlice) Less(i, j int) bool { return m[i].Score > m[j].Score }

	func sortMatchesByScore(ms []LineMatch) {
	sort.Sort(matchScoreSlice(ms))
	}

	// Sort a slice of results.
	func SortFilesByScore(ms []FileMatch) {
	sort.Sort(fileMatchSlice(ms))
	}