blob: 62f7df79d32eb973af6a036762d25c9527fce5f9 [file] [log] [blame]
/*
* Copyright 2012 gitblit.com.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.gitblit.service;
import static org.eclipse.jgit.treewalk.filter.TreeFilter.ANY_DIFF;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.MessageFormat;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.DateTools.Resolution;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.eclipse.jgit.diff.DiffEntry.ChangeType;
import org.eclipse.jgit.lib.Constants;
import org.eclipse.jgit.lib.FileMode;
import org.eclipse.jgit.lib.ObjectId;
import org.eclipse.jgit.lib.ObjectLoader;
import org.eclipse.jgit.lib.ObjectReader;
import org.eclipse.jgit.lib.Repository;
import org.eclipse.jgit.lib.RepositoryCache.FileKey;
import org.eclipse.jgit.revwalk.RevCommit;
import org.eclipse.jgit.revwalk.RevTree;
import org.eclipse.jgit.revwalk.RevWalk;
import org.eclipse.jgit.storage.file.FileBasedConfig;
import org.eclipse.jgit.treewalk.EmptyTreeIterator;
import org.eclipse.jgit.treewalk.TreeWalk;
import org.eclipse.jgit.util.FS;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.gitblit.Constants.SearchObjectType;
import com.gitblit.GitBlit;
import com.gitblit.IStoredSettings;
import com.gitblit.Keys;
import com.gitblit.manager.FilestoreManager;
import com.gitblit.manager.IFilestoreManager;
import com.gitblit.manager.IRepositoryManager;
import com.gitblit.models.PathModel.PathChangeModel;
import com.gitblit.models.RefModel;
import com.gitblit.models.RepositoryModel;
import com.gitblit.models.SearchResult;
import com.gitblit.utils.ArrayUtils;
import com.gitblit.utils.JGitUtils;
import com.gitblit.utils.StringUtils;
/**
* The Lucene service handles indexing and searching repositories.
*
* @author James Moger
*
*/
public class LuceneService implements Runnable {
private static final int INDEX_VERSION = 6;
private static final String FIELD_OBJECT_TYPE = "type";
private static final String FIELD_PATH = "path";
private static final String FIELD_COMMIT = "commit";
private static final String FIELD_BRANCH = "branch";
private static final String FIELD_SUMMARY = "summary";
private static final String FIELD_CONTENT = "content";
private static final String FIELD_AUTHOR = "author";
private static final String FIELD_COMMITTER = "committer";
private static final String FIELD_DATE = "date";
private static final String FIELD_TAG = "tag";
private static final String CONF_FILE = "lucene.conf";
private static final String LUCENE_DIR = "lucene";
private static final String CONF_INDEX = "index";
private static final String CONF_VERSION = "version";
private static final String CONF_ALIAS = "aliases";
private static final String CONF_BRANCH = "branches";
private static final Version LUCENE_VERSION = Version.LUCENE_4_10_0;
private final Logger logger = LoggerFactory.getLogger(LuceneService.class);
private final IStoredSettings storedSettings;
private final IRepositoryManager repositoryManager;
private final IFilestoreManager filestoreManager;
private final File repositoriesFolder;
private final Map<String, IndexSearcher> searchers = new ConcurrentHashMap<String, IndexSearcher>();
private final Map<String, IndexWriter> writers = new ConcurrentHashMap<String, IndexWriter>();
private final String luceneIgnoreExtensions = "7z arc arj bin bmp dll doc docx exe gif gz jar jpg lib lzh odg odf odt pdf ppt png so swf xcf xls xlsx zip";
private Set<String> excludedExtensions;
/**
 * Creates the Lucene service.
 *
 * @param settings the stored Gitblit settings (tolerated as null)
 * @param repositoryManager manager used to enumerate and open repositories
 * @param filestoreManager manager used to resolve filestore (LFS) blobs
 */
public LuceneService(
		IStoredSettings settings,
		IRepositoryManager repositoryManager,
		IFilestoreManager filestoreManager) {

	this.storedSettings = settings;
	this.repositoryManager = repositoryManager;
	this.filestoreManager = filestoreManager;
	this.repositoriesFolder = repositoryManager.getRepositoriesFolder();

	// fall back to the built-in extension blacklist when no settings are available
	String extensions = (settings == null)
			? luceneIgnoreExtensions
			: settings.getString(Keys.web.luceneIgnoreExtensions, luceneIgnoreExtensions);
	excludedExtensions = new TreeSet<String>(StringUtils.getStringsFromValue(extensions));
}
/**
* Run is executed by the Gitblit executor service. Because this is called
* by an executor service, calls will queue - i.e. there can never be
* concurrent execution of repository index updates.
*/
/**
 * Run is executed by the Gitblit executor service. Because this is called
 * by an executor service, calls will queue - i.e. there can never be
 * concurrent execution of repository index updates.
 */
@Override
public void run() {
	if (!storedSettings.getBoolean(Keys.web.allowLuceneIndexing, true)) {
		// Lucene indexing is disabled
		return;
	}

	// reload the excluded extensions
	String exts = storedSettings.getString(Keys.web.luceneIgnoreExtensions, luceneIgnoreExtensions);
	excludedExtensions = new TreeSet<String>(StringUtils.getStringsFromValue(exts));

	if (repositoryManager.isCollectingGarbage()) {
		// busy collecting garbage, try again later
		return;
	}

	for (String repositoryName : repositoryManager.getRepositoryList()) {
		RepositoryModel model = repositoryManager.getRepositoryModel(repositoryName);
		if (!model.hasCommits || ArrayUtils.isEmpty(model.indexedBranches)) {
			// nothing to index for this repository
			continue;
		}
		Repository repository = repositoryManager.getRepository(model.name);
		if (repository == null) {
			if (repositoryManager.isCollectingGarbage(model.name)) {
				logger.info(MessageFormat.format("Skipping Lucene index of {0}, busy garbage collecting", repositoryName));
			}
			continue;
		}
		index(model, repository);
		repository.close();
		// encourage reclamation of the memory used while walking the repository
		System.gc();
	}
}
/**
* Synchronously indexes a repository. This may build a complete index of a
* repository or it may update an existing index.
*
* @param displayName
* the name of the repository
* @param repository
* the repository object
*/
/**
 * Synchronously indexes a repository. This may build a complete index of a
 * repository or it may update an existing index.
 *
 * @param model the repository model
 * @param repository the repository object
 */
private void index(RepositoryModel model, Repository repository) {
	try {
		// a version mismatch in the on-disk config forces a full rebuild
		final boolean rebuild = shouldReindex(repository);
		final IndexResult result = rebuild
				? reindex(model, repository)
				: updateIndex(model, repository);

		if (!result.success) {
			String msg = rebuild
					? "Could not build {0} Lucene index!"
					: "Could not update {0} Lucene index!";
			logger.error(MessageFormat.format(msg, model.name));
		} else if (result.commitCount > 0) {
			String msg = rebuild
					? "Built {0} Lucene index from {1} commits and {2} files across {3} branches in {4} secs"
					: "Updated {0} Lucene index with {1} commits and {2} files across {3} branches in {4} secs";
			logger.info(MessageFormat.format(msg, model.name, result.commitCount,
					result.blobCount, result.branchCount, result.duration()));
		}
	} catch (Throwable t) {
		logger.error(MessageFormat.format("Lucene indexing failure for {0}", model.name), t);
	}
}
/**
* Close the writer/searcher objects for a repository.
*
* @param repositoryName
*/
/**
 * Close the writer/searcher objects for a repository.
 *
 * @param repositoryName the repository whose cached searcher/writer are closed
 */
public synchronized void close(String repositoryName) {
	// evict and close the cached searcher, if any
	IndexSearcher searcher = searchers.remove(repositoryName);
	try {
		if (searcher != null) {
			searcher.getIndexReader().close();
		}
	} catch (Exception e) {
		logger.error("Failed to close index searcher for " + repositoryName, e);
	}

	// evict and close the cached writer, if any
	IndexWriter writer = writers.remove(repositoryName);
	try {
		if (writer != null) {
			writer.close();
		}
	} catch (Exception e) {
		logger.error("Failed to close index writer for " + repositoryName, e);
	}
}
/**
* Close all Lucene indexers.
*
*/
/**
 * Close all Lucene indexers (writers and searchers) and clear the caches.
 * Failures on individual indexes are logged and do not abort the shutdown.
 */
public synchronized void close() {
	// close all writers (entrySet avoids the redundant keySet+get lookup)
	for (Map.Entry<String, IndexWriter> entry : writers.entrySet()) {
		try {
			// true => wait for pending merges before closing
			entry.getValue().close(true);
		} catch (Throwable t) {
			logger.error("Failed to close Lucene writer for " + entry.getKey(), t);
		}
	}
	writers.clear();

	// close all searchers
	for (Map.Entry<String, IndexSearcher> entry : searchers.entrySet()) {
		try {
			entry.getValue().getIndexReader().close();
		} catch (Throwable t) {
			logger.error("Failed to close Lucene searcher for " + entry.getKey(), t);
		}
	}
	searchers.clear();
}
/**
* Deletes the Lucene index for the specified repository.
*
* @param repositoryName
* @return true, if successful
*/
/**
 * Deletes the Lucene index for the specified repository.
 *
 * @param repositoryName the repository whose index/config are destroyed
 * @return true, if successful
 * @throws RuntimeException wrapping any IOException from recursive deletion
 */
public boolean deleteIndex(String repositoryName) {
	try {
		// close any open writer/searcher
		close(repositoryName);

		// delete the index folder
		File repositoryFolder = FileKey.resolve(new File(repositoriesFolder, repositoryName), FS.DETECTED);
		File luceneIndex = new File(repositoryFolder, LUCENE_DIR);
		if (luceneIndex.exists()) {
			org.eclipse.jgit.util.FileUtils.delete(luceneIndex,
					org.eclipse.jgit.util.FileUtils.RECURSIVE);
		}

		// delete the config file
		// File.delete() fails silently; the original ignored its return value
		File luceneConfig = new File(repositoryFolder, CONF_FILE);
		if (luceneConfig.exists() && !luceneConfig.delete()) {
			logger.warn(MessageFormat.format("Failed to delete Lucene config {0}", luceneConfig));
		}
		return true;
	} catch (IOException e) {
		throw new RuntimeException(e);
	}
}
/**
* Returns the author for the commit, if this information is available.
*
* @param commit
* @return an author or unknown
*/
/**
 * Returns the author for the commit, if this information is available.
 * Falls back to the author email address when the name is empty, and to
 * "unknown" when the ident is missing entirely.
 *
 * @param commit the commit to inspect
 * @return an author name, an email address, or "unknown"
 */
private String getAuthor(RevCommit commit) {
	String name = "unknown";
	try {
		name = commit.getAuthorIdent().getName();
		if (StringUtils.isEmpty(name)) {
			name = commit.getAuthorIdent().getEmailAddress();
		}
	} catch (NullPointerException n) {
		// deliberately swallowed: a missing ident leaves the best value
		// gathered so far (possibly the "unknown" default)
	}
	return name;
}
/**
* Returns the committer for the commit, if this information is available.
*
* @param commit
* @return an committer or unknown
*/
/**
 * Returns the committer for the commit, if this information is available.
 * Falls back to the committer email address when the name is empty, and to
 * "unknown" when the ident is missing entirely.
 *
 * @param commit the commit to inspect
 * @return a committer name, an email address, or "unknown"
 */
private String getCommitter(RevCommit commit) {
	String name = "unknown";
	try {
		name = commit.getCommitterIdent().getName();
		if (StringUtils.isEmpty(name)) {
			name = commit.getCommitterIdent().getEmailAddress();
		}
	} catch (NullPointerException n) {
		// deliberately swallowed: a missing ident leaves the best value
		// gathered so far (possibly the "unknown" default)
	}
	return name;
}
/**
* Get the tree associated with the given commit.
*
* @param walk
* @param commit
* @return tree
* @throws IOException
*/
/**
 * Get the tree associated with the given commit, parsing the commit
 * headers on demand when the tree has not been loaded yet.
 *
 * @param walk the walk used to parse commit headers
 * @param commit the commit whose tree is requested
 * @return the commit's tree
 * @throws IOException on repository read failure
 */
private RevTree getTree(final RevWalk walk, final RevCommit commit)
		throws IOException {
	RevTree tree = commit.getTree();
	if (tree == null) {
		// headers not parsed yet; parse and re-read the tree
		walk.parseHeaders(commit);
		tree = commit.getTree();
	}
	return tree;
}
/**
* Construct a keyname from the branch.
*
* @param branchName
* @return a keyname appropriate for the Git config file format
*/
/**
 * Construct a keyname from the branch.  The SHA-1 of the branch name is
 * used so that arbitrary ref names are safe to use as Git config keys.
 *
 * @param branchName the fully qualified branch name
 * @return a keyname appropriate for the Git config file format
 */
private String getBranchKey(String branchName) {
	return StringUtils.getSHA1(branchName);
}
/**
* Returns the Lucene configuration for the specified repository.
*
* @param repository
* @return a config object
*/
/**
 * Returns the Lucene configuration for the specified repository.  The
 * config lives in CONF_FILE inside the repository's .git directory.
 *
 * @param repository the repository to resolve the config for
 * @return a config object (not yet loaded)
 */
private FileBasedConfig getConfig(Repository repository) {
	return new FileBasedConfig(new File(repository.getDirectory(), CONF_FILE), FS.detect());
}
/**
* Reads the Lucene config file for the repository to check the index
* version. If the index version is different, then rebuild the repository
* index.
*
* @param repository
* @return true of the on-disk index format is different than INDEX_VERSION
*/
/**
 * Reads the Lucene config file for the repository to check the index
 * version. If the index version is different, then rebuild the repository
 * index.
 *
 * @param repository the repository to check
 * @return true if the on-disk index format is different than INDEX_VERSION
 */
private boolean shouldReindex(Repository repository) {
	try {
		FileBasedConfig config = getConfig(repository);
		config.load();
		// reindex whenever the recorded version differs from the code's version
		return config.getInt(CONF_INDEX, CONF_VERSION, 0) != INDEX_VERSION;
	} catch (Throwable t) {
		// missing or unreadable config => treat the repository as unindexed
		return true;
	}
}
/**
* This completely indexes the repository and will destroy any existing
* index.
*
* @param repositoryName
* @param repository
* @return IndexResult
*/
/**
 * This completely indexes the repository and will destroy any existing
 * index.  For each indexed branch it records the branch alias and tip in
 * the Lucene config so that subsequent runs can incrementally update via
 * updateIndex().  Blobs are attributed to the most recent non-merge commit
 * that touched them; commits are indexed once even if reachable from
 * several branches.
 *
 * @param model the repository model (name and indexed-branch settings)
 * @param repository the open repository object
 * @return IndexResult with commit/blob/branch counts and a success flag
 */
public IndexResult reindex(RepositoryModel model, Repository repository) {
	IndexResult result = new IndexResult();
	// destroy any existing index and config; abort on failure
	if (!deleteIndex(model.name)) {
		return result;
	}
	try {
		String [] encodings = storedSettings.getStrings(Keys.web.blobEncodings).toArray(new String[0]);
		FileBasedConfig config = getConfig(repository);
		// guards against indexing the same commit twice across branches
		Set<String> indexedCommits = new TreeSet<String>();
		IndexWriter writer = getIndexWriter(model.name);
		// build a quick lookup of tags, keyed by tagged-commit name
		Map<String, List<String>> tags = new HashMap<String, List<String>>();
		for (RefModel tag : JGitUtils.getTags(repository, false, -1)) {
			if (!tag.isAnnotatedTag()) {
				// skip non-annotated tags
				continue;
			}
			if (!tags.containsKey(tag.getReferencedObjectId().getName())) {
				tags.put(tag.getReferencedObjectId().getName(), new ArrayList<String>());
			}
			tags.get(tag.getReferencedObjectId().getName()).add(tag.displayName);
		}
		ObjectReader reader = repository.newObjectReader();
		// get the local branches
		List<RefModel> branches = JGitUtils.getLocalBranches(repository, true, -1);
		// sort them by most recently updated
		Collections.sort(branches, new Comparator<RefModel>() {
			@Override
			public int compare(RefModel ref1, RefModel ref2) {
				return ref2.getDate().compareTo(ref1.getDate());
			}
		});
		// reorder default branch to first position
		RefModel defaultBranch = null;
		ObjectId defaultBranchId = JGitUtils.getDefaultBranch(repository);
		for (RefModel branch : branches) {
			if (branch.getObjectId().equals(defaultBranchId)) {
				defaultBranch = branch;
				break;
			}
		}
		branches.remove(defaultBranch);
		branches.add(0, defaultBranch);
		// walk through each branch
		for (RefModel branch : branches) {
			boolean indexBranch = false;
			if (model.indexedBranches.contains(com.gitblit.Constants.DEFAULT_BRANCH)
					&& branch.equals(defaultBranch)) {
				// indexing "default" branch
				indexBranch = true;
			} else if (branch.getName().startsWith(com.gitblit.Constants.R_META)) {
				// skip internal meta branches
				indexBranch = false;
			} else {
				// normal explicit branch check
				indexBranch = model.indexedBranches.contains(branch.getName());
			}
			// if this branch is not specifically indexed then skip
			if (!indexBranch) {
				continue;
			}
			String branchName = branch.getName();
			// NOTE(review): the RevWalks and TreeWalks below are never
			// released/closed - confirm whether this JGit version requires it
			RevWalk revWalk = new RevWalk(reader);
			RevCommit tip = revWalk.parseCommit(branch.getObjectId());
			String tipId = tip.getId().getName();
			// record branch alias and tip so updateIndex() can resume from here
			String keyName = getBranchKey(branchName);
			config.setString(CONF_ALIAS, null, keyName, branchName);
			config.setString(CONF_BRANCH, null, keyName, tipId);
			// index the blob contents of the tree
			TreeWalk treeWalk = new TreeWalk(repository);
			treeWalk.addTree(tip.getTree());
			treeWalk.setRecursive(true);
			// all paths at the branch tip; entries are removed once indexed
			Map<String, ObjectId> paths = new TreeMap<String, ObjectId>();
			while (treeWalk.next()) {
				// ensure path is not in a submodule
				if (treeWalk.getFileMode(0) != FileMode.GITLINK) {
					paths.put(treeWalk.getPathString(), treeWalk.getObjectId(0));
				}
			}
			ByteArrayOutputStream os = new ByteArrayOutputStream();
			byte[] tmp = new byte[32767];
			RevWalk commitWalk = new RevWalk(reader);
			commitWalk.markStart(tip);
			RevCommit commit;
			// walk history newest-first until every tip path has been
			// attributed to the commit that last touched it
			while ((paths.size() > 0) && (commit = commitWalk.next()) != null) {
				TreeWalk diffWalk = new TreeWalk(reader);
				int parentCount = commit.getParentCount();
				switch (parentCount) {
				case 0:
					// root commit: diff against an empty tree
					diffWalk.addTree(new EmptyTreeIterator());
					break;
				case 1:
					diffWalk.addTree(getTree(commitWalk, commit.getParent(0)));
					break;
				default:
					// skip merge commits
					continue;
				}
				diffWalk.addTree(getTree(commitWalk, commit));
				diffWalk.setFilter(ANY_DIFF);
				diffWalk.setRecursive(true);
				while ((paths.size() > 0) && diffWalk.next()) {
					String path = diffWalk.getPathString();
					if (!paths.containsKey(path)) {
						continue;
					}
					//TODO: Figure out filestore oid the path - bit more involved than updating the index
					// remove path from set
					ObjectId blobId = paths.remove(path);
					result.blobCount++;
					// index the blob metadata
					String blobAuthor = getAuthor(commit);
					String blobCommitter = getCommitter(commit);
					String blobDate = DateTools.timeToString(commit.getCommitTime() * 1000L,
							Resolution.MINUTE);
					Document doc = new Document();
					doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.blob.name(), StringField.TYPE_STORED));
					doc.add(new Field(FIELD_BRANCH, branchName, TextField.TYPE_STORED));
					doc.add(new Field(FIELD_COMMIT, commit.getName(), TextField.TYPE_STORED));
					doc.add(new Field(FIELD_PATH, path, TextField.TYPE_STORED));
					doc.add(new Field(FIELD_DATE, blobDate, StringField.TYPE_STORED));
					doc.add(new Field(FIELD_AUTHOR, blobAuthor, TextField.TYPE_STORED));
					doc.add(new Field(FIELD_COMMITTER, blobCommitter, TextField.TYPE_STORED));
					// determine extension to compare to the extension
					// blacklist
					String ext = null;
					String name = path.toLowerCase();
					if (name.indexOf('.') > -1) {
						ext = name.substring(name.lastIndexOf('.') + 1);
					}
					// index the blob content
					if (StringUtils.isEmpty(ext) || !excludedExtensions.contains(ext)) {
						ObjectLoader ldr = repository.open(blobId, Constants.OBJ_BLOB);
						InputStream in = ldr.openStream();
						int n;
						while ((n = in.read(tmp)) > 0) {
							os.write(tmp, 0, n);
						}
						in.close();
						byte[] content = os.toByteArray();
						String str = StringUtils.decodeString(content, encodings);
						doc.add(new Field(FIELD_CONTENT, str, TextField.TYPE_STORED));
						// reuse the buffer for the next blob
						os.reset();
					}
					// add the blob to the index
					writer.addDocument(doc);
				}
			}
			os.close();
			// index the tip commit object
			if (indexedCommits.add(tipId)) {
				Document doc = createDocument(tip, tags.get(tipId));
				doc.add(new Field(FIELD_BRANCH, branchName, TextField.TYPE_STORED));
				writer.addDocument(doc);
				result.commitCount += 1;
				result.branchCount += 1;
			}
			// traverse the log and index the previous commit objects
			RevWalk historyWalk = new RevWalk(reader);
			historyWalk.markStart(historyWalk.parseCommit(tip.getId()));
			RevCommit rev;
			while ((rev = historyWalk.next()) != null) {
				String hash = rev.getId().getName();
				if (indexedCommits.add(hash)) {
					Document doc = createDocument(rev, tags.get(hash));
					doc.add(new Field(FIELD_BRANCH, branchName, TextField.TYPE_STORED));
					writer.addDocument(doc);
					result.commitCount += 1;
				}
			}
		}
		// finished
		reader.close();
		// commit all changes and reset the searcher
		config.setInt(CONF_INDEX, null, CONF_VERSION, INDEX_VERSION);
		config.save();
		writer.commit();
		resetIndexSearcher(model.name);
		result.success();
	} catch (Exception e) {
		logger.error("Exception while reindexing " + model.name, e);
	}
	return result;
}
/**
* Incrementally update the index with the specified commit for the
* repository.
*
* @param repositoryName
* @param repository
* @param branch
* the fully qualified branch name (e.g. refs/heads/master)
* @param commit
* @return true, if successful
*/
/**
 * Incrementally update the index with the specified commit for the
 * repository.  Each changed path's previously indexed blob document is
 * deleted and, unless the change was a deletion, re-indexed; finally the
 * commit document itself is written.
 *
 * @param repositoryName the name of the repository
 * @param repository the repository object
 * @param branch
 *            the fully qualified branch name (e.g. refs/heads/master)
 * @param commit the commit whose changes are indexed
 * @return the index result; success reflects writing the commit document
 */
private IndexResult index(String repositoryName, Repository repository,
		String branch, RevCommit commit) {
	IndexResult result = new IndexResult();
	try {
		String [] encodings = storedSettings.getStrings(Keys.web.blobEncodings).toArray(new String[0]);
		List<PathChangeModel> changedPaths = JGitUtils.getFilesInCommit(repository, commit);
		String revDate = DateTools.timeToString(commit.getCommitTime() * 1000L,
				Resolution.MINUTE);
		IndexWriter writer = getIndexWriter(repositoryName);
		for (PathChangeModel path : changedPaths) {
			if (path.isSubmodule()) {
				continue;
			}
			// delete the indexed blob
			deleteBlob(repositoryName, branch, path.name);

			// re-index the blob
			if (!ChangeType.DELETE.equals(path.changeType)) {
				result.blobCount++;
				Document doc = new Document();
				doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.blob.name(), StringField.TYPE_STORED));
				doc.add(new Field(FIELD_BRANCH, branch, TextField.TYPE_STORED));
				doc.add(new Field(FIELD_COMMIT, commit.getName(), TextField.TYPE_STORED));
				doc.add(new Field(FIELD_PATH, path.path, TextField.TYPE_STORED));
				doc.add(new Field(FIELD_DATE, revDate, StringField.TYPE_STORED));
				doc.add(new Field(FIELD_AUTHOR, getAuthor(commit), TextField.TYPE_STORED));
				doc.add(new Field(FIELD_COMMITTER, getCommitter(commit), TextField.TYPE_STORED));

				// determine extension to compare to the extension blacklist
				String ext = null;
				String name = path.name.toLowerCase();
				if (name.indexOf('.') > -1) {
					ext = name.substring(name.lastIndexOf('.') + 1);
				}

				if (StringUtils.isEmpty(ext) || !excludedExtensions.contains(ext)) {
					String str = "";
					// read the blob content
					if (path.isFilestoreItem()) {
						// extract text from the filestore (LFS) blob with Tika
						// NOTE(review): PDFParser is used for every filestore
						// item regardless of content type - confirm whether the
						// (imported) AutoDetectParser was intended here
						BodyContentHandler handler = new BodyContentHandler();
						Metadata metadata = new Metadata();
						PDFParser parser = new PDFParser();
						ParseContext parseContext = new ParseContext();
						File lfsFile = filestoreManager.getStoragePath(path.getFilestoreOid());
						// try-with-resources: the original never closed this
						// stream, leaking a file handle per indexed LFS blob
						try (FileInputStream inputstream = new FileInputStream(lfsFile)) {
							parser.parse(inputstream, handler, metadata, parseContext);
						}
						str = handler.toString();
					} else {
						str = JGitUtils.getStringContent(repository, commit.getTree(),
								path.path, encodings);
					}
					if (str != null) {
						doc.add(new Field(FIELD_CONTENT, str, TextField.TYPE_STORED));
						writer.addDocument(doc);
					}
				}
			}
		}
		writer.commit();

		// get any annotated commit tags
		List<String> commitTags = new ArrayList<String>();
		for (RefModel ref : JGitUtils.getTags(repository, false, -1)) {
			if (ref.isAnnotatedTag() && ref.getReferencedObjectId().equals(commit.getId())) {
				commitTags.add(ref.displayName);
			}
		}

		// create and write the Lucene document for the commit itself
		Document doc = createDocument(commit, commitTags);
		doc.add(new Field(FIELD_BRANCH, branch, TextField.TYPE_STORED));
		result.commitCount++;
		result.success = index(repositoryName, doc);
	} catch (Exception e) {
		logger.error(MessageFormat.format("Exception while indexing commit {0} in {1}", commit.getId().getName(), repositoryName), e);
	}
	return result;
}
/**
* Delete a blob from the specified branch of the repository index.
*
* @param repositoryName
* @param branch
* @param path
* @throws Exception
* @return true, if deleted, false if no record was deleted
*/
/**
 * Delete a blob from the specified branch of the repository index.
 *
 * @param repositoryName the repository whose index is modified
 * @param branch the fully qualified branch name
 * @param path the blob path to delete
 * @throws Exception on query-parse or index-write failure
 * @return true, if deleted, false if no record was deleted
 */
public boolean deleteBlob(String repositoryName, String branch, String path) throws Exception {
	// two-stage MessageFormat: the outer format injects the field names;
	// the '{'N'}' sequences are MessageFormat-escaped literal {N} tokens
	// which become placeholders for the second format pass below.  branch
	// and path are double-quoted so the QueryParser treats them as phrases.
	String pattern = MessageFormat.format("{0}:'{'0} AND {1}:\"'{'1'}'\" AND {2}:\"'{'2'}'\"", FIELD_OBJECT_TYPE, FIELD_BRANCH, FIELD_PATH);
	String q = MessageFormat.format(pattern, SearchObjectType.blob.name(), branch, path);

	BooleanQuery query = new BooleanQuery();
	StandardAnalyzer analyzer = new StandardAnalyzer(LUCENE_VERSION);
	QueryParser qp = new QueryParser(LUCENE_VERSION, FIELD_SUMMARY, analyzer);
	query.add(qp.parse(q), Occur.MUST);

	IndexWriter writer = getIndexWriter(repositoryName);
	// deletion success is detected by comparing document counts, since
	// deleteDocuments() itself does not report how many docs matched
	int numDocsBefore = writer.numDocs();
	writer.deleteDocuments(query);
	writer.commit();
	int numDocsAfter = writer.numDocs();
	if (numDocsBefore == numDocsAfter) {
		logger.debug(MessageFormat.format("no records found to delete {0}", query.toString()));
		return false;
	} else {
		logger.debug(MessageFormat.format("deleted {0} records with {1}", numDocsBefore - numDocsAfter, query.toString()));
		return true;
	}
}
/**
* Updates a repository index incrementally from the last indexed commits.
*
* @param model
* @param repository
* @return IndexResult
*/
/**
 * Updates a repository index incrementally from the last indexed commits
 * recorded in the per-repository Lucene config.  Branches that vanished
 * since the last run have their documents purged from the index.
 *
 * @param model the repository model (name and indexed-branch settings)
 * @param repository the open repository object
 * @return IndexResult with commit/blob/branch counts and a success flag
 */
private IndexResult updateIndex(RepositoryModel model, Repository repository) {
	IndexResult result = new IndexResult();
	try {
		FileBasedConfig config = getConfig(repository);
		config.load();

		// build a quick lookup of annotated tags, keyed by the name of the
		// referenced (tagged) commit
		Map<String, List<String>> tags = new HashMap<String, List<String>>();
		for (RefModel tag : JGitUtils.getTags(repository, false, -1)) {
			if (!tag.isAnnotatedTag()) {
				// skip non-annotated tags
				continue;
			}
			// FIX: the guard previously checked tag.getObjectId() while the
			// map is keyed by tag.getReferencedObjectId() (as reindex() does);
			// the mismatch recreated the list - dropping earlier tag names -
			// whenever a commit carried multiple annotated tags
			String taggedId = tag.getReferencedObjectId().getName();
			if (!tags.containsKey(taggedId)) {
				tags.put(taggedId, new ArrayList<String>());
			}
			tags.get(taggedId).add(tag.displayName);
		}

		// detect branch deletion
		// first assume all branches are deleted and then remove each
		// existing branch from deletedBranches during indexing
		Set<String> deletedBranches = new TreeSet<String>();
		for (String alias : config.getNames(CONF_ALIAS)) {
			String branch = config.getString(CONF_ALIAS, null, alias);
			deletedBranches.add(branch);
		}

		// get the local branches
		List<RefModel> branches = JGitUtils.getLocalBranches(repository, true, -1);
		// sort them by most recently updated
		Collections.sort(branches, new Comparator<RefModel>() {
			@Override
			public int compare(RefModel ref1, RefModel ref2) {
				return ref2.getDate().compareTo(ref1.getDate());
			}
		});

		// reorder default branch to first position
		RefModel defaultBranch = null;
		ObjectId defaultBranchId = JGitUtils.getDefaultBranch(repository);
		for (RefModel branch : branches) {
			if (branch.getObjectId().equals(defaultBranchId)) {
				defaultBranch = branch;
				break;
			}
		}
		branches.remove(defaultBranch);
		branches.add(0, defaultBranch);

		// walk through each branch
		for (RefModel branch : branches) {
			String branchName = branch.getName();
			boolean indexBranch = false;
			if (model.indexedBranches.contains(com.gitblit.Constants.DEFAULT_BRANCH)
					&& branch.equals(defaultBranch)) {
				// indexing "default" branch
				indexBranch = true;
			} else if (branch.getName().startsWith(com.gitblit.Constants.R_META)) {
				// ignore internal meta branches
				indexBranch = false;
			} else {
				// normal explicit branch check
				indexBranch = model.indexedBranches.contains(branch.getName());
			}
			// if this branch is not specifically indexed then skip
			if (!indexBranch) {
				continue;
			}

			// remove this branch from the deletedBranches set
			deletedBranches.remove(branchName);

			// determine last commit
			String keyName = getBranchKey(branchName);
			String lastCommit = config.getString(CONF_BRANCH, null, keyName);

			List<RevCommit> revs;
			if (StringUtils.isEmpty(lastCommit)) {
				// new branch/unindexed branch, get all commits on branch
				revs = JGitUtils.getRevLog(repository, branchName, 0, -1);
			} else {
				// pre-existing branch, get changes since last commit
				revs = JGitUtils.getRevLog(repository, lastCommit, branchName);
			}
			if (revs.size() > 0) {
				result.branchCount += 1;
			}

			// reverse the list of commits so we start with the first commit
			Collections.reverse(revs);
			for (RevCommit commit : revs) {
				// index a commit
				result.add(index(model.name, repository, branchName, commit));
			}

			// update the config
			config.setInt(CONF_INDEX, null, CONF_VERSION, INDEX_VERSION);
			config.setString(CONF_ALIAS, null, keyName, branchName);
			config.setString(CONF_BRANCH, null, keyName, branch.getObjectId().getName());
			config.save();
		}

		// the deletedBranches set will normally be empty by this point
		// unless a branch really was deleted and no longer exists
		if (deletedBranches.size() > 0) {
			// hoisted out of the loop: the writer is invariant, and a single
			// commit after all deletions yields the same final index state
			IndexWriter writer = getIndexWriter(model.name);
			for (String branch : deletedBranches) {
				writer.deleteDocuments(new Term(FIELD_BRANCH, branch));
			}
			writer.commit();
		}
		result.success = true;
	} catch (Throwable t) {
		logger.error(MessageFormat.format("Exception while updating {0} Lucene index", model.name), t);
	}
	return result;
}
/**
* Creates a Lucene document for a commit
*
* @param commit
* @param tags
* @return a Lucene document
*/
/**
 * Creates a Lucene document for a commit: type, id, date, author,
 * committer, short message (summary), full message (content), and the
 * flattened annotated-tag names, if any.
 *
 * @param commit the commit to index
 * @param tags annotated tag names referencing this commit (may be null/empty)
 * @return a Lucene document
 */
private Document createDocument(RevCommit commit, List<String> tags) {
	Document doc = new Document();
	String date = DateTools.timeToString(commit.getCommitTime() * 1000L,
			Resolution.MINUTE);
	doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.commit.name(), StringField.TYPE_STORED));
	doc.add(new Field(FIELD_COMMIT, commit.getName(), TextField.TYPE_STORED));
	doc.add(new Field(FIELD_DATE, date, StringField.TYPE_STORED));
	doc.add(new Field(FIELD_AUTHOR, getAuthor(commit), TextField.TYPE_STORED));
	doc.add(new Field(FIELD_COMMITTER, getCommitter(commit), TextField.TYPE_STORED));
	doc.add(new Field(FIELD_SUMMARY, commit.getShortMessage(), TextField.TYPE_STORED));
	doc.add(new Field(FIELD_CONTENT, commit.getFullMessage(), TextField.TYPE_STORED));
	if (!ArrayUtils.isEmpty(tags)) {
		doc.add(new Field(FIELD_TAG, StringUtils.flattenStrings(tags), TextField.TYPE_STORED));
	}
	return doc;
}
/**
* Incrementally index an object for the repository.
*
* @param repositoryName
* @param doc
* @return true, if successful
*/
/**
 * Incrementally index an object for the repository: write the document,
 * commit, and invalidate the cached searcher so the change is visible.
 *
 * @param repositoryName the repository whose index receives the document
 * @param doc the prepared Lucene document
 * @return true, if successful
 */
private boolean index(String repositoryName, Document doc) {
	try {
		IndexWriter writer = getIndexWriter(repositoryName);
		writer.addDocument(doc);
		writer.commit();
		// drop the cached searcher so subsequent searches see this document
		resetIndexSearcher(repositoryName);
		return true;
	} catch (Exception e) {
		logger.error(MessageFormat.format("Exception while incrementally updating {0} Lucene index", repositoryName), e);
		return false;
	}
}
/**
 * Builds a SearchResult from a Lucene hit document.
 *
 * @param doc the matched Lucene document
 * @param score the relevance score of the hit
 * @param hitId the 1-indexed position of this hit within the page
 * @param totalHits the total number of hits for the query
 * @return the populated search result
 * @throws ParseException if the stored date field cannot be parsed
 */
private SearchResult createSearchResult(Document doc, float score, int hitId, int totalHits) throws ParseException {
	SearchResult result = new SearchResult();
	// hit bookkeeping
	result.hitId = hitId;
	result.totalHits = totalHits;
	result.score = score;
	// object identity
	result.type = SearchObjectType.fromName(doc.get(FIELD_OBJECT_TYPE));
	result.branch = doc.get(FIELD_BRANCH);
	result.commitId = doc.get(FIELD_COMMIT);
	result.path = doc.get(FIELD_PATH);
	// metadata
	result.date = DateTools.stringToDate(doc.get(FIELD_DATE));
	result.summary = doc.get(FIELD_SUMMARY);
	result.author = doc.get(FIELD_AUTHOR);
	result.committer = doc.get(FIELD_COMMITTER);
	String flattenedTags = doc.get(FIELD_TAG);
	if (flattenedTags != null) {
		result.tags = StringUtils.getStringsFromValue(flattenedTags);
	}
	return result;
}
/**
 * Evicts and closes the cached searcher for the repository, if one exists,
 * so that the next search opens a fresh reader over the latest commits.
 *
 * @param repository the repository whose searcher is invalidated
 * @throws IOException if closing the underlying reader fails
 */
private synchronized void resetIndexSearcher(String repository) throws IOException {
	final IndexSearcher stale = searchers.remove(repository);
	if (stale == null) {
		return;
	}
	stale.getIndexReader().close();
}
/**
* Gets an index searcher for the repository.
*
* @param repository
* @return
* @throws IOException
*/
/**
 * Gets an index searcher for the repository, lazily creating and caching
 * one over the repository's index writer if none exists yet.
 *
 * NOTE(review): this check-then-put on the concurrent map is not atomic -
 * concurrent callers could each build a searcher and one reader would leak;
 * confirm whether callers are already serialized.
 *
 * @param repository the repository to search
 * @return a cached or newly opened searcher
 * @throws IOException on index open failure
 */
private IndexSearcher getIndexSearcher(String repository) throws IOException {
	IndexSearcher searcher = searchers.get(repository);
	if (searcher == null) {
		IndexWriter writer = getIndexWriter(repository);
		// open a reader on the writer (true => apply pending deletes)
		searcher = new IndexSearcher(DirectoryReader.open(writer, true));
		searchers.put(repository, searcher);
	}
	return searcher;
}
/**
* Gets an index writer for the repository. The index will be created if it
* does not already exist or if forceCreate is specified.
*
* @param repository
* @return an IndexWriter
* @throws IOException
*/
/**
 * Gets an index writer for the repository. The index will be created if it
 * does not already exist.  Writers are cached per repository.
 *
 * @param repository the repository name
 * @return an IndexWriter
 * @throws IOException on index open/create failure
 */
private IndexWriter getIndexWriter(String repository) throws IOException {
	IndexWriter indexWriter = writers.get(repository);
	if (indexWriter == null) {
		File repositoryFolder = FileKey.resolve(new File(repositoriesFolder, repository), FS.DETECTED);
		File indexFolder = new File(repositoryFolder, LUCENE_DIR);
		if (!indexFolder.exists()) {
			indexFolder.mkdirs();
		}
		// only open the Directory when a writer must actually be created;
		// the original opened it unconditionally and leaked it whenever a
		// cached writer was returned
		Directory directory = FSDirectory.open(indexFolder);
		StandardAnalyzer analyzer = new StandardAnalyzer(LUCENE_VERSION);
		IndexWriterConfig config = new IndexWriterConfig(LUCENE_VERSION, analyzer);
		config.setOpenMode(OpenMode.CREATE_OR_APPEND);
		indexWriter = new IndexWriter(directory, config);
		writers.put(repository, indexWriter);
	}
	return indexWriter;
}
/**
 * Searches the specified repositories for the given text or query.
 *
 * @param text
 *            if the text is null or empty, null is returned
 * @param page
 *            the page number to retrieve. page is 1-indexed.
 * @param pageSize
 *            the number of elements to return for this page
 * @param repositories
 *            a list of repositories to search. if no repositories are
 *            specified null is returned.
 * @return a list of SearchResults in order from highest to the lowest score
 */
public List<SearchResult> search(String text, int page, int pageSize, List<String> repositories) {
	if (ArrayUtils.isEmpty(repositories)) {
		return null;
	}
	// delegate to the varargs overload
	String [] repos = repositories.toArray(new String[repositories.size()]);
	return search(text, page, pageSize, repos);
}
/**
 * Searches the specified repositories for the given text or query.
 *
 * @param text
 *            if the text is null or empty, null is returned
 * @param page
 *            the page number to retrieve. page is 1-indexed.
 * @param pageSize
 *            the number of elements to return for this page
 * @param repositories
 *            a list of repositories to search. if no repositories are
 *            specified null is returned.
 * @return a list of SearchResults in order from highest to the lowest score
 *
 */
public List<SearchResult> search(String text, int page, int pageSize, String... repositories) {
	if (StringUtils.isEmpty(text)) {
		return null;
	}
	if (ArrayUtils.isEmpty(repositories)) {
		return null;
	}
	// LinkedHashSet preserves score ordering while de-duplicating results
	Set<SearchResult> results = new LinkedHashSet<SearchResult>();
	StandardAnalyzer analyzer = new StandardAnalyzer(LUCENE_VERSION);
	try {
		// default search checks summary and content; either field matching
		// is sufficient (Occur.SHOULD on both clauses)
		BooleanQuery query = new BooleanQuery();
		QueryParser qp;
		qp = new QueryParser(LUCENE_VERSION, FIELD_SUMMARY, analyzer);
		qp.setAllowLeadingWildcard(true);
		query.add(qp.parse(text), Occur.SHOULD);
		qp = new QueryParser(LUCENE_VERSION, FIELD_CONTENT, analyzer);
		qp.setAllowLeadingWildcard(true);
		query.add(qp.parse(text), Occur.SHOULD);
		IndexSearcher searcher;
		if (repositories.length == 1) {
			// single repository search uses the cached per-repo searcher
			searcher = getIndexSearcher(repositories[0]);
		} else {
			// multiple repository search: combine the cached per-repo
			// readers into one composite reader
			List<IndexReader> readers = new ArrayList<IndexReader>();
			for (String repository : repositories) {
				IndexSearcher repositoryIndex = getIndexSearcher(repository);
				readers.add(repositoryIndex.getIndexReader());
			}
			IndexReader[] rdrs = readers.toArray(new IndexReader[readers.size()]);
			// NOTE(review): the MultiSourceReader wrapper itself is never
			// closed; its sub-readers are cached searchers and are
			// constructed with closeSubReaders=false — confirm the wrapper
			// holds no resources of its own
			MultiSourceReader reader = new MultiSourceReader(rdrs);
			searcher = new IndexSearcher(reader);
		}
		// rewrite expands wildcard/prefix queries against the index terms
		Query rewrittenQuery = searcher.rewrite(query);
		logger.debug(rewrittenQuery.toString());
		// collect up to 5000 hits, then slice out the requested page
		TopScoreDocCollector collector = TopScoreDocCollector.create(5000, true);
		searcher.search(rewrittenQuery, collector);
		int offset = Math.max(0, (page - 1) * pageSize);
		ScoreDoc[] hits = collector.topDocs(offset, pageSize).scoreDocs;
		int totalHits = collector.getTotalHits();
		for (int i = 0; i < hits.length; i++) {
			int docId = hits[i].doc;
			Document doc = searcher.doc(docId);
			// hitId is the absolute 1-based rank across all pages
			SearchResult result = createSearchResult(doc, hits[i].score, offset + i + 1, totalHits);
			if (repositories.length == 1) {
				// single repository search
				result.repository = repositories[0];
			} else {
				// multi-repository search: map the composite doc id back to
				// the repository whose reader produced it
				MultiSourceReader reader = (MultiSourceReader) searcher.getIndexReader();
				int index = reader.getSourceIndex(docId);
				result.repository = repositories[index];
			}
			String content = doc.get(FIELD_CONTENT);
			result.fragment = getHighlightedFragment(analyzer, query, content, result);
			results.add(result);
		}
	} catch (Exception e) {
		// best-effort: log and return whatever was collected so far
		logger.error(MessageFormat.format("Exception while searching for {0}", text), e);
	}
	return new ArrayList<SearchResult>(results);
}
/**
 * Builds the highlighted HTML fragment(s) for a search result by running
 * the query's scorer over the stored content. Matched terms are wrapped in
 * {@code <span class="highlight">} tags; blob fragments are emitted as
 * prettyprint blocks with line numbers, commit fragments as plain text.
 *
 * @param analyzer
 * @param query
 * @param content stored content of the matched document; may be null
 * @param result the result being decorated (type and path are read)
 * @return html fragment markup, or "" for a blob with no highlightable match
 * @throws IOException
 * @throws InvalidTokenOffsetsException
 */
private String getHighlightedFragment(Analyzer analyzer, Query query,
	String content, SearchResult result) throws IOException, InvalidTokenOffsetsException {
	if (content == null) {
		content = "";
	}
	int tabLength = storedSettings.getInteger(Keys.web.tabLength, 4);
	// commits get longer fragments than blobs
	int fragmentLength = SearchObjectType.commit == result.type ? 512 : 150;
	QueryScorer scorer = new QueryScorer(query, "content");
	Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, fragmentLength);
	// use an artificial delimiter for the token; it is swapped for real
	// html tags only after html-escaping the fragment text below
	String termTag = "!!--[";
	String termTagEnd = "]--!!";
	SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(termTag, termTagEnd);
	Highlighter highlighter = new Highlighter(formatter, scorer);
	highlighter.setTextFragmenter(fragmenter);
	String [] fragments = highlighter.getBestFragments(analyzer, "content", content, 3);
	if (ArrayUtils.isEmpty(fragments)) {
		if (SearchObjectType.blob == result.type) {
			// no highlightable match in a blob: show nothing
			return "";
		}
		// clip commit message
		String fragment = content;
		if (fragment.length() > fragmentLength) {
			fragment = fragment.substring(0, fragmentLength) + "...";
		}
		return "<pre class=\"text\">" + StringUtils.escapeForHtml(fragment, true, tabLength) + "</pre>";
	}
	// make sure we have unique fragments
	Set<String> uniqueFragments = new LinkedHashSet<String>();
	for (String fragment : fragments) {
		uniqueFragments.add(fragment);
	}
	fragments = uniqueFragments.toArray(new String[uniqueFragments.size()]);
	StringBuilder sb = new StringBuilder();
	for (int i = 0, len = fragments.length; i < len; i++) {
		String fragment = fragments[i];
		String tag = "<pre class=\"text\">";
		// resurrect the raw fragment from removing the artificial delimiters
		String raw = fragment.replace(termTag, "").replace(termTagEnd, "");
		// determine position of the raw fragment in the content
		// NOTE(review): indexOf can return -1 if the highlighter altered the
		// text; pos then becomes -1 and the substring calls below would
		// misbehave — confirm this cannot happen with these settings
		int pos = content.indexOf(raw);
		// restore complete first line of fragment: walk back to the
		// preceding newline
		int c = pos;
		while (c > 0) {
			c--;
			if (content.charAt(c) == '\n') {
				break;
			}
		}
		if (c > 0) {
			// inject leading chunk of first fragment line
			fragment = content.substring(c + 1, pos) + fragment;
		}
		if (SearchObjectType.blob == result.type) {
			// count lines as offset into the content for this fragment
			int line = Math.max(1, StringUtils.countLines(content.substring(0, pos)));
			// create fragment tag with line number and language
			String lang = "";
			String ext = StringUtils.getFileExtension(result.path).toLowerCase();
			if (!StringUtils.isEmpty(ext)) {
				// maintain leading space!
				lang = " lang-" + ext;
			}
			tag = MessageFormat.format("<pre class=\"prettyprint linenums:{0,number,0}{1}\">", line, lang);
		}
		sb.append(tag);
		// replace the artificial delimiter with html tags
		String html = StringUtils.escapeForHtml(fragment, false);
		html = html.replace(termTag, "<span class=\"highlight\">").replace(termTagEnd, "</span>");
		sb.append(html);
		sb.append("</pre>");
		if (i < len - 1) {
			// separate consecutive fragments with an ellipsis
			sb.append("<span class=\"ellipses\">...</span><br/>");
		}
	}
	return sb.toString();
}
/**
 * Simple class to track the results of an index update.
 */
private class IndexResult {
	long startTime = System.currentTimeMillis();
	long endTime = startTime;
	boolean success;
	int branchCount;
	int commitCount;
	int blobCount;

	// accumulate the counters of another result into this one
	void add(IndexResult result) {
		branchCount += result.branchCount;
		commitCount += result.commitCount;
		blobCount += result.blobCount;
	}

	// mark the update as successful and stamp the completion time
	void success() {
		endTime = System.currentTimeMillis();
		success = true;
	}

	// elapsed time in seconds
	float duration() {
		return (endTime - startTime) / 1000f;
	}
}
/**
 * Custom subclass of MultiReader which can identify the source index for a
 * given doc id. This would not be necessary if there were a public method
 * to obtain this information.
 */
private class MultiSourceReader extends MultiReader {

	MultiSourceReader(IndexReader [] readers) {
		// closeSubReaders=false: the sub-readers are owned by the cached
		// per-repository searchers
		super(readers, false);
	}

	// maps a composite doc id back to the position of the reader that
	// produced it, or -1 if the lookup fails
	int getSourceIndex(int docId) {
		try {
			return super.readerIndex(docId);
		} catch (Exception e) {
			logger.error("Error getting source index", e);
		}
		return -1;
	}
}
}