| // Copyright 2016 Google Inc. All rights reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| /* zoekt-repo-index indexes a repo-based repository. The constituent |
| git repositories should already have been downloaded to the |
| --repo_cache directory, eg. |
| |
| go install github.com/google/zoekt/cmd/zoekt-repo-index && |
| |
| zoekt-repo-index -base_url https://gfiber.googlesource.com/ \ |
| -manifest_repo_url https://gfiber.googlesource.com/manifests \ |
| -manifest_rev_prefix=refs/heads/ \ |
| -rev_prefix="refs/remotes/" \ |
| -repo_cache ~/zoekt-serving/repos/ \ |
| -shard_limit 50000000 \ |
| master:default_unrestricted.xml |
| */ |
| package main |
| |
| import ( |
| "crypto/sha1" |
| "flag" |
| "fmt" |
| "io/ioutil" |
| "log" |
| "net/url" |
| "path" |
| "path/filepath" |
| "reflect" |
| "sort" |
| "strings" |
| |
| "github.com/google/slothfs/manifest" |
| "github.com/google/zoekt" |
| "github.com/google/zoekt/build" |
| "github.com/google/zoekt/gitindex" |
| "go.uber.org/automaxprocs/maxprocs" |
| |
| git "gopkg.in/src-d/go-git.v4" |
| "gopkg.in/src-d/go-git.v4/plumbing" |
| ) |
| |
| var _ = log.Println |
| |
| type fileKey struct { |
| SubRepoPath string |
| Path string |
| ID plumbing.Hash |
| } |
| |
| func (k *fileKey) FullPath() string { |
| return filepath.Join(k.SubRepoPath, k.Path) |
| } |
| |
| type branchFile struct { |
| branch, file string |
| mf *manifest.Manifest |
| manifestPath string |
| } |
| |
| func parseBranches(manifestRepoURL, revPrefix string, cache *gitindex.RepoCache, args []string) ([]branchFile, error) { |
| var branches []branchFile |
| if manifestRepoURL != "" { |
| u, err := url.Parse(manifestRepoURL) |
| if err != nil { |
| return nil, err |
| } |
| |
| repo, err := cache.Open(u) |
| if err != nil { |
| return nil, err |
| } |
| |
| for _, f := range args { |
| fs := strings.SplitN(f, ":", 2) |
| if len(fs) != 2 { |
| return nil, fmt.Errorf("cannot parse %q as BRANCH:FILE", f) |
| } |
| mf, err := getManifest(repo, revPrefix+fs[0], fs[1]) |
| if err != nil { |
| return nil, fmt.Errorf("manifest %s:%s: %v", fs[0], fs[1], err) |
| } |
| |
| branches = append(branches, branchFile{ |
| branch: fs[0], |
| file: fs[1], |
| mf: mf, |
| manifestPath: cache.Path(u), |
| }) |
| } |
| } else { |
| if len(args) == 0 { |
| return nil, fmt.Errorf("must give XML file argument") |
| } |
| for _, f := range args { |
| mf, err := manifest.ParseFile(f) |
| if err != nil { |
| return nil, err |
| } |
| |
| branches = append(branches, branchFile{ |
| branch: "HEAD", |
| file: filepath.Base(f), |
| mf: mf, |
| manifestPath: f, |
| }) |
| } |
| } |
| return branches, nil |
| } |
| |
| func main() { |
| var sizeMax = flag.Int("file_limit", 128<<10, "maximum file size") |
| var shardLimit = flag.Int("shard_limit", 100<<20, "maximum corpus size for a shard") |
| var parallelism = flag.Int("parallelism", 1, "maximum number of parallel indexing processes") |
| |
| revPrefix := flag.String("rev_prefix", "refs/remotes/origin/", "prefix for references") |
| baseURLStr := flag.String("base_url", "", "base url to interpret repository names") |
| repoCacheDir := flag.String("repo_cache", "", "root for repository cache") |
| indexDir := flag.String("index", build.DefaultDir, "index directory for *.zoekt files") |
| manifestRepoURL := flag.String("manifest_repo_url", "", "set a URL for a git repository holding manifest XML file. Provide the BRANCH:XML-FILE as further command-line arguments") |
| manifestRevPrefix := flag.String("manifest_rev_prefix", "refs/remotes/origin/", "prefixes for branches in manifest repository") |
| repoName := flag.String("name", "", "set repository name") |
| repoURL := flag.String("url", "", "set repository URL") |
| maxSubProjects := flag.Int("max_sub_projects", 0, "trim number of projects in manifest, for debugging.") |
| incremental := flag.Bool("incremental", true, "only index if the repository has changed.") |
| flag.Parse() |
| |
| // Tune GOMAXPROCS to match Linux container CPU quota. |
| maxprocs.Set() |
| |
| if *repoCacheDir == "" { |
| log.Fatal("must set --repo_cache") |
| } |
| repoCache := gitindex.NewRepoCache(*repoCacheDir) |
| |
| if u, err := url.Parse(*baseURLStr); err != nil { |
| log.Fatalf("Parse(%q): %v", u, err) |
| } else if *repoName == "" { |
| *repoName = filepath.Join(u.Host, u.Path) |
| } |
| |
| opts := build.Options{ |
| Parallelism: *parallelism, |
| SizeMax: *sizeMax, |
| ShardMax: *shardLimit, |
| IndexDir: *indexDir, |
| RepositoryDescription: zoekt.Repository{ |
| Name: *repoName, |
| URL: *repoURL, |
| }, |
| } |
| opts.SetDefaults() |
| baseURL, err := url.Parse(*baseURLStr) |
| if err != nil { |
| log.Fatalf("Parse baseURL %q: %v", *baseURLStr, err) |
| } |
| |
| branches, err := parseBranches(*manifestRepoURL, *manifestRevPrefix, repoCache, flag.Args()) |
| if err != nil { |
| log.Fatalf("parseBranches(%s, %s): %v", *manifestRepoURL, *manifestRevPrefix, err) |
| } |
| if len(branches) == 0 { |
| log.Fatal("must specify at least one branch") |
| } |
| if *maxSubProjects > 0 { |
| for _, b := range branches { |
| if *maxSubProjects < len(b.mf.Project) { |
| b.mf.Project = b.mf.Project[:*maxSubProjects] |
| } |
| } |
| } |
| |
| perBranch := map[string]map[fileKey]gitindex.BlobLocation{} |
| opts.SubRepositories = map[string]*zoekt.Repository{} |
| |
| // branch => repo => version |
| versionMap := map[string]map[string]plumbing.Hash{} |
| for _, br := range branches { |
| br.mf.Filter() |
| files, versions, err := iterateManifest(br.mf, *baseURL, *revPrefix, repoCache) |
| if err != nil { |
| log.Fatalf("iterateManifest: %v", err) |
| } |
| |
| perBranch[br.branch] = files |
| for key, loc := range files { |
| _, ok := opts.SubRepositories[key.SubRepoPath] |
| if ok { |
| // This can be incorrect: if the layout of manifests |
| // changes across branches, then the same file could |
| // be in different subRepos. We'll pretend this is not |
| // a problem. |
| continue |
| } |
| |
| desc := &zoekt.Repository{} |
| if err := gitindex.SetTemplatesFromOrigin(desc, loc.URL); err != nil { |
| log.Fatalf("SetTemplatesFromOrigin(%s): %v", loc.URL, err) |
| } |
| |
| opts.SubRepositories[key.SubRepoPath] = desc |
| } |
| versionMap[br.branch] = versions |
| } |
| |
| for _, br := range branches { |
| var paths []string |
| for p := range opts.SubRepositories { |
| paths = append(paths, p) |
| } |
| sort.Strings(paths) |
| |
| // Compute a version of the aggregate. This version |
| // has nothing to do with git, but will let us do |
| // incrementality correctly. |
| hasher := sha1.New() |
| for _, p := range paths { |
| repo := opts.SubRepositories[p] |
| id := versionMap[br.branch][p] |
| |
| // it is possible that 'id' is zero, if this |
| // branch of the manifest doesn't have this |
| // particular subrepository. |
| hasher.Write([]byte(p)) |
| hasher.Write([]byte(id.String())) |
| repo.Branches = append(repo.Branches, zoekt.RepositoryBranch{ |
| Name: br.branch, |
| Version: id.String(), |
| }) |
| } |
| |
| opts.RepositoryDescription.Branches = append(opts.RepositoryDescription.Branches, zoekt.RepositoryBranch{ |
| Name: br.branch, |
| Version: fmt.Sprintf("%x", hasher.Sum(nil)), |
| }) |
| } |
| |
| // key => branch |
| all := map[fileKey][]string{} |
| for br, files := range perBranch { |
| for k := range files { |
| all[k] = append(all[k], br) |
| } |
| } |
| |
| if *incremental { |
| versions := opts.IndexVersions() |
| if reflect.DeepEqual(versions, opts.RepositoryDescription.Branches) { |
| return |
| } |
| } |
| |
| builder, err := build.NewBuilder(opts) |
| if err != nil { |
| log.Fatal(err) |
| } |
| for k, branches := range all { |
| loc := perBranch[branches[0]][k] |
| data, err := loc.Blob(&k.ID) |
| if err != nil { |
| log.Fatal(err) |
| } |
| |
| doc := zoekt.Document{ |
| Name: k.FullPath(), |
| Content: data, |
| SubRepositoryPath: k.SubRepoPath, |
| } |
| |
| doc.Branches = append(doc.Branches, branches...) |
| if err := builder.Add(doc); err != nil { |
| log.Printf("Add(%s): %v", doc.Name, err) |
| break |
| } |
| } |
| if err := builder.Finish(); err != nil { |
| log.Fatalf("Finish: %v", err) |
| } |
| } |
| |
| // getManifest parses the manifest XML at the given branch/path inside a Git repository. |
| func getManifest(repo *git.Repository, branch, path string) (*manifest.Manifest, error) { |
| ref, err := repo.Reference(plumbing.ReferenceName("refs/heads/"+branch), true) |
| if err != nil { |
| return nil, err |
| } |
| |
| commit, err := repo.CommitObject(ref.Hash()) |
| if err != nil { |
| return nil, err |
| } |
| |
| tree, err := repo.TreeObject(commit.TreeHash) |
| if err != nil { |
| return nil, err |
| } |
| |
| entry, err := tree.FindEntry(path) |
| if err != nil { |
| return nil, err |
| } |
| |
| blob, err := repo.BlobObject(entry.Hash) |
| if err != nil { |
| return nil, err |
| } |
| r, err := blob.Reader() |
| if err != nil { |
| return nil, err |
| } |
| defer r.Close() |
| |
| content, _ := ioutil.ReadAll(r) |
| return manifest.Parse(content) |
| } |
| |
| // iterateManifest constructs a complete tree from the given Manifest. |
| func iterateManifest(mf *manifest.Manifest, |
| baseURL url.URL, revPrefix string, |
| cache *gitindex.RepoCache) (map[fileKey]gitindex.BlobLocation, map[string]plumbing.Hash, error) { |
| allFiles := map[fileKey]gitindex.BlobLocation{} |
| allVersions := map[string]plumbing.Hash{} |
| for _, p := range mf.Project { |
| rev := mf.ProjectRevision(&p) |
| |
| projURL := baseURL |
| projURL.Path = path.Join(projURL.Path, p.Name) |
| |
| topRepo, err := cache.Open(&projURL) |
| if err != nil { |
| return nil, nil, err |
| } |
| |
| ref, err := topRepo.Reference(plumbing.ReferenceName(revPrefix+rev), true) |
| if err != nil { |
| return nil, nil, err |
| } |
| |
| commit, err := topRepo.CommitObject(ref.Hash()) |
| if err != nil { |
| return nil, nil, err |
| } |
| if err != nil { |
| return nil, nil, err |
| } |
| |
| allVersions[p.GetPath()] = commit.Hash |
| |
| tree, err := commit.Tree() |
| if err != nil { |
| return nil, nil, err |
| } |
| |
| files, versions, err := gitindex.TreeToFiles(topRepo, tree, projURL.String(), cache) |
| if err != nil { |
| return nil, nil, err |
| } |
| |
| for key, repo := range files { |
| allFiles[fileKey{ |
| SubRepoPath: filepath.Join(p.GetPath(), key.SubRepoPath), |
| Path: key.Path, |
| ID: key.ID, |
| }] = repo |
| } |
| |
| for path, version := range versions { |
| allVersions[filepath.Join(p.GetPath(), path)] = version |
| } |
| } |
| |
| return allFiles, allVersions, nil |
| } |