blob: 8754b4e773452aa98f670379348e3974fac7ad3a [file] [log] [blame]
// Command zoekt-archive-index indexes an archive.
//
// Example via github.com:
//
// zoekt-archive-index -incremental -commit b57cb1605fd11ba2ecfa7f68992b4b9cc791934d -name github.com/gorilla/mux -strip_components 1 https://codeload.github.com/gorilla/mux/legacy.tar.gz/b57cb1605fd11ba2ecfa7f68992b4b9cc791934d
//
// zoekt-archive-index -branch master https://github.com/gorilla/mux/commit/b57cb1605fd11ba2ecfa7f68992b4b9cc791934d
package main
import (
"errors"
"flag"
"fmt"
"io"
"io/ioutil"
"log"
"net/url"
"strings"
"github.com/google/zoekt"
"github.com/google/zoekt/build"
"github.com/google/zoekt/cmd"
"github.com/google/zoekt/gitindex"
"go.uber.org/automaxprocs/maxprocs"
)
// stripComponents removes the specified number of leading path
// elements. Pathnames with fewer elements will return the empty string.
func stripComponents(path string, count int) string {
for i := 0; path != "" && i < count; i++ {
i := strings.Index(path, "/")
if i < 0 {
return ""
}
path = path[i+1:]
}
return path
}
// isGitOID checks if the revision is a git OID SHA string.
//
// Note: This doesn't mean the SHA exists in a repository, nor does it mean it
// isn't a ref. Git allows 40-char hexadecimal strings to be references.
func isGitOID(s string) bool {
if len(s) != 40 {
return false
}
for _, r := range s {
if !(('0' <= r && r <= '9') ||
('a' <= r && r <= 'f') ||
('A' <= r && r <= 'F')) {
return false
}
}
return true
}
type Options struct {
Incremental bool
Archive string
Name string
RepoURL string
Branch string
Commit string
Strip int
}
func (o *Options) SetDefaults() {
// We guess based on the archive URL.
u, _ := url.Parse(o.Archive)
if u == nil {
return
}
setRef := func(ref string) {
if isGitOID(ref) && o.Commit == "" {
o.Commit = ref
}
if !isGitOID(ref) && o.Branch == "" {
o.Branch = ref
}
}
switch u.Host {
case "github.com", "codeload.github.com":
// https://github.com/octokit/octokit.rb/commit/3d21ec53a331a6f037a91c368710b99387d012c1
// https://github.com/octokit/octokit.rb/blob/master/README.md
// https://github.com/octokit/octokit.rb/tree/master/lib
// https://codeload.github.com/octokit/octokit.rb/legacy.tar.gz/master
parts := strings.Split(u.Path, "/")
if len(parts) > 2 && o.Name == "" {
o.Name = fmt.Sprintf("github.com/%s/%s", parts[1], parts[2])
o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", parts[1], parts[2])
}
if len(parts) > 4 {
setRef(parts[4])
if u.Host == "github.com" {
o.Archive = fmt.Sprintf("https://codeload.github.com/%s/%s/legacy.tar.gz/%s", parts[1], parts[2], parts[4])
}
}
o.Strip = 1
case "api.github.com":
// https://api.github.com/repos/octokit/octokit.rb/tarball/master
parts := strings.Split(u.Path, "/")
if len(parts) > 2 && o.Name == "" {
o.Name = fmt.Sprintf("github.com/%s/%s", parts[1], parts[2])
o.RepoURL = fmt.Sprintf("https://github.com/%s/%s", parts[1], parts[2])
}
if len(parts) > 5 {
setRef(parts[5])
}
o.Strip = 1
}
}
func do(opts Options, bopts build.Options) error {
opts.SetDefaults()
if opts.Name == "" && opts.RepoURL == "" {
return errors.New("-name or -url required")
}
if opts.Branch == "" {
return errors.New("-branch required")
}
if opts.Name != "" {
bopts.RepositoryDescription.Name = opts.Name
}
if opts.RepoURL != "" {
u, err := url.Parse(opts.RepoURL)
if err != nil {
return err
}
if err := gitindex.SetTemplatesFromOrigin(&bopts.RepositoryDescription, u); err != nil {
return err
}
}
bopts.SetDefaults()
bopts.RepositoryDescription.Branches = []zoekt.RepositoryBranch{{Name: opts.Branch, Version: opts.Commit}}
brs := []string{opts.Branch}
if opts.Incremental && bopts.IncrementalSkipIndexing() {
return nil
}
a, err := openArchive(opts.Archive)
if err != nil {
return err
}
defer a.Close()
bopts.RepositoryDescription.Source = opts.Archive
builder, err := build.NewBuilder(bopts)
if err != nil {
return err
}
add := func(f *File) error {
defer f.Close()
contents, err := ioutil.ReadAll(f)
if err != nil {
return err
}
name := stripComponents(f.Name, opts.Strip)
if name == "" {
return nil
}
return builder.Add(zoekt.Document{
Name: name,
Content: contents,
Branches: brs,
})
}
for {
f, err := a.Next()
if err == io.EOF {
break
}
if err != nil {
return err
}
if err := add(f); err != nil {
return err
}
}
return builder.Finish()
}
func main() {
var (
incremental = flag.Bool("incremental", true, "only index changed repositories")
name = flag.String("name", "", "The repository name for the archive")
urlRaw = flag.String("url", "", "The repository URL for the archive")
branch = flag.String("branch", "", "The branch name for the archive")
commit = flag.String("commit", "", "The commit sha for the archive. If incremental this will avoid updating shards already at commit")
strip = flag.Int("strip_components", 0, "Remove the specified number of leading path elements. Pathnames with fewer elements will be silently skipped.")
)
flag.Parse()
// Tune GOMAXPROCS to match Linux container CPU quota.
maxprocs.Set()
log.SetFlags(log.LstdFlags | log.Lshortfile)
if len(flag.Args()) != 1 {
log.Fatal("expected argument for archive location")
}
archive := flag.Args()[0]
bopts := cmd.OptionsFromFlags()
opts := Options{
Incremental: *incremental,
Archive: archive,
Name: *name,
RepoURL: *urlRaw,
Branch: *branch,
Commit: *commit,
Strip: *strip,
}
if err := do(opts, *bopts); err != nil {
log.Fatal(err)
}
}