| // Copyright 2016 Google Inc. All rights reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package main |
| |
| import ( |
| "bytes" |
| "fmt" |
| 	"io" |
| "log" |
| "net/http" |
| "net/url" |
| "regexp" |
| "strings" |
| ) |
| |
| // I will go to programmer hell for trying to parse HTML with |
| // regexps. Why doesn't CGit have a JSON interface? |
| var cgitRepoEntryRE = regexp.MustCompile( |
| `class='sublevel-repo'><a title='([^'"]*)' href='([^']*)'>`) |
| |
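| // normalizedGet fetches the page at `u` and returns the body with |
| // newlines flattened to spaces, so the regexps in this file can |
| // match entries regardless of where CGit breaks its lines. |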
| func normalizedGet(u *url.URL) ([]byte, error) { |
| rep, err := http.Get(u.String()) |
| if err != nil { |
| return nil, err |
| } |
| defer rep.Body.Close() |
| 	if rep.StatusCode != http.StatusOK { |
| 		return nil, fmt.Errorf("GET %s: status %s", u, rep.Status) |
| } |
| |
| 	c, err := io.ReadAll(rep.Body) |
| if err != nil { |
| return nil, err |
| } |
| |
| 	c = bytes.ReplaceAll(c, []byte{'\n'}, []byte{' '}) |
| return c, nil |
| } |
| |
| // getCGitRepos finds repo names from the CGit index page hosted at |
| // URL `u`, keeping only the repos for which `filter` returns true. |
| func getCGitRepos(u *url.URL, filter func(string) bool) (map[string]*crawlTarget, error) { |
| c, err := normalizedGet(u) |
| if err != nil { |
| return nil, err |
| } |
| |
| pages := map[string]*crawlTarget{} |
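| 	// Each match yields the repo name (the link's title attribute) |
| 	// and the repo's relative URL, in that order. |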
| for _, m := range cgitRepoEntryRE.FindAllSubmatch(c, -1) { |
| nm := strings.TrimSuffix(string(m[1]), ".git") |
| |
| if !filter(nm) { |
| continue |
| } |
| |
| 		relURL := string(m[2]) |
| |
| 		repoURL, err := u.Parse(relURL) |
| 		if err != nil { |
| 			log.Printf("ignoring u.Parse(%q): %v", relURL, err) |
| 			continue |
| 		} |
| 		pages[nm] = &crawlTarget{ |
| 			webURL:     repoURL.String(), |
| 			webURLType: "cgit", |
| 		} |
| } |
| |
| // TODO - parallel? |
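| 	// A goroutine per target with a sync.WaitGroup would be one way; |
| 	// each iteration writes only to its own *crawlTarget. |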
| for _, target := range pages { |
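| 		// target.webURL was produced by url.URL.String above, so the |
| 		// parse error can be ignored. |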
| 		pageURL, _ := url.Parse(target.webURL) |
| 		c, err := cgitCloneURL(pageURL) |
| 		if err != nil { |
| 			log.Printf("ignoring cgitCloneURL(%s): %v", pageURL, err) |
| continue |
| } |
| |
| target.cloneURL = c.String() |
| } |
| return pages, nil |
| } |
| |
| // We take the first clone URL the page advertises. That may be the |
| // git:// URL (which is insecure), but individual hosts (such as |
| // git.savannah.gnu.org) probably prefer receiving git:// traffic, |
| // which is more efficient to serve. |
| |
| // TODO - match something like `Clone.*<a.*href=` to get the first |
| // URL instead; older CGit versions don't emit rel='vcs-git'. |
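| // Matches links such as <a rel='vcs-git' |
| // href='git://example.org/repo.git'>, capturing the href value. |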
| var cloneURLRe = regexp.MustCompile( |
| `rel=["']vcs-git["'] *href=["']([^"']*)["']`) |
| |
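| // cgitCloneURL fetches the CGit summary page at `u` and returns the |
| // first clone URL advertised via a rel='vcs-git' link. |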
| func cgitCloneURL(u *url.URL) (*url.URL, error) { |
| c, err := normalizedGet(u) |
| if err != nil { |
| return nil, err |
| } |
| |
| 	m := cloneURLRe.FindSubmatch(c) |
| 	if m == nil { |
| 		return nil, fmt.Errorf("no clone URL found on %s", u) |
| 	} |
| 	cl, err := url.Parse(string(m[1])) |
| if err != nil { |
| return nil, err |
| } |
| |
| return cl, nil |
| } |