sync: Support downloading bundle to initialize repository

An HTTP (or HTTPS) based remote server may now offer a 'clone.bundle'
file in each repository's Git directory. Over an http:// or https://
remote, repo will first ask for '$URL/clone.bundle' and, if present,
download it to bootstrap the local client, rather than relying on
the native Git transport to initialize the new repository.

Bundles may be hosted elsewhere. The client automatically follows an
HTTP 302 redirect to acquire the bundle file. This allows servers
to direct clients to cached copies residing on content delivery
networks, where the bundle may be closer to the end-user.

Bundle downloads are resumable from where they last left off,
allowing clients to initialize large repositories even when the
connection gets interrupted.

If a bundle does not exist for a repository (an HTTP 404 response
code is returned for '$URL/clone.bundle'), the native Git transport
is used instead. If the client is performing a shallow sync, the
bundle transport is not used, as there is no way to embed shallow
data into the bundle.

Change-Id: I05dad17792fd6fd20635a0f71589566e557cc743
Signed-off-by: Shawn O. Pearce <sop@google.com>
diff --git a/error.py b/error.py
index 5238158..812585c 100644
--- a/error.py
+++ b/error.py
@@ -57,6 +57,15 @@
   def __str__(self):
     return self.reason
 
+class DownloadError(Exception):
+  """Cannot download a repository.
+  """
+  def __init__(self, reason):
+    self.reason = reason
+
+  def __str__(self):
+    return self.reason
+
 class NoSuchProjectError(Exception):
   """A specified project does not exist in the work tree.
   """
diff --git a/git_config.py b/git_config.py
index e4f4a0a..bcd6e8d 100644
--- a/git_config.py
+++ b/git_config.py
@@ -491,6 +491,12 @@
 URI_SCP = re.compile(r'^([^@:]*@?[^:/]{1,}):')
 URI_ALL = re.compile(r'^([a-z][a-z+]*)://([^@/]*@?[^/]*)/')
 
+def GetSchemeFromUrl(url):
+  m = URI_ALL.match(url)
+  if m:
+    return m.group(1)
+  return None
+
 def _preconnect(url):
   m = URI_ALL.match(url)
   if m:
diff --git a/main.py b/main.py
index c5c71c3..8ffdfcc 100755
--- a/main.py
+++ b/main.py
@@ -37,6 +37,7 @@
 from command import MirrorSafeCommand
 from command import PagedCommand
 from editor import Editor
+from error import DownloadError
 from error import ManifestInvalidRevisionError
 from error import NoSuchProjectError
 from error import RepoChangedException
@@ -143,6 +144,9 @@
           else:
             print >>sys.stderr, 'real\t%dh%dm%.3fs' \
               % (hours, minutes, seconds)
+    except DownloadError, e:
+      print >>sys.stderr, 'error: %s' % str(e)
+      sys.exit(1)
     except ManifestInvalidRevisionError, e:
       print >>sys.stderr, 'error: %s' % str(e)
       sys.exit(1)
diff --git a/project.py b/project.py
index 3efc445..5adfe82 100644
--- a/project.py
+++ b/project.py
@@ -24,9 +24,11 @@
 
 from color import Coloring
 from git_command import GitCommand
-from git_config import GitConfig, IsId
+from git_config import GitConfig, IsId, GetSchemeFromUrl
+from error import DownloadError
 from error import GitError, HookError, ImportError, UploadError
 from error import ManifestInvalidRevisionError
+from progress import Progress
 
 from git_refs import GitRefs, HEAD, R_HEADS, R_TAGS, R_PUB, R_M
 
@@ -884,15 +886,13 @@
 
 ## Sync ##
 
-  def Sync_NetworkHalf(self, quiet=False):
+  def Sync_NetworkHalf(self, quiet=False, is_new=None):
     """Perform only the network IO portion of the sync process.
        Local working directory/branch state is not affected.
     """
-    is_new = not self.Exists
+    if is_new is None:
+      is_new = not self.Exists
     if is_new:
-      if not quiet:
-        print >>sys.stderr
-        print >>sys.stderr, 'Initializing project %s ...' % self.name
       self._InitGitDir()
 
     self._InitRemote()
@@ -1312,9 +1312,16 @@
       name = self.remote.name
 
     ssh_proxy = False
-    if self.GetRemote(name).PreConnectFetch():
+    remote = self.GetRemote(name)
+    if remote.PreConnectFetch():
       ssh_proxy = True
 
+    bundle_dst = os.path.join(self.gitdir, 'clone.bundle')
+    bundle_tmp = os.path.join(self.gitdir, 'clone.bundle.tmp')
+    use_bundle = False
+    if os.path.exists(bundle_dst) or os.path.exists(bundle_tmp):
+      use_bundle = True
+
     if initial:
       alt = os.path.join(self.gitdir, 'objects/info/alternates')
       try:
@@ -1329,6 +1336,8 @@
         ref_dir = None
 
       if ref_dir and 'objects' == os.path.basename(ref_dir):
+        if use_bundle:
+          use_bundle = False
         ref_dir = os.path.dirname(ref_dir)
         packed_refs = os.path.join(self.gitdir, 'packed-refs')
         remote = self.GetRemote(name)
@@ -1368,6 +1377,7 @@
 
       else:
         ref_dir = None
+        use_bundle = True
 
     cmd = ['fetch']
 
@@ -1376,15 +1386,37 @@
     depth = self.manifest.manifestProject.config.GetString('repo.depth')
     if depth and initial:
       cmd.append('--depth=%s' % depth)
+      use_bundle = False
 
     if quiet:
       cmd.append('--quiet')
     if not self.worktree:
       cmd.append('--update-head-ok')
-    cmd.append(name)
-    if tag is not None:
-      cmd.append('tag')
-      cmd.append(tag)
+
+    if use_bundle and not os.path.exists(bundle_dst):
+      bundle_url = remote.url + '/clone.bundle'
+      bundle_url = GitConfig.ForUser().UrlInsteadOf(bundle_url)
+      if GetSchemeFromUrl(bundle_url) in ('http', 'https'):
+        use_bundle = self._FetchBundle(
+          bundle_url,
+          bundle_tmp,
+          bundle_dst,
+          quiet=quiet)
+      else:
+        use_bundle = False
+
+    if use_bundle:
+      if not quiet:
+        cmd.append('--quiet')
+      cmd.append(bundle_dst)
+      for f in remote.fetch:
+        cmd.append(str(f))
+      cmd.append('refs/tags/*:refs/tags/*')
+    else:
+      cmd.append(name)
+      if tag is not None:
+        cmd.append('tag')
+        cmd.append(tag)
 
     ok = GitCommand(self,
                     cmd,
@@ -1399,8 +1431,99 @@
           os.remove(packed_refs)
       self.bare_git.pack_refs('--all', '--prune')
 
+    if os.path.exists(bundle_dst):
+      os.remove(bundle_dst)
+    if os.path.exists(bundle_tmp):
+      os.remove(bundle_tmp)
+
     return ok
 
+  def _FetchBundle(self, srcUrl, tmpPath, dstPath, quiet=False):
+    keep = True
+    done = False
+    dest = open(tmpPath, 'a+b')
+    try:
+      dest.seek(0, os.SEEK_END)
+      pos = dest.tell()
+
+      req = urllib2.Request(srcUrl)
+      if pos > 0:
+        req.add_header('Range', 'bytes=%d-' % pos)
+
+      try:
+        r = urllib2.urlopen(req)
+      except urllib2.HTTPError, e:
+        if e.code == 404:
+          keep = False
+          return False
+        elif e.info()['content-type'] == 'text/plain':
+          try:
+            msg = e.read()
+            if len(msg) > 0 and msg[-1] == '\n':
+              msg = msg[0:-1]
+            msg = ' (%s)' % msg
+          except:
+            msg = ''
+        else:
+          try:
+            from BaseHTTPServer import BaseHTTPRequestHandler
+            res = BaseHTTPRequestHandler.responses[e.code]
+            msg = ' (%s: %s)' % (res[0], res[1])
+          except:
+            msg = ''
+        raise DownloadError('HTTP %s%s' % (e.code, msg))
+      except urllib2.URLError, e:
+        raise DownloadError('%s (%s)' % (e.reason, req.get_host()))
+
+      p = None
+      try:
+        size = r.headers['content-length']
+        unit = 1 << 10
+
+        if size and not quiet:
+          if size > 1024 * 1.3:
+            unit = 1 << 20
+            desc = 'MB'
+          else:
+            desc = 'KB'
+          p = Progress(
+            'Downloading %s' % self.relpath,
+            int(size) / unit,
+            units=desc)
+          if pos > 0:
+            p.update(pos / unit)
+
+        s = 0
+        while True:
+          d = r.read(8192)
+          if d == '':
+            done = True
+            return True
+          dest.write(d)
+          if p:
+            s += len(d)
+            if s >= unit:
+              p.update(s / unit)
+              s = s % unit
+        if p:
+          if s >= unit:
+            p.update(s / unit)
+          else:
+            p.update(1)
+      finally:
+        r.close()
+        if p:
+          p.end()
+    finally:
+      dest.close()
+
+      if os.path.exists(dstPath):
+        os.remove(dstPath)
+      if done:
+        os.rename(tmpPath, dstPath)
+      elif not keep:
+        os.remove(tmpPath)
+
   def _Checkout(self, rev, quiet=False):
     cmd = ['checkout']
     if quiet:
diff --git a/repo b/repo
index 1468fad..0e77983 100755
--- a/repo
+++ b/repo
@@ -28,7 +28,7 @@
 del magic
 
 # increment this whenever we make important changes to this script
-VERSION = (1, 12)
+VERSION = (1, 13)
 
 # increment this if the MAINTAINER_KEYS block is modified
 KEYRING_VERSION = (1,0)
@@ -91,6 +91,7 @@
 import readline
 import subprocess
 import sys
+import urllib2
 
 home_dot_repo = os.path.expanduser('~/.repoconfig')
 gpg_dir = os.path.join(home_dot_repo, 'gnupg')
@@ -187,10 +188,6 @@
     else:
       can_verify = True
 
-    if not opt.quiet:
-      print >>sys.stderr, 'Getting repo ...'
-      print >>sys.stderr, '   from %s' % url
-
     dst = os.path.abspath(os.path.join(repodir, S_repo))
     _Clone(url, dst, opt.quiet)
 
@@ -300,15 +297,42 @@
     raise CloneFailure()
 
 
-def _Fetch(local, quiet, *args):
+def _InitHttp():
+  handlers = []
+
+  mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
+  try:
+    import netrc
+    n = netrc.netrc()
+    for host in n.hosts:
+      p = n.hosts[host]
+      mgr.add_password(None, 'http://%s/'  % host, p[0], p[2])
+      mgr.add_password(None, 'https://%s/' % host, p[0], p[2])
+  except:
+    pass
+  handlers.append(urllib2.HTTPBasicAuthHandler(mgr))
+
+  if 'http_proxy' in os.environ:
+    url = os.environ['http_proxy']
+    handlers.append(urllib2.ProxyHandler({'http': url, 'https': url}))
+  if 'REPO_CURL_VERBOSE' in os.environ:
+    handlers.append(urllib2.HTTPHandler(debuglevel=1))
+    handlers.append(urllib2.HTTPSHandler(debuglevel=1))
+  urllib2.install_opener(urllib2.build_opener(*handlers))
+
+def _Fetch(url, local, src, quiet):
+  if not quiet:
+    print >>sys.stderr, 'Get %s' % url
+
   cmd = [GIT, 'fetch']
   if quiet:
     cmd.append('--quiet')
     err = subprocess.PIPE
   else:
     err = None
-  cmd.extend(args)
-  cmd.append('origin')
+  cmd.append(src)
+  cmd.append('+refs/heads/*:refs/remotes/origin/*')
+  cmd.append('refs/tags/*:refs/tags/*')
 
   proc = subprocess.Popen(cmd, cwd = local, stderr = err)
   if err:
@@ -317,6 +341,62 @@
   if proc.wait() != 0:
     raise CloneFailure()
 
+def _DownloadBundle(url, local, quiet):
+  if not url.endswith('/'):
+    url += '/'
+  url += 'clone.bundle'
+
+  proc = subprocess.Popen(
+    [GIT, 'config', '--get-regexp', 'url.*.insteadof'],
+    cwd = local,
+    stdout = subprocess.PIPE)
+  for line in proc.stdout:
+    m = re.compile(r'^url\.(.*)\.insteadof (.*)$').match(line)
+    if m:
+      new_url = m.group(1)
+      old_url = m.group(2)
+      if url.startswith(old_url):
+        url = new_url + url[len(old_url):]
+        break
+  proc.stdout.close()
+  proc.wait()
+
+  if not url.startswith('http:') and not url.startswith('https:'):
+    return False
+
+  dest = open(os.path.join(local, '.git', 'clone.bundle'), 'w+b')
+  try:
+    try:
+      r = urllib2.urlopen(url)
+    except urllib2.HTTPError, e:
+      if e.code == 404:
+        return False
+      print >>sys.stderr, 'fatal: Cannot get %s' % url
+      print >>sys.stderr, 'fatal: HTTP error %s' % e.code
+      raise CloneFailure()
+    except urllib2.URLError, e:
+      print >>sys.stderr, 'fatal: Cannot get %s' % url
+      print >>sys.stderr, 'fatal: error %s' % e.reason
+      raise CloneFailure()
+    try:
+      if not quiet:
+        print >>sys.stderr, 'Get %s' % url
+      while True:
+        buf = r.read(8192)
+        if buf == '':
+          return True
+        dest.write(buf)
+    finally:
+      r.close()
+  finally:
+    dest.close()
+
+def _ImportBundle(local):
+  path = os.path.join(local, '.git', 'clone.bundle')
+  try:
+    _Fetch(local, local, path, True)
+  finally:
+    os.remove(path)
 
 def _Clone(url, local, quiet):
   """Clones a git repository to a new subdirectory of repodir
@@ -344,11 +424,14 @@
     print >>sys.stderr, 'fatal: could not create %s' % local
     raise CloneFailure()
 
+  _InitHttp()
   _SetConfig(local, 'remote.origin.url', url)
   _SetConfig(local, 'remote.origin.fetch',
                     '+refs/heads/*:refs/remotes/origin/*')
-  _Fetch(local, quiet)
-  _Fetch(local, quiet, '--tags')
+  if _DownloadBundle(url, local, quiet):
+    _ImportBundle(local)
+  else:
+    _Fetch(url, local, 'origin', quiet)
 
 
 def _Verify(cwd, branch, quiet):
diff --git a/subcmds/init.py b/subcmds/init.py
index c35cc82..9214aed 100644
--- a/subcmds/init.py
+++ b/subcmds/init.py
@@ -21,6 +21,7 @@
 from command import InteractiveCommand, MirrorSafeCommand
 from error import ManifestParseError
 from project import SyncBuffer
+from git_config import GitConfig
 from git_command import git_require, MIN_GIT_VERSION
 
 class Init(InteractiveCommand, MirrorSafeCommand):
@@ -108,8 +109,8 @@
         sys.exit(1)
 
       if not opt.quiet:
-        print >>sys.stderr, 'Getting manifest ...'
-        print >>sys.stderr, '   from %s' % opt.manifest_url
+        print >>sys.stderr, 'Get %s' \
+          % GitConfig.ForUser().UrlInsteadOf(opt.manifest_url)
       m._InitGitDir()
 
       if opt.manifest_branch:
@@ -138,7 +139,7 @@
         print >>sys.stderr, 'fatal: --mirror not supported on existing client'
         sys.exit(1)
 
-    if not m.Sync_NetworkHalf():
+    if not m.Sync_NetworkHalf(is_new=is_new):
       r = m.GetRemote(m.remote.name)
       print >>sys.stderr, 'fatal: cannot obtain manifest %s' % r.url