sync: add retry to fetch operations

Add retries with exponential backoff and jitter to the fetch
operations. Default behavior is unchanged; additional retries are
enabled via the new '--retry-fetches' flag.
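
For reference, a minimal standalone sketch of the backoff schedule
this change implements in project.py: sleep, grow the next delay by
retry_exp_factor, cap it at MAXIMUM_RETRY_SLEEP_SEC, then apply
+/-10% jitter. The fetch_with_retries/fetch_once names below are
illustrative only and are not part of the repo code.

  import random
  import time

  MAXIMUM_RETRY_SLEEP_SEC = 3600.0
  RETRY_JITTER_PERCENT = 0.1

  def fetch_with_retries(fetch_once, retry_fetches=2,
                         retry_sleep_initial_sec=4.0,
                         retry_exp_factor=2.0):
    # Call fetch_once() until it succeeds or attempts run out.
    retry_cur_sleep = retry_sleep_initial_sec
    for _ in range(retry_fetches):
      if fetch_once():
        return True
      time.sleep(retry_cur_sleep)
      retry_cur_sleep = min(retry_exp_factor * retry_cur_sleep,
                            MAXIMUM_RETRY_SLEEP_SEC)
      retry_cur_sleep *= (1 - random.uniform(-RETRY_JITTER_PERCENT,
                                             RETRY_JITTER_PERCENT))
    return False

Usage: 'repo sync --retry-fetches=3'.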

Bug: https://crbug.com/1061473

Change-Id: I492710843985d00f81cbe3402dc56f2d21a45b35
Reviewed-on: https://gerrit-review.googlesource.com/c/git-repo/+/261576
Reviewed-by: Mike Frysinger <vapier@google.com>
Tested-by: George Engelbrecht <engeg@google.com>
diff --git a/project.py b/project.py
index d35ad52..691e0d9 100644
--- a/project.py
+++ b/project.py
@@ -55,6 +55,12 @@
   input = raw_input  # noqa: F821
 
 
+# Maximum sleep time allowed during retries.
+MAXIMUM_RETRY_SLEEP_SEC = 3600.0
+# +-10% random jitter is added to each fetch retry sleep duration.
+RETRY_JITTER_PERCENT = 0.1
+
+
 def _lwrite(path, content):
   lock = '%s.lock' % path
 
@@ -875,6 +881,7 @@
                is_derived=False,
                dest_branch=None,
                optimized_fetch=False,
+               retry_fetches=0,
                old_revision=None):
     """Init a Project object.
 
@@ -901,6 +908,8 @@
       dest_branch: The branch to which to push changes for review by default.
       optimized_fetch: If True, when a project is set to a sha1 revision, only
                        fetch from the remote if the sha1 is not present locally.
+      retry_fetches: Retry remote fetches n times upon receiving transient
+                     errors, with exponential backoff and jitter.
       old_revision: saved git commit id for open GITC projects.
     """
     self.manifest = manifest
@@ -936,6 +945,7 @@
     self.use_git_worktrees = use_git_worktrees
     self.is_derived = is_derived
     self.optimized_fetch = optimized_fetch
+    self.retry_fetches = max(0, retry_fetches)
     self.subprojects = []
 
     self.snapshots = {}
@@ -1449,6 +1459,7 @@
                        tags=True,
                        archive=False,
                        optimized_fetch=False,
+                       retry_fetches=0,
                        prune=False,
                        submodules=False,
                        clone_filter=None):
@@ -1532,7 +1543,7 @@
               current_branch_only=current_branch_only,
               tags=tags, prune=prune, depth=depth,
               submodules=submodules, force_sync=force_sync,
-              clone_filter=clone_filter):
+              clone_filter=clone_filter, retry_fetches=retry_fetches):
         return False
 
     mp = self.manifest.manifestProject
@@ -2334,8 +2345,10 @@
                    depth=None,
                    submodules=False,
                    force_sync=False,
-                   clone_filter=None):
-
+                   clone_filter=None,
+                   retry_fetches=2,
+                   retry_sleep_initial_sec=4.0,
+                   retry_exp_factor=2.0):
     is_sha1 = False
     tag_name = None
     # The depth should not be used when fetching to a mirror because
@@ -2497,18 +2510,37 @@
 
     cmd.extend(spec)
 
-    ok = False
-    for _i in range(2):
+    # Allow at least one retry so 'git remote prune' can be attempted.
+    retry_fetches = max(retry_fetches, 2)
+    retry_cur_sleep = retry_sleep_initial_sec
+    ok = prune_tried = False
+    for try_n in range(retry_fetches):
       gitcmd = GitCommand(self, cmd, bare=True, ssh_proxy=ssh_proxy,
                           merge_output=True, capture_stdout=quiet)
       ret = gitcmd.Wait()
       if ret == 0:
         ok = True
         break
-      # If needed, run the 'git remote prune' the first time through the loop
-      elif (not _i and
-            "error:" in gitcmd.stderr and
-            "git remote prune" in gitcmd.stderr):
+
+      # Retry later due to HTTP 429 Too Many Requests.
+      elif ('error:' in gitcmd.stderr and
+            'HTTP 429' in gitcmd.stderr):
+        if not quiet:
+          print('429 received, sleeping: %s sec' % retry_cur_sleep,
+                file=sys.stderr)
+        time.sleep(retry_cur_sleep)
+        retry_cur_sleep = min(retry_exp_factor * retry_cur_sleep,
+                              MAXIMUM_RETRY_SLEEP_SEC)
+        retry_cur_sleep *= (1 - random.uniform(-RETRY_JITTER_PERCENT,
+                                               RETRY_JITTER_PERCENT))
+        continue
+
+      # If this is not the last attempt, try 'git remote prune'.
+      elif (try_n < retry_fetches - 1 and
+            'error:' in gitcmd.stderr and
+            'git remote prune' in gitcmd.stderr and
+            not prune_tried):
+        prune_tried = True
         prunecmd = GitCommand(self, ['remote', 'prune', name], bare=True,
                               ssh_proxy=ssh_proxy)
         ret = prunecmd.Wait()
diff --git a/subcmds/sync.py b/subcmds/sync.py
index de6deec..efd3961 100644
--- a/subcmds/sync.py
+++ b/subcmds/sync.py
@@ -265,6 +265,9 @@
     p.add_option('--optimized-fetch',
                  dest='optimized_fetch', action='store_true',
                  help='only fetch projects fixed to sha1 if revision does not exist locally')
+    p.add_option('--retry-fetches',
+                 default=0, action='store', type='int',
+                 help='number of times to retry fetches on transient errors')
     p.add_option('--prune', dest='prune', action='store_true',
                  help='delete refs that no longer exist on the remote')
     if show_smart:
@@ -342,6 +345,7 @@
             clone_bundle=opt.clone_bundle,
             tags=opt.tags, archive=self.manifest.IsArchive,
             optimized_fetch=opt.optimized_fetch,
+            retry_fetches=opt.retry_fetches,
             prune=opt.prune,
             clone_filter=clone_filter)
         self._fetch_times.Set(project, time.time() - start)
@@ -777,6 +781,7 @@
                                     current_branch_only=opt.current_branch_only,
                                     tags=opt.tags,
                                     optimized_fetch=opt.optimized_fetch,
+                                    retry_fetches=opt.retry_fetches,
                                     submodules=self.manifest.HasSubmodules,
                                     clone_filter=self.manifest.CloneFilter)
       finish = time.time()