sync: try to check out repos across sync failures

Currently our default behavior is:
* Try to sync all repos
  * If any errors seen, exit
* Try to garbage collect all repos
  * If any errors seen, exit
* Try to update local project list
  * If any errors seen, exit
* Try to check out all local repos
  * If any errors seen, exit

Users find these incomplete syncs confusing, so let's try to complete
as much as possible by default and print a summary of failures at the end.
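
The end state is roughly the following pattern (a simplified sketch;
the stage list and helper names here are illustrative, not the actual
sync.py entry points):

    import sys
    import threading

    def sync_all(opt, stages):
      """Run every sync stage, collecting errors instead of exiting early."""
      err_event = threading.Event()
      failed_stages = []
      for name, stage in stages:
        # Each stage returns False on failure in this sketch.
        if not stage(opt, err_event):
          err_event.set()
          failed_stages.append(name)
          if opt.fail_fast:
            break  # --fail-fast keeps the old exit-at-first-error behavior
      if err_event.isSet():
        # Print one consolidated summary instead of bailing mid-sync.
        print('\nerror: Unable to fully sync the tree.', file=sys.stderr)
        for name in failed_stages:
          print('error: %s failed.' % name, file=sys.stderr)
        sys.exit(1)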

Bug: https://crbug.com/gerrit/11293
Change-Id: Idd17cc9c3bbc574d8a0f08a30225dec7bfe414cb
Reviewed-on: https://gerrit-review.googlesource.com/c/git-repo/+/238554
Reviewed-by: Michael Mortensen <mmortensen@google.com>
Reviewed-by: Mike Frysinger <vapier@google.com>
Tested-by: Mike Frysinger <vapier@google.com>
diff --git a/subcmds/sync.py b/subcmds/sync.py
index ca4b97b..808df20 100644
--- a/subcmds/sync.py
+++ b/subcmds/sync.py
@@ -368,7 +368,7 @@
 
     return success
 
-  def _Fetch(self, projects, opt):
+  def _Fetch(self, projects, opt, err_event):
     fetched = set()
     lock = _threading.Lock()
     pm = Progress('Fetching projects', len(projects),
@@ -380,7 +380,6 @@
 
     threads = set()
     sem = _threading.Semaphore(self.jobs)
-    err_event = _threading.Event()
     for project_list in objdir_project_map.values():
       # Check for any errors before running any more tasks.
       # ...we'll let existing threads finish, though.
@@ -409,16 +408,11 @@
     for t in threads:
       t.join()
 
-    # If we saw an error, exit with code 1 so that other scripts can check.
-    if err_event.isSet() and opt.fail_fast:
-      print('\nerror: Exited sync due to fetch errors', file=sys.stderr)
-      sys.exit(1)
-
     pm.end()
     self._fetch_times.Save()
 
     if not self.manifest.IsArchive:
-      self._GCProjects(projects)
+      self._GCProjects(projects, opt, err_event)
 
     return fetched
 
@@ -504,12 +498,16 @@
 
     return success
 
-  def _Checkout(self, all_projects, opt):
+  def _Checkout(self, all_projects, opt, err_event, err_results):
     """Checkout projects listed in all_projects
 
     Args:
       all_projects: List of all projects that should be checked out.
       opt: Program options returned from optparse.  See _Options().
+      err_event: We'll set this event in the case of an error (after printing
+          out info about the error).
+      err_results: A list of strings, paths to git repos where checkout
+          failed.
     """
 
     # Perform checkouts in multiple threads when we are using partial clone.
@@ -528,8 +526,6 @@
 
     threads = set()
     sem = _threading.Semaphore(syncjobs)
-    err_event = _threading.Event()
-    err_results = []
 
     for project in all_projects:
       # Check for any errors before running any more tasks.
@@ -560,15 +556,8 @@
       t.join()
 
     pm.end()
-    # If we saw an error, exit with code 1 so that other scripts can check.
-    if err_event.isSet():
-      print('\nerror: Exited sync due to checkout errors', file=sys.stderr)
-      if err_results:
-        print('Failing repos:\n%s' % '\n'.join(err_results),
-              file=sys.stderr)
-      sys.exit(1)
 
-  def _GCProjects(self, projects):
+  def _GCProjects(self, projects, opt, err_event):
     gc_gitdirs = {}
     for project in projects:
       if len(project.manifest.GetProjectsWithName(project.name)) > 1:
@@ -592,7 +581,6 @@
 
     threads = set()
     sem = _threading.Semaphore(jobs)
-    err_event = _threading.Event()
 
     def GC(bare_git):
       try:
@@ -607,7 +595,7 @@
         sem.release()
 
     for bare_git in gc_gitdirs.values():
-      if err_event.isSet():
+      if err_event.isSet() and opt.fail_fast:
         break
       sem.acquire()
       t = _threading.Thread(target=GC, args=(bare_git,))
@@ -618,10 +606,6 @@
     for t in threads:
       t.join()
 
-    if err_event.isSet():
-      print('\nerror: Exited sync due to gc errors', file=sys.stderr)
-      sys.exit(1)
-
   def _ReloadManifest(self, manifest_name=None):
     if manifest_name:
       # Override calls _Unload already
@@ -902,6 +886,8 @@
           print('error: failed to remove existing smart sync override manifest: %s' %
                 e, file=sys.stderr)
 
+    err_event = _threading.Event()
+
     rp = self.manifest.repoProject
     rp.PreSync()
 
@@ -955,6 +941,10 @@
                                     missing_ok=True,
                                     submodules_ok=opt.fetch_submodules)
 
+    err_network_sync = False
+    err_update_projects = False
+    err_checkout = False
+
     self._fetch_times = _FetchTimes(self.manifest)
     if not opt.local_only:
       to_fetch = []
@@ -964,10 +954,14 @@
       to_fetch.extend(all_projects)
       to_fetch.sort(key=self._fetch_times.Get, reverse=True)
 
-      fetched = self._Fetch(to_fetch, opt)
+      fetched = self._Fetch(to_fetch, opt, err_event)
+
       _PostRepoFetch(rp, opt.no_repo_verify)
       if opt.network_only:
         # bail out now; the rest touches the working tree
+        if err_event.isSet():
+          print('\nerror: Exited sync due to fetch errors.\n', file=sys.stderr)
+          sys.exit(1)
         return
 
       # Iteratively fetch missing and/or nested unregistered submodules
@@ -989,22 +983,56 @@
         if previously_missing_set == missing_set:
           break
         previously_missing_set = missing_set
-        fetched.update(self._Fetch(missing, opt))
+        fetched.update(self._Fetch(missing, opt, err_event))
+
+      # If we saw an error, exit with code 1 so that other scripts can check.
+      if err_event.isSet():
+        err_network_sync = True
+        if opt.fail_fast:
+          print('\nerror: Exited sync due to fetch errors.\n'
+                'Local checkouts *not* updated. Resolve network issues & '
+                'retry.\n'
+                '`repo sync -l` will update some local checkouts.',
+                file=sys.stderr)
+          sys.exit(1)
 
     if self.manifest.IsMirror or self.manifest.IsArchive:
       # bail out now, we have no working tree
       return
 
     if self.UpdateProjectList(opt):
-      sys.exit(1)
+      err_event.set()
+      err_update_projects = True
+      if opt.fail_fast:
+        print('\nerror: Local checkouts *not* updated.', file=sys.stderr)
+        sys.exit(1)
 
-    self._Checkout(all_projects, opt)
+    err_results = []
+    self._Checkout(all_projects, opt, err_event, err_results)
+    if err_event.isSet():
+      err_checkout = True
+      # NB: We don't exit here because this is the last step.
 
     # If there's a notice that's supposed to print at the end of the sync, print
     # it now...
     if self.manifest.notice:
       print(self.manifest.notice)
 
+    # If we saw an error, exit with code 1 so that other scripts can check.
+    if err_event.isSet():
+      print('\nerror: Unable to fully sync the tree.', file=sys.stderr)
+      if err_network_sync:
+        print('error: Downloading network changes failed.', file=sys.stderr)
+      if err_update_projects:
+        print('error: Updating local project lists failed.', file=sys.stderr)
+      if err_checkout:
+        print('error: Checking out local projects failed.', file=sys.stderr)
+        if err_results:
+          print('Failing repos:\n%s' % '\n'.join(err_results), file=sys.stderr)
+      print('Try re-running with "-j1 --fail-fast" to exit at the first error.',
+            file=sys.stderr)
+      sys.exit(1)
+
 def _PostRepoUpgrade(manifest, quiet=False):
   wrapper = Wrapper()
   if wrapper.NeedSetupGnuPG():