command: add a helper for the parallel execution boilerplate

Now that we have a bunch of subcommands doing parallel execution, a
common pattern arises that we can factor out for most of them.  We
leave forall alone as it's a bit too complicated atm to cut over.

Change-Id: I3617a4f7c66142bcd1ab030cb4cca698a65010ac
Reviewed-on: https://gerrit-review.googlesource.com/c/git-repo/+/301942
Tested-by: Mike Frysinger <vapier@google.com>
Reviewed-by: Chris Mcdonald <cjmcdonald@google.com>
diff --git a/command.py b/command.py
index be2d6a6..9b1220d 100644
--- a/command.py
+++ b/command.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import multiprocessing
 import os
 import optparse
 import platform
@@ -21,6 +22,7 @@
 from event_log import EventLog
 from error import NoSuchProjectError
 from error import InvalidProjectGroupsError
+import progress
 
 
 # Number of projects to submit to a single worker process at a time.
@@ -156,6 +158,44 @@
     """
     raise NotImplementedError
 
+  @staticmethod
+  def ExecuteInParallel(jobs, func, inputs, callback, output=None, ordered=False):
+    """Helper for managing parallel execution boilerplate.
+
+    For subcommands that can easily split their work up.
+
+    Args:
+      jobs: How many parallel processes to use.
+      func: The function to apply to each of the |inputs|.  Usually a
+          functools.partial for wrapping additional arguments.  It will be run
+          in a separate process, so it must be picklable, so nested functions
+          won't work.  Methods on the subcommand Command class should work.
+      inputs: The list of items to process.  Must be a list.
+      callback: The function to pass the results to for processing.  It will be
+          executed in the main thread and process the results of |func| as they
+          become available.  Thus it may be a local nested function.  Its return
+          value is passed back directly.  It takes three arguments:
+          - The processing pool (or None with one job).
+          - The |output| argument.
+          - An iterator for the results.
+      output: An output manager.  May be progress.Progress or color.Coloring.
+      ordered: Whether the jobs should be processed in order.
+
+    Returns:
+      The |callback| function's results are returned.
+    """
+    try:
+      # NB: Multiprocessing is heavy, so don't spin it up for one job.
+      if len(inputs) == 1 or jobs == 1:
+        return callback(None, output, (func(x) for x in inputs))
+      else:
+        with multiprocessing.Pool(jobs) as pool:
+          submit = pool.imap if ordered else pool.imap_unordered
+          return callback(pool, output, submit(func, inputs, chunksize=WORKER_BATCH_SIZE))
+    finally:
+      if isinstance(output, progress.Progress):
+        output.end()
+
   def _ResetPathToProjectMap(self, projects):
     self._by_path = dict((p.worktree, p) for p in projects)
 
diff --git a/subcmds/abandon.py b/subcmds/abandon.py
index 1d22917..c7c127d 100644
--- a/subcmds/abandon.py
+++ b/subcmds/abandon.py
@@ -15,10 +15,9 @@
 from collections import defaultdict
 import functools
 import itertools
-import multiprocessing
 import sys
 
-from command import Command, DEFAULT_LOCAL_JOBS, WORKER_BATCH_SIZE
+from command import Command, DEFAULT_LOCAL_JOBS
 from git_command import git
 from progress import Progress
 
@@ -52,9 +51,9 @@
     else:
       args.insert(0, "'All local branches'")
 
-  def _ExecuteOne(self, opt, nb, project):
+  def _ExecuteOne(self, all_branches, nb, project):
     """Abandon one project."""
-    if opt.all:
+    if all_branches:
       branches = project.GetBranches()
     else:
       branches = [nb]
@@ -72,7 +71,7 @@
     success = defaultdict(list)
     all_projects = self.GetProjects(args[1:])
 
-    def _ProcessResults(states):
+    def _ProcessResults(_pool, pm, states):
       for (results, project) in states:
         for branch, status in results.items():
           if status:
@@ -81,17 +80,12 @@
             err[branch].append(project)
         pm.update()
 
-    pm = Progress('Abandon %s' % nb, len(all_projects), quiet=opt.quiet)
-    # NB: Multiprocessing is heavy, so don't spin it up for one job.
-    if len(all_projects) == 1 or opt.jobs == 1:
-      _ProcessResults(self._ExecuteOne(opt, nb, x) for x in all_projects)
-    else:
-      with multiprocessing.Pool(opt.jobs) as pool:
-        states = pool.imap_unordered(
-            functools.partial(self._ExecuteOne, opt, nb), all_projects,
-            chunksize=WORKER_BATCH_SIZE)
-        _ProcessResults(states)
-    pm.end()
+    self.ExecuteInParallel(
+        opt.jobs,
+        functools.partial(self._ExecuteOne, opt.all, nb),
+        all_projects,
+        callback=_ProcessResults,
+        output=Progress('Abandon %s' % (nb,), len(all_projects), quiet=opt.quiet))
 
     width = max(itertools.chain(
         [25], (len(x) for x in itertools.chain(success, err))))
diff --git a/subcmds/branches.py b/subcmds/branches.py
index d5ea580..2dc102b 100644
--- a/subcmds/branches.py
+++ b/subcmds/branches.py
@@ -13,10 +13,10 @@
 # limitations under the License.
 
 import itertools
-import multiprocessing
 import sys
+
 from color import Coloring
-from command import Command, DEFAULT_LOCAL_JOBS, WORKER_BATCH_SIZE
+from command import Command, DEFAULT_LOCAL_JOBS
 
 
 class BranchColoring(Coloring):
@@ -102,15 +102,19 @@
     out = BranchColoring(self.manifest.manifestProject.config)
     all_branches = {}
     project_cnt = len(projects)
-    with multiprocessing.Pool(processes=opt.jobs) as pool:
-      project_branches = pool.imap_unordered(
-          expand_project_to_branches, projects, chunksize=WORKER_BATCH_SIZE)
 
-      for name, b in itertools.chain.from_iterable(project_branches):
+    def _ProcessResults(_pool, _output, results):
+      for name, b in itertools.chain.from_iterable(results):
         if name not in all_branches:
           all_branches[name] = BranchInfo(name)
         all_branches[name].add(b)
 
+    self.ExecuteInParallel(
+        opt.jobs,
+        expand_project_to_branches,
+        projects,
+        callback=_ProcessResults)
+
     names = sorted(all_branches)
 
     if not names:
diff --git a/subcmds/checkout.py b/subcmds/checkout.py
index 6b71a8f..4d8009b 100644
--- a/subcmds/checkout.py
+++ b/subcmds/checkout.py
@@ -13,10 +13,9 @@
 # limitations under the License.
 
 import functools
-import multiprocessing
 import sys
 
-from command import Command, DEFAULT_LOCAL_JOBS, WORKER_BATCH_SIZE
+from command import Command, DEFAULT_LOCAL_JOBS
 from progress import Progress
 
 
@@ -50,7 +49,7 @@
     success = []
     all_projects = self.GetProjects(args[1:])
 
-    def _ProcessResults(results):
+    def _ProcessResults(_pool, pm, results):
       for status, project in results:
         if status is not None:
           if status:
@@ -59,17 +58,12 @@
             err.append(project)
         pm.update()
 
-    pm = Progress('Checkout %s' % nb, len(all_projects), quiet=opt.quiet)
-    # NB: Multiprocessing is heavy, so don't spin it up for one job.
-    if len(all_projects) == 1 or opt.jobs == 1:
-      _ProcessResults(self._ExecuteOne(nb, x) for x in all_projects)
-    else:
-      with multiprocessing.Pool(opt.jobs) as pool:
-        results = pool.imap_unordered(
-            functools.partial(self._ExecuteOne, nb), all_projects,
-            chunksize=WORKER_BATCH_SIZE)
-        _ProcessResults(results)
-    pm.end()
+    self.ExecuteInParallel(
+        opt.jobs,
+        functools.partial(self._ExecuteOne, nb),
+        all_projects,
+        callback=_ProcessResults,
+        output=Progress('Checkout %s' % (nb,), len(all_projects), quiet=opt.quiet))
 
     if err:
       for p in err:
diff --git a/subcmds/diff.py b/subcmds/diff.py
index cdc262e..4966bb1 100644
--- a/subcmds/diff.py
+++ b/subcmds/diff.py
@@ -14,9 +14,8 @@
 
 import functools
 import io
-import multiprocessing
 
-from command import DEFAULT_LOCAL_JOBS, PagedCommand, WORKER_BATCH_SIZE
+from command import DEFAULT_LOCAL_JOBS, PagedCommand
 
 
 class Diff(PagedCommand):
@@ -36,7 +35,7 @@
                  dest='absolute', action='store_true',
                  help='Paths are relative to the repository root')
 
-  def _DiffHelper(self, absolute, project):
+  def _ExecuteOne(self, absolute, project):
     """Obtains the diff for a specific project.
 
     Args:
@@ -51,22 +50,20 @@
     return (ret, buf.getvalue())
 
   def Execute(self, opt, args):
-    ret = 0
     all_projects = self.GetProjects(args)
 
-    # NB: Multiprocessing is heavy, so don't spin it up for one job.
-    if len(all_projects) == 1 or opt.jobs == 1:
-      for project in all_projects:
-        if not project.PrintWorkTreeDiff(opt.absolute):
+    def _ProcessResults(_pool, _output, results):
+      ret = 0
+      for (state, output) in results:
+        if output:
+          print(output, end='')
+        if not state:
           ret = 1
-    else:
-      with multiprocessing.Pool(opt.jobs) as pool:
-        states = pool.imap(functools.partial(self._DiffHelper, opt.absolute),
-                           all_projects, WORKER_BATCH_SIZE)
-        for (state, output) in states:
-          if output:
-            print(output, end='')
-          if not state:
-            ret = 1
+      return ret
 
-    return ret
+    return self.ExecuteInParallel(
+        opt.jobs,
+        functools.partial(self._ExecuteOne, opt.absolute),
+        all_projects,
+        callback=_ProcessResults,
+        ordered=True)
diff --git a/subcmds/grep.py b/subcmds/grep.py
index 9a4a8a3..6cb1445 100644
--- a/subcmds/grep.py
+++ b/subcmds/grep.py
@@ -13,11 +13,10 @@
 # limitations under the License.
 
 import functools
-import multiprocessing
 import sys
 
 from color import Coloring
-from command import DEFAULT_LOCAL_JOBS, PagedCommand, WORKER_BATCH_SIZE
+from command import DEFAULT_LOCAL_JOBS, PagedCommand
 from error import GitError
 from git_command import GitCommand
 
@@ -173,7 +172,7 @@
     return (project, p.Wait(), p.stdout, p.stderr)
 
   @staticmethod
-  def _ProcessResults(out, full_name, have_rev, results):
+  def _ProcessResults(full_name, have_rev, _pool, out, results):
     git_failed = False
     bad_rev = False
     have_match = False
@@ -256,18 +255,13 @@
       cmd_argv.extend(opt.revision)
     cmd_argv.append('--')
 
-    process_results = functools.partial(
-        self._ProcessResults, out, full_name, have_rev)
-    # NB: Multiprocessing is heavy, so don't spin it up for one job.
-    if len(projects) == 1 or opt.jobs == 1:
-      git_failed, bad_rev, have_match = process_results(
-          self._ExecuteOne(cmd_argv, x) for x in projects)
-    else:
-      with multiprocessing.Pool(opt.jobs) as pool:
-        results = pool.imap(
-            functools.partial(self._ExecuteOne, cmd_argv), projects,
-            chunksize=WORKER_BATCH_SIZE)
-        git_failed, bad_rev, have_match = process_results(results)
+    git_failed, bad_rev, have_match = self.ExecuteInParallel(
+        opt.jobs,
+        functools.partial(self._ExecuteOne, cmd_argv),
+        projects,
+        callback=functools.partial(self._ProcessResults, full_name, have_rev),
+        output=out,
+        ordered=True)
 
     if git_failed:
       sys.exit(1)
diff --git a/subcmds/prune.py b/subcmds/prune.py
index 4084c8b..236b647 100644
--- a/subcmds/prune.py
+++ b/subcmds/prune.py
@@ -13,10 +13,9 @@
 # limitations under the License.
 
 import itertools
-import multiprocessing
 
 from color import Coloring
-from command import DEFAULT_LOCAL_JOBS, PagedCommand, WORKER_BATCH_SIZE
+from command import DEFAULT_LOCAL_JOBS, PagedCommand
 
 
 class Prune(PagedCommand):
@@ -36,18 +35,15 @@
 
     # NB: Should be able to refactor this module to display summary as results
     # come back from children.
-    def _ProcessResults(results):
+    def _ProcessResults(_pool, _output, results):
       return list(itertools.chain.from_iterable(results))
 
-    # NB: Multiprocessing is heavy, so don't spin it up for one job.
-    if len(projects) == 1 or opt.jobs == 1:
-      all_branches = _ProcessResults(self._ExecuteOne(x) for x in projects)
-    else:
-      with multiprocessing.Pool(opt.jobs) as pool:
-        results = pool.imap(
-            self._ExecuteOne, projects,
-            chunksize=WORKER_BATCH_SIZE)
-        all_branches = _ProcessResults(results)
+    all_branches = self.ExecuteInParallel(
+        opt.jobs,
+        self._ExecuteOne,
+        projects,
+        callback=_ProcessResults,
+        ordered=True)
 
     if not all_branches:
       return
diff --git a/subcmds/start.py b/subcmds/start.py
index aa2f915..ff2bae5 100644
--- a/subcmds/start.py
+++ b/subcmds/start.py
@@ -13,11 +13,10 @@
 # limitations under the License.
 
 import functools
-import multiprocessing
 import os
 import sys
 
-from command import Command, DEFAULT_LOCAL_JOBS, WORKER_BATCH_SIZE
+from command import Command, DEFAULT_LOCAL_JOBS
 from git_config import IsImmutable
 from git_command import git
 import gitc_utils
@@ -55,7 +54,7 @@
     if not git.check_ref_format('heads/%s' % nb):
       self.OptionParser.error("'%s' is not a valid name" % nb)
 
-  def _ExecuteOne(self, opt, nb, project):
+  def _ExecuteOne(self, revision, nb, project):
     """Start one project."""
     # If the current revision is immutable, such as a SHA1, a tag or
     # a change, then we can't push back to it. Substitute with
@@ -69,7 +68,7 @@
 
     try:
       ret = project.StartBranch(
-          nb, branch_merge=branch_merge, revision=opt.revision)
+          nb, branch_merge=branch_merge, revision=revision)
     except Exception as e:
       print('error: unable to checkout %s: %s' % (project.name, e), file=sys.stderr)
       ret = False
@@ -123,23 +122,18 @@
         pm.update()
       pm.end()
 
-    def _ProcessResults(results):
+    def _ProcessResults(_pool, pm, results):
       for (result, project) in results:
         if not result:
           err.append(project)
         pm.update()
 
-    pm = Progress('Starting %s' % nb, len(all_projects), quiet=opt.quiet)
-    # NB: Multiprocessing is heavy, so don't spin it up for one job.
-    if len(all_projects) == 1 or opt.jobs == 1:
-      _ProcessResults(self._ExecuteOne(opt, nb, x) for x in all_projects)
-    else:
-      with multiprocessing.Pool(opt.jobs) as pool:
-        results = pool.imap_unordered(
-            functools.partial(self._ExecuteOne, opt, nb), all_projects,
-            chunksize=WORKER_BATCH_SIZE)
-        _ProcessResults(results)
-    pm.end()
+    self.ExecuteInParallel(
+        opt.jobs,
+        functools.partial(self._ExecuteOne, opt.revision, nb),
+        all_projects,
+        callback=_ProcessResults,
+        output=Progress('Starting %s' % (nb,), len(all_projects), quiet=opt.quiet))
 
     if err:
       for p in err:
diff --git a/subcmds/status.py b/subcmds/status.py
index dc223a0..1b48dce 100644
--- a/subcmds/status.py
+++ b/subcmds/status.py
@@ -15,10 +15,9 @@
 import functools
 import glob
 import io
-import multiprocessing
 import os
 
-from command import DEFAULT_LOCAL_JOBS, PagedCommand, WORKER_BATCH_SIZE
+from command import DEFAULT_LOCAL_JOBS, PagedCommand
 
 from color import Coloring
 import platform_utils
@@ -119,22 +118,23 @@
 
   def Execute(self, opt, args):
     all_projects = self.GetProjects(args)
-    counter = 0
 
-    if opt.jobs == 1:
-      for project in all_projects:
-        state = project.PrintWorkTreeStatus(quiet=opt.quiet)
+    def _ProcessResults(_pool, _output, results):
+      ret = 0
+      for (state, output) in results:
+        if output:
+          print(output, end='')
         if state == 'CLEAN':
-          counter += 1
-    else:
-      with multiprocessing.Pool(opt.jobs) as pool:
-        states = pool.imap(functools.partial(self._StatusHelper, opt.quiet),
-                           all_projects, chunksize=WORKER_BATCH_SIZE)
-        for (state, output) in states:
-          if output:
-            print(output, end='')
-          if state == 'CLEAN':
-            counter += 1
+          ret += 1
+      return ret
+
+    counter = self.ExecuteInParallel(
+        opt.jobs,
+        functools.partial(self._StatusHelper, opt.quiet),
+        all_projects,
+        callback=_ProcessResults,
+        ordered=True)
+
     if not opt.quiet and len(all_projects) == counter:
       print('nothing to commit (working directory clean)')
 
diff --git a/subcmds/sync.py b/subcmds/sync.py
index 21166af..4763fad 100644
--- a/subcmds/sync.py
+++ b/subcmds/sync.py
@@ -51,7 +51,7 @@
 import gitc_utils
 from project import Project
 from project import RemoteSpec
-from command import Command, MirrorSafeCommand, WORKER_BATCH_SIZE
+from command import Command, MirrorSafeCommand
 from error import RepoChangedException, GitError, ManifestParseError
 import platform_utils
 from project import SyncBuffer
@@ -428,11 +428,12 @@
 
     return (ret, fetched)
 
-  def _CheckoutOne(self, opt, project):
+  def _CheckoutOne(self, detach_head, force_sync, project):
     """Checkout work tree for one project
 
     Args:
-      opt: Program options returned from optparse.  See _Options().
+      detach_head: Whether to leave a detached HEAD.
+      force_sync: Force checking out of the repo.
       project: Project object for the project to checkout.
 
     Returns:
@@ -440,10 +441,10 @@
     """
     start = time.time()
     syncbuf = SyncBuffer(self.manifest.manifestProject.config,
-                         detach_head=opt.detach_head)
+                         detach_head=detach_head)
     success = False
     try:
-      project.Sync_LocalHalf(syncbuf, force_sync=opt.force_sync)
+      project.Sync_LocalHalf(syncbuf, force_sync=force_sync)
       success = syncbuf.Finish()
     except Exception as e:
       print('error: Cannot checkout %s: %s: %s' %
@@ -464,44 +465,32 @@
       opt: Program options returned from optparse.  See _Options().
       err_results: A list of strings, paths to git repos where checkout failed.
     """
-    ret = True
-    jobs = opt.jobs_checkout if opt.jobs_checkout else self.jobs
-
     # Only checkout projects with worktrees.
     all_projects = [x for x in all_projects if x.worktree]
 
-    pm = Progress('Checking out', len(all_projects), quiet=opt.quiet)
-
-    def _ProcessResults(results):
+    def _ProcessResults(pool, pm, results):
+      ret = True
       for (success, project, start, finish) in results:
         self.event_log.AddSync(project, event_log.TASK_SYNC_LOCAL,
                                start, finish, success)
         # Check for any errors before running any more tasks.
         # ...we'll let existing jobs finish, though.
         if not success:
+          ret = False
           err_results.append(project.relpath)
           if opt.fail_fast:
-            return False
+            if pool:
+              pool.close()
+            return ret
         pm.update(msg=project.name)
-      return True
+      return ret
 
-    # NB: Multiprocessing is heavy, so don't spin it up for one job.
-    if len(all_projects) == 1 or jobs == 1:
-      if not _ProcessResults(self._CheckoutOne(opt, x) for x in all_projects):
-        ret = False
-    else:
-      with multiprocessing.Pool(jobs) as pool:
-        results = pool.imap_unordered(
-            functools.partial(self._CheckoutOne, opt),
-            all_projects,
-            chunksize=WORKER_BATCH_SIZE)
-        if not _ProcessResults(results):
-          ret = False
-          pool.close()
-
-    pm.end()
-
-    return ret and not err_results
+    return self.ExecuteInParallel(
+        opt.jobs_checkout if opt.jobs_checkout else self.jobs,
+        functools.partial(self._CheckoutOne, opt.detach_head, opt.force_sync),
+        all_projects,
+        callback=_ProcessResults,
+        output=Progress('Checking out', len(all_projects), quiet=opt.quiet)) and not err_results
 
   def _GCProjects(self, projects, opt, err_event):
     pm = Progress('Garbage collecting', len(projects), delay=False, quiet=opt.quiet)