Support project filtering using a repo manifest

When dealing with multiple projects, using a tool like
repo and an XML manifest file is very common.

Allow the ETL to receive a manifest XML file as a parameter
and automatically retrieve all the projects mentioned
in the manifest file.

Change-Id: I839f9eea690edb53053396e3e7fc83eb6c53d0a1
diff --git a/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/job/Main.scala b/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/job/Main.scala
index b80e99d..b012a0d 100644
--- a/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/job/Main.scala
+++ b/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/job/Main.scala
@@ -86,6 +86,10 @@
         c.copy(extractBranches = Some(input))
       } text "enables branches extraction for each commit"
 
+      opt[String]('m', "manifest") optional () action { (x, c) =>
+        c.copy(manifest = Some(x))
+      } text "repo manifest XML with the list of projects to process"
+
     }
 
   cliOptionParser.parse(args, GerritEndpointConfig()) match {
@@ -147,11 +151,11 @@
 }
 
 trait FetchRemoteProjects extends FetchProjects {
-  def fetchProjects(config: GerritEndpointConfig): Seq[GerritProject] = {
-    config.gerritProjectsUrl.toSeq.flatMap { url =>
-      GerritProjectsSupport.parseJsonProjectListResponse(
-        config.gerritApiConnection.getContentFromApi(url)
-      )
-    }
-  }
+  // A configured repo manifest takes precedence over the Gerrit projects API listing.
+  def fetchProjects(config: GerritEndpointConfig): Seq[GerritProject] =
+    config.projectsFromManifest.map(_.toSeq).getOrElse(
+      config.gerritProjectsUrl.toSeq.flatMap { url =>
+        GerritProjectsSupport.parseJsonProjectListResponse(
+          config.gerritApiConnection.getContentFromApi(url))
+      })
 }
diff --git a/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/model/GerritEndpointConfig.scala b/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/model/GerritEndpointConfig.scala
index a27611f..cc42499 100644
--- a/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/model/GerritEndpointConfig.scala
+++ b/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/model/GerritEndpointConfig.scala
@@ -15,11 +15,14 @@
 package com.gerritforge.analytics.gitcommits.model
 
 import java.net.URLEncoder
+import scala.xml._
 import java.time.format.DateTimeFormatter
 import java.time.{LocalDate, ZoneOffset}
-
 import com.gerritforge.analytics.common.api.GerritConnectivity
 import com.gerritforge.analytics.support.ops.AnalyticsDateTimeFormatter
+
+import java.nio.charset.StandardCharsets
+
 case class GerritEndpointConfig(
     baseUrl: Option[String] = None,
     prefix: Option[String] = None,
@@ -33,9 +36,22 @@
     username: Option[String] = None,
     password: Option[String] = None,
     ignoreSSLCert: Option[Boolean] = None,
-    extractBranches: Option[Boolean] = None
+    extractBranches: Option[Boolean] = None,
+    manifest: Option[String] = None
 ) {
 
+  /** Projects named by the direct `<project>` children of the repo manifest XML, when `manifest`
+    * is set. A trailing ".git" is dropped from each name; the id is the URL-encoded name.
+    * The file is loaded on first access, so any error reading the manifest surfaces there. */
+  lazy val projectsFromManifest: Option[Set[GerritProject]] = manifest.map { manifestPath =>
+    (XML.loadFile(manifestPath) \ "project").theSeq
+      .map(_ \@ "name")
+      .map(_.stripSuffix(".git"))
+      .filter(_.nonEmpty) // `\@` yields "" for a missing attribute; skip blank names
+      .map(name => GerritProject(URLEncoder.encode(name, StandardCharsets.UTF_8.name()), name))
+      .toSet
+  }
+
   val gerritApiConnection: GerritConnectivity =
     new GerritConnectivity(username, password, ignoreSSLCert.getOrElse(false))
 
diff --git a/gitcommits/src/test/scala/com/gerritforge/analytics/gitcommits/model/GerritEndpointConfigTest.scala b/gitcommits/src/test/scala/com/gerritforge/analytics/gitcommits/model/GerritEndpointConfigTest.scala
index 6830a6e..a58d584 100644
--- a/gitcommits/src/test/scala/com/gerritforge/analytics/gitcommits/model/GerritEndpointConfigTest.scala
+++ b/gitcommits/src/test/scala/com/gerritforge/analytics/gitcommits/model/GerritEndpointConfigTest.scala
@@ -16,7 +16,12 @@
 
 import org.scalatest.{FlatSpec, Matchers}
 
-class GerritEndpointConfigTest extends FlatSpec with Matchers {
+import java.io.File
+import java.nio.charset.StandardCharsets
+import java.nio.file.{Files, Paths}
+import scala.xml.XML
+
+class GerritEndpointConfigTest extends FlatSpec with Matchers with ManifestXML {
 
   "gerritProjectsUrl" should "contain prefix when available" in {
     val prefix = "prefixMustBeThere"
@@ -29,4 +34,50 @@
     conf.gerritProjectsUrl should contain(s"testBaseUrl/projects/")
   }
 
+  "projectsFromManifest" should "return projects contained in a repo manifest XML" in {
+    val conf = GerritEndpointConfig(baseUrl = Some("testBaseUrl"), manifest = Option(manifestFile.getAbsolutePath))
+    val projectNamesFromManifest = conf.projectsFromManifest.toSeq.flatten.map(_.name)
+    // Names are expected without the ".git" suffix that some manifest entries carry.
+    projectNamesFromManifest should contain only ("sel4_projects_libs", "seL4_tools", "sel4runtime", "musllibc", "seL4_libs", "prefix/util_libs", "sel4test", "nanopb", "opensbi")
+    projectNamesFromManifest should not contain ("non-existent-project")
+  }
+
+  "projectsFromManifest" should "return projects ids with URL encoded names mentioned in a repo manifest XML" in {
+    val conf = GerritEndpointConfig(baseUrl = Some("testBaseUrl"), manifest = Option(manifestFile.getAbsolutePath))
+    val projectIdsFromManifest = conf.projectsFromManifest.toSeq.flatten.map(_.id)
+    // Only "prefix/util_libs" contains a character ("/") that URL encoding rewrites.
+    projectIdsFromManifest should contain only ("sel4_projects_libs", "seL4_tools", "sel4runtime", "musllibc", "seL4_libs", "prefix%2Futil_libs", "sel4test", "nanopb", "opensbi")
+  }
+}
+
+trait ManifestXML {
+  /** Temp file holding a sample from https://docs.sel4.systems/projects/buildsystem/repo-cheatsheet.html */
+  lazy val manifestFile: File = {
+    val tmpFile = File.createTempFile("analytics-etl", s"-${System.nanoTime()}.xml")
+    tmpFile.deleteOnExit() // do not leave a stray manifest behind after every test run
+    Files.write(tmpFile.toPath,
+      """<manifest>
+        |<remote name="seL4" fetch="." />
+        |<remote fetch="../sel4proj" name="sel4proj"/>
+        |<remote fetch="https://github.com/nanopb" name="nanopb" />
+        |<remote fetch="https://github.com/riscv" name="opensbi"/>
+        |
+        |<default revision="master" remote="seL4" />
+        |
+        |<project name="seL4_tools.git" path="tools/seL4">
+        |    <linkfile src="cmake-tool/init-build.sh" dest="init-build.sh"/>
+        |    <linkfile src="cmake-tool/griddle" dest="griddle"/>
+        |</project>
+        |<project name="sel4runtime.git" path="projects/sel4runtime"/>
+        |<project name="musllibc.git" path="projects/musllibc" revision="sel4"/>
+        |<project name="seL4_libs.git" path="projects/seL4_libs"/>
+        |<project name="prefix/util_libs.git" path="projects/util_libs"/>
+        |<project name="sel4test.git" path="projects/sel4test">
+        |    <linkfile src="easy-settings.cmake" dest="easy-settings.cmake"/>
+        |</project>
+        |<project name="sel4_projects_libs" path="projects/sel4_projects_libs" />
+        |<project name="opensbi" remote="opensbi" revision="refs/tags/v0.8" path="tools/opensbi"/>
+        |<project name="nanopb" path="tools/nanopb" revision="refs/tags/0.4.3" upstream="master" remote="nanopb"/>
+        |</manifest>""".stripMargin.getBytes(StandardCharsets.UTF_8)).toFile
+  }
 }
diff --git a/project/SharedSettings.scala b/project/SharedSettings.scala
index 136fef6..0d16824 100644
--- a/project/SharedSettings.scala
+++ b/project/SharedSettings.scala
@@ -43,7 +43,8 @@
       "com.github.scopt"           %% "scopt"                % scopt,
       "org.scalactic"              %% "scalactic"            % scalactic % "test",
       "org.scalatest"              %% "scalatest"            % scalaTest % "test",
-      "com.dimafeng"               %% "testcontainers-scala" % TestContainersScala % Test
+      "com.dimafeng"               %% "testcontainers-scala" % TestContainersScala % Test,
+      "org.scala-lang.modules"     %% "scala-xml"            % "1.3.1",
     )
   )