Leverage bot-like-regexps parameter of analytics plugin

This commit makes use of the bot-like-regexps GET parameter of the
analytics plugin, which allows passing a comma-separated list of regular
expressions used to decide whether a particular commit looks bot-like.

This change also indexes the boolean is_bot_like field in Elasticsearch
so that it can be used to build analytics.

Change-Id: I12620ab37eb0866210ec1a690c827b459481d36e
diff --git a/README.md b/README.md
index 3bb8ce6..0206de6 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,7 @@
     --since 2000-06-01 \
     --aggregate email_hour \
     --url http://gerrit.mycompany.com \
+    --botlike-filename-regexps='.+\.xml,.+\.bzl,BUILD,WORKSPACE,\.gitignore,plugins/,\.settings' \
     -e gerrit \
     --username gerrit-api-username \
     --password gerrit-api-password
@@ -63,6 +64,7 @@
     the system temporary directory
 - -a --email-aliases (*optional*) "emails to author alias" input data path.
 - -k --ignore-ssl-cert allows to proceed even for server connections otherwise considered insecure.
+- -n --botlike-filename-regexps (*optional*) comma-separated list of regexps that identify a bot-like commit; commits that modify only files whose names match will be flagged as bot-like.
 
 
   CSVs with 3 columns are expected in input.
diff --git a/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/engine/GerritAnalyticsTransformations.scala b/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/engine/GerritAnalyticsTransformations.scala
index af185aa..ce96f28 100644
--- a/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/engine/GerritAnalyticsTransformations.scala
+++ b/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/engine/GerritAnalyticsTransformations.scala
@@ -84,7 +84,8 @@
                                  commits: Array[CommitInfo],
                                  branches: Array[String],
                                  last_commit_date: Long,
-                                 is_merge: Boolean)
+                                 is_merge: Boolean,
+                                 is_bot_like: Boolean)
 
   import org.apache.spark.sql.Encoders
 
@@ -114,7 +115,8 @@
           "json.num_files as num_files", "json.num_distinct_files as num_distinct_files",
           "json.added_lines as added_lines", "json.deleted_lines as deleted_lines",
           "json.num_commits as num_commits", "json.last_commit_date as last_commit_date",
-          "json.is_merge as is_merge", "json.commits as commits", "json.branches as branches"
+          "json.is_merge as is_merge", "json.commits as commits", "json.branches as branches",
+          "json.is_bot_like"
         )
     }
 
diff --git a/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/job/Main.scala b/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/job/Main.scala
index dd9f6bb..9df2f9b 100644
--- a/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/job/Main.scala
+++ b/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/job/Main.scala
@@ -83,6 +83,9 @@
         c.copy(extractBranches = Some(input))
       } text "enables branches extraction for each commit"
 
+      opt[String]('n', "botlike-filename-regexps") optional () action { (input, c) =>
+        c.copy(botLikeRegexps = Some(input))
+      } text "comma separated list of regexps that identify a bot-like commit, commits that modify only files whose name is a match will be flagged as bot-like"
     }
 
   cliOptionParser.parse(args, GerritEndpointConfig()) match {
diff --git a/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/model/GerritEndpointConfig.scala b/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/model/GerritEndpointConfig.scala
index 4b5b9fc..fdee284 100644
--- a/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/model/GerritEndpointConfig.scala
+++ b/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/model/GerritEndpointConfig.scala
@@ -14,6 +14,7 @@
 
 package com.gerritforge.analytics.gitcommits.model
 
+import java.net.URLEncoder
 import java.time.format.DateTimeFormatter
 import java.time.{LocalDate, ZoneOffset}
 
@@ -32,7 +33,8 @@
     username: Option[String] = None,
     password: Option[String] = None,
     ignoreSSLCert: Option[Boolean] = None,
-    extractBranches: Option[Boolean] = None) {
+    extractBranches: Option[Boolean] = None,
+    botLikeRegexps: Option[String] = None) {
 
   val gerritApiConnection: GerritConnectivity = new GerritConnectivity(username, password, ignoreSSLCert.getOrElse(false))
 
@@ -53,7 +55,8 @@
     "since"            -> since.map(format.format),
     "until"            -> until.map(format.format),
     "aggregate"        -> aggregate,
-    "extract-branches" -> extractBranches.map(_.toString)
+    "extract-branches" -> extractBranches.map(_.toString),
+    "botlike-filename-regexps" -> botLikeRegexps.map(URLEncoder.encode(_, "UTF-8"))
   ).flatMap(queryOpt).mkString("?", "&", "")
 
   def contributorsUrl(projectName: String): Option[String] =
diff --git a/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/plugin/ProcessGitCommitsCommand.scala b/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/plugin/ProcessGitCommitsCommand.scala
index fccfc81..67fcc41 100644
--- a/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/plugin/ProcessGitCommitsCommand.scala
+++ b/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/plugin/ProcessGitCommitsCommand.scala
@@ -57,6 +57,13 @@
              usage = "enables branches extraction for each commit")
   var extractBranches: Boolean = false
 
+
+  @ArgOption(name = "--botlike-filename-regexps",
+    aliases = Array("-n"),
+    usage = "comma separated list of regexps that identify a bot-like commit, commits that modify only files whose name is a match will be flagged as bot-like")
+  var botLikeRegexps: String = ""
+
+
   override def run() {
     implicit val config = GerritEndpointConfig(gerritConfig.getListenUrl(),
                                                prefix =
@@ -67,7 +74,8 @@
                                                endDate,
                                                aggregate,
                                                emailAlias,
-                                               ignoreSSLCert=Some(ignoreSSLCert)
+                                               ignoreSSLCert=Some(ignoreSSLCert),
+                                               botLikeRegexps=botLikeRegexps
     )
 
     implicit val spark: SparkSession = SparkSession
diff --git a/gitcommits/src/test/scala/com/gerritforge/analytics/GerritAnalyticsTransformationsSpec.scala b/gitcommits/src/test/scala/com/gerritforge/analytics/GerritAnalyticsTransformationsSpec.scala
index 40c5391..72e0836 100644
--- a/gitcommits/src/test/scala/com/gerritforge/analytics/GerritAnalyticsTransformationsSpec.scala
+++ b/gitcommits/src/test/scala/com/gerritforge/analytics/GerritAnalyticsTransformationsSpec.scala
@@ -116,10 +116,10 @@
     import sql.implicits._
 
     val rdd = sc.parallelize(Seq(
-      ("p1","""{"name":"a","email":"a@mail.com","year":2017,"month":9, "day":11, "hour":23, "num_commits":1, "num_files": 2, "num_distinct_files": 2, "added_lines":1, "deleted_lines":1, "last_commit_date":0, "is_merge": false, "commits":[{ "sha1": "e063a806c33bd524e89a87732bd3f1ad9a77a41e", "date":0,"merge":false, "files": ["file1.txt", "file2.txt"]}], "branches": ["master", "stable-2.14"] }"""),
-      ("p2","""{"name":"b","email":"b@mail.com","year":2017,"month":9, "day":11, "hour":23, "num_commits":428, "num_files": 2, "num_distinct_files": 3, "added_lines":1, "deleted_lines":1, "last_commit_date":1500000000000, "is_merge": true, "commits":[{"sha1":"e063a806c33bd524e89a87732bd3f1ad9a77a41e", "date":0,"merge":true, "files": ["file3.txt", "file4.txt"] },{"sha1":"e063a806c33bd524e89a87732bd3f1ad9a77a41e", "date":1500000000000,"merge":true, "files": ["file1.txt", "file4.txt"]}]}, "branches":[]"""),
+      ("p1","""{"name":"a","email":"a@mail.com","year":2017,"month":9, "day":11, "hour":23, "num_commits":1, "num_files": 2, "num_distinct_files": 2, "added_lines":1, "deleted_lines":1, "last_commit_date":0, "is_merge": false, "is_bot_like": false, "commits":[{ "sha1": "e063a806c33bd524e89a87732bd3f1ad9a77a41e", "date":0,"merge":false, "files": ["file1.txt", "file2.txt"]}], "branches": ["master", "stable-2.14"]}"""),
+      ("p2","""{"name":"b","email":"b@mail.com","year":2017,"month":9, "day":11, "hour":23, "num_commits":428, "num_files": 2, "num_distinct_files": 3, "added_lines":1, "deleted_lines":1, "last_commit_date":1500000000000, "is_merge": true, "is_bot_like":true, "commits":[{"sha1":"e063a806c33bd524e89a87732bd3f1ad9a77a41e", "date":0,"merge":true, "files": ["file3.txt", "file4.txt"] },{"sha1":"e063a806c33bd524e89a87732bd3f1ad9a77a41e", "date":1500000000000,"merge":true, "files": ["file1.txt", "file4.txt"]}]}, "branches":[]"""),
       // last commit is missing hour,day,month,year to check optionality
-      ("p3","""{"name":"c","email":"c@mail.com","num_commits":12,"num_files": 4, "num_distinct_files": 2, "added_lines":1, "deleted_lines":1, "last_commit_date":1600000000000,"is_merge": true,"commits":[{"sha1":"e063a806c33bd524e89a87732bd3f1ad9a77a41e", "date":0,"merge":true, "files": ["file1.txt", "file2.txt"] },{"sha1":"e063a806c33bd524e89a87732bd3f1ad9a77a41e", "date":1600000000000,"merge":true, "files": ["file1.txt", "file2.txt"]}]}, "branches":[]""")
+      ("p3","""{"name":"c","email":"c@mail.com","num_commits":12,"num_files": 4, "num_distinct_files": 2, "added_lines":1, "deleted_lines":1, "last_commit_date":1600000000000,"is_merge": true, "is_bot_like":false,"commits":[{"sha1":"e063a806c33bd524e89a87732bd3f1ad9a77a41e", "date":0,"merge":true, "files": ["file1.txt", "file2.txt"] },{"sha1":"e063a806c33bd524e89a87732bd3f1ad9a77a41e", "date":1600000000000,"merge":true, "files": ["file1.txt", "file2.txt"]}]}, "branches":[]""")
     ))
 
     val df = rdd.toDF("project", "json")
@@ -133,15 +133,15 @@
       "year", "month", "day", "hour",
       "num_files", "num_distinct_files", "added_lines", "deleted_lines",
       "num_commits", "last_commit_date",
-      "is_merge", "commits", "branches")
+      "is_merge", "commits", "branches", "is_bot_like")
 
     collected should contain allOf(
       Row("p1", "a", "a@mail.com", 2017, 9, 11, 23, 2, 2, 1, 1, 1, 0, false,
-        new mutable.WrappedArray.ofRef(Array(Row("e063a806c33bd524e89a87732bd3f1ad9a77a41e", 0l, false))), new mutable.WrappedArray.ofRef(Array("master", "stable-2.14"))),
+        new mutable.WrappedArray.ofRef(Array(Row("e063a806c33bd524e89a87732bd3f1ad9a77a41e", 0l, false))), new mutable.WrappedArray.ofRef(Array("master", "stable-2.14")), false),
       Row("p2", "b", "b@mail.com", 2017, 9, 11, 23, 2, 3, 1, 1, 428, 1500000000000L, true,
-        new mutable.WrappedArray.ofRef[Row](Array(Row("e063a806c33bd524e89a87732bd3f1ad9a77a41e", 0l, true), Row("e063a806c33bd524e89a87732bd3f1ad9a77a41e", 1500000000000L, true))), null),
+        new mutable.WrappedArray.ofRef[Row](Array(Row("e063a806c33bd524e89a87732bd3f1ad9a77a41e", 0l, true), Row("e063a806c33bd524e89a87732bd3f1ad9a77a41e", 1500000000000L, true))), null, true),
       Row("p3", "c", "c@mail.com", null, null, null, null, 4, 2, 1, 1, 12, 1600000000000L, true,
-        new mutable.WrappedArray.ofRef[Row](Array(Row("e063a806c33bd524e89a87732bd3f1ad9a77a41e", 0l, true), Row("e063a806c33bd524e89a87732bd3f1ad9a77a41e", 1600000000000L, true))), null)
+        new mutable.WrappedArray.ofRef[Row](Array(Row("e063a806c33bd524e89a87732bd3f1ad9a77a41e", 0l, true), Row("e063a806c33bd524e89a87732bd3f1ad9a77a41e", 1600000000000L, true))), null, false)
     )
   }