Leverage botlike-filename-regexps parameter of analytics plugin
This commit makes use of the botlike-filename-regexps GET parameter of
the analytics plugin, which accepts a comma-separated list of regular
expressions used to decide whether a particular commit looks like a
bot-like commit.
This change also indexes the boolean is_bot_like field in Elasticsearch
so that analytics can be built on it.
Change-Id: I12620ab37eb0866210ec1a690c827b459481d36e
diff --git a/README.md b/README.md
index 3bb8ce6..0206de6 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,7 @@
--since 2000-06-01 \
--aggregate email_hour \
--url http://gerrit.mycompany.com \
+ --botlike-filename-regexps='.+\.xml,.+\.bzl,BUILD,WORKSPACE,\.gitignore,plugins/,\.settings' \
-e gerrit \
--username gerrit-api-username \
--password gerrit-api-password
@@ -63,6 +64,7 @@
the system temporary directory
- -a --email-aliases (*optional*) "emails to author alias" input data path.
- -k --ignore-ssl-cert allows to proceed even for server connections otherwise considered insecure.
+- -n --botlike-filename-regexps (*optional*) comma-separated list of regexps that identify a bot-like commit; commits that modify only files whose names match will be flagged as bot-like.
CSVs with 3 columns are expected in input.
diff --git a/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/engine/GerritAnalyticsTransformations.scala b/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/engine/GerritAnalyticsTransformations.scala
index af185aa..ce96f28 100644
--- a/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/engine/GerritAnalyticsTransformations.scala
+++ b/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/engine/GerritAnalyticsTransformations.scala
@@ -84,7 +84,8 @@
commits: Array[CommitInfo],
branches: Array[String],
last_commit_date: Long,
- is_merge: Boolean)
+ is_merge: Boolean,
+ is_bot_like: Boolean)
import org.apache.spark.sql.Encoders
@@ -114,7 +115,8 @@
"json.num_files as num_files", "json.num_distinct_files as num_distinct_files",
"json.added_lines as added_lines", "json.deleted_lines as deleted_lines",
"json.num_commits as num_commits", "json.last_commit_date as last_commit_date",
- "json.is_merge as is_merge", "json.commits as commits", "json.branches as branches"
+ "json.is_merge as is_merge", "json.commits as commits", "json.branches as branches",
+ "json.is_bot_like"
)
}
diff --git a/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/job/Main.scala b/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/job/Main.scala
index dd9f6bb..9df2f9b 100644
--- a/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/job/Main.scala
+++ b/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/job/Main.scala
@@ -83,6 +83,9 @@
c.copy(extractBranches = Some(input))
} text "enables branches extraction for each commit"
+ opt[String]('n', "botlike-filename-regexps") optional () action { (input, c) =>
+ c.copy(botLikeRegexps = Some(input))
+ } text "comma separated list of regexps that identify a bot-like commit, commits that modify only files whose name is a match will be flagged as bot-like"
}
cliOptionParser.parse(args, GerritEndpointConfig()) match {
diff --git a/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/model/GerritEndpointConfig.scala b/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/model/GerritEndpointConfig.scala
index 4b5b9fc..fdee284 100644
--- a/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/model/GerritEndpointConfig.scala
+++ b/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/model/GerritEndpointConfig.scala
@@ -14,6 +14,7 @@
package com.gerritforge.analytics.gitcommits.model
+import java.net.URLEncoder
import java.time.format.DateTimeFormatter
import java.time.{LocalDate, ZoneOffset}
@@ -32,7 +33,8 @@
username: Option[String] = None,
password: Option[String] = None,
ignoreSSLCert: Option[Boolean] = None,
- extractBranches: Option[Boolean] = None) {
+ extractBranches: Option[Boolean] = None,
+ botLikeRegexps: Option[String] = None) {
val gerritApiConnection: GerritConnectivity = new GerritConnectivity(username, password, ignoreSSLCert.getOrElse(false))
@@ -53,7 +55,8 @@
"since" -> since.map(format.format),
"until" -> until.map(format.format),
"aggregate" -> aggregate,
- "extract-branches" -> extractBranches.map(_.toString)
+ "extract-branches" -> extractBranches.map(_.toString),
+ "botlike-filename-regexps" -> botLikeRegexps.map(URLEncoder.encode(_, "UTF-8"))
).flatMap(queryOpt).mkString("?", "&", "")
def contributorsUrl(projectName: String): Option[String] =
diff --git a/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/plugin/ProcessGitCommitsCommand.scala b/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/plugin/ProcessGitCommitsCommand.scala
index fccfc81..67fcc41 100644
--- a/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/plugin/ProcessGitCommitsCommand.scala
+++ b/gitcommits/src/main/scala/com/gerritforge/analytics/gitcommits/plugin/ProcessGitCommitsCommand.scala
@@ -57,6 +57,13 @@
usage = "enables branches extraction for each commit")
var extractBranches: Boolean = false
+
+ @ArgOption(name = "--botlike-filename-regexps",
+ aliases = Array("-n"),
+ usage = "comma separated list of regexps that identify a bot-like commit, commits that modify only files whose name is a match will be flagged as bot-like")
+ var botLikeRegexps: String = ""
+
+
override def run() {
implicit val config = GerritEndpointConfig(gerritConfig.getListenUrl(),
prefix =
@@ -67,7 +74,8 @@
endDate,
aggregate,
emailAlias,
- ignoreSSLCert=Some(ignoreSSLCert)
+ ignoreSSLCert=Some(ignoreSSLCert),
+ botLikeRegexps=botLikeRegexps
)
implicit val spark: SparkSession = SparkSession
diff --git a/gitcommits/src/test/scala/com/gerritforge/analytics/GerritAnalyticsTransformationsSpec.scala b/gitcommits/src/test/scala/com/gerritforge/analytics/GerritAnalyticsTransformationsSpec.scala
index 40c5391..72e0836 100644
--- a/gitcommits/src/test/scala/com/gerritforge/analytics/GerritAnalyticsTransformationsSpec.scala
+++ b/gitcommits/src/test/scala/com/gerritforge/analytics/GerritAnalyticsTransformationsSpec.scala
@@ -116,10 +116,10 @@
import sql.implicits._
val rdd = sc.parallelize(Seq(
- ("p1","""{"name":"a","email":"a@mail.com","year":2017,"month":9, "day":11, "hour":23, "num_commits":1, "num_files": 2, "num_distinct_files": 2, "added_lines":1, "deleted_lines":1, "last_commit_date":0, "is_merge": false, "commits":[{ "sha1": "e063a806c33bd524e89a87732bd3f1ad9a77a41e", "date":0,"merge":false, "files": ["file1.txt", "file2.txt"]}], "branches": ["master", "stable-2.14"] }"""),
- ("p2","""{"name":"b","email":"b@mail.com","year":2017,"month":9, "day":11, "hour":23, "num_commits":428, "num_files": 2, "num_distinct_files": 3, "added_lines":1, "deleted_lines":1, "last_commit_date":1500000000000, "is_merge": true, "commits":[{"sha1":"e063a806c33bd524e89a87732bd3f1ad9a77a41e", "date":0,"merge":true, "files": ["file3.txt", "file4.txt"] },{"sha1":"e063a806c33bd524e89a87732bd3f1ad9a77a41e", "date":1500000000000,"merge":true, "files": ["file1.txt", "file4.txt"]}]}, "branches":[]"""),
+ ("p1","""{"name":"a","email":"a@mail.com","year":2017,"month":9, "day":11, "hour":23, "num_commits":1, "num_files": 2, "num_distinct_files": 2, "added_lines":1, "deleted_lines":1, "last_commit_date":0, "is_merge": false, "is_bot_like": false, "commits":[{ "sha1": "e063a806c33bd524e89a87732bd3f1ad9a77a41e", "date":0,"merge":false, "files": ["file1.txt", "file2.txt"]}], "branches": ["master", "stable-2.14"]}"""),
+ ("p2","""{"name":"b","email":"b@mail.com","year":2017,"month":9, "day":11, "hour":23, "num_commits":428, "num_files": 2, "num_distinct_files": 3, "added_lines":1, "deleted_lines":1, "last_commit_date":1500000000000, "is_merge": true, "is_bot_like":true, "commits":[{"sha1":"e063a806c33bd524e89a87732bd3f1ad9a77a41e", "date":0,"merge":true, "files": ["file3.txt", "file4.txt"] },{"sha1":"e063a806c33bd524e89a87732bd3f1ad9a77a41e", "date":1500000000000,"merge":true, "files": ["file1.txt", "file4.txt"]}]}, "branches":[]"""),
// last commit is missing hour,day,month,year to check optionality
- ("p3","""{"name":"c","email":"c@mail.com","num_commits":12,"num_files": 4, "num_distinct_files": 2, "added_lines":1, "deleted_lines":1, "last_commit_date":1600000000000,"is_merge": true,"commits":[{"sha1":"e063a806c33bd524e89a87732bd3f1ad9a77a41e", "date":0,"merge":true, "files": ["file1.txt", "file2.txt"] },{"sha1":"e063a806c33bd524e89a87732bd3f1ad9a77a41e", "date":1600000000000,"merge":true, "files": ["file1.txt", "file2.txt"]}]}, "branches":[]""")
+ ("p3","""{"name":"c","email":"c@mail.com","num_commits":12,"num_files": 4, "num_distinct_files": 2, "added_lines":1, "deleted_lines":1, "last_commit_date":1600000000000,"is_merge": true, "is_bot_like":false,"commits":[{"sha1":"e063a806c33bd524e89a87732bd3f1ad9a77a41e", "date":0,"merge":true, "files": ["file1.txt", "file2.txt"] },{"sha1":"e063a806c33bd524e89a87732bd3f1ad9a77a41e", "date":1600000000000,"merge":true, "files": ["file1.txt", "file2.txt"]}]}, "branches":[]""")
))
val df = rdd.toDF("project", "json")
@@ -133,15 +133,15 @@
"year", "month", "day", "hour",
"num_files", "num_distinct_files", "added_lines", "deleted_lines",
"num_commits", "last_commit_date",
- "is_merge", "commits", "branches")
+ "is_merge", "commits", "branches", "is_bot_like")
collected should contain allOf(
Row("p1", "a", "a@mail.com", 2017, 9, 11, 23, 2, 2, 1, 1, 1, 0, false,
- new mutable.WrappedArray.ofRef(Array(Row("e063a806c33bd524e89a87732bd3f1ad9a77a41e", 0l, false))), new mutable.WrappedArray.ofRef(Array("master", "stable-2.14"))),
+ new mutable.WrappedArray.ofRef(Array(Row("e063a806c33bd524e89a87732bd3f1ad9a77a41e", 0l, false))), new mutable.WrappedArray.ofRef(Array("master", "stable-2.14")), false),
Row("p2", "b", "b@mail.com", 2017, 9, 11, 23, 2, 3, 1, 1, 428, 1500000000000L, true,
- new mutable.WrappedArray.ofRef[Row](Array(Row("e063a806c33bd524e89a87732bd3f1ad9a77a41e", 0l, true), Row("e063a806c33bd524e89a87732bd3f1ad9a77a41e", 1500000000000L, true))), null),
+ new mutable.WrappedArray.ofRef[Row](Array(Row("e063a806c33bd524e89a87732bd3f1ad9a77a41e", 0l, true), Row("e063a806c33bd524e89a87732bd3f1ad9a77a41e", 1500000000000L, true))), null, true),
Row("p3", "c", "c@mail.com", null, null, null, null, 4, 2, 1, 1, 12, 1600000000000L, true,
- new mutable.WrappedArray.ofRef[Row](Array(Row("e063a806c33bd524e89a87732bd3f1ad9a77a41e", 0l, true), Row("e063a806c33bd524e89a87732bd3f1ad9a77a41e", 1600000000000L, true))), null)
+ new mutable.WrappedArray.ofRef[Row](Array(Row("e063a806c33bd524e89a87732bd3f1ad9a77a41e", 0l, true), Row("e063a806c33bd524e89a87732bd3f1ad9a77a41e", 1600000000000L, true))), null, false)
)
}