Lowercase aliased organizations
This avoids duplicate organizations, in particular when the same organization
comes from both the aliases file and the email.
Change-Id: I163ee9ecac7ff2a01774408e9585032ea2ee31f9
diff --git a/src/main/scala/com/gerritforge/analytics/engine/GerritAnalyticsTransformations.scala b/src/main/scala/com/gerritforge/analytics/engine/GerritAnalyticsTransformations.scala
index 195cc02..5baafbb 100644
--- a/src/main/scala/com/gerritforge/analytics/engine/GerritAnalyticsTransformations.scala
+++ b/src/main/scala/com/gerritforge/analytics/engine/GerritAnalyticsTransformations.scala
@@ -109,8 +109,8 @@
df.join(renamedAliasesDF, df("email") === renamedAliasesDF("email_alias"), "left_outer" )
.withColumn("organization",
- when(renamedAliasesDF("organization_alias").notEqual(""), renamedAliasesDF("organization_alias"))
- .otherwise(df("organization")) )
+ when(renamedAliasesDF("organization_alias").notEqual(""), lower(renamedAliasesDF("organization_alias")))
+ .otherwise(df("organization")))
.withColumn("author", coalesce(renamedAliasesDF("author_alias"), df("author")))
.drop("email_alias","author_alias", "organization_alias")
}
diff --git a/src/test/scala/com/gerritforge/analytics/GerritAnalyticsTransformationsSpec.scala b/src/test/scala/com/gerritforge/analytics/GerritAnalyticsTransformationsSpec.scala
index 0e7702f..b00d36c 100644
--- a/src/test/scala/com/gerritforge/analytics/GerritAnalyticsTransformationsSpec.scala
+++ b/src/test/scala/com/gerritforge/analytics/GerritAnalyticsTransformationsSpec.scala
@@ -200,6 +200,25 @@
df.schema.fields.map(_.name) should contain allOf("author", "email", "organization")
}
+ it should "lowercase aliased organizations" in {
+ import spark.implicits._
+ val inputSampleDF = sc.parallelize(Seq(
+ ("author_name", "email@mail.com", "an_organization")
+ )).toDF("author", "email", "organization")
+
+ val aliasDF = sc.parallelize(Seq(
+ ("author_name", "email@mail.com", "OrGaNiZaTiOnToBeLoWeRcAsEd")
+ )).toDF("author", "email", "organization")
+
+ val df = inputSampleDF.handleAliases(Some(aliasDF))
+
+ val expectedDF = sc.parallelize(Seq(
+ ("author_name", "email@mail.com", "organizationtobelowercased")
+ )).toDF("author", "email", "organization")
+
+ df.collect should contain theSameElementsAs expectedDF.collect
+ }
+
"addOrganization" should "compute organization column from the email" in {
import sql.implicits._