blob: 195cc0230b98cbf64e4b6bfd4f13b914c95ea169 [file] [log] [blame]
// Copyright (C) 2017 GerritForge Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.gerritforge.analytics.engine
import java.io.{BufferedReader, IOException, InputStreamReader}
import java.net.URL
import java.nio.charset.StandardCharsets
import java.time.format.DateTimeFormatter
import java.time.{LocalDateTime, ZoneId, ZoneOffset, ZonedDateTime}
import com.gerritforge.analytics.model._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.{udf, _}
import org.apache.spark.sql.{DataFrame, SparkSession}
import scala.collection.JavaConverters._
object GerritAnalyticsTransformations {
implicit class PimpedRDDString(val rdd: RDD[GerritProject]) extends AnyVal {
def enrichWithSource(implicit config: GerritEndpointConfig): RDD[ProjectContributionSource] = {
rdd.map { project =>
ProjectContributionSource(project.name, config.contributorsUrl(project.id))
}
}
}
def getLinesFromURL(sourceURL: String): Iterator[String] = {
val is = new URL(sourceURL).openConnection.getInputStream
new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))
.lines.iterator().asScala
.filterNot(_.trim.isEmpty)
}
def getProjectJsonContributorsArray(project: String, sourceURL: String): Array[(String, String)] = {
try {
getLinesFromURL(sourceURL)
.map(s => (project, s))
.toArray
} catch {
case e: IOException => Array()
}
}
def getAliasDF(emailAliases: Option[String])(implicit spark: SparkSession): Option[DataFrame] = {
emailAliases.map { path =>
spark.sqlContext.read
.option("header", "true")
.option("mode", "DROPMALFORMED")
.option("inferSchema", "true")
.csv(path)
.toDF()
}
}
case class CommitterInfo(author: String, email_alias: String)
case class CommitInfo(sha1: String, date: Long, isMerge: Boolean)
case class UserActivitySummary(year: Integer,
month: Integer,
day: Integer,
hour: Integer,
name: String,
email: String,
num_commits: Integer,
num_files: Integer,
added_lines: Integer,
deleted_lines: Integer,
commits: Array[CommitInfo],
last_commit_date: Long)
import org.apache.spark.sql.Encoders
val schema = Encoders.product[UserActivitySummary].schema
implicit class PimpedDataFrame(val df: DataFrame) extends AnyVal {
def transformCommitterInfo()(implicit spark: SparkSession): DataFrame = {
import org.apache.spark.sql.functions.from_json
import spark.sqlContext.implicits._
df.withColumn("json", from_json($"json", schema))
.selectExpr(
"project", "json.name as author", "json.email as email",
"json.year as year", "json.month as month", "json.day as day", "json.hour as hour",
"json.num_files as num_files", "json.added_lines as added_lines", "json.deleted_lines as deleted_lines",
"json.num_commits as num_commits", "json.last_commit_date as last_commit_date")
}
def handleAliases(aliasesDF: Option[DataFrame])(implicit spark: SparkSession): DataFrame = {
aliasesDF
.map{ eaDF =>
val renamedAliasesDF = eaDF
.withColumnRenamed("email", "email_alias")
.withColumnRenamed("author", "author_alias")
.withColumnRenamed("organization", "organization_alias")
df.join(renamedAliasesDF, df("email") === renamedAliasesDF("email_alias"), "left_outer" )
.withColumn("organization",
when(renamedAliasesDF("organization_alias").notEqual(""), renamedAliasesDF("organization_alias"))
.otherwise(df("organization")) )
.withColumn("author", coalesce(renamedAliasesDF("author_alias"), df("author")))
.drop("email_alias","author_alias", "organization_alias")
}
.getOrElse(df)
}
def convertDates(columnName: String)(implicit spark: SparkSession): DataFrame = {
df.withColumn(columnName, longDateToISOUdf(col(columnName)))
}
def addOrganization()(implicit spark: SparkSession): DataFrame =
df.withColumn("organization", emailToDomainUdf(col("email")))
}
private def emailToDomain(email: String): String = email match {
case Email(_, domain) => domain
case _ => ""
}
private def emailToDomainUdf = udf(emailToDomain(_: String))
implicit class PimpedRDDProjectContributionSource(val projectsAndUrls: RDD[ProjectContributionSource]) extends AnyVal {
def fetchRawContributors(implicit spark: SparkSession): RDD[(String, String)] = {
projectsAndUrls.flatMap {
p => getProjectJsonContributorsArray(p.name, p.contributorsUrl)
}
}
}
import org.apache.spark.sql.functions.udf
val longDateToISOUdf = udf(longDateToISO(_: Number))
def longDateToISO(in: Number): String =
ZonedDateTime.ofInstant(
LocalDateTime.ofEpochSecond(in.longValue() / 1000L, 0, ZoneOffset.UTC),
ZoneOffset.UTC, ZoneId.of("Z")
) format DateTimeFormatter.ISO_OFFSET_DATE_TIME
}