[SPARK-33084][CORE][SQL] Add jar support ivy path
### What changes were proposed in this pull request?
Support `add jar` with an Ivy path.

### Why are the changes needed?
Since application submission already supports Ivy coordinates, `add jar` can now support Ivy paths as well.

### Does this PR introduce _any_ user-facing change?
Users can add a jar with SQL like:
```
add jar ivy://group:artifact:version?exclude=xxx,xxx&transitive=true
add jar ivy://group:artifact:version?exclude=xxx,xxx&transitive=false
```

Core API:
```
sparkContext.addJar("ivy://group:artifact:version?exclude=xxx,xxx&transitive=true")
sparkContext.addJar("ivy://group:artifact:version?exclude=xxx,xxx&transitive=false")
```
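Per the docs added in this PR, `transitive` defaults to false when omitted, so a bare Ivy URI adds only the named artifact. A short sketch with placeholder coordinates (not from this PR):
```scala
// Placeholder coordinates, for illustration only.
// No query string: transitive defaults to false, so only the artifact itself is added.
sparkContext.addJar("ivy://org.mortbay.jetty:jetty:6.1.26")

// Resolve transitive dependencies too, minus an excluded group:module pair.
sparkContext.addJar(
  "ivy://org.mortbay.jetty:jetty:6.1.26?exclude=org.eclipse.jetty:jetty-http&transitive=true")
```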

#### Doc Update snapshot
![image](https://user-images.githubusercontent.com/46485123/101227738-de451200-36d3-11eb-813d-78a8b879da4f.png)

### How was this patch tested?
Added UT

Closes #29966 from AngersZhuuuu/support-add-jar-ivy.

Lead-authored-by: angerszhu <angers.zhu@gmail.com>
Co-authored-by: AngersZhuuuu <angers.zhu@gmail.com>
Signed-off-by: Takeshi Yamamuro <yamamuro@apache.org>
AngersZhuuuu authored and maropu committed Dec 25, 2020
1 parent 65a9ac2 commit 10b6466
Showing 15 changed files with 475 additions and 50 deletions.
45 changes: 28 additions & 17 deletions core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -1929,7 +1929,7 @@ class SparkContext(config: SparkConf) extends Logging {
}

private def addJar(path: String, addedOnSubmit: Boolean): Unit = {
def addLocalJarFile(file: File): String = {
def addLocalJarFile(file: File): Seq[String] = {
try {
if (!file.exists()) {
throw new FileNotFoundException(s"Jar ${file.getAbsolutePath} not found")
@@ -1938,15 +1938,15 @@ class SparkContext(config: SparkConf) extends Logging {
throw new IllegalArgumentException(
s"Directory ${file.getAbsoluteFile} is not allowed for addJar")
}
env.rpcEnv.fileServer.addJar(file)
Seq(env.rpcEnv.fileServer.addJar(file))
} catch {
case NonFatal(e) =>
logError(s"Failed to add $path to Spark environment", e)
null
Nil
}
}

def checkRemoteJarFile(path: String): String = {
def checkRemoteJarFile(path: String): Seq[String] = {
val hadoopPath = new Path(path)
val scheme = hadoopPath.toUri.getScheme
if (!Array("http", "https", "ftp").contains(scheme)) {
@@ -1959,47 +1959,58 @@ class SparkContext(config: SparkConf) extends Logging {
throw new IllegalArgumentException(
s"Directory ${path} is not allowed for addJar")
}
path
Seq(path)
} catch {
case NonFatal(e) =>
logError(s"Failed to add $path to Spark environment", e)
null
Nil
}
} else {
path
Seq(path)
}
}

if (path == null || path.isEmpty) {
logWarning("null or empty path specified as parameter to addJar")
} else {
val key = if (path.contains("\\") && Utils.isWindows) {
val (keys, scheme) = if (path.contains("\\") && Utils.isWindows) {
// For local paths with backslashes on Windows, URI throws an exception
addLocalJarFile(new File(path))
(addLocalJarFile(new File(path)), "local")
} else {
val uri = new Path(path).toUri
// SPARK-17650: Make sure this is a valid URL before adding it to the list of dependencies
Utils.validateURL(uri)
uri.getScheme match {
val uriScheme = uri.getScheme
val jarPaths = uriScheme match {
// A JAR file which exists only on the driver node
case null =>
// SPARK-22585 path without schema is not url encoded
addLocalJarFile(new File(uri.getPath))
// A JAR file which exists only on the driver node
case "file" => addLocalJarFile(new File(uri.getPath))
// A JAR file which exists locally on every worker node
case "local" => "file:" + uri.getPath
case "local" => Seq("file:" + uri.getPath)
case "ivy" =>
// Since `new Path(path).toUri` will lose query information,
// we use `URI.create(path)` here
DependencyUtils.resolveMavenDependencies(URI.create(path))
.flatMap(jar => addLocalJarFile(new File(jar)))
case _ => checkRemoteJarFile(path)
}
(jarPaths, uriScheme)
}
if (key != null) {
if (keys.nonEmpty) {
val timestamp = if (addedOnSubmit) startTime else System.currentTimeMillis
if (addedJars.putIfAbsent(key, timestamp).isEmpty) {
logInfo(s"Added JAR $path at $key with timestamp $timestamp")
val (added, existed) = keys.partition(addedJars.putIfAbsent(_, timestamp).isEmpty)
if (added.nonEmpty) {
val jarMessage = if (scheme != "ivy") "JAR" else "dependency jars of Ivy URI"
logInfo(s"Added $jarMessage $path at ${added.mkString(",")} with timestamp $timestamp")
postEnvironmentUpdate()
} else {
logWarning(s"The jar $path has been added already. Overwriting of added jars " +
"is not supported in the current version.")
}
if (existed.nonEmpty) {
val jarMessage = if (scheme != "ivy") "JAR" else "dependency jars of Ivy URI"
logInfo(s"The $jarMessage $path at ${existed.mkString(",")} has been added already." +
" Overwriting of added jar is not supported in the current version.")
}
}
}
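A minimal standalone sketch of the partition-based dedup above, assuming a plain `ConcurrentHashMap` as a stand-in for `addedJars` (URLs are illustrative):
```scala
import java.util.concurrent.ConcurrentHashMap
import scala.collection.JavaConverters._

object PartitionDedupSketch {
  def main(args: Array[String]): Unit = {
    // Stand-in for SparkContext.addedJars (key: served jar URL, value: timestamp).
    val addedJars = new ConcurrentHashMap[String, Long]().asScala
    addedJars.putIfAbsent("spark://host:1234/jars/a.jar", 0L)

    val timestamp = System.currentTimeMillis
    val keys = Seq("spark://host:1234/jars/a.jar", "spark://host:1234/jars/b.jar")

    // putIfAbsent returns None only when the key was newly inserted, so `added`
    // holds first-time jars and `existed` holds the ones already registered.
    val (added, existed) = keys.partition(addedJars.putIfAbsent(_, timestamp).isEmpty)
    println(s"added=$added existed=$existed") // only b.jar lands in `added`
  }
}
```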
8 changes: 5 additions & 3 deletions core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -304,8 +304,8 @@ private[spark] class SparkSubmit extends Logging {
// Resolve maven dependencies if there are any and add classpath to jars. Add them to py-files
// too for packages that include Python code
val resolvedMavenCoordinates = DependencyUtils.resolveMavenDependencies(
args.packagesExclusions, args.packages, args.repositories, args.ivyRepoPath,
args.ivySettingsPath)
packagesTransitive = true, args.packagesExclusions, args.packages,
args.repositories, args.ivyRepoPath, args.ivySettingsPath)

if (!StringUtils.isBlank(resolvedMavenCoordinates)) {
// In K8s client mode, when in the driver, add resolved jars early as we might need
@@ -1360,13 +1360,15 @@ private[spark] object SparkSubmitUtils {
* Resolves any dependencies that were supplied through maven coordinates
* @param coordinates Comma-delimited string of maven coordinates
* @param ivySettings An IvySettings containing resolvers to use
* @param transitive Whether to resolve transitive dependencies; default is true
* @param exclusions Exclusions to apply when resolving transitive dependencies
* @return The comma-delimited path to the jars of the given maven artifacts including their
* transitive dependencies
*/
def resolveMavenCoordinates(
coordinates: String,
ivySettings: IvySettings,
transitive: Boolean,
exclusions: Seq[String] = Nil,
isTest: Boolean = false): String = {
if (coordinates == null || coordinates.trim.isEmpty) {
Expand Down Expand Up @@ -1396,7 +1398,7 @@ private[spark] object SparkSubmitUtils {
val ivy = Ivy.newInstance(ivySettings)
// Set resolve options to download transitive dependencies as well
val resolveOptions = new ResolveOptions
resolveOptions.setTransitive(true)
resolveOptions.setTransitive(transitive)
val retrieveOptions = new RetrieveOptions
// Turn downloading and logging off for testing
if (isTest) {
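A hedged sketch of calling the updated resolver with the new flag; the coordinate is a placeholder, `buildIvySettings(None, None)` mirrors the call shape shown later in this diff, and since `SparkSubmitUtils` is `private[spark]`, this is only callable from Spark-internal code:
```scala
import org.apache.spark.deploy.SparkSubmitUtils

// Default Ivy settings: no extra repositories, default ivy path.
val ivySettings = SparkSubmitUtils.buildIvySettings(None, None)

// transitive = false fetches only the named artifact; transitive = true
// also resolves its dependency graph (minus any exclusions).
val jarPaths: String = SparkSubmitUtils.resolveMavenCoordinates(
  "org.mortbay.jetty:jetty:6.1.26", // placeholder coordinate
  ivySettings,
  transitive = false)
```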
@@ -22,7 +22,7 @@ import java.io.File
import org.apache.commons.lang3.StringUtils

import org.apache.spark.{SecurityManager, SparkConf}
import org.apache.spark.deploy.{DependencyUtils, SparkHadoopUtil}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.internal.{config, Logging}
import org.apache.spark.rpc.RpcEnv
import org.apache.spark.util._
@@ -79,17 +79,11 @@ object DriverWrapper extends Logging {
val secMgr = new SecurityManager(sparkConf)
val hadoopConf = SparkHadoopUtil.newConfiguration(sparkConf)

val Seq(packagesExclusions, packages, repositories, ivyRepoPath, ivySettingsPath) =
Seq(
"spark.jars.excludes",
"spark.jars.packages",
"spark.jars.repositories",
"spark.jars.ivy",
"spark.jars.ivySettings"
).map(sys.props.get(_).orNull)
val ivyProperties = DependencyUtils.getIvyProperties()

val resolvedMavenCoordinates = DependencyUtils.resolveMavenDependencies(packagesExclusions,
packages, repositories, ivyRepoPath, Option(ivySettingsPath))
val resolvedMavenCoordinates = DependencyUtils.resolveMavenDependencies(true,
ivyProperties.packagesExclusions, ivyProperties.packages, ivyProperties.repositories,
ivyProperties.ivyRepoPath, Option(ivyProperties.ivySettingsPath))
val jars = {
val jarsProp = sys.props.get(config.JARS.key).orNull
if (!StringUtils.isBlank(resolvedMavenCoordinates)) {
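A small usage sketch of the extracted helper, assuming driver-side system properties as the configuration source (values are illustrative; `DependencyUtils` is `private[spark]`, so this runs only inside Spark's own packages):
```scala
import org.apache.spark.util.DependencyUtils

// These are the config keys getIvyProperties() reads; the values are examples.
sys.props("spark.jars.packages") = "org.mortbay.jetty:jetty:6.1.26"
sys.props("spark.jars.repositories") = "https://repo1.maven.org/maven2"

val ivyProps = DependencyUtils.getIvyProperties()
// Keys that are unset come back null (via orNull), e.g. ivyProps.ivySettingsPath here.
assert(ivyProps.packages == "org.mortbay.jetty:jetty:6.1.26")
```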
@@ -15,7 +15,7 @@
* limitations under the License.
*/

package org.apache.spark.deploy
package org.apache.spark.util

import java.io.File
import java.net.URI
@@ -25,12 +25,140 @@ import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

import org.apache.spark.{SecurityManager, SparkConf, SparkException}
import org.apache.spark.deploy.SparkSubmitUtils
import org.apache.spark.internal.Logging
import org.apache.spark.util.{MutableURLClassLoader, Utils}

private[deploy] object DependencyUtils extends Logging {
case class IvyProperties(
packagesExclusions: String,
packages: String,
repositories: String,
ivyRepoPath: String,
ivySettingsPath: String)

private[spark] object DependencyUtils extends Logging {

def getIvyProperties(): IvyProperties = {
val Seq(packagesExclusions, packages, repositories, ivyRepoPath, ivySettingsPath) = Seq(
"spark.jars.excludes",
"spark.jars.packages",
"spark.jars.repositories",
"spark.jars.ivy",
"spark.jars.ivySettings"
).map(sys.props.get(_).orNull)
IvyProperties(packagesExclusions, packages, repositories, ivyRepoPath, ivySettingsPath)
}

private def isInvalidQueryString(tokens: Array[String]): Boolean = {
tokens.length != 2 || StringUtils.isBlank(tokens(0)) || StringUtils.isBlank(tokens(1))
}

/**
* Parse URI query string's parameter value of `transitive` and `exclude`.
* Other invalid parameters will be ignored.
*
* @param uri Ivy URI to be downloaded.
* @return Tuple value of parameter `transitive` and `exclude` value.
*
* 1. transitive: whether to download the dependency jars of the Ivy URI; the default is false
* and the parameter value is case-sensitive. Invalid values are treated as false.
* Example: Input: exclude=org.mortbay.jetty:jetty&transitive=true
* Output: true
*
* 2. exclude: comma separated exclusions to apply when resolving transitive dependencies,
* consists of `group:module` pairs separated by commas.
* Example: Input: exclude=org.mortbay.jetty:jetty,org.eclipse.jetty:jetty-http
* Output: [org.mortbay.jetty:jetty,org.eclipse.jetty:jetty-http]
*/
private def parseQueryParams(uri: URI): (Boolean, String) = {
val uriQuery = uri.getQuery
if (uriQuery == null) {
(false, "")
} else {
val mapTokens = uriQuery.split("&").map(_.split("="))
if (mapTokens.exists(isInvalidQueryString)) {
throw new IllegalArgumentException(
s"Invalid query string in Ivy URI ${uri.toString}: $uriQuery")
}
val groupedParams = mapTokens.map(kv => (kv(0), kv(1))).groupBy(_._1)

// Parse transitive parameters (e.g., transitive=true) in an Ivy URI, default value is false
val transitiveParams = groupedParams.get("transitive")
if (transitiveParams.map(_.size).getOrElse(0) > 1) {
logWarning("It's best to specify `transitive` parameter in ivy URI query only once." +
" If there are multiple `transitive` parameter, we will select the last one")
}
val transitive =
transitiveParams.flatMap(_.takeRight(1).map(_._2 == "true").headOption).getOrElse(false)

// Parse an exclusion list (e.g., exclude=org.mortbay.jetty:jetty,org.eclipse.jetty:jetty-http)
// in an Ivy URI. When downloading the jars for an Ivy URI, Spark won't download the
// transitive jars in the excluded list.
val exclusionList = groupedParams.get("exclude").map { params =>
params.map(_._2).flatMap { excludeString =>
val excludes = excludeString.split(",")
if (excludes.map(_.split(":")).exists(isInvalidQueryString)) {
throw new IllegalArgumentException(
s"Invalid exclude string in Ivy URI ${uri.toString}:" +
" expected 'org:module,org:module,..', found " + excludeString)
}
excludes
}.mkString(",")
}.getOrElse("")

val validParams = Set("transitive", "exclude")
val invalidParams = groupedParams.keys.filterNot(validParams.contains).toSeq
if (invalidParams.nonEmpty) {
logWarning(s"Invalid parameters `${invalidParams.sorted.mkString(",")}` found " +
s"in Ivy URI query `$uriQuery`.")
}

(transitive, exclusionList)
}
}

/**
* Download Ivy URI's dependency jars.
*
* @param uri Ivy URI to be downloaded. The URI format should be:
*   `ivy://group:module:version[?query]`
* The Ivy URI query part format should be:
*   `parameter=value&parameter=value...`
* Note that the Ivy URI query part currently supports two parameters:
*  1. transitive: whether to download the jars the Ivy URI depends on.
*     Either `transitive=false` or `transitive=true`; if not set, the default value is false.
*  2. exclude: exclusion list applied when downloading the Ivy URI jar and its dependency
*     jars. The `exclude` parameter content is a ','-separated string of `group:module`
*     pairs: `exclude=group:module,group:module...`
* @return Comma-separated list of the downloaded jars.
*/
def resolveMavenDependencies(uri: URI): Seq[String] = {
val ivyProperties = DependencyUtils.getIvyProperties()
val authority = uri.getAuthority
if (authority == null) {
throw new IllegalArgumentException(
s"Invalid Ivy URI authority in uri ${uri.toString}:" +
" Expected 'org:module:version', found null.")
}
if (authority.split(":").length != 3) {
throw new IllegalArgumentException(
s"Invalid Ivy URI authority in uri ${uri.toString}:" +
s" Expected 'org:module:version', found $authority.")
}

val (transitive, exclusionList) = parseQueryParams(uri)

resolveMavenDependencies(
transitive,
exclusionList,
authority,
ivyProperties.repositories,
ivyProperties.ivyRepoPath,
Option(ivyProperties.ivySettingsPath)
).split(",")
}

def resolveMavenDependencies(
packagesTransitive: Boolean,
packagesExclusions: String,
packages: String,
repositories: String,
@@ -51,7 +179,8 @@ private[deploy] object DependencyUtils extends Logging {
SparkSubmitUtils.buildIvySettings(Option(repositories), Option(ivyRepoPath))
}

SparkSubmitUtils.resolveMavenCoordinates(packages, ivySettings, exclusions = exclusions)
SparkSubmitUtils.resolveMavenCoordinates(packages, ivySettings,
transitive = packagesTransitive, exclusions = exclusions)
}

def resolveAndDownloadJars(
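Putting the pieces together, a hedged end-to-end sketch of the new URI entry point (coordinates are placeholders; `parseQueryParams` is private, so the query semantics are exercised through the public `resolveMavenDependencies(URI)` overload, again from Spark-internal code):
```scala
import java.net.URI
import org.apache.spark.util.DependencyUtils

// transitive is case-sensitive and defaults to false; exclude takes a
// comma-separated list of group:module pairs.
val uri = URI.create(
  "ivy://org.mortbay.jetty:jetty:6.1.26" +
    "?exclude=org.eclipse.jetty:jetty-http&transitive=true")

// Resolves the artifact plus its transitive dependencies (minus the exclusions)
// and returns the local paths of the downloaded jars.
val jars: Seq[String] = DependencyUtils.resolveMavenDependencies(uri)

// A malformed authority fails fast with IllegalArgumentException, e.g.:
//   DependencyUtils.resolveMavenDependencies(URI.create("ivy://org:module"))
```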
