Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use parse_url kernel for HOST parsing #9845

Merged
merged 44 commits into from
Dec 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
8235c95
WIP: Support parse_url
thirtiseven Jul 20, 2023
9f17539
Merge branch 'NVIDIA:branch-23.08' into prase_url
thirtiseven Jul 20, 2023
729fe35
fix build failures
thirtiseven Jul 20, 2023
85c3284
regex refactor
thirtiseven Aug 3, 2023
6819214
Merge branch 'NVIDIA:branch-23.08' into prase_url
thirtiseven Aug 3, 2023
4166362
Separate regexes and UTF-8 special characters support
thirtiseven Aug 3, 2023
43acceb
hostname validation
thirtiseven Aug 3, 2023
64d8373
hostname validation
thirtiseven Aug 3, 2023
e6a45d3
ipv4 validation
thirtiseven Aug 4, 2023
8c4dc7a
verify
thirtiseven Aug 4, 2023
fee5a3d
wip ipv6 and SPARK-44500
thirtiseven Aug 4, 2023
e81d8a3
optional protocol and ref validation
thirtiseven Aug 7, 2023
93a9342
IPV6 VALIDATION
thirtiseven Aug 8, 2023
1ad665f
clean up
thirtiseven Aug 8, 2023
3edb929
Fix ipv6 validation, it is still wip
thirtiseven Aug 9, 2023
daa61ea
Fix ipv6 validation and some clean up
thirtiseven Aug 9, 2023
70a5d88
Merge branch 'prase_url' into parse_url_protocol
thirtiseven Oct 19, 2023
b3abaf6
Use parse_url kernel for PROTOCOL parsing
thirtiseven Oct 19, 2023
592c642
verify
thirtiseven Oct 19, 2023
9db1b2a
edit compatibility and update IT
thirtiseven Oct 19, 2023
d09f06d
update integration tests
thirtiseven Oct 20, 2023
3b71c4d
address comments
thirtiseven Oct 24, 2023
46527f3
remove unnecessary error handling
thirtiseven Oct 24, 2023
6161fa4
clean up
thirtiseven Oct 24, 2023
e16fe1e
Merge branch 'parse_url_protocol' of https://github.com/thirtiseven/s…
thirtiseven Nov 16, 2023
8e7ed44
Merge branch 'thirtiseven-parse_url_protocol' into parse_url_protocol
thirtiseven Nov 16, 2023
f93b944
Merge branch 'NVIDIA:branch-23.12' into parse_url_protocol
thirtiseven Nov 16, 2023
8f4990c
Revert scala tests temporarily for easier testing
thirtiseven Nov 16, 2023
3376376
Fix two nits
thirtiseven Nov 16, 2023
4e98888
Updated results
thirtiseven Nov 22, 2023
6d916c4
clean up
thirtiseven Nov 22, 2023
e2cbf2f
Merge branch 'branch-24.02' into parse_url_host
thirtiseven Nov 23, 2023
2ba1a53
Use parse_url kernel for HOST parsing
thirtiseven Nov 23, 2023
1b36090
rename urlFunctions to GpuParseUrl
thirtiseven Nov 28, 2023
12ea091
Merge branch 'parse_url_protocol' into parse_url_host
thirtiseven Nov 28, 2023
6b3feb6
Merge branch 'branch-24.02' into parse_url_host
thirtiseven Nov 28, 2023
451c201
address comments
thirtiseven Nov 28, 2023
5f43687
Merge branch 'NVIDIA:branch-24.02' into parse_url_host
thirtiseven Dec 5, 2023
a62c00c
update test names
thirtiseven Dec 5, 2023
2704487
remove scala test
thirtiseven Dec 6, 2023
11ef578
Merge branch 'NVIDIA:branch-24.02' into parse_url_host
thirtiseven Dec 8, 2023
e56173b
verify
thirtiseven Dec 8, 2023
c50b3df
format
thirtiseven Dec 11, 2023
1732ece
Merge branch 'branch-24.02' into parse_url_host
thirtiseven Dec 13, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/supported_ops.md
Original file line number Diff line number Diff line change
Expand Up @@ -10770,7 +10770,7 @@ are limited.
<td> </td>
<td> </td>
<td> </td>
<td><em>PS<br/>only support partToExtract=PROTOCOL;<br/>Literal value only</em></td>
<td><em>PS<br/>only support partToExtract = PROTOCOL | HOST;<br/>Literal value only</em></td>
<td> </td>
<td> </td>
<td> </td>
Expand Down
22 changes: 9 additions & 13 deletions integration_tests/src/main/python/url_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,23 +145,19 @@
edge_cases_gen = SetValuesGen(StringType(), edge_cases)

url_gen = StringGen(url_pattern)

supported_parts = ['PROTOCOL', 'HOST']
unsupported_parts = ['PATH', 'QUERY', 'REF', 'FILE', 'AUTHORITY', 'USERINFO']

@pytest.mark.parametrize('data_gen', [url_gen, edge_cases_gen], ids=idfn)
def test_parse_url_protocol(data_gen):
@pytest.mark.parametrize('part', supported_parts, ids=idfn)
def test_parse_url_supported(data_gen, part):
assert_gpu_and_cpu_are_equal_collect(
lambda spark : unary_op_df(spark, data_gen).selectExpr(
"a",
"parse_url(a, 'PROTOCOL')"
))

unsupported_parts = ['HOST', 'PATH', 'QUERY', 'REF', 'FILE', 'AUTHORITY', 'USERINFO']
lambda spark: unary_op_df(spark, data_gen).selectExpr("a", "parse_url(a, '" + part + "')"))

@allow_non_gpu('ProjectExec', 'ParseUrl')
@pytest.mark.parametrize('part', unsupported_parts, ids=idfn)
def test_parse_url_host_fallback(part):
def test_parse_url_unsupported_fallback(part):
assert_gpu_fallback_collect(
lambda spark : unary_op_df(spark, url_gen).selectExpr(
"a",
"parse_url(a, '" + part + "')"
),
'ParseUrl')
lambda spark: unary_op_df(spark, url_gen).selectExpr("a", "parse_url(a, '" + part + "')"),
'ParseUrl')
Original file line number Diff line number Diff line change
Expand Up @@ -3231,7 +3231,7 @@ object GpuOverrides extends Logging {
ExprChecks.projectOnly(TypeSig.STRING, TypeSig.STRING,
Seq(ParamCheck("url", TypeSig.STRING, TypeSig.STRING),
ParamCheck("partToExtract", TypeSig.lit(TypeEnum.STRING).withPsNote(
TypeEnum.STRING, "only support partToExtract=PROTOCOL"), TypeSig.STRING)),
TypeEnum.STRING, "only support partToExtract = PROTOCOL | HOST"), TypeSig.STRING)),
// Should really be an OptionalParam
Some(RepeatingParamCheck("key", TypeSig.lit(TypeEnum.STRING), TypeSig.STRING))),
(a, conf, p, r) => new ExprMeta[ParseUrl](a, conf, p, r) {
Expand All @@ -3241,7 +3241,7 @@ object GpuOverrides extends Logging {
}

extractStringLit(a.children(1)).map(_.toUpperCase) match {
case Some(GpuParseUrl.PROTOCOL) =>
case Some(part) if GpuParseUrl.isSupportedPart(part) =>
case Some(other) =>
willNotWorkOnGpu(s"Part to extract $other is not supported on GPU")
case None =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,15 @@ object GpuParseUrl {
val FILE = "FILE"
val AUTHORITY = "AUTHORITY"
val USERINFO = "USERINFO"

/**
 * Reports whether the requested part of the URL is one this object treats as supported.
 *
 * The value is compared as-is against the `PROTOCOL` and `HOST` constants, so any
 * case normalization must be done by the caller before invoking this method.
 *
 * @param part the partToExtract value to check
 * @return `true` when `part` equals `PROTOCOL` or `HOST`, `false` for anything else
 */
def isSupportedPart(part: String): Boolean =
  PROTOCOL == part || HOST == part
}

case class GpuParseUrl(children: Seq[Expression])
Expand All @@ -54,8 +63,11 @@ case class GpuParseUrl(children: Seq[Expression])
part match {
case PROTOCOL =>
ParseURI.parseURIProtocol(url.getBase)
case HOST | PATH | QUERY | REF | FILE | AUTHORITY | USERINFO =>
throw new UnsupportedOperationException(s"$this is not supported partToExtract=$part")
case HOST =>
ParseURI.parseURIHost(url.getBase)
case PATH | QUERY | REF | FILE | AUTHORITY | USERINFO =>
throw new UnsupportedOperationException(s"$this is not supported partToExtract=$part. " +
s"Only PROTOCOL and HOST are supported")
case _ =>
throw new IllegalArgumentException(s"Invalid partToExtract: $partToExtract")
}
Expand All @@ -67,7 +79,8 @@ case class GpuParseUrl(children: Seq[Expression])
// return a null columnvector
return ColumnVector.fromStrings(null, null)
}
throw new UnsupportedOperationException(s"$this only supports partToExtract = PROTOCOL")
throw new UnsupportedOperationException(s"$this is not supported partToExtract=$part. " +
s"Only PROTOCOL and HOST are supported")
}

override def columnarEval(batch: ColumnarBatch): GpuColumnVector = {
Expand All @@ -79,8 +92,8 @@ case class GpuParseUrl(children: Seq[Expression])
case partScalar: GpuScalar =>
GpuColumnVector.from(doColumnar(urls, partScalar), dataType)
case _ =>
throw new
UnsupportedOperationException(s"Cannot columnar evaluate expression: $this")
throw new UnsupportedOperationException(
s"Cannot columnar evaluate expression: $this")
}
}
}
Expand Down