Skip to content

Commit

Permalink
Move FsInput creation into AvroFileReader (#5556)
Browse files — browse the repository at this point in the history
Signed-off-by: Firestarman <firestarmanllc@gmail.com>
  • Loading branch information
firestarman authored May 20, 2022
1 parent c9f3595 commit b30d80e
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package com.nvidia.spark.rapids

import java.io.{EOFException, InputStream, IOException, OutputStream}
import java.net.URI
import java.nio.charset.StandardCharsets

import scala.collection.mutable
Expand All @@ -25,7 +26,10 @@ import org.apache.avro.Schema
import org.apache.avro.file.DataFileConstants._
import org.apache.avro.file.SeekableInput
import org.apache.avro.io.{BinaryData, BinaryDecoder, DecoderFactory}
import org.apache.avro.mapred.FsInput
import org.apache.commons.io.output.{CountingOutputStream, NullOutputStream}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

private[rapids] class AvroSeekableInputStream(in: SeekableInput) extends InputStream
with SeekableInput {
Expand Down Expand Up @@ -434,13 +438,21 @@ class AvroDataFileReader(si: SeekableInput) extends AvroFileReader(si) {

}

object AvroFileReader {
object AvroFileReader extends Arm {

def openMetaReader(si: SeekableInput): AvroMetaFileReader = {
new AvroMetaFileReader(si)
def openMetaReader(filePath: String, conf: Configuration): AvroMetaFileReader = {
closeOnExcept(openFile(filePath, conf)) { si =>
new AvroMetaFileReader(si)
}
}

def openDataReader(filePath: String, conf: Configuration): AvroDataFileReader = {
closeOnExcept(openFile(filePath, conf)) { si =>
new AvroDataFileReader(si)
}
}

def openDataReader(si: SeekableInput): AvroDataFileReader = {
new AvroDataFileReader(si)
private def openFile(filePath: String, conf: Configuration): SeekableInput = {
new FsInput(new Path(new URI(filePath)), conf)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ import com.nvidia.spark.rapids.GpuMetric.{GPU_DECODE_TIME, NUM_OUTPUT_BATCHES, P
import com.nvidia.spark.rapids.RapidsPluginImplicits._
import org.apache.avro.Schema
import org.apache.avro.file.DataFileConstants.SYNC_SIZE
import org.apache.avro.mapred.FsInput
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, Path}

Expand Down Expand Up @@ -701,9 +700,7 @@ class GpuMultiFileCloudAvroPartitionReader(
*/
private def doRead(): HostMemoryBuffersWithMetaDataBase = {
val startingBytesRead = fileSystemBytesRead()
val in = new FsInput(new Path(new URI(partFile.filePath)), config)
val reader = closeOnExcept(in) { _ => AvroFileReader.openDataReader(in) }
withResource(reader) { _ =>
withResource(AvroFileReader.openDataReader(partFile.filePath, config)) { reader =>
// Go to the start of the first block after the start position
reader.sync(partFile.start)
if (!reader.hasNextBlock || isDone) {
Expand Down Expand Up @@ -959,9 +956,7 @@ case class AvroFileFilterHandler(

def filterBlocks(partFile: PartitionedFile): AvroBlockMeta = {
if (ignoreExtension || partFile.filePath.endsWith(".avro")) {
val in = new FsInput(new Path(new URI(partFile.filePath)), hadoopConf)
val reader = closeOnExcept(in) { _ => AvroFileReader.openMetaReader(in) }
withResource(reader) { _ =>
withResource(AvroFileReader.openMetaReader(partFile.filePath, hadoopConf)) { reader =>
// Get blocks only belong to this split
reader.sync(partFile.start)
val partBlocks = reader.getPartialBlocks(partFile.start + partFile.length)
Expand Down

0 comments on commit b30d80e

Please sign in to comment.