From f2c9544b1ffcd0142cc198a69a939139ec30cb27 Mon Sep 17 00:00:00 2001
From: Ruslan Iushchenko
Date: Sat, 23 Mar 2024 12:07:02 +0100
Subject: [PATCH] #659 Fix record length option when record id generation is
 turned on.

---
 .../cobol/reader/VarLenNestedReader.scala      |   9 +-
 .../parameters/CobolParametersParser.scala     |   2 +-
 .../regression/Test22RecordLengthGenId.scala   | 102 ++++++++++++++++++
 3 files changed, 107 insertions(+), 6 deletions(-)
 create mode 100644 spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/regression/Test22RecordLengthGenId.scala

diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/VarLenNestedReader.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/VarLenNestedReader.scala
index 41e13e49..a40edab6 100644
--- a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/VarLenNestedReader.scala
+++ b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/VarLenNestedReader.scala
@@ -17,16 +17,13 @@
 package za.co.absa.cobrix.cobol.reader
 
 import za.co.absa.cobrix.cobol.internal.Logging
-
-import java.nio.charset.{Charset, StandardCharsets}
 import za.co.absa.cobrix.cobol.parser.common.Constants
 import za.co.absa.cobrix.cobol.parser.encoding.codepage.CodePage
 import za.co.absa.cobrix.cobol.parser.encoding.{ASCII, EBCDIC}
 import za.co.absa.cobrix.cobol.parser.headerparsers.{RecordHeaderParser, RecordHeaderParserFactory}
-import za.co.absa.cobrix.cobol.parser.policies.FillerNamingPolicy
 import za.co.absa.cobrix.cobol.parser.recordformats.RecordFormat.{FixedBlock, VariableBlock}
 import za.co.absa.cobrix.cobol.parser.{Copybook, CopybookParser}
-import za.co.absa.cobrix.cobol.reader.extractors.raw.{FixedBlockParameters, FixedBlockRawRecordExtractor, RawRecordContext, RawRecordExtractor, RawRecordExtractorFactory, TextFullRecordExtractor, TextRecordExtractor, VarOccursRecordExtractor, VariableBlockVariableRecordExtractor}
+import za.co.absa.cobrix.cobol.reader.extractors.raw._
 import za.co.absa.cobrix.cobol.reader.extractors.record.RecordHandler
 import za.co.absa.cobrix.cobol.reader.index.IndexGenerator
 import za.co.absa.cobrix.cobol.reader.index.entry.SparseIndexEntry
@@ -37,6 +34,7 @@ import za.co.absa.cobrix.cobol.reader.schema.CobolSchema
 import za.co.absa.cobrix.cobol.reader.stream.SimpleStream
 import za.co.absa.cobrix.cobol.reader.validator.ReaderParametersValidator
 
+import java.nio.charset.{Charset, StandardCharsets}
 import scala.collection.immutable.HashMap
 import scala.collection.mutable.ArrayBuffer
 import scala.reflect.ClassTag
@@ -317,8 +315,9 @@
       }
     } else {
       // Fixed record length record parser
+      val recordSize = readerProperties.recordLength.getOrElse(cobolSchema.getRecordSize)
       RecordHeaderParserFactory.createRecordHeaderParser(Constants.RhRdwFixedLength,
-        cobolSchema.getRecordSize,
+        recordSize,
         readerProperties.fileStartOffset,
         readerProperties.fileEndOffset,
         0
diff --git a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala
index 818a31cc..fe835aea 100644
--- a/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala
+++ b/spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala
@@ -492,7 +492,7 @@ object CobolParametersParser extends Logging {
         logger.warn(s"Option '$PARAM_BLOCK_LENGTH' is ignored for record format: VB")
       }
       if (recordFormat == FixedBlock && bdw.recordsPerBlock.nonEmpty) {
-        logger.warn(s"Option '$PARAM_RECORDS_PER_BLOCK' is ignored for record format: VB")
+        logger.warn(s"Option '$PARAM_RECORDS_PER_BLOCK' is ignored for record format: F")
       }
       Some(bdw)
     } else {
diff --git a/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/regression/Test22RecordLengthGenId.scala b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/regression/Test22RecordLengthGenId.scala
new file mode 100644
index 00000000..6ff6c022
--- /dev/null
+++ b/spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/regression/Test22RecordLengthGenId.scala
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.cobrix.spark.cobol.source.regression
+
+import org.scalatest.wordspec.AnyWordSpec
+import org.slf4j.{Logger, LoggerFactory}
+import za.co.absa.cobrix.spark.cobol.source.base.{SimpleComparisonBase, SparkTestBase}
+import za.co.absa.cobrix.spark.cobol.source.fixtures.BinaryFileFixture
+import za.co.absa.cobrix.spark.cobol.utils.SparkUtils
+
+class Test22RecordLengthGenId extends AnyWordSpec with SparkTestBase with BinaryFileFixture with SimpleComparisonBase {
+
+  private implicit val logger: Logger = LoggerFactory.getLogger(this.getClass)
+
+  private val copybook =
+    """      01  R.
+                05  A    PIC 9(1).
+                05  B    PIC X(2).
+ """ + + val binFileContents: Array[Byte] = Array[Byte]( + // 123{456} + 0xF1.toByte, 0xF2.toByte, 0xF3.toByte, 0xC0.toByte, + // 789J123A + 0xF7.toByte, 0xF8.toByte, 0xF9.toByte, 0xD1.toByte, + // 65432101 + 0xF6.toByte, 0xF5.toByte, 0xF4.toByte, 0xF3.toByte + ) + + "EBCDIC files" should { + "correctly work without record it generation" in { + withTempBinFile("sign_overpunch", ".dat", binFileContents) { tmpFileName => + val df = spark + .read + .format("cobol") + .option("copybook_contents", copybook) + .option("record_format", "F") + .option("record_length", "4") + .option("pedantic", "true") + .load(tmpFileName) + + val expected = """[{"A":1,"B":"23"},{"A":7,"B":"89"},{"A":6,"B":"54"}]""" + + val actual = df.toJSON.collect().mkString("[", ",", "]") + + assertEqualsMultiline(actual, expected) + } + } + + "correctly work with record it generation" in { + withTempBinFile("sign_overpunch", ".dat", binFileContents) { tmpFileName => + val df = spark + .read + .format("cobol") + .option("copybook_contents", copybook) + .option("record_format", "F") + .option("record_length", "4") + .option("generate_record_id", "true") + .option("pedantic", "true") + .load(tmpFileName) + + val expected = """[ { + | "File_Id" : 0, + | "Record_Id" : 0, + | "Record_Byte_Length" : 4, + | "A" : 1, + | "B" : "23" + |}, { + | "File_Id" : 0, + | "Record_Id" : 1, + | "Record_Byte_Length" : 4, + | "A" : 7, + | "B" : "89" + |}, { + | "File_Id" : 0, + | "Record_Id" : 2, + | "Record_Byte_Length" : 4, + | "A" : 6, + | "B" : "54" + |} ]""".stripMargin + + val actual = SparkUtils.convertDataFrameToPrettyJSON(df) + + assertEqualsMultiline(actual, expected) + } + } + } +}