Merge pull request #1 from srowen/Suggestions
Suggestions, fixes from test run
sethah committed Jun 19, 2017
2 parents 086fc04 + 1c6b1f6 commit a7c2cc2
Showing 11 changed files with 100 additions and 111 deletions.
8 changes: 0 additions & 8 deletions .gitignore
@@ -1,12 +1,4 @@
target/
pom.xml.tag
pom.xml.releaseBackup
pom.xml.versionsBackup
pom.xml.next
release.properties
dependency-reduced-pom.xml
buildNumber.properties
.mvn/timing.properties

.idea/
*.iml
64 changes: 34 additions & 30 deletions README.md
@@ -2,8 +2,7 @@

````
cd [wherever this repo is]
mvn clean package
zip -d dl4j-ui/target/dl4j-ui-1.0.0-SNAPSHOT-jar-with-dependencies.jar META-INF/*.RSA META-INF/*.DSA META-INF/*.SF
mvn -Pspark-deploy clean package
````

## Deployment
@@ -13,28 +12,28 @@ zip -d dl4j-ui/target/dl4j-ui-1.0.0-SNAPSHOT-jar-with-dependencies.jar META-INF/
On Mac OSX:

````
curl -L -o 256_ObjectCategories.tar http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar
curl -L -O http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar
tar -xf 256_ObjectCategories.tar
mkdir 256_ObjectCategories/train 256_ObjectCategories/test 256_ObjectCategories/valid
find -E ./256_ObjectCategories/ -type f -regex ".*/[0-9][0-9][0-9]\..+" -print | gshuf | head -n 5000 | xargs -I {} mv {} ./256_ObjectCategories/valid/
find -E ./256_ObjectCategories/ -type f -regex ".*/[0-9][0-9][0-9]\..+" -print | gshuf | head -n 6000 | xargs -I {} mv {} ./256_ObjectCategories/test/
find -E ./256_ObjectCategories/ -type f -regex ".*/[0-9][0-9][0-9]\..+" -print | xargs -I {} mv {} ./256_ObjectCategories/train/
find -E ./256_ObjectCategories/ -type d -regex ".*/[0-9][0-9][0-9]\..+" -delete
mkdir -p 256_ObjectCategories/train 256_ObjectCategories/test 256_ObjectCategories/valid
find -E ./256_ObjectCategories/ -type f -regex ".*/[0-9]{3}\..+" -print | gshuf | head -n 5000 | xargs -I {} mv {} ./256_ObjectCategories/valid/
find -E ./256_ObjectCategories/ -type f -regex ".*/[0-9]{3}\..+" -print | gshuf | head -n 6000 | xargs -I {} mv {} ./256_ObjectCategories/test/
find -E ./256_ObjectCategories/ -type f -regex ".*/[0-9]{3}\..+" -print | xargs -I {} mv {} ./256_ObjectCategories/train/
find -E ./256_ObjectCategories/ -type d -regex ".*/[0-9]{3}\..+" -delete
````

On Linux:

````
curl -L -o 256_ObjectCategories.tar http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar
curl -L -O http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar
tar -xf 256_ObjectCategories.tar
mkdir 256_ObjectCategories/train 256_ObjectCategories/test 256_ObjectCategories/valid
find ./256_ObjectCategories/ -regextype posix-extended -type f -regex ".*/[0-9][0-9][0-9]\..+" -print | shuf | head -n 5000 | xargs -I {} mv {} ./256_ObjectCategories/valid/
find ./256_ObjectCategories/ -regextype posix-extended -type f -regex ".*/[0-9][0-9][0-9]\..+" -print | shuf | head -n 6000 | xargs -I {} mv {} ./256_ObjectCategories/test/
find ./256_ObjectCategories/ -regextype posix-extended -type f -regex ".*/[0-9][0-9][0-9]\..+" -print | xargs -I {} mv {} ./256_ObjectCategories/train/
find ./256_ObjectCategories/ -regextype posix-extended -type d -regex ".*/[0-9][0-9][0-9]\..+" -delete
mkdir -p 256_ObjectCategories/train 256_ObjectCategories/test 256_ObjectCategories/valid
find ./256_ObjectCategories/ -regextype posix-extended -type f -regex ".*/[0-9]{3}\..+" -print | shuf | head -n 5000 | xargs -I {} mv {} ./256_ObjectCategories/valid/
find ./256_ObjectCategories/ -regextype posix-extended -type f -regex ".*/[0-9]{3}\..+" -print | shuf | head -n 6000 | xargs -I {} mv {} ./256_ObjectCategories/test/
find ./256_ObjectCategories/ -regextype posix-extended -type f -regex ".*/[0-9]{3}\..+" -print | xargs -I {} mv {} ./256_ObjectCategories/train/
find ./256_ObjectCategories/ -regextype posix-extended -type d -regex ".*/[0-9]{3}\..+" -delete
````

Save to HDFS.
Copy to HDFS.

````
hadoop fs -put ./256_ObjectCategories
@@ -43,13 +42,13 @@ hadoop fs -put ./256_ObjectCategories
### Copy app to edge node.

````
scp dl4j-cnn/target/dl4j-cnn-1.0.0-SNAPSHOT-jar-with-dependencies.jar [cluster]:
scp dl4j-cnn/target/dl4j-cnn-1.0.0-jar-with-dependencies.jar [cluster]:
````

If using webui:
If using the web UI:

````
scp dl4j-ui/target/dl4j-ui-1.0.0-SNAPSHOT-jar-with-dependencies.jar [cluster]:
scp dl4j-ui/target/dl4j-ui-1.0.0-jar-with-dependencies.jar [cluster]:
````

### Featurize the input data.
@@ -60,11 +59,13 @@ scp dl4j-ui/target/dl4j-ui-1.0.0-SNAPSHOT-jar-with-dependencies.jar [cluster]:
been featurized, instructions are below.

````
GC_FLAGS="-XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Dorg.bytedeco.javacpp.maxretries=100 -Dorg.bytedeco.javacpp.maxbytes=25000000000"
BIGTOP_JAVA_MAJOR=8 # ensures Java 8 on distros like CDH
spark2-submit \
--master yarn \
--deploy-mode client \
--conf spark.driver.extraJavaOptions="-XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Dorg.bytedeco.javacpp.maxretries=100 -Dorg.bytedeco.javacpp.maxbytes=25000000000" \
--conf spark.executor.extraJavaOptions="-XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Dorg.bytedeco.javacpp.maxretries=100 -Dorg.bytedeco.javacpp.maxbytes=25000000000" \
--conf spark.driver.extraJavaOptions="$GC_FLAGS" \
--conf spark.executor.extraJavaOptions="$GC_FLAGS" \
--conf spark.locality.wait=0 \
--conf spark.driver.maxResultSize=10g \
--conf spark.yarn.executor.memoryOverhead=27g \
@@ -77,7 +78,7 @@ spark2-submit \
--driver-memory 10g \
--num-executors=5 \
--class "com.cloudera.datascience.dl4j.cnn.examples.caltech256.SaveFeaturizedData" \
dl4j-cnn-1.0.0-SNAPSHOT-jar-with-dependencies.jar \
dl4j-cnn-1.0.0-jar-with-dependencies.jar \
--numClasses 257 \
--outputLayer fc2 \
--imagePath hdfs:///path/to/256_ObjectCategories/ \
@@ -93,32 +94,35 @@ here: [Caltech256_FeaturizedFC2](https://storage.googleapis.com/dl4j-256-objectc
Extract this folder and put it into HDFS.

````
tar -xvf 256_ObjectCategories_Featurized_FC2.tar
curl -L -O https://storage.googleapis.com/dl4j-256-objectcategories/256_ObjectCategories_Featurized_FC2.tar
tar -xf 256_ObjectCategories_Featurized_FC2.tar
hadoop fs -put 256_ObjectCategories_Featurized_FC2/
````
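For reference, the featurized data produced and consumed by the jobs in this diff is stored as Parquet rows of two byte arrays: a serialized ND4J feature matrix and a serialized label vector (see the `Row(f: Array[Byte], l: Array[Byte])` reads in the Scala sources below). A minimal sketch of reading it back, assuming an illustrative path and object name not taken from the repo:

````
import org.apache.spark.sql.{Row, SparkSession}
import org.nd4j.linalg.dataset.DataSet
import org.nd4j.linalg.factory.Nd4j

object ReadFeaturizedExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("read featurized data").getOrCreate()
    try {
      // Each Parquet row holds a serialized feature matrix and label vector.
      val data = spark.read
        .parquet("hdfs:///path/to/256_ObjectCategories_Featurized_FC2/train/")
        .rdd
        .map { case Row(f: Array[Byte], l: Array[Byte]) =>
          new DataSet(Nd4j.fromByteArray(f), Nd4j.fromByteArray(l))
        }
      println(s"Loaded ${data.count()} featurized examples")
    } finally {
      spark.stop()
    }
  }
}
````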

### Train a model

#### \[Optional\] Start the web ui
#### \[Optional\] Start the web UI

Specify the port via -p \[PORT\]
Specify the port via `-p [PORT]`

````
java -jar dl4j-ui-1.0.0-SNAPSHOT-jar-with-dependencies.jar -p 9000
java -jar dl4j-ui-1.0.0-jar-with-dependencies.jar -p 9000
````

Once you have the data featurized and saved in HDFS, you can train a model.

*Note*: Substitute the IP address of your machine in the command below in order to view the ui.
*Note*: Substitute the IP address of your machine in the command below in order to view the UI.
Optionally, don't provide a `--ui` argument to skip it entirely.

````
BIGTOP_JAVA_MAJOR=8 spark2-submit \
GC_FLAGS="-XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Dorg.bytedeco.javacpp.maxretries=100 -Dorg.bytedeco.javacpp.maxbytes=13000000000"
BIGTOP_JAVA_MAJOR=8 # ensures Java 8 on distros like CDH
spark2-submit \
--master yarn \
--deploy-mode client \
--conf spark.driver.extraJavaOptions="-XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Dorg.bytedeco.javacpp.maxretries=100 -Dorg.bytedeco.javacpp.maxbytes=13000000000" \
--conf spark.executor.extraJavaOptions="-XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Dorg.bytedeco.javacpp.maxretries=100 -Dorg.bytedeco.javacpp.maxbytes=13000000000" \
--conf spark.driver.extraJavaOptions="$GC_FLAGS" \
--conf spark.executor.extraJavaOptions="$GC_FLAGS" \
--conf spark.locality.wait=0 \
--conf spark.driver.maxResultSize=10g \
--conf spark.yarn.executor.memoryOverhead=15g \
@@ -131,7 +135,7 @@ BIGTOP_JAVA_MAJOR=8 spark2-submit \
--driver-memory 10g \
--num-executors=5 \
--class "com.cloudera.datascience.dl4j.cnn.examples.caltech256.TrainFeaturized" \
dl4j-cnn-1.0.0-SNAPSHOT-jar-with-dependencies.jar \
dl4j-cnn-1.0.0-jar-with-dependencies.jar \
--inputLayer fc2 \
--train hdfs:///path/to/256_ObjectCategories_Featurized_FC2/train/ \
--valid hdfs:///path/to/256_ObjectCategories_Featurized_FC2/valid/ \
17 changes: 4 additions & 13 deletions dl4j-cnn/pom.xml
@@ -7,15 +7,15 @@
<parent>
<groupId>com.cloudera.datascience</groupId>
<artifactId>dl4j-cnn-parent</artifactId>
<version>1.0.0-SNAPSHOT</version>
<version>1.0.0</version>
</parent>

<artifactId>dl4j-cnn</artifactId>
<version>1.0.0-SNAPSHOT</version>
<version>1.0.0</version>
<packaging>jar</packaging>

<properties>
<spark.version>2.1.0</spark.version>
<spark.version>2.1.1</spark.version>
<spark.deps.scope>compile</spark.deps.scope>
<dl4j.spark.version>${dl4j.version}_spark_2</dl4j.spark.version>
<nd4j.version>${dl4j.version}</nd4j.version>
@@ -87,19 +87,16 @@
<groupId>org.deeplearning4j</groupId>
<artifactId>deeplearning4j-core</artifactId>
<version>${dl4j.version}</version>
<scope>${spark.deps.scope}</scope>
</dependency>
<dependency>
<groupId>org.deeplearning4j</groupId>
<artifactId>deeplearning4j-nn</artifactId>
<version>${dl4j.version}</version>
<scope>${spark.deps.scope}</scope>
</dependency>
<dependency>
<groupId>org.deeplearning4j</groupId>
<artifactId>dl4j-spark_${scala.minor.version}</artifactId>
<version>${dl4j.spark.version}</version>
<scope>${spark.deps.scope}</scope>
</dependency>
<dependency>
<groupId>org.deeplearning4j</groupId>
@@ -112,39 +109,33 @@
<groupId>org.nd4j</groupId>
<artifactId>nd4j-native-platform</artifactId>
<version>${nd4j.version}</version>
<scope>${spark.deps.scope}</scope>
</dependency>
<dependency>
<groupId>org.nd4j</groupId>
<artifactId>nd4j-native</artifactId>
<version>${nd4j.version}</version>
<scope>${spark.deps.scope}</scope>
</dependency>
<dependency>
<groupId>org.nd4j</groupId>
<artifactId>nd4j-api</artifactId>
<version>${nd4j.version}</version>
<scope>${spark.deps.scope}</scope>
</dependency>
<dependency>
<groupId>org.nd4j</groupId>
<artifactId>nd4j-kryo_${scala.minor.version}</artifactId>
<version>${nd4j.version}</version>
<scope>${spark.deps.scope}</scope>
</dependency>

<dependency>
<groupId>org.datavec</groupId>
<artifactId>datavec-api</artifactId>
<version>${datavec.version}</version>
<scope>${spark.deps.scope}</scope>
</dependency>

<dependency>
<groupId>org.datavec</groupId>
<artifactId>datavec-spark_${scala.minor.version}</artifactId>
<version>${datavec.spark.version}</version>
<scope>${spark.deps.scope}</scope>
</dependency>

<!-- Image processing -->
@@ -161,7 +152,7 @@
<profile>
<id>spark-deploy</id>
<properties>
<spark.deps.scope>compile</spark.deps.scope>
<spark.deps.scope>provided</spark.deps.scope>
</properties>
</profile>
<profile>
SaveFeaturizedData.scala
@@ -5,7 +5,7 @@ import java.io.File
import com.cloudera.datascience.dl4j.cnn.Utils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext
import org.datavec.image.loader.NativeImageLoader
import org.deeplearning4j.nn.api.Layer
import org.deeplearning4j.nn.graph.ComputationGraph
@@ -63,12 +63,11 @@ object SaveFeaturizedData {
val savePath = param.savePath
val modelPath = param.modelPath

val sparkConf = new SparkConf().setAppName("Save output of convolutional layers").setMaster("local[*]")
val sc = new SparkContext(sparkConf)
sc.hadoopConfiguration.set("mapreduce.input.fileinputformat.input.dir.recursive", "true")
val spark = SparkSession.builder().master("local[*]").appName("Save output of convolutional layers").getOrCreate()
val logger = org.apache.log4j.LogManager.getLogger(this.getClass)
try {
val spark = SparkSession.builder().getOrCreate()
val sc = spark.sparkContext
sc.hadoopConfiguration.set("mapreduce.input.fileinputformat.input.dir.recursive", "true")
import spark.sqlContext.implicits._
val vgg16 = modelPath.map { path =>
val modelFile = new File(path)
@@ -98,7 +97,7 @@
df.write.parquet(s"$savePath$dir/")
}
} finally {
sc.stop()
spark.stop()
}
}

SaveFullOutput.scala
@@ -4,7 +4,6 @@ import java.io.File

import com.cloudera.datascience.dl4j.cnn.Utils
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
import org.deeplearning4j.nn.graph.ComputationGraph
import org.deeplearning4j.nn.transferlearning.{FineTuneConfiguration, TransferLearning, TransferLearningHelper}
import org.deeplearning4j.util.ModelSerializer
@@ -50,28 +49,30 @@ object SaveFullOutput {

def main(args: Array[String]): Unit = {
val param = Params.parseArgs(args)
val sparkConf = new SparkConf().setAppName("Featurize VGG dense layers")
val sc = new SparkContext(sparkConf)
val spark = SparkSession.builder().getOrCreate()
import spark.sqlContext.implicits._
val spark = SparkSession.builder().appName("Featurize VGG dense layers").getOrCreate()
try {
import spark.sqlContext.implicits._

val modelFile = new File(param.modelPath)
val vgg16 = ModelSerializer.restoreComputationGraph(modelFile)
val data = spark.read.parquet(param.dataPath)
.rdd
.map { case Row(f: Array[Byte], l: Array[Byte]) =>
new DataSet(Nd4j.fromByteArray(f), Nd4j.fromByteArray(l))
val modelFile = new File(param.modelPath)
val vgg16 = ModelSerializer.restoreComputationGraph(modelFile)
val data = spark.read.parquet(param.dataPath)
.rdd
.map { case Row(f: Array[Byte], l: Array[Byte]) =>
new DataSet(Nd4j.fromByteArray(f), Nd4j.fromByteArray(l))
}
val model = param.lastLayer match {
case "predictions" => convToPredictions(vgg16)
case "fc2" => convToFC2(vgg16)
}
val model = param.lastLayer match {
case "predictions" => convToPredictions(vgg16)
case "fc2" => convToFC2(vgg16)
}

val finalOutput = Utils.getPredictions(data, model, sc)
val df = finalOutput.map { ds =>
(Nd4j.toByteArray(ds.getFeatureMatrix), Nd4j.toByteArray(ds.getLabels))
}.toDF()
df.write.parquet(param.savePath)
val finalOutput = Utils.getPredictions(data, model, spark.sparkContext)
val df = finalOutput.map { ds =>
(Nd4j.toByteArray(ds.getFeatureMatrix), Nd4j.toByteArray(ds.getLabels))
}.toDF()
df.write.parquet(param.savePath)
} finally {
spark.stop()
}
}

private def convToFC2(vgg: ComputationGraph): ComputationGraph = {
ScoreSparkModel.scala
@@ -4,7 +4,6 @@ import java.io.File

import com.cloudera.datascience.dl4j.cnn.Utils
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
import org.deeplearning4j.util.ModelSerializer
import org.nd4j.linalg.dataset.DataSet
import org.nd4j.linalg.factory.Nd4j
@@ -32,11 +31,10 @@ object ScoreSparkModel {

def main(args: Array[String]): Unit = {
val param = Params.parseArgs(args)
val sparkConf = new SparkConf().setAppName("score a model")
val sc = new SparkContext(sparkConf)
sc.hadoopConfiguration.set("mapreduce.input.fileinputformat.input.dir.recursive", "true")
val spark = SparkSession.builder().appName("score a model").getOrCreate()
try {
val spark = SparkSession.builder().getOrCreate()
val sc = spark.sparkContext
sc.hadoopConfiguration.set("mapreduce.input.fileinputformat.input.dir.recursive", "true")
val restorePath = new File(param.modelPath)
val restored = ModelSerializer.restoreComputationGraph(restorePath)
val testRDD = spark.read.parquet(param.dataPath)
@@ -48,7 +46,7 @@
println(Utils.prettyPrintEvaluationStats(eval))

} finally {
sc.stop()
spark.stop()
}
}
}
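Across the three Scala sources above, the commit applies the same refactoring: build a single `SparkSession`, pull the `SparkContext` from it where an API still needs one, and stop the session in a `finally` block instead of stopping a hand-built `SparkContext`. A minimal sketch of that pattern, with illustrative names only:

````
import org.apache.spark.sql.SparkSession

object SparkSessionPatternSketch {
  def main(args: Array[String]): Unit = {
    // One SparkSession replaces the separate SparkConf/SparkContext setup.
    val spark = SparkSession.builder().appName("example app").getOrCreate()
    try {
      // The underlying SparkContext is still available where an API needs it.
      val sc = spark.sparkContext
      sc.hadoopConfiguration.set("mapreduce.input.fileinputformat.input.dir.recursive", "true")
      // ... read data and run the job here ...
    } finally {
      // Stopping the session also stops its SparkContext.
      spark.stop()
    }
  }
}
````

Stopping the session shuts down its underlying `SparkContext` as well, which is why the `finally` blocks in the diff now call `spark.stop()` rather than `sc.stop()`.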
