Merge pull request #1 from srowen/Suggestions
Suggestions, fixes from test run
sethah committed Jun 19, 2017
2 parents 086fc04 + 1c6b1f6 commit a7c2cc2
Showing 11 changed files with 100 additions and 111 deletions.
8 changes: 0 additions & 8 deletions .gitignore
@@ -1,12 +1,4 @@
target/
pom.xml.tag
pom.xml.releaseBackup
pom.xml.versionsBackup
pom.xml.next
release.properties
dependency-reduced-pom.xml
buildNumber.properties
.mvn/timing.properties

.idea/
*.iml
64 changes: 34 additions & 30 deletions README.md
@@ -2,8 +2,7 @@

````
cd [wherever this repo is]
mvn clean package
zip -d dl4j-ui/target/dl4j-ui-1.0.0-SNAPSHOT-jar-with-dependencies.jar META-INF/*.RSA META-INF/*.DSA META-INF/*.SF
mvn -Pspark-deploy clean package
````

## Deployment
@@ -13,28 +12,28 @@ zip -d dl4j-ui/target/dl4j-ui-1.0.0-SNAPSHOT-jar-with-dependencies.jar META-INF/
On Mac OSX:

````
curl -L -o 256_ObjectCategories.tar http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar
curl -L -O http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar
tar -xf 256_ObjectCategories.tar
mkdir 256_ObjectCategories/train 256_ObjectCategories/test 256_ObjectCategories/valid
find -E ./256_ObjectCategories/ -type f -regex ".*/[0-9][0-9][0-9]\..+" -print | gshuf | head -n 5000 | xargs -I {} mv {} ./256_ObjectCategories/valid/
find -E ./256_ObjectCategories/ -type f -regex ".*/[0-9][0-9][0-9]\..+" -print | gshuf | head -n 6000 | xargs -I {} mv {} ./256_ObjectCategories/test/
find -E ./256_ObjectCategories/ -type f -regex ".*/[0-9][0-9][0-9]\..+" -print | xargs -I {} mv {} ./256_ObjectCategories/train/
find -E ./256_ObjectCategories/ -type d -regex ".*/[0-9][0-9][0-9]\..+" -delete
mkdir -p 256_ObjectCategories/train 256_ObjectCategories/test 256_ObjectCategories/valid
find -E ./256_ObjectCategories/ -type f -regex ".*/[0-9]{3}\..+" -print | gshuf | head -n 5000 | xargs -I {} mv {} ./256_ObjectCategories/valid/
find -E ./256_ObjectCategories/ -type f -regex ".*/[0-9]{3}\..+" -print | gshuf | head -n 6000 | xargs -I {} mv {} ./256_ObjectCategories/test/
find -E ./256_ObjectCategories/ -type f -regex ".*/[0-9]{3}\..+" -print | xargs -I {} mv {} ./256_ObjectCategories/train/
find -E ./256_ObjectCategories/ -type d -regex ".*/[0-9]{3}\..+" -delete
````

On Linux:

````
curl -L -o 256_ObjectCategories.tar http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar
curl -L -O http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar
tar -xf 256_ObjectCategories.tar
mkdir 256_ObjectCategories/train 256_ObjectCategories/test 256_ObjectCategories/valid
find ./256_ObjectCategories/ -regextype posix-extended -type f -regex ".*/[0-9][0-9][0-9]\..+" -print | shuf | head -n 5000 | xargs -I {} mv {} ./256_ObjectCategories/valid/
find ./256_ObjectCategories/ -regextype posix-extended -type f -regex ".*/[0-9][0-9][0-9]\..+" -print | shuf | head -n 6000 | xargs -I {} mv {} ./256_ObjectCategories/test/
find ./256_ObjectCategories/ -regextype posix-extended -type f -regex ".*/[0-9][0-9][0-9]\..+" -print | xargs -I {} mv {} ./256_ObjectCategories/train/
find ./256_ObjectCategories/ -regextype posix-extended -type d -regex ".*/[0-9][0-9][0-9]\..+" -delete
mkdir -p 256_ObjectCategories/train 256_ObjectCategories/test 256_ObjectCategories/valid
find ./256_ObjectCategories/ -regextype posix-extended -type f -regex ".*/[0-9]{3}\..+" -print | shuf | head -n 5000 | xargs -I {} mv {} ./256_ObjectCategories/valid/
find ./256_ObjectCategories/ -regextype posix-extended -type f -regex ".*/[0-9]{3}\..+" -print | shuf | head -n 6000 | xargs -I {} mv {} ./256_ObjectCategories/test/
find ./256_ObjectCategories/ -regextype posix-extended -type f -regex ".*/[0-9]{3}\..+" -print | xargs -I {} mv {} ./256_ObjectCategories/train/
find ./256_ObjectCategories/ -regextype posix-extended -type d -regex ".*/[0-9]{3}\..+" -delete
````

Save to HDFS.
Copy to HDFS.

````
hadoop fs -put ./256_ObjectCategories
@@ -43,13 +42,13 @@ hadoop fs -put ./256_ObjectCategories
### Copy app to edge node.

````
scp dl4j-cnn/target/dl4j-cnn-1.0.0-SNAPSHOT-jar-with-dependencies.jar [cluster]:
scp dl4j-cnn/target/dl4j-cnn-1.0.0-jar-with-dependencies.jar [cluster]:
````

If using webui:
If using the web UI:

````
scp dl4j-ui/target/dl4j-ui-1.0.0-SNAPSHOT-jar-with-dependencies.jar [cluster]:
scp dl4j-ui/target/dl4j-ui-1.0.0-jar-with-dependencies.jar [cluster]:
````

### Featurize the input data.
@@ -60,11 +59,13 @@ scp dl4j-ui/target/dl4j-ui-1.0.0-SNAPSHOT-jar-with-dependencies.jar [cluster]:
been featurized, instructions are below.

````
GC_FLAGS="-XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Dorg.bytedeco.javacpp.maxretries=100 -Dorg.bytedeco.javacpp.maxbytes=25000000000"
BIGTOP_JAVA_MAJOR=8 # ensures Java 8 on distros like CDH
spark2-submit \
--master yarn \
--deploy-mode client \
--conf spark.driver.extraJavaOptions="-XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Dorg.bytedeco.javacpp.maxretries=100 -Dorg.bytedeco.javacpp.maxbytes=25000000000" \
--conf spark.executor.extraJavaOptions="-XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Dorg.bytedeco.javacpp.maxretries=100 -Dorg.bytedeco.javacpp.maxbytes=25000000000" \
--conf spark.driver.extraJavaOptions="$GC_FLAGS" \
--conf spark.executor.extraJavaOptions="$GC_FLAGS" \
--conf spark.locality.wait=0 \
--conf spark.driver.maxResultSize=10g \
--conf spark.yarn.executor.memoryOverhead=27g \
@@ -77,7 +78,7 @@ spark2-submit \
--driver-memory 10g \
--num-executors=5 \
--class "com.cloudera.datascience.dl4j.cnn.examples.caltech256.SaveFeaturizedData" \
dl4j-cnn-1.0.0-SNAPSHOT-jar-with-dependencies.jar \
dl4j-cnn-1.0.0-jar-with-dependencies.jar \
--numClasses 257 \
--outputLayer fc2 \
--imagePath hdfs:///path/to/256_ObjectCategories/ \
@@ -93,32 +94,35 @@ here: [Caltech256_FeaturizedFC2](https://storage.googleapis.com/dl4j-256-objectc
Extract this folder and put it into HDFS.

````
tar -xvf 256_ObjectCategories_Featurized_FC2.tar
curl -L -O https://storage.googleapis.com/dl4j-256-objectcategories/256_ObjectCategories_Featurized_FC2.tar
tar -xf 256_ObjectCategories_Featurized_FC2.tar
hadoop fs -put 256_ObjectCategories_Featurized_FC2/
````
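For reference, the featurized data produced and consumed by the jobs in this diff is stored as Parquet rows of two byte arrays: a serialized ND4J feature matrix and a serialized label vector (see the `Row(f: Array[Byte], l: Array[Byte])` reads in the Scala sources below). A minimal sketch of reading it back, assuming an illustrative path and object name not taken from the repo:

````
import org.apache.spark.sql.{Row, SparkSession}
import org.nd4j.linalg.dataset.DataSet
import org.nd4j.linalg.factory.Nd4j

object ReadFeaturizedExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("read featurized data").getOrCreate()
    try {
      // Each Parquet row holds a serialized feature matrix and label vector.
      val data = spark.read
        .parquet("hdfs:///path/to/256_ObjectCategories_Featurized_FC2/train/")
        .rdd
        .map { case Row(f: Array[Byte], l: Array[Byte]) =>
          new DataSet(Nd4j.fromByteArray(f), Nd4j.fromByteArray(l))
        }
      println(s"Loaded ${data.count()} featurized examples")
    } finally {
      spark.stop()
    }
  }
}
````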

### Train a model

#### \[Optional\] Start the web ui
#### \[Optional\] Start the web UI

Specify the port via -p \[PORT\]
Specify the port via `-p [PORT]`

````
java -jar dl4j-ui-1.0.0-SNAPSHOT-jar-with-dependencies.jar -p 9000
java -jar dl4j-ui-1.0.0-jar-with-dependencies.jar -p 9000
````

Once you have the data featurized and saved in HDFS, you can train a model.

*Note*: Substitute the IP address of your machine in the command below in order to view the ui.
*Note*: Substitute the IP address of your machine in the command below in order to view the UI.
Optionally, don't provide a `--ui` argument to skip it entirely.

````
BIGTOP_JAVA_MAJOR=8 spark2-submit \
GC_FLAGS="-XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Dorg.bytedeco.javacpp.maxretries=100 -Dorg.bytedeco.javacpp.maxbytes=13000000000"
BIGTOP_JAVA_MAJOR=8 # ensures Java 8 on distros like CDH
spark2-submit \
--master yarn \
--deploy-mode client \
--conf spark.driver.extraJavaOptions="-XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Dorg.bytedeco.javacpp.maxretries=100 -Dorg.bytedeco.javacpp.maxbytes=13000000000" \
--conf spark.executor.extraJavaOptions="-XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Dorg.bytedeco.javacpp.maxretries=100 -Dorg.bytedeco.javacpp.maxbytes=13000000000" \
--conf spark.driver.extraJavaOptions="$GC_FLAGS" \
--conf spark.executor.extraJavaOptions="$GC_FLAGS" \
--conf spark.locality.wait=0 \
--conf spark.driver.maxResultSize=10g \
--conf spark.yarn.executor.memoryOverhead=15g \
@@ -131,7 +135,7 @@ BIGTOP_JAVA_MAJOR=8 spark2-submit \
--driver-memory 10g \
--num-executors=5 \
--class "com.cloudera.datascience.dl4j.cnn.examples.caltech256.TrainFeaturized" \
dl4j-cnn-1.0.0-SNAPSHOT-jar-with-dependencies.jar \
dl4j-cnn-1.0.0-jar-with-dependencies.jar \
--inputLayer fc2 \
--train hdfs:///path/to/256_ObjectCategories_Featurized_FC2/train/ \
--valid hdfs:///path/to/256_ObjectCategories_Featurized_FC2/valid/ \
17 changes: 4 additions & 13 deletions dl4j-cnn/pom.xml
@@ -7,15 +7,15 @@
<parent>
<groupId>com.cloudera.datascience</groupId>
<artifactId>dl4j-cnn-parent</artifactId>
<version>1.0.0-SNAPSHOT</version>
<version>1.0.0</version>
</parent>

<artifactId>dl4j-cnn</artifactId>
<version>1.0.0-SNAPSHOT</version>
<version>1.0.0</version>
<packaging>jar</packaging>

<properties>
<spark.version>2.1.0</spark.version>
<spark.version>2.1.1</spark.version>
<spark.deps.scope>compile</spark.deps.scope>
<dl4j.spark.version>${dl4j.version}_spark_2</dl4j.spark.version>
<nd4j.version>${dl4j.version}</nd4j.version>
@@ -87,19 +87,16 @@
<groupId>org.deeplearning4j</groupId>
<artifactId>deeplearning4j-core</artifactId>
<version>${dl4j.version}</version>
<scope>${spark.deps.scope}</scope>
</dependency>
<dependency>
<groupId>org.deeplearning4j</groupId>
<artifactId>deeplearning4j-nn</artifactId>
<version>${dl4j.version}</version>
<scope>${spark.deps.scope}</scope>
</dependency>
<dependency>
<groupId>org.deeplearning4j</groupId>
<artifactId>dl4j-spark_${scala.minor.version}</artifactId>
<version>${dl4j.spark.version}</version>
<scope>${spark.deps.scope}</scope>
</dependency>
<dependency>
<groupId>org.deeplearning4j</groupId>
@@ -112,39 +109,33 @@
<groupId>org.nd4j</groupId>
<artifactId>nd4j-native-platform</artifactId>
<version>${nd4j.version}</version>
<scope>${spark.deps.scope}</scope>
</dependency>
<dependency>
<groupId>org.nd4j</groupId>
<artifactId>nd4j-native</artifactId>
<version>${nd4j.version}</version>
<scope>${spark.deps.scope}</scope>
</dependency>
<dependency>
<groupId>org.nd4j</groupId>
<artifactId>nd4j-api</artifactId>
<version>${nd4j.version}</version>
<scope>${spark.deps.scope}</scope>
</dependency>
<dependency>
<groupId>org.nd4j</groupId>
<artifactId>nd4j-kryo_${scala.minor.version}</artifactId>
<version>${nd4j.version}</version>
<scope>${spark.deps.scope}</scope>
</dependency>

<dependency>
<groupId>org.datavec</groupId>
<artifactId>datavec-api</artifactId>
<version>${datavec.version}</version>
<scope>${spark.deps.scope}</scope>
</dependency>

<dependency>
<groupId>org.datavec</groupId>
<artifactId>datavec-spark_${scala.minor.version}</artifactId>
<version>${datavec.spark.version}</version>
<scope>${spark.deps.scope}</scope>
</dependency>

<!-- Image processing -->
@@ -161,7 +152,7 @@
<profile>
<id>spark-deploy</id>
<properties>
<spark.deps.scope>compile</spark.deps.scope>
<spark.deps.scope>provided</spark.deps.scope>
</properties>
</profile>
<profile>
SaveFeaturizedData.scala
@@ -5,7 +5,7 @@ import java.io.File
import com.cloudera.datascience.dl4j.cnn.Utils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext
import org.datavec.image.loader.NativeImageLoader
import org.deeplearning4j.nn.api.Layer
import org.deeplearning4j.nn.graph.ComputationGraph
@@ -63,12 +63,11 @@ object SaveFeaturizedData {
val savePath = param.savePath
val modelPath = param.modelPath

val sparkConf = new SparkConf().setAppName("Save output of convolutional layers").setMaster("local[*]")
val sc = new SparkContext(sparkConf)
sc.hadoopConfiguration.set("mapreduce.input.fileinputformat.input.dir.recursive", "true")
val spark = SparkSession.builder().master("local[*]").appName("Save output of convolutional layers").getOrCreate()
val logger = org.apache.log4j.LogManager.getLogger(this.getClass)
try {
val spark = SparkSession.builder().getOrCreate()
val sc = spark.sparkContext
sc.hadoopConfiguration.set("mapreduce.input.fileinputformat.input.dir.recursive", "true")
import spark.sqlContext.implicits._
val vgg16 = modelPath.map { path =>
val modelFile = new File(path)
@@ -98,7 +97,7 @@
df.write.parquet(s"$savePath$dir/")
}
} finally {
sc.stop()
spark.stop()
}
}

SaveFullOutput.scala
@@ -4,7 +4,6 @@ import java.io.File

import com.cloudera.datascience.dl4j.cnn.Utils
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
import org.deeplearning4j.nn.graph.ComputationGraph
import org.deeplearning4j.nn.transferlearning.{FineTuneConfiguration, TransferLearning, TransferLearningHelper}
import org.deeplearning4j.util.ModelSerializer
@@ -50,28 +49,30 @@ object SaveFullOutput {

def main(args: Array[String]): Unit = {
val param = Params.parseArgs(args)
val sparkConf = new SparkConf().setAppName("Featurize VGG dense layers")
val sc = new SparkContext(sparkConf)
val spark = SparkSession.builder().getOrCreate()
import spark.sqlContext.implicits._
val spark = SparkSession.builder().appName("Featurize VGG dense layers").getOrCreate()
try {
import spark.sqlContext.implicits._

val modelFile = new File(param.modelPath)
val vgg16 = ModelSerializer.restoreComputationGraph(modelFile)
val data = spark.read.parquet(param.dataPath)
.rdd
.map { case Row(f: Array[Byte], l: Array[Byte]) =>
new DataSet(Nd4j.fromByteArray(f), Nd4j.fromByteArray(l))
val modelFile = new File(param.modelPath)
val vgg16 = ModelSerializer.restoreComputationGraph(modelFile)
val data = spark.read.parquet(param.dataPath)
.rdd
.map { case Row(f: Array[Byte], l: Array[Byte]) =>
new DataSet(Nd4j.fromByteArray(f), Nd4j.fromByteArray(l))
}
val model = param.lastLayer match {
case "predictions" => convToPredictions(vgg16)
case "fc2" => convToFC2(vgg16)
}
val model = param.lastLayer match {
case "predictions" => convToPredictions(vgg16)
case "fc2" => convToFC2(vgg16)
}

val finalOutput = Utils.getPredictions(data, model, sc)
val df = finalOutput.map { ds =>
(Nd4j.toByteArray(ds.getFeatureMatrix), Nd4j.toByteArray(ds.getLabels))
}.toDF()
df.write.parquet(param.savePath)
val finalOutput = Utils.getPredictions(data, model, spark.sparkContext)
val df = finalOutput.map { ds =>
(Nd4j.toByteArray(ds.getFeatureMatrix), Nd4j.toByteArray(ds.getLabels))
}.toDF()
df.write.parquet(param.savePath)
} finally {
spark.stop()
}
}

private def convToFC2(vgg: ComputationGraph): ComputationGraph = {
ScoreSparkModel.scala
@@ -4,7 +4,6 @@ import java.io.File

import com.cloudera.datascience.dl4j.cnn.Utils
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
import org.deeplearning4j.util.ModelSerializer
import org.nd4j.linalg.dataset.DataSet
import org.nd4j.linalg.factory.Nd4j
@@ -32,11 +31,10 @@ object ScoreSparkModel {

def main(args: Array[String]): Unit = {
val param = Params.parseArgs(args)
val sparkConf = new SparkConf().setAppName("score a model")
val sc = new SparkContext(sparkConf)
sc.hadoopConfiguration.set("mapreduce.input.fileinputformat.input.dir.recursive", "true")
val spark = SparkSession.builder().appName("score a model").getOrCreate()
try {
val spark = SparkSession.builder().getOrCreate()
val sc = spark.sparkContext
sc.hadoopConfiguration.set("mapreduce.input.fileinputformat.input.dir.recursive", "true")
val restorePath = new File(param.modelPath)
val restored = ModelSerializer.restoreComputationGraph(restorePath)
val testRDD = spark.read.parquet(param.dataPath)
@@ -48,7 +46,7 @@
println(Utils.prettyPrintEvaluationStats(eval))

} finally {
sc.stop()
spark.stop()
}
}
}
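Across the three Scala sources above, the commit applies the same refactoring: build a single `SparkSession`, pull the `SparkContext` from it where an API still needs one, and stop the session in a `finally` block instead of stopping a hand-built `SparkContext`. A minimal sketch of that pattern, with illustrative names only:

````
import org.apache.spark.sql.SparkSession

object SparkSessionPatternSketch {
  def main(args: Array[String]): Unit = {
    // One SparkSession replaces the separate SparkConf/SparkContext setup.
    val spark = SparkSession.builder().appName("example app").getOrCreate()
    try {
      // The underlying SparkContext is still available where an API needs it.
      val sc = spark.sparkContext
      sc.hadoopConfiguration.set("mapreduce.input.fileinputformat.input.dir.recursive", "true")
      // ... read data and run the job here ...
    } finally {
      // Stopping the session also stops its SparkContext.
      spark.stop()
    }
  }
}
````

Stopping the session shuts down its underlying `SparkContext` as well, which is why the `finally` blocks in the diff now call `spark.stop()` rather than `sc.stop()`.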
