Commit b61242f ("ch 10 changes")
1 parent e3bd183

File tree: 5 files changed, +469 -0 lines changed
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
.idea
target
cookbook-app.iml
cookbookapp.iml
cookbook-app.iws
cookbook-app.ipr
workspace.xml
dependency-reduced-pom.xml
model.zip
LocalExecuteExample.csv
Lines changed: 166 additions & 0 deletions
@@ -0,0 +1,166 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.javadeeplearningcookbook.app</groupId>
    <artifactId>cookbookapp</artifactId>
    <version>1.0-SNAPSHOT</version>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.0</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <!--<plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.javadeeplearningcookbook.examples.CustomerRetentionPredictionExample</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
            </plugin>-->
            <!--<plugin>
                <artifactId>maven-jar-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>
                                com.javadeeplearningcookbook.examples.CustomerRetentionPredictionExample
                            </mainClass>
                        </manifest>
                    </archive>
                </configuration>
            </plugin>-->
            <!--<plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-dependency-plugin</artifactId>
                <version>3.1.1</version>
                <executions>
                    <execution>
                        <id>Analyze</id>
                        <goals>
                            <goal>analyze</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>-->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.2.0</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer"/>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>com.javacookbook.app.SparkExample</mainClass>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.3.3</version>
        </dependency>
        <dependency>
            <groupId>org.datavec</groupId>
            <artifactId>datavec-spark_2.11</artifactId>
            <version>1.0.0-beta4_spark_2</version>
        </dependency>
        <dependency>
            <groupId>org.deeplearning4j</groupId>
            <artifactId>dl4j-spark_2.11</artifactId>
            <version>1.0.0-beta4_spark_2</version>
        </dependency>
        <dependency>
            <groupId>org.deeplearning4j</groupId>
            <artifactId>dl4j-spark-parameterserver_2.11</artifactId>
            <version>1.0.0-beta4_spark_2</version>
        </dependency>
        <dependency>
            <groupId>org.nd4j</groupId>
            <artifactId>nd4j-native-platform</artifactId>
            <version>1.0.0-beta4</version>
        </dependency>
        <dependency>
            <groupId>org.datavec</groupId>
            <artifactId>datavec-api</artifactId>
            <version>1.0.0-beta4</version>
        </dependency>
        <dependency>
            <groupId>org.deeplearning4j</groupId>
            <artifactId>deeplearning4j-zoo</artifactId>
            <version>1.0.0-beta4</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-simple</artifactId>
            <version>1.8.0-beta4</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.8.0-beta4</version>
        </dependency>
        <dependency>
            <groupId>com.beust</groupId>
            <artifactId>jcommander</artifactId>
            <version>1.72</version>
        </dependency>
    </dependencies>
    <!-- Uncomment to use the snapshot version. The updatePolicy element is optional (update daily). -->
    <!--<repositories>
        <repository>
            <id>snapshots-repo</id>
            <url>https://oss.sonatype.org/content/repositories/snapshots</url>
            <releases>
                <enabled>false</enabled>
            </releases>
            <snapshots>
                <enabled>true</enabled>
                <updatePolicy>daily</updatePolicy>
            </snapshots>
        </repository>
    </repositories>-->
</project>
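
Since the shade plugin above is bound to the package phase, building the runnable fat JAR should need nothing more than the standard Maven invocation below; the ManifestResourceTransformer sets the manifest's main class to com.javacookbook.app.SparkExample.

mvn clean package

As a side effect, the shade plugin writes the dependency-reduced-pom.xml file that the first file in this commit adds to the ignore list.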
Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
package com.javacookbook.app;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import org.datavec.image.loader.NativeImageLoader;
import org.deeplearning4j.common.resources.DL4JResources;
import org.deeplearning4j.common.resources.ResourceType;
import org.deeplearning4j.datasets.fetchers.TinyImageNetFetcher;
import org.deeplearning4j.spark.util.SparkDataUtils;

import java.io.File;

/*
Use this source code to pre-process the data and save the batch files to your local disk.
You would then need to manually transfer them to HDFS.
*/
public class PreProcessLocal {

    // Replace this with a local directory of your choice.
    private String localSaveDir = "D:/Application/imagenet-preprocessed/";

    @Parameter(names = {"--batchSize"}, description = "Batch size for saving the data", required = false)
    private int batchSize = 32;

    public static void main(String[] args) throws Exception {
        new PreProcessLocal().entryPoint(args);
    }

    protected void entryPoint(String[] args) throws Exception {
        // Parse the command-line arguments so that --batchSize actually takes effect.
        JCommander jcmdr = new JCommander(this);
        jcmdr.parse(args);

        // Download and extract TinyImageNet into the local DL4J cache.
        TinyImageNetFetcher f = new TinyImageNetFetcher();
        f.downloadAndExtract();

        // Preprocess the training set into batch files.
        File baseDirTrain = DL4JResources.getDirectory(ResourceType.DATASET, f.localCacheName() + "/train");
        File saveDirTrain = new File(localSaveDir, "train");
        if (!saveDirTrain.exists()) {
            saveDirTrain.mkdirs();
        }
        SparkDataUtils.createFileBatchesLocal(baseDirTrain, NativeImageLoader.ALLOWED_FORMATS, true, saveDirTrain, batchSize);

        // Preprocess the test set into batch files.
        File baseDirTest = DL4JResources.getDirectory(ResourceType.DATASET, f.localCacheName() + "/test");
        File saveDirTest = new File(localSaveDir, "test");
        if (!saveDirTest.exists()) {
            saveDirTest.mkdirs();
        }
        SparkDataUtils.createFileBatchesLocal(baseDirTest, NativeImageLoader.ALLOWED_FORMATS, true, saveDirTest, batchSize);

        System.out.println("----- Data Preprocessing Complete -----");
    }
}
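
The header comment says the saved batch files must be transferred to HDFS by hand. Below is a minimal sketch of that transfer using Hadoop's FileSystem Java API. The helper class name (CopyBatchesToHdfs), the namenode URI, and the target path are assumptions (the URI and path are borrowed from the sample arguments in PreprocessSpark further down); the equivalent shell approach would be hdfs dfs -put.

package com.javacookbook.app;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.net.URI;

// Hypothetical helper: copies the locally preprocessed batch files to HDFS.
// The namenode URI and the source/target paths are assumptions; adjust them to your cluster.
public class CopyBatchesToHdfs {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(new URI("hdfs://localhost:9000"), conf);
        // copyFromLocalFile copies directories recursively (delSrc=false, overwrite=true).
        fs.copyFromLocalFile(false, true,
                new Path("D:/Application/imagenet-preprocessed/train"),
                new Path("/user/hadoop/batches/train"));
        fs.copyFromLocalFile(false, true,
                new Path("D:/Application/imagenet-preprocessed/test"),
                new Path("/user/hadoop/batches/test"));
        fs.close();
        System.out.println("Batch files copied to HDFS");
    }
}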
Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
package com.javacookbook.app;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.datavec.image.loader.NativeImageLoader;
import org.deeplearning4j.spark.util.SparkDataUtils;
import org.deeplearning4j.spark.util.SparkUtils;

/**
 * This file is for preparing the training data for the tiny imagenet CNN example.
 * Either this class OR PreProcessLocal (but not both) must be run before training can be run on a cluster via TrainSpark.
 *
 * PreprocessSpark requires that the tiny imagenet source image files (.jpg format) are available on network storage that
 * Spark can access, such as HDFS, Azure blob storage, or S3.
 *
 * To get these image files, you have two options:
 *
 * Option 1: Direct download (we followed this approach in this source)
 * Step 1: Download https://deeplearning4jblob.blob.core.windows.net/datasets/tinyimagenet_200_dl4j.v1.zip or http://cs231n.stanford.edu/tiny-imagenet-200.zip
 * Step 2: Extract the files locally
 * Step 3: Copy the contents (in their existing train/test subdirectories) to remote storage (for example, using Hadoop FS utils or similar)
 *
 * Option 2: Use TinyImageNetFetcher to download
 * Step 1: Run {@code new TinyImageNetFetcher().downloadAndExtract()} to download the files
 * Step 2: Copy the contents of the following directory to remote storage (for example, using Hadoop FS utils or similar)
 * Windows: C:\Users\<username>\.deeplearning4j\data\TINYIMAGENET_200
 * Linux: ~/.deeplearning4j/data/TINYIMAGENET_200
 *
 * After completing the steps of option 1 or option 2, run this script to preprocess the data.
 *
 * @author Alex Black
 */
public class PreprocessSpark {

    // Sample argument: --sourceDir="hdfs://localhost:9000/user/hadoop/tiny-imagenet-200/"
    @Parameter(names = {"--sourceDir"}, description = "Directory of the source image files", required = true)
    private String sourceDir = null;

    // Sample argument: --saveDir="hdfs://localhost:9000/user/hadoop/batches/"
    @Parameter(names = {"--saveDir"}, description = "Directory to save the preprocessed data files on remote storage (for example, HDFS)", required = true)
    private String saveDir = null;

    @Parameter(names = {"--batchSize"}, description = "Batch size for saving the data", required = false)
    private int batchSize = 32;

    public static void main(String[] args) throws Exception {
        new PreprocessSpark().entryPoint(args);
    }

    protected void entryPoint(String[] args) throws Exception {
        JCommander jcmdr = new JCommander(this);
        jcmdr.parse(args);

        SparkConf conf = new SparkConf();
        // Runs Spark in local mode; remove this line to let spark-submit control the master on a cluster.
        conf.setMaster("local[*]");
        conf.setAppName("DL4JTinyImageNetSparkPreproc");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Create the training set batches. Train and test batches are written to separate
        // subdirectories so they stay distinguishable, mirroring PreProcessLocal.
        JavaRDD<String> filePathsTrain = SparkUtils.listPaths(sc, sourceDir + "/train", true, NativeImageLoader.ALLOWED_FORMATS);
        SparkDataUtils.createFileBatchesSpark(filePathsTrain, saveDir + "/train", batchSize, sc);

        // Create the test set batches.
        JavaRDD<String> filePathsTest = SparkUtils.listPaths(sc, sourceDir + "/test", true, NativeImageLoader.ALLOWED_FORMATS);
        SparkDataUtils.createFileBatchesSpark(filePathsTest, saveDir + "/test", batchSize, sc);

        System.out.println("----- Data Preprocessing Complete -----");
    }
}
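
For a quick local test, the class can be driven directly from another main method with the sample arguments given in the comments above; on a real cluster you would instead submit the shaded JAR built by the POM with spark-submit. The driver class name below is hypothetical, and the HDFS paths are the sample values from the source, assuming a namenode at localhost:9000.

package com.javacookbook.app;

// Hypothetical driver showing how PreprocessSpark is invoked. The HDFS paths
// are the sample values from the comments above, not required locations.
public class PreprocessSparkDriver {
    public static void main(String[] args) throws Exception {
        PreprocessSpark.main(new String[]{
                "--sourceDir=hdfs://localhost:9000/user/hadoop/tiny-imagenet-200/",
                "--saveDir=hdfs://localhost:9000/user/hadoop/batches/"
        });
    }
}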
