
Commit 9fff7f6

update doc

1 parent d2c7916
File tree

pom.xml
src/main/resources/spark_tuning_test.md
src/test/scala/SparkTest.scala

3 files changed: +125 -0 lines changed


pom.xml

Lines changed: 7 additions & 0 deletions
@@ -102,6 +102,13 @@
       <version>${spark.version}</version>
     </dependency>
 
+    <dependency>
+      <groupId>com.holdenkarau</groupId>
+      <artifactId>spark-testing-base_${scala.tools.version}</artifactId>
+      <version>${spark.version}_0.10.0</version>
+      <scope>test</scope>
+    </dependency>
+
     <dependency>
       <groupId>com.fasterxml.jackson.core</groupId>
       <artifactId>jackson-core</artifactId>
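The added dependency follows spark-testing-base's `<sparkVersion>_<libraryVersion>` naming, so the resolved artifact version pairs the project's Spark version with release 0.10.0 of the library. For reference, a roughly equivalent sbt coordinate would be the following sketch; the concrete Spark version 2.2.0 is an assumption standing in for `spark.version` and is not taken from this repository:

```scala
// build.sbt sketch -- assumes spark.version = 2.2.0; adjust to the project's actual value
libraryDependencies += "com.holdenkarau" %% "spark-testing-base" % "2.2.0_0.10.0" % "test"
```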

src/main/resources/spark_tuning_test.md

Lines changed: 4 additions & 0 deletions
@@ -14,6 +14,8 @@
 
 #### narrow dependency and wide dependency
 - All byKey transformations are wide dependencies; they involve shuffling data.
+- A shuffle is similar to hashing keys into buckets.
+- The shuffle process produces intermediate temporary files on disk.
 
 #### Job, Stage, Task
 - Every action in a Spark program triggers the submission of a new job.
@@ -27,6 +29,7 @@
 - Master and Worker are nodes configured when the cluster starts; they are responsible for allocating resources and actually executing tasks.
 - Driver and Executor only exist while a Spark application is running. Each application has one Driver and multiple Executors. The Driver starts the SparkContext and requests resources from the Master, which launches Executors according to the state of the Worker nodes; each Executor is a separate process on a Worker node dedicated to running tasks.
 - Standalone mode starts one Executor on each Worker node by default; running on YARN makes it possible to use and monitor cluster resources more effectively, schedule tasks efficiently, and allocate the number of Executors on demand.
+- An Executor is a process started on a node (Worker) for a Spark application.
 - A single Worker node can run multiple Executors; there is no need to start multiple Worker instances on one node.
 
 > https://stackoverflow.com/questions/24696777/what-is-the-relationship-between-workers-worker-instances-and-executors
@@ -52,6 +55,7 @@
 - The shuffle implementation used before Spark 1.2; removed after Spark 2.0.
 - Each mapper iterates over all of its records and produces R files, one per reducer.
 - During the shuffle the cluster can generate up to M\*R files, which leads to poor file-system efficiency and heavy network traffic pressure.
+- `spark.shuffle.consolidateFiles=true` makes shuffle writes on the same executor share output files, so the number of reducers no longer produces a large number of temporary files.
 
 #### Sort Shuffle
 - The default shuffle implementation since Spark 1.2.
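To make the dependency and Job/Stage notes in the diff above concrete, here is a minimal sketch (not part of this commit): `map` is a narrow dependency, the byKey transformation `reduceByKey` is a wide dependency that shuffles data, and the `count` action submits a job whose stages are split at the shuffle boundary.

```scala
import org.apache.spark.{SparkConf, SparkContext}

// Minimal sketch: one wide dependency, so the job for count() runs as two stages.
object DependencyDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("dependency-demo").setMaster("local[2]"))
    val words  = sc.parallelize(Seq("hi", "pandas", "hi", "holden"))
    val pairs  = words.map(w => (w, 1))      // narrow dependency: no shuffle
    val counts = pairs.reduceByKey(_ + _)    // wide (byKey) dependency: shuffle boundary, new stage
    counts.count()                           // action: submits a new job
    sc.stop()
  }
}
```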
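The `spark.shuffle.consolidateFiles` bullet only applies to the legacy hash shuffle. A hedged configuration sketch follows; it assumes an older Spark build that still ships the hash shuffle manager, since both the "hash" shuffle and this consolidation flag are gone from current releases:

```scala
import org.apache.spark.SparkConf

// Sketch for an older Spark build with the legacy hash shuffle; not applicable to Spark 2.x+.
val conf = new SparkConf()
  .setAppName("hash-shuffle-consolidation")
  .set("spark.shuffle.manager", "hash")            // use the legacy hash shuffle
  .set("spark.shuffle.consolidateFiles", "true")   // share shuffle output files per executor instead of one file per mapper per reducer
```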

src/test/scala/SparkTest.scala

Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
import com.holdenkarau.spark.testing.{SharedSparkContext, StreamingSuiteBase}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.dstream.DStream
import org.scalactic.Equality
import org.scalatest.FunSuite

/**
  * Created by xk on 2018/9/10.
  */
// SharedSparkContext provides the shared SparkContext `sc` used by the tests below.
class SparkTest extends FunSuite with SharedSparkContext {

  test("test initializing spark context") {
    val list = List(1, 2, 3, 4)
    val rdd = sc.parallelize(list)

    assert(rdd.count === list.length)
  }

}

class SampleStreamingTest extends FunSuite with StreamingSuiteBase {

  test("simple two stream streaming test") {
    val input = List(List("hi", "pandas"), List("hi holden"), List("bye"))
    val input2 = List(List("hi"), List("pandas"), List("byes"))
    val expected = List(List("pandas"), List("hi holden"), List("bye"))
    testOperation[String, String, String](input, input2, subtract _, expected, ordered = false)
  }

  def subtract(d1: DStream[String], d2: DStream[String]): DStream[String] = {
    d1.transformWith(d2, SampleStreamingTest.subtractRDDs _)
  }

  test("really simple transformation") {
    val input = List(List("hi"), List("hi holden"), List("bye"))
    val expected = List(List("hi"), List("hi", "holden"), List("bye"))
    testOperation[String, String](input, tokenize _, expected, ordered = false)
  }

  // This is the sample operation we are testing
  def tokenize(f: DStream[String]): DStream[String] = {
    f.flatMap(_.split(" "))
  }

  test("CountByWindow with windowDuration 3s and slideDuration=2s") {
    // There should be 2 windows: {batch2, batch1}, {batch4, batch3, batch2}
    val batch1 = List("a", "b")
    val batch2 = List("d", "f", "a", "b")
    val batch3 = List("f", "g", "h")
    val batch4 = List("a")
    val input = List(batch1, batch2, batch3, batch4)
    val expected = List(List(6L), List(8L))
    val expected2 = List(List(2L), List(6L), List(7L), List(4L))

    def countByWindow(ds: DStream[String]): DStream[Long] = {
      ds.countByWindow(windowDuration = Seconds(3), slideDuration = Seconds(2))
    }

    def countByWindow2(ds: DStream[String]): DStream[Long] = {
      ds.countByWindow(windowDuration = Seconds(2), slideDuration = Seconds(1))
    }

    testOperation[String, Long](input, countByWindow _, expected, ordered = true)
    testOperation[String, Long](input, countByWindow2 _, expected2, ordered = true)
  }

  test("empty batch by using null") {
    def multiply(stream1: DStream[Int]) = stream1.map(_ * 3)

    val input1 = List(List(1), null, List(10))
    val output = List(List(3), List(30))

    testOperation(input1, multiply _, output, ordered = false)
  }

  test("custom equality object (Integer)") {
    val input = List(List(-1), List(-2, 3, -4), List(5, -6))
    val expected = List(List(1), List(2, 3, 4), List(5, 6))

    // Compare by absolute value so that -2 and 2 are treated as equal.
    implicit val integerCustomEquality: Equality[Int] =
      new Equality[Int] {
        override def areEqual(a: Int, b: Any): Boolean =
          b match {
            case n: Int => Math.abs(a) == Math.abs(n)
            case _ => false
          }
      }

    def doNothing(ds: DStream[Int]) = ds

    testOperation[Int, Int](input, doNothing _, expected, ordered = false)
    testOperation[Int, Int](input, doNothing _, expected, ordered = true)
  }

  // Give the longer streaming tests more time to complete.
  override def maxWaitTimeMillis: Int = 20000

  test("increase duration more than 10 seconds") {
    val input = (1 to 1000).toList.map(x => List(x))
    val expectedOutput = (1 to 1000).toList.map(x => List(2 * x))

    def multiply(ds: DStream[Int]) = ds.map(_ * 2)

    testOperation[Int, Int](input, multiply _, expectedOutput, ordered = true)
  }

}

object SampleStreamingTest {
  def subtractRDDs(r1: RDD[String], r2: RDD[String]): RDD[String] = {
    r1.subtract(r2)
  }
}
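The RDD test in this file only asserts on the element count. As a possible extension, and assuming this spark-testing-base release exposes the `RDDComparisons` helper with a `compare` method (API names can differ between versions), a content-level comparison could look like this sketch:

```scala
import com.holdenkarau.spark.testing.{RDDComparisons, SharedSparkContext}
import org.scalatest.FunSuite

// Hedged sketch, not part of this commit: compare full RDD contents, not just counts.
class RDDContentTest extends FunSuite with SharedSparkContext {
  test("doubling every element") {
    val expected = sc.parallelize(Seq(2, 4, 6, 8))
    val result   = sc.parallelize(Seq(1, 2, 3, 4)).map(_ * 2)
    // None means no differing element was found between the two RDDs.
    assert(None === RDDComparisons.compare(expected, result))
  }
}
```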
