
Commit 9fff7f6

update doc

1 parent d2c7916
File tree

pom.xml
src/main/resources/spark_tuning_test.md
src/test/scala/SparkTest.scala

3 files changed: +125 -0 lines changed


pom.xml

Lines changed: 7 additions & 0 deletions
@@ -102,6 +102,13 @@
       <version>${spark.version}</version>
     </dependency>
 
+    <dependency>
+      <groupId>com.holdenkarau</groupId>
+      <artifactId>spark-testing-base_${scala.tools.version}</artifactId>
+      <version>${spark.version}_0.10.0</version>
+      <scope>test</scope>
+    </dependency>
+
     <dependency>
       <groupId>com.fasterxml.jackson.core</groupId>
       <artifactId>jackson-core</artifactId>
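The added dependency follows spark-testing-base's `<sparkVersion>_<libraryVersion>` naming, so the resolved artifact version pairs the project's Spark version with release 0.10.0 of the library. For reference, a roughly equivalent sbt coordinate would be the following sketch; the concrete Spark version 2.2.0 is an assumption standing in for `spark.version` and is not taken from this repository:

```scala
// build.sbt sketch -- assumes spark.version = 2.2.0; adjust to the project's actual value
libraryDependencies += "com.holdenkarau" %% "spark-testing-base" % "2.2.0_0.10.0" % "test"
```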

src/main/resources/spark_tuning_test.md

Lines changed: 4 additions & 0 deletions
@@ -14,6 +14,8 @@
 
 #### narrow dependency and wide dependency
 - All byKey transformations are wide dependencies; they involve shuffling data.
+- A shuffle is similar to hashing keys into buckets.
+- The shuffle process produces intermediate temporary files on disk.
 
 #### Job, Stage, Task
 - Every action in a Spark program triggers the submission of a new job.
@@ -27,6 +29,7 @@
 - Master and Worker are nodes configured when the cluster starts; they are responsible for allocating resources and actually executing tasks.
 - Driver and Executor only exist while a Spark application is running. Each application has one Driver and multiple Executors. The Driver starts the SparkContext and requests resources from the Master, which launches Executors according to the state of the Worker nodes; each Executor is a separate process on a Worker node dedicated to running tasks.
 - Standalone mode starts one Executor on each Worker node by default; running on YARN makes it possible to use and monitor cluster resources more effectively, schedule tasks efficiently, and allocate the number of Executors on demand.
+- An Executor is a process started on a node (Worker) for a Spark application.
 - A single Worker node can run multiple Executors; there is no need to start multiple Worker instances on one node.
 
 > https://stackoverflow.com/questions/24696777/what-is-the-relationship-between-workers-worker-instances-and-executors
@@ -52,6 +55,7 @@
 - The shuffle implementation used before Spark 1.2; removed after Spark 2.0.
 - Each mapper iterates over all of its records and produces R files, one per reducer.
 - During the shuffle the cluster can generate up to M\*R files, which leads to poor file-system efficiency and heavy network traffic pressure.
+- `spark.shuffle.consolidateFiles=true` makes shuffle writes on the same executor share output files, so the number of reducers no longer produces a large number of temporary files.
 
 #### Sort Shuffle
 - The default shuffle implementation since Spark 1.2.
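To make the dependency and Job/Stage notes in the diff above concrete, here is a minimal sketch (not part of this commit): `map` is a narrow dependency, the byKey transformation `reduceByKey` is a wide dependency that shuffles data, and the `count` action submits a job whose stages are split at the shuffle boundary.

```scala
import org.apache.spark.{SparkConf, SparkContext}

// Minimal sketch: one wide dependency, so the job for count() runs as two stages.
object DependencyDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("dependency-demo").setMaster("local[2]"))
    val words  = sc.parallelize(Seq("hi", "pandas", "hi", "holden"))
    val pairs  = words.map(w => (w, 1))      // narrow dependency: no shuffle
    val counts = pairs.reduceByKey(_ + _)    // wide (byKey) dependency: shuffle boundary, new stage
    counts.count()                           // action: submits a new job
    sc.stop()
  }
}
```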
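The `spark.shuffle.consolidateFiles` bullet only applies to the legacy hash shuffle. A hedged configuration sketch follows; it assumes an older Spark build that still ships the hash shuffle manager, since both the "hash" shuffle and this consolidation flag are gone from current releases:

```scala
import org.apache.spark.SparkConf

// Sketch for an older Spark build with the legacy hash shuffle; not applicable to Spark 2.x+.
val conf = new SparkConf()
  .setAppName("hash-shuffle-consolidation")
  .set("spark.shuffle.manager", "hash")            // use the legacy hash shuffle
  .set("spark.shuffle.consolidateFiles", "true")   // share shuffle output files per executor instead of one file per mapper per reducer
```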

src/test/scala/SparkTest.scala

Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
import com.holdenkarau.spark.testing.{SharedSparkContext, StreamingSuiteBase}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.dstream.DStream
import org.scalactic.Equality
import org.scalatest.FunSuite

/**
  * Created by xk on 2018/9/10.
  */
// SharedSparkContext provides the shared SparkContext `sc` used by the tests below.
class SparkTest extends FunSuite with SharedSparkContext {

  test("test initializing spark context") {
    val list = List(1, 2, 3, 4)
    val rdd = sc.parallelize(list)

    assert(rdd.count === list.length)
  }

}

class SampleStreamingTest extends FunSuite with StreamingSuiteBase {

  test("simple two stream streaming test") {
    val input = List(List("hi", "pandas"), List("hi holden"), List("bye"))
    val input2 = List(List("hi"), List("pandas"), List("byes"))
    val expected = List(List("pandas"), List("hi holden"), List("bye"))
    testOperation[String, String, String](input, input2, subtract _, expected, ordered = false)
  }

  def subtract(d1: DStream[String], d2: DStream[String]): DStream[String] = {
    d1.transformWith(d2, SampleStreamingTest.subtractRDDs _)
  }

  test("really simple transformation") {
    val input = List(List("hi"), List("hi holden"), List("bye"))
    val expected = List(List("hi"), List("hi", "holden"), List("bye"))
    testOperation[String, String](input, tokenize _, expected, ordered = false)
  }

  // This is the sample operation we are testing
  def tokenize(f: DStream[String]): DStream[String] = {
    f.flatMap(_.split(" "))
  }

  test("CountByWindow with windowDuration 3s and slideDuration=2s") {
    // There should be 2 windows: {batch2, batch1}, {batch4, batch3, batch2}
    val batch1 = List("a", "b")
    val batch2 = List("d", "f", "a", "b")
    val batch3 = List("f", "g", "h")
    val batch4 = List("a")
    val input = List(batch1, batch2, batch3, batch4)
    val expected = List(List(6L), List(8L))
    val expected2 = List(List(2L), List(6L), List(7L), List(4L))

    def countByWindow(ds: DStream[String]): DStream[Long] = {
      ds.countByWindow(windowDuration = Seconds(3), slideDuration = Seconds(2))
    }

    def countByWindow2(ds: DStream[String]): DStream[Long] = {
      ds.countByWindow(windowDuration = Seconds(2), slideDuration = Seconds(1))
    }

    testOperation[String, Long](input, countByWindow _, expected, ordered = true)
    testOperation[String, Long](input, countByWindow2 _, expected2, ordered = true)
  }

  test("empty batch by using null") {
    def multiply(stream1: DStream[Int]) = stream1.map(_ * 3)

    val input1 = List(List(1), null, List(10))
    val output = List(List(3), List(30))

    testOperation(input1, multiply _, output, ordered = false)
  }

  test("custom equality object (Integer)") {
    val input = List(List(-1), List(-2, 3, -4), List(5, -6))
    val expected = List(List(1), List(2, 3, 4), List(5, 6))

    // Compare by absolute value so that -2 and 2 are treated as equal.
    implicit val integerCustomEquality: Equality[Int] =
      new Equality[Int] {
        override def areEqual(a: Int, b: Any): Boolean =
          b match {
            case n: Int => Math.abs(a) == Math.abs(n)
            case _ => false
          }
      }

    def doNothing(ds: DStream[Int]) = ds

    testOperation[Int, Int](input, doNothing _, expected, ordered = false)
    testOperation[Int, Int](input, doNothing _, expected, ordered = true)
  }

  // Give the longer streaming tests more time to complete.
  override def maxWaitTimeMillis: Int = 20000

  test("increase duration more than 10 seconds") {
    val input = (1 to 1000).toList.map(x => List(x))
    val expectedOutput = (1 to 1000).toList.map(x => List(2 * x))

    def multiply(ds: DStream[Int]) = ds.map(_ * 2)

    testOperation[Int, Int](input, multiply _, expectedOutput, ordered = true)
  }

}

object SampleStreamingTest {
  def subtractRDDs(r1: RDD[String], r2: RDD[String]): RDD[String] = {
    r1.subtract(r2)
  }
}
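The RDD test in this file only asserts on the element count. As a possible extension, and assuming this spark-testing-base release exposes the `RDDComparisons` helper with a `compare` method (API names can differ between versions), a content-level comparison could look like this sketch:

```scala
import com.holdenkarau.spark.testing.{RDDComparisons, SharedSparkContext}
import org.scalatest.FunSuite

// Hedged sketch, not part of this commit: compare full RDD contents, not just counts.
class RDDContentTest extends FunSuite with SharedSparkContext {
  test("doubling every element") {
    val expected = sc.parallelize(Seq(2, 4, 6, 8))
    val result   = sc.parallelize(Seq(1, 2, 3, 4)).map(_ * 2)
    // None means no differing element was found between the two RDDs.
    assert(None === RDDComparisons.compare(expected, result))
  }
}
```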
