Skip to content

Commit 42f6631

Browse files
[O11y][Apache Spark] Add dimension mapping for Executor datastream (elastic#7993)
* Add dimension mapping for Executor datastream
* Update changelog.yml
* Update pipeline
* Run elastic-package check
1 parent 620f766 commit 42f6631

File tree

7 files changed

+88
-47
lines changed

7 files changed

+88
-47
lines changed

packages/apache_spark/changelog.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
11
# newer versions go on top
2+
- version: "0.7.4"
3+
changes:
4+
- description: Added dimension mapping for Executor datastream.
5+
type: enhancement
6+
link: https://github.com/elastic/integrations/pull/7993
27
- version: "0.7.3"
38
changes:
49
- description: Add metric_type mapping for driver datastream.

packages/apache_spark/data_stream/executor/elasticsearch/ingest_pipeline/default.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,12 @@ processors:
3333
ctx.apache_spark.executor.application_name = app_name;
3434
ctx.apache_spark.executor.id = executor_id;
3535
}
36+
- rename:
37+
field: apache_spark.mbean
38+
target_field: apache_spark.executor.mbean
39+
ignore_missing: true
3640
- remove:
37-
field:
38-
- apache_spark.mbean
41+
field:
3942
- jolokia
4043
ignore_failure: true
4144
on_failure:

packages/apache_spark/data_stream/executor/fields/ecs.yml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,24 @@
1+
- external: ecs
2+
name: agent.id
3+
dimension: true
4+
- external: ecs
5+
name: cloud.account.id
6+
dimension: true
7+
- external: ecs
8+
name: cloud.availability_zone
9+
dimension: true
10+
- external: ecs
11+
name: cloud.instance.id
12+
dimension: true
13+
- external: ecs
14+
name: cloud.provider
15+
dimension: true
16+
- external: ecs
17+
name: cloud.region
18+
dimension: true
19+
- external: ecs
20+
name: container.id
21+
dimension: true
122
- external: ecs
223
name: ecs.version
324
- external: ecs
@@ -12,8 +33,12 @@
1233
name: event.type
1334
- external: ecs
1435
name: host.ip
36+
- external: ecs
37+
name: host.name
38+
dimension: true
1539
- external: ecs
1640
name: service.address
41+
dimension: true
1742
- external: ecs
1843
name: service.type
1944
- external: ecs

packages/apache_spark/data_stream/executor/fields/fields.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,11 @@
77
- name: application_name
88
type: keyword
99
description: Name of application.
10+
dimension: true
1011
- name: id
1112
type: keyword
1213
description: ID of executor.
14+
dimension: true
1315
- name: bytes
1416
type: group
1517
fields:
@@ -19,6 +21,11 @@
1921
- name: written
2022
type: long
2123
description: Total number of bytes written.
24+
- name: mbean
25+
type: keyword
26+
description: The name of the jolokia mbean.
27+
# Reason to add as a dimension field: There can be many jolokia mbeans.
28+
dimension: true
2229
- name: memory
2330
type: group
2431
fields:

packages/apache_spark/data_stream/executor/sample_event.json

Lines changed: 18 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,20 @@
11
{
2-
"@timestamp": "2022-04-11T08:29:56.056Z",
2+
"@timestamp": "2023-09-28T09:26:45.771Z",
33
"agent": {
4-
"ephemeral_id": "c7d892ac-3b23-471c-80e4-041490eaab8d",
5-
"id": "c5e2a51e-e10a-4561-9861-75b38aa09f4b",
4+
"ephemeral_id": "3a3db920-eb4b-4045-b351-33526910ae8a",
5+
"id": "a6bdbb4a-4bac-4243-83cb-dba157f24987",
66
"name": "docker-fleet-agent",
77
"type": "metricbeat",
8-
"version": "8.1.0"
8+
"version": "8.8.0"
99
},
1010
"apache_spark": {
1111
"executor": {
12-
"application_name": "app-20220411082945-0000",
13-
"gc": {
14-
"major": {
15-
"count": 0
16-
}
12+
"application_name": "app-20230928092630-0000",
13+
"id": "0",
14+
"jvm": {
15+
"cpu_time": 20010000000
1716
},
18-
"id": "0"
17+
"mbean": "metrics:name=app-20230928092630-0000.0.JVMCPU.jvmCpuTime,type=gauges"
1918
}
2019
},
2120
"data_stream": {
@@ -27,15 +26,15 @@
2726
"version": "8.5.1"
2827
},
2928
"elastic_agent": {
30-
"id": "c5e2a51e-e10a-4561-9861-75b38aa09f4b",
29+
"id": "a6bdbb4a-4bac-4243-83cb-dba157f24987",
3130
"snapshot": false,
32-
"version": "8.1.0"
31+
"version": "8.8.0"
3332
},
3433
"event": {
3534
"agent_id_status": "verified",
3635
"dataset": "apache_spark.executor",
37-
"duration": 32964497,
38-
"ingested": "2022-04-11T08:29:59Z",
36+
"duration": 2849184715,
37+
"ingested": "2023-09-28T09:26:49Z",
3938
"kind": "metric",
4039
"module": "apache_spark",
4140
"type": "info"
@@ -44,21 +43,18 @@
4443
"architecture": "x86_64",
4544
"containerized": true,
4645
"hostname": "docker-fleet-agent",
47-
"ip": [
48-
"172.23.0.7"
49-
],
50-
"mac": [
51-
"02:42:ac:17:00:07"
52-
],
46+
"id": "e8978f2086c14e13b7a0af9ed0011d19",
47+
"ip": "172.20.0.7",
48+
"mac": "02-42-AC-14-00-07",
5349
"name": "docker-fleet-agent",
5450
"os": {
5551
"codename": "focal",
5652
"family": "debian",
57-
"kernel": "5.4.0-107-generic",
53+
"kernel": "3.10.0-1160.90.1.el7.x86_64",
5854
"name": "Ubuntu",
5955
"platform": "ubuntu",
6056
"type": "linux",
61-
"version": "20.04.3 LTS (Focal Fossa)"
57+
"version": "20.04.6 LTS (Focal Fossa)"
6258
}
6359
},
6460
"metricset": {

packages/apache_spark/docs/README.md

Lines changed: 27 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -362,23 +362,22 @@ An example event for `executor` looks as following:
362362

363363
```json
364364
{
365-
"@timestamp": "2022-04-11T08:29:56.056Z",
365+
"@timestamp": "2023-09-28T09:26:45.771Z",
366366
"agent": {
367-
"ephemeral_id": "c7d892ac-3b23-471c-80e4-041490eaab8d",
368-
"id": "c5e2a51e-e10a-4561-9861-75b38aa09f4b",
367+
"ephemeral_id": "3a3db920-eb4b-4045-b351-33526910ae8a",
368+
"id": "a6bdbb4a-4bac-4243-83cb-dba157f24987",
369369
"name": "docker-fleet-agent",
370370
"type": "metricbeat",
371-
"version": "8.1.0"
371+
"version": "8.8.0"
372372
},
373373
"apache_spark": {
374374
"executor": {
375-
"application_name": "app-20220411082945-0000",
376-
"gc": {
377-
"major": {
378-
"count": 0
379-
}
375+
"application_name": "app-20230928092630-0000",
376+
"id": "0",
377+
"jvm": {
378+
"cpu_time": 20010000000
380379
},
381-
"id": "0"
380+
"mbean": "metrics:name=app-20230928092630-0000.0.JVMCPU.jvmCpuTime,type=gauges"
382381
}
383382
},
384383
"data_stream": {
@@ -390,15 +389,15 @@ An example event for `executor` looks as following:
390389
"version": "8.5.1"
391390
},
392391
"elastic_agent": {
393-
"id": "c5e2a51e-e10a-4561-9861-75b38aa09f4b",
392+
"id": "a6bdbb4a-4bac-4243-83cb-dba157f24987",
394393
"snapshot": false,
395-
"version": "8.1.0"
394+
"version": "8.8.0"
396395
},
397396
"event": {
398397
"agent_id_status": "verified",
399398
"dataset": "apache_spark.executor",
400-
"duration": 32964497,
401-
"ingested": "2022-04-11T08:29:59Z",
399+
"duration": 2849184715,
400+
"ingested": "2023-09-28T09:26:49Z",
402401
"kind": "metric",
403402
"module": "apache_spark",
404403
"type": "info"
@@ -407,21 +406,18 @@ An example event for `executor` looks as following:
407406
"architecture": "x86_64",
408407
"containerized": true,
409408
"hostname": "docker-fleet-agent",
410-
"ip": [
411-
"172.23.0.7"
412-
],
413-
"mac": [
414-
"02:42:ac:17:00:07"
415-
],
409+
"id": "e8978f2086c14e13b7a0af9ed0011d19",
410+
"ip": "172.20.0.7",
411+
"mac": "02-42-AC-14-00-07",
416412
"name": "docker-fleet-agent",
417413
"os": {
418414
"codename": "focal",
419415
"family": "debian",
420-
"kernel": "5.4.0-107-generic",
416+
"kernel": "3.10.0-1160.90.1.el7.x86_64",
421417
"name": "Ubuntu",
422418
"platform": "ubuntu",
423419
"type": "linux",
424-
"version": "20.04.3 LTS (Focal Fossa)"
420+
"version": "20.04.6 LTS (Focal Fossa)"
425421
}
426422
},
427423
"metricset": {
@@ -440,6 +436,7 @@ An example event for `executor` looks as following:
440436
| Field | Description | Type |
441437
|---|---|---|
442438
| @timestamp | Event timestamp. | date |
439+
| agent.id | Unique identifier of this agent (if one exists). Example: For Beats this would be beat.id. | keyword |
443440
| apache_spark.executor.application_name | Name of application. | keyword |
444441
| apache_spark.executor.bytes.read | Total number of bytes read. | long |
445442
| apache_spark.executor.bytes.written | Total number of bytes written. | long |
@@ -470,6 +467,7 @@ An example event for `executor` looks as following:
470467
| apache_spark.executor.id | ID of executor. | keyword |
471468
| apache_spark.executor.jvm.cpu_time | Elapsed CPU time the JVM spent. | long |
472469
| apache_spark.executor.jvm.gc_time | Elapsed time the JVM spent in garbage collection while executing this task. | long |
470+
| apache_spark.executor.mbean | The name of the jolokia mbean. | keyword |
473471
| apache_spark.executor.memory.direct_pool | Peak memory that the JVM is using for direct buffer pool. | long |
474472
| apache_spark.executor.memory.jvm.heap | Peak memory usage of the heap that is used for object allocation. | long |
475473
| apache_spark.executor.memory.jvm.off_heap | Peak memory usage of non-heap memory that is used by the Java virtual machine. | long |
@@ -509,6 +507,12 @@ An example event for `executor` looks as following:
509507
| apache_spark.executor.threadpool.current_pool_size | The size of the current thread pool of the executor. | long |
510508
| apache_spark.executor.threadpool.max_pool_size | The maximum size of the thread pool of the executor. | long |
511509
| apache_spark.executor.threadpool.started_tasks | The number of tasks started in the thread pool of the executor. | long |
510+
| cloud.account.id | The cloud account or organization id used to identify different entities in a multi-tenant environment. Examples: AWS account id, Google Cloud ORG Id, or other unique identifier. | keyword |
511+
| cloud.availability_zone | Availability zone in which this host, resource, or service is located. | keyword |
512+
| cloud.instance.id | Instance ID of the host machine. | keyword |
513+
| cloud.provider | Name of the cloud provider. Example values are aws, azure, gcp, or digitalocean. | keyword |
514+
| cloud.region | Region in which this host, resource, or service is located. | keyword |
515+
| container.id | Unique container id. | keyword |
512516
| data_stream.dataset | Data stream dataset. | constant_keyword |
513517
| data_stream.namespace | Data stream namespace. | constant_keyword |
514518
| data_stream.type | Data stream type. | constant_keyword |
@@ -519,6 +523,7 @@ An example event for `executor` looks as following:
519523
| event.module | Name of the module this data is coming from. If your monitoring agent supports the concept of modules or plugins to process events of a given source (e.g. Apache logs), `event.module` should contain the name of this module. | keyword |
520524
| event.type | This is one of four ECS Categorization Fields, and indicates the third level in the ECS category hierarchy. `event.type` represents a categorization "sub-bucket" that, when used along with the `event.category` field values, enables filtering events down to a level appropriate for single visualization. This field is an array. This will allow proper categorization of some events that fall in multiple event types. | keyword |
521525
| host.ip | Host ip addresses. | ip |
526+
| host.name | Name of the host. It can contain what `hostname` returns on Unix systems, the fully qualified domain name, or a name specified by the user. The sender decides which value to use. | keyword |
522527
| service.address | Address where data about this service was collected from. This should be a URI, network address (ipv4:port or [ipv6]:port) or a resource path (sockets). | keyword |
523528
| service.type | The type of the service data is collected from. The type can be used to group and correlate logs and metrics from one service type. Example: If logs or metrics are collected from Elasticsearch, `service.type` would be `elasticsearch`. | keyword |
524529
| tags | List of keywords used to tag each event. | keyword |

packages/apache_spark/manifest.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
format_version: 1.0.0
22
name: apache_spark
33
title: Apache Spark
4-
version: "0.7.3"
4+
version: "0.7.4"
55
license: basic
66
description: Collect metrics from Apache Spark with Elastic Agent.
77
type: integration

0 commit comments

Comments (0)