Skip to content

Commit f24191c

Browse files
Pothulapati authored and roboquat committed
Move rollout and analysis into separate packages
This also includes:
- make actions using RolloutAction Interface
- abstract out the analysis logic into a separate package
- bugfix: don't close the channel
- working prototype with metric analysis
- logs refactor

Signed-off-by: Tarun Pothulapati <tarun@gitpod.io>
1 parent 13f6345 commit f24191c

File tree

9 files changed

+327
-144
lines changed

9 files changed

+327
-144
lines changed

components/workspace-rollout-job/cmd/root.go

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,13 @@
55
package cmd
66

77
import (
8+
"context"
89
"fmt"
910
"os"
11+
"time"
1012

1113
"github.com/gitpod-io/gitpod/common-go/log"
14+
"github.com/gitpod-io/gitpod/workspace-rollout-job/pkg/analysis"
1215
"github.com/gitpod-io/gitpod/workspace-rollout-job/pkg/rollout"
1316
"github.com/gitpod-io/gitpod/workspace-rollout-job/pkg/wsbridge"
1417
"github.com/spf13/cobra"
@@ -19,6 +22,7 @@ var rootCmd = &cobra.Command{
1922
Short: "Rollout from old to a new cluster while monitoring metrics",
2023
Run: func(cmd *cobra.Command, args []string) {
2124
log.Info("Starting workspace-rollout-job")
25+
ctx := context.Background()
2226
var err error
2327
old, presence := os.LookupEnv("OLD_CLUSTER")
2428
if !presence {
@@ -30,20 +34,22 @@ var rootCmd = &cobra.Command{
3034
log.WithError(err).Fatal("cannot get new cluster")
3135
}
3236

37+
wsManagerBridgeClient := wsbridge.NewWsManagerBridgeClient("localhost:8080")
3338
// Check if the old cluster has a 100 score.
34-
if err = wsbridge.CheckScore(old, 100); err != nil {
39+
if score, err := wsManagerBridgeClient.GetScore(ctx, old); err != nil || score != 100 {
3540
log.WithError(err).Fatal("init condition does not satisfy")
3641
}
3742

3843
// Check if the new cluster has a zero score.
3944
// TODO: Check if the new cluster has no constraints.
40-
if err = wsbridge.CheckScore(new, 0); err != nil {
45+
if score, err := wsManagerBridgeClient.GetScore(ctx, new); err != nil || score != 0 {
4146
log.WithError(err).Fatal("init condition does not satisfy")
4247
}
4348

44-
// Start the rollout process.
45-
job := rollout.New(old, new, "http://prometheus:9090")
46-
job.Start()
49+
// Start the rollout process
50+
prometheusAnalyzer := analysis.NewPrometheusAnalyzer("http://localhost:9090")
51+
job := rollout.New(old, new, 20*time.Second, 1*time.Second, 10, prometheusAnalyzer, wsManagerBridgeClient)
52+
job.Start(ctx)
4753
},
4854
}
4955

components/workspace-rollout-job/go.mod

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,24 +6,28 @@ require (
66
github.com/gitpod-io/gitpod/common-go v0.0.0-00010101000000-000000000000
77
github.com/gitpod-io/gitpod/ws-manager-bridge/api v0.0.0-00010101000000-000000000000
88
github.com/prometheus/client_golang v1.13.0
9+
github.com/prometheus/common v0.37.0
910
github.com/spf13/cobra v1.4.0
11+
github.com/stretchr/testify v1.7.0
1012
google.golang.org/grpc v1.49.0
1113
)
1214

1315
require (
16+
github.com/davecgh/go-spew v1.1.1 // indirect
1417
github.com/golang/protobuf v1.5.2 // indirect
1518
github.com/inconshreveable/mousetrap v1.0.0 // indirect
1619
github.com/json-iterator/go v1.1.12 // indirect
1720
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
1821
github.com/modern-go/reflect2 v1.0.2 // indirect
19-
github.com/prometheus/common v0.37.0 // indirect
22+
github.com/pmezard/go-difflib v1.0.0 // indirect
2023
github.com/sirupsen/logrus v1.8.1 // indirect
2124
github.com/spf13/pflag v1.0.5 // indirect
2225
golang.org/x/net v0.0.0-20220225172249-27dd8689420f // indirect
2326
golang.org/x/sys v0.0.0-20220825204002-c680a09ffe64 // indirect
2427
golang.org/x/text v0.3.7 // indirect
2528
google.golang.org/genproto v0.0.0-20201019141844-1ed22bb0c154 // indirect
2629
google.golang.org/protobuf v1.28.1 // indirect
30+
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect
2731
)
2832

2933
replace github.com/gitpod-io/gitpod/ws-manager-bridge/api => ../ws-manager-bridge-api/go // leeway

components/workspace-rollout-job/go.sum

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
// Copyright (c) 2023 Gitpod GmbH. All rights reserved.
2+
// Licensed under the GNU Affero General Public License (AGPL).
3+
// See License.AGPL.txt in the project root for license information.
4+
5+
package analysis
6+
7+
import (
8+
"context"
9+
)
10+
11+
// Analyzer is the decision hook the rollout routine consults between steps.
type Analyzer interface {
12+
// MoveForward is called repeatedly by the rollout routine with a cluster
13+
// name to determine whether to move forward on the rollout or not.
// It returns true to proceed, false to hold, and a non-nil error when the
// decision could not be made.
14+
MoveForward(ctx context.Context, clusterName string) (bool, error)
15+
}
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
// Copyright (c) 2023 Gitpod GmbH. All rights reserved.
2+
// Licensed under the GNU Affero General Public License (AGPL).
3+
// See License.AGPL.txt in the project root for license information.
4+
5+
package analysis
6+
7+
import (
8+
"context"
9+
"fmt"
10+
"math"
11+
"time"
12+
13+
logrus "github.com/gitpod-io/gitpod/common-go/log"
14+
"github.com/prometheus/client_golang/api"
15+
v1 "github.com/prometheus/client_golang/api/prometheus/v1"
16+
"github.com/prometheus/common/model"
17+
)
18+
19+
const (
20+
// errorMetric is a PromQL format string for the rate of increase in the
// workspace start failure counter, summed per cluster label.
// The %s verb is placed inside a regex matcher followed by ".*", so it
// selects every cluster whose label begins with the supplied value.
// The %d verb is the rate window, expressed in milliseconds.
21+
errorMetric = "sum by (cluster) (rate(gitpod_ws_manager_workspace_starts_failure_total{cluster=~\"%s.*\"}[%dms]))"
22+
)
23+
24+
// PrometheusAnalyzer decides whether a rollout may proceed by querying a
// Prometheus server for the workspace start failure rate.
type PrometheusAnalyzer struct {
25+
// prometheusURL is the address handed to the Prometheus API client.
prometheusURL string
26+
// startTime is when the analyzer was created; the time elapsed since then
// is used as the rate window of the query.
startTime time.Time
27+
}
28+
29+
// NewPrometheusAnalyzer returns a PrometheusAnalyzer that queries the
// Prometheus server at promURL. The analyzer's start time is set to the
// moment of construction.
func NewPrometheusAnalyzer(promURL string) *PrometheusAnalyzer {
30+
return &PrometheusAnalyzer{
31+
prometheusURL: promURL,
32+
startTime: time.Now(),
33+
}
34+
}
35+
36+
func (pa *PrometheusAnalyzer) MoveForward(ctx context.Context, clusterName string) (bool, error) {
37+
log := logrus.WithField("component", "prometheus-analyzer")
38+
client, err := api.NewClient(api.Config{
39+
Address: pa.prometheusURL,
40+
})
41+
if err != nil {
42+
return false, err
43+
}
44+
45+
v1api := v1.NewAPI(client)
46+
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
47+
defer cancel()
48+
49+
queryResult, warnings, err := v1api.Query(ctx, fmt.Sprintf(errorMetric, clusterName, time.Since(pa.startTime).Milliseconds()), time.Now())
50+
if err != nil {
51+
return false, err
52+
}
53+
if len(warnings) > 0 {
54+
log.Warnf("Warnings: %v\n", warnings)
55+
}
56+
57+
result, ok := queryResult.(model.Vector)
58+
if !ok {
59+
return false, fmt.Errorf("unexpected result type: %T", queryResult)
60+
}
61+
62+
if len(result) != 1 {
63+
if len(result) == 0 {
64+
log.Infof("No error data found for %s. Proceeding", clusterName)
65+
return true, nil
66+
}
67+
return false, fmt.Errorf("unexpected result Prometheus result vector length: %d", len(result))
68+
}
69+
val := float64(result[0].Value)
70+
if math.IsNaN(val) {
71+
return false, fmt.Errorf("unexpected sample value: %v", result[0].Value)
72+
}
73+
74+
// Return true if the error rate is 0
75+
if val > 0 {
76+
log.Infof("Found error metric rate as %f ", val)
77+
return false, nil
78+
}
79+
80+
return true, nil
81+
}

components/workspace-rollout-job/pkg/prometheus/prometheus.go

Lines changed: 0 additions & 55 deletions
This file was deleted.

0 commit comments

Comments
 (0)