Skip to content

Commit 95dd05c

Browse files
committed
Resilience improvements
1 parent 448f340 commit 95dd05c

File tree

6 files changed

+105
-24
lines changed

6 files changed

+105
-24
lines changed

pkg/deployment/deployment_inspector.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"time"
2828

2929
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
30+
"github.com/arangodb/kube-arangodb/pkg/util"
3031
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
3132
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3233
)
@@ -87,9 +88,11 @@ func (d *Deployment) inspectDeployment(lastInterval time.Duration) time.Duration
8788
}
8889

8990
// Inspection of generated resources needed
90-
if err := d.resources.InspectPods(ctx); err != nil {
91+
if x, err := d.resources.InspectPods(ctx); err != nil {
9192
hasError = true
9293
d.CreateEvent(k8sutil.NewErrorEvent("Pod inspection failed", err, d.apiObject))
94+
} else {
95+
nextInterval = util.MinDuration(nextInterval, x)
9396
}
9497
if err := d.resources.InspectPVCs(ctx); err != nil {
9598
hasError = true

pkg/deployment/resources/pod_finalizers.go

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,13 @@ import (
3737
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
3838
)
3939

40+
const (
41+
recheckPodFinalizerInterval = time.Second * 10
42+
)
43+
4044
// runPodFinalizers goes through the list of pod finalizers to see if they can be removed.
41-
func (r *Resources) runPodFinalizers(ctx context.Context, p *v1.Pod, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) error {
45+
// Returns: Interval_till_next_inspection, error
46+
func (r *Resources) runPodFinalizers(ctx context.Context, p *v1.Pod, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) (time.Duration, error) {
4247
log := r.log.With().Str("pod-name", p.GetName()).Logger()
4348
var removalList []string
4449
for _, f := range p.ObjectMeta.GetFinalizers() {
@@ -55,7 +60,7 @@ func (r *Resources) runPodFinalizers(ctx context.Context, p *v1.Pod, memberStatu
5560
if err := r.inspectFinalizerPodDrainDBServer(ctx, log, p, memberStatus, updateMember); err == nil {
5661
removalList = append(removalList, f)
5762
} else {
58-
log.Debug().Err(err).Str("finalizer", f).Msg("Cannot remove finalizer yet")
63+
log.Debug().Err(err).Str("finalizer", f).Msg("Cannot remove Pod finalizer yet")
5964
}
6065
}
6166
}
@@ -65,10 +70,15 @@ func (r *Resources) runPodFinalizers(ctx context.Context, p *v1.Pod, memberStatu
6570
ignoreNotFound := false
6671
if err := k8sutil.RemovePodFinalizers(log, kubecli, p, removalList, ignoreNotFound); err != nil {
6772
log.Debug().Err(err).Msg("Failed to update pod (to remove finalizers)")
68-
return maskAny(err)
73+
return 0, maskAny(err)
74+
} else {
75+
log.Debug().Strs("finalizers", removalList).Msg("Removed finalizer(s) from Pod")
6976
}
77+
} else {
78+
// Check again at given interval
79+
return recheckPodFinalizerInterval, nil
7080
}
71-
return nil
81+
return maxPodInspectorInterval, nil
7282
}
7383

7484
// inspectFinalizerPodAgencyServing checks the finalizer condition for agency-serving.
@@ -131,14 +141,16 @@ func (r *Resources) inspectFinalizerPodAgencyServing(ctx context.Context, log ze
131141
return maskAny(fmt.Errorf("No more remaining agents"))
132142
}
133143
if err := agency.AreAgentsHealthy(ctx, agencyConns); err != nil {
134-
log.Debug().Err(err).Msg("Remaining agents are not health")
144+
log.Debug().Err(err).Msg("Remaining agents are not healthy")
135145
return maskAny(err)
136146
}
137147

138148
// Remaining agents are healthy, we can remove this one and trigger a delete of the PVC
139-
if err := pvcs.Delete(memberStatus.PersistentVolumeClaimName, &metav1.DeleteOptions{}); err != nil {
149+
if err := pvcs.Delete(memberStatus.PersistentVolumeClaimName, &metav1.DeleteOptions{}); err != nil && !k8sutil.IsNotFound(err) {
140150
log.Warn().Err(err).Msg("Failed to delete PVC for member")
141151
return maskAny(err)
152+
} else {
153+
log.Debug().Str("pvc-name", memberStatus.PersistentVolumeClaimName).Msg("Removed PVC of member so agency can be completely replaced")
142154
}
143155

144156
return nil

pkg/deployment/resources/pod_inspector.go

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ import (
3131

3232
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
3333
"github.com/arangodb/kube-arangodb/pkg/metrics"
34+
"github.com/arangodb/kube-arangodb/pkg/util"
3435
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
3536
)
3637

@@ -39,19 +40,22 @@ var (
3940
)
4041

4142
const (
42-
podScheduleTimeout = time.Minute // How long we allow the schedule to take scheduling a pod.
43+
podScheduleTimeout = time.Minute // How long we allow the schedule to take scheduling a pod.
44+
maxPodInspectorInterval = time.Hour
4345
)
4446

4547
// InspectPods lists all pods that belong to the given deployment and updates
4648
// the member status of the deployment accordingly.
47-
func (r *Resources) InspectPods(ctx context.Context) error {
49+
// Returns: Interval_till_next_inspection, error
50+
func (r *Resources) InspectPods(ctx context.Context) (time.Duration, error) {
4851
log := r.log
4952
var events []*k8sutil.Event
53+
nextInterval := maxPodInspectorInterval // Large by default, will be made smaller if needed in the rest of the function
5054

5155
pods, err := r.context.GetOwnedPods()
5256
if err != nil {
5357
log.Debug().Err(err).Msg("Failed to get owned pods")
54-
return maskAny(err)
58+
return 0, maskAny(err)
5559
}
5660

5761
// Update member status from all pods found
@@ -80,7 +84,7 @@ func (r *Resources) InspectPods(ctx context.Context) error {
8084
ignoreNotFound := false
8185
if err := k8sutil.RemovePodFinalizers(log, kubecli, &p, p.GetFinalizers(), ignoreNotFound); err != nil {
8286
log.Debug().Err(err).Msg("Failed to update pod (to remove all finalizers)")
83-
return maskAny(err)
87+
return 0, maskAny(err)
8488
}
8589
}
8690
continue
@@ -136,18 +140,20 @@ func (r *Resources) InspectPods(ctx context.Context) error {
136140
}
137141
if k8sutil.IsPodMarkedForDeletion(&p) {
138142
// Process finalizers
139-
if err := r.runPodFinalizers(ctx, &p, memberStatus, func(m api.MemberStatus) error {
143+
if x, err := r.runPodFinalizers(ctx, &p, memberStatus, func(m api.MemberStatus) error {
140144
updateMemberStatusNeeded = true
141145
memberStatus = m
142146
return nil
143147
}); err != nil {
144148
// Only log here, since we'll be called to try again.
145149
log.Warn().Err(err).Msg("Failed to run pod finalizers")
150+
} else {
151+
nextInterval = util.MinDuration(nextInterval, x)
146152
}
147153
}
148154
if updateMemberStatusNeeded {
149155
if err := status.Members.Update(memberStatus, group); err != nil {
150-
return maskAny(err)
156+
return 0, maskAny(err)
151157
}
152158
}
153159
}
@@ -238,14 +244,14 @@ func (r *Resources) InspectPods(ctx context.Context) error {
238244

239245
// Save status
240246
if err := r.context.UpdateStatus(status, lastVersion); err != nil {
241-
return maskAny(err)
247+
return 0, maskAny(err)
242248
}
243249

244250
// Create events
245251
for _, evt := range events {
246252
r.context.CreateEvent(evt)
247253
}
248-
return nil
254+
return nextInterval, nil
249255
}
250256

251257
// GetExpectedPodArguments creates command line arguments for a server in the given group with given ID.

pkg/deployment/resources/pvc_finalizers.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolume
4646
if err := r.inspectFinalizerPVCMemberExists(ctx, log, p, group, memberStatus); err == nil {
4747
removalList = append(removalList, f)
4848
} else {
49-
log.Debug().Err(err).Str("finalizer", f).Msg("Cannot remove finalizer yet")
49+
log.Debug().Err(err).Str("finalizer", f).Msg("Cannot remove PVC finalizer yet")
5050
}
5151
}
5252
}
@@ -57,6 +57,8 @@ func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolume
5757
if err := k8sutil.RemovePVCFinalizers(log, kubecli, p, removalList, ignoreNotFound); err != nil {
5858
log.Debug().Err(err).Msg("Failed to update PVC (to remove finalizers)")
5959
return maskAny(err)
60+
} else {
61+
log.Debug().Strs("finalizers", removalList).Msg("Removed finalizer(s) from PVC")
6062
}
6163
}
6264
return nil

pkg/util/duration.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
//
2+
// DISCLAIMER
3+
//
4+
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
//
18+
// Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
//
20+
// Author Ewout Prangsma
21+
//
22+
23+
package util
24+
25+
import (
26+
"time"
27+
)
28+
29+
// MaxDuration returns the largest of the given durations
30+
func MaxDuration(a, b time.Duration) time.Duration {
31+
if a > b {
32+
return a
33+
}
34+
return b
35+
}
36+
37+
// MinDuration returns the smallest of the given durations
38+
func MinDuration(a, b time.Duration) time.Duration {
39+
if a < b {
40+
return a
41+
}
42+
return b
43+
}

tests/resilience_test.go

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -122,16 +122,28 @@ func TestResiliencePod(t *testing.T) {
122122
removeDeployment(c, depl.GetName(), ns)
123123
}
124124

125-
// TestResiliencePVC
126-
// Tests handling of individual pod deletions
127-
func TestResiliencePVC(t *testing.T) {
125+
// TestResiliencePVCAgents
126+
// Tests handling of individual PVCs of agents being deleted
127+
func TestResiliencePVCAgents(t *testing.T) {
128+
testResiliencePVC(api.ServerGroupAgents, t)
129+
}
130+
131+
// TestResiliencePVCDBServers
132+
// Tests handling of individual PVCs of dbservers being deleted
133+
func TestResiliencePVCDBServers(t *testing.T) {
134+
testResiliencePVC(api.ServerGroupDBServers, t)
135+
}
136+
137+
// testResiliencePVC
138+
// Tests handling of individual PVCs of given group being deleted
139+
func testResiliencePVC(testGroup api.ServerGroup, t *testing.T) {
128140
longOrSkip(t)
129141
c := client.MustNewInCluster()
130142
kubecli := mustNewKubeClient(t)
131143
ns := getNamespace(t)
132144

133145
// Prepare deployment config
134-
depl := newDeployment("test-pvc-resilience-" + uniuri.NewLen(4))
146+
depl := newDeployment(fmt.Sprintf("test-pvc-resilience-%s-%s", testGroup.AsRoleAbbreviated(), uniuri.NewLen(4)))
135147
depl.Spec.Mode = api.NewMode(api.DeploymentModeCluster)
136148
depl.Spec.SetDefaults(depl.GetName()) // this must be last
137149

@@ -166,9 +178,8 @@ func TestResiliencePVC(t *testing.T) {
166178

167179
// Delete one pvc after the other
168180
apiObject.ForeachServerGroup(func(group api.ServerGroup, spec api.ServerGroupSpec, status *api.MemberStatusList) error {
169-
if group != api.ServerGroupAgents {
170-
// Coordinators have no PVC
171-
// DBServers will be cleaned out and create a new member
181+
if group != testGroup {
182+
// We only test a specific group here
172183
return nil
173184
}
174185
for _, m := range *status {
@@ -195,9 +206,13 @@ func TestResiliencePVC(t *testing.T) {
195206
}
196207
return nil
197208
}
198-
if err := retry.Retry(op, time.Minute); err != nil {
209+
if err := retry.Retry(op, time.Minute*2); err != nil {
199210
t.Fatalf("PVC did not restart: %v", err)
200211
}
212+
// Wait for deployment to be ready
213+
if _, err = waitUntilDeployment(c, depl.GetName(), ns, deploymentIsReady()); err != nil {
214+
t.Fatalf("Deployment not running in time: %v", err)
215+
}
201216
// Wait for cluster to be completely ready
202217
if err := waitUntilClusterHealth(client, func(h driver.ClusterHealth) error {
203218
return clusterHealthEqualsSpec(h, apiObject.Spec)

0 commit comments

Comments
 (0)