Skip to content

Commit

Permalink
fix: restart nbctl-daemon if it does not respond
Browse files Browse the repository at this point in the history
Sometimes nbctl-daemon may hang and stop responding, after which all requests to ovn-nb will fail. We need to check nbctl-daemon repeatedly and use a liveness probe to kill a controller that has been unable to connect to ovn-nb for a period of time.

(cherry picked from commit 1823907)
  • Loading branch information
oilbeater committed Jan 2, 2020
1 parent 09e27ce commit f16209b
Show file tree
Hide file tree
Showing 7 changed files with 52 additions and 11 deletions.
7 changes: 7 additions & 0 deletions cmd/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,12 @@ func loopOvnNbctlDaemon(config *controller.Configuration) {
if _, err := os.Stat(daemonSocket); os.IsNotExist(err) || daemonSocket == "" {
ovs.StartOvnNbctlDaemon(config.OvnNbHost, config.OvnNbPort)
}

// The ovn-nbctl daemon may hang and be unable to process further requests.
// In that case, we need to start a new daemon.
if err := ovs.CheckAlive(); err != nil {
klog.Warningf("ovn-nbctl daemon doesn't return, start a new daemon")
ovs.StartOvnNbctlDaemon(config.OvnNbHost, config.OvnNbPort)
}
}
}
5 changes: 5 additions & 0 deletions cmd/webhook/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,5 +100,10 @@ func loopOvnNbctlDaemon(ovnNbHost string, ovnNbPort int) {
if _, err := os.Stat(daemonSocket); os.IsNotExist(err) || daemonSocket == "" {
ovs.StartOvnNbctlDaemon(ovnNbHost, ovnNbPort)
}

if err := ovs.CheckAlive(); err != nil {
klog.Warningf("ovn-nbctl daemon doesn't return, start a new daemon")
ovs.StartOvnNbctlDaemon(ovnNbHost, ovnNbPort)
}
}
}
1 change: 1 addition & 0 deletions dist/images/Dockerfile.controller
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,5 @@ WORKDIR /kube-ovn
CMD ["sh", "start-controller.sh"]

COPY start-controller.sh /kube-ovn/start-controller.sh
COPY kube-ovn-controller-healthcheck.sh /kube-ovn/kube-ovn-controller-healthcheck.sh
COPY kube-ovn-controller /kube-ovn/kube-ovn-controller
8 changes: 8 additions & 0 deletions dist/images/kube-ovn-controller-healthcheck.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Health check script: runs three checks in sequence and exits non-zero as soon
# as any one of them fails.
#
# -e: abort on the first failing command, -u: treat unset variables (e.g. the
# KUBERNETES_SERVICE_* variables below) as errors, pipefail: a failure anywhere
# in a pipeline fails the pipeline.
set -euo pipefail

# Check 1: query ovn-nb via `ovn-nbctl show`, with OVN_NB_DAEMON pointing at the
# control socket whose name is built from the pid stored in ovn-nbctl.pid.
# --timeout=10 bounds the call; output is discarded because only the exit
# status matters here.
OVN_NB_DAEMON=/var/run/openvswitch/ovn-nbctl.$(cat /var/run/openvswitch/ovn-nbctl.pid).ctl ovn-nbctl --timeout=10 show > /dev/null

# Check 2: TCP connect test (-z: no data sent, -w3: 3 second timeout) against
# local port 10660.
# NOTE(review): presumably the controller's own listening port — confirm.
nc -z -w3 127.0.0.1 10660

# Check 3: TCP connect test against the host/port given by the
# KUBERNETES_SERVICE_HOST / KUBERNETES_SERVICE_PORT environment variables.
nc -z -w3 "$KUBERNETES_SERVICE_HOST" "$KUBERNETES_SERVICE_PORT"
2 changes: 1 addition & 1 deletion dist/images/start-controller.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env bash
set -euo pipefail
export OVN_NB_DAEMON=$(ovn-nbctl --db=tcp:["${OVN_NB_SERVICE_HOST}"]:"${OVN_NB_SERVICE_PORT}" --pidfile --detach)
export OVN_NB_DAEMON=$(ovn-nbctl --db=tcp:["${OVN_NB_SERVICE_HOST}"]:"${OVN_NB_SERVICE_PORT}" --pidfile --detach --overwrite-pidfile)
exec ./kube-ovn-controller --ovn-nb-host="${OVN_NB_SERVICE_HOST}" --ovn-nb-port="${OVN_NB_SERVICE_PORT}" $@
26 changes: 26 additions & 0 deletions pkg/ovs/ovn-nbctl.go
Original file line number Diff line number Diff line change
Expand Up @@ -607,10 +607,21 @@ func (c Client) SetAddressesToAddressSet(addresses []string, as string) error {
func StartOvnNbctlDaemon(nbHost string, nbPort int) (string, error) {
klog.Infof("start ovn-nbctl daemon")
output, err := exec.Command(
"pkill",
"-f",
"ovn-nbctl",
).CombinedOutput()
if err != nil {
klog.Errorf("failed to kill old ovn-nbctl daemon: %v", string(output))
return "", err
}

output, err = exec.Command(
"ovn-nbctl",
fmt.Sprintf("--db=tcp:%s:%d", nbHost, nbPort),
"--pidfile",
"--detach",
"--overwrite-pidfile",
).CombinedOutput()
if err != nil {
klog.Errorf("start ovn-nbctl daemon failed, %s", string(output))
Expand All @@ -622,6 +633,21 @@ func StartOvnNbctlDaemon(nbHost string, nbPort int) (string, error) {
return daemonSocket, nil
}

// CheckAlive reports whether ovn-nb is reachable through ovn-nbctl.
// It runs `ovn-nbctl --timeout=10 show`; on failure the combined
// stdout/stderr of the command is logged and the command's error is
// returned unchanged. A nil return means the command succeeded.
func CheckAlive() error {
	out, err := exec.Command("ovn-nbctl", "--timeout=10", "show").CombinedOutput()
	if err == nil {
		return nil
	}

	klog.Errorf("failed to access ovn-nb from daemon, %s", string(out))
	return err
}

// GetLogicalSwitchExcludeIPS get a logical switch exclude ips
// ovn-nbctl get logical_switch ovn-default other_config:exclude_ips => "10.17.0.1 10.17.0.2 10.17.0.3..10.17.0.5"
func (c Client) GetLogicalSwitchExcludeIPS(logicalSwitch string) ([]string, error) {
Expand Down
14 changes: 4 additions & 10 deletions yamls/kube-ovn.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,20 +62,14 @@ spec:
readinessProbe:
exec:
command:
- nc
- -z
- -w3
- 127.0.0.1
- "10660"
- sh
- /kube-ovn/kube-ovn-controller-healthcheck.sh
periodSeconds: 3
livenessProbe:
exec:
command:
- nc
- -z
- -w3
- 127.0.0.1
- "10660"
- sh
- /kube-ovn/kube-ovn-controller-healthcheck.sh
initialDelaySeconds: 30
periodSeconds: 7
failureThreshold: 5
Expand Down

0 comments on commit f16209b

Please sign in to comment.