Skip to content

Commit

Permalink
ci: detect ovs/ovn memory leak (#2839)
Browse files Browse the repository at this point in the history
  • Loading branch information
zhangzujian committed May 23, 2023
1 parent 7765677 commit 6ee56d0
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 9 deletions.
50 changes: 50 additions & 0 deletions .github/workflows/build-x86-image.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,9 @@ jobs:
# Load the image(s) stored in the kube-ovn.tar archive using `docker load`.
- name: Load image
run: docker load --input kube-ovn.tar

# Publish DEBUG_TAG (<contents of the VERSION file>-debug) to the following
# steps through $GITHUB_ENV. Values in that file are taken literally, so the
# tag must not be wrapped in quotes: the quote characters would otherwise
# become part of DEBUG_TAG and of every variable derived from it.
- name: Export debug image tag
  run: echo "DEBUG_TAG=$(cat VERSION)-debug" >> "$GITHUB_ENV"

- name: Create kind cluster
run: |
sudo pip3 install j2cli
Expand All @@ -354,6 +357,9 @@ jobs:
sudo chown -R $(id -un). ~/.kube/
# Run the kind-install make target selected by the job matrix (mode and IP family).
- name: Install Kube-OVN
env:
# Tag written to $GITHUB_ENV by the "Export debug image tag" step.
VERSION: ${{ env.DEBUG_TAG }}
# Wrapper command handed to the OVS/OVN start scripts (used there as --*-wrapper=...).
DEBUG_WRAPPER: valgrind
run: make kind-install-${{ matrix.mode }}-${{ matrix.ip-family }}

- name: Run E2E
Expand All @@ -377,6 +383,25 @@ jobs:
name: k8s-conformance-e2e-${{ matrix.ip-family }}-${{ matrix.mode }}-ko-log
path: k8s-conformance-e2e-${{ matrix.ip-family }}-${{ matrix.mode }}-ko-log.tar.gz

# Fail the job when any collected valgrind log reports definitely-lost memory.
- name: Check valgrind result
  run: |
    # Restart both workloads and wait for the rollouts to finish before
    # collecting logs.
    # NOTE(review): presumably the restart is what makes the wrapped daemons
    # exit and flush their valgrind reports - confirm.
    kubectl -n kube-system rollout restart deploy ovn-central
    kubectl -n kube-system rollout restart ds ovs-ovn
    kubectl -n kube-system rollout status deploy ovn-central
    kubectl -n kube-system rollout status ds ovs-ovn
    kubectl ko log ovn
    kubectl ko log ovs
    exit_code=0
    # Feed the loop through process substitution instead of `find | while`:
    # in a pipeline the loop body runs in a subshell, so the exit_code=1
    # assignment would be lost and this step could never fail.
    # NOTE(review): a valgrind summary line "definitely lost: 0 bytes in 0 blocks"
    # would also match this grep - confirm whether such logs should be ignored.
    while IFS= read -r f; do
      if grep -qw 'definitely lost' "$f"; then
        exit_code=1
        echo "$f"
        cat "$f"
      fi
    done < <(find kubectl-ko-log -type f -name '*.valgrind.*')
    exit $exit_code
k8s-netpol-e2e:
name: Kubernetes Network Policy E2E
if: |
Expand Down Expand Up @@ -755,6 +780,9 @@ jobs:
# Load the image(s) stored in the kube-ovn.tar archive using `docker load`.
- name: Load image
run: docker load --input kube-ovn.tar

# Publish DEBUG_TAG (<contents of the VERSION file>-debug) to the following
# steps through $GITHUB_ENV. Values in that file are taken literally, so the
# tag must not be wrapped in quotes: the quote characters would otherwise
# become part of DEBUG_TAG and of every variable derived from it.
- name: Export debug image tag
  run: echo "DEBUG_TAG=$(cat VERSION)-debug" >> "$GITHUB_ENV"

- name: Create kind cluster
run: |
sudo pip3 install j2cli
Expand All @@ -764,6 +792,9 @@ jobs:
sudo chown -R $(id -un). ~/.kube/
# Run the kind-install make target selected by the job matrix (mode and IP family).
- name: Install Kube-OVN
env:
# Tag written to $GITHUB_ENV by the "Export debug image tag" step.
VERSION: ${{ env.DEBUG_TAG }}
# Wrapper command handed to the OVS/OVN start scripts (used there as --*-wrapper=...).
DEBUG_WRAPPER: valgrind
run: make kind-install-${{ matrix.mode }}-${{ matrix.ip-family }}

- name: Run E2E
Expand All @@ -787,6 +818,25 @@ jobs:
name: kube-ovn-conformance-e2e-${{ matrix.mode }}-${{ matrix.ip-family }}-ko-log
path: kube-ovn-conformance-e2e-${{ matrix.mode }}-${{ matrix.ip-family }}-ko-log.tar.gz

# Fail the job when any collected valgrind log reports definitely-lost memory.
- name: Check valgrind result
  run: |
    # Restart both workloads and wait for the rollouts to finish before
    # collecting logs.
    # NOTE(review): presumably the restart is what makes the wrapped daemons
    # exit and flush their valgrind reports - confirm.
    kubectl -n kube-system rollout restart deploy ovn-central
    kubectl -n kube-system rollout restart ds ovs-ovn
    kubectl -n kube-system rollout status deploy ovn-central
    kubectl -n kube-system rollout status ds ovs-ovn
    kubectl ko log ovn
    kubectl ko log ovs
    exit_code=0
    # Feed the loop through process substitution instead of `find | while`:
    # in a pipeline the loop body runs in a subshell, so the exit_code=1
    # assignment would be lost and this step could never fail.
    # NOTE(review): a valgrind summary line "definitely lost: 0 bytes in 0 blocks"
    # would also match this grep - confirm whether such logs should be ignored.
    while IFS= read -r f; do
      if grep -qw 'definitely lost' "$f"; then
        exit_code=1
        echo "$f"
        cat "$f"
      fi
    done < <(find kubectl-ko-log -type f -name '*.valgrind.*')
    exit $exit_code
- name: Cleanup
run: sh dist/images/cleanup.sh

Expand Down
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,10 @@ kind-install-dev:
# Re-run the kind-install target in a sub-make with VERSION set to $(DEBUG_TAG).
# The recipe line must begin with a hard TAB, otherwise make stops with
# "missing separator".
kind-install-debug:
	@VERSION=$(DEBUG_TAG) $(MAKE) kind-install

# Same as kind-install-debug, but with DEBUG_WRAPPER=valgrind placed in the
# environment of the sub-make. The recipe line must begin with a hard TAB,
# otherwise make stops with "missing separator".
.PHONY: kind-install-debug-valgrind
kind-install-debug-valgrind:
	@DEBUG_WRAPPER=valgrind $(MAKE) kind-install-debug

.PHONY: kind-install-ipv4
kind-install-ipv4: kind-install-overlay-ipv4

Expand Down
9 changes: 9 additions & 0 deletions dist/images/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ IFACE=${IFACE:-}
DPDK_TUNNEL_IFACE=${DPDK_TUNNEL_IFACE:-br-phy}
ENABLE_BIND_LOCAL_IP=${ENABLE_BIND_LOCAL_IP:-true}

# debug: optional wrapper command (e.g. valgrind) taken from the environment and
# written into the generated manifests as the DEBUG_WRAPPER container env var;
# defaults to empty when unset
DEBUG_WRAPPER=${DEBUG_WRAPPER:-}

KUBELET_DIR=${KUBELET_DIR:-/var/lib/kubelet}

CNI_CONF_DIR="/etc/cni/net.d"
Expand Down Expand Up @@ -2898,6 +2901,8 @@ spec:
fieldPath: status.podIPs
- name: ENABLE_BIND_LOCAL_IP
value: "$ENABLE_BIND_LOCAL_IP"
- name: DEBUG_WRAPPER
value: "$DEBUG_WRAPPER"
resources:
requests:
cpu: 300m
Expand Down Expand Up @@ -3409,6 +3414,8 @@ spec:
fieldPath: status.podIPs
- name: ENABLE_BIND_LOCAL_IP
value: "$ENABLE_BIND_LOCAL_IP"
- name: DEBUG_WRAPPER
value: "$DEBUG_WRAPPER"
resources:
requests:
cpu: 300m
Expand Down Expand Up @@ -3553,6 +3560,8 @@ spec:
fieldPath: spec.nodeName
- name: OVN_DB_IPS
value: $addresses
- name: DEBUG_WRAPPER
value: "$DEBUG_WRAPPER"
volumeMounts:
- mountPath: /var/run/netns
name: host-ns
Expand Down
17 changes: 12 additions & 5 deletions dist/images/start-db.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#!/bin/bash
set -eo pipefail

# Optional wrapper command read from the environment; empty when unset.
DEBUG_WRAPPER=${DEBUG_WRAPPER:-}
# ovn-ctl options that apply the same wrapper to ovn-northd, ovsdb-nb and ovsdb-sb.
# Prepended to ovn_ctl_args below; when DEBUG_WRAPPER is empty each option is passed with an empty value.
DEBUG_OPT="--ovn-northd-wrapper=$DEBUG_WRAPPER --ovsdb-nb-wrapper=$DEBUG_WRAPPER --ovsdb-sb-wrapper=$DEBUG_WRAPPER"

# https://bugs.launchpad.net/neutron/+bug/1776778
if grep -q "3.10.0-862" /proc/version
then
Expand Down Expand Up @@ -229,7 +232,8 @@ if [[ "$ENABLE_SSL" == "false" ]]; then
set -eo pipefail
# leader up only when no cluster and on first node
if [[ ${result} -eq 1 && "$nb_leader_ip" == "$DB_CLUSTER_ADDR" ]]; then
ovn_ctl_args="--db-nb-create-insecure-remote=yes \
ovn_ctl_args="$DEBUG_OPT \
--db-nb-create-insecure-remote=yes \
--db-sb-create-insecure-remote=yes \
--db-nb-cluster-local-addr=[$DB_CLUSTER_ADDR] \
--db-sb-cluster-local-addr=[$DB_CLUSTER_ADDR] \
Expand All @@ -242,7 +246,7 @@ if [[ "$ENABLE_SSL" == "false" ]]; then
--db-nb-use-remote-in-db=no \
--db-sb-use-remote-in-db=no \
--ovn-northd-nb-db=$(gen_conn_str 6641) \
--ovn-northd-sb-db=$(gen_conn_str 6642)"
--ovn-northd-sb-db=$(gen_conn_str 6642) "
# Start ovn-northd, ovn-nb and ovn-sb
/usr/share/ovn/scripts/ovn-ctl $ovn_ctl_args \
start_nb_ovsdb -- \
Expand Down Expand Up @@ -282,7 +286,8 @@ if [[ "$ENABLE_SSL" == "false" ]]; then
fi
set -eo pipefail
# otherwise go to first node
ovn_ctl_args="--db-nb-create-insecure-remote=yes \
ovn_ctl_args="$DEBUG_OPT \
--db-nb-create-insecure-remote=yes \
--db-sb-create-insecure-remote=yes \
--db-nb-cluster-local-addr=[$DB_CLUSTER_ADDR] \
--db-sb-cluster-local-addr=[$DB_CLUSTER_ADDR] \
Expand Down Expand Up @@ -354,7 +359,8 @@ else
result=$?
set -eo pipefail
if [[ ${result} -eq 1 && "$nb_leader_ip" == "${DB_CLUSTER_ADDR}" ]]; then
ovn_ctl_args="--ovn-nb-db-ssl-key=/var/run/tls/key \
ovn_ctl_args="$DEBUG_OPT
--ovn-nb-db-ssl-key=/var/run/tls/key \
--ovn-nb-db-ssl-cert=/var/run/tls/cert \
--ovn-nb-db-ssl-ca-cert=/var/run/tls/cacert \
--ovn-sb-db-ssl-key=/var/run/tls/key \
Expand Down Expand Up @@ -411,7 +417,8 @@ else
done
fi
set -eo pipefail
ovn_ctl_args="--ovn-nb-db-ssl-key=/var/run/tls/key \
ovn_ctl_args="$DEBUG_OPT
--ovn-nb-db-ssl-key=/var/run/tls/key \
--ovn-nb-db-ssl-cert=/var/run/tls/cert \
--ovn-nb-db-ssl-ca-cert=/var/run/tls/cacert \
--ovn-sb-db-ssl-key=/var/run/tls/key \
Expand Down
9 changes: 5 additions & 4 deletions dist/images/start-ovs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ ENABLE_SSL=${ENABLE_SSL:-false}
# Settings read from the environment, each falling back to the default shown when unset or empty.
OVN_DB_IPS=${OVN_DB_IPS:-}
TUNNEL_TYPE=${TUNNEL_TYPE:-geneve}
FLOW_LIMIT=${FLOW_LIMIT:-10}
# Optional wrapper command passed below to ovs-ctl/ovn-ctl through their --*-wrapper options.
DEBUG_WRAPPER=${DEBUG_WRAPPER:-}

# Check required kernel module
modinfo openvswitch
Expand Down Expand Up @@ -63,7 +64,7 @@ trap quit EXIT
iptables -V

# Start ovsdb
/usr/share/openvswitch/scripts/ovs-ctl restart --no-ovs-vswitchd --system-id=random
/usr/share/openvswitch/scripts/ovs-ctl restart --no-ovs-vswitchd --system-id=random --ovsdb-server-wrapper=$DEBUG_WRAPPER
# Restrict the number of pthreads ovs-vswitchd creates to reduce the
# amount of RSS it uses on hosts with many cores
# https://bugzilla.redhat.com/show_bug.cgi?id=1571379
Expand Down Expand Up @@ -107,7 +108,7 @@ function handle_underlay_bridges() {
handle_underlay_bridges

# Start vswitchd. restart will automatically set/unset flow-restore-wait which is not what we want
/usr/share/openvswitch/scripts/ovs-ctl restart --no-ovsdb-server --system-id=random --no-mlockall
/usr/share/openvswitch/scripts/ovs-ctl restart --no-ovsdb-server --system-id=random --no-mlockall --ovs-vswitchd-wrapper=$DEBUG_WRAPPER
/usr/share/openvswitch/scripts/ovs-ctl --protocol=udp --dport=6081 enable-protocol

function gen_conn_str {
Expand Down Expand Up @@ -136,9 +137,9 @@ ovs-vsctl set open . external-ids:hostname="${KUBE_NODE_NAME}"

# Start ovn-controller
if [[ "$ENABLE_SSL" == "false" ]]; then
/usr/share/ovn/scripts/ovn-ctl restart_controller
/usr/share/ovn/scripts/ovn-ctl --ovn-controller-wrapper=$DEBUG_WRAPPER restart_controller
else
/usr/share/ovn/scripts/ovn-ctl --ovn-controller-ssl-key=/var/run/tls/key --ovn-controller-ssl-cert=/var/run/tls/cert --ovn-controller-ssl-ca-cert=/var/run/tls/cacert restart_controller
/usr/share/ovn/scripts/ovn-ctl --ovn-controller-ssl-key=/var/run/tls/key --ovn-controller-ssl-cert=/var/run/tls/cert --ovn-controller-ssl-ca-cert=/var/run/tls/cacert --ovn-controller-wrapper=$DEBUG_WRAPPER restart_controller
fi

chmod 600 /etc/openvswitch/*
Expand Down

0 comments on commit 6ee56d0

Please sign in to comment.