Skip to content

Commit

Permalink
ovn db: recover automatically on startup if db corruption is detected (
Browse files Browse the repository at this point in the history
  • Loading branch information
zhangzujian committed Oct 21, 2022
1 parent e430042 commit e7f3fb5
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 0 deletions.
2 changes: 2 additions & 0 deletions dist/images/Dockerfile.base
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ RUN cd /usr/src/ && \
curl -s https://github.com/kubeovn/ovs/commit/8c2f28b778129161bbf8f0738fa41d385860d5bc.patch | git apply && \
# fdb: fix mac learning in environments with hairpin enabled
curl -s https://github.com/kubeovn/ovs/commit/1cb138aaf2fdf922d75a587e4e9cf610d38f9fee.patch | git apply && \
# ovsdb-tool: add optional server id parameter for "join-cluster" command
curl -s https://github.com/kubeovn/ovs/commit/2e2ec1161cadbec79786d63fde9475053d996586.patch | git apply && \
# compile without avx512
if [ "$ARCH" = "amd64" -a "$NO_AVX512" = "true" ]; then curl -s https://github.com/kubeovn/ovs/commit/c257b0794b827cfae9660a9f3238bee8a29e7676.patch | git apply; fi && \
./boot.sh && \
Expand Down
74 changes: 74 additions & 0 deletions dist/images/start-db.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,20 @@ DB_SB_ADDR=${DB_SB_ADDR:-::}
# Fall back to 6642 when DB_SB_PORT is unset or empty.
DB_SB_PORT=${DB_SB_PORT:-6642}
# Fall back to "false" when ENABLE_SSL is unset or empty; the helpers below
# compare this value against the literal string "false".
ENABLE_SSL=${ENABLE_SSL:-false}

# Source the OVS shell helper library; abort the script if it cannot be loaded.
# NOTE(review): presumably this provides the ovsdb_tool wrapper used by
# ovn_db_pre_start - confirm against the installed ovs-lib.
. /usr/share/openvswitch/scripts/ovs-lib || exit 1

# Print a short pseudo-random token: the first 6 hex characters of the md5
# digest of bash's $RANDOM value. No trailing newline is written.
random_str() {
  md5sum <<<"$RANDOM" | head -c 6
}

# Print a single OVSDB connection address of the form "<scheme>:[<host>]:<port>".
# Globals:   ENABLE_SSL (read) - the scheme is "tcp" only when it is exactly
#            "false"; any other value (including unset) yields "ssl".
# Arguments: $1 - host/IP address, $2 - port
# Outputs:   the address on stdout
gen_conn_addr() {
  local scheme
  case "$ENABLE_SSL" in
    false) scheme="tcp" ;;
    *) scheme="ssl" ;;
  esac
  echo "${scheme}:[$1]:$2"
}

function gen_conn_str {
t=$(echo -n "${NODE_IPS}" | sed 's/[[:space:]]//g' | sed 's/,/ /g')
if [[ "$ENABLE_SSL" == "false" ]]; then
Expand Down Expand Up @@ -74,6 +88,62 @@ function is_clustered {
return 1
}

#######################################
# Create a new db file and join it to the cluster if the nb/sb db file is
# corrupted. The corrupted file is kept next to the original as a
# "<db_file>.backup-<random>" copy.
# Globals:
#   POD_IP   (read) - address of this node; used for the local cluster address
#   NODE_IPS (read) - comma-separated addresses of all cluster members
#   ENABLE_SSL (read, via gen_conn_addr) - selects tcp: or ssl: addresses
# Arguments:
#   $1 - database selector: "nb" (OVN_Northbound, cluster port 6643) or
#        "sb" (OVN_Southbound, cluster port 6644)
# Outputs:
#   progress and error messages on stdout
# Returns:
#   0 if the db file does not exist, passes check-cluster, or was rebuilt;
#   non-zero if the server id cannot be read, no remote member is known,
#   or creating/moving the files fails.
#   Exits the script with status 1 on an unknown selector.
#######################################
function ovn_db_pre_start() {
  local db=""
  local port=""
  case "$1" in
    nb)
      db=OVN_Northbound
      port=6643
      ;;
    sb)
      db=OVN_Southbound
      port=6644
      ;;
    *)
      echo "invalid database: $1"
      exit 1
      ;;
  esac

  local db_file="/etc/ovn/ovn${1}_db.db"
  # Nothing to repair when the file is absent or passes the consistency check.
  [[ -e "$db_file" ]] || return 0
  if ovsdb_tool check-cluster "$db_file"; then
    return 0
  fi

  echo "detected database corruption for file $db_file, rebuild it."
  # Declare and assign separately so a failing ovsdb-tool is not masked by
  # `local`; fall back to an empty sid so the validation below reports it
  # (and so an errexit shell does not abort before the message is printed).
  local sid
  sid=$(ovsdb-tool db-sid "$db_file") || sid=""
  local sid_re='^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$'
  if [[ ! "$sid" =~ $sid_re ]]; then
    echo "failed to get sid from $1 db file $db_file"
    return 1
  fi
  echo "get local server id $sid"

  local local_addr
  local_addr="$(gen_conn_addr "$POD_IP" "$port")"
  echo "local address: $local_addr"

  # Every member listed in NODE_IPS except this node is a remote to join.
  local -a remote_addr=()
  local ip
  # Word splitting is intentional here: commas become spaces and the list is
  # split on whitespace.
  for ip in ${NODE_IPS//,/ }; do
    if [[ "$POD_IP" != "$ip" ]]; then
      remote_addr+=("$(gen_conn_addr "$ip" "$port")")
    fi
  done
  echo "remote addresses: ${remote_addr[*]}"
  # join-cluster needs at least one remote; fail with a clear message instead
  # of relying on the ovsdb-tool usage error.
  if (( ${#remote_addr[@]} == 0 )); then
    echo "no remote addresses found in NODE_IPS, cannot rebuild $db_file"
    return 1
  fi

  # Build the replacement under a temporary name first so the corrupted file
  # is only moved away once the new one has been created successfully.
  local db_new
  db_new="$db_file.init-$(random_str)"
  echo "generating new database file $db_new"
  ovsdb_tool --sid "$sid" join-cluster "$db_new" "$db" "$local_addr" "${remote_addr[@]}" || return 1

  local db_bak
  db_bak="$db_file.backup-$(random_str)"
  echo "backup $db_file to $db_bak"
  mv "$db_file" "$db_bak" || return 1

  echo "use new database file $db_new"
  mv "$db_new" "$db_file"
}

trap quit EXIT
if [[ "$ENABLE_SSL" == "false" ]]; then
if [[ -z "$NODE_IPS" ]]; then
Expand All @@ -90,6 +160,8 @@ if [[ "$ENABLE_SSL" == "false" ]]; then
exit 1
fi
/usr/share/ovn/scripts/ovn-ctl stop_northd
ovn_db_pre_start nb
ovn_db_pre_start sb

nb_leader_ip=$(get_leader_ip nb)
sb_leader_ip=$(get_leader_ip sb)
Expand Down Expand Up @@ -182,6 +254,8 @@ else
exit 1
fi
/usr/share/ovn/scripts/ovn-ctl stop_northd
ovn_db_pre_start nb
ovn_db_pre_start sb

nb_leader_ip=$(get_leader_ip nb)
sb_leader_ip=$(get_leader_ip sb)
Expand Down

0 comments on commit e7f3fb5

Please sign in to comment.