From c60feaaad2d6d6ca64c962e4d17445eef03f8342 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Fri, 29 Oct 2021 11:30:57 -0300 Subject: [PATCH 1/3] test(fvt): extend functional verification tests to use replicant node This parameterizes the Functional Verification Tests (FVTs) that run in CI to use a replication log (RLOG) role of "replicant" for one of the nodes. With this addition, our FVTs may explore more scenarios with data replication. --- ...er-compose-emqx-cluster-rlog.override.yaml | 27 ++++++++++++++++ .../docker-compose-emqx-cluster.yaml | 28 +++++++---------- .ci/docker-compose-file/haproxy/haproxy.cfg | 1 - .ci/docker-compose-file/python/pytest.sh | 2 +- .ci/docker-compose-file/scripts/run-emqx.sh | 31 +++++++++++++++++++ .github/workflows/run_fvt_tests.yaml | 22 +++++-------- 6 files changed, 79 insertions(+), 32 deletions(-) create mode 100644 .ci/docker-compose-file/docker-compose-emqx-cluster-rlog.override.yaml create mode 100755 .ci/docker-compose-file/scripts/run-emqx.sh diff --git a/.ci/docker-compose-file/docker-compose-emqx-cluster-rlog.override.yaml b/.ci/docker-compose-file/docker-compose-emqx-cluster-rlog.override.yaml new file mode 100644 index 000000000..3d8b86dd3 --- /dev/null +++ b/.ci/docker-compose-file/docker-compose-emqx-cluster-rlog.override.yaml @@ -0,0 +1,27 @@ +x-default-emqx: &default-emqx + image: $TARGET:$EMQX_TAG + env_file: + - conf.cluster.env + healthcheck: + test: ["CMD", "/opt/emqx/bin/emqx_ctl", "status"] + interval: 5s + timeout: 25s + retries: 5 + +services: + emqx1: + <<: *default-emqx + environment: + - "EMQX_HOST=node1.emqx.io" + - "EMQX_CLUSTER__DB_BACKEND=rlog" + - "EMQX_CLUSTER__RLOG__ROLE=core" + - "EMQX_CLUSTER__STATIC__SEEDS=[emqx@node1.emqx.io]" + + emqx2: + <<: *default-emqx + environment: + - "EMQX_HOST=node2.emqx.io" + - "EMQX_CLUSTER__DB_BACKEND=rlog" + - "EMQX_CLUSTER__RLOG__ROLE=replicant" + - "EMQX_CLUSTER__RLOG__CORE_NODES=emqx@node1.emqx.io" + - 
"EMQX_CLUSTER__STATIC__SEEDS=[emqx@node1.emqx.io]" diff --git a/.ci/docker-compose-file/docker-compose-emqx-cluster.yaml b/.ci/docker-compose-file/docker-compose-emqx-cluster.yaml index 656905eb0..b2635ecfe 100644 --- a/.ci/docker-compose-file/docker-compose-emqx-cluster.yaml +++ b/.ci/docker-compose-file/docker-compose-emqx-cluster.yaml @@ -1,5 +1,15 @@ version: '3.9' +x-default-emqx: &default-emqx + image: $TARGET:$EMQX_TAG + env_file: + - conf.cluster.env + healthcheck: + test: ["CMD", "/opt/emqx/bin/emqx_ctl", "status"] + interval: 5s + timeout: 25s + retries: 5 + services: haproxy: container_name: haproxy @@ -28,34 +38,20 @@ services: haproxy -f /usr/local/etc/haproxy/haproxy.cfg emqx1: + <<: *default-emqx container_name: node1.emqx.io - image: $TARGET:$EMQX_TAG - env_file: - - conf.cluster.env environment: - "EMQX_HOST=node1.emqx.io" - healthcheck: - test: ["CMD", "/opt/emqx/bin/emqx_ctl", "status"] - interval: 5s - timeout: 25s - retries: 5 networks: emqx_bridge: aliases: - node1.emqx.io emqx2: + <<: *default-emqx container_name: node2.emqx.io - image: $TARGET:$EMQX_TAG - env_file: - - conf.cluster.env environment: - "EMQX_HOST=node2.emqx.io" - healthcheck: - test: ["CMD", "/opt/emqx/bin/emqx", "ping"] - interval: 5s - timeout: 25s - retries: 5 networks: emqx_bridge: aliases: diff --git a/.ci/docker-compose-file/haproxy/haproxy.cfg b/.ci/docker-compose-file/haproxy/haproxy.cfg index b658789da..89c1d7d5d 100644 --- a/.ci/docker-compose-file/haproxy/haproxy.cfg +++ b/.ci/docker-compose-file/haproxy/haproxy.cfg @@ -54,7 +54,6 @@ backend emqx_dashboard_back server emqx-1 node1.emqx.io:18083 server emqx-2 node2.emqx.io:18083 - ##---------------------------------------------------------------- ## public ##---------------------------------------------------------------- diff --git a/.ci/docker-compose-file/python/pytest.sh b/.ci/docker-compose-file/python/pytest.sh index eacbecc3b..75f6441b5 100755 --- a/.ci/docker-compose-file/python/pytest.sh +++ 
b/.ci/docker-compose-file/python/pytest.sh @@ -1,7 +1,7 @@ #!/bin/sh ## This script is to run emqx cluster smoke tests (fvt) in github action -## This script is executed in pacho_client +## This script is executed in paho_client set -x set +e diff --git a/.ci/docker-compose-file/scripts/run-emqx.sh b/.ci/docker-compose-file/scripts/run-emqx.sh new file mode 100755 index 000000000..ebb07b8b6 --- /dev/null +++ b/.ci/docker-compose-file/scripts/run-emqx.sh @@ -0,0 +1,31 @@ +#!/bin/bash +set -euxo pipefail + +if [ "$EMQX_TEST_DB_BACKEND" = "rlog" ] +then + CLUSTER_OVERRIDES="-f .ci/docker-compose-file/docker-compose-emqx-cluster-rlog.override.yaml" +else + CLUSTER_OVERRIDES="" +fi + +{ + echo "HOCON_ENV_OVERRIDE_PREFIX=EMQX_" + echo "EMQX_ZONES__DEFAULT__MQTT__RETRY_INTERVAL=2s" + echo "EMQX_ZONES__DEFAULT__MQTT__MAX_TOPIC_ALIAS=10" +} >> .ci/docker-compose-file/conf.cluster.env + +is_cluster_up() { + docker exec -i node1.emqx.io \ + bash -c "emqx eval \"['emqx@node1.emqx.io','emqx@node2.emqx.io'] = maps:get(running_nodes, ekka_cluster:info()).\"" > /dev/null 2>&1 +} + +docker-compose \ + -f .ci/docker-compose-file/docker-compose-emqx-cluster.yaml \ + $CLUSTER_OVERRIDES \ + -f .ci/docker-compose-file/docker-compose-python.yaml \ + up -d + +while ! 
is_cluster_up; do + echo "['$(date -u +"%Y-%m-%dT%H:%M:%SZ")']:waiting emqx"; + sleep 5; +done diff --git a/.github/workflows/run_fvt_tests.yaml b/.github/workflows/run_fvt_tests.yaml index 509e84bab..e696ade29 100644 --- a/.github/workflows/run_fvt_tests.yaml +++ b/.github/workflows/run_fvt_tests.yaml @@ -69,8 +69,11 @@ jobs: fail-fast: false matrix: otp: - - 23.2.7.2-emqx-2 - - 24.1.1-emqx-1 + - 23.2.7.2-emqx-2 + - 24.1.1-emqx-1 + cluster_db_backend: + - "mnesia" + - "rlog" steps: - uses: actions/download-artifact@v2 @@ -91,18 +94,9 @@ jobs: timeout-minutes: 5 working-directory: source run: | - set -e -u -x - echo "HOCON_ENV_OVERRIDE_PREFIX=EMQX_" >> .ci/docker-compose-file/conf.cluster.env - echo "EMQX_ZONES__DEFAULT__MQTT__RETRY_INTERVAL=2s" >> .ci/docker-compose-file/conf.cluster.env - echo "EMQX_ZONES__DEFAULT__MQTT__MAX_TOPIC_ALIAS=10" >> .ci/docker-compose-file/conf.cluster.env - docker-compose \ - -f .ci/docker-compose-file/docker-compose-emqx-cluster.yaml \ - -f .ci/docker-compose-file/docker-compose-python.yaml \ - up -d - while ! docker exec -i node1.emqx.io bash -c "emqx eval \"['emqx@node1.emqx.io','emqx@node2.emqx.io'] = maps:get(running_nodes, ekka_cluster:info()).\"" > /dev/null 2>&1; do - echo "['$(date -u +"%Y-%m-%dT%H:%M:%SZ")']:waiting emqx"; - sleep 5; - done + set -x + export EMQX_TEST_DB_BACKEND="${{ matrix.cluster_db_backend }}" + ./.ci/docker-compose-file/scripts/run-emqx.sh - name: make paho tests run: | if ! docker exec -i python /scripts/pytest.sh; then From f8fc67b313a10ee5eda16be2d97b67ac94cf7cb7 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Sun, 7 Nov 2021 16:54:38 -0300 Subject: [PATCH 2/3] fix(lag): target only replica if rlog core+replicant there seems to be race conditions related to some tests with sessions hitting the core and the replicant alternately and rlog. 
for instance, if there is some delay in this replication, a new connection made to the replica with a just-created session in the core may not have been replicated to the replicant, resulting in a test failure if it expects the session to be present. since such replication lags are inherent to the core-replicant topology, we can try to target only the replicant to avoid seeing this inconsistent view of the system during the tests. --- ...er-compose-emqx-cluster-rlog.override.yaml | 6 ++++++ .ci/docker-compose-file/python/pytest.sh | 14 ++++++++++--- .ci/docker-compose-file/scripts/run-emqx.sh | 20 +++++++++++++++++-- .github/workflows/run_fvt_tests.yaml | 5 ++++- 4 files changed, 39 insertions(+), 6 deletions(-) diff --git a/.ci/docker-compose-file/docker-compose-emqx-cluster-rlog.override.yaml b/.ci/docker-compose-file/docker-compose-emqx-cluster-rlog.override.yaml index 3d8b86dd3..8be146eb5 100644 --- a/.ci/docker-compose-file/docker-compose-emqx-cluster-rlog.override.yaml +++ b/.ci/docker-compose-file/docker-compose-emqx-cluster-rlog.override.yaml @@ -11,17 +11,23 @@ x-default-emqx: &default-emqx services: emqx1: <<: *default-emqx + container_name: node1.emqx.io environment: - "EMQX_HOST=node1.emqx.io" - "EMQX_CLUSTER__DB_BACKEND=rlog" - "EMQX_CLUSTER__RLOG__ROLE=core" - "EMQX_CLUSTER__STATIC__SEEDS=[emqx@node1.emqx.io]" + - "EMQX_LISTENERS__TCP__DEFAULT__PROXY_PROTOCOL=false" + - "EMQX_LISTENERS__WS__DEFAULT__PROXY_PROTOCOL=false" emqx2: <<: *default-emqx + container_name: node2.emqx.io environment: - "EMQX_HOST=node2.emqx.io" - "EMQX_CLUSTER__DB_BACKEND=rlog" - "EMQX_CLUSTER__RLOG__ROLE=replicant" - "EMQX_CLUSTER__RLOG__CORE_NODES=emqx@node1.emqx.io" - "EMQX_CLUSTER__STATIC__SEEDS=[emqx@node1.emqx.io]" + - "EMQX_LISTENERS__TCP__DEFAULT__PROXY_PROTOCOL=false" + - "EMQX_LISTENERS__WS__DEFAULT__PROXY_PROTOCOL=false" diff --git a/.ci/docker-compose-file/python/pytest.sh b/.ci/docker-compose-file/python/pytest.sh index 75f6441b5..4579691b3 100755 ---
a/.ci/docker-compose-file/python/pytest.sh +++ b/.ci/docker-compose-file/python/pytest.sh @@ -6,16 +6,24 @@ set -x set +e -LB="haproxy" +EMQX_TEST_DB_BACKEND=$1 +if [ "$EMQX_TEST_DB_BACKEND" = "rlog" ] +then + # target only replica to avoid replication races + TARGET_HOST="node2.emqx.io" +else + # use loadbalancer + TARGET_HOST="haproxy" +fi apk update && apk add git curl git clone -b develop-4.0 https://github.com/emqx/paho.mqtt.testing.git /paho.mqtt.testing pip install pytest -pytest -v /paho.mqtt.testing/interoperability/test_client/V5/test_connect.py -k test_basic --host "$LB" +pytest -v /paho.mqtt.testing/interoperability/test_client/V5/test_connect.py -k test_basic --host "$TARGET_HOST" RESULT=$? -pytest -v /paho.mqtt.testing/interoperability/test_client --host "$LB" +pytest -v /paho.mqtt.testing/interoperability/test_client --host "$TARGET_HOST" RESULT=$(( RESULT + $? )) # pytest -v /paho.mqtt.testing/interoperability/test_cluster --host1 "node1.emqx.io" --host2 "node2.emqx.io" diff --git a/.ci/docker-compose-file/scripts/run-emqx.sh b/.ci/docker-compose-file/scripts/run-emqx.sh index ebb07b8b6..1465cb655 100755 --- a/.ci/docker-compose-file/scripts/run-emqx.sh +++ b/.ci/docker-compose-file/scripts/run-emqx.sh @@ -14,11 +14,27 @@ fi echo "EMQX_ZONES__DEFAULT__MQTT__MAX_TOPIC_ALIAS=10" } >> .ci/docker-compose-file/conf.cluster.env -is_cluster_up() { - docker exec -i node1.emqx.io \ +is_node_up() { + local node + node="$1" + docker exec -i "$node" \ bash -c "emqx eval \"['emqx@node1.emqx.io','emqx@node2.emqx.io'] = maps:get(running_nodes, ekka_cluster:info()).\"" > /dev/null 2>&1 } +is_node_listening() { + local node + node="$1" + docker exec -i "$node" \ + emqx eval "ok = case gen_tcp:connect(\"localhost\", 1883, []) of {ok, P} -> gen_tcp:close(P), ok; _ -> exit(1) end." 
> /dev/null 2>&1 +} + +is_cluster_up() { + is_node_up node1.emqx.io && \ + is_node_up node2.emqx.io && \ + is_node_listening node1.emqx.io && \ + is_node_listening node2.emqx.io +} + docker-compose \ -f .ci/docker-compose-file/docker-compose-emqx-cluster.yaml \ $CLUSTER_OVERRIDES \ diff --git a/.github/workflows/run_fvt_tests.yaml b/.github/workflows/run_fvt_tests.yaml index e696ade29..46ce95dab 100644 --- a/.github/workflows/run_fvt_tests.yaml +++ b/.github/workflows/run_fvt_tests.yaml @@ -99,10 +99,13 @@ jobs: ./.ci/docker-compose-file/scripts/run-emqx.sh - name: make paho tests run: | - if ! docker exec -i python /scripts/pytest.sh; then + if ! docker exec -i python /scripts/pytest.sh "${{ matrix.cluster_db_backend }}"; then echo "DUMP_CONTAINER_LOGS_BGN" + echo "============== haproxy ==============" docker logs haproxy + echo "============== node1 ==============" docker logs node1.emqx.io + echo "============== node2 ==============" docker logs node2.emqx.io echo "DUMP_CONTAINER_LOGS_END" exit 1 From 030e4857ec7ab31a028b3f60b7d011c43778d0fd Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Mon, 8 Nov 2021 15:36:54 -0300 Subject: [PATCH 3/3] docs(issue): mark solution as TODO and link related issue https://github.com/emqx/emqx/issues/6094 --- .ci/docker-compose-file/python/pytest.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.ci/docker-compose-file/python/pytest.sh b/.ci/docker-compose-file/python/pytest.sh index 4579691b3..c079a65a4 100755 --- a/.ci/docker-compose-file/python/pytest.sh +++ b/.ci/docker-compose-file/python/pytest.sh @@ -9,7 +9,8 @@ set +e EMQX_TEST_DB_BACKEND=$1 if [ "$EMQX_TEST_DB_BACKEND" = "rlog" ] then - # target only replica to avoid replication races + # TODO: target only replica to avoid replication races + # see: https://github.com/emqx/emqx/issues/6094 TARGET_HOST="node2.emqx.io" else # use loadbalancer