Files
zulip/tools/ci/production-helper
arpit551 e6edf469ee ci: Remove the need of using TRAVIS in env.
Since now we want to use production suites on Circle CI so there
is no need to set TRAVIS in env while running scripts.

CIRCLECI is set default in the enviroment of Circle CI builds
so we can use it directly.

Also Travis CI had rabbitmq-server installed so we had to add workaround
in install script to avoid the error. That workaround is removed.
2020-04-21 14:46:40 -07:00

125 lines
5.4 KiB
Bash
Executable File

#!/usr/bin/env bash
# This test installs a Zulip production environment (from the release
# tarball from setup-production), and then runs some Nagios checks and
# other tools to verify that everything is working properly.
set -e
set -x
ZULIP_PATH=$(mktemp -d)
tar -xf zulip-server-test.tar.gz -C "$ZULIP_PATH" --strip-components=1
# Do an apt upgrade to start with an up-to-date machine
APT_OPTIONS=(-o 'Dpkg::Options::=--force-confdef' -o 'Dpkg::Options::=--force-confold')
apt-get update
if ! apt-get dist-upgrade -y "${APT_OPTIONS[@]}"; then
echo "\`apt-get dist-upgrade\`: Failure occurred while trying to perform distribution upgrade, Retrying..."
apt-get dist-upgrade -y "${APT_OPTIONS[@]}"
fi
# Install Zulip
"$ZULIP_PATH"/scripts/setup/install --self-signed-cert --hostname 127.0.0.1 --email circleci@example.com
cat >>/etc/zulip/settings.py <<EOF
# Circle CI override settings above
AUTHENTICATION_BACKENDS = ( 'zproject.backends.EmailAuthBackend', )
NOREPLY_EMAIL_ADDRESS = 'noreply@circleci.example.com'
ALLOWED_HOSTS = []
EOF
echo; echo "Now testing that the supervisord jobs are running properly"; echo
sleep 15 # Guaranteed to have a working supervisord process get an extra digit
if supervisorctl status | grep -vq RUNNING || supervisorctl status | sed 's/^.*uptime //' | grep -q 0:00:0; then
set +x
echo
echo "FAILURE: Supervisor output shows daemons are crashing:"
echo
supervisorctl status
echo
echo "DEBUG: printing Zulip server's error log:"
cat /var/log/zulip/errors.log
echo
echo "DEBUG: printing Zulip server's workers log:"
cat /var/log/zulip/workers.log
echo
echo "DEBUG: printing Zulip server's tornado log:"
cat /var/log/zulip/tornado.log
exit 1
fi
# TODO: Ideally this would test actually logging in, but this is a start.
echo; echo "Now testing that the newly installed server's homepage loads"; echo
wget https://localhost -O /tmp/index.html --no-check-certificate -S 2> /tmp/wget-output || true # || true so we see errors.log if this 500s
grep -vi '\(Vary\|Content-Language\|expires\|issued by\|modified\|saved\|[.][.][.]\|Date\|[-][-]\)' /tmp/wget-output > /tmp/http-headers-processed
# Simplify the diff by getting replacing 4-5 digit length numbers with <Length>.
sed -i 's|Length: [0-9]\+\( [(][0-9]\+[.][0-9]K[)]\)\?|Length: <Length>|' /tmp/http-headers-processed
sed -i 's|Length: [0-9]\+\( [(][0-9]\+[.][0-9]K[)]\)\?|Length: <Length>|' ~/success-http-headers.txt
if ! diff -ur /tmp/http-headers-processed ~/success-http-headers.txt; then
set +x
echo
echo "FAILURE: The HTTP Headers returned from loading the homepage on the server do not match the contents of tools/ci/success-http-headers.txt. Typically, this means that the server threw a 500 when trying to load the homepage."
echo "Displaying the contents of the server's error log:"
echo
cat /var/log/zulip/errors.log
echo
echo "Displaying the contents of the main server log:"
echo
cat /var/log/zulip/server.log
exit 1
fi
# Start the RabbitMQ queue worker related section
echo; echo "Now confirming all the RabbitMQ queue processors are correctly registered!"; echo
# These hacky shell scripts just extract the sorted list of queue processors, running and expected
supervisorctl status | cut -f1 -dR | cut -f2- -d: | grep events | cut -f1 -d" " | cut -f3- -d_ | cut -f1 -d- | sort -u > /tmp/running_queue_processors.txt
su zulip -c /home/zulip/deployments/current/scripts/lib/queue_workers.py | grep -v ^test$ | sort -u > /tmp/expected_queue_processors.txt
if ! diff /tmp/expected_queue_processors.txt /tmp/running_queue_processors.txt >/dev/null; then
set +x
echo "FAILURE: Runnable queue processors declared in zerver/worker/queue_processors.py "
echo "do not match those in puppet/manifests/zulip/base.pp"
echo "See https://zulip.readthedocs.io/en/latest/subsystems/queuing.html for details."
echo
diff -ur /tmp/expected_queue_processors.txt /tmp/running_queue_processors.txt
exit 1
fi
echo; echo "Now running RabbitMQ consumer Nagios tests"; echo
# First run the check that usually runs in cron and populates the state files
/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers
# Then, compute the list of all Django queue workers to run Nagios checks against
consumer_list=$(/home/zulip/deployments/current/scripts/lib/queue_workers.py --queue-type=consumer)
for consumer in $consumer_list; do
if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers "$consumer"; then
set +x
echo
echo "FAILURE: Missing Nagios consumer for $consumer; displaying full consumer output:"
rabbitmqctl list_consumers
supervisorctl status
echo "EVENTS LOGS"
echo
cat /var/log/zulip/events*.log
echo
exit 1
fi
done
# Some of the Nagios tests have been temporarily disabled to work
# around a Travis CI infrastructure issue.
echo; echo "Now running additional Nagios tests"; echo
if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_queue_worker_errors || \
! su zulip -c /usr/lib/nagios/plugins/zulip_postgres_appdb/check_fts_update_log; then # || \
# ! su zulip -c "/usr/lib/nagios/plugins/zulip_app_frontend/check_send_receive_time --site=https://127.0.0.1/api --nagios --insecure"; then
set +x
echo
echo "FAILURE: Nagios checks don't pass:"
echo
echo "DEBUG: printing Zulip server's error log:"
cat /var/log/zulip/errors.log
exit 1
fi
echo "Production installation test successful!"
exit 0