Files
zulip/tools/ci/production-verify
arpit551 4f6cd6577c http_headers: Substitute Nginx version based on OS.
success-http-headers-bionic.txt and success-http-headers-focal.txt
differ only in the nginx version so this substitution will allow
us to have single file for both of them. Also this change helps
to avoid CI failure if Nginx version is updated in the OS.
2020-07-07 11:20:05 -07:00

110 lines
4.9 KiB
Bash
Executable File

#!/usr/bin/env bash
# This tests a Zulip production environment (installed via
# production-install) with some Nagios checks and other tools to
# verify that everything is working properly.
set -e
set -x
cat >>/etc/zulip/settings.py <<EOF
# CircleCI override settings above
AUTHENTICATION_BACKENDS = ( 'zproject.backends.EmailAuthBackend', )
NOREPLY_EMAIL_ADDRESS = 'noreply@circleci.example.com'
ALLOWED_HOSTS = []
EOF
echo; echo "Now testing that the supervisord jobs are running properly"; echo
sleep 15 # Guaranteed to have a working supervisord process get an extra digit
if supervisorctl status | grep -vq RUNNING || supervisorctl status | sed 's/^.*uptime //' | grep -q 0:00:0; then
set +x
echo
echo "FAILURE: Supervisor output shows daemons are crashing:"
echo
supervisorctl status
echo
echo "DEBUG: printing Zulip server's error log:"
cat /var/log/zulip/errors.log
echo
echo "DEBUG: printing Zulip server's workers log:"
cat /var/log/zulip/workers.log
echo
echo "DEBUG: printing Zulip server's tornado log:"
cat /var/log/zulip/tornado.log
exit 1
fi
# TODO: Ideally this would test actually logging in, but this is a start.
echo; echo "Now testing that the newly installed server's homepage loads"; echo
wget https://localhost -O /tmp/index.html --no-check-certificate -S 2> /tmp/wget-output || true # || true so we see errors.log if this 500s
grep -vi '\(Vary\|Content-Language\|expires\|issued by\|modified\|saved\|[.][.][.]\|Date\|[-][-]\)' /tmp/wget-output > /tmp/http-headers-processed
nginx_version="$(nginx -v 2>&1 | awk '{print $3, $4}')"
# Simplify the diff by getting replacing 4-5 digit length numbers with <Length>.
sed -i 's|Length: [0-9]\+\( [(][0-9]\+[.][0-9]K[)]\)\?|Length: <Length>|' /tmp/http-headers-processed
sed -i -e 's|Length: [0-9]\+\( [(][0-9]\+[.][0-9]K[)]\)\?|Length: <Length>|' -e "s|{nginx_version_string}|$nginx_version|g" /tmp/success-http-headers.template.txt
if ! diff -ur /tmp/http-headers-processed /tmp/success-http-headers.template.txt; then
set +x
echo
echo "FAILURE: The HTTP Headers returned from loading the homepage on the server do not match the contents of tools/ci/success-http-headers.template.txt. Typically, this means that the server threw a 500 when trying to load the homepage."
echo "Displaying the contents of the server's error log:"
echo
cat /var/log/zulip/errors.log
echo
echo "Displaying the contents of the main server log:"
echo
cat /var/log/zulip/server.log
exit 1
fi
# Start the RabbitMQ queue worker related section
echo; echo "Now confirming all the RabbitMQ queue processors are correctly registered!"; echo
# These hacky shell scripts just extract the sorted list of queue processors, running and expected
supervisorctl status | cut -f1 -dR | cut -f2- -d: | grep events | cut -f1 -d" " | cut -f3- -d_ | cut -f1 -d- | sort -u > /tmp/running_queue_processors.txt
su zulip -c /home/zulip/deployments/current/scripts/lib/queue_workers.py | grep -v ^test$ | sort -u > /tmp/expected_queue_processors.txt
if ! diff /tmp/expected_queue_processors.txt /tmp/running_queue_processors.txt >/dev/null; then
set +x
echo "FAILURE: Runnable queue processors declared in zerver/worker/queue_processors.py "
echo "do not match those in puppet/manifests/zulip/base.pp"
echo "See https://zulip.readthedocs.io/en/latest/subsystems/queuing.html for details."
echo
diff -ur /tmp/expected_queue_processors.txt /tmp/running_queue_processors.txt
exit 1
fi
echo; echo "Now running RabbitMQ consumer Nagios tests"; echo
# First run the check that usually runs in cron and populates the state files
/home/zulip/deployments/current/scripts/nagios/check-rabbitmq-consumers
# Then, compute the list of all Django queue workers to run Nagios checks against
consumer_list=$(/home/zulip/deployments/current/scripts/lib/queue_workers.py --queue-type=consumer)
for consumer in $consumer_list; do
if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_rabbitmq_consumers "$consumer"; then
set +x
echo
echo "FAILURE: Missing Nagios consumer for $consumer; displaying full consumer output:"
set -x
rabbitmqctl list_consumers
supervisorctl status
tail -n +1 /var/log/zulip/events*.log
exit 1
fi
done
# Some of the Nagios tests have been temporarily disabled to work
# around a Travis CI infrastructure issue.
echo; echo "Now running additional Nagios tests"; echo
if ! /usr/lib/nagios/plugins/zulip_app_frontend/check_queue_worker_errors || \
! su zulip -c /usr/lib/nagios/plugins/zulip_postgres_appdb/check_fts_update_log; then # || \
# ! su zulip -c "/usr/lib/nagios/plugins/zulip_app_frontend/check_send_receive_time --site=https://127.0.0.1/api --nagios --insecure"; then
set +x
echo
echo "FAILURE: Nagios checks don't pass:"
echo
echo "DEBUG: printing Zulip server's error log:"
cat /var/log/zulip/errors.log
exit 1
fi
echo "Production installation test successful!"
exit 0